digital_scriptorium 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: '085e39f712849a9c7e65b62b1f9a7715830b60a87adad5d67607f42f74535304'
4
+ data.tar.gz: 8739348215acc0f8df17b17155df21ed641ffff4845d4d72a6ab3c9652e9cfca
5
+ SHA512:
6
+ metadata.gz: f8d519fa3b4246f3c182738d4ad42567046dbf0d691a3f07958809669ace36b25d2f4697b1671f46bff0fae97c9692d389b108696c334252708377a68320906a
7
+ data.tar.gz: 50f9861fb347aee4fc8a556ffc013009c23b84ac631c17fc705e4466852ea63a617c06aae5718431df908035d258928615b64a2bdf45cbaa16e46efc684d90d1
data/.ignore ADDED
@@ -0,0 +1 @@
1
+ *.json
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.rubocop.yml ADDED
@@ -0,0 +1,30 @@
1
+ require:
2
+ - rubocop-rake
3
+ - rubocop-rspec
4
+
5
+ AllCops:
6
+ TargetRubyVersion: 3.0
7
+ NewCops: enable
8
+ Layout/LineLength:
9
+ Exclude:
10
+ - spec/**/*
11
+ Metrics/AbcSize:
12
+ Enabled: false
13
+ Metrics/BlockLength:
14
+ Enabled: false
15
+ Metrics/CyclomaticComplexity:
16
+ Enabled: false
17
+ Metrics/MethodLength:
18
+ Enabled: false
19
+ Metrics/ModuleLength:
20
+ Enabled: false
21
+ Metrics/PerceivedComplexity:
22
+ Enabled: false
23
+ RSpec/ExampleLength:
24
+ Enabled: false
25
+ RSpec/MultipleExpectations:
26
+ Enabled: false
27
+ RSpec/MultipleMemoizedHelpers:
28
+ Enabled: false
29
+ Style/SafeNavigationChainLength:
30
+ Enabled: false
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2024 Michael Holloway
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,33 @@
1
+ # DigitalScriptorium
2
+
3
+ This gem provides code to support the transformation of Digital Scriptorium Wikibase data exports into collections of Apache Solr records that can be searched using [DS Catalog](https://search.digital-scriptorium.org/).
4
+
5
+ See [here](doc/overview.md) for a technical overview of the logic for transforming Wikibase items in the export to Solr records.
6
+
7
+ ## Installation
8
+
9
+ Install the gem and add to the application's Gemfile by executing:
10
+
11
+ ```bash
12
+ bundle add digital_scriptorium
13
+ ```
14
+
15
+ If bundler is not being used to manage dependencies, install the gem by executing:
16
+
17
+ ```bash
18
+ gem install digital_scriptorium
19
+ ```
20
+
21
+ ## Development
22
+
23
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
24
+
25
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
26
+
27
+ ## Contributing
28
+
29
+ Bug reports and pull requests are welcome on GitHub at https://github.com/mdholloway/digital_scriptorium.
30
+
31
+ ## License
32
+
33
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
4
+ require 'rspec/core/rake_task'
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ require 'rubocop/rake_task'
9
+
10
+ RuboCop::RakeTask.new
11
+
12
+ task default: %i[spec rubocop]
data/doc/overview.md ADDED
@@ -0,0 +1,13 @@
1
+ # How it works
2
+
3
+ For a general description of the Wikibase data model, see [Wikibase/DataModel](https://www.mediawiki.org/wiki/Wikibase/DataModel) on mediawiki.org.
4
+
5
+ The Digital Scriptorium Wikibase data export is a JSON-formatted array of Wikibase entities. The bulk of the entities in the export consist of triplets that together form a meta-record consisting of one each of the DS Catalog core model types: manuscripts, holdings, and records. The export also contains entities representing property definitions and authoritative references to common topics.
6
+
7
+ The [ExportRepresenter](../lib/digital_scriptorium/export_representer.rb) class can be used to deserialize an export in its entirety. The resulting [Export](../lib/digital_scriptorium/export.rb) object is essentially an array of Item and Property objects. Entities in the export are modeled using domain-specific classes provided by the [wikibase_representable](https://rubygems.org/gems/wikibase_representable) gem, such as Items, Properties, Statements (also known as Claims), and Snaks, which represent the primary claim of any statement as well as any qualifiers. Convenience methods are also provided to facilitate extracting data values.
8
+
9
+ The conversion script [wikibase_to_solr_new.rb](../wikibase_to_solr_new.rb) proceeds by deserializing the export and converting the resulting array of Wikibase objects to a hash keyed by entity ID. It then iterates over the elements of the hash. When it finds a record item based on the value of its instance-of (P16) claim, it retrieves the linked manuscript item, as well as the holding item linked in turn to the manuscript item, from the export hash by entity ID. It then iterates over the claims attached to manuscript, holding, and record in turn, extracting the Solr fields requested based on the property ID that is the subject of the claim and adding them to the Solr record to be produced for the meta-record. Claims for most properties are transformed to Solr fields using a generic algorithm implemented in [ClaimTransformer](../lib/digital_scriptorium/claim_transformer.rb). Name and date claims require some special handling, and are handled in dedicated claim transformer classes ([NameClaimTransformer](../lib/digital_scriptorium/name_claim_transformer.rb) and [DateClaimTransformer](../lib/digital_scriptorium/date_claim_transformer.rb) respectively). After all claims from the manuscript, holding, and record have been processed, the resulting Solr record is written to the output file.
10
+
11
+ The specific Solr fields produced for each claim are controlled by the configuration file [property_config.yml](../property_config.yml). This file also defines the prefix (representing the property name) to be attached to each field for a given property, and whether a claim based on the property might have a related authority qualifier.
12
+
13
+ The script was written so as not to rely on the structure of the export file beyond that it will be a JSON array of Wikibase entities, with records linked to manuscripts and manuscripts linked to holdings by P3 (described manuscript) and P2 (holding) claims respectively.
@@ -0,0 +1,72 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'wikibase_representable'
4
+
5
+ module DigitalScriptorium
6
+ # Transformer for converting claims of Digital Scriptorium items into Solr fields.
7
+ class ClaimTransformer
8
+ include PropertyId
9
+ include WikibaseRepresentable::Model
10
+
11
+ def self.transform(claim, export_hash, config)
12
+ solr_props = {}
13
+
14
+ prefix = config['prefix']
15
+ requested_fields = config['fields']
16
+ authority_property_id = config['authority']
17
+
18
+ value = primary_value_from_claim(claim, export_hash)
19
+
20
+ solr_props['id'] = [value] if requested_fields.include? 'id'
21
+ solr_props["#{prefix}_meta"] = [value] if requested_fields.include? 'meta'
22
+
23
+ display_props = { 'PV' => value }
24
+
25
+ if authority_property_id && claim.qualifiers_by_property_id?(authority_property_id)
26
+ authority_id = claim.qualifier_by_property_id(authority_property_id).entity_id_value
27
+ authority = export_hash[authority_id]
28
+
29
+ if authority
30
+ label = authority.label('en')
31
+ display_props['QL'] = label
32
+
33
+ external_uri = authority.claim_by_property_id(EXTERNAL_URI)&.data_value
34
+
35
+ wikidata_id = authority.claim_by_property_id(WIKIDATA_QID)&.data_value
36
+ wikidata_uri = wikidata_id && "https://www.wikidata.org/wiki/#{wikidata_id}"
37
+
38
+ # Only one or the other of these seem to exist for a given item in practice.
39
+ display_props['QU'] = external_uri if external_uri
40
+ display_props['QU'] = wikidata_uri if wikidata_uri
41
+
42
+ solr_props["#{config['prefix']}_display"] = [display_props.to_json] if config['fields'].include? 'display'
43
+ solr_props["#{config['prefix']}_search"] = [value, label].uniq if config['fields'].include? 'search'
44
+ solr_props["#{config['prefix']}_facet"] = [label] if config['fields'].include? 'facet'
45
+
46
+ return solr_props
47
+ end
48
+ end
49
+
50
+ solr_props["#{config['prefix']}_display"] = [display_props.to_json] if config['fields'].include? 'display'
51
+ solr_props["#{config['prefix']}_search"] = [value] if config['fields'].include? 'search'
52
+ solr_props["#{config['prefix']}_facet"] = [value] if config['fields'].include? 'facet'
53
+
54
+ solr_props['images_facet'] = ['Yes'] if value && claim.property_id == IIIF_MANIFEST
55
+ solr_props["#{config['prefix']}_link"] = [value] if config['fields'].include? 'link'
56
+
57
+ solr_props
58
+ end
59
+
60
+ def self.primary_value_from_claim(claim, export_hash)
61
+ if claim.value_type? EntityIdValue
62
+ entity_id = claim.entity_id_value
63
+ referenced_item = export_hash[entity_id]
64
+ referenced_item.label('en')
65
+ elsif claim.value_type? TimeValue
66
+ claim.time_value
67
+ else
68
+ claim.data_value
69
+ end
70
+ end
71
+ end
72
+ end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'time'
4
+
5
+ module DigitalScriptorium
6
+ # Special-purpose transformer for date (P23) claims
7
+ class DateClaimTransformer
8
+ include PropertyId
9
+
10
+ def self.transform(claim, export_hash, config)
11
+ solr_props = ClaimTransformer.transform(claim, export_hash, config)
12
+ return solr_props unless claim.qualifiers
13
+
14
+ century = claim.qualifier_by_property_id(CENTURY).time_value
15
+ earliest = claim.qualifier_by_property_id(EARLIEST_DATE).time_value
16
+ latest = claim.qualifier_by_property_id(LATEST_DATE).time_value
17
+
18
+ solr_props['century_int'] = [Time.parse(century).year] unless century.nil?
19
+ solr_props['earliest_int'] = [Time.parse(earliest).year] unless earliest.nil?
20
+ solr_props['latest_int'] = [Time.parse(latest).year] unless latest.nil?
21
+
22
+ solr_props
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'wikibase_representable'
4
+
5
+ module DigitalScriptorium
6
+ # Represents a Digital Scriptorium item
7
+ class DsItem < WikibaseRepresentable::Model::Item
8
+ def instance_of_claims
9
+ claims_by_property_id PropertyId::INSTANCE_OF # P16
10
+ end
11
+
12
+ def ds_id
13
+ claim_by_property_id(PropertyId::DS_ID)&.data_value # P1
14
+ end
15
+
16
+ def holding_id
17
+ claim_by_property_id(PropertyId::MANUSCRIPT_HOLDING)&.entity_id_value # P2
18
+ end
19
+
20
+ def described_manuscript_id
21
+ claim_by_property_id(PropertyId::DESCRIBED_MANUSCRIPT)&.entity_id_value # P3
22
+ end
23
+
24
+ def iiif_manifest # NOTE(review): P41 is emitted as a link field elsewhere; confirm entity_id_value is intended
25
+ claim_by_property_id(PropertyId::IIIF_MANIFEST)&.entity_id_value # P41
26
+ end
27
+
28
+ def core_model_item? # true when any instance-of (P16) claim names one of ItemId::CORE_MODEL_ITEMS
29
+ instance_of_claims.any? { |claim| ItemId::CORE_MODEL_ITEMS.include? claim.entity_id_value }
30
+ end
31
+
32
+ def manuscript? # true when any instance-of (P16) claim points at ItemId::MANUSCRIPT
33
+ instance_of_claims.any? { |claim| claim.entity_id_value == ItemId::MANUSCRIPT }
34
+ end
35
+
36
+ def holding? # true when any instance-of (P16) claim points at ItemId::HOLDING
37
+ instance_of_claims.any? { |claim| claim.entity_id_value == ItemId::HOLDING }
38
+ end
39
+
40
+ def record? # true when any instance-of (P16) claim points at ItemId::RECORD
41
+ instance_of_claims.any? { |claim| claim.entity_id_value == ItemId::RECORD }
42
+ end
43
+ end
44
+ end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DigitalScriptorium
4
+ # Represents a meta record consisting of a manuscript, its holding information, and metadata record.
5
+ class DsMeta
6
+ attr_reader :holding, :manuscript, :record
7
+
8
+ def initialize(record, export_hash)
9
+ manuscript = export_hash[record.described_manuscript_id]
10
+ holding = export_hash[manuscript.holding_id]
11
+
12
+ @holding = holding
13
+ @manuscript = manuscript
14
+ @record = record
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DigitalScriptorium
4
+ # Simple model class representing a Wikibase JSON export.
5
+ # Provides a to_hash method to facilitate entity lookups by ID.
6
+ class Export < Array
7
+ def to_hash
8
+ hash = {}
9
+ each do |el|
10
+ hash[el.id] = el
11
+ end
12
+ hash
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'representable/json/collection'
4
+ require 'wikibase_representable'
5
+
6
+ module DigitalScriptorium
7
+ # Representer class for deserializing Wikibase data exports from JSON.
8
+ class ExportRepresenter < Representable::Decorator
9
+ include Representable::JSON::Collection
10
+ include WikibaseRepresentable::Model
11
+ include WikibaseRepresentable::Representers
12
+
13
+ items decorator: lambda { |input:, **|
14
+ input.type == Item::ENTITY_TYPE ? ItemRepresenter : PropertyRepresenter
15
+ }, class: lambda { |input:, **|
16
+ input['type'] == Item::ENTITY_TYPE ? DsItem : Property
17
+ }
18
+ end
19
+ end
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DigitalScriptorium
4
+ # An item representing a Digital Scriptorium holding (instance of Q2)
5
+ class Holding < DsItem
6
+ def institution_as_recorded_claims
7
+ claims_by_property_id HOLDING_INSTITUTION_AS_RECORDED # P5
8
+ end
9
+
10
+ def status_claims
11
+ claims_by_property_id HOLDING_STATUS # P6
12
+ end
13
+
14
+ def institutional_id_claims
15
+ claims_by_property_id INSTITUTIONAL_ID # P7
16
+ end
17
+
18
+ def shelfmark_claims
19
+ claims_by_property_id SHELFMARK # P8
20
+ end
21
+
22
+ def link_to_institutional_record_claims
23
+ claims_by_property_id LINK_TO_INSTITUTIONAL_RECORD # P9
24
+ end
25
+
26
+ def start_time_claims
27
+ claims_by_property_id START_TIME # P38
28
+ end
29
+
30
+ def end_time_claims
31
+ claims_by_property_id END_TIME # P39
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DigitalScriptorium
4
+ # Constants for core model item IDs.
5
+ module ItemId
6
+ MANUSCRIPT = 'Q1'
7
+ HOLDING = 'Q2'
8
+ RECORD = 'Q3'
9
+
10
+ CORE_MODEL_ITEMS = Set[MANUSCRIPT, HOLDING, RECORD]
11
+ end
12
+ end
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DigitalScriptorium
4
+ # An item representing a Digital Scriptorium manuscript (instance of Q1)
5
+ class Manuscript < DsItem
6
+ include PropertyId
7
+
8
+ def ds_id # overrides DsItem#ds_id without safe navigation (no &. before data_value)
9
+ claim_by_property_id(DS_ID).data_value # P1
10
+ end
11
+
12
+ def holding_id # overrides DsItem#holding_id without safe navigation (no &. before entity_id_value)
13
+ claim_by_property_id(MANUSCRIPT_HOLDING).entity_id_value # P2
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DigitalScriptorium
4
+ # Special-purpose transformer for name (P14) claims
5
+ class NameClaimTransformer
6
+ include PropertyId
7
+
8
+ def self.transform(claim, export_hash)
9
+ return {} unless claim.qualifiers_by_property_id? ROLE_IN_AUTHORITY_FILE
10
+
11
+ role_entity_id = claim.qualifier_by_property_id(ROLE_IN_AUTHORITY_FILE).entity_id_value
12
+ role_item = export_hash[role_entity_id]
13
+ role_label = role_item.label('en')
14
+ prefix = role_label.downcase.split.last
15
+
16
+ recorded_name = claim.data_value
17
+ display_names = { 'PV' => recorded_name }
18
+ search_names = [recorded_name]
19
+
20
+ name_in_original_script = claim.qualifier_by_property_id(IN_ORIGINAL_SCRIPT)&.data_value&.value
21
+ display_names['AGR'] = name_in_original_script if name_in_original_script
22
+ search_names << name_in_original_script if name_in_original_script
23
+
24
+ unless claim.qualifiers_by_property_id? NAME_IN_AUTHORITY_FILE
25
+ return {
26
+ "#{prefix}_display" => [display_names.to_json],
27
+ "#{prefix}_search" => search_names,
28
+ "#{prefix}_facet" => [recorded_name]
29
+ }
30
+ end
31
+
32
+ name_entity_id = claim.qualifier_by_property_id(NAME_IN_AUTHORITY_FILE).entity_id_value
33
+ name_item = export_hash[name_entity_id]
34
+ name_label = name_item.label('en')
35
+
36
+ wikidata_id = name_item.claim_by_property_id(WIKIDATA_QID).data_value
37
+ wikidata_url = "https://www.wikidata.org/wiki/#{wikidata_id}"
38
+
39
+ search_names << name_label
40
+ display_names['QL'] = name_label
41
+ display_names['QU'] = wikidata_url if wikidata_url
42
+
43
+ {
44
+ "#{prefix}_display" => [display_names.to_json],
45
+ "#{prefix}_search" => search_names,
46
+ "#{prefix}_facet" => [name_label]
47
+ }
48
+ end
49
+ end
50
+ end
@@ -0,0 +1,54 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DigitalScriptorium
4
+ # Constants for Digital Scriptorium Wikibase property IDs
5
+ module PropertyId
6
+ DS_ID = 'P1'
7
+ MANUSCRIPT_HOLDING = 'P2'
8
+ DESCRIBED_MANUSCRIPT = 'P3'
9
+ HOLDING_INSTITUTION_IN_AUTHORITY_FILE = 'P4'
10
+ HOLDING_INSTITUTION_AS_RECORDED = 'P5' # qualifiers: P4
11
+ HOLDING_STATUS = 'P6'
12
+ INSTITUTIONAL_ID = 'P7'
13
+ SHELFMARK = 'P8'
14
+ LINK_TO_INSTITUTIONAL_RECORD = 'P9'
15
+ TITLE_AS_RECORDED = 'P10' # qualifiers: P11, P13
16
+ STANDARD_TITLE = 'P11'
17
+ UNIFORM_TITLE_AS_RECORDED = 'P12'
18
+ IN_ORIGINAL_SCRIPT = 'P13'
19
+ ASSOCIATED_NAME_AS_RECORDED = 'P14' # qualifiers: P15, P17
20
+ ROLE_IN_AUTHORITY_FILE = 'P15'
21
+ INSTANCE_OF = 'P16'
22
+ NAME_IN_AUTHORITY_FILE = 'P17'
23
+ GENRE_AS_RECORDED = 'P18' # qualifiers: P20
24
+ SUBJECT_AS_RECORDED = 'P19' # qualifiers: P20
25
+ TERM_IN_AUTHORITY_FILE = 'P20'
26
+ LANGUAGE_AS_RECORDED = 'P21' # qualifiers: P22
27
+ LANGUAGE_IN_AUTHORITY_FILE = 'P22'
28
+ PRODUCTION_DATE_AS_RECORDED = 'P23' # qualifiers: P25, P24, P37, P36
29
+ PRODUCTION_CENTURY_IN_AUTHORITY_FILE = 'P24'
30
+ CENTURY = 'P25'
31
+ DATED = 'P26'
32
+ PRODUCTION_PLACE_AS_RECORDED = 'P27' # qualifiers: P28
33
+ PLACE_IN_AUTHORITY_FILE = 'P28'
34
+ PHYSICAL_DESCRIPTION = 'P29'
35
+ MATERIAL_AS_RECORDED = 'P30' # qualifiers: P31
36
+ MATERIAL_IN_AUTHORITY_FILE = 'P31'
37
+ NOTE = 'P32'
38
+ ACKNOWLEDGEMENTS = 'P33'
39
+ DATE_ADDED = 'P34'
40
+ DATE_LAST_UPDATED = 'P35'
41
+ LATEST_DATE = 'P36'
42
+ EARLIEST_DATE = 'P37'
43
+ START_TIME = 'P38'
44
+ END_TIME = 'P39'
45
+ EXTERNAL_IDENTIFIER = 'P40'
46
+ IIIF_MANIFEST = 'P41'
47
+ WIKIDATA_QID = 'P42'
48
+ VIAF_ID = 'P43'
49
+ EXTERNAL_URI = 'P44'
50
+ EQUIVALENT_PROPERTY = 'P45'
51
+ FORMATTER_URL = 'P46'
52
+ SUBCLASS_OF = 'P47'
53
+ end
54
+ end
@@ -0,0 +1,72 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DigitalScriptorium
4
+ # An item representing a Digital Scriptorium record (instance of Q3)
5
+ class Record < DsItem
6
+ include PropertyId
7
+
8
+ def described_manuscript_id # overrides DsItem#described_manuscript_id without safe navigation (no &.)
9
+ claim_by_property_id(DESCRIBED_MANUSCRIPT).entity_id_value # P3
10
+ end
11
+
12
+ def title_as_recorded_claims
13
+ claims_by_property_id TITLE_AS_RECORDED # P10
14
+ end
15
+
16
+ def uniform_title_as_recorded_claims
17
+ claims_by_property_id UNIFORM_TITLE_AS_RECORDED # P12
18
+ end
19
+
20
+ def associated_name_as_recorded_claims
21
+ claims_by_property_id ASSOCIATED_NAME_AS_RECORDED # P14
22
+ end
23
+
24
+ def genre_as_recorded_claims
25
+ claims_by_property_id GENRE_AS_RECORDED # P18
26
+ end
27
+
28
+ def language_as_recorded_claims
29
+ claims_by_property_id LANGUAGE_AS_RECORDED # P21
30
+ end
31
+
32
+ def production_date_as_recorded_claims
33
+ claims_by_property_id PRODUCTION_DATE_AS_RECORDED # P23
34
+ end
35
+
36
+ def dated_claims
37
+ claims_by_property_id DATED # P26
38
+ end
39
+
40
+ def production_place_as_recorded_claims
41
+ claims_by_property_id PRODUCTION_PLACE_AS_RECORDED # P27
42
+ end
43
+
44
+ def physical_description_claims
45
+ claims_by_property_id PHYSICAL_DESCRIPTION # P29
46
+ end
47
+
48
+ def material_as_recorded_claims
49
+ claims_by_property_id MATERIAL_AS_RECORDED # P30
50
+ end
51
+
52
+ def note_claims
53
+ claims_by_property_id NOTE # P32
54
+ end
55
+
56
+ def acknowledgements_claims
57
+ claims_by_property_id ACKNOWLEDGEMENTS # P33
58
+ end
59
+
60
+ def date_added_claims
61
+ claims_by_property_id DATE_ADDED # P34
62
+ end
63
+
64
+ def date_last_updated_claims
65
+ claims_by_property_id DATE_LAST_UPDATED # P35
66
+ end
67
+
68
+ def iiif_manifest_claims
69
+ claims_by_property_id IIIF_MANIFEST # P41
70
+ end
71
+ end
72
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DigitalScriptorium
4
+ VERSION = '0.1.0'
5
+ end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'digital_scriptorium/item_id'
4
+ require 'digital_scriptorium/property_id'
5
+
6
+ require 'digital_scriptorium/ds_item'
7
+ require 'digital_scriptorium/ds_meta'
8
+ require 'digital_scriptorium/holding'
9
+ require 'digital_scriptorium/manuscript'
10
+ require 'digital_scriptorium/record'
11
+
12
+ require 'digital_scriptorium/export'
13
+ require 'digital_scriptorium/export_representer'
14
+
15
+ require 'digital_scriptorium/claim_transformer'
16
+ require 'digital_scriptorium/date_claim_transformer'
17
+ require 'digital_scriptorium/name_claim_transformer'
@@ -0,0 +1,106 @@
1
+ P1:
2
+ prefix: id
3
+ fields:
4
+ - id
5
+ - display
6
+ - search
7
+ P5:
8
+ prefix: institution
9
+ fields:
10
+ - display
11
+ - search
12
+ - facet
13
+ authority: P4
14
+ P6:
15
+ prefix: holding_status
16
+ fields:
17
+ - display
18
+ P8:
19
+ prefix: shelfmark
20
+ fields:
21
+ - display
22
+ - search
23
+ P9:
24
+ prefix: institutional_record
25
+ fields:
26
+ - link
27
+ P10:
28
+ prefix: title
29
+ fields:
30
+ - display
31
+ - search
32
+ - facet
33
+ authority: P11
34
+ P12:
35
+ prefix: uniform_title
36
+ fields:
37
+ - search
38
+ # NOTE: P14 can translate to any of a few different Solr fields based on the value of the
39
+ # associated role (P15) qualifier, and is handled in its own dedicated processing method
40
+ P14:
41
+ prefix: associated_name
42
+ fields: []
43
+ P18:
44
+ prefix: term
45
+ fields:
46
+ - display
47
+ - search
48
+ - facet
49
+ authority: P20
50
+ P19:
51
+ prefix: term
52
+ fields:
53
+ - display
54
+ - search
55
+ - facet
56
+ authority: P20
57
+ P21:
58
+ prefix: language
59
+ fields:
60
+ - display
61
+ - search
62
+ - facet
63
+ authority: P22
64
+ P23:
65
+ prefix: date
66
+ fields:
67
+ - meta
68
+ - display
69
+ - search
70
+ - facet
71
+ authority: P24
72
+ P26:
73
+ prefix: dated
74
+ fields:
75
+ - display
76
+ - facet
77
+ P27:
78
+ prefix: place
79
+ fields:
80
+ - display
81
+ - search
82
+ - facet
83
+ authority: P28
84
+ P29:
85
+ prefix: physical_description
86
+ fields:
87
+ - display
88
+ - search
89
+ P30:
90
+ prefix: material
91
+ fields:
92
+ - facet
93
+ authority: P31
94
+ P32:
95
+ prefix: note
96
+ fields:
97
+ - display
98
+ - search
99
+ P33:
100
+ prefix: acknowledgements
101
+ fields:
102
+ - display
103
+ P41:
104
+ prefix: iiif_manifest
105
+ fields:
106
+ - link
@@ -0,0 +1,4 @@
1
+ module DigitalScriptorium
2
+ VERSION: String
3
+ # See the writing guide of rbs: https://github.com/ruby/rbs#guides
4
+ end
@@ -0,0 +1,108 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'digital_scriptorium'
4
+ require 'json'
5
+ require 'optparse'
6
+ require 'time'
7
+ require 'tty-spinner'
8
+ require 'yaml'
9
+ require 'zlib'
10
+
11
+ dir = File.dirname __FILE__
12
+
13
+ input_file = File.expand_path 'wikibase_export.json.gz', dir
14
+ output_file = File.expand_path 'solr_import.json', dir
15
+ config_file = File.expand_path 'property_config.yml', dir
16
+ pretty_print = false
17
+
18
+ OptionParser.new do |opts|
19
+ opts.banner = 'Usage: wikibase_to_solr.rb [options]'
20
+
21
+ opts.on('-i', '--in FILE', 'The file path to the gzipped Wikibase JSON export file.') do |f|
22
+ input_file = File.expand_path f, dir
23
+ end
24
+
25
+ opts.on('-o', '--out FILE', 'The file path to output the formatted Solr JSON file.') do |f|
26
+ output_file = File.expand_path f, dir
27
+ end
28
+
29
+ opts.on('-c', '--config FILE', 'The file path to the property configuration file.') do |f|
30
+ config_file = File.expand_path f, dir
31
+ end
32
+
33
+ opts.on('-p', '--pretty-print', 'Whether to pretty-print the JSON output.') do
34
+ pretty_print = true
35
+ end
36
+ end.parse!
37
+
38
+ def merge(solr_item, new_props)
39
+ solr_item.merge(new_props) do |_, old_val, new_val|
40
+ old_val.nil? ? new_val : (old_val + new_val).uniq
41
+ end
42
+ end
43
+
44
+ def merge_transformed_fields(solr_item, claim, export_hash, property_config)
45
+ if claim.property_id == DigitalScriptorium::PropertyId::ASSOCIATED_NAME_AS_RECORDED
46
+ merge(solr_item, DigitalScriptorium::NameClaimTransformer.transform(claim, export_hash))
47
+ elsif claim.property_id == DigitalScriptorium::PropertyId::PRODUCTION_DATE_AS_RECORDED
48
+ merge(solr_item,
49
+ DigitalScriptorium::DateClaimTransformer.transform(claim, export_hash, property_config))
50
+ else
51
+ merge(solr_item,
52
+ DigitalScriptorium::ClaimTransformer.transform(claim, export_hash, property_config))
53
+ end
54
+ end
55
+
56
+ start_time = Time.now
57
+
58
+ config = YAML.load_file(config_file)
59
+
60
+ loading_spinner = TTY::Spinner.new('[:spinner] Loading export data', hide_cursor: true)
61
+ loading_spinner.auto_spin
62
+
63
+ export_json = Zlib::GzipReader.open(input_file).read
64
+ export_hash = DigitalScriptorium::ExportRepresenter.new(DigitalScriptorium::Export.new)
65
+ .from_json(export_json)
66
+ .to_hash
67
+ loaded_time = Time.now
68
+ loading_spinner.success("(#{format('%0.02f', loaded_time - start_time)}s)")
69
+
70
+ item_count = 0
71
+ generating_spinner = TTY::Spinner.new('[:spinner] Generating Solr documents', hide_cursor: true)
72
+ generating_spinner.auto_spin
73
+
74
+ File.open(output_file, 'w') do |file|
75
+ file << '['
76
+ file << "\n" if pretty_print
77
+
78
+ export_hash.each_with_index do |(_, entity), idx|
79
+ next unless entity.is_a?(DigitalScriptorium::DsItem) &&
80
+ entity.claims_by_property_id?(DigitalScriptorium::PropertyId::INSTANCE_OF) &&
81
+ entity.record?
82
+
83
+ meta = DigitalScriptorium::DsMeta.new(entity, export_hash)
84
+ solr_item = { 'qid_meta' => [meta.holding.id, meta.manuscript.id, meta.record.id] }
85
+
86
+ [meta.holding, meta.manuscript, meta.record].each do |item|
87
+ item.claims.each do |property_id, claims|
88
+ claims.each do |claim|
89
+ next unless (property_config = config[property_id])
90
+
91
+ solr_item = merge_transformed_fields(solr_item, claim, export_hash, property_config)
92
+ end
93
+ end
94
+ end
95
+
96
+ file << (pretty_print ? JSON.pretty_generate(solr_item) : JSON.generate(solr_item))
97
+ file << ',' if idx < export_hash.size - 1
98
+ file << "\n" if pretty_print
99
+
100
+ item_count += 1
101
+ end
102
+
103
+ file << ']'
104
+ end
105
+
106
+ finish_time = Time.now
107
+ generating_spinner.success("(#{format('%0.02f', finish_time - loaded_time)}s)")
108
+ puts "Generated #{item_count} Solr documents in #{format('%0.02f', finish_time - start_time)} seconds"
metadata ADDED
@@ -0,0 +1,121 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: digital_scriptorium
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Michael Holloway
8
+ bindir: exe
9
+ cert_chain: []
10
+ date: 2025-01-06 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: multi_json
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: '1.15'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - "~>"
24
+ - !ruby/object:Gem::Version
25
+ version: '1.15'
26
+ - !ruby/object:Gem::Dependency
27
+ name: representable
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - "~>"
31
+ - !ruby/object:Gem::Version
32
+ version: '3.2'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '3.2'
40
+ - !ruby/object:Gem::Dependency
41
+ name: tty-spinner
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '0.9'
47
+ type: :runtime
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '0.9'
54
+ - !ruby/object:Gem::Dependency
55
+ name: wikibase_representable
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '0.1'
61
+ type: :runtime
62
+ prerelease: false
63
+ version_requirements: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '0.1'
68
+ email:
69
+ - michael@mdholloway.org
70
+ executables: []
71
+ extensions: []
72
+ extra_rdoc_files: []
73
+ files:
74
+ - ".ignore"
75
+ - ".rspec"
76
+ - ".rubocop.yml"
77
+ - LICENSE.txt
78
+ - README.md
79
+ - Rakefile
80
+ - doc/overview.md
81
+ - lib/digital_scriptorium.rb
82
+ - lib/digital_scriptorium/claim_transformer.rb
83
+ - lib/digital_scriptorium/date_claim_transformer.rb
84
+ - lib/digital_scriptorium/ds_item.rb
85
+ - lib/digital_scriptorium/ds_meta.rb
86
+ - lib/digital_scriptorium/export.rb
87
+ - lib/digital_scriptorium/export_representer.rb
88
+ - lib/digital_scriptorium/holding.rb
89
+ - lib/digital_scriptorium/item_id.rb
90
+ - lib/digital_scriptorium/manuscript.rb
91
+ - lib/digital_scriptorium/name_claim_transformer.rb
92
+ - lib/digital_scriptorium/property_id.rb
93
+ - lib/digital_scriptorium/record.rb
94
+ - lib/digital_scriptorium/version.rb
95
+ - property_config.yml
96
+ - sig/digital_scriptorium.rbs
97
+ - wikibase_to_solr_new.rb
98
+ homepage: https://github.com/mdholloway/digital_scriptorium
99
+ licenses:
100
+ - MIT
101
+ metadata:
102
+ homepage_uri: https://github.com/mdholloway/digital_scriptorium
103
+ rubygems_mfa_required: 'true'
104
+ rdoc_options: []
105
+ require_paths:
106
+ - lib
107
+ required_ruby_version: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - ">="
110
+ - !ruby/object:Gem::Version
111
+ version: 3.0.0
112
+ required_rubygems_version: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - ">="
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ requirements: []
118
+ rubygems_version: 3.6.2
119
+ specification_version: 4
120
+ summary: Supporting code for the Digital Scriptorium DS Catalog 2.0 project
121
+ test_files: []