digital_scriptorium 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.ignore +1 -0
- data/.rspec +3 -0
- data/.rubocop.yml +30 -0
- data/LICENSE.txt +21 -0
- data/README.md +33 -0
- data/Rakefile +12 -0
- data/doc/overview.md +13 -0
- data/lib/digital_scriptorium/claim_transformer.rb +72 -0
- data/lib/digital_scriptorium/date_claim_transformer.rb +25 -0
- data/lib/digital_scriptorium/ds_item.rb +44 -0
- data/lib/digital_scriptorium/ds_meta.rb +17 -0
- data/lib/digital_scriptorium/export.rb +15 -0
- data/lib/digital_scriptorium/export_representer.rb +19 -0
- data/lib/digital_scriptorium/holding.rb +34 -0
- data/lib/digital_scriptorium/item_id.rb +12 -0
- data/lib/digital_scriptorium/manuscript.rb +16 -0
- data/lib/digital_scriptorium/name_claim_transformer.rb +50 -0
- data/lib/digital_scriptorium/property_id.rb +54 -0
- data/lib/digital_scriptorium/record.rb +72 -0
- data/lib/digital_scriptorium/version.rb +5 -0
- data/lib/digital_scriptorium.rb +17 -0
- data/property_config.yml +106 -0
- data/sig/digital_scriptorium.rbs +4 -0
- data/wikibase_to_solr_new.rb +108 -0
- metadata +121 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: '085e39f712849a9c7e65b62b1f9a7715830b60a87adad5d67607f42f74535304'
|
4
|
+
data.tar.gz: 8739348215acc0f8df17b17155df21ed641ffff4845d4d72a6ab3c9652e9cfca
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: f8d519fa3b4246f3c182738d4ad42567046dbf0d691a3f07958809669ace36b25d2f4697b1671f46bff0fae97c9692d389b108696c334252708377a68320906a
|
7
|
+
data.tar.gz: 50f9861fb347aee4fc8a556ffc013009c23b84ac631c17fc705e4466852ea63a617c06aae5718431df908035d258928615b64a2bdf45cbaa16e46efc684d90d1
|
data/.ignore
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
*.json
|
data/.rspec
ADDED
data/.rubocop.yml
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
require:
|
2
|
+
- rubocop-rake
|
3
|
+
- rubocop-rspec
|
4
|
+
|
5
|
+
AllCops:
|
6
|
+
TargetRubyVersion: 3.0
|
7
|
+
NewCops: enable
|
8
|
+
Layout/LineLength:
|
9
|
+
Exclude:
|
10
|
+
- spec/**/*
|
11
|
+
Metrics/AbcSize:
|
12
|
+
Enabled: false
|
13
|
+
Metrics/BlockLength:
|
14
|
+
Enabled: false
|
15
|
+
Metrics/CyclomaticComplexity:
|
16
|
+
Enabled: false
|
17
|
+
Metrics/MethodLength:
|
18
|
+
Enabled: false
|
19
|
+
Metrics/ModuleLength:
|
20
|
+
Enabled: false
|
21
|
+
Metrics/PerceivedComplexity:
|
22
|
+
Enabled: false
|
23
|
+
RSpec/ExampleLength:
|
24
|
+
Enabled: false
|
25
|
+
RSpec/MultipleExpectations:
|
26
|
+
Enabled: false
|
27
|
+
RSpec/MultipleMemoizedHelpers:
|
28
|
+
Enabled: false
|
29
|
+
Style/SafeNavigationChainLength:
|
30
|
+
Enabled: false
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2024 Michael Holloway
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# DigitalScriptorium
|
2
|
+
|
3
|
+
This gem provides code to support the transformation of Digital Scriptorium Wikibase data exports into collections of Apache Solr records that can be searched using [DS Catalog](https://search.digital-scriptorium.org/).
|
4
|
+
|
5
|
+
See [here](doc/overview.md) for a technical overview of the logic for transforming Wikibase items in the export to Solr records.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Install the gem and add to the application's Gemfile by executing:
|
10
|
+
|
11
|
+
```bash
|
12
|
+
bundle add digital_scriptorium
|
13
|
+
```
|
14
|
+
|
15
|
+
If bundler is not being used to manage dependencies, install the gem by executing:
|
16
|
+
|
17
|
+
```bash
|
18
|
+
gem install digital_scriptorium
|
19
|
+
```
|
20
|
+
|
21
|
+
## Development
|
22
|
+
|
23
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
24
|
+
|
25
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
26
|
+
|
27
|
+
## Contributing
|
28
|
+
|
29
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/mdholloway/digital_scriptorium.
|
30
|
+
|
31
|
+
## License
|
32
|
+
|
33
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
data/doc/overview.md
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# How it works
|
2
|
+
|
3
|
+
For a general description of the Wikibase data model, see [Wikibase/DataModel](https://www.mediawiki.org/wiki/Wikibase/DataModel) on mediawiki.org.
|
4
|
+
|
5
|
+
The Digital Scriptorium Wikibase data export is a JSON-formatted array of Wikibase entities. The bulk of the entities in the export consist of triplets that together form a meta-record consisting of one each of the DS Catalog core model types: manuscripts, holdings, and records. The export also contains entities representing property definitions and authoritative references to common topics.
|
6
|
+
|
7
|
+
The [ExportRepresenter](../lib/digital_scriptorium/export_representer.rb) class can be used to deserialize an export in its entirety. The resulting [Export](../lib/digital_scriptorium/export.rb) object is essentially an array of Item and Property objects. Entities in the export are modeled using domain-specific classes provided by the [wikibase_representable](https://rubygems.org/gems/wikibase_representable) gem, such as Items, Properties, Statements (also known as Claims), and Snaks, which represent the primary claim of any statement as well as any qualifiers. Convenience methods are also provided to facilitate extracting data values.
|
8
|
+
|
9
|
+
The conversion script [wikibase_to_solr_new.rb](../wikibase_to_solr_new.rb) proceeds by deserializing the export and converting the resulting array of Wikibase objects to a hash keyed by entity ID. It then iterates over the elements of the hash. When it finds a record item based on the value of its instance-of (P16) claim, it retrieves the linked manuscript item, as well as the holding item linked in turn to the manuscript item, from the export hash by entity ID. It then iterates over the claims attached to manuscript, holding, and record in turn, extracting the Solr fields requested based on the property ID that is the subject of the claim and adding them to the Solr record to be produced for the meta-record. Claims for most properties are transformed to Solr fields using a generic algorithm implemented in [ClaimTransformer](../lib/digital_scriptorium/claim_transformer.rb). Name and date claims require some special handling, and are handled in dedicated claim transformer classes ([NameClaimTransformer](../lib/digital_scriptorium/name_claim_transformer.rb) and [DateClaimTransformer](../lib/digital_scriptorium/date_claim_transformer.rb) respectively). After all claims from the manuscript, holding, and record have been processed, the resulting Solr record is written to the output file.
|
10
|
+
|
11
|
+
The specific Solr fields produced for each claim are controlled by the configuration file [property_config.yml](../property_config.yml). This file also defines the prefix (representing the property name) to be attached to each field for a given property, and whether a claim based on the property might have a related authority qualifier.
|
12
|
+
|
13
|
+
The script was written so as not to rely on the structure of the export file beyond that it will be a JSON array of Wikibase entities, with records linked to manuscripts and manuscripts linked to holdings by P3 (described manuscript) and P2 (holding) claims respectively.
|
@@ -0,0 +1,72 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'wikibase_representable'
|
4
|
+
|
5
|
+
module DigitalScriptorium
|
6
|
+
# Transformer for converting claims of Digital Scriptorium items into Solr fields.
|
7
|
+
class ClaimTransformer
  include PropertyId
  include WikibaseRepresentable::Model

  # Builds the Solr fields for one claim.
  #
  # claim       - the claim to transform.
  # export_hash - export entities keyed by entity ID, used to resolve references.
  # config      - the property's configuration hash; reads 'prefix', 'fields'
  #               and (optionally) 'authority'.
  #
  # Returns a Hash of Solr field name => Array of values. When the claim has an
  # authority qualifier that resolves to an entity in export_hash, the
  # authority's English label drives the search and facet fields and no
  # images_facet/link fields are produced.
  def self.transform(claim, export_hash, config)
    prefix = config['prefix']
    fields = config['fields']
    authority_property_id = config['authority']
    primary = primary_value_from_claim(claim, export_hash)

    result = {}
    result['id'] = [primary] if fields.include?('id')
    result["#{prefix}_meta"] = [primary] if fields.include?('meta')

    display = { 'PV' => primary }
    authority = linked_authority(claim, export_hash, authority_property_id)

    if authority
      label = authority.label('en')
      display['QL'] = label

      external_uri = authority.claim_by_property_id(EXTERNAL_URI)&.data_value
      wikidata_id = authority.claim_by_property_id(WIKIDATA_QID)&.data_value

      # A Wikidata QID, when present, takes precedence over the external URI.
      uri = wikidata_id ? "https://www.wikidata.org/wiki/#{wikidata_id}" : external_uri
      display['QU'] = uri if uri

      searchable = [primary, label].uniq
      facets = [label]
    else
      searchable = [primary]
      facets = [primary]
    end

    result["#{prefix}_display"] = [display.to_json] if fields.include?('display')
    result["#{prefix}_search"] = searchable if fields.include?('search')
    result["#{prefix}_facet"] = facets if fields.include?('facet')
    return result if authority

    result['images_facet'] = ['Yes'] if primary && claim.property_id == IIIF_MANIFEST
    result["#{prefix}_link"] = [primary] if fields.include?('link')
    result
  end

  # Resolves the entity referenced by the claim's authority qualifier.
  # Returns nil when no authority property is configured, the claim carries no
  # such qualifier, or the referenced ID is not a key of export_hash.
  def self.linked_authority(claim, export_hash, authority_property_id)
    return nil unless authority_property_id && claim.qualifiers_by_property_id?(authority_property_id)

    export_hash[claim.qualifier_by_property_id(authority_property_id).entity_id_value]
  end
  private_class_method :linked_authority

  # Extracts the claim's main value: the English label of the referenced item
  # for entity-ID values, the time value for time values, and the plain data
  # value otherwise.
  def self.primary_value_from_claim(claim, export_hash)
    return export_hash[claim.entity_id_value].label('en') if claim.value_type?(EntityIdValue)
    return claim.time_value if claim.value_type?(TimeValue)

    claim.data_value
  end
end
|
72
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'time'
|
4
|
+
|
5
|
+
module DigitalScriptorium
|
6
|
+
# Special-purpose transformer for date (P23) claims
|
7
|
+
class DateClaimTransformer
  include PropertyId

  # Solr integer field => qualifier property whose time value supplies the year.
  YEAR_FIELD_QUALIFIERS = {
    'century_int' => CENTURY,
    'earliest_int' => EARLIEST_DATE,
    'latest_int' => LATEST_DATE
  }.freeze

  # Builds the Solr fields for a production date claim.
  #
  # Starts from the generic ClaimTransformer output and, when the claim has
  # qualifiers, adds century_int / earliest_int / latest_int holding the year
  # parsed from the corresponding qualifier's time value.
  #
  # claim       - the date claim to transform.
  # export_hash - export entities keyed by entity ID.
  # config      - the property's configuration hash, passed through to
  #               ClaimTransformer.
  #
  # Returns a Hash of Solr field name => Array of values.
  def self.transform(claim, export_hash, config)
    solr_props = ClaimTransformer.transform(claim, export_hash, config)
    return solr_props unless claim.qualifiers

    YEAR_FIELD_QUALIFIERS.each do |field, property_id|
      # Safe navigation: a claim may carry some of these qualifiers but not
      # others, in which case the field is simply omitted instead of raising.
      time = claim.qualifier_by_property_id(property_id)&.time_value
      solr_props[field] = [Time.parse(time).year] unless time.nil?
    end

    solr_props
  end
end
|
25
|
+
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'wikibase_representable'
|
4
|
+
|
5
|
+
module DigitalScriptorium
|
6
|
+
# Represents a Digital Scriptorium item
|
7
|
+
class DsItem < WikibaseRepresentable::Model::Item
  # All instance-of (P16) claims attached to this item.
  def instance_of_claims
    claims_by_property_id(PropertyId::INSTANCE_OF)
  end

  # Data value of the DS ID (P1) claim; nil when the claim lookup yields nil.
  def ds_id
    id_claim = claim_by_property_id(PropertyId::DS_ID)
    id_claim&.data_value
  end

  # Entity ID referenced by the holding (P2) claim; nil when the lookup yields nil.
  def holding_id
    holding_claim = claim_by_property_id(PropertyId::MANUSCRIPT_HOLDING)
    holding_claim&.entity_id_value
  end

  # Entity ID referenced by the described-manuscript (P3) claim; nil when the
  # lookup yields nil.
  def described_manuscript_id
    manuscript_claim = claim_by_property_id(PropertyId::DESCRIBED_MANUSCRIPT)
    manuscript_claim&.entity_id_value
  end

  # Value of the IIIF manifest (P41) claim, read through entity_id_value.
  # NOTE(review): P41 is configured as a "link" field, so it is presumably a
  # URL rather than an entity reference - confirm entity_id_value is the
  # intended accessor here.
  def iiif_manifest
    manifest_claim = claim_by_property_id(PropertyId::IIIF_MANIFEST)
    manifest_claim&.entity_id_value
  end

  # True when any instance-of claim points at one of the core model items.
  def core_model_item?
    instance_of_claims.any? do |claim|
      ItemId::CORE_MODEL_ITEMS.include?(claim.entity_id_value)
    end
  end

  # True when the item is an instance of the manuscript item.
  def manuscript?
    instance_of_item?(ItemId::MANUSCRIPT)
  end

  # True when the item is an instance of the holding item.
  def holding?
    instance_of_item?(ItemId::HOLDING)
  end

  # True when the item is an instance of the record item.
  def record?
    instance_of_item?(ItemId::RECORD)
  end

  private

  # True when any instance-of claim references the given item ID.
  def instance_of_item?(item_id)
    instance_of_claims.any? { |claim| claim.entity_id_value == item_id }
  end
end
|
44
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DigitalScriptorium
|
4
|
+
# Represents a meta record consisting of a manuscript, its holding information, and metadata record.
|
5
|
+
class DsMeta
  attr_reader :holding, :manuscript, :record

  # Assembles the meta record for a record item.
  #
  # record      - the record item; must respond to described_manuscript_id.
  # export_hash - export entities keyed by entity ID.
  #
  # The manuscript is looked up by the record's described-manuscript ID and
  # the holding by that manuscript's holding ID.
  def initialize(record, export_hash)
    @record = record
    @manuscript = export_hash[record.described_manuscript_id]
    @holding = export_hash[@manuscript.holding_id]
  end
end
|
17
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DigitalScriptorium
|
4
|
+
# Simple model class representing a Wikibase JSON export.
|
5
|
+
# Provides a to_hash method to facilitate entity lookups by ID.
|
6
|
+
class Export < Array
  # Indexes the exported entities by their id.
  #
  # Returns a Hash mapping each element's id to the element itself; when two
  # elements share an id, the later one wins.
  def to_hash
    each_with_object({}) do |entity, by_id|
      by_id[entity.id] = entity
    end
  end
end
|
15
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'representable/json/collection'
|
4
|
+
require 'wikibase_representable'
|
5
|
+
|
6
|
+
module DigitalScriptorium
|
7
|
+
# Representer class for deserializing Wikibase data exports from JSON.
|
8
|
+
class ExportRepresenter < Representable::Decorator
  include Representable::JSON::Collection
  include WikibaseRepresentable::Model
  include WikibaseRepresentable::Representers

  # Each element of the export array is either an item or a property.
  # Both callables branch on whether the element's type equals
  # Item::ENTITY_TYPE: items get ItemRepresenter/DsItem, everything else gets
  # PropertyRepresenter/Property.
  # NOTE(review): the decorator callable reads `input.type` while the class
  # callable reads `input['type']` - presumably the former receives the built
  # model object and the latter the raw parsed fragment; confirm against the
  # representable gem.
  items(
    decorator: ->(input:, **) { input.type == Item::ENTITY_TYPE ? ItemRepresenter : PropertyRepresenter },
    class: ->(input:, **) { input['type'] == Item::ENTITY_TYPE ? DsItem : Property }
  )
end
|
19
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DigitalScriptorium
|
4
|
+
# An item representing a Digital Scriptorium holding (instance of Q2)
|
5
|
+
class Holding < DsItem
  # The bare property constants used below live in the PropertyId module, so
  # it must be mixed in; without it every method here raises NameError.
  include PropertyId

  # Claims for the holding institution as recorded (P5).
  def institution_as_recorded_claims
    claims_by_property_id HOLDING_INSTITUTION_AS_RECORDED # P5
  end

  # Claims for the holding status (P6).
  def status_claims
    claims_by_property_id HOLDING_STATUS # P6
  end

  # Claims for the institutional ID (P7).
  def institutional_id_claims
    claims_by_property_id INSTITUTIONAL_ID # P7
  end

  # Claims for the shelfmark (P8).
  def shelfmark_claims
    claims_by_property_id SHELFMARK # P8
  end

  # Claims for the link to the institutional record (P9).
  def link_to_institutional_record_claims
    claims_by_property_id LINK_TO_INSTITUTIONAL_RECORD # P9
  end

  # Claims for the start time (P38).
  def start_time_claims
    claims_by_property_id START_TIME # P38
  end

  # Claims for the end time (P39).
  def end_time_claims
    claims_by_property_id END_TIME # P39
  end
end
|
34
|
+
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DigitalScriptorium
|
4
|
+
# An item representing a Digital Scriptorium manuscript (instance of Q1)
|
5
|
+
class Manuscript < DsItem
  include PropertyId

  # Data value of the manuscript's DS ID (P1) claim.
  # No safe navigation is used, so a nil claim lookup raises NoMethodError.
  def ds_id
    ds_id_claim = claim_by_property_id(DS_ID)
    ds_id_claim.data_value
  end

  # Entity ID of the holding referenced by the manuscript's P2 claim.
  # No safe navigation is used, so a nil claim lookup raises NoMethodError.
  def holding_id
    holding_claim = claim_by_property_id(MANUSCRIPT_HOLDING)
    holding_claim.entity_id_value
  end
end
|
16
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DigitalScriptorium
|
4
|
+
# Special-purpose transformer for name (P14) claims
|
5
|
+
class NameClaimTransformer
  include PropertyId

  # Builds the Solr fields for an associated-name (P14) claim.
  #
  # The field prefix is derived from the English label of the role (P15)
  # qualifier: the last whitespace-separated word, downcased. Claims without
  # a role qualifier yield no fields.
  #
  # claim       - the name claim to transform.
  # export_hash - export entities keyed by entity ID, used to resolve the
  #               role and authority-name items.
  #
  # Returns a Hash with "<prefix>_display", "<prefix>_search" and
  # "<prefix>_facet" entries, or an empty Hash when there is no role qualifier.
  def self.transform(claim, export_hash)
    return {} unless claim.qualifiers_by_property_id? ROLE_IN_AUTHORITY_FILE

    role_entity_id = claim.qualifier_by_property_id(ROLE_IN_AUTHORITY_FILE).entity_id_value
    role_label = export_hash[role_entity_id].label('en')
    prefix = role_label.downcase.split.last

    recorded_name = claim.data_value
    display_names = { 'PV' => recorded_name }
    search_names = [recorded_name]
    facet_name = recorded_name

    name_in_original_script = claim.qualifier_by_property_id(IN_ORIGINAL_SCRIPT)&.data_value&.value
    if name_in_original_script
      display_names['AGR'] = name_in_original_script
      search_names << name_in_original_script
    end

    if claim.qualifiers_by_property_id? NAME_IN_AUTHORITY_FILE
      name_entity_id = claim.qualifier_by_property_id(NAME_IN_AUTHORITY_FILE).entity_id_value
      name_item = export_hash[name_entity_id]
      name_label = name_item.label('en')

      # The authority item may have no Wikidata QID claim; in that case the
      # link is left out rather than raising or emitting a URL with no ID.
      wikidata_id = name_item.claim_by_property_id(WIKIDATA_QID)&.data_value

      search_names << name_label
      display_names['QL'] = name_label
      display_names['QU'] = "https://www.wikidata.org/wiki/#{wikidata_id}" if wikidata_id

      # With an authority name available, facet on it instead of the recorded form.
      facet_name = name_label
    end

    {
      "#{prefix}_display" => [display_names.to_json],
      "#{prefix}_search" => search_names,
      "#{prefix}_facet" => [facet_name]
    }
  end
end
|
50
|
+
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DigitalScriptorium
|
4
|
+
# Constants for Digital Scriptorium Wikibase property IDs
|
5
|
+
module PropertyId
|
6
|
+
DS_ID = 'P1'
|
7
|
+
MANUSCRIPT_HOLDING = 'P2'
|
8
|
+
DESCRIBED_MANUSCRIPT = 'P3'
|
9
|
+
HOLDING_INSTITUTION_IN_AUTHORITY_FILE = 'P4'
|
10
|
+
HOLDING_INSTITUTION_AS_RECORDED = 'P5' # qualifiers: P4
|
11
|
+
HOLDING_STATUS = 'P6'
|
12
|
+
INSTITUTIONAL_ID = 'P7'
|
13
|
+
SHELFMARK = 'P8'
|
14
|
+
LINK_TO_INSTITUTIONAL_RECORD = 'P9'
|
15
|
+
TITLE_AS_RECORDED = 'P10' # qualifiers: P11, P13
|
16
|
+
STANDARD_TITLE = 'P11'
|
17
|
+
UNIFORM_TITLE_AS_RECORDED = 'P12'
|
18
|
+
IN_ORIGINAL_SCRIPT = 'P13'
|
19
|
+
ASSOCIATED_NAME_AS_RECORDED = 'P14' # qualifiers: P15, P17
|
20
|
+
ROLE_IN_AUTHORITY_FILE = 'P15'
|
21
|
+
INSTANCE_OF = 'P16'
|
22
|
+
NAME_IN_AUTHORITY_FILE = 'P17'
|
23
|
+
GENRE_AS_RECORDED = 'P18' # qualifiers: P20
|
24
|
+
SUBJECT_AS_RECORDED = 'P19' # qualifiers: P20
|
25
|
+
TERM_IN_AUTHORITY_FILE = 'P20'
|
26
|
+
LANGUAGE_AS_RECORDED = 'P21' # qualifiers: P22
|
27
|
+
LANGUAGE_IN_AUTHORITY_FILE = 'P22'
|
28
|
+
PRODUCTION_DATE_AS_RECORDED = 'P23' # qualifiers: P25, P24, P37, P36
|
29
|
+
PRODUCTION_CENTURY_IN_AUTHORITY_FILE = 'P24'
|
30
|
+
CENTURY = 'P25'
|
31
|
+
DATED = 'P26'
|
32
|
+
PRODUCTION_PLACE_AS_RECORDED = 'P27' # qualifiers: P28
|
33
|
+
PLACE_IN_AUTHORITY_FILE = 'P28'
|
34
|
+
PHYSICAL_DESCRIPTION = 'P29'
|
35
|
+
MATERIAL_AS_RECORDED = 'P30' # qualifiers: P31
|
36
|
+
MATERIAL_IN_AUTHORITY_FILE = 'P31'
|
37
|
+
NOTE = 'P32'
|
38
|
+
ACKNOWLEDGEMENTS = 'P33'
|
39
|
+
DATE_ADDED = 'P34'
|
40
|
+
DATE_LAST_UPDATED = 'P35'
|
41
|
+
LATEST_DATE = 'P36'
|
42
|
+
EARLIEST_DATE = 'P37'
|
43
|
+
START_TIME = 'P38'
|
44
|
+
END_TIME = 'P39'
|
45
|
+
EXTERNAL_IDENTIFIER = 'P40'
|
46
|
+
IIIF_MANIFEST = 'P41'
|
47
|
+
WIKIDATA_QID = 'P42'
|
48
|
+
VIAF_ID = 'P43'
|
49
|
+
EXTERNAL_URI = 'P44'
|
50
|
+
EQUIVALENT_PROPERTY = 'P45'
|
51
|
+
FORMATTER_URL = 'P46'
|
52
|
+
SUBCLASS_OF = 'P47'
|
53
|
+
end
|
54
|
+
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DigitalScriptorium
|
4
|
+
# An item representing a Digital Scriptorium record (instance of Q3)
|
5
|
+
class Record < DsItem
  include PropertyId

  # Entity ID of the manuscript this record describes (P3).
  # No safe navigation is used, so a nil claim lookup raises NoMethodError.
  def described_manuscript_id
    manuscript_claim = claim_by_property_id(DESCRIBED_MANUSCRIPT)
    manuscript_claim.entity_id_value
  end

  # Title as recorded (P10) claims.
  def title_as_recorded_claims
    claims_by_property_id(TITLE_AS_RECORDED)
  end

  # Uniform title as recorded (P12) claims.
  def uniform_title_as_recorded_claims
    claims_by_property_id(UNIFORM_TITLE_AS_RECORDED)
  end

  # Associated name as recorded (P14) claims.
  def associated_name_as_recorded_claims
    claims_by_property_id(ASSOCIATED_NAME_AS_RECORDED)
  end

  # Genre as recorded (P18) claims.
  def genre_as_recorded_claims
    claims_by_property_id(GENRE_AS_RECORDED)
  end

  # Language as recorded (P21) claims.
  def language_as_recorded_claims
    claims_by_property_id(LANGUAGE_AS_RECORDED)
  end

  # Production date as recorded (P23) claims.
  def production_date_as_recorded_claims
    claims_by_property_id(PRODUCTION_DATE_AS_RECORDED)
  end

  # Dated (P26) claims.
  def dated_claims
    claims_by_property_id(DATED)
  end

  # Production place as recorded (P27) claims.
  def production_place_as_recorded_claims
    claims_by_property_id(PRODUCTION_PLACE_AS_RECORDED)
  end

  # Physical description (P29) claims.
  def physical_description_claims
    claims_by_property_id(PHYSICAL_DESCRIPTION)
  end

  # Material as recorded (P30) claims.
  def material_as_recorded_claims
    claims_by_property_id(MATERIAL_AS_RECORDED)
  end

  # Note (P32) claims.
  def note_claims
    claims_by_property_id(NOTE)
  end

  # Acknowledgements (P33) claims.
  def acknowledgements_claims
    claims_by_property_id(ACKNOWLEDGEMENTS)
  end

  # Date added (P34) claims.
  def date_added_claims
    claims_by_property_id(DATE_ADDED)
  end

  # Date last updated (P35) claims.
  def date_last_updated_claims
    claims_by_property_id(DATE_LAST_UPDATED)
  end

  # IIIF manifest (P41) claims.
  def iiif_manifest_claims
    claims_by_property_id(IIIF_MANIFEST)
  end
end
|
72
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'digital_scriptorium/item_id'
|
4
|
+
require 'digital_scriptorium/property_id'
|
5
|
+
|
6
|
+
require 'digital_scriptorium/ds_item'
|
7
|
+
require 'digital_scriptorium/ds_meta'
|
8
|
+
require 'digital_scriptorium/holding'
|
9
|
+
require 'digital_scriptorium/manuscript'
|
10
|
+
require 'digital_scriptorium/record'
|
11
|
+
|
12
|
+
require 'digital_scriptorium/export'
|
13
|
+
require 'digital_scriptorium/export_representer'
|
14
|
+
|
15
|
+
require 'digital_scriptorium/claim_transformer'
|
16
|
+
require 'digital_scriptorium/date_claim_transformer'
|
17
|
+
require 'digital_scriptorium/name_claim_transformer'
|
data/property_config.yml
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
P1:
|
2
|
+
prefix: id
|
3
|
+
fields:
|
4
|
+
- id
|
5
|
+
- display
|
6
|
+
- search
|
7
|
+
P5:
|
8
|
+
prefix: institution
|
9
|
+
fields:
|
10
|
+
- display
|
11
|
+
- search
|
12
|
+
- facet
|
13
|
+
authority: P4
|
14
|
+
P6:
|
15
|
+
prefix: holding_status
|
16
|
+
fields:
|
17
|
+
- display
|
18
|
+
P8:
|
19
|
+
prefix: shelfmark
|
20
|
+
fields:
|
21
|
+
- display
|
22
|
+
- search
|
23
|
+
P9:
|
24
|
+
prefix: institutional_record
|
25
|
+
fields:
|
26
|
+
- link
|
27
|
+
P10:
|
28
|
+
prefix: title
|
29
|
+
fields:
|
30
|
+
- display
|
31
|
+
- search
|
32
|
+
- facet
|
33
|
+
authority: P11
|
34
|
+
P12:
|
35
|
+
prefix: uniform_title
|
36
|
+
fields:
|
37
|
+
- search
|
38
|
+
# NOTE: P14 can translate to any of a few different Solr fields based on the value of the
|
39
|
+
# associated role (P15) qualifier, and is handled in its own dedicated processing method
|
40
|
+
P14:
|
41
|
+
prefix: associated_name
|
42
|
+
fields: []
|
43
|
+
P18:
|
44
|
+
prefix: term
|
45
|
+
fields:
|
46
|
+
- display
|
47
|
+
- search
|
48
|
+
- facet
|
49
|
+
authority: P20
|
50
|
+
P19:
|
51
|
+
prefix: term
|
52
|
+
fields:
|
53
|
+
- display
|
54
|
+
- search
|
55
|
+
- facet
|
56
|
+
authority: P20
|
57
|
+
P21:
|
58
|
+
prefix: language
|
59
|
+
fields:
|
60
|
+
- display
|
61
|
+
- search
|
62
|
+
- facet
|
63
|
+
authority: P22
|
64
|
+
P23:
|
65
|
+
prefix: date
|
66
|
+
fields:
|
67
|
+
- meta
|
68
|
+
- display
|
69
|
+
- search
|
70
|
+
- facet
|
71
|
+
authority: P24
|
72
|
+
P26:
|
73
|
+
prefix: dated
|
74
|
+
fields:
|
75
|
+
- display
|
76
|
+
- facet
|
77
|
+
P27:
|
78
|
+
prefix: place
|
79
|
+
fields:
|
80
|
+
- display
|
81
|
+
- search
|
82
|
+
- facet
|
83
|
+
authority: P28
|
84
|
+
P29:
|
85
|
+
prefix: physical_description
|
86
|
+
fields:
|
87
|
+
- display
|
88
|
+
- search
|
89
|
+
P30:
|
90
|
+
prefix: material
|
91
|
+
fields:
|
92
|
+
- facet
|
93
|
+
authority: P31
|
94
|
+
P32:
|
95
|
+
prefix: note
|
96
|
+
fields:
|
97
|
+
- display
|
98
|
+
- search
|
99
|
+
P33:
|
100
|
+
prefix: acknowledgements
|
101
|
+
fields:
|
102
|
+
- display
|
103
|
+
P41:
|
104
|
+
prefix: iiif_manifest
|
105
|
+
fields:
|
106
|
+
- link
|
@@ -0,0 +1,108 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'digital_scriptorium'
|
4
|
+
require 'json'
|
5
|
+
require 'optparse'
|
6
|
+
require 'time'
|
7
|
+
require 'tty-spinner'
|
8
|
+
require 'yaml'
|
9
|
+
require 'zlib'
|
10
|
+
|
11
|
+
# Default paths are resolved relative to the directory containing this script,
# as are any paths supplied on the command line.
dir = File.dirname __FILE__

input_file = File.expand_path 'wikibase_export.json.gz', dir
output_file = File.expand_path 'solr_import.json', dir
config_file = File.expand_path 'property_config.yml', dir
pretty_print = false

OptionParser.new do |opts|
  # Derive the script name at run time so the usage line always matches the
  # file actually being executed.
  opts.banner = "Usage: #{File.basename($PROGRAM_NAME)} [options]"

  opts.on('-i', '--in FILE', 'The file path to the gzipped Wikibase JSON export file.') do |f|
    input_file = File.expand_path f, dir
  end

  opts.on('-o', '--out FILE', 'The file path to output the formatted Solr JSON file.') do |f|
    output_file = File.expand_path f, dir
  end

  opts.on('-c', '--config FILE', 'The file path to the property configuration file.') do |f|
    config_file = File.expand_path f, dir
  end

  opts.on('-p', '--pretty-print', 'Whether to pretty-print the JSON output.') do
    pretty_print = true
  end
end.parse!
|
37
|
+
|
38
|
+
# Combines two Solr field hashes into a new hash.
#
# Fields present in only one hash are carried over unchanged. For a field
# present in both, the value lists are concatenated and de-duplicated, unless
# the existing value is nil, in which case the new value replaces it.
def merge(solr_item, new_props)
  solr_item.merge(new_props) do |_field, existing, incoming|
    next incoming if existing.nil?

    (existing + incoming).uniq
  end
end
|
43
|
+
|
44
|
+
# Transforms one claim into Solr fields and merges them into solr_item.
#
# Associated-name (P14) claims go through NameClaimTransformer, production
# date (P23) claims through DateClaimTransformer, and every other claim
# through the generic ClaimTransformer.
#
# Returns the merged Solr field hash.
def merge_transformed_fields(solr_item, claim, export_hash, property_config)
  if claim.property_id == DigitalScriptorium::PropertyId::ASSOCIATED_NAME_AS_RECORDED
    name_fields = DigitalScriptorium::NameClaimTransformer.transform(claim, export_hash)
    return merge(solr_item, name_fields)
  end

  if claim.property_id == DigitalScriptorium::PropertyId::PRODUCTION_DATE_AS_RECORDED
    date_fields = DigitalScriptorium::DateClaimTransformer.transform(claim, export_hash, property_config)
    return merge(solr_item, date_fields)
  end

  generic_fields = DigitalScriptorium::ClaimTransformer.transform(claim, export_hash, property_config)
  merge(solr_item, generic_fields)
end
|
55
|
+
|
56
|
+
start_time = Time.now

config = YAML.load_file(config_file)

loading_spinner = TTY::Spinner.new('[:spinner] Loading export data', hide_cursor: true)
loading_spinner.auto_spin

# The block form closes the gzip stream as soon as its contents have been read.
export_json = Zlib::GzipReader.open(input_file, &:read)
export_hash = DigitalScriptorium::ExportRepresenter.new(DigitalScriptorium::Export.new)
                                                   .from_json(export_json)
                                                   .to_hash
loaded_time = Time.now
loading_spinner.success("(#{format('%0.02f', loaded_time - start_time)}s)")

item_count = 0
generating_spinner = TTY::Spinner.new('[:spinner] Generating Solr documents', hide_cursor: true)
generating_spinner.auto_spin

File.open(output_file, 'w') do |file|
  file << '['
  file << "\n" if pretty_print

  export_hash.each_value do |entity|
    # Only record items produce a Solr document; every other entity is skipped.
    next unless entity.is_a?(DigitalScriptorium::DsItem) &&
                entity.claims_by_property_id?(DigitalScriptorium::PropertyId::INSTANCE_OF) &&
                entity.record?

    meta = DigitalScriptorium::DsMeta.new(entity, export_hash)
    solr_item = { 'qid_meta' => [meta.holding.id, meta.manuscript.id, meta.record.id] }

    [meta.holding, meta.manuscript, meta.record].each do |item|
      item.claims.each do |property_id, claims|
        claims.each do |claim|
          # Properties without an entry in the config file contribute no fields.
          next unless (property_config = config[property_id])

          solr_item = merge_transformed_fields(solr_item, claim, export_hash, property_config)
        end
      end
    end

    # Write the separator before every document except the first. Deciding it
    # from the position in export_hash would leave a trailing comma (invalid
    # JSON) whenever the last entity in the export is not a record.
    if item_count.positive?
      file << ','
      file << "\n" if pretty_print
    end
    file << (pretty_print ? JSON.pretty_generate(solr_item) : JSON.generate(solr_item))

    item_count += 1
  end

  file << "\n" if pretty_print && item_count.positive?
  file << ']'
end

finish_time = Time.now
generating_spinner.success("(#{format('%0.02f', finish_time - loaded_time)}s)")
puts "Generated #{item_count} Solr documents in #{format('%0.02f', finish_time - start_time)} seconds"
|
metadata
ADDED
@@ -0,0 +1,121 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: digital_scriptorium
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Michael Holloway
|
8
|
+
bindir: exe
|
9
|
+
cert_chain: []
|
10
|
+
date: 2025-01-06 00:00:00.000000000 Z
|
11
|
+
dependencies:
|
12
|
+
- !ruby/object:Gem::Dependency
|
13
|
+
name: multi_json
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
15
|
+
requirements:
|
16
|
+
- - "~>"
|
17
|
+
- !ruby/object:Gem::Version
|
18
|
+
version: '1.15'
|
19
|
+
type: :runtime
|
20
|
+
prerelease: false
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
22
|
+
requirements:
|
23
|
+
- - "~>"
|
24
|
+
- !ruby/object:Gem::Version
|
25
|
+
version: '1.15'
|
26
|
+
- !ruby/object:Gem::Dependency
|
27
|
+
name: representable
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - "~>"
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '3.2'
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - "~>"
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '3.2'
|
40
|
+
- !ruby/object:Gem::Dependency
|
41
|
+
name: tty-spinner
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - "~>"
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '0.9'
|
47
|
+
type: :runtime
|
48
|
+
prerelease: false
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - "~>"
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0.9'
|
54
|
+
- !ruby/object:Gem::Dependency
|
55
|
+
name: wikibase_representable
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - "~>"
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '0.1'
|
61
|
+
type: :runtime
|
62
|
+
prerelease: false
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - "~>"
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: '0.1'
|
68
|
+
email:
|
69
|
+
- michael@mdholloway.org
|
70
|
+
executables: []
|
71
|
+
extensions: []
|
72
|
+
extra_rdoc_files: []
|
73
|
+
files:
|
74
|
+
- ".ignore"
|
75
|
+
- ".rspec"
|
76
|
+
- ".rubocop.yml"
|
77
|
+
- LICENSE.txt
|
78
|
+
- README.md
|
79
|
+
- Rakefile
|
80
|
+
- doc/overview.md
|
81
|
+
- lib/digital_scriptorium.rb
|
82
|
+
- lib/digital_scriptorium/claim_transformer.rb
|
83
|
+
- lib/digital_scriptorium/date_claim_transformer.rb
|
84
|
+
- lib/digital_scriptorium/ds_item.rb
|
85
|
+
- lib/digital_scriptorium/ds_meta.rb
|
86
|
+
- lib/digital_scriptorium/export.rb
|
87
|
+
- lib/digital_scriptorium/export_representer.rb
|
88
|
+
- lib/digital_scriptorium/holding.rb
|
89
|
+
- lib/digital_scriptorium/item_id.rb
|
90
|
+
- lib/digital_scriptorium/manuscript.rb
|
91
|
+
- lib/digital_scriptorium/name_claim_transformer.rb
|
92
|
+
- lib/digital_scriptorium/property_id.rb
|
93
|
+
- lib/digital_scriptorium/record.rb
|
94
|
+
- lib/digital_scriptorium/version.rb
|
95
|
+
- property_config.yml
|
96
|
+
- sig/digital_scriptorium.rbs
|
97
|
+
- wikibase_to_solr_new.rb
|
98
|
+
homepage: https://github.com/mdholloway/digital_scriptorium
|
99
|
+
licenses:
|
100
|
+
- MIT
|
101
|
+
metadata:
|
102
|
+
homepage_uri: https://github.com/mdholloway/digital_scriptorium
|
103
|
+
rubygems_mfa_required: 'true'
|
104
|
+
rdoc_options: []
|
105
|
+
require_paths:
|
106
|
+
- lib
|
107
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
108
|
+
requirements:
|
109
|
+
- - ">="
|
110
|
+
- !ruby/object:Gem::Version
|
111
|
+
version: 3.0.0
|
112
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
113
|
+
requirements:
|
114
|
+
- - ">="
|
115
|
+
- !ruby/object:Gem::Version
|
116
|
+
version: '0'
|
117
|
+
requirements: []
|
118
|
+
rubygems_version: 3.6.2
|
119
|
+
specification_version: 4
|
120
|
+
summary: Supporting code for the Digital Scriptorium DS Catalog 2.0 project
|
121
|
+
test_files: []
|