cdmbl 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.travis.yml +5 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +124 -0
- data/Rakefile +11 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/cdmbl.gemspec +37 -0
- data/lib/cdmbl/default_callback.rb +8 -0
- data/lib/cdmbl/default_solr.rb +25 -0
- data/lib/cdmbl/etl_run.rb +69 -0
- data/lib/cdmbl/etl_worker.rb +30 -0
- data/lib/cdmbl/extractor.rb +75 -0
- data/lib/cdmbl/field_formatter.rb +14 -0
- data/lib/cdmbl/field_transformer.rb +40 -0
- data/lib/cdmbl/formatters.rb +107 -0
- data/lib/cdmbl/hooks.rb +25 -0
- data/lib/cdmbl/loader.rb +19 -0
- data/lib/cdmbl/oai_filter.rb +44 -0
- data/lib/cdmbl/oai_request.rb +40 -0
- data/lib/cdmbl/oai_set_lookup.rb +23 -0
- data/lib/cdmbl/rake_task.rb +6 -0
- data/lib/cdmbl/record_transformer.rb +29 -0
- data/lib/cdmbl/tasks/etl.rake +11 -0
- data/lib/cdmbl/transformer.rb +147 -0
- data/lib/cdmbl/version.rb +3 -0
- data/lib/cdmbl.rb +16 -0
- metadata +253 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 8924ce455e1357c39e7c19d768585b129893a9c7
|
4
|
+
data.tar.gz: ddc8ad09c407ac6b11beae7d823e8b4ce4541a3f
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 7c784f79054655b97f606851baa1f64f5fbcef07a3858bc1156612b9f21eb7c8d0eaea742ae1f3210de9a136e709c1a196be47f628047e18d982b15dd74f5400
|
7
|
+
data.tar.gz: 0e015661706bfcca63d6a8d38443b70672f37add879cffc5b4021d89a9d7e6ae38676b6fa0b5e5922f043a9be10e9f720e87e3c687e52dd089b76199f20d4c28
|
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
# Contributor Code of Conduct
|
2
|
+
|
3
|
+
As contributors and maintainers of this project, and in the interest of
|
4
|
+
fostering an open and welcoming community, we pledge to respect all people who
|
5
|
+
contribute through reporting issues, posting feature requests, updating
|
6
|
+
documentation, submitting pull requests or patches, and other activities.
|
7
|
+
|
8
|
+
We are committed to making participation in this project a harassment-free
|
9
|
+
experience for everyone, regardless of level of experience, gender, gender
|
10
|
+
identity and expression, sexual orientation, disability, personal appearance,
|
11
|
+
body size, race, ethnicity, age, religion, or nationality.
|
12
|
+
|
13
|
+
Examples of unacceptable behavior by participants include:
|
14
|
+
|
15
|
+
* The use of sexualized language or imagery
|
16
|
+
* Personal attacks
|
17
|
+
* Trolling or insulting/derogatory comments
|
18
|
+
* Public or private harassment
|
19
|
+
* Publishing other's private information, such as physical or electronic
|
20
|
+
addresses, without explicit permission
|
21
|
+
* Other unethical or unprofessional conduct
|
22
|
+
|
23
|
+
Project maintainers have the right and responsibility to remove, edit, or
|
24
|
+
reject comments, commits, code, wiki edits, issues, and other contributions
|
25
|
+
that are not aligned to this Code of Conduct, or to ban temporarily or
|
26
|
+
permanently any contributor for other behaviors that they deem inappropriate,
|
27
|
+
threatening, offensive, or harmful.
|
28
|
+
|
29
|
+
By adopting this Code of Conduct, project maintainers commit themselves to
|
30
|
+
fairly and consistently applying these principles to every aspect of managing
|
31
|
+
this project. Project maintainers who do not follow or enforce the Code of
|
32
|
+
Conduct may be permanently removed from the project team.
|
33
|
+
|
34
|
+
This code of conduct applies both within project spaces and in public spaces
|
35
|
+
when an individual is representing the project or its community.
|
36
|
+
|
37
|
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
38
|
+
reported by contacting a project maintainer at fenne035@umn.edu. All
|
39
|
+
complaints will be reviewed and investigated and will result in a response that
|
40
|
+
is deemed necessary and appropriate to the circumstances. Maintainers are
|
41
|
+
obligated to maintain confidentiality with regard to the reporter of an
|
42
|
+
incident.
|
43
|
+
|
44
|
+
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
|
45
|
+
version 1.3.0, available at
|
46
|
+
[http://contributor-covenant.org/version/1/3/0/][version]
|
47
|
+
|
48
|
+
[homepage]: http://contributor-covenant.org
|
49
|
+
[version]: http://contributor-covenant.org/version/1/3/0/
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2016 University of Minnesota Libraries
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,124 @@
|
|
1
|
+
# CDMBL: CONTENTdm on Blacklight
|
2
|
+
|
3
|
+
Use [Blacklight](https://github.com/projectblacklight/blacklight) as a front end for your CONTENTdm instance.
|
4
|
+
|
5
|
+
At the moment, CDMBL consists only of a micro [ETL](https://en.wikipedia.org/wiki/Extract,_transform,_load) system dedicated to extracting metadata records from a CONTENTdm instance (using the [CONTENTdm API gem](https://github.com/UMNLibraries/contentdm_api)), transforming them into Solr documents, and loading them into Solr. After initially populating the entire index, CDMBL allows for selective harvesting for incremental Solr index updates.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
gem 'cdmbl', :git => 'https://github.com/UMNLibraries/cdmbl'
|
13
|
+
```
|
14
|
+
|
15
|
+
And then execute:
|
16
|
+
|
17
|
+
$ bundle
|
18
|
+
|
19
|
+
Add the CDMBL rake task to your project Rakefile:
|
20
|
+
|
21
|
+
```ruby
|
22
|
+
require 'cdmbl/rake_task'
|
23
|
+
```
|
24
|
+
|
25
|
+
## Usage
|
26
|
+
|
27
|
+
Run the ingester
|
28
|
+
|
29
|
+
rake cdmbl:ingest[solr_url,oai_endpoint,cdm_endpoint,minimum_date]
|
30
|
+
|
31
|
+
|Argument| Definition|
|
32
|
+
|--:|---|
|
33
|
+
|solr_url| The full URL to your Solr core instance (same as your blacklight.yml solr url)|
|
34
|
+
|oai_endpoint| A URL to your OAI instance (e.g. http://reflections.mndigital.org/oai/oai.php) |
|
35
|
+
|cdm_endpoint| A URL to your CONTENTdm API endpoint (e.g. https://server16022.contentdm.oclc.org/dmwebservices/index.php) |
|
36
|
+
|minimum_date| Date from which to [selectively harvest](https://www.openarchives.org/OAI/openarchivesprotocol.html#SelectiveHarvesting) identifiers from the OAI endpoint. These identifiers are used to determine which records to delete from your index and which records to request from the CONTENTdm API|
|
37
|
+
|
38
|
+
For example:
|
39
|
+
|
40
|
+
```ruby
|
41
|
+
rake "cdmbl:ingest[http://solr:8983/solr/foo-bar-core, http://reflections.mndigital.org/oai/oai.php, https://server16022.contentdm.oclc.org/dmwebservices/index.php, 2015-01-01]"
|
42
|
+
```
|
43
|
+
|
44
|
+
### Custom Rake Task
|
45
|
+
|
46
|
+
You might also create your own rake task to run your modified field transformers:
|
47
|
+
|
48
|
+
```ruby
|
49
|
+
require 'cdmbl'
|
50
|
+
|
51
|
+
namespace :cdmbl do
|
52
|
+
desc 'Launch a background job to index metadata from CONTENTdm into Solr.'
|
53
|
+
task :ingest do
|
54
|
+
solr_config = { url: 'http://solr:8983/solr/foo-bar-core' }
|
55
|
+
etl_config = { oai_endpoint: 'http://reflections.mndigital.org/oai/oai.php',
|
56
|
+
cdm_endpoint: 'https://server16022.contentdm.oclc.org/dmwebservices/index.php',
|
57
|
+
field_mappings: my_field_mappings,
|
58
|
+
minimum_date: '2016-09-01'}
|
59
|
+
CDMBL::ETLWorker.perform_async(solr_config, etl_config)
|
60
|
+
end
|
61
|
+
```
|
62
|
+
|
63
|
+
### Your Own Custom Solr Field Mappings (see above code snippet)
|
64
|
+
|
65
|
+
The default CONTENTdm to Solr field transformation rules may be overridden by calling the CDMBL::ETLWorker (a [Sidekiq worker](https://github.com/mperham/sidekiq)) directly. These rules may be found in the default_mappings method of the [CDMBL::Transformer Class](https://github.com/UMNLibraries/cdmbl/blob/master/lib/cdmbl/transformer.rb).
|
66
|
+
|
67
|
+
The transformer expects mappings in the following format:
|
68
|
+
|
69
|
+
```ruby
|
70
|
+
def your_custom_field_mappings
|
71
|
+
[
|
72
|
+
{dest_path: 'title_tei', origin_path: 'title', formatters: [StripFormatter]},
|
73
|
+
]
|
74
|
+
end
|
75
|
+
```
|
76
|
+
|Argument| Definition|
|
77
|
+
|--:|---|
|
78
|
+
|dest_path| The 'destination path' is the name of the field you will be sending to Solr for this field mapping. |
|
79
|
+
|origin_path| Where to get the field data from the original record for this mapping. |
|
80
|
+
|formatters| [Formatters](https://github.com/UMNLibraries/cdmbl/blob/master/lib/cdmbl/formatters.rb) perform tasks such as stripping white space or splitting CONTENTdm multi-valued fields (delimited by semicolons) into JSON arrays. |
|
81
|
+
|
82
|
+
**Note:** The first formatter receives the value found at the declared `origin_path`. Each formatter declared after the initial formatter will receive a value produced by the preceding formatter.
|
83
|
+
|
84
|
+
Formatters are very simple stateless classes that take a value, do something to it, and respond with a modified version of this value via a class method called `format`. Examples of other formatters may be found in the [Formatters file](https://github.com/UMNLibraries/cdmbl/blob/master/lib/cdmbl/formatters.rb). For example:
|
85
|
+
|
86
|
+
```ruby
|
87
|
+
class SplitFormatter
|
88
|
+
def self.format(value)
|
89
|
+
(value.respond_to?(:split)) ? value.split(';') : value
|
90
|
+
end
|
91
|
+
end
|
92
|
+
```
|
93
|
+
|
94
|
+
You might also want to simply override some of the default mappings or add your own:
|
95
|
+
|
96
|
+
```ruby
|
97
|
+
mappings = CDMBL::Transformer.default_mappings + your_custom_field_mappings
|
98
|
+
```
|
99
|
+
## A Custom Post-indexing Callback
|
100
|
+
|
101
|
+
If you would like to perform some action (e.g. send an email) following the completion of the CDMBL indexing process, you may declare your own callback hook (anything with "Callback" in the class name declared within the CDMBL module space will be used). To do so in Rails, create a Rails initializer file `config/initializers/cdmbl.rb`:
|
102
|
+
|
103
|
+
```ruby
|
104
|
+
module CDMBL
|
105
|
+
class Callback
|
106
|
+
def self.call!(solr_client)
|
107
|
+
Rails.logger.info("My Custom CDMBL Callback")
|
108
|
+
end
|
109
|
+
end
|
110
|
+
end
|
111
|
+
```
|
112
|
+
## Development
|
113
|
+
|
114
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
115
|
+
|
116
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
117
|
+
|
118
|
+
## Contributing
|
119
|
+
|
120
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/UMNLibraries/cdmbl. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
121
|
+
|
122
|
+
## License
|
123
|
+
|
124
|
+
[MIT](/LICENSE.txt)
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "cdmbl"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require "irb"
|
14
|
+
IRB.start
|
data/bin/setup
ADDED
data/cdmbl.gemspec
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'cdmbl/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = 'cdmbl'
|
8
|
+
spec.version = CDMBL::VERSION
|
9
|
+
spec.authors = ['chadfennell']
|
10
|
+
spec.email = ['fenne035@umn.edu']
|
11
|
+
|
12
|
+
spec.summary = %q{Use Blacklight (Solr) as a front end for your CONTENTdm instance.}
|
13
|
+
spec.license = 'MIT'
|
14
|
+
|
15
|
+
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
16
|
+
spec.bindir = 'exe'
|
17
|
+
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
18
|
+
spec.require_paths = ['lib']
|
19
|
+
|
20
|
+
spec.add_dependency 'hash_at_path', '~> 0.1'
|
21
|
+
spec.add_dependency 'contentdm_api', '~> 0.2'
|
22
|
+
spec.add_dependency 'sidekiq', '~> 3.4'
|
23
|
+
spec.add_dependency 'titleize', '~> 1.4'
|
24
|
+
spec.add_dependency 'rsolr', '~> 1.0'
|
25
|
+
# This gem generally wants to be in a rails app, but just to avoid adding
|
26
|
+
# another external dependency for XML processing, we rely on activesupport's
|
27
|
+
# Hash.to_xml feature for testing and to allow this gem to function
|
28
|
+
# independently from a rails app
|
29
|
+
spec.add_dependency 'activesupport', '~> 4.2'
|
30
|
+
|
31
|
+
spec.add_development_dependency 'bundler', '~> 1.12'
|
32
|
+
spec.add_development_dependency 'rake', '~> 10.0'
|
33
|
+
spec.add_development_dependency 'minitest', '~> 5.0'
|
34
|
+
spec.add_development_dependency 'yard', '~> 0.9.0'
|
35
|
+
spec.add_development_dependency 'webmock', '~> 1.24', '>= 1.24.0'
|
36
|
+
spec.add_development_dependency 'vcr', '~> 3.0', '>= 3.0.1'
|
37
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'rsolr'
|
2
|
+
|
3
|
+
module CDMBL
|
4
|
+
# Communicate with Solr: add / delete stuff
|
5
|
+
class DefaultSolr
|
6
|
+
attr_reader :url, :client
|
7
|
+
def initialize(url: 'http://localhost:8983', client: RSolr)
|
8
|
+
@url = url
|
9
|
+
@client = client
|
10
|
+
end
|
11
|
+
|
12
|
+
def connection
|
13
|
+
@connection ||= client.connect url: url
|
14
|
+
end
|
15
|
+
|
16
|
+
def add(records)
|
17
|
+
connection.add records
|
18
|
+
connection.commit
|
19
|
+
end
|
20
|
+
|
21
|
+
def delete(ids)
|
22
|
+
connection.delete_by_id ids
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
module CDMBL
|
2
|
+
# TODO: extract params into an ETL Profile and delegate
|
3
|
+
class ETLRun
|
4
|
+
attr_reader :oai_endpoint,
|
5
|
+
:cdm_endpoint,
|
6
|
+
:resumption_token,
|
7
|
+
:field_mappings,
|
8
|
+
:minimum_date,
|
9
|
+
:oai_requester,
|
10
|
+
:extractor,
|
11
|
+
:transformer,
|
12
|
+
:loader,
|
13
|
+
:solr_client
|
14
|
+
def initialize(oai_endpoint: '',
|
15
|
+
cdm_endpoint: '',
|
16
|
+
resumption_token: false,
|
17
|
+
field_mappings: false,
|
18
|
+
minimum_date: '1900-01-01',
|
19
|
+
oai_requester: OaiRequest,
|
20
|
+
extractor: Extractor,
|
21
|
+
transformer: Transformer,
|
22
|
+
loader: Loader,
|
23
|
+
solr_client: SolrClient.new)
|
24
|
+
|
25
|
+
@oai_endpoint = oai_endpoint
|
26
|
+
@cdm_endpoint = cdm_endpoint
|
27
|
+
@resumption_token = resumption_token
|
28
|
+
@field_mappings = field_mappings
|
29
|
+
@oai_requester = oai_requester
|
30
|
+
@minimum_date = minimum_date
|
31
|
+
@extractor = extractor
|
32
|
+
@transformer = transformer
|
33
|
+
@loader = loader
|
34
|
+
@solr_client = solr_client
|
35
|
+
end
|
36
|
+
|
37
|
+
def load!(resumption_token: false)
|
38
|
+
persister.load!
|
39
|
+
end
|
40
|
+
|
41
|
+
def next_resumption_token
|
42
|
+
extraction.next_resumption_token
|
43
|
+
end
|
44
|
+
|
45
|
+
def persister
|
46
|
+
loader.new(records: transformation.records,
|
47
|
+
deletable_ids: extraction.deletable_ids,
|
48
|
+
solr_client: solr_client)
|
49
|
+
end
|
50
|
+
|
51
|
+
def transformation
|
52
|
+
@transformation ||= transformer.new(cdm_records: extraction.records,
|
53
|
+
oai_sets: extraction.set_lookup,
|
54
|
+
field_mappings: field_mappings)
|
55
|
+
end
|
56
|
+
|
57
|
+
def extraction
|
58
|
+
@extraction ||= extractor.new(oai_request: oai_request,
|
59
|
+
cdm_endpoint: cdm_endpoint)
|
60
|
+
end
|
61
|
+
|
62
|
+
def oai_request
|
63
|
+
@oai_request ||= oai_requester.new(base_uri: oai_endpoint,
|
64
|
+
resumption_token: resumption_token,
|
65
|
+
from: minimum_date)
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'sidekiq'
|
2
|
+
module CDMBL
|
3
|
+
class ETLWorker
|
4
|
+
attr_reader :solr_config, :etl_config
|
5
|
+
include Sidekiq::Worker
|
6
|
+
def perform(solr_config, etl_config, recursive = true)
|
7
|
+
@etl_config = etl_config.symbolize_keys
|
8
|
+
@solr_config = solr_config.symbolize_keys
|
9
|
+
puts "Ingesting resumptionToken batch: #{etl_config['resumption_token']}"
|
10
|
+
etl_run.load!
|
11
|
+
if etl_run.next_resumption_token && recursive
|
12
|
+
ETLWorker.perform_async(solr_config, next_etl_config)
|
13
|
+
else
|
14
|
+
CDMBL::Callback.call!(solr_client)
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def etl_run
|
19
|
+
@etl_run ||= ETLRun.new(etl_config.merge(solr_client: solr_client))
|
20
|
+
end
|
21
|
+
|
22
|
+
def solr_client
|
23
|
+
@solr_client ||= CDMBL::Solr.new(solr_config)
|
24
|
+
end
|
25
|
+
|
26
|
+
def next_etl_config
|
27
|
+
etl_config.merge(resumption_token: etl_run.next_resumption_token)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'contentdm_api'
|
2
|
+
require 'active_support/core_ext/hash/conversions'
|
3
|
+
require 'hash_at_path'
|
4
|
+
require 'forwardable'
|
5
|
+
|
6
|
+
module CDMBL
|
7
|
+
# This extractor uses the SimpleGet extractor initially and then makes
|
8
|
+
# subsequent passes at the full ContentDM API with identifiers taken from
|
9
|
+
# the OAI endpoint
|
10
|
+
class Extractor
|
11
|
+
extend ::Forwardable
|
12
|
+
def_delegators :@oai_request, :sets, :identifiers
|
13
|
+
attr_reader :oai_request, :cdm_item, :cdm_endpoint, :oai_set_lookup, :oai_filter
|
14
|
+
|
15
|
+
def initialize(oai_request: OaiRequest.new,
|
16
|
+
cdm_endpoint: '',
|
17
|
+
oai_set_lookup: OAISetLookup,
|
18
|
+
cdm_item: CONTENTdmAPI::Item,
|
19
|
+
oai_filter: OAIFilter)
|
20
|
+
@oai_request = oai_request
|
21
|
+
@cdm_item = cdm_item
|
22
|
+
@cdm_endpoint = cdm_endpoint
|
23
|
+
@oai_set_lookup = oai_set_lookup
|
24
|
+
@oai_filter = oai_filter
|
25
|
+
end
|
26
|
+
|
27
|
+
def set_lookup
|
28
|
+
oai_set_lookup.new(oai_sets: to_hash(sets)).keyed
|
29
|
+
end
|
30
|
+
|
31
|
+
def records
|
32
|
+
local_identifiers.map { |identifier| cdm_request(*identifier) }
|
33
|
+
end
|
34
|
+
|
35
|
+
def deletable_ids
|
36
|
+
oai_ids.deletable_ids
|
37
|
+
end
|
38
|
+
|
39
|
+
def local_identifiers
|
40
|
+
oai_ids.updatable_ids
|
41
|
+
end
|
42
|
+
|
43
|
+
def next_resumption_token
|
44
|
+
oai_identifiers.at_path('OAI_PMH/ListIdentifiers/resumptionToken')
|
45
|
+
end
|
46
|
+
|
47
|
+
private
|
48
|
+
|
49
|
+
def oai_ids
|
50
|
+
oai_filter.new(headers: oai_headers)
|
51
|
+
end
|
52
|
+
|
53
|
+
# Get the local collection and id from an OAI namespaced identifier
|
54
|
+
# e.g. oai:reflections.mndigital.org:p16022coll44/3
|
55
|
+
def extract_identifiers(identifier)
|
56
|
+
identifier.split(':').last.split('/')
|
57
|
+
end
|
58
|
+
|
59
|
+
def oai_headers
|
60
|
+
oai_identifiers.at_path('OAI_PMH/ListIdentifiers/header')
|
61
|
+
end
|
62
|
+
|
63
|
+
def oai_identifiers
|
64
|
+
to_hash(identifiers)
|
65
|
+
end
|
66
|
+
|
67
|
+
def cdm_request(collection, id)
|
68
|
+
cdm_item.new(base_url: cdm_endpoint, collection: collection, id: id).metadata
|
69
|
+
end
|
70
|
+
|
71
|
+
def to_hash(xml)
|
72
|
+
Hash.from_xml(xml)
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module CDMBL
|
2
|
+
class FieldFormatter
|
3
|
+
attr_reader :value, :formatters
|
4
|
+
def initialize(value: {}, formatters: [DefaultFormatter])
|
5
|
+
@value = value
|
6
|
+
@formatters = formatters
|
7
|
+
end
|
8
|
+
|
9
|
+
def format!
|
10
|
+
formatters.reduce(value) { |memo, formatter| formatter.format(memo) }
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'hash_at_path'
|
2
|
+
|
3
|
+
module CDMBL
|
4
|
+
class FieldTransformer
|
5
|
+
attr_reader :field_value, :dest_path, :formatters, :formatter_klass
|
6
|
+
def initialize(origin_path: '',
|
7
|
+
dest_path: '',
|
8
|
+
record: {},
|
9
|
+
formatters: [],
|
10
|
+
formatter_klass: FieldFormatter)
|
11
|
+
@field_value = compact(record.at_path(origin_path))
|
12
|
+
@dest_path = dest_path
|
13
|
+
@formatters = (!formatters.nil?) ? formatters : [DefaultFormatter]
|
14
|
+
@formatter_klass = formatter_klass
|
15
|
+
end
|
16
|
+
|
17
|
+
def reduce
|
18
|
+
(blank?(value)) ? {} : { "#{dest_path}" => value }
|
19
|
+
end
|
20
|
+
|
21
|
+
def value
|
22
|
+
@value ||= (!blank?(field_value)) ? transform_field : nil
|
23
|
+
end
|
24
|
+
|
25
|
+
private
|
26
|
+
|
27
|
+
def compact(record)
|
28
|
+
(record.respond_to?(:compact)) ? record.compact : record
|
29
|
+
end
|
30
|
+
|
31
|
+
# File activesupport/lib/active_support/core_ext/object/blank.rb, line 14
|
32
|
+
def blank?(val)
|
33
|
+
val.respond_to?(:empty?) ? !!val.empty? : !val
|
34
|
+
end
|
35
|
+
|
36
|
+
def transform_field
|
37
|
+
formatter_klass.new(value: field_value, formatters: formatters).format!
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
require 'titleize'
|
2
|
+
# A handful of very simple formatters to clean up CONTENTdm API metadata
|
3
|
+
module CDMBL
|
4
|
+
|
5
|
+
class DefaultFormatter
|
6
|
+
def self.format(value)
|
7
|
+
value
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
class Titlieze
|
12
|
+
def self.format(value)
|
13
|
+
if value.respond_to?(:map)
|
14
|
+
value.map {|value| value.titleize }
|
15
|
+
else
|
16
|
+
value.titleize
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
class ImageId
|
22
|
+
def self.format(value)
|
23
|
+
value
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
class ToJsonFormatter
|
28
|
+
def self.format(values)
|
29
|
+
values.to_json if values.respond_to?(:to_json)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
class StripSemicolonFormatter
|
34
|
+
def self.format(values)
|
35
|
+
if values.respond_to?(:map)
|
36
|
+
values.map {|value| value.gsub(/;/, '') }
|
37
|
+
else
|
38
|
+
values.gsub(/;/, '')
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
class StripFormatter
|
44
|
+
def self.format(values)
|
45
|
+
if values.respond_to?(:map)
|
46
|
+
values.map {|value| value.strip }
|
47
|
+
else
|
48
|
+
values.strip
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
class SplitFormatter
|
54
|
+
def self.format(value)
|
55
|
+
(value.respond_to?(:split)) ? value.split(';') : value
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
class AddSetSpecFormatter
|
60
|
+
def self.format(value)
|
61
|
+
value.merge('setSpec' => value['id'].split('/').first)
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
class SetSpecFormatter
|
66
|
+
def self.format(value)
|
67
|
+
value['setSpec']
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
class CollectionNameFormatter
|
72
|
+
def self.format(value)
|
73
|
+
value['oai_sets'].fetch(value['setSpec'], {})
|
74
|
+
.fetch(:name, '')
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
class CollectionDescriptionFormatter
|
79
|
+
def self.format(value)
|
80
|
+
value['oai_sets'].fetch(value['setSpec'], {})
|
81
|
+
.fetch(:description, '')
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
class FilterBadCollections
|
86
|
+
def self.format(value)
|
87
|
+
(/Collection information undefined/i =~ value) ? '' : value
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
class ToIFormatter
|
92
|
+
def self.format(value)
|
93
|
+
value.to_i if value.respond_to?(:to_i)
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
class LocationFormatter
|
98
|
+
def self.format(record)
|
99
|
+
if record['latitu'] && record['longit'] && record['latitu'] != '' && record['longit'] != '' && record['latitu'] != {}
|
100
|
+
"#{record['latitu']}, #{record['longit']}"
|
101
|
+
else
|
102
|
+
nil
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
end
|
data/lib/cdmbl/hooks.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
module CDMBL
|
2
|
+
def self.const_missing(name)
|
3
|
+
if name.to_s == 'Solr'
|
4
|
+
hook(pattern: name.to_s, default: DefaultSolr)
|
5
|
+
elsif name.to_s == 'Callback'
|
6
|
+
hook(pattern: name.to_s, default: DefaultCallback)
|
7
|
+
end
|
8
|
+
end
|
9
|
+
|
10
|
+
def self.hook(pattern: '', default: false)
|
11
|
+
if find_hook(pattern, default)
|
12
|
+
Object.const_get("CDMBL::#{find_hook(pattern, default)}")
|
13
|
+
else
|
14
|
+
default
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def self.find_hook(pattern, default)
|
19
|
+
CDMBL.constants.find do |konst|
|
20
|
+
if Object.const_get("CDMBL::#{konst}") != default
|
21
|
+
/#{pattern}/ =~ konst.to_s
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
data/lib/cdmbl/loader.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
module CDMBL
|
2
|
+
|
3
|
+
class Loader
|
4
|
+
attr_reader :solr_client, :records, :deletable_ids
|
5
|
+
|
6
|
+
def initialize(records: [],
|
7
|
+
deletable_ids: [],
|
8
|
+
solr_client: CDMBL::DefaultSolr)
|
9
|
+
@solr_client = solr_client
|
10
|
+
@records = records
|
11
|
+
@deletable_ids = deletable_ids
|
12
|
+
end
|
13
|
+
|
14
|
+
def load!
|
15
|
+
solr_client.delete deletable_ids unless deletable_ids.empty?
|
16
|
+
solr_client.add records
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module CDMBL
|
2
|
+
# This class has been named in a way that makes it hard to pronounce
|
3
|
+
class OAIFilter
|
4
|
+
attr_reader :headers
|
5
|
+
def initialize(headers: [])
|
6
|
+
@headers = headers
|
7
|
+
end
|
8
|
+
|
9
|
+
def updatable_ids
|
10
|
+
get_ids(find_mutatables_by { |id| id['status'] != 'deleted' })
|
11
|
+
end
|
12
|
+
|
13
|
+
def deletable_ids
|
14
|
+
deletables.map { |deletable| deletable.join('/')}
|
15
|
+
end
|
16
|
+
|
17
|
+
private
|
18
|
+
|
19
|
+
def deletables
|
20
|
+
get_ids(find_mutatables_by { |id| id['status'] == 'deleted' })
|
21
|
+
end
|
22
|
+
|
23
|
+
def mutatables
|
24
|
+
@mutables ||= headers.map do |header|
|
25
|
+
header.merge(ids: extract_identifiers(header['identifier']))
|
26
|
+
end.compact
|
27
|
+
end
|
28
|
+
|
29
|
+
# Get the local collection and id from an OAI namespaced identifier
|
30
|
+
# e.g. oai:reflections.mndigital.org:p16022coll44/3
|
31
|
+
def extract_identifiers(identifier)
|
32
|
+
identifier.split(':').last.split('/')
|
33
|
+
end
|
34
|
+
|
35
|
+
def get_ids(header_items)
|
36
|
+
header_items.map { |header| header[:ids] }
|
37
|
+
end
|
38
|
+
|
39
|
+
def find_mutatables_by
|
40
|
+
mutatables.find_all { |id| yield(id) }
|
41
|
+
end
|
42
|
+
|
43
|
+
end
|
44
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module CDMBL
|
2
|
+
class OaiRequest
|
3
|
+
attr_reader :base_uri, :resumption_token, :client, :from
|
4
|
+
def initialize(base_uri: '',
|
5
|
+
resumption_token: false,
|
6
|
+
from: '1900-01-01',
|
7
|
+
client: Net::HTTP)
|
8
|
+
@base_uri = base_uri
|
9
|
+
@resumption_token = resumption_token
|
10
|
+
@client = client
|
11
|
+
@from = from
|
12
|
+
end
|
13
|
+
|
14
|
+
def identifiers
|
15
|
+
(resumption_token) ? request(batch_uri) : request(first_batch_uri)
|
16
|
+
end
|
17
|
+
|
18
|
+
def sets
|
19
|
+
@sets ||= request(sets_uri)
|
20
|
+
end
|
21
|
+
|
22
|
+
private
|
23
|
+
|
24
|
+
def first_batch_uri
|
25
|
+
"#{base_uri}?verb=ListIdentifiers&metadataPrefix=oai_dc&from=#{from}"
|
26
|
+
end
|
27
|
+
|
28
|
+
def batch_uri
|
29
|
+
"#{base_uri}?verb=ListIdentifiers&resumptionToken=#{resumption_token}"
|
30
|
+
end
|
31
|
+
|
32
|
+
def sets_uri
|
33
|
+
"#{base_uri}?verb=ListSets"
|
34
|
+
end
|
35
|
+
|
36
|
+
def request(location)
|
37
|
+
client.get_response(URI(location)).body
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'active_support/core_ext/hash/conversions'
|
2
|
+
require 'hash_at_path'
|
3
|
+
|
4
|
+
module CDMBL
|
5
|
+
# Takes a hash representing an OAI ListSets response and turns that into a
|
6
|
+
# lookup table based on the setSpec
|
7
|
+
class OAISetLookup
|
8
|
+
attr_reader :oai_sets
|
9
|
+
def initialize(oai_sets: {})
|
10
|
+
@oai_sets = oai_sets
|
11
|
+
end
|
12
|
+
|
13
|
+
def keyed
|
14
|
+
oai_sets.at_path('OAI_PMH/ListSets/set').inject({}) {|memo, set| memo.merge(to_key(set)) }
|
15
|
+
end
|
16
|
+
|
17
|
+
private
|
18
|
+
|
19
|
+
def to_key(set)
|
20
|
+
{set['setSpec'] => {name: set['setName'], description: set.at_path('setDescription/dc/description')}}
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module CDMBL
|
2
|
+
class RecordTransformer
|
3
|
+
attr_reader :record, :field_mappings, :field_transformer
|
4
|
+
def initialize(record: {},
|
5
|
+
field_mappings: [],
|
6
|
+
field_transformer: FieldTransformer)
|
7
|
+
@record = record
|
8
|
+
@field_mappings = field_mappings
|
9
|
+
@field_transformer = field_transformer
|
10
|
+
end
|
11
|
+
|
12
|
+
def transform!
|
13
|
+
field_mappings.inject({}) do |dest_record, mapping|
|
14
|
+
dest_record.merge(transform_field(record, mapping))
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
private
|
19
|
+
|
20
|
+
def transform_field(record, mapping)
|
21
|
+
field_transformer.new(origin_path: mapping[:origin_path],
|
22
|
+
dest_path: mapping[:dest_path],
|
23
|
+
formatters: mapping[:formatters],
|
24
|
+
record: record).reduce
|
25
|
+
end
|
26
|
+
|
27
|
+
|
28
|
+
end
|
29
|
+
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'cdmbl'
|
2
|
+
|
3
|
+
namespace :cdmbl do
|
4
|
+
desc 'Launch a background job to index metadata from CONTENTdm to Solr.'
|
5
|
+
task :ingest, [:solr_url, :oai_endpoint, :cdm_endpoint, :minimum_date] do |t, args|
|
6
|
+
solr_config = { url: args[:solr_url] } # rake task arguments are keyed by symbol
|
7
|
+
etl_config = { oai_endpoint: args[:oai_endpoint], cdm_endpoint: args[:cdm_endpoint], minimum_date: args[:minimum_date] }
|
8
|
+
CDMBL::ETLWorker.perform_async(solr_config, etl_config)
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
@@ -0,0 +1,147 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'titleize'
|
3
|
+
|
4
|
+
module CDMBL
|
5
|
+
|
6
|
+
class Transformer
|
7
|
+
attr_reader :cdm_records, :oai_sets, :field_mappings, :record_transformer
|
8
|
+
def initialize(cdm_records: [],
|
9
|
+
oai_sets: {},
|
10
|
+
field_mappings: false,
|
11
|
+
record_transformer: RecordTransformer)
|
12
|
+
@cdm_records = cdm_records
|
13
|
+
@oai_sets = oai_sets
|
14
|
+
@field_mappings = mappings_init(field_mappings)
|
15
|
+
@record_transformer = record_transformer
|
16
|
+
end
|
17
|
+
|
18
|
+
def records
|
19
|
+
cdm_records.map { |record| to_solr(record) }.compact
|
20
|
+
end
|
21
|
+
|
22
|
+
private
|
23
|
+
|
24
|
+
def mappings_init(mappings)
|
25
|
+
(mappings) ? mappings : self.class.default_mappings
|
26
|
+
end
|
27
|
+
|
28
|
+
def to_solr(record)
|
29
|
+
# Remove empty records (move this behavior to the CONTENTdm API gem) and
|
30
|
+
# bail early on the transformation process
|
31
|
+
if {'id' => record['id']} == record
|
32
|
+
return nil
|
33
|
+
else
|
34
|
+
record_transformer.new(record: record.merge('oai_sets' => oai_sets),
|
35
|
+
field_mappings: field_mappings).transform!
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
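# TODO: (original note truncated: "Make a ...") with_location is not called anywhere in this class; default_mappings builds 'location_llsi' through LocationFormatter instead.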
|
40
|
+
def with_location(dest_record, record)
|
41
|
+
if record['latitu'] && record['longit'] && record['latitu'] != '' && record['longit'] != ''
|
42
|
+
dest_record.merge({'location_llsi': "#{record['latitu']}, #{record['longit']}"})
|
43
|
+
else
|
44
|
+
dest_record
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
# Default field-mapping table, used when no field_mappings are supplied
# (see mappings_init). Each entry is a Hash with:
#   :dest_path   - name of the destination field
#   :origin_path - key read from the source record
#   :formatters  - formatter classes listed for that field; the digsp*
#                  entries omit this key, so mapping[:formatters] is nil
#                  for them.
# Being a `def self.` method it stays public despite the `private` keyword
# earlier in the class, which only affects instance methods.
# NOTE(review): origin_path '/' presumably selects the whole record - confirm
# against FieldTransformer.
# NOTE(review): `Titlieze` looks like a misspelling of "Titleize"; it has to
# match the constant defined in formatters.rb - confirm before renaming.
def self.default_mappings
|
49
|
+
[
|
50
|
+
{dest_path: 'location_llsi', origin_path: '/', formatters: [LocationFormatter]},
|
51
|
+
{dest_path: 'id', origin_path: 'id', formatters: [StripFormatter]},
|
52
|
+
{dest_path: 'setspec_ssi', origin_path: '/', formatters: [AddSetSpecFormatter, SetSpecFormatter]},
|
53
|
+
{dest_path: 'collection_name_ssi', origin_path: '/', formatters: [AddSetSpecFormatter, CollectionNameFormatter]},
|
54
|
+
{dest_path: 'collection_name_tei', origin_path: '/', formatters: [AddSetSpecFormatter, CollectionNameFormatter]},
|
55
|
+
{dest_path: 'collection_description_tei', origin_path: '/', formatters: [AddSetSpecFormatter, CollectionDescriptionFormatter, FilterBadCollections]},
|
56
|
+
{dest_path: 'title_tei', origin_path: 'title', formatters: [StripFormatter]},
|
57
|
+
{dest_path: 'title_ssi', origin_path: 'title', formatters: [StripFormatter]},
|
58
|
+
{dest_path: 'title_sort', origin_path: 'title', formatters: [StripFormatter]},
|
59
|
+
{dest_path: 'title_unstem_search', origin_path: 'title', formatters: [StripFormatter]},
|
60
|
+
{dest_path: 'contributor_teim', origin_path: 'contri', formatters: [StripFormatter]},
|
61
|
+
{dest_path: 'contributor_unstem_search', origin_path: 'contri', formatters: [StripFormatter]},
|
62
|
+
{dest_path: 'contributor_ssim', origin_path: 'contri', formatters: [SplitFormatter, StripFormatter]},
|
63
|
+
{dest_path: 'creator_teim', origin_path: 'photog', formatters: [StripFormatter]},
|
64
|
+
{dest_path: 'creator_unstem_search', origin_path: 'photog', formatters: [StripFormatter]},
|
65
|
+
{dest_path: 'creator_ssim', origin_path: 'photog', formatters: [SplitFormatter, StripFormatter]},
|
66
|
+
{dest_path: 'creator_sort', origin_path: 'photog', formatters: [StripFormatter]},
|
67
|
+
{dest_path: 'description_tei', origin_path: 'descri', formatters: [StripFormatter]},
|
68
|
+
{dest_path: 'description_ts', origin_path: 'descri', formatters: [StripFormatter]},
|
69
|
+
{dest_path: 'dat_ssi', origin_path: 'dat', formatters: [StripFormatter]},
|
70
|
+
{dest_path: 'dat_sort', origin_path: 'dat', formatters: [StripFormatter]},
|
71
|
+
{dest_path: 'publishing_agency_tei', origin_path: 'publia', formatters: [StripFormatter]},
|
72
|
+
{dest_path: 'publishing_agency_unstem_search', origin_path: 'publia', formatters: [StripFormatter]},
|
73
|
+
{dest_path: 'publishing_agency_ssi', origin_path: 'publia', formatters: [StripFormatter]},
|
74
|
+
{dest_path: 'dimensions_ssi', origin_path: 'dimens', formatters: [StripFormatter]},
|
75
|
+
{dest_path: 'topic_teim', origin_path: 'genera', formatters: [StripFormatter, SplitFormatter, StripFormatter]},
|
76
|
+
{dest_path: 'topic_ssim', origin_path: 'genera', formatters: [Titlieze, StripFormatter, SplitFormatter, StripFormatter]},
|
77
|
+
{dest_path: 'topic_unstem_search', origin_path: 'genera', formatters: [StripSemicolonFormatter, StripFormatter]},
|
78
|
+
{dest_path: 'type_ssi', origin_path: 'type', formatters: [Titlieze, StripSemicolonFormatter]},
|
79
|
+
{dest_path: 'physical_format_ssi', origin_path: 'physic', formatters: [StripSemicolonFormatter]},
|
80
|
+
{dest_path: 'physical_format_tei', origin_path: 'physic', formatters: [StripSemicolonFormatter]},
|
81
|
+
{dest_path: 'formal_subject_unstem_search', origin_path: 'specif', formatters: [StripFormatter]},
|
82
|
+
{dest_path: 'formal_subject_ssim', origin_path: 'specif', formatters: [Titlieze, StripFormatter, SplitFormatter, StripFormatter]},
|
83
|
+
{dest_path: 'formal_subject_teim', origin_path: 'specif', formatters: [Titlieze, StripFormatter, SplitFormatter, StripFormatter]},
|
84
|
+
{dest_path: 'subject_unstem_search', origin_path: 'subjec', formatters: [StripFormatter]},
|
85
|
+
{dest_path: 'subject_teim', origin_path: 'subjec', formatters: [StripFormatter, SplitFormatter, StripFormatter]},
|
86
|
+
{dest_path: 'subject_ssim', origin_path: 'subjec', formatters: [StripFormatter, SplitFormatter, StripFormatter]},
|
87
|
+
{dest_path: 'city_ssi', origin_path: 'city', formatters: [StripFormatter]},
|
88
|
+
{dest_path: 'city_unstem_search', origin_path: 'city', formatters: [StripFormatter]},
|
89
|
+
{dest_path: 'district_ssi', origin_path: 'distri', formatters: [StripFormatter]},
|
90
|
+
{dest_path: 'district_unstem_search', origin_path: 'distri', formatters: [StripFormatter]},
|
91
|
+
{dest_path: 'county_ssim', origin_path: 'county', formatters: [Titlieze, StripFormatter, SplitFormatter, StripFormatter]},
|
92
|
+
{dest_path: 'county_unstem_search', origin_path: 'county', formatters: [StripFormatter]},
|
93
|
+
{dest_path: 'state_ssi', origin_path: 'state', formatters: [StripFormatter]},
|
94
|
+
{dest_path: 'state_unstem_search', origin_path: 'state', formatters: [StripFormatter]},
|
95
|
+
{dest_path: 'country_ssi', origin_path: 'countr', formatters: [StripFormatter]},
|
96
|
+
{dest_path: 'country_unstem_search', origin_path: 'countr', formatters: [StripFormatter]},
|
97
|
+
{dest_path: 'language_ssi', origin_path: 'langua', formatters: [StripFormatter]},
|
98
|
+
{dest_path: 'language_unstem_search', origin_path: 'langua', formatters: [StripFormatter]},
|
99
|
+
{dest_path: 'contributing_unstem_search', origin_path: 'contra', formatters: [StripFormatter]},
|
100
|
+
{dest_path: 'contributing_organization_tei', origin_path: 'contra', formatters: [StripFormatter]},
|
101
|
+
{dest_path: 'contributing_organization_ssi', origin_path: 'contra', formatters: [Titlieze, StripFormatter]},
|
102
|
+
{dest_path: 'contact_information_ssi', origin_path: 'contac', formatters: [StripFormatter]},
|
103
|
+
{dest_path: 'rights_ssi', origin_path: 'righta', formatters: [StripFormatter]},
|
104
|
+
{dest_path: 'local_identifier_ssi', origin_path: 'identi', formatters: [StripFormatter]},
|
105
|
+
{dest_path: 'identifier_ssi', origin_path: 'resour', formatters: [StripFormatter]},
|
106
|
+
{dest_path: 'project_ssi', origin_path: 'projec', formatters: [StripFormatter]},
|
107
|
+
{dest_path: 'fiscal_sponsor_ssi', origin_path: 'fiscal', formatters: [StripFormatter]},
|
108
|
+
{dest_path: 'publisher_ssi', origin_path: 'publis', formatters: [StripFormatter]},
|
109
|
+
{dest_path: 'date_ssi', origin_path: 'date', formatters: [StripFormatter]},
|
110
|
+
{dest_path: 'format_ssi', origin_path: 'format', formatters: [StripFormatter]},
|
111
|
+
{dest_path: 'digspa_ssi', origin_path: 'digspa'},
|
112
|
+
{dest_path: 'digspb_ssi', origin_path: 'digspb'},
|
113
|
+
{dest_path: 'digspc_ssi', origin_path: 'digspc'},
|
114
|
+
{dest_path: 'digspd_ssi', origin_path: 'digspd'},
|
115
|
+
{dest_path: 'digspe_ssi', origin_path: 'digspe'},
|
116
|
+
{dest_path: 'digspf_ssi', origin_path: 'digspf'},
|
117
|
+
{dest_path: 'digspg_ssi', origin_path: 'digspg'},
|
118
|
+
{dest_path: 'digsph_ssi', origin_path: 'digsph'},
|
119
|
+
{dest_path: 'digspi_ssi', origin_path: 'digspi'},
|
120
|
+
{dest_path: 'digspj_ssi', origin_path: 'digspj'},
|
121
|
+
{dest_path: 'digspk_ssi', origin_path: 'digspk'},
|
122
|
+
{dest_path: 'transcription_tesi', origin_path: 'transc', formatters: [StripFormatter]},
|
123
|
+
{dest_path: 'translation_tesi', origin_path: 'transl', formatters: [StripFormatter]},
|
124
|
+
{dest_path: 'fullrs_tes', origin_path: 'fullrs', formatters: [StripFormatter]},
|
125
|
+
{dest_path: 'find_ssi', origin_path: 'find', formatters: [StripFormatter]},
|
126
|
+
{dest_path: 'dmaccess_ssi', origin_path: 'dmaccess', formatters: [StripFormatter]},
|
127
|
+
{dest_path: 'dmimage_ssi', origin_path: 'dmimage', formatters: [StripFormatter]},
|
128
|
+
{dest_path: 'dmcreated_ssi', origin_path: 'dmcreated', formatters: [StripFormatter]},
|
129
|
+
{dest_path: 'dmmodified_ssi', origin_path: 'dmmodified', formatters: [StripFormatter]},
|
130
|
+
{dest_path: 'dmoclcno_ssi', origin_path: 'dmoclcno', formatters: [StripFormatter]},
|
131
|
+
{dest_path: 'restriction_code_ssi', origin_path: 'restrictionCode', formatters: [StripFormatter]},
|
132
|
+
{dest_path: 'cdmfilesize_ssi', origin_path: 'cdmfilesize', formatters: [StripFormatter]},
|
133
|
+
{dest_path: 'cdmfilesizeformatted_ssi', origin_path: 'cdmfilesizeformatted', formatters: [StripFormatter]},
|
134
|
+
{dest_path: 'cdmprintpdf_is', origin_path: 'cdmprintpdf', formatters: [ToIFormatter]},
|
135
|
+
{dest_path: 'cdmhasocr_is', origin_path: 'cdmhasocr', formatters: [ToIFormatter]},
|
136
|
+
{dest_path: 'cdmisnewspaper_is', origin_path: 'cdmisnewspaper', formatters: [ToIFormatter]},
|
137
|
+
{dest_path: 'image_uri_ssi', origin_path: 'image_uri', formatters: [StripFormatter]},
|
138
|
+
{dest_path: 'record_type_ssi', origin_path: 'record_type', formatters: [StripFormatter]},
|
139
|
+
{dest_path: 'geographic_feature_ssim', origin_path: 'geogra', formatters: [Titlieze, StripFormatter, SplitFormatter, StripFormatter]},
|
140
|
+
{dest_path: 'geographic_feature_teim', origin_path: 'geogra', formatters: [StripFormatter]},
|
141
|
+
{dest_path: 'geographic_feature_unstem_search', origin_path: 'geogra', formatters: [StripFormatter]},
|
142
|
+
{dest_path: 'compound_objects_ts', origin_path: 'compound_objects', formatters: [ToJsonFormatter]},
|
143
|
+
{dest_path: 'image_ids_ssim', origin_path: 'compound_objects', formatters: [ImageId]},
|
144
|
+
]
|
145
|
+
end
|
146
|
+
end
|
147
|
+
end
|
data/lib/cdmbl.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'cdmbl/version'
|
2
|
+
require 'cdmbl/extractor'
|
3
|
+
require 'cdmbl/oai_request'
|
4
|
+
require 'cdmbl/oai_set_lookup'
|
5
|
+
require 'cdmbl/formatters'
|
6
|
+
require 'cdmbl/field_transformer'
|
7
|
+
require 'cdmbl/field_formatter'
|
8
|
+
require 'cdmbl/record_transformer'
|
9
|
+
require 'cdmbl/transformer'
|
10
|
+
require 'cdmbl/default_solr'
|
11
|
+
require 'cdmbl/loader'
|
12
|
+
require 'cdmbl/etl_run'
|
13
|
+
require 'cdmbl/etl_worker'
|
14
|
+
require 'cdmbl/default_callback'
|
15
|
+
require 'cdmbl/hooks'
|
16
|
+
require 'cdmbl/oai_filter'
|
metadata
ADDED
@@ -0,0 +1,253 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: cdmbl
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- chadfennell
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-09-22 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: hash_at_path
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0.1'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0.1'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: contentdm_api
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0.2'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0.2'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: sidekiq
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '3.4'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '3.4'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: titleize
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '1.4'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '1.4'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rsolr
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '1.0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '1.0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: activesupport
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '4.2'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '4.2'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: bundler
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '1.12'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '1.12'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: rake
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '10.0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '10.0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: minitest
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - "~>"
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '5.0'
|
132
|
+
type: :development
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - "~>"
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '5.0'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: yard
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - "~>"
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: 0.9.0
|
146
|
+
type: :development
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - "~>"
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: 0.9.0
|
153
|
+
- !ruby/object:Gem::Dependency
|
154
|
+
name: webmock
|
155
|
+
requirement: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - "~>"
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '1.24'
|
160
|
+
- - ">="
|
161
|
+
- !ruby/object:Gem::Version
|
162
|
+
version: 1.24.0
|
163
|
+
type: :development
|
164
|
+
prerelease: false
|
165
|
+
version_requirements: !ruby/object:Gem::Requirement
|
166
|
+
requirements:
|
167
|
+
- - "~>"
|
168
|
+
- !ruby/object:Gem::Version
|
169
|
+
version: '1.24'
|
170
|
+
- - ">="
|
171
|
+
- !ruby/object:Gem::Version
|
172
|
+
version: 1.24.0
|
173
|
+
- !ruby/object:Gem::Dependency
|
174
|
+
name: vcr
|
175
|
+
requirement: !ruby/object:Gem::Requirement
|
176
|
+
requirements:
|
177
|
+
- - "~>"
|
178
|
+
- !ruby/object:Gem::Version
|
179
|
+
version: '3.0'
|
180
|
+
- - ">="
|
181
|
+
- !ruby/object:Gem::Version
|
182
|
+
version: 3.0.1
|
183
|
+
type: :development
|
184
|
+
prerelease: false
|
185
|
+
version_requirements: !ruby/object:Gem::Requirement
|
186
|
+
requirements:
|
187
|
+
- - "~>"
|
188
|
+
- !ruby/object:Gem::Version
|
189
|
+
version: '3.0'
|
190
|
+
- - ">="
|
191
|
+
- !ruby/object:Gem::Version
|
192
|
+
version: 3.0.1
|
193
|
+
description:
|
194
|
+
email:
|
195
|
+
- fenne035@umn.edu
|
196
|
+
executables: []
|
197
|
+
extensions: []
|
198
|
+
extra_rdoc_files: []
|
199
|
+
files:
|
200
|
+
- ".gitignore"
|
201
|
+
- ".travis.yml"
|
202
|
+
- CODE_OF_CONDUCT.md
|
203
|
+
- Gemfile
|
204
|
+
- LICENSE.txt
|
205
|
+
- README.md
|
206
|
+
- Rakefile
|
207
|
+
- bin/console
|
208
|
+
- bin/setup
|
209
|
+
- cdmbl.gemspec
|
210
|
+
- lib/cdmbl.rb
|
211
|
+
- lib/cdmbl/default_callback.rb
|
212
|
+
- lib/cdmbl/default_solr.rb
|
213
|
+
- lib/cdmbl/etl_run.rb
|
214
|
+
- lib/cdmbl/etl_worker.rb
|
215
|
+
- lib/cdmbl/extractor.rb
|
216
|
+
- lib/cdmbl/field_formatter.rb
|
217
|
+
- lib/cdmbl/field_transformer.rb
|
218
|
+
- lib/cdmbl/formatters.rb
|
219
|
+
- lib/cdmbl/hooks.rb
|
220
|
+
- lib/cdmbl/loader.rb
|
221
|
+
- lib/cdmbl/oai_filter.rb
|
222
|
+
- lib/cdmbl/oai_request.rb
|
223
|
+
- lib/cdmbl/oai_set_lookup.rb
|
224
|
+
- lib/cdmbl/rake_task.rb
|
225
|
+
- lib/cdmbl/record_transformer.rb
|
226
|
+
- lib/cdmbl/tasks/etl.rake
|
227
|
+
- lib/cdmbl/transformer.rb
|
228
|
+
- lib/cdmbl/version.rb
|
229
|
+
homepage:
|
230
|
+
licenses:
|
231
|
+
- MIT
|
232
|
+
metadata: {}
|
233
|
+
post_install_message:
|
234
|
+
rdoc_options: []
|
235
|
+
require_paths:
|
236
|
+
- lib
|
237
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
238
|
+
requirements:
|
239
|
+
- - ">="
|
240
|
+
- !ruby/object:Gem::Version
|
241
|
+
version: '0'
|
242
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
243
|
+
requirements:
|
244
|
+
- - ">="
|
245
|
+
- !ruby/object:Gem::Version
|
246
|
+
version: '0'
|
247
|
+
requirements: []
|
248
|
+
rubyforge_project:
|
249
|
+
rubygems_version: 2.5.1
|
250
|
+
signing_key:
|
251
|
+
specification_version: 4
|
252
|
+
summary: Use Blacklight (Solr) as a front end for your CONTENTdm instance.
|
253
|
+
test_files: []
|