cdmdexer 0.17.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.rubocop.yml +4 -0
- data/.travis.yml +8 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +149 -0
- data/Rakefile +11 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/cdmdexer.gemspec +35 -0
- data/lib/cdmdexer/cdm_item.rb +89 -0
- data/lib/cdmdexer/default_cdm_notification.rb +8 -0
- data/lib/cdmdexer/default_completed_callback.rb +8 -0
- data/lib/cdmdexer/default_loader_notification.rb +8 -0
- data/lib/cdmdexer/default_oai_notification.rb +8 -0
- data/lib/cdmdexer/default_solr.rb +35 -0
- data/lib/cdmdexer/etl_by_set_specs.rb +18 -0
- data/lib/cdmdexer/etl_worker.rb +113 -0
- data/lib/cdmdexer/field_formatter.rb +13 -0
- data/lib/cdmdexer/field_mapping.rb +28 -0
- data/lib/cdmdexer/field_transformer.rb +41 -0
- data/lib/cdmdexer/filtered_set_specs.rb +41 -0
- data/lib/cdmdexer/formatters.rb +169 -0
- data/lib/cdmdexer/hooks.rb +31 -0
- data/lib/cdmdexer/load_worker.rb +36 -0
- data/lib/cdmdexer/loader.rb +19 -0
- data/lib/cdmdexer/oai_client.rb +26 -0
- data/lib/cdmdexer/oai_request.rb +100 -0
- data/lib/cdmdexer/rake_task.rb +6 -0
- data/lib/cdmdexer/record_transformer.rb +25 -0
- data/lib/cdmdexer/regex_filter_callback.rb +19 -0
- data/lib/cdmdexer/tasks/delete.rake +12 -0
- data/lib/cdmdexer/tasks/etl.rake +96 -0
- data/lib/cdmdexer/transform_worker.rb +93 -0
- data/lib/cdmdexer/transformer.rb +171 -0
- data/lib/cdmdexer/version.rb +3 -0
- data/lib/cdmdexer.rb +26 -0
- data/travis.yml +6 -0
- metadata +223 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 8d3dccb39ef4048b79af386f1d3696f9d7497efb3c53e388480e1505e027d99b
|
4
|
+
data.tar.gz: 262051a8e4e246be6092a5cbc54245cbed91449c859f5c408bde2f7b8d9ae068
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 41aeca2b754fc5681e5bfe120e4d690e4f56f5b767bd924f50a0363ca2fdfad2238e2c58507d61e79805c37e0218267903e662c968768c5d20fce807d98cecc9
|
7
|
+
data.tar.gz: 2a99d19a2639e15b5811692101d9630a67296b9ee1a32bda4e1cf70be5dcf4a0aba8815602b9172b935b8bf1482531026f5ebc8ee5289439a4baee76a7b6873c
|
data/.gitignore
ADDED
data/.rubocop.yml
ADDED
data/.travis.yml
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
sudo: false
|
2
|
+
language: ruby
|
3
|
+
rvm:
|
4
|
+
- 2.3.0
|
5
|
+
before_install: gem install bundler -v 1.12.5
|
6
|
+
env:
|
7
|
+
matrix:
|
8
|
+
secure: n+qvS/V5ZeCZ15RMNPeTxtAd+79XZniWNoYCriGtrJkELt/7bc4teXB8thowXHrm4Sx2lHW9uE8cR2s56RyRoV0PQU+nenjylAHPkfwlu/isutw3PGGFJoF3O89bG5jtQCQcequSH+PHRLc0xYoKaRLOMEuO0RPzd+hALt3jobzs5Q4hvvFinF8yDOTG0Qo24CtvZqNMAz0QtkWJExIZ8i2G7DnkY7Jy6DaJjv+A3MdTe2xTikSjWOvFjMrRVd1v+aP9gUTNVTy5aahpKSwub7TzsS/yj8rSI8717As/sSxepWXBg1N1AnrH6ttgjpdyjELXYlTrCx01u4sQMW3HRKG4WJzSVKaQ9W6fF82TQySh6hAaiO5/HYZ73+TTCnEk6eFJT/FaT7yNyOwhRA2Wq091PVFgfZOFy+Hxbj3dWVZm0hS5odaQ5c/aUYR8EiyfUwgHQQxl2QYjUMci+5QOa5k5TmqBHq4iqa0khC5t/zbCvY5ImC4/ySTtIQZnAtHxh3ZhdrukQn0LgLQ4BKVGZEz0wyiaMQrkf8XUgv5pkegTcAhkaz7vZs5j0wtY6CV0q9t5D2n8Xj9amo3SOfTtJjOLZ3xBJhjJRr4J219wN0EFtHt+3rXVoKcw3Vz1ysyV1Jn2B68LF0LPHwSFKDGqptP3axBJNs/utGcDTHAx8hU=
|
data/CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
# Contributor Code of Conduct
|
2
|
+
|
3
|
+
As contributors and maintainers of this project, and in the interest of
|
4
|
+
fostering an open and welcoming community, we pledge to respect all people who
|
5
|
+
contribute through reporting issues, posting feature requests, updating
|
6
|
+
documentation, submitting pull requests or patches, and other activities.
|
7
|
+
|
8
|
+
We are committed to making participation in this project a harassment-free
|
9
|
+
experience for everyone, regardless of level of experience, gender, gender
|
10
|
+
identity and expression, sexual orientation, disability, personal appearance,
|
11
|
+
body size, race, ethnicity, age, religion, or nationality.
|
12
|
+
|
13
|
+
Examples of unacceptable behavior by participants include:
|
14
|
+
|
15
|
+
* The use of sexualized language or imagery
|
16
|
+
* Personal attacks
|
17
|
+
* Trolling or insulting/derogatory comments
|
18
|
+
* Public or private harassment
|
19
|
+
* Publishing other's private information, such as physical or electronic
|
20
|
+
addresses, without explicit permission
|
21
|
+
* Other unethical or unprofessional conduct
|
22
|
+
|
23
|
+
Project maintainers have the right and responsibility to remove, edit, or
|
24
|
+
reject comments, commits, code, wiki edits, issues, and other contributions
|
25
|
+
that are not aligned to this Code of Conduct, or to ban temporarily or
|
26
|
+
permanently any contributor for other behaviors that they deem inappropriate,
|
27
|
+
threatening, offensive, or harmful.
|
28
|
+
|
29
|
+
By adopting this Code of Conduct, project maintainers commit themselves to
|
30
|
+
fairly and consistently applying these principles to every aspect of managing
|
31
|
+
this project. Project maintainers who do not follow or enforce the Code of
|
32
|
+
Conduct may be permanently removed from the project team.
|
33
|
+
|
34
|
+
This code of conduct applies both within project spaces and in public spaces
|
35
|
+
when an individual is representing the project or its community.
|
36
|
+
|
37
|
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
38
|
+
reported by contacting a project maintainer at fenne035@umn.edu. All
|
39
|
+
complaints will be reviewed and investigated and will result in a response that
|
40
|
+
is deemed necessary and appropriate to the circumstances. Maintainers are
|
41
|
+
obligated to maintain confidentiality with regard to the reporter of an
|
42
|
+
incident.
|
43
|
+
|
44
|
+
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
|
45
|
+
version 1.3.0, available at
|
46
|
+
[http://contributor-covenant.org/version/1/3/0/][version]
|
47
|
+
|
48
|
+
[homepage]: http://contributor-covenant.org
|
49
|
+
[version]: http://contributor-covenant.org/version/1/3/0/
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2016 University of Minnesota Libraries
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,149 @@
|
|
1
|
+
[![Build Status](https://travis-ci.org/UMNLibraries/cdmdexer.svg?branch=master)](https://travis-ci.org/UMNLibraries/cdmdexer)
|
2
|
+
|
3
|
+
# CDMDEXER: Index CONTENTdm Content
|
4
|
+
|
5
|
+
A micro [ETL](https://en.wikipedia.org/wiki/Extract,_transform,_load) system dedicated to extracting metadata records from a CONTENTdm instance (using the [CONTENTdm API gem](https://github.com/UMNLibraries/contentdm_api)), transforming them into Solr documents, and loading them into Solr.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
gem 'cdmdexer'
|
13
|
+
```
|
14
|
+
|
15
|
+
And then execute:
|
16
|
+
|
17
|
+
$ bundle
|
18
|
+
|
19
|
+
Or install it yourself as:
|
20
|
+
|
21
|
+
$ gem install cdmdexer
|
22
|
+
|
23
|
+
Add the CDMDEXER rake task to your project Rakefile:
|
24
|
+
|
25
|
+
```ruby
|
26
|
+
require 'cdmdexer/rake_task'
|
27
|
+
```
|
28
|
+
|
29
|
+
### GeoNames (optional)
|
30
|
+
|
31
|
+
In order to make use of the GeoNames service, you must purchase a [GeoNames Premium Webservices Account](http://www.geonames.org/commercial-webservices.html). If you do not have a `geonam` field in your CONTENTdm schema, you may ignore this instruction. Add your credentials to your shell environment once you have secured a GeoNames user:
|
32
|
+
|
33
|
+
```
|
34
|
+
# e.g. within your .bash_profile or .zprofile file
|
35
|
+
export GEONAMES_USER="yourusernamehere"
|
36
|
+
```
|
37
|
+
|
38
|
+
## Usage
|
39
|
+
|
40
|
+
Run the ingester
|
41
|
+
|
42
|
+
rake cdmdexer:batch[solr_url,oai_endpoint,cdm_endpoint,set_spec, batch_size, max_compounds]
|
43
|
+
|
44
|
+
|Argument| Definition|
|
45
|
+
|--:|---|
|
46
|
+
|solr_url| The full URL to your Solr core instance (same as your blacklight.yml solr url)|
|
47
|
+
|oai_endpoint| A URL to your OAI instance (e.g. https://server16022.contentdm.oclc.org/oai/oai.php) |
|
48
|
+
|cdm_endpoint| A URL to your CONTENTdm API endpoint (e.g. https://server16022.contentdm.oclc.org/dmwebservices/index.php) |
|
49
|
+
|set_spec| Selectively harvest from a single collection with [setSpec](http://www.openarchives.org/OAI/openarchivesprotocol.html#Set)|
|
50
|
+
|batch_size| The number of records to transform at a time. **Note**: it is within the record transformation process that the CONTENTdm API is requested. This API can be sluggish, so we conservatively transform batches of ten records at a time to prevent timeouts.|
|
51
|
+
|max_compounds| CONTENTdm records with many compounds can take a long time to load from the CONTENTdm API as multiple requests must happen in order to get the metadata for each child record of a parent compound object. For this reason, records with ten or more compound children are, by default, processed in batches of one. This setting allows you to override this behavior.|
|
52
|
+
|
53
|
+
For example:
|
54
|
+
|
55
|
+
```ruby
|
56
|
+
rake "cdmdexer:ingest[http://solr:8983/solr/foo-bar-core, https://server16022.contentdm.oclc.org/oai/oai.php, https://server16022.contentdm.oclc.org/dmwebservices/index.php, 2015-01-01]"
|
57
|
+
```
|
58
|
+
|
59
|
+
### Custom Rake Tasks
|
60
|
+
|
61
|
+
You might also create your own rake task to run your modified field transformers:
|
62
|
+
|
63
|
+
```ruby
|
64
|
+
require 'cdmdexer'
|
65
|
+
|
66
|
+
namespace :cdmdexer do
|
67
|
+
desc "ingest batches of records"
|
68
|
+
##
|
69
|
+
# e.g. rake mdl_ingester:ingest[2015-09-14, 2]
|
70
|
+
task :batch, [:batch_size, :set_spec] => :environment do |t, args|
|
71
|
+
config =
|
72
|
+
{
|
73
|
+
oai_endpoint: 'http://cdm16022.contentdm.oclc.org/oai/oai.php',
|
74
|
+
cdm_endpoint: 'https://server16022.contentdm.oclc.org/dmwebservices/index.php',
|
75
|
+
set_spec: (args[:set_spec] != '""') ? args[:set_spec] : nil,
|
76
|
+
batch_size: (args[:batch_size]) ? args[:batch_size] : 30,
|
77
|
+
solr_config: solr_config
|
78
|
+
}
|
79
|
+
CDMDEXER::ETLWorker.perform_async(config)
|
80
|
+
end
|
81
|
+
end
|
82
|
+
```
|
83
|
+
### Your Own Custom Solr Field Mappings (see above code snippet)
|
84
|
+
|
85
|
+
The default CONTENTdm to Solr field transformation rules may be overridden by calling the CDMDEXER::ETLWorker (a [Sidekiq worker](https://github.com/mperham/sidekiq)) directly. These rules may be found in the default_mappings method of the [CDMDEXER::Transformer Class](https://github.com/UMNLibraries/cdmdexer/blob/master/lib/cdmdexer/transformer.rb).
|
86
|
+
|
87
|
+
The transformer expects mappings in the following format:
|
88
|
+
|
89
|
+
```ruby
|
90
|
+
def your_custom_field_mappings
|
91
|
+
[
|
92
|
+
{dest_path: 'title_tei', origin_path: 'title', formatters: [StripFormatter]},
|
93
|
+
]
|
94
|
+
end
|
95
|
+
```
|
96
|
+
|Argument| Definition|
|
97
|
+
|--:|---|
|
98
|
+
|dest_path| The 'destination path' is the name of the field you will be sending to Solr for this field mapping. |
|
99
|
+
|origin_path| Where to get the field data from the original record for this mapping. |
|
100
|
+
|formatters| [Formatters](https://github.com/UMNLibraries/cdmdexer/blob/master/lib/cdmdexer/formatters.rb) perform tasks such as stripping white space or splitting CONTENTdm multi-valued fields (delimited by semicolons) into JSON arrays. |
|
101
|
+
|
102
|
+
**Note:** The first formatter receives the value found at the declared `origin_path`. Each formatter declared after the initial formatter will receive a value produced by the preceding formatter.
|
103
|
+
|
104
|
+
Formatters are very simple stateless classes that take a value, do something to it, and respond with a modified version of this value via a class method called `format`. Examples of other formatters may be found in the [Formatters file](https://github.com/UMNLibraries/cdmdexer/blob/master/lib/cdmdexer/formatters.rb). For Example:
|
105
|
+
|
106
|
+
```ruby
|
107
|
+
class SplitFormatter
|
108
|
+
def self.format(value)
|
109
|
+
(value.respond_to?(:split)) ? value.split(';') : value
|
110
|
+
end
|
111
|
+
end
|
112
|
+
```
|
113
|
+
|
114
|
+
You might also want to simply override some of the default mappings or add your own:
|
115
|
+
|
116
|
+
```ruby
|
117
|
+
mappings = CDMDEXER::Transformer.default_mappings.merge(your_custom_field_mappings)
|
118
|
+
```
|
119
|
+
## A Custom Post-indexing Callback
|
120
|
+
|
121
|
+
If you would like to perform some action (e.g. send an email) following the completion of the CDMDEXER indexing process, you may declare your own callback hook (anything with "Callback" in the class name declared within the CDMDEXER module space will be used). To do so in Rails, create a Rails initializer file `config/initializers/cdmdexer.rb`:
|
122
|
+
|
123
|
+
```ruby
|
124
|
+
module CDMDEXER
|
125
|
+
class Callback
|
126
|
+
def self.call!
|
127
|
+
Rails.logger.info("My Custom CDMDEXER Callback")
|
128
|
+
end
|
129
|
+
end
|
130
|
+
end
|
131
|
+
```
|
132
|
+
## Development
|
133
|
+
|
134
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
135
|
+
|
136
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
137
|
+
|
138
|
+
## Contributing
|
139
|
+
|
140
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/UMNLibraries/cdmdexer. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
141
|
+
|
142
|
+
## License
|
143
|
+
|
144
|
+
[MIT](/LICENSE.txt)
|
145
|
+
|
146
|
+
## TODO
|
147
|
+
|
148
|
+
* Make StripFormatter the default formatter so it doesn't need to be declared for every field
|
149
|
+
* Re-brand project: CONTENTdm Indexer. CDMDEXER doesn't necessarily require Blacklight. Moreover, it only handles indexing.
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "cdmdexer"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require "irb"
|
14
|
+
IRB.start
|
data/bin/setup
ADDED
data/cdmdexer.gemspec
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'cdmdexer/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = 'cdmdexer'
|
8
|
+
spec.version = CDMDEXER::VERSION
|
9
|
+
spec.authors = ['chadfennell']
|
10
|
+
spec.email = ['fenne035@umn.edu']
|
11
|
+
|
12
|
+
spec.summary = %q{Load CONTENTdm data into a Solr Index. CDMDEXER expects to run inside a Rails application.}
|
13
|
+
spec.license = 'MIT'
|
14
|
+
|
15
|
+
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
16
|
+
spec.bindir = 'exe'
|
17
|
+
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
18
|
+
spec.require_paths = ['lib']
|
19
|
+
|
20
|
+
spec.add_dependency 'hash_at_path', '~> 0.1'
|
21
|
+
spec.add_dependency 'contentdm_api', '~> 0.5.0'
|
22
|
+
spec.add_dependency 'sidekiq', '>= 3.5'
|
23
|
+
spec.add_dependency 'titleize', '~> 1.4'
|
24
|
+
spec.add_dependency 'rsolr', '~> 2.0'
|
25
|
+
# CDMDEXER expects to run in a rails app, but just to avoid adding
|
26
|
+
# another external dependency for XML processing, we rely on activesupport's
|
27
|
+
# Has.to_jsonl feature for testing and to allow this gem to function
|
28
|
+
# independently from a rails app
|
29
|
+
spec.add_dependency 'rails', '>= 5.2'
|
30
|
+
|
31
|
+
spec.add_development_dependency 'bundler', '~> 1.12'
|
32
|
+
spec.add_development_dependency 'rake', '~> 12.0'
|
33
|
+
spec.add_development_dependency 'minitest', '~> 5.0'
|
34
|
+
spec.add_development_dependency 'yard', '~> 0.9.0'
|
35
|
+
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
module CDMDEXER
  # Wraps a single record and turns it into a hash that combines the incoming
  # record with the metadata returned by the CONTENTdm API class, including a
  # normalized list of compound pages.
  class CdmItem
    attr_reader :cdm_endpoint,
                :record,
                :collection,
                :id,
                :cdm_api_klass,
                :cdm_notification_klass

    # @param record [Hash] must contain an 'id' of the form "collection:id";
    #   may contain 'record_type'
    # @param cdm_endpoint [String] passed to the API class as +base_url+
    # @param cdm_api_klass [Class] instantiated with base_url:, collection:,
    #   with_compound: and id:, and must respond to #metadata
    # @param cdm_notification_klass [#call!] called with
    #   (collection, id, cdm_endpoint) before every API request
    def initialize(record: :MISSING_RECORD,
                   cdm_endpoint: :MISSING_ENDPOINT,
                   cdm_api_klass: CONTENTdmAPI::Item,
                   cdm_notification_klass: CDMDEXER::CdmNotification)
      @record = record
      @collection, @id = record['id'].split(':')
      @cdm_endpoint = cdm_endpoint
      @cdm_api_klass = cdm_api_klass
      @cdm_notification_klass = cdm_notification_klass
    end

    # Preserve the record hash. It may contain compound data that has been
    # resubmitted here by the transform worker as it recurses through
    # compounds in order to extract their full metadata.
    #
    # @return [Hash] the record merged with the fetched metadata (memoized)
    def to_h
      @to_h ||= record.merge(metadata)
    end

    # @return [Array<Hash>] the primary record's 'page' entries (empty when
    #   the key is absent), each decorated by #to_compound
    def page
      primary_record.fetch('page', [])
                    .each_with_index
                    .map { |child, index| to_compound(child, index) }
    end

    private

    # Builds the metadata hash merged into the record by #to_h.
    #
    # The page list is computed once here and reused, rather than being
    # rebuilt for the first-page lookup and again for the final merge.
    def metadata
      pages = page
      first_id = first_page_id(pages)

      # There are cases when we will not want to have to query for the
      # metadata of the first item of a compound. So, include the metadata of
      # the first page in its parent record metadata.
      #
      # Use-case: you want to grab a thumbnail for the compound record. In
      # this case, you'll need the format field of the first record in order
      # to determine which thumbnail generation mechanism to use (e.g. CDM
      # thumb vs getting a thumbnail for a video from Kaltura)
      base = first_id ? primary_record.merge('first_page' => request(first_id)) : primary_record

      base.merge(
        'page' => pages,
        # When an item has pages, these pages are resubmitted to CdmItem
        # as records in order to get their full metadata. But we want to
        # remember that they are actually secondary / child pages
        'record_type' => record.fetch('record_type', 'primary')
      )
    end

    # @return [String, nil] the id portion (after the last ':') of the first
    #   page's 'id', or nil when there are no pages
    def first_page_id(pages = page)
      (pages.first || {}).fetch('id', '').split(':').last
    end

    # Decorates one page hash with its Solr id, parent id, record type and
    # position among its siblings.
    def to_compound(child, index)
      child.merge(
        # Child id is a combo of the page id and parent collection
        'id' => "#{collection}:#{child['pageptr']}",
        'parent_id' => record['id'],
        'record_type' => 'secondary',
        'child_index' => index
      )
    end

    # The metadata for this item's own id; requested once and memoized.
    def primary_record
      @primary_record ||= request(id)
    end

    # CDM's id format is collection/id. We use collection:id
    def to_solr_id(record)
      record.merge('id' => record['id'].split('/').join(':'))
    end

    # Notifies, then fetches the metadata for +id+ (without compound data)
    # and rewrites its 'id' into the collection:id form.
    def request(id)
      cdm_notification_klass.call!(collection, id, cdm_endpoint)
      to_solr_id(cdm_api_klass.new(base_url: cdm_endpoint,
                                   collection: collection,
                                   with_compound: false,
                                   id: id).metadata)
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'rsolr'
|
2
|
+
|
3
|
+
module CDMDEXER
  # Communicate with Solr: add / delete stuff
  class DefaultSolr
    attr_reader :url, :client

    # @param url [String] URL handed to the client when connecting
    # @param client [#connect] object whose +connect(url:)+ returns the
    #   connection used for every request
    def initialize(url: 'http://localhost:8983/solr/core-here', client: RSolr)
      @url = url
      @client = client
    end

    # Issues a match-all edismax 'select' request for one page of results.
    #
    # @param start [Integer] offset of the first row to return
    # @param rows [Integer] page size; defaults to 10
    # @return whatever the connection's +get+ returns
    def ids(start: 0, rows: 10)
      connection.get('select',
                     params: {
                       q: '*:*',
                       defType: 'edismax',
                       fl: '',
                       rows: rows,
                       start: start
                     })
    end

    # The connection for +url+, created on first use and then reused.
    def connection
      @connection ||= client.connect url: url
    end

    # Forwards +records+ to the connection's +add+.
    def add(records)
      connection.add records
    end

    # Forwards +ids+ to the connection's +delete_by_id+.
    def delete(ids)
      connection.delete_by_id ids
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module CDMDEXER
  # Starts one ETL job per set spec, each with the shared ETL configuration
  # plus that job's own :set_spec entry.
  class ETLBySetSpecs
    attr_reader :set_specs, :etl_config, :etl_worker_klass

    # @param set_specs [Array] the set specs to enqueue jobs for
    # @param etl_config [Hash] configuration shared by every job
    # @param etl_worker_klass [#perform_async] receives one config per set spec
    def initialize(set_specs: [:missing_setspec],
                   etl_config: :missing_etl_config,
                   etl_worker_klass: ETLWorker)
      @etl_worker_klass = etl_worker_klass
      @etl_config = etl_config
      @set_specs = set_specs
    end

    # @return [Array] the value returned by +perform_async+ for each set
    #   spec, in the order of +set_specs+
    def run!
      set_specs.map { |spec| enqueue(spec) }
    end

    private

    # Hands the worker a copy of the shared config scoped to +spec+.
    def enqueue(spec)
      scoped_config = etl_config.merge(set_spec: spec)
      etl_worker_klass.perform_async(scoped_config)
    end
  end
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
require 'sidekiq'
|
2
|
+
module CDMDEXER
  # Extract records from OAI, delete records marked for deletion
  # and send everything else to a transformation / load worker.
  #
  # Each job handles one OAI response. When that response carries a
  # resumption token (and recursion is enabled) the job enqueues another
  # ETLWorker for the next response; otherwise it fires the completed
  # callback.
  class ETLWorker
    include Sidekiq::Worker

    extend ::Forwardable
    def_delegators :@oai_request,
                   :deletable_ids,
                   :updatables,
                   :next_resumption_token

    attr_reader :config,
                :solr_config,
                :cdm_endpoint,
                :oai_endpoint,
                :field_mappings,
                :resumption_token,
                :batch_size,
                :is_recursive

    attr_writer :oai_request_klass,
                :etl_worker_klass,
                :load_worker_klass,
                :completed_callback_klass,
                :transform_worker_klass

    # @param config [Hash] String-keyed settings. Required keys:
    #   'solr_config', 'cdm_endpoint', 'oai_endpoint'. Optional keys:
    #   'field_mappings' (default false), 'resumption_token' (default nil),
    #   'batch_size' (default 5, coerced with to_i), 'is_recursive'
    #   (default true) and 'set_spec' (default nil).
    def perform(config)
      # Sidekiq stores params in JSON, so we can't inject dependencies. This
      # results in the long set of arguments that follows. Otherwise, we'd
      # simply inject the OAI request and extractor objects
      @config = config
      @solr_config = config.fetch('solr_config').symbolize_keys
      @cdm_endpoint = config.fetch('cdm_endpoint')
      @oai_endpoint = config.fetch('oai_endpoint')
      @field_mappings = config.fetch('field_mappings', false)
      @resumption_token = config.fetch('resumption_token', nil)
      @batch_size = config.fetch('batch_size', 5).to_i
      @is_recursive = config.fetch('is_recursive', true)

      @oai_request = oai_request_klass.new(
        endpoint_url: oai_endpoint,
        resumption_token: resumption_token,
        set_spec: config.fetch('set_spec', nil)
      )

      run_batch!
      run_next_batch!
    end

    # Because Sidekiq serializes params to JSON, we provide custom setters
    # for dependencies (normally these would be default params in the
    # constructor) so that they may be mocked and tested
    def completed_callback_klass
      @completed_callback_klass ||= CDMDEXER::CompletedCallback
    end

    def etl_worker_klass
      @etl_worker_klass ||= ETLWorker
    end

    def oai_request_klass
      @oai_request_klass ||= OaiRequest
    end

    def load_worker_klass
      @load_worker_klass ||= LoadWorker
    end

    def transform_worker_klass
      @transform_worker_klass ||= TransformWorker
    end

    # Recurse through OAI batches one at a time
    def run_next_batch!
      if next_resumption_token && is_recursive
        etl_worker_klass.perform_async(next_config)
      else
        completed_callback_klass.call!(solr_config)
      end
    end

    private

    # Extract an oai response, delete the deletables, transform and load the
    # updatable items
    def run_batch!
      # Delete records that OAI has marked for deletion
      delete_deletables!
      transform_and_load!
    end

    # The config for the follow-up job. The key is a String because #perform
    # reads 'resumption_token' as a String; a Symbol key would sit next to
    # the existing String key as a second, distinct entry and is not a
    # JSON-native job argument.
    def next_config
      config.merge('resumption_token' => next_resumption_token)
    end

    # Enqueues one transform job per slice of +batch_size+ updatable records.
    def transform_and_load!
      updatables.each_slice(batch_size) do |records|
        transform_worker_klass.perform_async(records,
                                             solr_config,
                                             cdm_endpoint,
                                             oai_endpoint,
                                             field_mappings,
                                             batch_size)
      end
    end

    # Enqueues a load job that adds nothing and deletes the deletable ids.
    def delete_deletables!
      load_worker_klass.perform_async([], deletable_ids, solr_config)
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module CDMDEXER
  # Runs a value through a chain of formatters, feeding each formatter the
  # result produced by the one before it.
  class FieldFormatter
    attr_reader :value, :formatters

    # @param value [Object] the starting value handed to the first formatter
    # @param formatters [Array<#format>] applied in order
    def initialize(value: {}, formatters: [DefaultFormatter])
      @formatters = formatters
      @value = value
    end

    # @return [Object] the output of the last formatter, or +value+ itself
    #   when there are no formatters
    def format!
      formatted = value
      formatters.each do |formatter|
        formatted = formatter.format(formatted)
      end
      formatted
    end
  end
end
|