cdmdexer 0.17.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/.rubocop.yml +4 -0
  4. data/.travis.yml +8 -0
  5. data/CODE_OF_CONDUCT.md +49 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +21 -0
  8. data/README.md +149 -0
  9. data/Rakefile +11 -0
  10. data/bin/console +14 -0
  11. data/bin/setup +8 -0
  12. data/cdmdexer.gemspec +35 -0
  13. data/lib/cdmdexer/cdm_item.rb +89 -0
  14. data/lib/cdmdexer/default_cdm_notification.rb +8 -0
  15. data/lib/cdmdexer/default_completed_callback.rb +8 -0
  16. data/lib/cdmdexer/default_loader_notification.rb +8 -0
  17. data/lib/cdmdexer/default_oai_notification.rb +8 -0
  18. data/lib/cdmdexer/default_solr.rb +35 -0
  19. data/lib/cdmdexer/etl_by_set_specs.rb +18 -0
  20. data/lib/cdmdexer/etl_worker.rb +113 -0
  21. data/lib/cdmdexer/field_formatter.rb +13 -0
  22. data/lib/cdmdexer/field_mapping.rb +28 -0
  23. data/lib/cdmdexer/field_transformer.rb +41 -0
  24. data/lib/cdmdexer/filtered_set_specs.rb +41 -0
  25. data/lib/cdmdexer/formatters.rb +169 -0
  26. data/lib/cdmdexer/hooks.rb +31 -0
  27. data/lib/cdmdexer/load_worker.rb +36 -0
  28. data/lib/cdmdexer/loader.rb +19 -0
  29. data/lib/cdmdexer/oai_client.rb +26 -0
  30. data/lib/cdmdexer/oai_request.rb +100 -0
  31. data/lib/cdmdexer/rake_task.rb +6 -0
  32. data/lib/cdmdexer/record_transformer.rb +25 -0
  33. data/lib/cdmdexer/regex_filter_callback.rb +19 -0
  34. data/lib/cdmdexer/tasks/delete.rake +12 -0
  35. data/lib/cdmdexer/tasks/etl.rake +96 -0
  36. data/lib/cdmdexer/transform_worker.rb +93 -0
  37. data/lib/cdmdexer/transformer.rb +171 -0
  38. data/lib/cdmdexer/version.rb +3 -0
  39. data/lib/cdmdexer.rb +26 -0
  40. data/travis.yml +6 -0
  41. metadata +223 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 8d3dccb39ef4048b79af386f1d3696f9d7497efb3c53e388480e1505e027d99b
4
+ data.tar.gz: 262051a8e4e246be6092a5cbc54245cbed91449c859f5c408bde2f7b8d9ae068
5
+ SHA512:
6
+ metadata.gz: 41aeca2b754fc5681e5bfe120e4d690e4f56f5b767bd924f50a0363ca2fdfad2238e2c58507d61e79805c37e0218267903e662c968768c5d20fce807d98cecc9
7
+ data.tar.gz: 2a99d19a2639e15b5811692101d9630a67296b9ee1a32bda4e1cf70be5dcf4a0aba8815602b9172b935b8bf1482531026f5ebc8ee5289439a4baee76a7b6873c
data/.gitignore ADDED
@@ -0,0 +1,10 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.gem
data/.rubocop.yml ADDED
@@ -0,0 +1,4 @@
1
+ Layout/IndentationWidth:
2
+ # Number of spaces for each indentation level.
3
+ Width: 2
4
+ IgnoredPatterns: []
data/.travis.yml ADDED
@@ -0,0 +1,8 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.3.0
5
+ before_install: gem install bundler -v 1.12.5
6
+ env:
7
+ matrix:
8
+ secure: n+qvS/V5ZeCZ15RMNPeTxtAd+79XZniWNoYCriGtrJkELt/7bc4teXB8thowXHrm4Sx2lHW9uE8cR2s56RyRoV0PQU+nenjylAHPkfwlu/isutw3PGGFJoF3O89bG5jtQCQcequSH+PHRLc0xYoKaRLOMEuO0RPzd+hALt3jobzs5Q4hvvFinF8yDOTG0Qo24CtvZqNMAz0QtkWJExIZ8i2G7DnkY7Jy6DaJjv+A3MdTe2xTikSjWOvFjMrRVd1v+aP9gUTNVTy5aahpKSwub7TzsS/yj8rSI8717As/sSxepWXBg1N1AnrH6ttgjpdyjELXYlTrCx01u4sQMW3HRKG4WJzSVKaQ9W6fF82TQySh6hAaiO5/HYZ73+TTCnEk6eFJT/FaT7yNyOwhRA2Wq091PVFgfZOFy+Hxbj3dWVZm0hS5odaQ5c/aUYR8EiyfUwgHQQxl2QYjUMci+5QOa5k5TmqBHq4iqa0khC5t/zbCvY5ImC4/ySTtIQZnAtHxh3ZhdrukQn0LgLQ4BKVGZEz0wyiaMQrkf8XUgv5pkegTcAhkaz7vZs5j0wtY6CV0q9t5D2n8Xj9amo3SOfTtJjOLZ3xBJhjJRr4J219wN0EFtHt+3rXVoKcw3Vz1ysyV1Jn2B68LF0LPHwSFKDGqptP3axBJNs/utGcDTHAx8hU=
@@ -0,0 +1,49 @@
1
+ # Contributor Code of Conduct
2
+
3
+ As contributors and maintainers of this project, and in the interest of
4
+ fostering an open and welcoming community, we pledge to respect all people who
5
+ contribute through reporting issues, posting feature requests, updating
6
+ documentation, submitting pull requests or patches, and other activities.
7
+
8
+ We are committed to making participation in this project a harassment-free
9
+ experience for everyone, regardless of level of experience, gender, gender
10
+ identity and expression, sexual orientation, disability, personal appearance,
11
+ body size, race, ethnicity, age, religion, or nationality.
12
+
13
+ Examples of unacceptable behavior by participants include:
14
+
15
+ * The use of sexualized language or imagery
16
+ * Personal attacks
17
+ * Trolling or insulting/derogatory comments
18
+ * Public or private harassment
19
+ * Publishing other's private information, such as physical or electronic
20
+ addresses, without explicit permission
21
+ * Other unethical or unprofessional conduct
22
+
23
+ Project maintainers have the right and responsibility to remove, edit, or
24
+ reject comments, commits, code, wiki edits, issues, and other contributions
25
+ that are not aligned to this Code of Conduct, or to ban temporarily or
26
+ permanently any contributor for other behaviors that they deem inappropriate,
27
+ threatening, offensive, or harmful.
28
+
29
+ By adopting this Code of Conduct, project maintainers commit themselves to
30
+ fairly and consistently applying these principles to every aspect of managing
31
+ this project. Project maintainers who do not follow or enforce the Code of
32
+ Conduct may be permanently removed from the project team.
33
+
34
+ This code of conduct applies both within project spaces and in public spaces
35
+ when an individual is representing the project or its community.
36
+
37
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
38
+ reported by contacting a project maintainer at fenne035@umn.edu. All
39
+ complaints will be reviewed and investigated and will result in a response that
40
+ is deemed necessary and appropriate to the circumstances. Maintainers are
41
+ obligated to maintain confidentiality with regard to the reporter of an
42
+ incident.
43
+
44
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
45
+ version 1.3.0, available at
46
+ [http://contributor-covenant.org/version/1/3/0/][version]
47
+
48
+ [homepage]: http://contributor-covenant.org
49
+ [version]: http://contributor-covenant.org/version/1/3/0/
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in cdmdexer.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016 University of Minnesota Libraries
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,149 @@
1
+ [![Build Status](https://travis-ci.org/UMNLibraries/cdmdexer.svg?branch=master)](https://travis-ci.org/UMNLibraries/cdmdexer)
2
+
3
+ # CDMDEXER: Index CONTENTdm Content
4
+
5
+ A micro [ETL](https://en.wikipedia.org/wiki/Extract,_transform,_load) system dedicated to extracting metadata records from a CONTENTdm instance (using the [CONTENTdm API gem](https://github.com/UMNLibraries/contentdm_api)), transforming them into Solr documents, and loading them into Solr.
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'cdmdexer'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install cdmdexer
22
+
23
+ Add the CDMDEXER rake task to your project Rakefile:
24
+
25
+ ```ruby
26
+ require 'cdmdexer/rake_task'
27
+ ```
28
+
29
+ ### GeoNames (optional)
30
+
31
+ In order to make use of the GeoNames service, you must purchase a [GeoNames Premium Webservices Account](http://www.geonames.org/commercial-webservices.html). If you do not have a `geonam` field in your CONTENTdm schema, you may ignore this instruction. Add your credentials to your shell environment once you have secured a GeoNames user:
32
+
33
+ ```
34
+ # e.g. within your .bash_profile or .zprofile file
35
+ export GEONAMES_USER="yourusernamehere"
36
+ ```
37
+
38
+ ## Usage
39
+
40
+ Run the ingester
41
+
42
+ rake cdmdexer:batch[solr_url,oai_endpoint,cdm_endpoint,set_spec, batch_size, max_compounds]
43
+
44
+ |Argument| Definition|
45
+ |--:|---|
46
+ |solr_url| The full URL to your Solr core instance (same as your blacklight.yml solr url)|
47
+ |oai_endpoint| A URL to your OAI instance (e.g. https://server16022.contentdm.oclc.org/oai/oai.php) |
48
+ |cdm_endpoint| A URL to your CONTENTdm API endpoint (e.g. https://server16022.contentdm.oclc.org/dmwebservices/index.php) |
49
+ |set_spec| Selectively harvest from a single collection with [setSpec](http://www.openarchives.org/OAI/openarchivesprotocol.html#Set)|
50
+ |batch_size| The number of records to transform at a time. **Note**: it is within the record transformation process that the CONTENTdm API is requested. This API can be sluggish, so we conservatively transform batches of ten records at a time to prevent timeouts.|
51
+ |max_compounds| CONTENTdm records with many compounds can take a long time to load from the CONTENTdm API as multiple requests must happen in order to get the metadata for each child record of a parent compound object. For this reason, records with ten or more compound children are, by default, processed in batches of one. This setting allows you to override this behavior.|
52
+
53
+ For example:
54
+
55
+ ```ruby
56
+ rake "cdmdexer:ingest[http://solr:8983/solr/foo-bar-core, https://server16022.contentdm.oclc.org/oai/oai.php, https://server16022.contentdm.oclc.org/dmwebservices/index.php, 2015-01-01]"
57
+ ```
58
+
59
+ ### Custom Rake Tasks
60
+
61
+ You might also create your own rake task to run your modified field transformers:
62
+
63
+ ```ruby
64
+ require 'cdmdexer'
65
+
66
+ namespace :cdmdexer do
67
+ desc "ingest batches of records"
68
+ ##
69
+ # e.g. rake mdl_ingester:ingest[2015-09-14, 2]
70
+ task :batch, [:batch_size, :set_spec] => :environment do |t, args|
71
+ config =
72
+ {
73
+ oai_endpoint: 'http://cdm16022.contentdm.oclc.org/oai/oai.php',
74
+ cdm_endpoint: 'https://server16022.contentdm.oclc.org/dmwebservices/index.php',
75
+ set_spec: (args[:set_spec] != '""') ? args[:set_spec] : nil,
76
+ batch_size: (args[:batch_size]) ? args[:batch_size] : 30,
77
+ solr_config: solr_config
78
+ }
79
+ CDMDEXER::ETLWorker.perform_async(config)
80
+ end
81
+ end
82
+ ```
83
+ ### Your Own Custom Solr Field Mappings (see above code snippet)
84
+
85
+ The default CONTENTdm to Solr field transformation rules may be overridden by calling the CDMDEXER::ETLWorker (a [Sidekiq worker](https://github.com/mperham/sidekiq)) directly. These rules may be found in the default_mappings method of the [CDMDEXER::Transformer Class](https://github.com/UMNLibraries/cdmdexer/blob/master/lib/cdmdexer/transformer.rb).
86
+
87
+ The transformer expects mappings in the following format:
88
+
89
+ ```ruby
90
+ def your_custom_field_mappings
91
+ [
92
+ {dest_path: 'title_tei', origin_path: 'title', formatters: [StripFormatter]},
93
+ ]
94
+ end
95
+ ```
96
+ |Argument| Definition|
97
+ |--:|---|
98
+ |dest_path| The 'destination path' is the name of the field you will be sending to Solr for this field mapping. |
99
+ |origin_path| Where to get the field data from the original record for this mapping. |
100
+ |formatters| [Formatters](https://github.com/UMNLibraries/cdmdexer/blob/master/lib/cdmdexer/formatters.rb) perform tasks such as stripping white space or splitting CONTENTdm multi-valued fields (delimited by semicolons) into JSON arrays. |
101
+
102
+ **Note:** The first formatter receives the value found at the declared `origin_path`. Each formatter declared after the initial formatter will receive a value produced by the preceding formatter.
103
+
104
+ Formatters are very simple stateless classes that take a value, do something to it, and respond with a modified version of this value via a class method called `format`. Examples of other formatters may be found in the [Formatters file](https://github.com/UMNLibraries/cdmdexer/blob/master/lib/cdmdexer/formatters.rb). For example:
105
+
106
+ ```ruby
107
+ class SplitFormatter
108
+ def self.format(value)
109
+ (value.respond_to?(:split)) ? value.split(';') : value
110
+ end
111
+ end
112
+ ```
113
+
114
+ You might also want to simply override some of the default mappings or add your own:
115
+
116
+ ```ruby
117
+ mappings = CDMDEXER::Transformer.default_mappings.merge(your_custom_field_mappings)
118
+ ```
119
+ ## A Custom Post-indexing Callback
120
+
121
+ If you would like to perform some action (e.g. send an email) following the completion of the CDMDEXER indexing process, you may declare your own callback hook (anything with "Callback" in the class name declared within the CDMDEXER module space will be used). To do so in Rails, create a Rails initializer file `config/initializers/cdmdexer.rb`:
122
+
123
+ ```ruby
124
+ module CDMDEXER
125
+ class Callback
126
+ def self.call!
127
+ Rails.logger.info("My Custom CDMDEXER Callback")
128
+ end
129
+ end
130
+ end
131
+ ```
132
+ ## Development
133
+
134
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
135
+
136
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
137
+
138
+ ## Contributing
139
+
140
+ Bug reports and pull requests are welcome on GitHub at https://github.com/UMNLibraries/cdmdexer. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
141
+
142
+ ## License
143
+
144
+ [MIT](/LICENSE.txt)
145
+
146
+ ## TODO
147
+
148
+ * Make StripFormatter the default formatter so it doesn't need to be declared for every field
149
+ * Re-brand project: CONTENTdm Indexer. CDMDEXER doesn't necessarily require Blacklight; moreover, it only handles indexing.
data/Rakefile ADDED
@@ -0,0 +1,11 @@
1
+ require "bundler/gem_tasks"
2
+ require "rake/testtask"
3
+ require 'cdmdexer/rake_task'
4
+
5
+ Rake::TestTask.new(:test) do |t|
6
+ t.libs << "test"
7
+ t.libs << "lib"
8
+ t.test_files = FileList['test/**/*_test.rb']
9
+ end
10
+
11
+ task :default => :test
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "cdmdexer"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
data/cdmdexer.gemspec ADDED
@@ -0,0 +1,35 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'cdmdexer/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = 'cdmdexer'
8
+ spec.version = CDMDEXER::VERSION
9
+ spec.authors = ['chadfennell']
10
+ spec.email = ['fenne035@umn.edu']
11
+
12
+ spec.summary = %q{Load CONTENTdm data into a Solr Index. CDMDEXER expects to run inside a Rails application.}
13
+ spec.license = 'MIT'
14
+
15
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
16
+ spec.bindir = 'exe'
17
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
18
+ spec.require_paths = ['lib']
19
+
20
+ spec.add_dependency 'hash_at_path', '~> 0.1'
21
+ spec.add_dependency 'contentdm_api', '~> 0.5.0'
22
+ spec.add_dependency 'sidekiq', '>= 3.5'
23
+ spec.add_dependency 'titleize', '~> 1.4'
24
+ spec.add_dependency 'rsolr', '~> 2.0'
25
+ # CDMDEXER expects to run in a rails app, but just to avoid adding
26
+ # another external dependency for XML processing, we rely on activesupport's
27
+ # Has.to_jsonl feature for testing and to allow this gem to function
28
+ # independently from a rails app
29
+ spec.add_dependency 'rails', '>= 5.2'
30
+
31
+ spec.add_development_dependency 'bundler', '~> 1.12'
32
+ spec.add_development_dependency 'rake', '~> 12.0'
33
+ spec.add_development_dependency 'minitest', '~> 5.0'
34
+ spec.add_development_dependency 'yard', '~> 0.9.0'
35
+ end
@@ -0,0 +1,89 @@
1
+ module CDMDEXER
2
+ class CdmItem
3
+ attr_reader :cdm_endpoint,
4
+ :record,
5
+ :collection,
6
+ :id,
7
+ :cdm_api_klass,
8
+ :cdm_notification_klass
9
+
10
+ def initialize(record: :MISSING_RECORD,
11
+ cdm_endpoint: :MISSING_ENDPOINT,
12
+ cdm_api_klass: CONTENTdmAPI::Item,
13
+ cdm_notification_klass: CDMDEXER::CdmNotification)
14
+ @record = record
15
+ @collection, @id = record['id'].split(':')
16
+ @cdm_endpoint = cdm_endpoint
17
+ @cdm_api_klass = cdm_api_klass
18
+ @cdm_notification_klass = cdm_notification_klass
19
+ end
20
+
21
+ def to_h
22
+ # Preserve the record hash. It may contain compound data that has been
23
+ # resubmitted here by the transformer_worker as it recurses through
24
+ # compounds in order to extract their full metadata
25
+ @to_h ||= record.merge(metadata)
26
+ end
27
+
28
+ def page
29
+ primary_record.fetch('page', [])
30
+ .each_with_index.map { |page, i| to_compound(page, i) }
31
+ end
32
+
33
+ private
34
+
35
+ def metadata
36
+ if first_page_id
37
+ # There are cases when we will not want to have to query for the
38
+ # metadata of the first item of a compound. So, include the metadata of
39
+ # the first page in its parent record metadata.
40
+ #
41
+ # Use-case: you want to grab a thumbnail for the compound record. In
42
+ # this case, you'll need the format field of the first record in order
43
+ # to determine which thumbnail generation mechanism to use (e.g. CDM
44
+ # thumb vs getting a thumbnail for a video from Kaltura)
45
+ primary_record.merge('first_page' => request(first_page_id))
46
+ else
47
+ primary_record
48
+ end.merge(
49
+ 'page' => page,
50
+ # When an item has pages, these pages are resubmitted to CdmItem
51
+ # as records in order to get their full metadata. But we want to
52
+ # remember that they are actually secondary / child pages
53
+ 'record_type' => record.fetch('record_type', 'primary')
54
+ )
55
+ end
56
+
57
+ def first_page_id
58
+ (page.first || {}).fetch('id', '').split(':').last
59
+ end
60
+
61
+ def to_compound(page, i)
62
+ # raise "#{collection}:#{page['pageptr']}".inspect
63
+ page.merge(
64
+ # Child id is a combo of the page id and parent collection
65
+ 'id' => "#{collection}:#{page['pageptr']}",
66
+ 'parent_id' => record['id'],
67
+ 'record_type' => 'secondary',
68
+ 'child_index' => i
69
+ )
70
+ end
71
+
72
+ def primary_record
73
+ @primary_record ||= request(id)
74
+ end
75
+
76
+ # CDM's id format is collection/id. We use collection:id
77
+ def to_solr_id(record)
78
+ record.merge('id' => record['id'].split('/').join(':'))
79
+ end
80
+
81
+ def request(id)
82
+ cdm_notification_klass.call!(collection, id, cdm_endpoint)
83
+ to_solr_id(cdm_api_klass.new(base_url: cdm_endpoint,
84
+ collection: collection,
85
+ with_compound: false,
86
+ id: id).metadata)
87
+ end
88
+ end
89
+ end
@@ -0,0 +1,8 @@
1
+ module CDMDEXER
2
+ # An example callback
3
+ class DefaultCdmNotification
4
+ def self.call!(collection, id, endpoint)
5
+ puts "A CONTENTdm API request notification: #{endpoint} #{collection}:#{id}"
6
+ end
7
+ end
8
+ end
@@ -0,0 +1,8 @@
1
+ module CDMDEXER
2
+ # An example callback
3
+ class DefaultCompletedCallback
4
+ def self.call!(solr_client)
5
+ puts "A callback task"
6
+ end
7
+ end
8
+ end
@@ -0,0 +1,8 @@
1
+ module CDMDEXER
2
+ # An example callback
3
+ class DefaultLoaderNotification
4
+ def self.call!(ingestables, deletables)
5
+ puts "A CONTENTdm API request notification: Loading #{ingestables.length} records and deleting #{deletables.length}"
6
+ end
7
+ end
8
+ end
@@ -0,0 +1,8 @@
1
+ module CDMDEXER
2
+ # An example callback
3
+ class DefaultOaiNotification
4
+ def self.call!(location)
5
+ puts "An OAI callback task for #{location}"
6
+ end
7
+ end
8
+ end
@@ -0,0 +1,35 @@
1
+ require 'rsolr'
2
+
3
+ module CDMDEXER
4
+ # Communicate with Solr: add / delete stuff
5
+ class DefaultSolr
6
+ attr_reader :url, :client
7
+ def initialize(url: 'http://localhost:8983/solr/core-here', client: RSolr)
8
+ @url = url
9
+ @client = client
10
+ end
11
+
12
+ def ids(start: 0)
13
+ connection.get('select',
14
+ :params => { :q => '*:*',
15
+ :defType => 'edismax',
16
+ :fl => '',
17
+ :rows => 10,
18
+ :start => start
19
+ }
20
+ )
21
+ end
22
+
23
+ def connection
24
+ @connection ||= client.connect url: url
25
+ end
26
+
27
+ def add(records)
28
+ connection.add records
29
+ end
30
+
31
+ def delete(ids)
32
+ connection.delete_by_id ids
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,18 @@
1
+ module CDMDEXER
2
+ class ETLBySetSpecs
3
+ attr_reader :set_specs, :etl_config, :etl_worker_klass
4
+ def initialize(set_specs: [:missing_setspec],
5
+ etl_config: :missing_etl_config,
6
+ etl_worker_klass: ETLWorker)
7
+ @set_specs = set_specs
8
+ @etl_config = etl_config
9
+ @etl_worker_klass = etl_worker_klass
10
+ end
11
+
12
+ def run!
13
+ set_specs.map do |set_spec|
14
+ etl_worker_klass.perform_async(etl_config.merge(set_spec: set_spec))
15
+ end
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,113 @@
1
+ require 'sidekiq'
2
+ module CDMDEXER
3
+ # Extract records from OAI, delete records marked for deletion
4
+ # and send everything else to a transformation / load worker
5
+ class ETLWorker
6
+ include Sidekiq::Worker
7
+
8
+ extend ::Forwardable
9
+ def_delegators :@oai_request,
10
+ :deletable_ids,
11
+ :updatables,
12
+ :next_resumption_token
13
+
14
+ attr_reader :config,
15
+ :solr_config,
16
+ :cdm_endpoint,
17
+ :oai_endpoint,
18
+ :field_mappings,
19
+ :resumption_token,
20
+ :batch_size,
21
+ :is_recursive
22
+
23
+ attr_writer :oai_request_klass,
24
+ :etl_worker_klass,
25
+ :load_worker_klass,
26
+ :completed_callback_klass,
27
+ :transform_worker_klass
28
+
29
+ def perform(config)
30
+ # Sidekiq stores params in JSON, so we can't inject dependencies. This
31
+ # results in the long set of arguments that follows. Otherwise, we'd
32
+ # simply inject the OAI request and extractor objects
33
+ @config = config
34
+ @solr_config = config.fetch('solr_config').symbolize_keys
35
+ @cdm_endpoint = config.fetch('cdm_endpoint')
36
+ @oai_endpoint = config.fetch('oai_endpoint')
37
+ @field_mappings = config.fetch('field_mappings', false)
38
+ @resumption_token = config.fetch('resumption_token', nil)
39
+ @batch_size = config.fetch('batch_size', 5).to_i
40
+ @is_recursive = config.fetch('is_recursive', true)
41
+
42
+ @oai_request = oai_request_klass.new(
43
+ endpoint_url: oai_endpoint,
44
+ resumption_token: resumption_token,
45
+ set_spec: config.fetch('set_spec', nil)
46
+ )
47
+
48
+ run_batch!
49
+ run_next_batch!
50
+ end
51
+
52
+ # Because Sidekiq serializes params to JSON, we provide custom setters
53
+ # for dependencies (normally these would be default params in the
54
+ # constructor) so that they may be mocked and tested
55
+ def completed_callback_klass
56
+ @completed_callback_klass ||= CDMDEXER::CompletedCallback
57
+ end
58
+
59
+ def etl_worker_klass
60
+ @etl_worker_klass ||= ETLWorker
61
+ end
62
+
63
+ def oai_request_klass
64
+ @oai_request_klass ||= OaiRequest
65
+ end
66
+
67
+ def load_worker_klass
68
+ @load_worker_klass ||= LoadWorker
69
+ end
70
+
71
+ def transform_worker_klass
72
+ @transform_worker_klass ||= TransformWorker
73
+ end
74
+
75
+ # Recurse through OAI batches one at a time
76
+ def run_next_batch!
77
+ if next_resumption_token && is_recursive
78
+ etl_worker_klass.perform_async(next_config)
79
+ else
80
+ completed_callback_klass.call!(solr_config)
81
+ end
82
+ end
83
+
84
+ private
85
+
86
+ # Extract an oai response, delete the deletables, transform and load the
87
+ # updatable items
88
+ def run_batch!
89
+ # Delete records that OAI has marked for deletion
90
+ delete_deletables!
91
+ transform_and_load!
92
+ end
93
+
94
+ def next_config
95
+ config.merge(resumption_token: next_resumption_token)
96
+ end
97
+
98
+ def transform_and_load!
99
+ updatables.each_slice(batch_size) do |records|
100
+ transform_worker_klass.perform_async(records,
101
+ solr_config,
102
+ cdm_endpoint,
103
+ oai_endpoint,
104
+ field_mappings,
105
+ batch_size)
106
+ end
107
+ end
108
+
109
+ def delete_deletables!
110
+ load_worker_klass.perform_async([], deletable_ids, solr_config)
111
+ end
112
+ end
113
+ end
@@ -0,0 +1,13 @@
1
+ module CDMDEXER
2
+ class FieldFormatter
3
+ attr_reader :value, :formatters
4
+ def initialize(value: {}, formatters: [DefaultFormatter])
5
+ @value = value
6
+ @formatters = formatters
7
+ end
8
+
9
+ def format!
10
+ formatters.reduce(value) { |memo, formatter| formatter.format(memo) }
11
+ end
12
+ end
13
+ end