cdmbl 0.7.2 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +4 -0
- data/README.md +23 -14
- data/cdmbl.gemspec +1 -1
- data/lib/cdmbl/batch_deleter_worker.rb +1 -2
- data/lib/cdmbl/compound_filter.rb +45 -0
- data/lib/cdmbl/compound_lookup.rb +43 -0
- data/lib/cdmbl/default_solr.rb +0 -1
- data/lib/cdmbl/etl_worker.rb +112 -61
- data/lib/cdmbl/extract_worker.rb +141 -0
- data/lib/cdmbl/extractor.rb +29 -29
- data/lib/cdmbl/load_worker.rb +35 -0
- data/lib/cdmbl/oai_request.rb +3 -6
- data/lib/cdmbl/tasks/etl.rake +32 -12
- data/lib/cdmbl/tasks/extract.rake +9 -0
- data/lib/cdmbl/transform_worker.rb +93 -0
- data/lib/cdmbl/version.rb +1 -1
- data/lib/cdmbl.rb +5 -1
- metadata +9 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 818b769be7195ae37538de0892e275545da57a44
|
4
|
+
data.tar.gz: 5a1a6c25b618ea2cd1d047e3ef3047e29e54db87
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: aba7fdaefa7ca9e8031b9ac42c9384ebd41ca7bff629d978715aa57cb416248c2b4f5e5e234cf223b61406ffa597591e1906584e4657ed39fc60a361641c7c1d
|
7
|
+
data.tar.gz: 7a1e280b31c57cdf98207481f02946bfdaa6abe2dbfebe36ae9722ea9b444da3dec939859d8db66ce01565b476b4e211ec65d679c707a1378c3f882d3fd9e9bd
|
data/.rubocop.yml
ADDED
data/README.md
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
|
5
5
|
Use [Blacklight](https://github.com/projectblacklight/blacklight) as a front end for your CONTENTdm instance.
|
6
6
|
|
7
|
-
At the moment, CDMBL consists only of a micro [ETL](https://en.wikipedia.org/wiki/Extract,_transform,_load) system dedicated to extracting metadata records from a CONTENTdm instance (using the [CONTENTdm API gem](https://github.com/UMNLibraries/contentdm_api), transforming them into Solr documents, and loading them into Solr.
|
7
|
+
At the moment, CDMBL consists only of a micro [ETL](https://en.wikipedia.org/wiki/Extract,_transform,_load) system dedicated to extracting metadata records from a CONTENTdm instance (using the [CONTENTdm API gem](https://github.com/UMNLibraries/contentdm_api)), transforming them into Solr documents, and loading them into Solr.
|
8
8
|
|
9
9
|
## Installation
|
10
10
|
|
@@ -41,22 +41,24 @@ export export GEONAMES_USER="yourusernamehere"
|
|
41
41
|
|
42
42
|
Run the ingester
|
43
43
|
|
44
|
-
rake cdmbl:
|
44
|
+
rake cdmbl:batch[solr_url,oai_endpoint,cdm_endpoint,set_spec, batch_size, max_compounds]
|
45
45
|
|
46
46
|
|Argument| Definition|
|
47
47
|
|--:|---|
|
48
48
|
|solr_url| The full URL to your Solr core instance (same as your blacklight.yml solr url)|
|
49
|
-
|oai_endpoint| A URL to your OAI instance (e.g.
|
49
|
+
|oai_endpoint| A URL to your OAI instance (e.g. https://server16022.contentdm.oclc.org/oai/oai.php) |
|
50
50
|
|cdm_endpoint| A URL to your CONTENTdm API endpoint (e.g. https://server16022.contentdm.oclc.org/dmwebservices/index.php) |
|
51
|
-
|
|
51
|
+
|set_spec| Selectively harvest from a single collection with [setSpec](http://www.openarchives.org/OAI/openarchivesprotocol.html#Set)|
|
52
|
+
|batch_size| The number of records to transform at a time. **Note**: it is within the record transformation process that the CONTENTdm API is requested. This API can be sluggish, so we conservatively transform batches of ten records at a time to prevent timeouts.|
|
53
|
+
|max_compounds| CONTENTdm records with many compounds can take a long time to load from the CONTENTdm API as multiple requests must happen in order to get the metadata for each child record of a parent compound object. For this reason, records with ten or more compound children are, by default, processed in batches of one. This setting allows you to override this behavior.|
|
52
54
|
|
53
55
|
For example:
|
54
56
|
|
55
57
|
```ruby
|
56
|
-
rake "cdmbl:ingest[http://solr:8983/solr/foo-bar-core,
|
58
|
+
rake "cdmbl:ingest[http://solr:8983/solr/foo-bar-core, https://server16022.contentdm.oclc.org/oai/oai.php, https://server16022.contentdm.oclc.org/dmwebservices/index.php, 2015-01-01]"
|
57
59
|
```
|
58
60
|
|
59
|
-
### Custom Rake
|
61
|
+
### Custom Rake Tasks
|
60
62
|
|
61
63
|
You might also create your own rake task to run your modified field transformers:
|
62
64
|
|
@@ -64,14 +66,21 @@ You might also create your own rake task to run your modified field transformers
|
|
64
66
|
require 'cdmbl'
|
65
67
|
|
66
68
|
namespace :cdmbl do
|
67
|
-
desc
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
69
|
+
desc "ingest batches of records"
|
70
|
+
##
|
71
|
+
# e.g. rake mdl_ingester:ingest[2015-09-14, 2]
|
72
|
+
task :batch, [:batch_size, :set_spec] => :environment do |t, args|
|
73
|
+
config =
|
74
|
+
{
|
75
|
+
oai_endpoint: 'http://cdm16022.contentdm.oclc.org/oai/oai.php',
|
76
|
+
cdm_endpoint: 'https://server16022.contentdm.oclc.org/dmwebservices/index.php',
|
77
|
+
set_spec: (args[:set_spec] != '""') ? args[:set_spec] : nil,
|
78
|
+
max_compounds: (args[:max_compounds]) ? args[:max_compounds] : 2,
|
79
|
+
batch_size: (args[:batch_size]) ? args[:batch_size] : 30,
|
80
|
+
solr_config: solr_config
|
81
|
+
}
|
82
|
+
CDMBL::ETLWorker.perform_async(config)
|
83
|
+
end
|
75
84
|
end
|
76
85
|
```
|
77
86
|
### Your Own Custom Solr Field Mappings (see above code snippet)
|
data/cdmbl.gemspec
CHANGED
@@ -24,7 +24,7 @@ Gem::Specification.new do |spec|
|
|
24
24
|
spec.add_dependency 'rsolr', '~> 2.0'
|
25
25
|
# This gem generally wants to be in a rails app, but just to avoid adding
|
26
26
|
# another external dependency for XML processing, we rely on activesupport's
|
27
|
-
# Has.
|
27
|
+
# Has.to_jsonl feature for testing and to allow this gem to function
|
28
28
|
# independently from a rails app
|
29
29
|
spec.add_dependency 'activesupport', '>= 4.2'
|
30
30
|
|
@@ -3,8 +3,7 @@ module CDMBL
|
|
3
3
|
class BatchDeleterWorker
|
4
4
|
include Sidekiq::Worker
|
5
5
|
attr_reader :start, :prefix, :oai_url, :solr_url
|
6
|
-
|
7
|
-
sidekiq_options :backtrace => true
|
6
|
+
attr_writer :batch_deleter_klass, :oai_client, :solr_client
|
8
7
|
def perform(start = 0, prefix = '', oai_url = '', solr_url = '')
|
9
8
|
@start = start
|
10
9
|
@prefix = prefix
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module CDMBL
|
2
|
+
# Takes a list of record id/collection data, uses CompoundLookup to
|
3
|
+
# identify records with large numbers of compounds and sorts them
|
4
|
+
# into a large and a small heap
|
5
|
+
class CompoundFilter
|
6
|
+
attr_reader :record_ids,
|
7
|
+
:max_compounds,
|
8
|
+
:cdm_endpoint,
|
9
|
+
:compound_lookup_klass
|
10
|
+
def initialize(record_ids: [],
|
11
|
+
max_compounds: 10,
|
12
|
+
cdm_endpoint: '',
|
13
|
+
compound_lookup_klass: CompoundLookup)
|
14
|
+
@record_ids = record_ids
|
15
|
+
@max_compounds = max_compounds
|
16
|
+
@cdm_endpoint = cdm_endpoint
|
17
|
+
@compound_lookup_klass = compound_lookup_klass
|
18
|
+
end
|
19
|
+
|
20
|
+
def filter(large: true)
|
21
|
+
ids(records.select { |record| record[:large] == large })
|
22
|
+
end
|
23
|
+
|
24
|
+
private
|
25
|
+
|
26
|
+
def ids(records)
|
27
|
+
records.map { |record| record[:id] }
|
28
|
+
end
|
29
|
+
|
30
|
+
def records
|
31
|
+
@records ||= record_ids.map do |identifier|
|
32
|
+
{
|
33
|
+
large: count(*identifier) >= max_compounds,
|
34
|
+
id: identifier
|
35
|
+
}
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def count(collection, id)
|
40
|
+
compound_lookup_klass.new(cdm_endpoint: cdm_endpoint,
|
41
|
+
collection: collection,
|
42
|
+
id: id).count
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module CDMBL
|
2
|
+
# Fetching the full metadata for compound records is expensive. This class
|
3
|
+
# lets us check how many compounds a CDM record has so that we know whether to process it individually or in a batch
|
4
|
+
class CompoundLookup
|
5
|
+
attr_reader :cdm_endpoint,
|
6
|
+
:collection,
|
7
|
+
:id,
|
8
|
+
:request_klass,
|
9
|
+
:service_klass
|
10
|
+
|
11
|
+
def initialize(cdm_endpoint: '',
|
12
|
+
collection: '',
|
13
|
+
id: '',
|
14
|
+
request_klass: CONTENTdmAPI::Request,
|
15
|
+
service_klass: CONTENTdmAPI::Service)
|
16
|
+
@cdm_endpoint = cdm_endpoint
|
17
|
+
@collection = collection
|
18
|
+
@id = id
|
19
|
+
@request_klass = request_klass
|
20
|
+
@service_klass = service_klass
|
21
|
+
end
|
22
|
+
|
23
|
+
def count
|
24
|
+
page.respond_to?(:length) ? page.length : 0
|
25
|
+
end
|
26
|
+
|
27
|
+
private
|
28
|
+
|
29
|
+
def page
|
30
|
+
JSON.parse(request).fetch('page', [])
|
31
|
+
end
|
32
|
+
|
33
|
+
def service
|
34
|
+
@service ||= service_klass.new(function: 'dmGetCompoundObjectInfo',
|
35
|
+
params: [collection, id])
|
36
|
+
end
|
37
|
+
|
38
|
+
def request
|
39
|
+
@request ||= request_klass.new(base_url: cdm_endpoint,
|
40
|
+
service: service).fetch
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
data/lib/cdmbl/default_solr.rb
CHANGED
data/lib/cdmbl/etl_worker.rb
CHANGED
@@ -1,89 +1,140 @@
|
|
1
1
|
require 'sidekiq'
|
2
2
|
module CDMBL
|
3
|
+
# Extract records from OAI, delete records marked for deletion, sort the
|
4
|
+
# remaining records into "big and small" record piles based upon how many
|
5
|
+
# compounds a record has, chunk the small records into batches and the big
|
6
|
+
# records individually and then send these records to a transformation worker
|
3
7
|
class ETLWorker
|
4
8
|
include Sidekiq::Worker
|
5
|
-
|
6
|
-
|
7
|
-
:
|
9
|
+
attr_reader :config,
|
10
|
+
:solr_config,
|
11
|
+
:cdm_endpoint,
|
12
|
+
:oai_endpoint,
|
13
|
+
:field_mappings,
|
14
|
+
:resumption_token,
|
15
|
+
:set_spec,
|
16
|
+
:max_compounds,
|
8
17
|
:batch_size,
|
9
|
-
:is_recursive
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
@
|
23
|
-
@
|
24
|
-
@
|
25
|
-
@
|
26
|
-
@
|
27
|
-
|
28
|
-
|
29
|
-
|
18
|
+
:is_recursive
|
19
|
+
|
20
|
+
attr_writer :compound_filter_klass,
|
21
|
+
:extractor_klass,
|
22
|
+
:etl_worker_klass,
|
23
|
+
:load_worker_klass,
|
24
|
+
:completed_callback_klass,
|
25
|
+
:transform_worker_klass
|
26
|
+
|
27
|
+
def perform(config)
|
28
|
+
# Sidekiq stores params in JSON, so we can't inject dependencies. This
|
29
|
+
# results in the long set of arguments that follows. Otherwise, we'd
|
30
|
+
# simply inject the OAI request and extractor objects
|
31
|
+
@config = config
|
32
|
+
@solr_config = config.fetch('solr_config').symbolize_keys
|
33
|
+
@cdm_endpoint = config.fetch('cdm_endpoint')
|
34
|
+
@oai_endpoint = config.fetch('oai_endpoint')
|
35
|
+
@field_mappings = config.fetch('field_mappings', false)
|
36
|
+
@resumption_token = config.fetch('resumption_token', nil)
|
37
|
+
@set_spec = config.fetch('set_spec', nil)
|
38
|
+
@max_compounds = config.fetch('max_compounds', 10)
|
39
|
+
@batch_size = config.fetch('batch_size', 5).to_i
|
40
|
+
@is_recursive = config.fetch('is_recursive', true)
|
41
|
+
extract_batch!
|
42
|
+
next_batch!
|
43
|
+
end
|
44
|
+
|
45
|
+
# Because Sidekiq serializes params to JSON, we provide custom setters
|
46
|
+
# for dependencies (normally these would be default params in the
|
47
|
+
# constructor) so that they may be mocked and tested
|
48
|
+
def completed_callback_klass
|
49
|
+
@completed_callback_klass ||= CDMBL::CompletedCallback
|
50
|
+
end
|
51
|
+
|
52
|
+
def etl_worker_klass
|
53
|
+
@etl_worker_klass ||= ETLWorker
|
54
|
+
end
|
55
|
+
|
56
|
+
def compound_filter_klass
|
57
|
+
@compound_filter_klass ||= CompoundFilter
|
58
|
+
end
|
59
|
+
|
60
|
+
def extractor_klass
|
61
|
+
@extractor_klass ||= Extractor
|
62
|
+
end
|
63
|
+
|
64
|
+
def load_worker_klass
|
65
|
+
@load_worker_klass ||= LoadWorker
|
66
|
+
end
|
67
|
+
|
68
|
+
def transform_worker_klass
|
69
|
+
@transform_worker_klass ||= TransformWorker
|
70
|
+
end
|
71
|
+
|
72
|
+
# Recurse through OAI batches one at a time
|
73
|
+
def next_batch!
|
74
|
+
if next_resumption_token && is_recursive
|
75
|
+
etl_worker_klass.perform_async(next_config)
|
30
76
|
else
|
31
|
-
|
32
|
-
if extraction.next_resumption_token && is_recursive
|
33
|
-
# Call the next batch of records
|
34
|
-
ETLWorker.perform_async(solr_config, next_etl_config, batch_size)
|
35
|
-
else
|
36
|
-
CDMBL::CompletedCallback.call!(solr_client)
|
37
|
-
end
|
77
|
+
completed_callback_klass.call!(solr_config)
|
38
78
|
end
|
39
79
|
end
|
40
80
|
|
41
81
|
private
|
42
82
|
|
43
|
-
#
|
44
|
-
def
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
ids,
|
53
|
-
delete_ids)
|
54
|
-
sent_deleted = true
|
55
|
-
end
|
83
|
+
# Extract an oai response - a batch of records
|
84
|
+
def extract_batch!
|
85
|
+
# Delete records that OAI has marked for deletion
|
86
|
+
delete_deletables!
|
87
|
+
# Records with few compounds are processed in batches
|
88
|
+
transform_small_records!
|
89
|
+
# Large records are all transformed and loaded one by one to avoid
|
90
|
+
# timeouts
|
91
|
+
transform_large_records!
|
56
92
|
end
|
57
93
|
|
58
|
-
def
|
59
|
-
|
60
|
-
etl_run.load!(deletables, transformation.records)
|
94
|
+
def next_config
|
95
|
+
config.merge(resumption_token: next_resumption_token)
|
61
96
|
end
|
62
97
|
|
63
|
-
def
|
64
|
-
@
|
98
|
+
def next_resumption_token
|
99
|
+
@next_resumption_token ||= extraction.next_resumption_token
|
65
100
|
end
|
66
101
|
|
67
|
-
def
|
68
|
-
|
69
|
-
|
102
|
+
def transform_small_records!
|
103
|
+
compound_filter.filter(large: false).each_slice(batch_size) do |ids|
|
104
|
+
transform!(ids)
|
70
105
|
end
|
71
106
|
end
|
72
107
|
|
73
|
-
def
|
74
|
-
|
108
|
+
def transform_large_records!
|
109
|
+
compound_filter.filter(large: true).each do |id|
|
110
|
+
transform!([id])
|
111
|
+
end
|
75
112
|
end
|
76
113
|
|
77
|
-
def
|
78
|
-
|
114
|
+
def transform!(ids)
|
115
|
+
transform_worker_klass.perform_async(ids,
|
116
|
+
solr_config,
|
117
|
+
cdm_endpoint,
|
118
|
+
oai_endpoint,
|
119
|
+
field_mappings)
|
79
120
|
end
|
80
121
|
|
81
|
-
def
|
82
|
-
|
122
|
+
def delete_deletables!
|
123
|
+
load_worker_klass.perform_async([], extraction.deletable_ids, solr_config)
|
83
124
|
end
|
84
125
|
|
85
|
-
def
|
86
|
-
|
126
|
+
def compound_filter
|
127
|
+
@compound_filter ||=
|
128
|
+
compound_filter_klass.new(record_ids: extraction.local_identifiers,
|
129
|
+
cdm_endpoint: cdm_endpoint,
|
130
|
+
max_compounds: max_compounds)
|
131
|
+
end
|
132
|
+
|
133
|
+
def extraction
|
134
|
+
@extraction ||=
|
135
|
+
extractor_klass.new(oai_endpoint: oai_endpoint,
|
136
|
+
resumption_token: resumption_token,
|
137
|
+
set_spec: set_spec)
|
87
138
|
end
|
88
139
|
end
|
89
|
-
end
|
140
|
+
end
|
@@ -0,0 +1,141 @@
|
|
1
|
+
# require 'sidekiq'
|
2
|
+
# module CDMBL
|
3
|
+
# # Extract records from OAI, delete records marked for deletion, sort the
|
4
|
+
# # remaining records into "big and small" record piles based upon how many
|
5
|
+
# # compounds a record has, chunk the small records into batches and the big
|
6
|
+
# # records individually and then send these records to a transformation worker
|
7
|
+
# class ExtractWorker
|
8
|
+
# include Sidekiq::Worker
|
9
|
+
|
10
|
+
# attr_reader :config,
|
11
|
+
# :solr_config,
|
12
|
+
# :cdm_endpoint,
|
13
|
+
# :oai_endpoint,
|
14
|
+
# :field_mappings,
|
15
|
+
# :resumption_token,
|
16
|
+
# :set_spec,
|
17
|
+
# :max_compounds,
|
18
|
+
# :batch_size,
|
19
|
+
# :is_recursive
|
20
|
+
|
21
|
+
# attr_writer :compound_filter_klass,
|
22
|
+
# :extractor_klass,
|
23
|
+
# :extraction_worker_klass,
|
24
|
+
# :load_worker_klass,
|
25
|
+
# :completed_callback_klass,
|
26
|
+
# :transform_worker_klass
|
27
|
+
|
28
|
+
# def perform(config)
|
29
|
+
# # Sidekiq stores params in JSON, so we can't inject dependencies. This
|
30
|
+
# # results in the long set of arguments that follows. Otherwise, we'd
|
31
|
+
# # simply inject the OAI request and extractor objects
|
32
|
+
# @config = config
|
33
|
+
# @solr_config = config.fetch('solr_config').symbolize_keys
|
34
|
+
# @cdm_endpoint = config.fetch('cdm_endpoint')
|
35
|
+
# @oai_endpoint = config.fetch('oai_endpoint')
|
36
|
+
# @field_mappings = config.fetch('field_mappings', [])
|
37
|
+
# @resumption_token = config.fetch('resumption_token', nil)
|
38
|
+
# @set_spec = config.fetch('set_spec', nil)
|
39
|
+
# @max_compounds = config.fetch('max_compounds', 10)
|
40
|
+
# @batch_size = config.fetch('batch_size', 5).to_i
|
41
|
+
# @is_recursive = config.fetch('is_recursive', true)
|
42
|
+
# extract_batch!
|
43
|
+
# next_batch!
|
44
|
+
# end
|
45
|
+
|
46
|
+
# # Because Sidekiq serializes params to JSON, we provide custom setters
|
47
|
+
# # for dependencies (normally these would be default params in the
|
48
|
+
# # constructor) so that they may be mocked and tested
|
49
|
+
# def completed_callback_klass
|
50
|
+
# @completed_callback_klass ||= CDMBL::CompletedCallback
|
51
|
+
# end
|
52
|
+
|
53
|
+
# def extraction_worker_klass
|
54
|
+
# @extraction_worker_klass ||= ExtractionWorker
|
55
|
+
# end
|
56
|
+
|
57
|
+
# def compound_filter_klass
|
58
|
+
# @compound_filter_klass ||= CompoundFilter
|
59
|
+
# end
|
60
|
+
|
61
|
+
# def extractor_klass
|
62
|
+
# @extractor_klass ||= Extractor
|
63
|
+
# end
|
64
|
+
|
65
|
+
# def load_worker_klass
|
66
|
+
# @load_worker_klass ||= LoadWorker
|
67
|
+
# end
|
68
|
+
|
69
|
+
# def transform_worker_klass
|
70
|
+
# @transform_worker_klass ||= TransformWorker
|
71
|
+
# end
|
72
|
+
|
73
|
+
# # Recurse through OAI batches one at a time
|
74
|
+
# def next_batch!
|
75
|
+
# if next_resumption_token && is_recursive
|
76
|
+
# extraction_worker_klass.perform_async(next_config)
|
77
|
+
# else
|
78
|
+
# completed_callback_klass.call!(solr_config)
|
79
|
+
# end
|
80
|
+
# end
|
81
|
+
|
82
|
+
# private
|
83
|
+
|
84
|
+
# # Extract an oai response - a batch of records
|
85
|
+
# def extract_batch!
|
86
|
+
# # Delete records that OAI has marked for deletion
|
87
|
+
# delete_deletables!
|
88
|
+
# # Records with few compounds are processed in batches
|
89
|
+
# transform_small_records!
|
90
|
+
# # Large records are all transformed and loaded one by one to avoid
|
91
|
+
# # timeouts
|
92
|
+
# transform_large_records!
|
93
|
+
# end
|
94
|
+
|
95
|
+
# def next_config
|
96
|
+
# config.merge(resumption_token: next_resumption_token)
|
97
|
+
# end
|
98
|
+
|
99
|
+
# def next_resumption_token
|
100
|
+
# @next_resumption_token ||= extraction.next_resumption_token
|
101
|
+
# end
|
102
|
+
|
103
|
+
# def transform_small_records!
|
104
|
+
# compound_filter.filter(large: false).each_slice(batch_size) do |ids|
|
105
|
+
# transform!(ids)
|
106
|
+
# end
|
107
|
+
# end
|
108
|
+
|
109
|
+
# def transform_large_records!
|
110
|
+
# compound_filter.filter(large: true).each do |id|
|
111
|
+
# transform!([id])
|
112
|
+
# end
|
113
|
+
# end
|
114
|
+
|
115
|
+
# def transform!(ids)
|
116
|
+
# transform_worker_klass.perform_async(ids,
|
117
|
+
# solr_config,
|
118
|
+
# cdm_endpoint,
|
119
|
+
# oai_endpoint,
|
120
|
+
# field_mappings)
|
121
|
+
# end
|
122
|
+
|
123
|
+
# def delete_deletables!
|
124
|
+
# load_worker_klass.perform_async([], extraction.deletable_ids, solr_config)
|
125
|
+
# end
|
126
|
+
|
127
|
+
# def compound_filter
|
128
|
+
# @compound_filter ||=
|
129
|
+
# compound_filter_klass.new(record_ids: extraction.local_identifiers,
|
130
|
+
# cdm_endpoint: cdm_endpoint,
|
131
|
+
# max_compounds: max_compounds)
|
132
|
+
# end
|
133
|
+
|
134
|
+
# def extraction
|
135
|
+
# @extraction ||=
|
136
|
+
# extractor_klass.new(oai_endpoint: oai_endpoint,
|
137
|
+
# resumption_token: resumption_token,
|
138
|
+
# set_spec: set_spec)
|
139
|
+
# end
|
140
|
+
# end
|
141
|
+
# end
|
data/lib/cdmbl/extractor.rb
CHANGED
@@ -4,32 +4,27 @@ require 'hash_at_path'
|
|
4
4
|
require 'forwardable'
|
5
5
|
|
6
6
|
module CDMBL
|
7
|
-
#
|
8
|
-
# subsequent passes at the full ContentDM API with identifiers taken from
|
9
|
-
# the contentdm api
|
7
|
+
# Retrieve OAI records and sort them into add/updatables and deletables
|
10
8
|
class Extractor
|
11
9
|
extend ::Forwardable
|
12
10
|
def_delegators :@oai_request, :sets, :identifiers
|
13
11
|
attr_reader :oai_request,
|
14
|
-
:
|
15
|
-
:
|
16
|
-
:
|
17
|
-
:oai_filter
|
12
|
+
:oai_request_klass,
|
13
|
+
:oai_filter_klass,
|
14
|
+
:oai_set_lookup_klass
|
18
15
|
|
19
|
-
def initialize(
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
@
|
26
|
-
@
|
27
|
-
@
|
28
|
-
@
|
29
|
-
|
30
|
-
|
31
|
-
def set_lookup
|
32
|
-
oai_set_lookup.new(oai_sets: sets).keyed
|
16
|
+
def initialize(oai_endpoint: '',
|
17
|
+
resumption_token: nil,
|
18
|
+
set_spec: nil,
|
19
|
+
oai_request_klass: OaiRequest,
|
20
|
+
oai_filter_klass: OAIFilter,
|
21
|
+
oai_set_lookup_klass: OAISetLookup)
|
22
|
+
@oai_request_klass = oai_request_klass
|
23
|
+
@oai_filter_klass = oai_filter_klass
|
24
|
+
@oai_set_lookup_klass = oai_set_lookup_klass
|
25
|
+
@oai_request = oai_requester(oai_endpoint,
|
26
|
+
resumption_token,
|
27
|
+
set_spec)
|
33
28
|
end
|
34
29
|
|
35
30
|
def deletable_ids
|
@@ -44,16 +39,21 @@ module CDMBL
|
|
44
39
|
oai_identifiers.at_path('OAI_PMH/ListIdentifiers/resumptionToken')
|
45
40
|
end
|
46
41
|
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
42
|
+
def oai_ids
|
43
|
+
oai_filter_klass.new(headers: oai_headers)
|
44
|
+
end
|
45
|
+
|
46
|
+
def set_lookup
|
47
|
+
oai_set_lookup_klass.new(oai_sets: sets).keyed
|
51
48
|
end
|
52
49
|
|
53
50
|
private
|
54
51
|
|
55
|
-
def
|
56
|
-
|
52
|
+
def oai_requester(oai_endpoint, resumption_token, set_spec)
|
53
|
+
@oai_requester ||=
|
54
|
+
oai_request_klass.new(base_uri: oai_endpoint,
|
55
|
+
resumption_token: resumption_token,
|
56
|
+
set: set_spec)
|
57
57
|
end
|
58
58
|
|
59
59
|
# Get the local collection and id from an OAI namespaced identifier
|
@@ -67,7 +67,7 @@ module CDMBL
|
|
67
67
|
end
|
68
68
|
|
69
69
|
def oai_identifiers
|
70
|
-
identifiers
|
70
|
+
@oai_identifiers ||= identifiers
|
71
71
|
end
|
72
72
|
end
|
73
|
-
end
|
73
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'sidekiq'
|
2
|
+
module CDMBL
|
3
|
+
# Load Records into a solr index
|
4
|
+
class LoadWorker
|
5
|
+
include Sidekiq::Worker
|
6
|
+
attr_reader :solr_config, :records, :deletables
|
7
|
+
attr_writer :loader_klass, :solr_klass
|
8
|
+
def perform(records = [], deletables = [], solr_config = {})
|
9
|
+
@solr_config = solr_config.symbolize_keys
|
10
|
+
@records = records
|
11
|
+
@deletables = deletables
|
12
|
+
load!
|
13
|
+
end
|
14
|
+
|
15
|
+
def loader_klass
|
16
|
+
@loader_klass ||= Loader
|
17
|
+
end
|
18
|
+
|
19
|
+
def solr_klass
|
20
|
+
@solr_klass ||= DefaultSolr
|
21
|
+
end
|
22
|
+
|
23
|
+
def load!
|
24
|
+
loader_klass.new(records: records,
|
25
|
+
deletable_ids: deletables,
|
26
|
+
solr_client: solr_client).load!
|
27
|
+
end
|
28
|
+
|
29
|
+
private
|
30
|
+
|
31
|
+
def solr_client
|
32
|
+
@solr_client ||= solr_klass.new(solr_config)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
data/lib/cdmbl/oai_request.rb
CHANGED
@@ -4,19 +4,16 @@ module CDMBL
|
|
4
4
|
attr_reader :base_uri,
|
5
5
|
:resumption_token,
|
6
6
|
:client,
|
7
|
-
:from,
|
8
7
|
:set,
|
9
8
|
:identifier
|
10
9
|
def initialize(base_uri: '',
|
11
|
-
resumption_token:
|
12
|
-
|
13
|
-
set: false,
|
10
|
+
resumption_token: nil,
|
11
|
+
set: nil,
|
14
12
|
identifier: '',
|
15
13
|
client: Net::HTTP)
|
16
14
|
@base_uri = base_uri
|
17
15
|
@resumption_token = resumption_token
|
18
16
|
@client = client
|
19
|
-
@from = (from) ? "&from=#{from}" : ''
|
20
17
|
@set = (set) ? "&set=#{set}" : ''
|
21
18
|
@identifier = identifier
|
22
19
|
end
|
@@ -32,7 +29,7 @@ module CDMBL
|
|
32
29
|
private
|
33
30
|
|
34
31
|
def first_batch_uri
|
35
|
-
"#{base_uri}?verb=ListIdentifiers&metadataPrefix=oai_dc#{
|
32
|
+
"#{base_uri}?verb=ListIdentifiers&metadataPrefix=oai_dc#{set}"
|
36
33
|
end
|
37
34
|
|
38
35
|
def batch_uri
|
data/lib/cdmbl/tasks/etl.rake
CHANGED
@@ -2,17 +2,37 @@ require 'cdmbl'
|
|
2
2
|
|
3
3
|
namespace :cdmbl do
|
4
4
|
desc 'Launch a background job to index metadata from CONTENTdm to Solr.'
|
5
|
-
task :
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
5
|
+
task :batch, [
|
6
|
+
:solr_url,
|
7
|
+
:oai_endpoint,
|
8
|
+
:cdm_endpoint,
|
9
|
+
:set_spec,
|
10
|
+
:batch_size,
|
11
|
+
:max_compounds
|
12
|
+
] do |t, args|
|
13
|
+
CDMBL::ETLWorker.perform_async(
|
14
|
+
solr_config: { url: args.fetch(:solr_url) },
|
15
|
+
oai_endpoint: args.fetch(:oai_endpoint),
|
16
|
+
cdm_endpoint: args.fetch(:cdm_endpoint),
|
17
|
+
set_spec: args[:set_spec] != '""' ? args[:set_spec] : nil,
|
18
|
+
batch_size: args.fetch(:batch_size, 10),
|
19
|
+
max_compounds: args.fetch(:max_compounds, 10)
|
20
|
+
)
|
16
21
|
end
|
17
|
-
end
|
18
22
|
|
23
|
+
desc 'Launch a background job to index a single record.'
|
24
|
+
task :record, [
|
25
|
+
:collection,
|
26
|
+
:id,
|
27
|
+
:solr_url,
|
28
|
+
:cdm_endpoint,
|
29
|
+
:oai_endpoint
|
30
|
+
] do |t, args|
|
31
|
+
CDMBL::TransformWorker.perform_async(
|
32
|
+
[[args.fetch(:collection), args.fetch(:id)]],
|
33
|
+
{ url: args.fetch(:solr_url) },
|
34
|
+
args.fetch(:cdm_endpoint),
|
35
|
+
args.fetch(:oai_endpoint)
|
36
|
+
)
|
37
|
+
end
|
38
|
+
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
require 'sidekiq'
|
2
|
+
module CDMBL
|
3
|
+
class TransformWorker
|
4
|
+
include Sidekiq::Worker
|
5
|
+
attr_reader :identifiers,
|
6
|
+
:solr_config,
|
7
|
+
:cdm_endpoint,
|
8
|
+
:oai_endpoint,
|
9
|
+
:field_mappings
|
10
|
+
|
11
|
+
attr_writer :cdm_api_klass,
|
12
|
+
:oai_request_klass,
|
13
|
+
:oai_set_lookup_klass,
|
14
|
+
:cdm_notification_klass,
|
15
|
+
:load_worker_klass,
|
16
|
+
:transformer_klass
|
17
|
+
|
18
|
+
def perform(identifiers,
|
19
|
+
solr_config,
|
20
|
+
cdm_endpoint,
|
21
|
+
oai_endpoint,
|
22
|
+
field_mappings)
|
23
|
+
|
24
|
+
@identifiers = identifiers
|
25
|
+
@solr_config = solr_config
|
26
|
+
@cdm_endpoint = cdm_endpoint
|
27
|
+
@oai_endpoint = oai_endpoint
|
28
|
+
@field_mappings = field_mappings
|
29
|
+
|
30
|
+
transform_and_load!
|
31
|
+
end
|
32
|
+
|
33
|
+
def oai_set_lookup_klass
|
34
|
+
@oai_set_lookup_klass ||= OAISetLookup
|
35
|
+
end
|
36
|
+
|
37
|
+
def oai_request_klass
|
38
|
+
@oai_request_klass ||= OaiRequest
|
39
|
+
end
|
40
|
+
|
41
|
+
def cdm_api_klass
|
42
|
+
@cdm_api_klass ||= CONTENTdmAPI::Item
|
43
|
+
end
|
44
|
+
|
45
|
+
def cdm_notification_klass
|
46
|
+
@cdm_notification_klass ||= CdmNotification
|
47
|
+
end
|
48
|
+
|
49
|
+
def transformer_klass
|
50
|
+
@transformer_klass ||= Transformer
|
51
|
+
end
|
52
|
+
|
53
|
+
def load_worker_klass
|
54
|
+
@load_worker_klass ||= LoadWorker
|
55
|
+
end
|
56
|
+
|
57
|
+
private
|
58
|
+
|
59
|
+
def transform_and_load!
|
60
|
+
load_worker_klass.perform_async(transformed_records, [], solr_config)
|
61
|
+
end
|
62
|
+
|
63
|
+
def transformed_records
|
64
|
+
@transformation ||=
|
65
|
+
transformer_klass.new(cdm_records: records,
|
66
|
+
oai_sets: set_lookup,
|
67
|
+
field_mappings: field_mappings).records
|
68
|
+
end
|
69
|
+
|
70
|
+
def set_lookup
|
71
|
+
oai_set_lookup_klass.new(oai_sets: sets).keyed
|
72
|
+
end
|
73
|
+
|
74
|
+
def records
|
75
|
+
identifiers.map do |identifier|
|
76
|
+
cdm_request(*identifier)
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
# e.g. local_identifiers.map { |identifier| extractor.cdm_request(*identifier) }
|
81
|
+
def cdm_request(collection, id)
|
82
|
+
cdm_notification_klass.call!(collection, id, cdm_endpoint)
|
83
|
+
cdm_api_klass.new(base_url: cdm_endpoint,
|
84
|
+
collection: collection,
|
85
|
+
id: id).metadata
|
86
|
+
end
|
87
|
+
|
88
|
+
def sets
|
89
|
+
@oai_request ||=
|
90
|
+
oai_request_klass.new(base_uri: oai_endpoint).sets
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
data/lib/cdmbl/version.rb
CHANGED
data/lib/cdmbl.rb
CHANGED
@@ -21,4 +21,8 @@ require 'cdmbl/oai_client'
|
|
21
21
|
require 'cdmbl/oai_get_record'
|
22
22
|
require 'cdmbl/oai_deletables'
|
23
23
|
require 'cdmbl/batch_deleter'
|
24
|
-
require 'cdmbl/batch_deleter_worker'
|
24
|
+
require 'cdmbl/batch_deleter_worker'
|
25
|
+
require 'cdmbl/compound_lookup'
|
26
|
+
require 'cdmbl/compound_filter'
|
27
|
+
require 'cdmbl/load_worker'
|
28
|
+
require 'cdmbl/transform_worker'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cdmbl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- chadfennell
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-08-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: hash_at_path
|
@@ -198,6 +198,7 @@ extensions: []
|
|
198
198
|
extra_rdoc_files: []
|
199
199
|
files:
|
200
200
|
- ".gitignore"
|
201
|
+
- ".rubocop.yml"
|
201
202
|
- ".travis.yml"
|
202
203
|
- CODE_OF_CONDUCT.md
|
203
204
|
- Gemfile
|
@@ -210,6 +211,8 @@ files:
|
|
210
211
|
- lib/cdmbl.rb
|
211
212
|
- lib/cdmbl/batch_deleter.rb
|
212
213
|
- lib/cdmbl/batch_deleter_worker.rb
|
214
|
+
- lib/cdmbl/compound_filter.rb
|
215
|
+
- lib/cdmbl/compound_lookup.rb
|
213
216
|
- lib/cdmbl/default_cdm_notification.rb
|
214
217
|
- lib/cdmbl/default_completed_callback.rb
|
215
218
|
- lib/cdmbl/default_loader_notification.rb
|
@@ -217,11 +220,13 @@ files:
|
|
217
220
|
- lib/cdmbl/default_solr.rb
|
218
221
|
- lib/cdmbl/etl_run.rb
|
219
222
|
- lib/cdmbl/etl_worker.rb
|
223
|
+
- lib/cdmbl/extract_worker.rb
|
220
224
|
- lib/cdmbl/extractor.rb
|
221
225
|
- lib/cdmbl/field_formatter.rb
|
222
226
|
- lib/cdmbl/field_transformer.rb
|
223
227
|
- lib/cdmbl/formatters.rb
|
224
228
|
- lib/cdmbl/hooks.rb
|
229
|
+
- lib/cdmbl/load_worker.rb
|
225
230
|
- lib/cdmbl/loader.rb
|
226
231
|
- lib/cdmbl/oai_client.rb
|
227
232
|
- lib/cdmbl/oai_deletables.rb
|
@@ -233,6 +238,8 @@ files:
|
|
233
238
|
- lib/cdmbl/record_transformer.rb
|
234
239
|
- lib/cdmbl/tasks/delete.rake
|
235
240
|
- lib/cdmbl/tasks/etl.rake
|
241
|
+
- lib/cdmbl/tasks/extract.rake
|
242
|
+
- lib/cdmbl/transform_worker.rb
|
236
243
|
- lib/cdmbl/transformer.rb
|
237
244
|
- lib/cdmbl/version.rb
|
238
245
|
- travis.yml
|