cdmbl 0.7.2 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +4 -0
- data/README.md +23 -14
- data/cdmbl.gemspec +1 -1
- data/lib/cdmbl/batch_deleter_worker.rb +1 -2
- data/lib/cdmbl/compound_filter.rb +45 -0
- data/lib/cdmbl/compound_lookup.rb +43 -0
- data/lib/cdmbl/default_solr.rb +0 -1
- data/lib/cdmbl/etl_worker.rb +112 -61
- data/lib/cdmbl/extract_worker.rb +141 -0
- data/lib/cdmbl/extractor.rb +29 -29
- data/lib/cdmbl/load_worker.rb +35 -0
- data/lib/cdmbl/oai_request.rb +3 -6
- data/lib/cdmbl/tasks/etl.rake +32 -12
- data/lib/cdmbl/tasks/extract.rake +9 -0
- data/lib/cdmbl/transform_worker.rb +93 -0
- data/lib/cdmbl/version.rb +1 -1
- data/lib/cdmbl.rb +5 -1
- metadata +9 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 818b769be7195ae37538de0892e275545da57a44
|
4
|
+
data.tar.gz: 5a1a6c25b618ea2cd1d047e3ef3047e29e54db87
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: aba7fdaefa7ca9e8031b9ac42c9384ebd41ca7bff629d978715aa57cb416248c2b4f5e5e234cf223b61406ffa597591e1906584e4657ed39fc60a361641c7c1d
|
7
|
+
data.tar.gz: 7a1e280b31c57cdf98207481f02946bfdaa6abe2dbfebe36ae9722ea9b444da3dec939859d8db66ce01565b476b4e211ec65d679c707a1378c3f882d3fd9e9bd
|
data/.rubocop.yml
ADDED
data/README.md
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
|
5
5
|
Use [Blacklight](https://github.com/projectblacklight/blacklight) as a front end for your CONTENTdm instance.
|
6
6
|
|
7
|
-
At the moment, CDMBL consists only of a micro [ETL](https://en.wikipedia.org/wiki/Extract,_transform,_load) system dedicated to extracting metadata records from a CONTENTdm instance (using the [CONTENTdm API gem](https://github.com/UMNLibraries/contentdm_api), transforming them into Solr documents, and loading them into Solr.
|
7
|
+
At the moment, CDMBL consists only of a micro [ETL](https://en.wikipedia.org/wiki/Extract,_transform,_load) system dedicated to extracting metadata records from a CONTENTdm instance (using the [CONTENTdm API gem](https://github.com/UMNLibraries/contentdm_api)), transforming them into Solr documents, and loading them into Solr.
|
8
8
|
|
9
9
|
## Installation
|
10
10
|
|
@@ -41,22 +41,24 @@ export export GEONAMES_USER="yourusernamehere"
|
|
41
41
|
|
42
42
|
Run the ingester
|
43
43
|
|
44
|
-
rake cdmbl:
|
44
|
+
rake cdmbl:batch[solr_url,oai_endpoint,cdm_endpoint,set_spec, batch_size, max_compounds]
|
45
45
|
|
46
46
|
|Argument| Definition|
|
47
47
|
|--:|---|
|
48
48
|
|solr_url| The full URL to your Solr core instance (same as your blacklight.yml solr url)|
|
49
|
-
|oai_endpoint| A URL to your OAI instance (e.g.
|
49
|
+
|oai_endpoint| A URL to your OAI instance (e.g. https://server16022.contentdm.oclc.org/oai/oai.php) |
|
50
50
|
|cdm_endpoint| A URL to your CONTENTdm API endpoint (e.g. https://server16022.contentdm.oclc.org/dmwebservices/index.php) |
|
51
|
-
|
|
51
|
+
|set_spec| Selectively harvest from a single collection with [setSpec](http://www.openarchives.org/OAI/openarchivesprotocol.html#Set)|
|
52
|
+
|batch_size| The number of records to transform at a time. **Note**: it is within the record transformation process that the CONTENTdm API is requested. This API can be sluggish, so we conservatively transform batches of ten records at a time to prevent timeouts.|
|
53
|
+
|max_compounds| CONTENTdm records with many compounds can take a long time to load from the CONTENTdm API as multiple requests must happen in order to get the metadata for each child record of a parent compound object. For this reason, records with ten or more compound children are, by default, processed in batches of one. This setting allows you to override this behavior.|
|
52
54
|
|
53
55
|
For example:
|
54
56
|
|
55
57
|
```ruby
|
56
|
-
rake "cdmbl:ingest[http://solr:8983/solr/foo-bar-core,
|
58
|
+
rake "cdmbl:batch[http://solr:8983/solr/foo-bar-core, https://server16022.contentdm.oclc.org/oai/oai.php, https://server16022.contentdm.oclc.org/dmwebservices/index.php, my_set_spec, 10, 10]"
|
57
59
|
```
|
58
60
|
|
59
|
-
### Custom Rake
|
61
|
+
### Custom Rake Tasks
|
60
62
|
|
61
63
|
You might also create your own rake task to run your modified field transformers:
|
62
64
|
|
@@ -64,14 +66,21 @@ You might also create your own rake task to run your modified field transformers
|
|
64
66
|
require 'cdmbl'
|
65
67
|
|
66
68
|
namespace :cdmbl do
|
67
|
-
desc
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
69
|
+
desc "ingest batches of records"
|
70
|
+
##
|
71
|
+
# e.g. rake cdmbl:batch[10, my_set_spec]
|
72
|
+
task :batch, [:batch_size, :set_spec] => :environment do |t, args|
|
73
|
+
config =
|
74
|
+
{
|
75
|
+
oai_endpoint: 'http://cdm16022.contentdm.oclc.org/oai/oai.php',
|
76
|
+
cdm_endpoint: 'https://server16022.contentdm.oclc.org/dmwebservices/index.php',
|
77
|
+
set_spec: (args[:set_spec] != '""') ? args[:set_spec] : nil,
|
78
|
+
max_compounds: (args[:max_compounds]) ? args[:max_compounds] : 2,
|
79
|
+
batch_size: (args[:batch_size]) ? args[:batch_size] : 30,
|
80
|
+
solr_config: solr_config
|
81
|
+
}
|
82
|
+
CDMBL::ETLWorker.perform_async(config)
|
83
|
+
end
|
75
84
|
end
|
76
85
|
```
|
77
86
|
### Your Own Custom Solr Field Mappings (see above code snippet)
|
data/cdmbl.gemspec
CHANGED
@@ -24,7 +24,7 @@ Gem::Specification.new do |spec|
|
|
24
24
|
spec.add_dependency 'rsolr', '~> 2.0'
|
25
25
|
# This gem generally wants to be in a rails app, but just to avoid adding
|
26
26
|
# another external dependency for XML processing, we rely on activesupport's
|
27
|
-
# Has.
|
27
|
+
# Hash.from_xml feature for testing and to allow this gem to function
|
28
28
|
# independently from a rails app
|
29
29
|
spec.add_dependency 'activesupport', '>= 4.2'
|
30
30
|
|
@@ -3,8 +3,7 @@ module CDMBL
|
|
3
3
|
class BatchDeleterWorker
|
4
4
|
include Sidekiq::Worker
|
5
5
|
attr_reader :start, :prefix, :oai_url, :solr_url
|
6
|
-
|
7
|
-
sidekiq_options :backtrace => true
|
6
|
+
attr_writer :batch_deleter_klass, :oai_client, :solr_client
|
8
7
|
def perform(start = 0, prefix = '', oai_url = '', solr_url = '')
|
9
8
|
@start = start
|
10
9
|
@prefix = prefix
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module CDMBL
|
2
|
+
# Takes a list of record id/collection data, uses CompoundLookup to
|
3
|
+
# identify records with large numbers of compounds and sorts them
|
4
|
+
# into a large and a small heap
|
5
|
+
class CompoundFilter
|
6
|
+
attr_reader :record_ids,
|
7
|
+
:max_compounds,
|
8
|
+
:cdm_endpoint,
|
9
|
+
:compound_lookup_klass
|
10
|
+
def initialize(record_ids: [],
|
11
|
+
max_compounds: 10,
|
12
|
+
cdm_endpoint: '',
|
13
|
+
compound_lookup_klass: CompoundLookup)
|
14
|
+
@record_ids = record_ids
|
15
|
+
@max_compounds = max_compounds
|
16
|
+
@cdm_endpoint = cdm_endpoint
|
17
|
+
@compound_lookup_klass = compound_lookup_klass
|
18
|
+
end
|
19
|
+
|
20
|
+
def filter(large: true)
|
21
|
+
ids(records.select { |record| record[:large] == large })
|
22
|
+
end
|
23
|
+
|
24
|
+
private
|
25
|
+
|
26
|
+
def ids(records)
|
27
|
+
records.map { |record| record[:id] }
|
28
|
+
end
|
29
|
+
|
30
|
+
def records
|
31
|
+
@records ||= record_ids.map do |identifier|
|
32
|
+
{
|
33
|
+
large: count(*identifier) >= max_compounds,
|
34
|
+
id: identifier
|
35
|
+
}
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def count(collection, id)
|
40
|
+
compound_lookup_klass.new(cdm_endpoint: cdm_endpoint,
|
41
|
+
collection: collection,
|
42
|
+
id: id).count
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module CDMBL
|
2
|
+
# Fetching the full metadata for compound records is expensive. This class
|
3
|
+
# lets us check how many compounds a CDM record has before fetching them.
|
4
|
+
class CompoundLookup
|
5
|
+
attr_reader :cdm_endpoint,
|
6
|
+
:collection,
|
7
|
+
:id,
|
8
|
+
:request_klass,
|
9
|
+
:service_klass
|
10
|
+
|
11
|
+
def initialize(cdm_endpoint: '',
|
12
|
+
collection: '',
|
13
|
+
id: '',
|
14
|
+
request_klass: CONTENTdmAPI::Request,
|
15
|
+
service_klass: CONTENTdmAPI::Service)
|
16
|
+
@cdm_endpoint = cdm_endpoint
|
17
|
+
@collection = collection
|
18
|
+
@id = id
|
19
|
+
@request_klass = request_klass
|
20
|
+
@service_klass = service_klass
|
21
|
+
end
|
22
|
+
|
23
|
+
def count
|
24
|
+
page.respond_to?(:length) ? page.length : 0
|
25
|
+
end
|
26
|
+
|
27
|
+
private
|
28
|
+
|
29
|
+
def page
|
30
|
+
JSON.parse(request).fetch('page', [])
|
31
|
+
end
|
32
|
+
|
33
|
+
def service
|
34
|
+
@service ||= service_klass.new(function: 'dmGetCompoundObjectInfo',
|
35
|
+
params: [collection, id])
|
36
|
+
end
|
37
|
+
|
38
|
+
def request
|
39
|
+
@request ||= request_klass.new(base_url: cdm_endpoint,
|
40
|
+
service: service).fetch
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
data/lib/cdmbl/default_solr.rb
CHANGED
data/lib/cdmbl/etl_worker.rb
CHANGED
@@ -1,89 +1,140 @@
|
|
1
1
|
require 'sidekiq'
|
2
2
|
module CDMBL
|
3
|
+
# Extract records from OAI, delete records marked for deletion, sort the
|
4
|
+
# remaining records into "big and small" record piles based upon how many
|
5
|
+
# compounds a record has, chunk the small records into batches and the big
|
6
|
+
# records individually, and then send these records to a transformation worker
|
3
7
|
class ETLWorker
|
4
8
|
include Sidekiq::Worker
|
5
|
-
|
6
|
-
|
7
|
-
:
|
9
|
+
attr_reader :config,
|
10
|
+
:solr_config,
|
11
|
+
:cdm_endpoint,
|
12
|
+
:oai_endpoint,
|
13
|
+
:field_mappings,
|
14
|
+
:resumption_token,
|
15
|
+
:set_spec,
|
16
|
+
:max_compounds,
|
8
17
|
:batch_size,
|
9
|
-
:is_recursive
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
@
|
23
|
-
@
|
24
|
-
@
|
25
|
-
@
|
26
|
-
@
|
27
|
-
|
28
|
-
|
29
|
-
|
18
|
+
:is_recursive
|
19
|
+
|
20
|
+
attr_writer :compound_filter_klass,
|
21
|
+
:extractor_klass,
|
22
|
+
:etl_worker_klass,
|
23
|
+
:load_worker_klass,
|
24
|
+
:completed_callback_klass,
|
25
|
+
:transform_worker_klass
|
26
|
+
|
27
|
+
def perform(config)
|
28
|
+
# Sidekiq stores params in JSON, so we can't inject dependencies. This
|
29
|
+
# results in the long set of arguments that follows. Otherwise, we'd
|
30
|
+
# simply inject the OAI request and extractor objects
|
31
|
+
@config = config
|
32
|
+
@solr_config = config.fetch('solr_config').symbolize_keys
|
33
|
+
@cdm_endpoint = config.fetch('cdm_endpoint')
|
34
|
+
@oai_endpoint = config.fetch('oai_endpoint')
|
35
|
+
@field_mappings = config.fetch('field_mappings', false)
|
36
|
+
@resumption_token = config.fetch('resumption_token', nil)
|
37
|
+
@set_spec = config.fetch('set_spec', nil)
|
38
|
+
@max_compounds = config.fetch('max_compounds', 10)
|
39
|
+
@batch_size = config.fetch('batch_size', 5).to_i
|
40
|
+
@is_recursive = config.fetch('is_recursive', true)
|
41
|
+
extract_batch!
|
42
|
+
next_batch!
|
43
|
+
end
|
44
|
+
|
45
|
+
# Because Sidekiq serializes params to JSON, we provide custom setters
|
46
|
+
# for dependencies (normally these would be default params in the
|
47
|
+
# constructor) so that they may be mocked and tested
|
48
|
+
def completed_callback_klass
|
49
|
+
@completed_callback_klass ||= CDMBL::CompletedCallback
|
50
|
+
end
|
51
|
+
|
52
|
+
def etl_worker_klass
|
53
|
+
@etl_worker_klass ||= ETLWorker
|
54
|
+
end
|
55
|
+
|
56
|
+
def compound_filter_klass
|
57
|
+
@compound_filter_klass ||= CompoundFilter
|
58
|
+
end
|
59
|
+
|
60
|
+
def extractor_klass
|
61
|
+
@extractor_klass ||= Extractor
|
62
|
+
end
|
63
|
+
|
64
|
+
def load_worker_klass
|
65
|
+
@load_worker_klass ||= LoadWorker
|
66
|
+
end
|
67
|
+
|
68
|
+
def transform_worker_klass
|
69
|
+
@transform_worker_klass ||= TransformWorker
|
70
|
+
end
|
71
|
+
|
72
|
+
# Recurse through OAI batches one at a time
|
73
|
+
def next_batch!
|
74
|
+
if next_resumption_token && is_recursive
|
75
|
+
etl_worker_klass.perform_async(next_config)
|
30
76
|
else
|
31
|
-
|
32
|
-
if extraction.next_resumption_token && is_recursive
|
33
|
-
# Call the next batch of records
|
34
|
-
ETLWorker.perform_async(solr_config, next_etl_config, batch_size)
|
35
|
-
else
|
36
|
-
CDMBL::CompletedCallback.call!(solr_client)
|
37
|
-
end
|
77
|
+
completed_callback_klass.call!(solr_config)
|
38
78
|
end
|
39
79
|
end
|
40
80
|
|
41
81
|
private
|
42
82
|
|
43
|
-
#
|
44
|
-
def
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
ids,
|
53
|
-
delete_ids)
|
54
|
-
sent_deleted = true
|
55
|
-
end
|
83
|
+
# Extract an oai response - a batch of records
|
84
|
+
def extract_batch!
|
85
|
+
# Delete records that OAI has marked for deletion
|
86
|
+
delete_deletables!
|
87
|
+
# Records with few compounds are processed in batches
|
88
|
+
transform_small_records!
|
89
|
+
# Large records are all transformed and loaded one by one to avoid
|
90
|
+
# timeouts
|
91
|
+
transform_large_records!
|
56
92
|
end
|
57
93
|
|
58
|
-
def
|
59
|
-
|
60
|
-
etl_run.load!(deletables, transformation.records)
|
94
|
+
def next_config
|
95
|
+
config.merge(resumption_token: next_resumption_token)
|
61
96
|
end
|
62
97
|
|
63
|
-
def
|
64
|
-
@
|
98
|
+
def next_resumption_token
|
99
|
+
@next_resumption_token ||= extraction.next_resumption_token
|
65
100
|
end
|
66
101
|
|
67
|
-
def
|
68
|
-
|
69
|
-
|
102
|
+
def transform_small_records!
|
103
|
+
compound_filter.filter(large: false).each_slice(batch_size) do |ids|
|
104
|
+
transform!(ids)
|
70
105
|
end
|
71
106
|
end
|
72
107
|
|
73
|
-
def
|
74
|
-
|
108
|
+
def transform_large_records!
|
109
|
+
compound_filter.filter(large: true).each do |id|
|
110
|
+
transform!([id])
|
111
|
+
end
|
75
112
|
end
|
76
113
|
|
77
|
-
def
|
78
|
-
|
114
|
+
def transform!(ids)
|
115
|
+
transform_worker_klass.perform_async(ids,
|
116
|
+
solr_config,
|
117
|
+
cdm_endpoint,
|
118
|
+
oai_endpoint,
|
119
|
+
field_mappings)
|
79
120
|
end
|
80
121
|
|
81
|
-
def
|
82
|
-
|
122
|
+
def delete_deletables!
|
123
|
+
load_worker_klass.perform_async([], extraction.deletable_ids, solr_config)
|
83
124
|
end
|
84
125
|
|
85
|
-
def
|
86
|
-
|
126
|
+
def compound_filter
|
127
|
+
@compound_filter ||=
|
128
|
+
compound_filter_klass.new(record_ids: extraction.local_identifiers,
|
129
|
+
cdm_endpoint: cdm_endpoint,
|
130
|
+
max_compounds: max_compounds)
|
131
|
+
end
|
132
|
+
|
133
|
+
def extraction
|
134
|
+
@extraction ||=
|
135
|
+
extractor_klass.new(oai_endpoint: oai_endpoint,
|
136
|
+
resumption_token: resumption_token,
|
137
|
+
set_spec: set_spec)
|
87
138
|
end
|
88
139
|
end
|
89
|
-
end
|
140
|
+
end
|
@@ -0,0 +1,141 @@
|
|
1
|
+
# require 'sidekiq'
|
2
|
+
# module CDMBL
|
3
|
+
# # Extract records from OAI, delete records marked for deletion, sort the
|
4
|
+
# # remaining records into "big and small" record piles based upon how many
|
5
|
+
# # compounds a record has, chunk the small records into batches and the big
|
6
|
+
# # records individually, and then send these records to a transformation worker
|
7
|
+
# class ExtractWorker
|
8
|
+
# include Sidekiq::Worker
|
9
|
+
|
10
|
+
# attr_reader :config,
|
11
|
+
# :solr_config,
|
12
|
+
# :cdm_endpoint,
|
13
|
+
# :oai_endpoint,
|
14
|
+
# :field_mappings,
|
15
|
+
# :resumption_token,
|
16
|
+
# :set_spec,
|
17
|
+
# :max_compounds,
|
18
|
+
# :batch_size,
|
19
|
+
# :is_recursive
|
20
|
+
|
21
|
+
# attr_writer :compound_filter_klass,
|
22
|
+
# :extractor_klass,
|
23
|
+
# :extraction_worker_klass,
|
24
|
+
# :load_worker_klass,
|
25
|
+
# :completed_callback_klass,
|
26
|
+
# :transform_worker_klass
|
27
|
+
|
28
|
+
# def perform(config)
|
29
|
+
# # Sidekiq stores params in JSON, so we can't inject dependencies. This
|
30
|
+
# # results in the long set of arguments that follows. Otherwise, we'd
|
31
|
+
# # simply inject the OAI request and extractor objects
|
32
|
+
# @config = config
|
33
|
+
# @solr_config = config.fetch('solr_config').symbolize_keys
|
34
|
+
# @cdm_endpoint = config.fetch('cdm_endpoint')
|
35
|
+
# @oai_endpoint = config.fetch('oai_endpoint')
|
36
|
+
# @field_mappings = config.fetch('field_mappings', [])
|
37
|
+
# @resumption_token = config.fetch('resumption_token', nil)
|
38
|
+
# @set_spec = config.fetch('set_spec', nil)
|
39
|
+
# @max_compounds = config.fetch('max_compounds', 10)
|
40
|
+
# @batch_size = config.fetch('batch_size', 5).to_i
|
41
|
+
# @is_recursive = config.fetch('is_recursive', true)
|
42
|
+
# extract_batch!
|
43
|
+
# next_batch!
|
44
|
+
# end
|
45
|
+
|
46
|
+
# # Because Sidekiq serializes params to JSON, we provide custom setters
|
47
|
+
# # for dependencies (normally these would be default params in the
|
48
|
+
# # constructor) so that they may be mocked and tested
|
49
|
+
# def completed_callback_klass
|
50
|
+
# @completed_callback_klass ||= CDMBL::CompletedCallback
|
51
|
+
# end
|
52
|
+
|
53
|
+
# def extraction_worker_klass
|
54
|
+
# @extraction_worker_klass ||= ExtractionWorker
|
55
|
+
# end
|
56
|
+
|
57
|
+
# def compound_filter_klass
|
58
|
+
# @compound_filter_klass ||= CompoundFilter
|
59
|
+
# end
|
60
|
+
|
61
|
+
# def extractor_klass
|
62
|
+
# @extractor_klass ||= Extractor
|
63
|
+
# end
|
64
|
+
|
65
|
+
# def load_worker_klass
|
66
|
+
# @load_worker_klass ||= LoadWorker
|
67
|
+
# end
|
68
|
+
|
69
|
+
# def transform_worker_klass
|
70
|
+
# @transform_worker_klass ||= TransformWorker
|
71
|
+
# end
|
72
|
+
|
73
|
+
# # Recurse through OAI batches one at a time
|
74
|
+
# def next_batch!
|
75
|
+
# if next_resumption_token && is_recursive
|
76
|
+
# extraction_worker_klass.perform_async(next_config)
|
77
|
+
# else
|
78
|
+
# completed_callback_klass.call!(solr_config)
|
79
|
+
# end
|
80
|
+
# end
|
81
|
+
|
82
|
+
# private
|
83
|
+
|
84
|
+
# # Extract an oai response - a batch of records
|
85
|
+
# def extract_batch!
|
86
|
+
# # Delete records that OAI has marked for deletion
|
87
|
+
# delete_deletables!
|
88
|
+
# # Records with few compounds are processed in batches
|
89
|
+
# transform_small_records!
|
90
|
+
# # Large records are all transformed and loaded one by one to avoid
|
91
|
+
# # timeouts
|
92
|
+
# transform_large_records!
|
93
|
+
# end
|
94
|
+
|
95
|
+
# def next_config
|
96
|
+
# config.merge(resumption_token: next_resumption_token)
|
97
|
+
# end
|
98
|
+
|
99
|
+
# def next_resumption_token
|
100
|
+
# @next_resumption_token ||= extraction.next_resumption_token
|
101
|
+
# end
|
102
|
+
|
103
|
+
# def transform_small_records!
|
104
|
+
# compound_filter.filter(large: false).each_slice(batch_size) do |ids|
|
105
|
+
# transform!(ids)
|
106
|
+
# end
|
107
|
+
# end
|
108
|
+
|
109
|
+
# def transform_large_records!
|
110
|
+
# compound_filter.filter(large: true).each do |id|
|
111
|
+
# transform!([id])
|
112
|
+
# end
|
113
|
+
# end
|
114
|
+
|
115
|
+
# def transform!(ids)
|
116
|
+
# transform_worker_klass.perform_async(ids,
|
117
|
+
# solr_config,
|
118
|
+
# cdm_endpoint,
|
119
|
+
# oai_endpoint,
|
120
|
+
# field_mappings)
|
121
|
+
# end
|
122
|
+
|
123
|
+
# def delete_deletables!
|
124
|
+
# load_worker_klass.perform_async([], extraction.deletable_ids, solr_config)
|
125
|
+
# end
|
126
|
+
|
127
|
+
# def compound_filter
|
128
|
+
# @compound_filter ||=
|
129
|
+
# compound_filter_klass.new(record_ids: extraction.local_identifiers,
|
130
|
+
# cdm_endpoint: cdm_endpoint,
|
131
|
+
# max_compounds: max_compounds)
|
132
|
+
# end
|
133
|
+
|
134
|
+
# def extraction
|
135
|
+
# @extraction ||=
|
136
|
+
# extractor_klass.new(oai_endpoint: oai_endpoint,
|
137
|
+
# resumption_token: resumption_token,
|
138
|
+
# set_spec: set_spec)
|
139
|
+
# end
|
140
|
+
# end
|
141
|
+
# end
|
data/lib/cdmbl/extractor.rb
CHANGED
@@ -4,32 +4,27 @@ require 'hash_at_path'
|
|
4
4
|
require 'forwardable'
|
5
5
|
|
6
6
|
module CDMBL
|
7
|
-
#
|
8
|
-
# subsequent passes at the full ContentDM API with identifiers taken from
|
9
|
-
# the contentdm api
|
7
|
+
# Retrieve OAI records and sort them into add/updatables and deletables
|
10
8
|
class Extractor
|
11
9
|
extend ::Forwardable
|
12
10
|
def_delegators :@oai_request, :sets, :identifiers
|
13
11
|
attr_reader :oai_request,
|
14
|
-
:
|
15
|
-
:
|
16
|
-
:
|
17
|
-
:oai_filter
|
12
|
+
:oai_request_klass,
|
13
|
+
:oai_filter_klass,
|
14
|
+
:oai_set_lookup_klass
|
18
15
|
|
19
|
-
def initialize(
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
@
|
26
|
-
@
|
27
|
-
@
|
28
|
-
@
|
29
|
-
|
30
|
-
|
31
|
-
def set_lookup
|
32
|
-
oai_set_lookup.new(oai_sets: sets).keyed
|
16
|
+
def initialize(oai_endpoint: '',
|
17
|
+
resumption_token: nil,
|
18
|
+
set_spec: nil,
|
19
|
+
oai_request_klass: OaiRequest,
|
20
|
+
oai_filter_klass: OAIFilter,
|
21
|
+
oai_set_lookup_klass: OAISetLookup)
|
22
|
+
@oai_request_klass = oai_request_klass
|
23
|
+
@oai_filter_klass = oai_filter_klass
|
24
|
+
@oai_set_lookup_klass = oai_set_lookup_klass
|
25
|
+
@oai_request = oai_requester(oai_endpoint,
|
26
|
+
resumption_token,
|
27
|
+
set_spec)
|
33
28
|
end
|
34
29
|
|
35
30
|
def deletable_ids
|
@@ -44,16 +39,21 @@ module CDMBL
|
|
44
39
|
oai_identifiers.at_path('OAI_PMH/ListIdentifiers/resumptionToken')
|
45
40
|
end
|
46
41
|
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
42
|
+
def oai_ids
|
43
|
+
oai_filter_klass.new(headers: oai_headers)
|
44
|
+
end
|
45
|
+
|
46
|
+
def set_lookup
|
47
|
+
oai_set_lookup_klass.new(oai_sets: sets).keyed
|
51
48
|
end
|
52
49
|
|
53
50
|
private
|
54
51
|
|
55
|
-
def
|
56
|
-
|
52
|
+
def oai_requester(oai_endpoint, resumption_token, set_spec)
|
53
|
+
@oai_requester ||=
|
54
|
+
oai_request_klass.new(base_uri: oai_endpoint,
|
55
|
+
resumption_token: resumption_token,
|
56
|
+
set: set_spec)
|
57
57
|
end
|
58
58
|
|
59
59
|
# Get the local collection and id from an OAI namespaced identifier
|
@@ -67,7 +67,7 @@ module CDMBL
|
|
67
67
|
end
|
68
68
|
|
69
69
|
def oai_identifiers
|
70
|
-
identifiers
|
70
|
+
@oai_identifiers ||= identifiers
|
71
71
|
end
|
72
72
|
end
|
73
|
-
end
|
73
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'sidekiq'
|
2
|
+
module CDMBL
|
3
|
+
# Load Records into a solr index
|
4
|
+
class LoadWorker
|
5
|
+
include Sidekiq::Worker
|
6
|
+
attr_reader :solr_config, :records, :deletables
|
7
|
+
attr_writer :loader_klass, :solr_klass
|
8
|
+
def perform(records = [], deletables = [], solr_config = {})
|
9
|
+
@solr_config = solr_config.symbolize_keys
|
10
|
+
@records = records
|
11
|
+
@deletables = deletables
|
12
|
+
load!
|
13
|
+
end
|
14
|
+
|
15
|
+
def loader_klass
|
16
|
+
@loader_klass ||= Loader
|
17
|
+
end
|
18
|
+
|
19
|
+
def solr_klass
|
20
|
+
@solr_klass ||= DefaultSolr
|
21
|
+
end
|
22
|
+
|
23
|
+
def load!
|
24
|
+
loader_klass.new(records: records,
|
25
|
+
deletable_ids: deletables,
|
26
|
+
solr_client: solr_client).load!
|
27
|
+
end
|
28
|
+
|
29
|
+
private
|
30
|
+
|
31
|
+
def solr_client
|
32
|
+
@solr_client ||= solr_klass.new(solr_config)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
data/lib/cdmbl/oai_request.rb
CHANGED
@@ -4,19 +4,16 @@ module CDMBL
|
|
4
4
|
attr_reader :base_uri,
|
5
5
|
:resumption_token,
|
6
6
|
:client,
|
7
|
-
:from,
|
8
7
|
:set,
|
9
8
|
:identifier
|
10
9
|
def initialize(base_uri: '',
|
11
|
-
resumption_token:
|
12
|
-
|
13
|
-
set: false,
|
10
|
+
resumption_token: nil,
|
11
|
+
set: nil,
|
14
12
|
identifier: '',
|
15
13
|
client: Net::HTTP)
|
16
14
|
@base_uri = base_uri
|
17
15
|
@resumption_token = resumption_token
|
18
16
|
@client = client
|
19
|
-
@from = (from) ? "&from=#{from}" : ''
|
20
17
|
@set = (set) ? "&set=#{set}" : ''
|
21
18
|
@identifier = identifier
|
22
19
|
end
|
@@ -32,7 +29,7 @@ module CDMBL
|
|
32
29
|
private
|
33
30
|
|
34
31
|
def first_batch_uri
|
35
|
-
"#{base_uri}?verb=ListIdentifiers&metadataPrefix=oai_dc#{
|
32
|
+
"#{base_uri}?verb=ListIdentifiers&metadataPrefix=oai_dc#{set}"
|
36
33
|
end
|
37
34
|
|
38
35
|
def batch_uri
|
data/lib/cdmbl/tasks/etl.rake
CHANGED
@@ -2,17 +2,37 @@ require 'cdmbl'
|
|
2
2
|
|
3
3
|
namespace :cdmbl do
|
4
4
|
desc 'Launch a background job to index metadata from CONTENTdm to Solr.'
|
5
|
-
task :
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
5
|
+
task :batch, [
|
6
|
+
:solr_url,
|
7
|
+
:oai_endpoint,
|
8
|
+
:cdm_endpoint,
|
9
|
+
:set_spec,
|
10
|
+
:batch_size,
|
11
|
+
:max_compounds
|
12
|
+
] do |t, args|
|
13
|
+
CDMBL::ETLWorker.perform_async(
|
14
|
+
solr_config: { url: args.fetch(:solr_url) },
|
15
|
+
oai_endpoint: args.fetch(:oai_endpoint),
|
16
|
+
cdm_endpoint: args.fetch(:cdm_endpoint),
|
17
|
+
set_spec: args[:set_spec] != '""' ? args[:set_spec] : nil,
|
18
|
+
batch_size: args.fetch(:batch_size, 10),
|
19
|
+
max_compounds: args.fetch(:max_compounds, 10)
|
20
|
+
)
|
16
21
|
end
|
17
|
-
end
|
18
22
|
|
23
|
+
desc 'Launch a background job to index a single record.'
|
24
|
+
task :record, [
|
25
|
+
:collection,
|
26
|
+
:id,
|
27
|
+
:solr_url,
|
28
|
+
:cdm_endpoint,
|
29
|
+
:oai_endpoint
|
30
|
+
] do |t, args|
|
31
|
+
CDMBL::TransformWorker.perform_async(
|
32
|
+
[[args.fetch(:collection), args.fetch(:id)]],
|
33
|
+
{ url: args.fetch(:solr_url) },
|
34
|
+
args.fetch(:cdm_endpoint),
|
35
|
+
args.fetch(:oai_endpoint)
|
36
|
+
)
|
37
|
+
end
|
38
|
+
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
require 'sidekiq'
|
2
|
+
module CDMBL
|
3
|
+
class TransformWorker
|
4
|
+
include Sidekiq::Worker
|
5
|
+
attr_reader :identifiers,
|
6
|
+
:solr_config,
|
7
|
+
:cdm_endpoint,
|
8
|
+
:oai_endpoint,
|
9
|
+
:field_mappings
|
10
|
+
|
11
|
+
attr_writer :cdm_api_klass,
|
12
|
+
:oai_request_klass,
|
13
|
+
:oai_set_lookup_klass,
|
14
|
+
:cdm_notification_klass,
|
15
|
+
:load_worker_klass,
|
16
|
+
:transformer_klass
|
17
|
+
|
18
|
+
def perform(identifiers,
|
19
|
+
solr_config,
|
20
|
+
cdm_endpoint,
|
21
|
+
oai_endpoint,
|
22
|
+
field_mappings)
|
23
|
+
|
24
|
+
@identifiers = identifiers
|
25
|
+
@solr_config = solr_config
|
26
|
+
@cdm_endpoint = cdm_endpoint
|
27
|
+
@oai_endpoint = oai_endpoint
|
28
|
+
@field_mappings = field_mappings
|
29
|
+
|
30
|
+
transform_and_load!
|
31
|
+
end
|
32
|
+
|
33
|
+
def oai_set_lookup_klass
|
34
|
+
@oai_set_lookup_klass ||= OAISetLookup
|
35
|
+
end
|
36
|
+
|
37
|
+
def oai_request_klass
|
38
|
+
@oai_request_klass ||= OaiRequest
|
39
|
+
end
|
40
|
+
|
41
|
+
def cdm_api_klass
|
42
|
+
@cdm_api_klass ||= CONTENTdmAPI::Item
|
43
|
+
end
|
44
|
+
|
45
|
+
def cdm_notification_klass
|
46
|
+
@cdm_notification_klass ||= CdmNotification
|
47
|
+
end
|
48
|
+
|
49
|
+
def transformer_klass
|
50
|
+
@transformer_klass ||= Transformer
|
51
|
+
end
|
52
|
+
|
53
|
+
def load_worker_klass
|
54
|
+
@load_worker_klass ||= LoadWorker
|
55
|
+
end
|
56
|
+
|
57
|
+
private
|
58
|
+
|
59
|
+
def transform_and_load!
|
60
|
+
load_worker_klass.perform_async(transformed_records, [], solr_config)
|
61
|
+
end
|
62
|
+
|
63
|
+
def transformed_records
|
64
|
+
@transformation ||=
|
65
|
+
transformer_klass.new(cdm_records: records,
|
66
|
+
oai_sets: set_lookup,
|
67
|
+
field_mappings: field_mappings).records
|
68
|
+
end
|
69
|
+
|
70
|
+
def set_lookup
|
71
|
+
oai_set_lookup_klass.new(oai_sets: sets).keyed
|
72
|
+
end
|
73
|
+
|
74
|
+
def records
|
75
|
+
identifiers.map do |identifier|
|
76
|
+
cdm_request(*identifier)
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
# e.g. local_identifiers.map { |identifier| extractor.cdm_request(*identifier) }
|
81
|
+
def cdm_request(collection, id)
|
82
|
+
cdm_notification_klass.call!(collection, id, cdm_endpoint)
|
83
|
+
cdm_api_klass.new(base_url: cdm_endpoint,
|
84
|
+
collection: collection,
|
85
|
+
id: id).metadata
|
86
|
+
end
|
87
|
+
|
88
|
+
def sets
|
89
|
+
@oai_request ||=
|
90
|
+
oai_request_klass.new(base_uri: oai_endpoint).sets
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
data/lib/cdmbl/version.rb
CHANGED
data/lib/cdmbl.rb
CHANGED
@@ -21,4 +21,8 @@ require 'cdmbl/oai_client'
|
|
21
21
|
require 'cdmbl/oai_get_record'
|
22
22
|
require 'cdmbl/oai_deletables'
|
23
23
|
require 'cdmbl/batch_deleter'
|
24
|
-
require 'cdmbl/batch_deleter_worker'
|
24
|
+
require 'cdmbl/batch_deleter_worker'
|
25
|
+
require 'cdmbl/compound_lookup'
|
26
|
+
require 'cdmbl/compound_filter'
|
27
|
+
require 'cdmbl/load_worker'
|
28
|
+
require 'cdmbl/transform_worker'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cdmbl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- chadfennell
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-08-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: hash_at_path
|
@@ -198,6 +198,7 @@ extensions: []
|
|
198
198
|
extra_rdoc_files: []
|
199
199
|
files:
|
200
200
|
- ".gitignore"
|
201
|
+
- ".rubocop.yml"
|
201
202
|
- ".travis.yml"
|
202
203
|
- CODE_OF_CONDUCT.md
|
203
204
|
- Gemfile
|
@@ -210,6 +211,8 @@ files:
|
|
210
211
|
- lib/cdmbl.rb
|
211
212
|
- lib/cdmbl/batch_deleter.rb
|
212
213
|
- lib/cdmbl/batch_deleter_worker.rb
|
214
|
+
- lib/cdmbl/compound_filter.rb
|
215
|
+
- lib/cdmbl/compound_lookup.rb
|
213
216
|
- lib/cdmbl/default_cdm_notification.rb
|
214
217
|
- lib/cdmbl/default_completed_callback.rb
|
215
218
|
- lib/cdmbl/default_loader_notification.rb
|
@@ -217,11 +220,13 @@ files:
|
|
217
220
|
- lib/cdmbl/default_solr.rb
|
218
221
|
- lib/cdmbl/etl_run.rb
|
219
222
|
- lib/cdmbl/etl_worker.rb
|
223
|
+
- lib/cdmbl/extract_worker.rb
|
220
224
|
- lib/cdmbl/extractor.rb
|
221
225
|
- lib/cdmbl/field_formatter.rb
|
222
226
|
- lib/cdmbl/field_transformer.rb
|
223
227
|
- lib/cdmbl/formatters.rb
|
224
228
|
- lib/cdmbl/hooks.rb
|
229
|
+
- lib/cdmbl/load_worker.rb
|
225
230
|
- lib/cdmbl/loader.rb
|
226
231
|
- lib/cdmbl/oai_client.rb
|
227
232
|
- lib/cdmbl/oai_deletables.rb
|
@@ -233,6 +238,8 @@ files:
|
|
233
238
|
- lib/cdmbl/record_transformer.rb
|
234
239
|
- lib/cdmbl/tasks/delete.rake
|
235
240
|
- lib/cdmbl/tasks/etl.rake
|
241
|
+
- lib/cdmbl/tasks/extract.rake
|
242
|
+
- lib/cdmbl/transform_worker.rb
|
236
243
|
- lib/cdmbl/transformer.rb
|
237
244
|
- lib/cdmbl/version.rb
|
238
245
|
- travis.yml
|