cdmdexer 0.17.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.rubocop.yml +4 -0
- data/.travis.yml +8 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +149 -0
- data/Rakefile +11 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/cdmdexer.gemspec +35 -0
- data/lib/cdmdexer/cdm_item.rb +89 -0
- data/lib/cdmdexer/default_cdm_notification.rb +8 -0
- data/lib/cdmdexer/default_completed_callback.rb +8 -0
- data/lib/cdmdexer/default_loader_notification.rb +8 -0
- data/lib/cdmdexer/default_oai_notification.rb +8 -0
- data/lib/cdmdexer/default_solr.rb +35 -0
- data/lib/cdmdexer/etl_by_set_specs.rb +18 -0
- data/lib/cdmdexer/etl_worker.rb +113 -0
- data/lib/cdmdexer/field_formatter.rb +13 -0
- data/lib/cdmdexer/field_mapping.rb +28 -0
- data/lib/cdmdexer/field_transformer.rb +41 -0
- data/lib/cdmdexer/filtered_set_specs.rb +41 -0
- data/lib/cdmdexer/formatters.rb +169 -0
- data/lib/cdmdexer/hooks.rb +31 -0
- data/lib/cdmdexer/load_worker.rb +36 -0
- data/lib/cdmdexer/loader.rb +19 -0
- data/lib/cdmdexer/oai_client.rb +26 -0
- data/lib/cdmdexer/oai_request.rb +100 -0
- data/lib/cdmdexer/rake_task.rb +6 -0
- data/lib/cdmdexer/record_transformer.rb +25 -0
- data/lib/cdmdexer/regex_filter_callback.rb +19 -0
- data/lib/cdmdexer/tasks/delete.rake +12 -0
- data/lib/cdmdexer/tasks/etl.rake +96 -0
- data/lib/cdmdexer/transform_worker.rb +93 -0
- data/lib/cdmdexer/transformer.rb +171 -0
- data/lib/cdmdexer/version.rb +3 -0
- data/lib/cdmdexer.rb +26 -0
- data/travis.yml +6 -0
- metadata +223 -0
@@ -0,0 +1,28 @@
|
|
1
|
+
module CDMDEXER
  # Value object wrapping the configuration of a single field mapping.
  #
  # The configuration is stored with symbol keys no matter how the caller
  # spelled them.
  class FieldMapping
    attr_reader :config

    # config - key/value pairs; every key must respond to #to_sym.
    def initialize(config: {})
      @config = symbolize(config)
    end

    # Returns the :origin_path entry; raises KeyError when it is missing.
    def origin_path
      config.fetch(:origin_path)
    end

    # Returns the :dest_path entry; raises KeyError when it is missing.
    def dest_path
      config.fetch(:dest_path)
    end

    # Returns the configured formatters (DefaultFormatter when none are
    # configured). String entries are resolved to constants; anything else
    # is passed through untouched.
    def formatters
      configured = config.fetch(:formatters, [DefaultFormatter])
      configured.map do |entry|
        next entry unless entry.is_a?(String)

        Object.const_get(entry)
      end
    end

    private

    # Copies the pairs into a new Hash keyed by symbols.
    def symbolize(config)
      config.each_with_object({}) do |(key, val), symbolized|
        symbolized[key.to_sym] = val
      end
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'forwardable'
require 'hash_at_path'
|
2
|
+
|
3
|
+
module CDMDEXER
  # Extracts one field from a record and runs it through the formatters
  # declared by a field mapping.
  class FieldTransformer
    extend Forwardable
    def_delegators :@field_mapping, :origin_path, :dest_path, :formatters
    attr_reader :field_value, :field_mapping, :formatter_klass

    # field_mapping   - object responding to origin_path, dest_path,
    #                   formatters and config.
    # record          - object responding to at_path(path).
    # formatter_klass - class built with (value:, formatters:) that responds
    #                   to format!.
    def initialize(field_mapping: FieldMapping.new,
                   record: {},
                   formatter_klass: FieldFormatter)
      # The mapping must be assigned first: origin_path delegates to it.
      @field_mapping = field_mapping
      @field_value = compact(record.at_path(origin_path))
      @formatter_klass = formatter_klass
    end

    # Returns { dest_path => value }, or an empty Hash when the transformed
    # value is blank.
    def reduce
      result = value
      blank?(result) ? {} : { dest_path.to_s => result }
    end

    # Returns the formatted field value, or nil when the source field is
    # blank. The result is computed once, even when it is nil or false, so
    # the formatter chain is never run twice for the same field.
    def value
      return @value if defined?(@value)

      @value = blank?(field_value) ? nil : transform_field
    end

    private

    # Drops nil entries when the extracted value supports #compact.
    def compact(record)
      record.respond_to?(:compact) ? record.compact : record
    end

    # File activesupport/lib/active_support/core_ext/object/blank.rb, line 14
    def blank?(val)
      val.respond_to?(:empty?) ? !!val.empty? : !val
    end

    # Runs the formatter chain; any StandardError is re-raised as a
    # RuntimeError that names the offending mapping configuration.
    def transform_field
      formatter_klass.new(value: field_value, formatters: formatters).format!
    rescue StandardError => e
      raise "Mapping Error:#{field_mapping.config} Error:#{e.message}"
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
module CDMDEXER
  # Default filter behaviour: every set is considered valid.
  module DefaultFilterSetCallback
    def valid?(set: {})
      true
    end
  end

  # Instantiable accept-everything callback. FilteredSetSpecs uses this name
  # as its default, so it has to exist and respond to .new.
  class DefaultSetFilterCallback
    include DefaultFilterSetCallback
  end

  # Lists the setSpecs of an OAI endpoint, keeping only the sets accepted by
  # a callback.
  class FilteredSetSpecs
    attr_reader :oai_base_url,
                :oai_client,
                :callback

    # oai_base_url - base URL handed to the OAI client.
    # oai_client   - class built with (base_url:) whose instances respond to
    #                request(query) and return a Hash.
    # callback     - object responding to valid?(set:).
    def initialize(oai_base_url: :missing_oai_base_url,
                   oai_client: OaiClient,
                   callback: CDMDEXER::DefaultSetFilterCallback.new)
      @oai_base_url = oai_base_url
      @oai_client = oai_client
      @callback = callback
    end

    # Returns the 'setSpec' value of every set accepted by the callback.
    def set_specs
      filtered_sets.map { |set| set['setSpec'] }
    end

    # Returns the sets accepted by the callback (memoized).
    def filtered_sets
      @filtered_sets ||= sets.select do |set|
        callback.valid?(set: set)
      end
    end

    private

    # Always an Array: a response carrying a single set (a bare Hash) is
    # wrapped, and a response without any set yields an empty list.
    def sets
      @sets ||= [list_sets.dig('OAI_PMH', 'ListSets', 'set')].flatten.compact
    end

    def list_sets
      @list_sets ||= oai_client.new(base_url: oai_base_url).request('verb=ListSets')
    end
  end
end
|
@@ -0,0 +1,169 @@
|
|
1
|
+
require 'titleize'
|
2
|
+
require 'json'
|
3
|
+
require 'net/http'
|
4
|
+
# A handful of very simple formatters to clean up CONTENTdm API metadata
|
5
|
+
module CDMDEXER
|
6
|
+
|
7
|
+
|
8
|
+
# Keeps the last '/'-separated segment of a value.
class GeoNameID
  class << self
    def format(value)
      segments = value.split('/')
      segments.last
    end
  end
end
|
13
|
+
|
14
|
+
# Fetches the GeoNames record for an id and returns the parsed JSON body.
class GeoNameIDToJson
  # Built once at load time from GEONAMES_USER / GEONAMES_TOKEN.
  URL = "http://ws.geonames.net/getJSON?username=#{ENV['GEONAMES_USER']}&token=#{ENV['GEONAMES_TOKEN']}"

  # value - the GeoNames id; it is URL-encoded before being appended to the
  #         query string so unexpected characters cannot break the request
  #         URI.
  #
  # Returns whatever JSON.parse yields for the response body.
  def self.format(value)
    geoname_id = URI.encode_www_form_component(value.to_s)
    uri = URI.parse("#{URL}&geonameId=#{geoname_id}")
    JSON.parse(Net::HTTP.get_response(uri).body)
  end
end
|
20
|
+
|
21
|
+
# Builds a "lat,lng" string from a GeoNames-style record.
class GeoNameToLocation
  # Returns nil when value has no #fetch or when either coordinate is
  # missing.
  def self.format(value)
    return unless value.respond_to?(:fetch)

    latitude = value['lat']
    longitude = value['lng']
    return unless latitude && longitude

    "#{latitude},#{longitude}"
  end
end
|
28
|
+
|
29
|
+
# Collects the place names of a GeoNames-style record.
class GeoNameToPlaceName
  # Returns the unique, non-nil values of 'name', 'adminName1' and
  # 'adminName2', leaving out 'Minnesota'. Returns nil when value has no
  # #fetch.
  def self.format(value)
    return unless value.respond_to?(:fetch)

    names = %w[name adminName1 adminName2].map { |key| value[key] }
    names.reject { |place| place == 'Minnesota' }.compact.uniq
  end
end
|
39
|
+
|
40
|
+
# Pass-through formatter: hands the value back unchanged.
class DefaultFormatter
  class << self
    def format(value)
      value
    end
  end
end
|
45
|
+
|
46
|
+
# Merges the 'genera', 'specif' and 'subjec' fields into one keyword list.
class KeywordFormatter
  # Each field is split on ';' when it supports #split and skipped
  # otherwise. Returns the unique keywords in sorted order.
  def self.format(value)
    keywords = %w[genera specif subjec].flat_map do |field|
      raw = value[field]
      raw.respond_to?(:split) ? raw.split(';') : []
    end
    keywords.uniq.sort
  end
end
|
55
|
+
|
56
|
+
# Removes duplicates from values that support #uniq.
class UniqueFormatter
  # Values without #uniq are titleized instead.
  # NOTE(review): titleizing in a "unique" formatter looks unintended -
  # confirm before relying on it.
  def self.format(value)
    return value.uniq if value.respond_to?(:uniq)

    value.titleize
  end
end
|
65
|
+
|
66
|
+
# Titleizes a value, or every element of a collection.
class Titlieze
  def self.format(value)
    return value.titleize unless value.respond_to?(:map)

    value.map(&:titleize)
  end
end
|
75
|
+
|
76
|
+
# Pass-through formatter: hands the value back unchanged.
class ImageId
  class << self
    def format(value)
      value
    end
  end
end
|
81
|
+
|
82
|
+
# Serializes a value with #to_json.
class ToJsonFormatter
  # Returns nil when values does not respond to #to_json.
  def self.format(values)
    return unless values.respond_to?(:to_json)

    values.to_json
  end
end
|
87
|
+
|
88
|
+
# Deletes every ';' from a value, or from each element of a collection.
class StripSemicolonFormatter
  def self.format(values)
    without_semicolons = ->(text) { text.gsub(/;/, '') }
    return without_semicolons.call(values) unless values.respond_to?(:map)

    values.map(&without_semicolons)
  end
end
|
97
|
+
|
98
|
+
# Strips surrounding whitespace from a value, or from each element of a
# collection.
class StripFormatter
  # Returns '' for nil.
  def self.format(values)
    if values.nil?
      ''
    elsif values.respond_to?(:map)
      values.map(&:strip)
    else
      values.strip
    end
  end
end
|
108
|
+
|
109
|
+
# Splits a value on ';' when it supports #split.
class SplitFormatter
  def self.format(value)
    return value unless value.respond_to?(:split)

    value.split(';')
  end
end
|
114
|
+
|
115
|
+
# Joins a collection with '; ' when it supports #join.
class JoinFormatter
  def self.format(value)
    return value unless value.respond_to?(:join)

    value.join('; ')
  end
end
|
120
|
+
|
121
|
+
# Adds a 'setSpec' entry taken from the part of 'id' before the first '/'.
class AddSetSpecFormatter
  # Returns a merged copy; the incoming value is not modified.
  def self.format(value)
    set_spec = value['id'].split('/').first
    value.merge('setSpec' => set_spec)
  end
end
|
126
|
+
|
127
|
+
# Reads the 'setSpec' entry of a record.
class SetSpecFormatter
  class << self
    def format(value)
      value['setSpec']
    end
  end
end
|
132
|
+
|
133
|
+
# Looks up the name of the record's set in its 'oai_sets' table.
class CollectionNameFormatter
  # Returns '' when the set, or its :name entry, is missing.
  def self.format(value)
    set_details = value['oai_sets'].fetch(value['setSpec'], {})
    set_details.fetch(:name, '')
  end
end
|
139
|
+
|
140
|
+
# Looks up the description of the record's set in its 'oai_sets' table.
class CollectionDescriptionFormatter
  # Returns '' when the set, or its :description entry, is missing.
  def self.format(value)
    set_details = value['oai_sets'].fetch(value['setSpec'], {})
    set_details.fetch(:description, '')
  end
end
|
146
|
+
|
147
|
+
# Blanks out values that contain the "Collection information undefined"
# placeholder (matched case-insensitively).
class FilterBadCollections
  def self.format(value)
    if /Collection information undefined/i =~ value
      ''
    else
      value
    end
  end
end
|
152
|
+
|
153
|
+
# Converts a value with #to_i.
class ToIFormatter
  # Returns nil when value does not respond to #to_i.
  def self.format(value)
    return unless value.respond_to?(:to_i)

    value.to_i
  end
end
|
158
|
+
|
159
|
+
# Builds a "latitude, longitude" string from the 'latitu' and 'longit'
# fields of a record.
class LocationFormatter
  # Returns nil unless both coordinates are present. nil, false, '' and {}
  # all count as missing for BOTH fields, so a placeholder longitude can no
  # longer leak into the result as "lat, {}".
  def self.format(record)
    latitude = record['latitu']
    longitude = record['longit']
    return nil if [latitude, longitude].any? { |coord| missing?(coord) }

    "#{latitude}, #{longitude}"
  end

  # True for the values treated as "no coordinate".
  def self.missing?(coord)
    !coord || coord == '' || coord == {}
  end
  private_class_method :missing?
end
|
168
|
+
|
169
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module CDMDEXER
  # Resolves the overridable hook constants (Solr, CompletedCallback,
  # OaiNotification, LoaderNotification, CdmNotification) when they are
  # referenced but not defined. Any other missing constant is handed to the
  # default implementation and raises NameError; previously it silently
  # evaluated to nil and surfaced later as a NoMethodError on nil.
  def self.const_missing(name)
    case name.to_s
    when 'Solr'
      hook(pattern: 'Solr', default: DefaultSolr)
    when 'CompletedCallback'
      hook(pattern: 'CompletedCallback', default: DefaultCompletedCallback)
    when 'OaiNotification'
      hook(pattern: 'OaiNotification', default: DefaultOaiNotification)
    when 'LoaderNotification'
      hook(pattern: 'LoaderNotification', default: DefaultLoaderNotification)
    when 'CdmNotification'
      hook(pattern: 'CdmNotification', default: DefaultCdmNotification)
    else
      super
    end
  end

  # Returns the CDMDEXER constant whose name matches pattern and whose value
  # is not the default; falls back to default when there is none. The
  # constant table is scanned once per call.
  def self.hook(pattern: '', default: false)
    override = find_hook(pattern, default)
    override ? Object.const_get("CDMDEXER::#{override}") : default
  end

  # Returns the name of the first CDMDEXER constant that matches pattern and
  # does not hold default, or nil.
  def self.find_hook(pattern, default)
    matcher = /#{pattern}/
    CDMDEXER.constants.find do |konst|
      next false if Object.const_get("CDMDEXER::#{konst}") == default

      matcher =~ konst.to_s
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'sidekiq'
|
2
|
+
module CDMDEXER
  # Sidekiq worker that hands records and deletable ids to a loader backed
  # by a solr client.
  class LoadWorker
    include Sidekiq::Worker
    sidekiq_options queue: 'critical'

    attr_reader :solr_config, :records, :deletables
    attr_writer :loader_klass, :solr_klass

    # records     - records passed to the loader.
    # deletables  - ids passed to the loader as deletable_ids.
    # solr_config - Hash used to build the solr client; keys are symbolized.
    def perform(records = [], deletables = [], solr_config = {})
      @solr_config = solr_config.symbolize_keys
      @records = records
      @deletables = deletables
      load!
    end

    # Loader class in use; Loader unless one was assigned.
    def loader_klass
      @loader_klass ||= Loader
    end

    # Solr client class in use; DefaultSolr unless one was assigned.
    def solr_klass
      @solr_klass ||= DefaultSolr
    end

    # Builds a loader for the current records/deletables and runs it.
    def load!
      loader = loader_klass.new(records: records,
                                deletable_ids: deletables,
                                solr_client: solr_client)
      loader.load!
    end

    private

    # Memoized solr client built from the symbolized configuration.
    def solr_client
      @solr_client ||= solr_klass.new(solr_config)
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module CDMDEXER
  # Sends deletions and additions to a solr client.
  class Loader
    attr_reader :solr_client, :records, :deletable_ids

    # records       - records passed to solr_client.add.
    # deletable_ids - ids passed to solr_client.delete.
    # solr_client   - object responding to delete and add.
    def initialize(records: [],
                   deletable_ids: [],
                   solr_client: CDMDEXER::DefaultSolr)
      @records = records
      @deletable_ids = deletable_ids
      @solr_client = solr_client
    end

    # Deletes first (skipped when there is nothing to delete), then adds
    # the records. Returns the result of the add call.
    def load!
      delete_stale_records
      solr_client.add(records)
    end

    private

    def delete_stale_records
      return if deletable_ids.empty?

      solr_client.delete(deletable_ids)
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
|
2
|
+
require 'json'
|
3
|
+
require 'http'
|
4
|
+
module CDMDEXER
  # Issues a GET against an OAI base URL and converts the XML response into
  # a Hash.
  class OaiClient
    attr_reader :base_url, :client

    # base_url - URL that queries are appended to.
    # client   - object responding to get(url).
    def initialize(base_url: '', client: HTTP)
      @client = client
      @base_url = base_url
    end

    # query - query string appended to base_url after a '?'.
    #
    # Returns the result of Hash.from_xml for the response body.
    def request(query)
      response_xml = get("#{base_url}?#{query}")
      hashify(response_xml)
    end

    private

    def get(url)
      response = client.get(url)
      response.to_s
    end

    def hashify(xml)
      Hash.from_xml(xml)
    end
  end
end
|
@@ -0,0 +1,100 @@
|
|
1
|
+
require 'hash_at_path'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
module CDMDEXER
  # Light wrapper around OAI requests.
  #
  # Exposes the sets and record headers of an OAI response, a lookup of
  # sets keyed by setSpec, and filters for deleted and non-deleted records.
  class OaiRequest
    attr_reader :endpoint_url,
                :resumption_token,
                :client,
                :set_spec

    # endpoint_url     - OAI endpoint that the query string is appended to.
    # resumption_token - token of the batch to fetch; nil for the first one.
    # set_spec         - optional set; stored as the "&set=..." query
    #                    fragment ('' when absent).
    # client           - object responding to get_response(uri) whose
    #                    result responds to body.
    def initialize(endpoint_url: '',
                   resumption_token: nil,
                   set_spec: nil,
                   client: Net::HTTP)
      @endpoint_url = endpoint_url
      @resumption_token = resumption_token
      @client = client
      @set_spec = set_spec ? "&set=#{set_spec}" : ''
    end

    # Returns every header with an added :id built from the last
    # ':'-separated part of 'identifier', with its '/' replaced by ':'.
    # Empty when the response carries no headers.
    def records
      headers.map do |header|
        header.merge(
          id: header['identifier'].split(':').last.split('/').join(':')
        )
      end
    end

    # Returns the sets of the endpoint as an Array (memoized); a single set
    # is still wrapped in an Array.
    def sets
      @sets ||= force_array request(sets_endpoint_url).at_path('OAI_PMH/ListSets/set')
    end

    # Returns { setSpec => { name:, description: } } for every set.
    def set_lookup
      sets.each_with_object({}) { |set, lookup| lookup.merge!(to_key(set)) }
    end

    def next_resumption_token
      identifier_request.at_path('OAI_PMH/ListIdentifiers/resumptionToken')
    end

    # Ids of the records whose 'status' is 'deleted'.
    def deletable_ids
      records.select { |record| record['status'] == 'deleted' }
             .map { |record| record[:id] }
    end

    # Records whose 'status' is not 'deleted'.
    def updatables
      records.reject { |record| record['status'] == 'deleted' }
    end

    private

    def headers
      force_array identifier_request.at_path('OAI_PMH/ListIdentifiers/header')
    end

    # Ensure results are a single level array (single row sets, records,
    # etc). A missing result (nil) becomes an empty Array instead of [nil],
    # so callers can iterate a response without matches safely.
    def force_array(result)
      [result].flatten.compact
    end

    def to_key(set)
      {
        set['setSpec'] =>
          {
            name: set['setName'],
            description: set.at_path('setDescription/dc/description')
          }
      }
    end

    # Memoized ListIdentifiers response for the current batch.
    def identifier_request
      @identifier_request ||=
        resumption_token ? request(batch_endpoint_url) : request(first_batch_endpoint_url)
    end

    def first_batch_endpoint_url
      "#{endpoint_url}?verb=ListIdentifiers&metadataPrefix=oai_dc#{set_spec}"
    end

    def batch_endpoint_url
      "#{endpoint_url}?verb=ListIdentifiers&resumptionToken=#{resumption_token}"
    end

    def sets_endpoint_url
      "#{endpoint_url}?verb=ListSets"
    end

    # Notifies CDMDEXER::OaiNotification of the location, then fetches it
    # and converts the XML body to a Hash.
    def request(location)
      CDMDEXER::OaiNotification.call!(location)
      Hash.from_xml(client.get_response(URI(location)).body)
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module CDMDEXER
  # Applies a list of field mappings to one record and merges the results
  # into a single destination Hash.
  class RecordTransformer
    attr_reader :record, :field_mappings, :field_transformer

    # record            - source record handed to every field transformer.
    # field_mappings    - mappings applied in order.
    # field_transformer - class built with (field_mapping:, record:) whose
    #                     instances respond to reduce.
    def initialize(record: {},
                   field_mappings: [],
                   field_transformer: FieldTransformer)
      @record = record
      @field_mappings = field_mappings
      @field_transformer = field_transformer
    end

    # Returns the merged output of all mappings; later mappings win when
    # they produce the same key.
    def transform!
      field_mappings.each_with_object({}) do |field_mapping, dest_record|
        dest_record.merge!(transform_field(record, field_mapping))
      end
    end

    private

    def transform_field(record, field_mapping)
      transformer = field_transformer.new(field_mapping: field_mapping,
                                          record: record)
      transformer.reduce
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module CDMDEXER
  # Filters OAI ListSets entries by matching one of their fields against a
  # regular expression.
  class RegexFilterCallback
    attr_reader :field, :pattern, :inclusive

    # field     - key of the set entry to test.
    # pattern   - Regexp applied to that field.
    # inclusive - truthy keeps matching sets, falsy keeps non-matching ones.
    def initialize(field: 'setName', pattern: /.*/, inclusive: true)
      @field = field
      @pattern = pattern
      @inclusive = inclusive
    end

    # True when the set should be kept.
    def valid?(set: {})
      matched = matches?(set)
      if inclusive
        matched
      else
        !matched
      end
    end

    # True when the configured field of the set matches the pattern.
    def matches?(set)
      candidate = set[field]
      pattern.match?(candidate)
    end
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'cdmdexer'
|
2
|
+
|
3
|
+
namespace :cdmdexer do
  desc "delete all records that aren't in a given OAI endpoint"
  # Enqueues a BatchDeleterWorker job with the start offset (as an Integer),
  # the prefix and the OAI and Solr URLs taken from the task arguments.
  task :delete_batch, %i[start prefix oai_url solr_url] do |_task, args|
    start_offset = args[:start].to_i
    CDMDEXER::BatchDeleterWorker.perform_async(
      start_offset,
      args[:prefix],
      args[:oai_url],
      args[:solr_url]
    )
  end
end
|
12
|
+
|
@@ -0,0 +1,96 @@
|
|
1
|
+
require 'cdmdexer'
|
2
|
+
|
3
|
+
namespace :cdmdexer do
  desc 'Ingest a Collection Syncronously'
  # Runs an ETLWorker inline (no background queue) with a fixed
  # configuration.
  task :collection_sync do
    # config = etl.config
    # raise etl.config.keys.inspect
    CDMDEXER::ETLWorker.new.perform(
      'solr_config' => {:url=>"http://solr:8983/solr/mdl-1"},
      'oai_endpoint' => 'http://cdm16022.contentdm.oclc.org/oai/oai.php',
      'cdm_endpoint' => 'https://server16022.contentdm.oclc.org/dmwebservices/index.php',
      'set_spec' => 'mpls',
      'batch_size' => 10,
      'max_compounds' => 10
    )
  end

  desc 'Launch a background job to index metadata from CONTENTdm to Solr.'
  # solr_url, oai_endpoint and cdm_endpoint are required. A set_spec given
  # as the literal '""' is treated as "no set".
  task :batch, [
    :solr_url,
    :oai_endpoint,
    :cdm_endpoint,
    :set_spec,
    :batch_size,
    :max_compounds
  ] do |_t, args|
    CDMDEXER::ETLWorker.perform_async(
      solr_config: { url: args.fetch(:solr_url) },
      oai_endpoint: args.fetch(:oai_endpoint),
      cdm_endpoint: args.fetch(:cdm_endpoint),
      set_spec: args[:set_spec] != '""' ? args[:set_spec] : nil,
      batch_size: args.fetch(:batch_size, 10),
      max_compounds: args.fetch(:max_compounds, 10)
    )
  end

  desc 'Launch an indexing worker for each collection with an optional regex
pattern to match setSpec. Patterns can be inclusive or exclusive.'
  # :max_compounds is declared (last, so existing positional invocations
  # keep working) because the task body reads it; without the declaration
  # the value could never be supplied and was always 10.
  # NOTE(review): the description mentions setSpec, but the filter below
  # matches on 'setName' - confirm which one is intended.
  task :by_collections, [
    :solr_url,
    :oai_endpoint,
    :cdm_endpoint,
    :set_spec_pattern,
    :inclusive,
    :batch_size,
    :max_compounds
  ] do |_t, args|
    oai_endpoint = args.fetch(:oai_endpoint)
    # Optional args
    pattern = args.fetch(:set_spec_pattern, false)
    inclusive = args.fetch(:inclusive, 'true') == 'true'
    # Define your own callback if you want to use other set related fields
    # Use the RegexFilterCallback as an example of how to build your own filter
    set_specs =
      if pattern
        filter = CDMDEXER::RegexFilterCallback.new(field: 'setName',
                                                   pattern: Regexp.new(pattern),
                                                   inclusive: inclusive)
        CDMDEXER::FilteredSetSpecs.new(oai_base_url: oai_endpoint,
                                       callback: filter).set_specs
      else
        CDMDEXER::FilteredSetSpecs.new(oai_base_url: oai_endpoint).set_specs
      end

    puts "Indexing Sets: '#{set_specs.join(', ')}'"

    etl_config = {
      solr_config: { url: args.fetch(:solr_url) },
      oai_endpoint: args.fetch(:oai_endpoint),
      cdm_endpoint: args.fetch(:cdm_endpoint),
      batch_size: args.fetch(:batch_size, 5),
      max_compounds: args.fetch(:max_compounds, 10)
    }

    CDMDEXER::ETLBySetSpecs.new(set_specs: set_specs, etl_config: etl_config).run!
  end

  desc 'Launch a background job to index a single record.'
  # All five arguments are required.
  task :record, [
    :collection,
    :id,
    :solr_url,
    :cdm_endpoint,
    :oai_endpoint
  ] do |_t, args|
    CDMDEXER::TransformWorker.perform_async(
      [[args.fetch(:collection), args.fetch(:id)]],
      { url: args.fetch(:solr_url) },
      args.fetch(:cdm_endpoint),
      args.fetch(:oai_endpoint)
    )
  end
end
|