cdmdexer 0.17.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.rubocop.yml +4 -0
- data/.travis.yml +8 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +149 -0
- data/Rakefile +11 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/cdmdexer.gemspec +35 -0
- data/lib/cdmdexer/cdm_item.rb +89 -0
- data/lib/cdmdexer/default_cdm_notification.rb +8 -0
- data/lib/cdmdexer/default_completed_callback.rb +8 -0
- data/lib/cdmdexer/default_loader_notification.rb +8 -0
- data/lib/cdmdexer/default_oai_notification.rb +8 -0
- data/lib/cdmdexer/default_solr.rb +35 -0
- data/lib/cdmdexer/etl_by_set_specs.rb +18 -0
- data/lib/cdmdexer/etl_worker.rb +113 -0
- data/lib/cdmdexer/field_formatter.rb +13 -0
- data/lib/cdmdexer/field_mapping.rb +28 -0
- data/lib/cdmdexer/field_transformer.rb +41 -0
- data/lib/cdmdexer/filtered_set_specs.rb +41 -0
- data/lib/cdmdexer/formatters.rb +169 -0
- data/lib/cdmdexer/hooks.rb +31 -0
- data/lib/cdmdexer/load_worker.rb +36 -0
- data/lib/cdmdexer/loader.rb +19 -0
- data/lib/cdmdexer/oai_client.rb +26 -0
- data/lib/cdmdexer/oai_request.rb +100 -0
- data/lib/cdmdexer/rake_task.rb +6 -0
- data/lib/cdmdexer/record_transformer.rb +25 -0
- data/lib/cdmdexer/regex_filter_callback.rb +19 -0
- data/lib/cdmdexer/tasks/delete.rake +12 -0
- data/lib/cdmdexer/tasks/etl.rake +96 -0
- data/lib/cdmdexer/transform_worker.rb +93 -0
- data/lib/cdmdexer/transformer.rb +171 -0
- data/lib/cdmdexer/version.rb +3 -0
- data/lib/cdmdexer.rb +26 -0
- data/travis.yml +6 -0
- metadata +223 -0
@@ -0,0 +1,28 @@
|
|
1
|
+
module CDMDEXER
  # Value object wrapping one field-mapping configuration hash.
  #
  # The configuration is stored with symbol keys and exposes where a value is
  # read from (+origin_path+), where it is written to (+dest_path+) and the
  # formatters that should be applied to it.
  class FieldMapping
    attr_reader :config

    # @param config [Hash] mapping settings; keys may be strings or symbols
    def initialize(config: {})
      @config = symbolize(config)
    end

    # @return the configured :origin_path
    # @raise [KeyError] when :origin_path is not configured
    def origin_path
      config.fetch(:origin_path)
    end

    # @return the configured :dest_path
    # @raise [KeyError] when :dest_path is not configured
    def dest_path
      config.fetch(:dest_path)
    end

    # @return [Array] the configured formatters, with String entries resolved
    #   to constants; falls back to [DefaultFormatter] when none are configured
    def formatters
      configured = config.fetch(:formatters, [DefaultFormatter])
      configured.map { |entry| resolve(entry) }
    end

    private

    # Strings are treated as constant names; anything else is used as given.
    def resolve(entry)
      return entry unless entry.is_a?(String)

      Object.const_get(entry)
    end

    # Copies +config+ into a new hash whose keys are symbols.
    def symbolize(config)
      config.each_with_object({}) { |(key, val), out| out[key.to_sym] = val }
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'hash_at_path'
|
2
|
+
|
3
|
+
# Forwardable is extended below; require it explicitly so this file does not
# depend on some other library having loaded it first.
require 'forwardable'

module CDMDEXER
  # Extracts a single field from a source record and runs it through the
  # formatters named by its field mapping.
  class FieldTransformer
    extend Forwardable
    def_delegators :@field_mapping, :origin_path, :dest_path, :formatters
    attr_reader :field_value, :field_mapping, :formatter_klass

    # @param field_mapping [#origin_path, #dest_path, #formatters, #config]
    # @param record [#at_path] source record; the value found at the
    #   mapping's origin path becomes +field_value+
    # @param formatter_klass [Class] instantiated with +value:+ and
    #   +formatters:+ and sent +format!+
    def initialize(field_mapping: FieldMapping.new,
                   record: {},
                   formatter_klass: FieldFormatter)
      @field_mapping = field_mapping
      @field_value = compact(record.at_path(origin_path))
      @formatter_klass = formatter_klass
    end

    # @return [Hash] +{ dest_path => value }+, or an empty hash when the
    #   transformed value is blank
    def reduce
      blank?(value) ? {} : { dest_path.to_s => value }
    end

    # @return the formatted field value, or nil when the source value is blank
    # @raise [RuntimeError] when a formatter fails (see #transform_field)
    def value
      @value ||= (transform_field unless blank?(field_value))
    end

    private

    # Drops nil entries from values that support +compact+; other values pass
    # through unchanged.
    def compact(record)
      record.respond_to?(:compact) ? record.compact : record
    end

    # Adapted from ActiveSupport's Object#blank?
    # (activesupport/lib/active_support/core_ext/object/blank.rb, line 14).
    def blank?(val)
      val.respond_to?(:empty?) ? !!val.empty? : !val
    end

    # Runs the formatter chain, re-raising any failure with the mapping
    # configuration in the message so the offending field can be identified.
    def transform_field
      formatter_klass.new(value: field_value, formatters: formatters).format!
    rescue StandardError => e
      raise "Mapping Error:#{field_mapping.config} Error:#{e.message}"
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
module CDMDEXER
  # Pass-through set filter: every set is considered valid.
  #
  # The module extends itself so it can be handed to FilteredSetSpecs directly
  # as the callback object while still being includable elsewhere.
  module DefaultFilterSetCallback
    extend self

    # @param set [Hash] an OAI set (ignored)
    # @return [true]
    def valid?(set: {})
      true
    end
  end

  # Fetches the OAI ListSets response and exposes the setSpec of every set
  # accepted by the configured callback.
  class FilteredSetSpecs
    attr_reader :oai_base_url,
                :oai_client,
                :callback

    # @param oai_base_url [String] base URL handed to the OAI client
    # @param oai_client [Class] instantiated with +base_url:+ and sent
    #   +request('verb=ListSets')+
    # @param callback [#valid?] receives +set:+ and decides whether the set is
    #   kept. Defaults to DefaultFilterSetCallback (previously an undefined
    #   constant was referenced here, so omitting the callback always failed).
    def initialize(oai_base_url: :missing_oai_base_url,
                   oai_client: OaiClient,
                   callback: DefaultFilterSetCallback)
      @oai_base_url = oai_base_url
      @oai_client = oai_client
      @callback = callback
    end

    # @return [Array] the 'setSpec' value of each accepted set
    def set_specs
      filtered_sets.map { |set| set['setSpec'] }
    end

    # @return [Array<Hash>] sets for which the callback returned a truthy value
    def filtered_sets
      @filtered_sets ||= sets.select { |set| callback.valid?(set: set) }
    end

    private

    # A response containing exactly one set arrives as a bare Hash and an
    # absent one as nil; normalise both to a flat Array of sets.
    def sets
      @sets ||= [list_sets['OAI_PMH']['ListSets']['set']].flatten.compact
    end

    def list_sets
      @list_sets ||= oai_client.new(base_url: oai_base_url).request('verb=ListSets')
    end
  end
end
|
@@ -0,0 +1,169 @@
|
|
1
|
+
require 'titleize'
|
2
|
+
require 'json'
|
3
|
+
require 'net/http'
|
4
|
+
# A handful of very simple formatters to clean up CONTENTdm API metadata
|
5
|
+
module CDMDEXER
|
6
|
+
|
7
|
+
|
8
|
+
# Reduces a slash-separated value to its final segment.
class GeoNameID
  # @param value [#split]
  # @return the last '/'-separated segment, or nil when there is none
  def self.format(value)
    segments = value.split('/')
    segments[-1]
  end
end
|
13
|
+
|
14
|
+
# Looks up a GeoNames id over HTTP and returns the parsed JSON response.
class GeoNameIDToJson
  # Built once, when the class is loaded, from the GEONAMES_USER and
  # GEONAMES_TOKEN environment variables.
  URL = "http://ws.geonames.net/getJSON?username=#{ENV['GEONAMES_USER']}&token=#{ENV['GEONAMES_TOKEN']}"

  # @param value the id appended to the request as +geonameId+
  # @return the parsed JSON body of the response
  def self.format(value)
    endpoint = URI.parse("#{URL}&geonameId=#{value}")
    response = Net::HTTP.get_response(endpoint)
    JSON.parse(response.body)
  end
end
|
20
|
+
|
21
|
+
# Builds a "lat,lng" string from a hash-like value.
class GeoNameToLocation
  # @param value hash-like object holding 'lat' and 'lng'
  # @return [String, nil] "lat,lng", or nil when +value+ does not respond to
  #   +fetch+ or either coordinate is missing
  def self.format(value)
    return unless value.respond_to?(:fetch)

    lat = value['lat']
    return unless lat

    lng = value['lng']
    return unless lng

    "#{lat},#{lng}"
  end
end
|
28
|
+
|
29
|
+
# Collects the place names found in a hash-like value.
class GeoNameToPlaceName
  # @param value hash-like object holding 'name', 'adminName1', 'adminName2'
  # @return [Array, nil] the distinct non-nil names, excluding 'Minnesota';
  #   nil when +value+ does not respond to +fetch+
  def self.format(value)
    return unless value.respond_to?(:fetch)

    names = %w[name adminName1 adminName2].map { |key| value[key] }
    names.reject { |place| place == 'Minnesota' }.compact.uniq
  end
end
|
39
|
+
|
40
|
+
# No-op formatter.
class DefaultFormatter
  class << self
    # @return the given value, untouched
    def format(value)
      value
    end
  end
end
|
45
|
+
|
46
|
+
# Merges the 'genera', 'specif' and 'subjec' fields into one keyword list.
class KeywordFormatter
  # @param value hash-like record
  # @return [Array] the distinct ';'-separated entries of every field that
  #   responds to +split+, sorted
  def self.format(value)
    %w[genera specif subjec]
      .map { |field| value[field] }
      .select { |raw| raw.respond_to?(:split) }
      .flat_map { |raw| raw.split(';') }
      .uniq
      .sort
  end
end
|
55
|
+
|
56
|
+
# De-duplicates collection values.
class UniqueFormatter
  # @param value a value responding to +uniq+, or one responding to +titleize+
  # @return +value.uniq+ when available, otherwise +value.titleize+
  def self.format(value)
    return value.uniq if value.respond_to?(:uniq)

    # NOTE(review): titleizing the scalar case mirrors Titlieze and may be a
    # copy-paste; confirm whether scalars should pass through unchanged.
    # #titleize is presumably provided by the 'titleize' library this file
    # requires.
    value.titleize
  end
end
|
65
|
+
|
66
|
+
# Applies +titleize+ to a value or to each member of a collection.
# (#titleize is presumably provided by the 'titleize' library this file
# requires.)
class Titlieze
  # @param value a value responding to +map+, or one responding to +titleize+
  # @return the titleized value(s)
  def self.format(value)
    return value.map { |entry| entry.titleize } if value.respond_to?(:map)

    value.titleize
  end
end
|
75
|
+
|
76
|
+
# Pass-through formatter for image ids.
class ImageId
  class << self
    # @return the given value, untouched
    def format(value)
      value
    end
  end
end
|
81
|
+
|
82
|
+
# Serialises values with +to_json+.
class ToJsonFormatter
  # @param values any object
  # @return [String, nil] the JSON form, or nil when +values+ does not
  #   respond to +to_json+
  def self.format(values)
    return unless values.respond_to?(:to_json)

    values.to_json
  end
end
|
87
|
+
|
88
|
+
# Removes every semicolon from a string or from each member of a collection.
class StripSemicolonFormatter
  # @param values a value responding to +map+, or one responding to +gsub+
  # @return the value(s) with all ';' characters removed
  def self.format(values)
    return values.map { |entry| entry.gsub(/;/, '') } if values.respond_to?(:map)

    values.gsub(/;/, '')
  end
end
|
97
|
+
|
98
|
+
# Strips surrounding whitespace from a string or each member of a collection.
class StripFormatter
  # @param values nil, a value responding to +map+, or one responding to +strip+
  # @return the stripped value(s); an empty string when +values+ is nil
  def self.format(values)
    return '' if values.nil?
    return values.map { |entry| entry.strip } if values.respond_to?(:map)

    values.strip
  end
end
|
108
|
+
|
109
|
+
# Splits semicolon-delimited values.
class SplitFormatter
  # @param value any object
  # @return +value.split(';')+ when +value+ responds to +split+, otherwise
  #   +value+ itself
  def self.format(value)
    return value unless value.respond_to?(:split)

    value.split(';')
  end
end
|
114
|
+
|
115
|
+
# Joins collections into a single '; '-delimited string.
class JoinFormatter
  # @param value any object
  # @return +value.join('; ')+ when +value+ responds to +join+, otherwise
  #   +value+ itself
  def self.format(value)
    return value unless value.respond_to?(:join)

    value.join('; ')
  end
end
|
120
|
+
|
121
|
+
# Adds a 'setSpec' entry derived from the record's 'id'.
class AddSetSpecFormatter
  # @param value [Hash] record whose 'id' is a '/'-separated string
  # @return [Hash] a copy of +value+ with 'setSpec' set to the part of 'id'
  #   before the first '/'
  def self.format(value)
    set_spec = value['id'].split('/').first
    value.merge('setSpec' => set_spec)
  end
end
|
126
|
+
|
127
|
+
# Reads the 'setSpec' entry of a record.
class SetSpecFormatter
  class << self
    # @param value hash-like record
    # @return the record's 'setSpec' entry
    def format(value)
      value['setSpec']
    end
  end
end
|
132
|
+
|
133
|
+
# Looks up the name of the record's collection in its 'oai_sets' table.
class CollectionNameFormatter
  # @param value record holding 'oai_sets' (keyed by setSpec) and 'setSpec'
  # @return the :name stored for the record's setSpec, or '' when the set or
  #   its :name is absent
  def self.format(value)
    set_details = value['oai_sets'].fetch(value['setSpec'], {})
    set_details.fetch(:name, '')
  end
end
|
139
|
+
|
140
|
+
# Looks up the description of the record's collection in its 'oai_sets' table.
class CollectionDescriptionFormatter
  # @param value record holding 'oai_sets' (keyed by setSpec) and 'setSpec'
  # @return the :description stored for the record's setSpec, or '' when the
  #   set or its :description is absent
  def self.format(value)
    set_details = value['oai_sets'].fetch(value['setSpec'], {})
    set_details.fetch(:description, '')
  end
end
|
146
|
+
|
147
|
+
# Blanks out the placeholder text used for collections without information.
class FilterBadCollections
  # @param value the text to check
  # @return '' when +value+ contains "Collection information undefined"
  #   (case-insensitive), otherwise +value+
  def self.format(value)
    return '' if /Collection information undefined/i =~ value

    value
  end
end
|
152
|
+
|
153
|
+
# Converts values with +to_i+.
class ToIFormatter
  # @param value any object
  # @return [Integer, nil] +value.to_i+, or nil when +value+ does not respond
  #   to +to_i+
  def self.format(value)
    return unless value.respond_to?(:to_i)

    value.to_i
  end
end
|
158
|
+
|
159
|
+
# Builds a "latitude, longitude" string from a record's 'latitu' and 'longit'
# fields.
class LocationFormatter
  # @param record hash-like record
  # @return [String, nil] "latitu, longit", or nil when either coordinate is
  #   missing
  def self.format(record)
    latitude = record['latitu']
    longitude = record['longit']
    return if missing?(latitude) || missing?(longitude)

    "#{latitude}, #{longitude}"
  end

  # A coordinate is missing when it is nil/false, an empty string or an empty
  # hash. The empty-hash check used to be applied to the latitude only, so an
  # empty-hash longitude leaked into the output as "lat, {}".
  def self.missing?(coordinate)
    !coordinate || coordinate == '' || coordinate == {}
  end
  private_class_method :missing?
end
|
168
|
+
|
169
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module CDMDEXER
  # Resolves the pluggable "hook" constants (Solr, CompletedCallback,
  # OaiNotification, LoaderNotification, CdmNotification) on first reference.
  #
  # Each hook resolves to another CDMDEXER constant whose name matches the
  # hook name, when one exists, and otherwise to the bundled Default*
  # implementation. Any other missing constant is passed to +super+, which
  # raises NameError as usual; previously such lookups silently returned nil.
  #
  # @param name [Symbol] the missing constant
  # @raise [NameError] when +name+ is not one of the hook names
  def self.const_missing(name)
    default = hook_default(name.to_s)
    return super unless default

    hook(pattern: name.to_s, default: default)
  end

  # Maps a hook name to its bundled default; nil for any other name.
  def self.hook_default(name)
    case name
    when 'Solr' then DefaultSolr
    when 'CompletedCallback' then DefaultCompletedCallback
    when 'OaiNotification' then DefaultOaiNotification
    when 'LoaderNotification' then DefaultLoaderNotification
    when 'CdmNotification' then DefaultCdmNotification
    end
  end
  private_class_method :hook_default

  # @param pattern [String] interpolated into a regex and matched against the
  #   names of the constants defined under CDMDEXER
  # @param default the value returned when no other constant matches
  # @return the first matching constant other than +default+, else +default+
  def self.hook(pattern: '', default: false)
    found = find_hook(pattern, default)
    found ? const_get(found) : default
  end

  # @return [Symbol, nil] name of the first CDMDEXER constant that is not
  #   +default+ and whose name matches +pattern+
  def self.find_hook(pattern, default)
    constants.find do |konst|
      const_get(konst) != default && /#{pattern}/ =~ konst.to_s
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'sidekiq'
|
2
|
+
module CDMDEXER
  # Sidekiq worker (queue 'critical') that hands records and deletable ids to
  # a loader together with a Solr client.
  class LoadWorker
    include Sidekiq::Worker
    sidekiq_options queue: 'critical'

    attr_reader :solr_config, :records, :deletables
    attr_writer :loader_klass, :solr_klass

    # @param records [Array] documents passed to the loader as +records:+
    # @param deletables [Array] ids passed to the loader as +deletable_ids:+
    # @param solr_config [Hash] settings for the Solr client; its keys are
    #   symbolized before use
    def perform(records = [], deletables = [], solr_config = {})
      @solr_config = solr_config.symbolize_keys
      @records = records
      @deletables = deletables
      load!
    end

    # @return the loader class; Loader unless one was assigned
    def loader_klass
      @loader_klass ||= Loader
    end

    # @return the Solr client class; DefaultSolr unless one was assigned
    def solr_klass
      @solr_klass ||= DefaultSolr
    end

    # Instantiates the loader for the current job state and runs it.
    def load!
      loader = loader_klass.new(
        records: records,
        deletable_ids: deletables,
        solr_client: solr_client
      )
      loader.load!
    end

    private

    # Memoized Solr client built from the symbolized configuration.
    def solr_client
      @solr_client ||= solr_klass.new(solr_config)
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module CDMDEXER
  # Applies one batch of changes to a Solr client: deletions, then additions.
  class Loader
    attr_reader :solr_client, :records, :deletable_ids

    # @param records [Array] documents passed to +solr_client.add+
    # @param deletable_ids [Array] ids passed to +solr_client.delete+
    # @param solr_client [#add, #delete]
    def initialize(records: [],
                   deletable_ids: [],
                   solr_client: CDMDEXER::DefaultSolr)
      @records = records
      @deletable_ids = deletable_ids
      @solr_client = solr_client
    end

    # Deletes +deletable_ids+ (skipped when empty) and then adds +records+.
    #
    # @return whatever +solr_client.add+ returns
    def load!
      purge_deletables
      solr_client.add(records)
    end

    private

    def purge_deletables
      return if deletable_ids.empty?

      solr_client.delete(deletable_ids)
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
|
2
|
+
require 'json'
|
3
|
+
require 'http'
|
4
|
+
module CDMDEXER
  # Minimal OAI client: issues a GET against the base URL and converts the
  # XML response body into a Hash.
  class OaiClient
    attr_reader :base_url, :client

    # @param base_url [String] OAI endpoint, without a query string
    # @param client [#get] HTTP client; the response's +to_s+ is used as body
    def initialize(base_url: '', client: HTTP)
      @base_url = base_url
      @client = client
    end

    # @param query [String] query string appended after '?', e.g.
    #   'verb=ListSets'
    # @return the result of Hash.from_xml on the response body
    def request(query)
      url = "#{base_url}?#{query}"
      body = client.get(url).to_s
      Hash.from_xml(body)
    end
  end
end
|
@@ -0,0 +1,100 @@
|
|
1
|
+
require 'hash_at_path'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
# Net::HTTP is the default client and URI() is used to build requests;
# require them here rather than relying on another file's load order.
require 'net/http'

module CDMDEXER
  # Light wrapper around OAI requests.
  #
  # Exposes the sets and record headers of an OAI endpoint and adds a little
  # extra value: a lookup of sets keyed by setSpec, and filters for deleted
  # and non-deleted records.
  class OaiRequest
    attr_reader :endpoint_url,
                :resumption_token,
                :client,
                :set_spec

    # @param endpoint_url [String] OAI endpoint, without a query string
    # @param resumption_token [String, nil] when given, the matching batch is
    #   requested instead of the first one
    # @param set_spec [String, nil] restricts the first batch to one set; note
    #   that the reader returns the rendered "&set=..." fragment (or '')
    # @param client [#get_response] HTTP client whose response exposes +body+
    def initialize(endpoint_url: '',
                   resumption_token: nil,
                   set_spec: nil,
                   client: Net::HTTP)
      @endpoint_url = endpoint_url
      @resumption_token = resumption_token
      @client = client
      @set_spec = set_spec ? "&set=#{set_spec}" : ''
    end

    # @return [Array<Hash>] every header of the current batch with an added
    #   :id: the text after the identifier's last ':' with '/' turned into ':'
    def records
      headers.map do |header|
        header.merge(
          id: header['identifier'].split(':').last.split('/').join(':')
        )
      end
    end

    # @return [Array<Hash>] the endpoint's sets; always an Array, even when
    #   the response holds a single set or none
    def sets
      @sets ||= force_array request(sets_endpoint_url).at_path('OAI_PMH/ListSets/set')
    end

    # @return [Hash] setSpec => { name:, description: } for every set
    def set_lookup
      sets.each_with_object({}) { |set, lookup| lookup.merge!(to_key(set)) }
    end

    # @return the resumptionToken of the current batch, if any
    def next_resumption_token
      identifier_request.at_path('OAI_PMH/ListIdentifiers/resumptionToken')
    end

    # @return [Array] :id of every record whose status is 'deleted'
    def deletable_ids
      records.select { |record| record['status'] == 'deleted' }
             .map { |record| record[:id] }
    end

    # @return [Array<Hash>] records whose status is not 'deleted'
    def updatables
      records.reject { |record| record['status'] == 'deleted' }
    end

    private

    # Headers of the current batch; empty when the response carries none.
    def headers
      force_array identifier_request.at_path('OAI_PMH/ListIdentifiers/header')
    end

    # Ensure results are a single-level array (single-row sets, records, etc).
    # A missing node arrives as nil and is dropped, so an empty response
    # yields [] instead of [nil], which used to crash callers.
    def force_array(result)
      [result].flatten.compact
    end

    def to_key(set)
      {
        set['setSpec'] => {
          name: set['setName'],
          description: set.at_path('setDescription/dc/description')
        }
      }
    end

    # Memoized ListIdentifiers response for this request.
    def identifier_request
      @identifier_request ||=
        resumption_token ? request(batch_endpoint_url) : request(first_batch_endpoint_url)
    end

    def first_batch_endpoint_url
      "#{endpoint_url}?verb=ListIdentifiers&metadataPrefix=oai_dc#{set_spec}"
    end

    def batch_endpoint_url
      "#{endpoint_url}?verb=ListIdentifiers&resumptionToken=#{resumption_token}"
    end

    def sets_endpoint_url
      "#{endpoint_url}?verb=ListSets"
    end

    # Announces the request through the OaiNotification hook, then fetches
    # +location+ and converts the XML body into a Hash.
    def request(location)
      CDMDEXER::OaiNotification.call!(location)
      Hash.from_xml(client.get_response(URI(location)).body)
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module CDMDEXER
  # Builds a destination record by applying each field mapping to a source
  # record and merging the per-field results.
  class RecordTransformer
    attr_reader :record, :field_mappings, :field_transformer

    # @param record [Hash] source record
    # @param field_mappings [Array] one entry per destination field
    # @param field_transformer [Class] instantiated with +field_mapping:+ and
    #   +record:+ and sent +reduce+, which must return a Hash
    def initialize(record: {},
                   field_mappings: [],
                   field_transformer: FieldTransformer)
      @record = record
      @field_mappings = field_mappings
      @field_transformer = field_transformer
    end

    # @return [Hash] the merged results of every mapping; later mappings win
    #   on key collisions
    def transform!
      field_mappings.each_with_object({}) do |field_mapping, dest_record|
        dest_record.merge!(transform_field(record, field_mapping))
      end
    end

    private

    def transform_field(record, field_mapping)
      transformer = field_transformer.new(field_mapping: field_mapping, record: record)
      transformer.reduce
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module CDMDEXER
  # Set filter that tests one field of an OAI ListSets entry against a
  # regular expression.
  class RegexFilterCallback
    attr_reader :field, :pattern, :inclusive

    # @param field [String] key of the set entry to test
    # @param pattern [#match?] expression the field is tested against
    # @param inclusive [Boolean] true keeps matching sets, false keeps the
    #   sets that do not match
    def initialize(field: 'setName', pattern: /.*/, inclusive: true)
      @field = field
      @pattern = pattern
      @inclusive = inclusive
    end

    # @param set [Hash] one set entry
    # @return [Boolean] whether the set passes the filter
    def valid?(set: {})
      matched = matches?(set)
      return matched if inclusive

      !matched
    end

    # @return [Boolean] whether the configured field matches the pattern
    def matches?(set)
      candidate = set[field]
      pattern.match?(candidate)
    end
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'cdmdexer'
|
2
|
+
|
3
|
+
namespace :cdmdexer do
  desc "delete all records that aren't in a given OAI endpoint"
  # Enqueues a BatchDeleterWorker job with the start offset (as an Integer),
  # prefix, OAI URL and Solr URL given on the command line.
  # NOTE(review): no batch_deleter_worker.rb appears in this package's file
  # list -- confirm the worker is defined before relying on this task.
  task :delete_batch, %i[start prefix oai_url solr_url] do |_task, args|
    worker_args = [
      args[:start].to_i,
      args[:prefix],
      args[:oai_url],
      args[:solr_url]
    ]
    CDMDEXER::BatchDeleterWorker.perform_async(*worker_args)
  end
end
|
12
|
+
|
@@ -0,0 +1,96 @@
|
|
1
|
+
require 'cdmdexer'
|
2
|
+
|
3
|
+
namespace :cdmdexer do
|
4
|
+
|
5
|
+
|
6
|
+
|
7
|
+
desc 'Ingest a Collection Synchronously'
# Runs the ETL worker inline (no background queue) against fixed endpoints
# for the 'mpls' set.
task :collection_sync do
  CDMDEXER::ETLWorker.new.perform(
    'solr_config' => { url: 'http://solr:8983/solr/mdl-1' },
    'oai_endpoint' => 'http://cdm16022.contentdm.oclc.org/oai/oai.php',
    'cdm_endpoint' => 'https://server16022.contentdm.oclc.org/dmwebservices/index.php',
    'set_spec' => 'mpls',
    'batch_size' => 10,
    'max_compounds' => 10
  )
end
|
20
|
+
|
21
|
+
|
22
|
+
desc 'Launch a background job to index metadata from CONTENTdm to Solr.'
# solr_url, oai_endpoint and cdm_endpoint are required; batch_size and
# max_compounds default to 10. A set_spec given as the literal two-character
# string "" is treated as "no set".
task :batch, %i[
  solr_url
  oai_endpoint
  cdm_endpoint
  set_spec
  batch_size
  max_compounds
] do |_task, args|
  requested_set = args[:set_spec]
  requested_set = nil if requested_set == '""'

  CDMDEXER::ETLWorker.perform_async(
    solr_config: { url: args.fetch(:solr_url) },
    oai_endpoint: args.fetch(:oai_endpoint),
    cdm_endpoint: args.fetch(:cdm_endpoint),
    set_spec: requested_set,
    batch_size: args.fetch(:batch_size, 10),
    max_compounds: args.fetch(:max_compounds, 10)
  )
end
|
40
|
+
|
41
|
+
desc 'Launch an indexing worker for each collection with an optional regex ' \
     'pattern matched against setName. Patterns can be inclusive or exclusive.'
# solr_url, oai_endpoint and cdm_endpoint are required. Without
# set_spec_pattern every set is indexed; with it, sets are filtered on their
# 'setName' field (kept when matching unless inclusive is given as anything
# other than 'true'). max_compounds is accepted as a trailing optional
# argument; it was read but never declared before, so it was always 10.
task :by_collections, %i[
  solr_url
  oai_endpoint
  cdm_endpoint
  set_spec_pattern
  inclusive
  batch_size
  max_compounds
] do |_task, args|
  oai_endpoint = args.fetch(:oai_endpoint)
  # Optional args
  pattern = args.fetch(:set_spec_pattern, false)
  inclusive = args.fetch(:inclusive, 'true') == 'true'

  # Define your own callback if you want to use other set related fields.
  # Use the RegexFilterCallback as an example of how to build your own filter.
  spec_options = { oai_base_url: oai_endpoint }
  if pattern
    spec_options[:callback] = CDMDEXER::RegexFilterCallback.new(
      field: 'setName',
      pattern: Regexp.new(pattern),
      inclusive: inclusive
    )
  end
  set_specs = CDMDEXER::FilteredSetSpecs.new(**spec_options).set_specs

  puts "Indexing Sets: '#{set_specs.join(', ')}'"

  etl_config = {
    solr_config: { url: args.fetch(:solr_url) },
    oai_endpoint: oai_endpoint,
    cdm_endpoint: args.fetch(:cdm_endpoint),
    batch_size: args.fetch(:batch_size, 5),
    max_compounds: args.fetch(:max_compounds, 10)
  }

  CDMDEXER::ETLBySetSpecs.new(set_specs: set_specs, etl_config: etl_config).run!
end
|
80
|
+
|
81
|
+
desc 'Launch a background job to index a single record.'
# All five arguments are required; the record is passed to the worker as a
# one-element list of [collection, id] pairs.
task :record, %i[
  collection
  id
  solr_url
  cdm_endpoint
  oai_endpoint
] do |_task, args|
  identifiers = [[args.fetch(:collection), args.fetch(:id)]]
  solr_config = { url: args.fetch(:solr_url) }

  CDMDEXER::TransformWorker.perform_async(
    identifiers,
    solr_config,
    args.fetch(:cdm_endpoint),
    args.fetch(:oai_endpoint)
  )
end
|
96
|
+
end
|