cdmdexer 0.17.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/.rubocop.yml +4 -0
  4. data/.travis.yml +8 -0
  5. data/CODE_OF_CONDUCT.md +49 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +21 -0
  8. data/README.md +149 -0
  9. data/Rakefile +11 -0
  10. data/bin/console +14 -0
  11. data/bin/setup +8 -0
  12. data/cdmdexer.gemspec +35 -0
  13. data/lib/cdmdexer/cdm_item.rb +89 -0
  14. data/lib/cdmdexer/default_cdm_notification.rb +8 -0
  15. data/lib/cdmdexer/default_completed_callback.rb +8 -0
  16. data/lib/cdmdexer/default_loader_notification.rb +8 -0
  17. data/lib/cdmdexer/default_oai_notification.rb +8 -0
  18. data/lib/cdmdexer/default_solr.rb +35 -0
  19. data/lib/cdmdexer/etl_by_set_specs.rb +18 -0
  20. data/lib/cdmdexer/etl_worker.rb +113 -0
  21. data/lib/cdmdexer/field_formatter.rb +13 -0
  22. data/lib/cdmdexer/field_mapping.rb +28 -0
  23. data/lib/cdmdexer/field_transformer.rb +41 -0
  24. data/lib/cdmdexer/filtered_set_specs.rb +41 -0
  25. data/lib/cdmdexer/formatters.rb +169 -0
  26. data/lib/cdmdexer/hooks.rb +31 -0
  27. data/lib/cdmdexer/load_worker.rb +36 -0
  28. data/lib/cdmdexer/loader.rb +19 -0
  29. data/lib/cdmdexer/oai_client.rb +26 -0
  30. data/lib/cdmdexer/oai_request.rb +100 -0
  31. data/lib/cdmdexer/rake_task.rb +6 -0
  32. data/lib/cdmdexer/record_transformer.rb +25 -0
  33. data/lib/cdmdexer/regex_filter_callback.rb +19 -0
  34. data/lib/cdmdexer/tasks/delete.rake +12 -0
  35. data/lib/cdmdexer/tasks/etl.rake +96 -0
  36. data/lib/cdmdexer/transform_worker.rb +93 -0
  37. data/lib/cdmdexer/transformer.rb +171 -0
  38. data/lib/cdmdexer/version.rb +3 -0
  39. data/lib/cdmdexer.rb +26 -0
  40. data/travis.yml +6 -0
  41. metadata +223 -0
@@ -0,0 +1,28 @@
1
module CDMDEXER
  # Describes how one source field maps onto one destination field.
  #
  # The mapping is backed by a config hash. Keys may be supplied as Strings
  # or Symbols; they are converted with +to_sym+ when the object is built.
  class FieldMapping
    attr_reader :config

    # @param config [Hash] mapping settings (:origin_path, :dest_path and,
    #   optionally, :formatters)
    def initialize(config: {})
      @config = symbolize(config)
    end

    # @return [Object] the configured :origin_path
    # @raise [KeyError] when :origin_path is absent
    def origin_path
      config.fetch(:origin_path)
    end

    # @return [Object] the configured :dest_path
    # @raise [KeyError] when :dest_path is absent
    def dest_path
      config.fetch(:dest_path)
    end

    # Formatters to apply to the field. String entries are resolved to
    # constants through Object.const_get; any other entry is returned as
    # given. Falls back to [DefaultFormatter] when none are configured.
    #
    # @return [Array]
    def formatters
      configured = config.fetch(:formatters, [DefaultFormatter])
      configured.map do |entry|
        next entry unless entry.is_a?(String)

        Object.const_get(entry)
      end
    end

    private

    # Copy +config+ into a new Hash whose keys are Symbols.
    def symbolize(config)
      config.each_with_object({}) do |(key, val), symbolized|
        symbolized[key.to_sym] = val
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
+ require 'hash_at_path'
2
+
3
# Forwardable is standard library but is not loaded by default; require it
# here so `extend Forwardable` does not depend on load order elsewhere.
require 'forwardable'

module CDMDEXER
  # Pulls a single field out of a source record (via +at_path+) and runs it
  # through the formatters named by a field mapping.
  class FieldTransformer
    extend Forwardable
    def_delegators :@field_mapping, :origin_path, :dest_path, :formatters
    attr_reader :field_value, :field_mapping, :formatter_klass

    # @param field_mapping [#origin_path, #dest_path, #formatters, #config]
    # @param record [#at_path] source record; the value found at
    #   +origin_path+ is compacted when it responds to +compact+
    # @param formatter_klass [Class] built with +value:+ / +formatters:+ and
    #   sent +format!+
    def initialize(field_mapping: FieldMapping.new,
                   record: {},
                   formatter_klass: FieldFormatter)
      # @field_mapping must be assigned first: origin_path delegates to it.
      @field_mapping = field_mapping
      @field_value = compact(record.at_path(origin_path))
      @formatter_klass = formatter_klass
    end

    # @return [Hash] +{ dest_path => value }+, or an empty Hash when the
    #   transformed value is blank
    def reduce
      blank?(value) ? {} : { dest_path.to_s => value }
    end

    # @return [Object, nil] the formatted field value; nil when the raw
    #   field value is blank
    # @raise [RuntimeError] when formatting fails (see #transform_field)
    def value
      @value ||= blank?(field_value) ? nil : transform_field
    end

    private

    def compact(record)
      record.respond_to?(:compact) ? record.compact : record
    end

    # Same rule as ActiveSupport's Object#blank?: empty collections/strings,
    # nil and false are blank.
    def blank?(val)
      val.respond_to?(:empty?) ? !!val.empty? : !val
    end

    # Run the formatter; any StandardError is re-raised as a RuntimeError
    # whose message names the offending mapping config.
    def transform_field
      formatter_klass.new(value: field_value, formatters: formatters).format!
    rescue StandardError => e
      raise "Mapping Error:#{field_mapping.config} Error:#{e.message}"
    end
  end
end
@@ -0,0 +1,41 @@
1
module CDMDEXER
  # Pass-through set filter behaviour: every set is considered valid.
  module DefaultFilterSetCallback
    def valid?(set: {})
      true
    end
  end

  # Instantiable pass-through filter. FilteredSetSpecs uses an instance of
  # this class when no callback is supplied; a module alone cannot be
  # instantiated, so the behaviour is mixed into a class here.
  class DefaultSetFilterCallback
    include DefaultFilterSetCallback
  end

  # Fetches the OAI ListSets response and returns the setSpec of every set
  # the callback accepts.
  class FilteredSetSpecs
    attr_reader :oai_base_url,
                :oai_client,
                :callback

    # @param oai_base_url [String] passed to +oai_client.new+ as +base_url:+
    # @param oai_client [Class] built with +base_url:+ and sent
    #   +request('verb=ListSets')+
    # @param callback [#valid?] called as +valid?(set: set)+ for each set
    def initialize(oai_base_url: :missing_oai_base_url,
                   oai_client: OaiClient,
                   callback: CDMDEXER::DefaultSetFilterCallback.new)
      @oai_base_url = oai_base_url
      @oai_client = oai_client
      @callback = callback
    end

    # @return [Array] the 'setSpec' value of each accepted set
    def set_specs
      filtered_sets.map { |set| set['setSpec'] }
    end

    # @return [Array<Hash>] sets for which the callback returned truthy
    def filtered_sets
      @filtered_sets ||= sets.select do |set|
        callback.valid?(set: set)
      end
    end

    private

    # A response holding exactly one set yields a Hash rather than an Array
    # of Hashes, and one holding none yields nil; normalise both to an Array
    # so the filter always iterates over individual sets.
    def sets
      @sets ||= [list_sets['OAI_PMH']['ListSets']['set']].flatten.compact
    end

    def list_sets
      @list_sets ||= oai_client.new(base_url: oai_base_url).request('verb=ListSets')
    end
  end
end
@@ -0,0 +1,169 @@
1
+ require 'titleize'
2
+ require 'json'
3
+ require 'net/http'
4
+ # A handful of very simple formatters to clean up CONTENTdm API metadata
5
+ module CDMDEXER
6
+
7
+
8
# Reduces a "/"-separated string to its final segment.
class GeoNameID
  class << self
    # @param value [#split]
    # @return [String, nil] the last segment, nil when there is none
    def format(value)
      *, id = value.split('/')
      id
    end
  end
end
13
+
14
# Looks up a GeoNames id over HTTP and returns the parsed JSON response.
class GeoNameIDToJson
  # Base request URL. The GEONAMES_USER / GEONAMES_TOKEN environment
  # variables are read once, when this class body is evaluated.
  URL = "http://ws.geonames.net/getJSON?username=#{ENV['GEONAMES_USER']}&token=#{ENV['GEONAMES_TOKEN']}"

  # @param value [#to_s] appended to the URL as the geonameId parameter
  # @return [Object] the JSON-decoded response body
  def self.format(value)
    endpoint = URI.parse("#{URL}&geonameId=#{value}")
    response = Net::HTTP.get_response(endpoint)
    JSON.parse(response.body)
  end
end
20
+
21
# Builds a "lat,lng" string from a hash-like value holding 'lat' and 'lng'.
class GeoNameToLocation
  # @param value [Object]
  # @return [String, nil] nil when +value+ does not respond to +fetch+ or
  #   either coordinate is missing
  def self.format(value)
    return unless value.respond_to?(:fetch)

    lat = value['lat']
    lng = value['lng']
    return unless lat && lng

    "#{lat},#{lng}"
  end
end
28
+
29
# Collects the 'name', 'adminName1' and 'adminName2' entries of a hash-like
# value, dropping 'Minnesota', nils and duplicates.
class GeoNameToPlaceName
  # @param value [Object]
  # @return [Array, nil] nil when +value+ does not respond to +fetch+
  def self.format(value)
    return unless value.respond_to?(:fetch)

    names = %w[name adminName1 adminName2].map { |key| value[key] }
    names.reject { |place| place == 'Minnesota' }.compact.uniq
  end
end
39
+
40
# Identity formatter: hands the value back untouched.
class DefaultFormatter
  class << self
    # @param value [Object]
    # @return [Object] the same object that was passed in
    def format(value)
      value
    end
  end
end
45
+
46
# Merges the ';'-separated 'genera', 'specif' and 'subjec' entries of a
# record into one sorted, de-duplicated list.
class KeywordFormatter
  # @param value [Hash] record; entries that do not respond to +split+
  #   are ignored
  # @return [Array<String>]
  def self.format(value)
    terms = %w[genera specif subjec].flat_map do |field|
      raw = value[field]
      raw.respond_to?(:split) ? raw.split(';') : []
    end
    terms.uniq.sort
  end
end
55
+
56
# Removes duplicate entries from collection values.
class UniqueFormatter
  # @param value [Object]
  # @return [Object] +value.uniq+ when the value responds to +uniq+;
  #   otherwise the value itself, unchanged. De-duplication has no meaning
  #   for a scalar, so it is passed through rather than reformatted (use
  #   Titlieze when title-casing is wanted).
  def self.format(value)
    value.respond_to?(:uniq) ? value.uniq : value
  end
end
65
+
66
# Title-cases a value, or each element of a collection, via +titleize+.
class Titlieze
  # @param value [#titleize, #map]
  # @return [Object] the titleized value, or an Array of titleized elements
  #   when +value+ responds to +map+
  def self.format(value)
    return value.titleize unless value.respond_to?(:map)

    value.map(&:titleize)
  end
end
75
+
76
# Pass-through formatter: returns the value exactly as given.
class ImageId
  class << self
    # @param value [Object]
    # @return [Object] the same object that was passed in
    def format(value)
      value
    end
  end
end
81
+
82
+ class ToJsonFormatter
83
+ def self.format(values)
84
+ values.to_json if values.respond_to?(:to_json)
85
+ end
86
+ end
87
+
88
# Removes every ';' from a string, or from each element of a collection.
class StripSemicolonFormatter
  # @param values [#gsub, #map]
  # @return [Object] the cleaned string, or an Array of cleaned elements
  #   when +values+ responds to +map+
  def self.format(values)
    scrub = ->(text) { text.gsub(/;/, '') }
    values.respond_to?(:map) ? values.map(&scrub) : scrub.call(values)
  end
end
97
+
98
# Trims surrounding whitespace from a string or from each element of a
# collection.
class StripFormatter
  # @param values [nil, #strip, #map]
  # @return [Object] '' for nil; an Array of stripped elements when
  #   +values+ responds to +map+; otherwise +values.strip+
  def self.format(values)
    return '' if values.nil?
    return values.strip unless values.respond_to?(:map)

    values.map(&:strip)
  end
end
108
+
109
# Splits a value on ';' when it can be split.
class SplitFormatter
  # @param value [Object]
  # @return [Object] +value.split(';')+, or +value+ itself when it does not
  #   respond to +split+
  def self.format(value)
    return value unless value.respond_to?(:split)

    value.split(';')
  end
end
114
+
115
# Joins a collection into a single '; '-separated string.
class JoinFormatter
  # @param value [Object]
  # @return [Object] +value.join('; ')+, or +value+ itself when it does not
  #   respond to +join+
  def self.format(value)
    return value unless value.respond_to?(:join)

    value.join('; ')
  end
end
120
+
121
# Adds a 'setSpec' entry derived from the part of 'id' before the first '/'.
class AddSetSpecFormatter
  # @param value [Hash] record with a String 'id'
  # @return [Hash] a new hash: +value+ plus 'setSpec'
  def self.format(value)
    collection, = value['id'].split('/')
    value.merge('setSpec' => collection)
  end
end
126
+
127
# Extracts the 'setSpec' entry of a record.
class SetSpecFormatter
  class << self
    # @param value [#[]]
    # @return [Object, nil] whatever is stored under 'setSpec'
    def format(value)
      value['setSpec']
    end
  end
end
132
+
133
# Looks up the name of the record's set in its 'oai_sets' table.
class CollectionNameFormatter
  # @param value [Hash] record holding 'oai_sets' (keyed by setSpec) and
  #   'setSpec'
  # @return [Object] the set's :name, or '' when the set or name is absent
  def self.format(value)
    set_info = value['oai_sets'].fetch(value['setSpec'], {})
    set_info.fetch(:name, '')
  end
end
139
+
140
# Looks up the description of the record's set in its 'oai_sets' table.
class CollectionDescriptionFormatter
  # @param value [Hash] record holding 'oai_sets' (keyed by setSpec) and
  #   'setSpec'
  # @return [Object] the set's :description, or '' when the set or
  #   description is absent
  def self.format(value)
    set_info = value['oai_sets'].fetch(value['setSpec'], {})
    set_info.fetch(:description, '')
  end
end
146
+
147
# Blanks out values containing the text "Collection information undefined"
# (matched case-insensitively).
class FilterBadCollections
  # @param value [String, nil]
  # @return [Object] '' when the text matches, otherwise +value+ unchanged
  def self.format(value)
    return '' if /Collection information undefined/i.match?(value)

    value
  end
end
152
+
153
# Converts a value to an Integer with +to_i+.
class ToIFormatter
  # @param value [Object]
  # @return [Integer, nil] nil when +value+ does not respond to +to_i+
  def self.format(value)
    return unless value.respond_to?(:to_i)

    value.to_i
  end
end
158
+
159
# Builds a "latitude, longitude" string from a record's 'latitu' and
# 'longit' entries.
class LocationFormatter
  # Both coordinates get the same emptiness check: a coordinate that is
  # nil/false, '' or {} counts as missing. An empty hash in either field
  # must not be interpolated into the result.
  #
  # @param record [Hash]
  # @return [String, nil] "lat, lng", or nil when either coordinate is
  #   missing
  def self.format(record)
    coords = [record['latitu'], record['longit']]
    return if coords.any? { |coord| !coord || coord == '' || coord == {} }

    coords.join(', ')
  end
end
168
+
169
+ end
@@ -0,0 +1,31 @@
1
module CDMDEXER
  # Resolves the pluggable hook constants (Solr, CompletedCallback,
  # OaiNotification, LoaderNotification, CdmNotification).
  #
  # For a hook name, returns the first CDMDEXER constant whose name matches
  # the hook name and is not the built-in default; when there is none, the
  # built-in Default* constant is returned. Any other missing constant is
  # handed to the standard handler so it raises NameError instead of
  # silently evaluating to nil.
  #
  # @param name [Symbol] the constant that could not be found
  # @raise [NameError] for names that are not hooks
  def self.const_missing(name)
    default_name =
      case name.to_s
      when 'Solr'               then :DefaultSolr
      when 'CompletedCallback'  then :DefaultCompletedCallback
      when 'OaiNotification'    then :DefaultOaiNotification
      when 'LoaderNotification' then :DefaultLoaderNotification
      when 'CdmNotification'    then :DefaultCdmNotification
      end
    return super unless default_name

    hook(pattern: name.to_s, default: const_get(default_name))
  end

  # @param pattern [String] regular-expression source matched against
  #   constant names
  # @param default [Object] returned when no other constant matches
  # @return [Object] the matching constant's value, or +default+
  def self.hook(pattern: '', default: false)
    # Look the override up once; the scan walks every CDMDEXER constant.
    override = find_hook(pattern, default)
    override ? const_get(override) : default
  end

  # @return [Symbol, nil] name of the first CDMDEXER constant that matches
  #   +pattern+ and whose value is not +default+
  def self.find_hook(pattern, default)
    matcher = /#{pattern}/
    constants.find do |konst|
      matcher.match?(konst.to_s) && const_get(konst) != default
    end
  end
end
@@ -0,0 +1,36 @@
1
+ require 'sidekiq'
2
module CDMDEXER
  # Sidekiq worker that hands records and deletable ids to a loader backed
  # by a Solr client.
  class LoadWorker
    include Sidekiq::Worker
    sidekiq_options queue: 'critical'

    attr_reader :solr_config, :records, :deletables
    attr_writer :loader_klass, :solr_klass

    # Job entry point.
    #
    # @param records [Array] passed to the loader as +records:+
    # @param deletables [Array] passed to the loader as +deletable_ids:+
    # @param solr_config [Hash] keys are symbolized (+symbolize_keys+) and
    #   the result is given to +solr_klass.new+
    def perform(records = [], deletables = [], solr_config = {})
      @solr_config = solr_config.symbolize_keys
      @records     = records
      @deletables  = deletables
      load!
    end

    # @return [Class] loader class; Loader unless one was assigned
    def loader_klass
      @loader_klass ||= Loader
    end

    # @return [Class] Solr client class; DefaultSolr unless one was assigned
    def solr_klass
      @solr_klass ||= DefaultSolr
    end

    # Build the loader and run it.
    def load!
      loader = loader_klass.new(records: records,
                                deletable_ids: deletables,
                                solr_client: solr_client)
      loader.load!
    end

    private

    # Memoized Solr client built from the symbolized config.
    def solr_client
      @solr_client ||= solr_klass.new(solr_config)
    end
  end
end
@@ -0,0 +1,19 @@
1
module CDMDEXER
  # Applies one batch to a Solr client: deletes first, then adds.
  class Loader
    attr_reader :solr_client, :records, :deletable_ids

    # @param records [Array] sent to +solr_client.add+
    # @param deletable_ids [Array] sent to +solr_client.delete+ when
    #   non-empty
    # @param solr_client [#add, #delete]
    def initialize(records: [],
                   deletable_ids: [],
                   solr_client: CDMDEXER::DefaultSolr)
      @records       = records
      @deletable_ids = deletable_ids
      @solr_client   = solr_client
    end

    # @return [Object] whatever +solr_client.add+ returns
    def load!
      purge_deleted
      solr_client.add(records)
    end

    private

    # Skip the delete call entirely when there is nothing to delete.
    def purge_deleted
      return if deletable_ids.empty?

      solr_client.delete(deletable_ids)
    end
  end
end
@@ -0,0 +1,26 @@
1
+
2
+ require 'json'
3
+ require 'http'
4
module CDMDEXER
  # Minimal OAI client: GETs "<base_url>?<query>" and converts the XML
  # response body to a Hash with +Hash.from_xml+.
  class OaiClient
    attr_reader :base_url, :client

    # @param base_url [String] OAI endpoint, without a query string
    # @param client [#get] HTTP client; the result of +get+ is stringified
    #   with +to_s+
    def initialize(base_url: '', client: HTTP)
      @client   = client
      @base_url = base_url
    end

    # @param query [String] query string, e.g. 'verb=ListSets'
    # @return [Hash] the response converted by +Hash.from_xml+
    def request(query)
      url = "#{base_url}?#{query}"
      hashify(get(url))
    end

    private

    # Fetch +url+ and return the response as a String.
    def get(url)
      response = client.get(url)
      response.to_s
    end

    def hashify(xml)
      Hash.from_xml(xml)
    end
  end
end
@@ -0,0 +1,100 @@
1
+ require 'hash_at_path'
2
+ require 'json'
3
+
4
# The default client (Net::HTTP) and URI() both come from net/http; load it
# here rather than relying on another file having required it.
require 'net/http'

module CDMDEXER
  # Light wrapper around OAI ListIdentifiers / ListSets requests.
  # Adds an +id+ to each record header, offers a setSpec-keyed lookup of
  # sets, and splits records into deleted and non-deleted.
  class OaiRequest
    attr_reader :endpoint_url,
                :resumption_token,
                :client,
                :set_spec

    # @param endpoint_url [String] OAI endpoint, without a query string
    # @param resumption_token [String, nil] when given, the identifier
    #   request resumes from this token
    # @param set_spec [String, nil] stored as a "&set=..." query fragment
    #   ('' when nil)
    # @param client [#get_response] called with a URI; the result must
    #   respond to +body+
    def initialize(endpoint_url: '',
                   resumption_token: nil,
                   set_spec: nil,
                   client: Net::HTTP)
      @endpoint_url = endpoint_url
      @resumption_token = resumption_token
      @client = client
      @set_spec = set_spec ? "&set=#{set_spec}" : ''
    end

    # Record headers, each with an added +:id+: the last ':'-separated
    # segment of 'identifier' with '/' replaced by ':'.
    #
    # @return [Array<Hash>] empty when the response has no headers
    def records
      headers.map do |header|
        header.merge(
          id: header['identifier'].split(':').last.split('/').join(':')
        )
      end
    end

    # @return [Array<Hash>] the ListSets sets; always an Array, even for a
    #   single-set or empty response
    def sets
      @sets ||= force_array request(sets_endpoint_url).at_path('OAI_PMH/ListSets/set')
    end

    # @return [Hash] setSpec => { name:, description: }
    def set_lookup
      sets.each_with_object({}) { |set, lookup| lookup.merge!(to_key(set)) }
    end

    # @return [Object, nil] the resumptionToken of the identifier response
    def next_resumption_token
      identifier_request.at_path('OAI_PMH/ListIdentifiers/resumptionToken')
    end

    # @return [Array] ids of records whose 'status' is 'deleted'
    def deletable_ids
      records.select { |record| record['status'] == 'deleted' }
             .map { |record| record[:id] }
    end

    # @return [Array<Hash>] records whose 'status' is not 'deleted'
    def updatables
      records.reject { |record| record['status'] == 'deleted' }
    end

    private

    def headers
      force_array identifier_request.at_path('OAI_PMH/ListIdentifiers/header')
    end

    # Ensure results are a single-level Array. A lone result (a Hash) is
    # wrapped; a missing result (nil) becomes an empty Array so callers can
    # iterate without hitting nil.
    def force_array(result)
      [result].flatten.compact
    end

    def to_key(set)
      {
        set['setSpec'] =>
        {
          name: set['setName'],
          description: set.at_path('setDescription/dc/description')
        }
      }
    end

    # Memoized ListIdentifiers response; resumes when a token is present.
    def identifier_request
      @identifier_request ||=
        resumption_token ? request(batch_endpoint_url) : request(first_batch_endpoint_url)
    end

    def first_batch_endpoint_url
      "#{endpoint_url}?verb=ListIdentifiers&metadataPrefix=oai_dc#{set_spec}"
    end

    def batch_endpoint_url
      "#{endpoint_url}?verb=ListIdentifiers&resumptionToken=#{resumption_token}"
    end

    def sets_endpoint_url
      "#{endpoint_url}?verb=ListSets"
    end

    # Notify the OaiNotification hook, then fetch and parse +location+.
    def request(location)
      CDMDEXER::OaiNotification.call!(location)
      Hash.from_xml(client.get_response(URI(location)).body)
    end
  end
end
@@ -0,0 +1,6 @@
1
+ require_relative '../cdmdexer'
2
module CDMDEXER
  # Loads every *.rake file from the tasks/ directory next to this file,
  # but only when Rake is already defined.
  module RakeTask
    if defined?(Rake)
      tasks_glob = File.expand_path(File.join(File.dirname(__FILE__), "tasks/*.rake"))
      Dir[tasks_glob].each { |rake_file| load rake_file }
    end
  end
end
@@ -0,0 +1,25 @@
1
module CDMDEXER
  # Builds a destination record by applying every field mapping to a source
  # record and merging the per-field results.
  class RecordTransformer
    attr_reader :record, :field_mappings, :field_transformer

    # @param record [Object] source record handed to each field transformer
    # @param field_mappings [Array] one entry per destination field
    # @param field_transformer [Class] built with +field_mapping:+ and
    #   +record:+; must respond to +reduce+ with a Hash
    def initialize(record: {},
                   field_mappings: [],
                   field_transformer: FieldTransformer)
      @record            = record
      @field_mappings    = field_mappings
      @field_transformer = field_transformer
    end

    # @return [Hash] the merged results; later mappings win on key clashes
    def transform!
      field_mappings.each_with_object({}) do |mapping, dest_record|
        dest_record.merge!(transform_field(record, mapping))
      end
    end

    private

    def transform_field(record, field_mapping)
      transformer = field_transformer.new(field_mapping: field_mapping,
                                          record: record)
      transformer.reduce
    end
  end
end
@@ -0,0 +1,19 @@
1
module CDMDEXER
  # Set filter that tests one field of an OAI ListSets entry against a
  # regular expression.
  class RegexFilterCallback
    attr_reader :field, :pattern, :inclusive

    # @param field [String] key of the set entry to test
    # @param pattern [Regexp]
    # @param inclusive [Boolean] true keeps matching sets, false keeps the
    #   non-matching ones
    def initialize(field: 'setName', pattern: /.*/, inclusive: true)
      @field     = field
      @pattern   = pattern
      @inclusive = inclusive
    end

    # @return [Boolean] whether the set passes the filter
    def valid?(set: {})
      hit = matches?(set)
      if inclusive
        hit
      else
        !hit
      end
    end

    # @return [Boolean] whether the set's field matches the pattern
    def matches?(set)
      candidate = set[field]
      pattern.match?(candidate)
    end
  end
end
@@ -0,0 +1,12 @@
1
+ require 'cdmdexer'
2
+
3
namespace :cdmdexer do
  desc "delete all records that aren't in a given OAI endpoint"
  # Enqueues a BatchDeleterWorker job with the task arguments; :start is
  # converted with +to_i+, the rest are passed through as given.
  # NOTE(review): BatchDeleterWorker is not defined in the code visible
  # here - confirm it is provided elsewhere.
  task :delete_batch, %i[start prefix oai_url solr_url] do |_task, args|
    job_args = [
      args[:start].to_i,
      args[:prefix],
      args[:oai_url],
      args[:solr_url]
    ]
    CDMDEXER::BatchDeleterWorker.perform_async(*job_args)
  end
end
12
+
@@ -0,0 +1,96 @@
1
+ require 'cdmdexer'
2
+
3
namespace :cdmdexer do
  desc 'Ingest a Collection Syncronously'
  # Runs ETLWorker#perform inline with a fixed, hard-coded configuration.
  task :collection_sync do
    worker = CDMDEXER::ETLWorker.new
    worker.perform(
      'solr_config' => { url: 'http://solr:8983/solr/mdl-1' },
      'oai_endpoint' => 'http://cdm16022.contentdm.oclc.org/oai/oai.php',
      'cdm_endpoint' => 'https://server16022.contentdm.oclc.org/dmwebservices/index.php',
      'set_spec' => 'mpls',
      'batch_size' => 10,
      'max_compounds' => 10
    )
  end

  desc 'Launch a background job to index metadata from CONTENTdm to Solr.'
  # Enqueues one ETLWorker job. A :set_spec given as the literal two
  # characters "" is treated as "no set" and sent as nil.
  task :batch, %i[
    solr_url
    oai_endpoint
    cdm_endpoint
    set_spec
    batch_size
    max_compounds
  ] do |_task, args|
    set_spec = args[:set_spec]
    set_spec = nil if set_spec == '""'

    CDMDEXER::ETLWorker.perform_async(
      solr_config: { url: args.fetch(:solr_url) },
      oai_endpoint: args.fetch(:oai_endpoint),
      cdm_endpoint: args.fetch(:cdm_endpoint),
      set_spec: set_spec,
      batch_size: args.fetch(:batch_size, 10),
      max_compounds: args.fetch(:max_compounds, 10)
    )
  end

  desc "Launch an indexing worker for each collection with an optional regex\npattern to match setSpec. Patterns can be inclusive or exclusive."
  # Resolves the set specs to index (optionally filtered by a regex on
  # 'setName'), prints them, then runs ETLBySetSpecs over them.
  task :by_collections, %i[
    solr_url
    oai_endpoint
    cdm_endpoint
    set_spec_pattern
    inclusive
    batch_size
  ] do |_task, args|
    oai_endpoint = args.fetch(:oai_endpoint)
    # Optional args
    pattern = args.fetch(:set_spec_pattern, false)
    inclusive = args.fetch(:inclusive, 'true') == 'true'

    # Only pass a callback when a pattern was given; otherwise
    # FilteredSetSpecs falls back to its own default. To filter on other
    # set fields, supply a callback modelled on RegexFilterCallback.
    spec_options = { oai_base_url: oai_endpoint }
    if pattern
      spec_options[:callback] =
        CDMDEXER::RegexFilterCallback.new(field: 'setName',
                                          pattern: Regexp.new(pattern),
                                          inclusive: inclusive)
    end
    set_specs = CDMDEXER::FilteredSetSpecs.new(**spec_options).set_specs

    puts "Indexing Sets: '#{set_specs.join(', ')}'"

    etl_config = {
      solr_config: { url: args.fetch(:solr_url) },
      oai_endpoint: args.fetch(:oai_endpoint),
      cdm_endpoint: args.fetch(:cdm_endpoint),
      batch_size: args.fetch(:batch_size, 5),
      max_compounds: args.fetch(:max_compounds, 10)
    }

    runner = CDMDEXER::ETLBySetSpecs.new(set_specs: set_specs, etl_config: etl_config)
    runner.run!
  end

  desc 'Launch a background job to index a single record.'
  # Enqueues a TransformWorker job for one [collection, id] pair.
  task :record, %i[
    collection
    id
    solr_url
    cdm_endpoint
    oai_endpoint
  ] do |_task, args|
    identifiers = [[args.fetch(:collection), args.fetch(:id)]]
    CDMDEXER::TransformWorker.perform_async(
      identifiers,
      { url: args.fetch(:solr_url) },
      args.fetch(:cdm_endpoint),
      args.fetch(:oai_endpoint)
    )
  end
end