cdmdexer 0.17.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/.rubocop.yml +4 -0
  4. data/.travis.yml +8 -0
  5. data/CODE_OF_CONDUCT.md +49 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +21 -0
  8. data/README.md +149 -0
  9. data/Rakefile +11 -0
  10. data/bin/console +14 -0
  11. data/bin/setup +8 -0
  12. data/cdmdexer.gemspec +35 -0
  13. data/lib/cdmdexer/cdm_item.rb +89 -0
  14. data/lib/cdmdexer/default_cdm_notification.rb +8 -0
  15. data/lib/cdmdexer/default_completed_callback.rb +8 -0
  16. data/lib/cdmdexer/default_loader_notification.rb +8 -0
  17. data/lib/cdmdexer/default_oai_notification.rb +8 -0
  18. data/lib/cdmdexer/default_solr.rb +35 -0
  19. data/lib/cdmdexer/etl_by_set_specs.rb +18 -0
  20. data/lib/cdmdexer/etl_worker.rb +113 -0
  21. data/lib/cdmdexer/field_formatter.rb +13 -0
  22. data/lib/cdmdexer/field_mapping.rb +28 -0
  23. data/lib/cdmdexer/field_transformer.rb +41 -0
  24. data/lib/cdmdexer/filtered_set_specs.rb +41 -0
  25. data/lib/cdmdexer/formatters.rb +169 -0
  26. data/lib/cdmdexer/hooks.rb +31 -0
  27. data/lib/cdmdexer/load_worker.rb +36 -0
  28. data/lib/cdmdexer/loader.rb +19 -0
  29. data/lib/cdmdexer/oai_client.rb +26 -0
  30. data/lib/cdmdexer/oai_request.rb +100 -0
  31. data/lib/cdmdexer/rake_task.rb +6 -0
  32. data/lib/cdmdexer/record_transformer.rb +25 -0
  33. data/lib/cdmdexer/regex_filter_callback.rb +19 -0
  34. data/lib/cdmdexer/tasks/delete.rake +12 -0
  35. data/lib/cdmdexer/tasks/etl.rake +96 -0
  36. data/lib/cdmdexer/transform_worker.rb +93 -0
  37. data/lib/cdmdexer/transformer.rb +171 -0
  38. data/lib/cdmdexer/version.rb +3 -0
  39. data/lib/cdmdexer.rb +26 -0
  40. data/travis.yml +6 -0
  41. metadata +223 -0
@@ -0,0 +1,28 @@
1
module CDMDEXER
  # Value object around a single field-mapping configuration entry.
  # Keys of the supplied config are normalized to symbols on construction.
  class FieldMapping
    attr_reader :config

    # config - key/value pairs; keys may be Strings or Symbols.
    def initialize(config: {})
      @config = symbolize(config)
    end

    # Path to read from the source record. Raises KeyError when the
    # config has no :origin_path entry.
    def origin_path
      config.fetch(:origin_path)
    end

    # Destination key for the transformed value. Raises KeyError when the
    # config has no :dest_path entry.
    def dest_path
      config.fetch(:dest_path)
    end

    # Formatters configured for this mapping (DefaultFormatter when none are
    # given). String entries are resolved to constants with Object.const_get;
    # anything else is passed through untouched.
    def formatters
      configured = config.fetch(:formatters, [DefaultFormatter])
      configured.map do |entry|
        next entry unless entry.is_a?(String)

        Object.const_get(entry)
      end
    end

    private

    # Returns a new Hash with every key converted via #to_sym.
    def symbolize(config)
      config.each_with_object({}) do |(key, val), normalized|
        normalized[key.to_sym] = val
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
+ require 'hash_at_path'
2
+
3
# Forwardable is standard library but is not loaded automatically; require it
# here so `extend Forwardable` does not depend on another gem having done so.
require 'forwardable'

module CDMDEXER
  # Applies one field mapping to a source record: reads the value found at the
  # mapping's origin path, runs it through the mapping's formatters and
  # exposes the result keyed by the mapping's destination path.
  class FieldTransformer
    extend Forwardable
    def_delegators :@field_mapping, :origin_path, :dest_path, :formatters
    attr_reader :field_value, :field_mapping, :formatter_klass

    # field_mapping   - object responding to origin_path, dest_path,
    #                   formatters and config.
    # record          - source record; must respond to #at_path
    #                   (presumably supplied by the hash_at_path gem).
    # formatter_klass - class instantiated with value:/formatters: and
    #                   asked to #format!.
    def initialize(field_mapping: FieldMapping.new,
                   record: {},
                   formatter_klass: FieldFormatter)
      # @field_mapping must be assigned first: origin_path is delegated to it.
      @field_mapping = field_mapping
      @field_value = compact(record.at_path(origin_path))
      @formatter_klass = formatter_klass
    end

    # Returns { dest_path => value }, or an empty Hash when the transformed
    # value is blank.
    def reduce
      blank?(value) ? {} : { dest_path.to_s => value }
    end

    # The formatted field value, or nil when the raw field value is blank.
    # Memoized with defined? so that a nil/false result is cached as well;
    # otherwise every call (reduce makes two) would re-run the formatters.
    def value
      return @value if defined?(@value)

      @value = blank?(field_value) ? nil : transform_field
    end

    private

    # Drops nil entries when the raw value supports #compact.
    def compact(record)
      record.respond_to?(:compact) ? record.compact : record
    end

    # File activesupport/lib/active_support/core_ext/object/blank.rb, line 14
    def blank?(val)
      val.respond_to?(:empty?) ? !!val.empty? : !val
    end

    # Runs the formatter chain; any StandardError is re-raised as a
    # RuntimeError that names the offending mapping configuration.
    def transform_field
      formatter_klass.new(value: field_value, formatters: formatters).format!
    rescue StandardError => e
      raise "Mapping Error:#{field_mapping.config} Error:#{e.message}"
    end
  end
end
@@ -0,0 +1,41 @@
1
module CDMDEXER
  # Pass-through set filter: every set is considered valid.
  module DefaultFilterSetCallback
    def valid?(set: {})
      true
    end
  end

  # Instantiable form of the pass-through filter. FilteredSetSpecs uses this
  # name as its default callback; previously no such constant existed (only
  # the DefaultFilterSetCallback module, which cannot be instantiated), so
  # constructing FilteredSetSpecs without an explicit callback failed.
  class DefaultSetFilterCallback
    include DefaultFilterSetCallback
  end

  # Fetches the OAI ListSets response and returns the sets (or just their
  # setSpec values) accepted by a filter callback.
  class FilteredSetSpecs
    attr_reader :oai_base_url,
                :oai_client,
                :callback

    # oai_base_url - base URL handed to the OAI client.
    # oai_client   - class instantiated with base_url: and asked to
    #                #request('verb=ListSets').
    # callback     - object responding to valid?(set:).
    def initialize(oai_base_url: :missing_oai_base_url,
                   oai_client: OaiClient,
                   callback: CDMDEXER::DefaultSetFilterCallback.new)
      @oai_base_url = oai_base_url
      @oai_client = oai_client
      @callback = callback
    end

    # setSpec values of every set accepted by the callback.
    def set_specs
      filtered_sets.map { |set| set['setSpec'] }
    end

    # Sets for which the callback's valid? returns a truthy value (memoized).
    def filtered_sets
      @filtered_sets ||= sets.select do |set|
        callback.valid?(set: set)
      end
    end

    private

    # A response holding exactly one set yields a bare Hash rather than an
    # Array of Hashes; wrap and flatten so callers always iterate over sets
    # (mirrors OaiRequest#force_array). A missing set list becomes [].
    def sets
      @sets ||= [list_sets['OAI_PMH']['ListSets']['set']].flatten.compact
    end

    def list_sets
      @list_sets ||= oai_client.new(base_url: oai_base_url).request('verb=ListSets')
    end
  end
end
@@ -0,0 +1,169 @@
1
+ require 'titleize'
2
+ require 'json'
3
+ require 'net/http'
4
+ # A handful of very simple formatters to clean up CONTENTdm API metadata
5
module CDMDEXER
  # Returns the last '/'-separated segment of a String.
  class GeoNameID
    def self.format(value)
      value.split('/').last
    end
  end

  # Fetches the GeoNames JSON document for an id and parses it.
  # NOTE: performs a network request. Credentials are read from the
  # GEONAMES_USER / GEONAMES_TOKEN environment variables once, when this
  # class is loaded.
  class GeoNameIDToJson
    URL = "http://ws.geonames.net/getJSON?username=#{ENV['GEONAMES_USER']}&token=#{ENV['GEONAMES_TOKEN']}"
    def self.format(value)
      JSON.parse(Net::HTTP.get_response(URI.parse("#{URL}&geonameId=#{value}")).body)
    end
  end

  # Builds a "lat,lng" String from a hash-like value; nil when the value is
  # not hash-like or either coordinate is missing.
  class GeoNameToLocation
    def self.format(value)
      return unless value.respond_to?(:fetch)
      return unless value['lat'] && value['lng']

      "#{value['lat']},#{value['lng']}"
    end
  end

  # Collects name/adminName1/adminName2, dropping 'Minnesota', nils and
  # duplicates; nil when the value is not hash-like.
  class GeoNameToPlaceName
    def self.format(value)
      return unless value.respond_to?(:fetch)

      [value['name'], value['adminName1'], value['adminName2']]
        .reject { |place| place == 'Minnesota' }
        .compact
        .uniq
    end
  end

  # Identity formatter.
  class DefaultFormatter
    def self.format(value)
      value
    end
  end

  # Merges the ';'-separated 'genera', 'specif' and 'subjec' fields into one
  # sorted list of unique keywords. Fields that are not splittable are skipped.
  class KeywordFormatter
    FIELDS = %w[genera specif subjec].freeze

    def self.format(value)
      keywords = FIELDS.flat_map do |field|
        value[field].respond_to?(:split) ? value[field].split(';') : []
      end
      keywords.uniq.sort
    end
  end

  # De-duplicates collections. Values without #uniq are returned unchanged;
  # they used to be title-cased, apparently copied from Titlieze, which is
  # not something a uniqueness formatter should do.
  class UniqueFormatter
    def self.format(value)
      value.respond_to?(:uniq) ? value.uniq : value
    end
  end

  # Title-cases a String or every element of a collection (uses #titleize,
  # presumably from the titleize gem required by this file).
  class Titlieze
    def self.format(value)
      value.respond_to?(:map) ? value.map(&:titleize) : value.titleize
    end
  end

  # Identity formatter.
  class ImageId
    def self.format(value)
      value
    end
  end

  # Serializes the value with #to_json when it supports it; nil otherwise.
  class ToJsonFormatter
    def self.format(values)
      values.to_json if values.respond_to?(:to_json)
    end
  end

  # Removes every ';' from a String or from each element of a collection.
  class StripSemicolonFormatter
    def self.format(values)
      if values.respond_to?(:map)
        values.map { |value| value.gsub(/;/, '') }
      else
        values.gsub(/;/, '')
      end
    end
  end

  # Strips surrounding whitespace from a String or from each element of a
  # collection; nil becomes ''.
  class StripFormatter
    def self.format(values)
      return '' if values.nil?

      values.respond_to?(:map) ? values.map(&:strip) : values.strip
    end
  end

  # Splits on ';' when the value supports #split.
  class SplitFormatter
    def self.format(value)
      value.respond_to?(:split) ? value.split(';') : value
    end
  end

  # Joins with '; ' when the value supports #join.
  class JoinFormatter
    def self.format(value)
      value.respond_to?(:join) ? value.join('; ') : value
    end
  end

  # Adds a 'setSpec' entry holding the part of 'id' before the first '/'.
  class AddSetSpecFormatter
    def self.format(value)
      value.merge('setSpec' => value['id'].split('/').first)
    end
  end

  # Reads the 'setSpec' entry.
  class SetSpecFormatter
    def self.format(value)
      value['setSpec']
    end
  end

  # Looks up the :name recorded under value['oai_sets'] for the record's
  # setSpec; '' when the set or its name is unknown.
  class CollectionNameFormatter
    def self.format(value)
      value['oai_sets'].fetch(value['setSpec'], {})
                       .fetch(:name, '')
    end
  end

  # Looks up the :description recorded under value['oai_sets'] for the
  # record's setSpec; '' when the set or its description is unknown.
  class CollectionDescriptionFormatter
    def self.format(value)
      value['oai_sets'].fetch(value['setSpec'], {})
                       .fetch(:description, '')
    end
  end

  # Blanks out values containing "Collection information undefined"
  # (case-insensitive).
  class FilterBadCollections
    def self.format(value)
      /Collection information undefined/i =~ value ? '' : value
    end
  end

  # Converts with #to_i when supported; nil otherwise.
  class ToIFormatter
    def self.format(value)
      value.to_i if value.respond_to?(:to_i)
    end
  end

  # Builds a "latitude, longitude" String from the 'latitu' and 'longit'
  # fields; nil unless both hold a usable value.
  class LocationFormatter
    def self.format(record)
      latitude = record['latitu']
      longitude = record['longit']
      return nil unless usable?(latitude) && usable?(longitude)

      "#{latitude}, #{longitude}"
    end

    # nil, false, '' and {} all count as "no value". The empty-Hash check was
    # previously applied to latitude only, so a record with an empty-Hash
    # longitude produced strings such as "44.9, {}".
    def self.usable?(coordinate)
      !!coordinate && coordinate != '' && coordinate != {}
    end
    private_class_method :usable?
  end
end
@@ -0,0 +1,31 @@
1
module CDMDEXER
  # Resolves the overridable "hook" constants (Solr, CompletedCallback,
  # OaiNotification, LoaderNotification, CdmNotification). Each resolves to a
  # constant in this module whose name contains the hook name, or to the
  # corresponding Default* constant when none is found.
  #
  # Any other missing constant is passed to super, which raises NameError.
  # Previously nil was returned, so a misspelled constant surfaced later as a
  # confusing NoMethodError on nil.
  def self.const_missing(name)
    case name.to_s
    when 'Solr'
      hook(pattern: 'Solr', default: DefaultSolr)
    when 'CompletedCallback'
      hook(pattern: 'CompletedCallback', default: DefaultCompletedCallback)
    when 'OaiNotification'
      hook(pattern: 'OaiNotification', default: DefaultOaiNotification)
    when 'LoaderNotification'
      hook(pattern: 'LoaderNotification', default: DefaultLoaderNotification)
    when 'CdmNotification'
      hook(pattern: 'CdmNotification', default: DefaultCdmNotification)
    else
      super
    end
  end

  # Returns the first constant matching pattern (other than default), or
  # default when there is none. The constant list is scanned once.
  def self.hook(pattern: '', default: false)
    override = find_hook(pattern, default)
    override ? const_get(override) : default
  end

  # Name (Symbol) of the first constant in this module whose value is not
  # default and whose name matches pattern (interpreted as a regular
  # expression); nil when nothing matches.
  def self.find_hook(pattern, default)
    matcher = /#{pattern}/
    constants.find do |konst|
      const_get(konst) != default && matcher.match?(konst.to_s)
    end
  end
end
@@ -0,0 +1,36 @@
1
+ require 'sidekiq'
2
module CDMDEXER
  # Sidekiq worker that pushes a batch of records (and deletions) to Solr.
  class LoadWorker
    include Sidekiq::Worker
    sidekiq_options queue: 'critical'

    attr_reader :solr_config, :records, :deletables
    attr_writer :loader_klass, :solr_klass

    # records     - records handed to the loader for adding.
    # deletables  - ids handed to the loader for deletion.
    # solr_config - options for the Solr client; keys are symbolized via
    #               #symbolize_keys (not defined here; presumably ActiveSupport).
    def perform(records = [], deletables = [], solr_config = {})
      @solr_config = solr_config.symbolize_keys
      @records = records
      @deletables = deletables
      load!
    end

    # Loader class; defaults to Loader unless one was injected.
    def loader_klass
      @loader_klass ||= Loader
    end

    # Solr client class; defaults to DefaultSolr unless one was injected.
    def solr_klass
      @solr_klass ||= DefaultSolr
    end

    # Builds a loader around this job's data and runs it.
    def load!
      loader = loader_klass.new(
        records: records,
        deletable_ids: deletables,
        solr_client: solr_client
      )
      loader.load!
    end

    private

    # Memoized client built from the symbolized config.
    def solr_client
      @solr_client ||= solr_klass.new(solr_config)
    end
  end
end
@@ -0,0 +1,19 @@
1
module CDMDEXER
  # Sends deletions and additions to a Solr client.
  class Loader
    attr_reader :solr_client, :records, :deletable_ids

    # records       - passed to solr_client.add.
    # deletable_ids - passed to solr_client.delete when not empty.
    # solr_client   - object responding to #delete and #add.
    def initialize(records: [],
                   deletable_ids: [],
                   solr_client: CDMDEXER::DefaultSolr)
      @records = records
      @deletable_ids = deletable_ids
      @solr_client = solr_client
    end

    # Deletes first (skipped when there is nothing to delete), then adds.
    # Returns whatever solr_client.add returns.
    def load!
      remove_deleted
      solr_client.add(records)
    end

    private

    def remove_deleted
      return if deletable_ids.empty?

      solr_client.delete(deletable_ids)
    end
  end
end
@@ -0,0 +1,26 @@
1
+
2
+ require 'json'
3
+ require 'http'
4
module CDMDEXER
  # Minimal OAI client: issues a GET against base_url and converts the XML
  # response into a Hash.
  class OaiClient
    attr_reader :base_url, :client

    # base_url - endpoint the query string is appended to.
    # client   - object responding to #get(url); its result is stringified.
    def initialize(base_url: '', client: HTTP)
      @base_url = base_url
      @client = client
    end

    # query - raw query string, e.g. 'verb=ListSets'.
    # Returns the response parsed by Hash.from_xml (not defined here;
    # presumably ActiveSupport).
    def request(query)
      url = "#{base_url}?#{query}"
      xml = client.get(url).to_s
      Hash.from_xml(xml)
    end
  end
end
@@ -0,0 +1,100 @@
1
+ require 'hash_at_path'
2
+ require 'json'
3
+
4
# The default client is Net::HTTP and request builds a URI; load the library
# here instead of relying on another file having required it.
require 'net/http'

module CDMDEXER
  # Light wrapper around OAI requests
  # Enhances OAI responses with handles sets, records, etc and adds a little
  # extra value to their data with a keyed set lookup, filters for deleted
  # and non-deleted records
  class OaiRequest
    attr_reader :endpoint_url,
                :resumption_token,
                :client,
                :set_spec

    # endpoint_url     - OAI endpoint (no query string).
    # resumption_token - token of the batch to fetch; nil for the first batch.
    # set_spec         - optional set to restrict ListIdentifiers to. It is
    #                    stored as a ready-made "&set=..." query fragment
    #                    ('' when not given).
    # client           - object responding to #get_response(uri), whose
    #                    result responds to #body.
    def initialize(endpoint_url: '',
                   resumption_token: nil,
                   set_spec: nil,
                   client: Net::HTTP)
      @endpoint_url = endpoint_url
      @resumption_token = resumption_token
      @client = client
      @set_spec = set_spec ? "&set=#{set_spec}" : ''
    end

    # ListIdentifiers headers, each extended with an :id built from the last
    # ':'-separated part of 'identifier' with '/' replaced by ':'.
    # Empty when the response holds no headers.
    def records
      headers.map do |header|
        header.merge(
          id: header['identifier'].split(':').last.split('/').join(':')
        )
      end
    end

    # All sets from ListSets, always as an Array (memoized).
    def sets
      # Ensure a result of one set is still an array
      @sets ||= force_array request(sets_endpoint_url).at_path('OAI_PMH/ListSets/set')
    end

    # Hash of setSpec => { name:, description: } for every set.
    def set_lookup
      sets.each_with_object({}) { |set, lookup| lookup.merge!(to_key(set)) }
    end

    # Resumption token reported by the current batch, if any.
    def next_resumption_token
      identifier_request.at_path('OAI_PMH/ListIdentifiers/resumptionToken')
    end

    # Ids of records whose header status is 'deleted'.
    def deletable_ids
      records.select { |record| record['status'] == 'deleted' }
             .map { |record| record[:id] }
    end

    # Records whose header status is not 'deleted'.
    def updatables
      records.reject { |record| record['status'] == 'deleted' }
    end

    private

    def headers
      force_array identifier_request.at_path('OAI_PMH/ListIdentifiers/header')
    end

    # Ensure results are a single level array
    # (single row sets, records, etc)
    # A missing result (nil) becomes an empty Array rather than [nil], so
    # callers can iterate without tripping over nil entries.
    def force_array(result)
      [result].flatten.compact
    end

    def to_key(set)
      {
        set['setSpec'] =>
          {
            name: set['setName'],
            description: set.at_path('setDescription/dc/description')
          }
      }
    end

    # Response for the current batch (memoized): the resumption request when
    # a token was given, the initial ListIdentifiers request otherwise.
    def identifier_request
      @identifier_request ||=
        resumption_token ? request(batch_endpoint_url) : request(first_batch_endpoint_url)
    end

    def first_batch_endpoint_url
      "#{endpoint_url}?verb=ListIdentifiers&metadataPrefix=oai_dc#{set_spec}"
    end

    def batch_endpoint_url
      "#{endpoint_url}?verb=ListIdentifiers&resumptionToken=#{resumption_token}"
    end

    def sets_endpoint_url
      "#{endpoint_url}?verb=ListSets"
    end

    # Notifies the OaiNotification hook of the URL, then fetches and parses
    # it (Hash.from_xml is not defined here; presumably ActiveSupport).
    def request(location)
      CDMDEXER::OaiNotification.call!(location)
      Hash.from_xml(client.get_response(URI(location)).body)
    end
  end
end
@@ -0,0 +1,6 @@
1
+ require_relative '../cdmdexer'
2
module CDMDEXER
  # Loads every tasks/*.rake file that sits next to this file, but only when
  # Rake is already defined.
  module RakeTask
    if defined?(Rake)
      task_glob = File.expand_path(File.join(File.dirname(__FILE__), "tasks/*.rake"))
      Dir[task_glob].each { |rake_file| load rake_file }
    end
  end
end
@@ -0,0 +1,25 @@
1
module CDMDEXER
  # Builds a destination record by applying every field mapping to a source
  # record and merging the per-field results.
  class RecordTransformer
    attr_reader :record, :field_mappings, :field_transformer

    # record            - source record handed to each field transformer.
    # field_mappings    - list of mappings, applied in order.
    # field_transformer - class instantiated with field_mapping:/record:
    #                     whose #reduce returns a Hash.
    def initialize(record: {},
                   field_mappings: [],
                   field_transformer: FieldTransformer)
      @record = record
      @field_mappings = field_mappings
      @field_transformer = field_transformer
    end

    # Returns one Hash combining every field's #reduce result; later mappings
    # win when two of them produce the same key.
    def transform!
      field_mappings.each_with_object({}) do |mapping, destination|
        destination.merge!(transform_field(record, mapping))
      end
    end

    private

    def transform_field(record, field_mapping)
      transformer = field_transformer.new(field_mapping: field_mapping, record: record)
      transformer.reduce
    end
  end
end
@@ -0,0 +1,19 @@
1
module CDMDEXER
  # Search an OAI ListSets field using a regular expression
  class RegexFilterCallback
    attr_reader :field, :pattern, :inclusive

    # field     - key of the set entry to test.
    # pattern   - object responding to #match? (a Regexp by default).
    # inclusive - truthy: keep matching sets; falsy: keep non-matching sets.
    def initialize(field: 'setName', pattern: /.*/, inclusive: true)
      @field = field
      @pattern = pattern
      @inclusive = inclusive
    end

    # Whether the set passes the filter, honouring the inclusive flag.
    def valid?(set: {})
      matched = matches?(set)
      return matched if inclusive

      !matched
    end

    # Whether the set's field value matches the pattern.
    def matches?(set)
      candidate = set[field]
      pattern.match?(candidate)
    end
  end
end
@@ -0,0 +1,12 @@
1
+ require 'cdmdexer'
2
+
3
namespace :cdmdexer do
  desc "delete all records that aren't in a given OAI endpoint"
  # Enqueues a BatchDeleterWorker job with the task arguments; :start is
  # converted to an Integer, the others are passed through as given.
  task :delete_batch, %i[start prefix oai_url solr_url] do |_task, args|
    start = args[:start].to_i
    prefix = args[:prefix]
    oai_url = args[:oai_url]
    solr_url = args[:solr_url]
    CDMDEXER::BatchDeleterWorker.perform_async(start, prefix, oai_url, solr_url)
  end
end
12
+
@@ -0,0 +1,96 @@
1
+ require 'cdmdexer'
2
+
3
namespace :cdmdexer do
  desc 'Ingest a Collection Syncronously'
  # Runs ETLWorker inline (no background queue).
  # NOTE(review): every setting below is hard-coded; confirm this task is
  # meant as a development aid only.
  task :collection_sync do
    CDMDEXER::ETLWorker.new.perform(
      'solr_config' => { :url => "http://solr:8983/solr/mdl-1" },
      'oai_endpoint' => 'http://cdm16022.contentdm.oclc.org/oai/oai.php',
      'cdm_endpoint' => 'https://server16022.contentdm.oclc.org/dmwebservices/index.php',
      'set_spec' => 'mpls',
      'batch_size' => 10,
      'max_compounds' => 10
    )
  end

  desc 'Launch a background job to index metadata from CONTENTdm to Solr.'
  # solr_url, oai_endpoint and cdm_endpoint are required (fetch raises when
  # one is missing). A set_spec given as the literal two-character string ""
  # is treated as "no set". batch_size and max_compounds default to 10.
  task :batch, %i[
    solr_url
    oai_endpoint
    cdm_endpoint
    set_spec
    batch_size
    max_compounds
  ] do |_task, args|
    CDMDEXER::ETLWorker.perform_async(
      solr_config: { url: args.fetch(:solr_url) },
      oai_endpoint: args.fetch(:oai_endpoint),
      cdm_endpoint: args.fetch(:cdm_endpoint),
      set_spec: args[:set_spec] != '""' ? args[:set_spec] : nil,
      batch_size: args.fetch(:batch_size, 10),
      max_compounds: args.fetch(:max_compounds, 10)
    )
  end

  desc 'Launch an indexing worker for each collection with an optional regex
  pattern to match setSpec. Patterns can be inclusive or exclusive.'
  # :max_compounds is declared as a trailing (optional) argument. The task
  # body already read it, but because it was never declared it could not be
  # supplied and always fell back to 10.
  # NOTE(review): the description mentions setSpec, yet the filter below
  # matches on 'setName' - confirm which field is intended.
  task :by_collections, %i[
    solr_url
    oai_endpoint
    cdm_endpoint
    set_spec_pattern
    inclusive
    batch_size
    max_compounds
  ] do |_task, args|
    oai_endpoint = args.fetch(:oai_endpoint)
    # Optional args
    pattern = args.fetch(:set_spec_pattern, false)
    inclusive = args.fetch(:inclusive, 'true') == 'true'

    # Define your own callback if you want to use other set related fields
    # Use the RegexFilterCallback as an example of how to build your own filter
    filter_options = { oai_base_url: oai_endpoint }
    if pattern
      filter_options[:callback] =
        CDMDEXER::RegexFilterCallback.new(field: 'setName',
                                          pattern: Regexp.new(pattern),
                                          inclusive: inclusive)
    end
    set_specs = CDMDEXER::FilteredSetSpecs.new(**filter_options).set_specs

    puts "Indexing Sets: '#{set_specs.join(', ')}'"

    etl_config = {
      solr_config: { url: args.fetch(:solr_url) },
      oai_endpoint: oai_endpoint,
      cdm_endpoint: args.fetch(:cdm_endpoint),
      batch_size: args.fetch(:batch_size, 5),
      max_compounds: args.fetch(:max_compounds, 10)
    }

    CDMDEXER::ETLBySetSpecs.new(set_specs: set_specs, etl_config: etl_config).run!
  end

  desc 'Launch a background job to index a single record.'
  # Enqueues a TransformWorker for the single [collection, id] pair; all five
  # arguments are required.
  task :record, %i[
    collection
    id
    solr_url
    cdm_endpoint
    oai_endpoint
  ] do |_task, args|
    CDMDEXER::TransformWorker.perform_async(
      [[args.fetch(:collection), args.fetch(:id)]],
      { url: args.fetch(:solr_url) },
      args.fetch(:cdm_endpoint),
      args.fetch(:oai_endpoint)
    )
  end
end