spotlight-oaipmh-resources 0.3.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (30) hide show
  1. checksums.yaml +7 -0
  2. data/Rakefile +56 -0
  3. data/app/controllers/spotlight/resources/harvester_controller.rb +58 -0
  4. data/app/jobs/spotlight/resources/perform_harvests_job.rb +44 -0
  5. data/app/mailer/spotlight/harvesting_complete_mailer.rb +20 -0
  6. data/app/models/spotlight/resources/exceptions.rb +17 -0
  7. data/app/models/spotlight/resources/harvest_type.rb +7 -0
  8. data/app/models/spotlight/resources/harvester.rb +46 -0
  9. data/app/models/spotlight/resources/oaipmh_harvester.rb +41 -0
  10. data/app/models/spotlight/resources/oaipmh_mods_converter.rb +468 -0
  11. data/app/models/spotlight/resources/oaipmh_mods_item.rb +61 -0
  12. data/app/models/spotlight/resources/solr_converter.rb +180 -0
  13. data/app/models/spotlight/resources/solr_harvester.rb +42 -0
  14. data/app/models/spotlight/resources/solr_harvesting_item.rb +50 -0
  15. data/app/services/spotlight/resources/oaipmh_builder.rb +166 -0
  16. data/app/services/spotlight/resources/solr_harvesting_builder.rb +115 -0
  17. data/app/views/catalog/_show.html.erb +10 -0
  18. data/app/views/spotlight/harvesting_complete_mailer/harvest_failed.html.erb +6 -0
  19. data/app/views/spotlight/harvesting_complete_mailer/harvest_indexed.html.erb +13 -0
  20. data/app/views/spotlight/resources/harvester/_form.html.erb +36 -0
  21. data/config/default_solr_mapping.yml +20 -0
  22. data/config/locales/en.yml +32 -0
  23. data/config/mapping.yml +172 -0
  24. data/config/marc_mapping.yml +190 -0
  25. data/config/routes.rb +5 -0
  26. data/lib/generators/spotlight/oaipmh/resources/install_generator.rb +16 -0
  27. data/lib/spotlight/oaipmh/resources.rb +11 -0
  28. data/lib/spotlight/oaipmh/resources/engine.rb +23 -0
  29. data/lib/spotlight/oaipmh/resources/version.rb +8 -0
  30. metadata +253 -0
@@ -0,0 +1,61 @@
1
+ require 'oai'
2
+ require 'mods'
3
+
4
+ include OAI::XPath
5
+ include Spotlight::Resources::Exceptions
6
+ module Spotlight::Resources
7
# Wraps a single harvested record whose metadata payload is MODS XML, derives
# the document id and titles from it, and exposes the Solr hash and sidecar
# data produced by the supplied converter.
class OaipmhModsItem
  attr_reader :titles, :id
  attr_accessor :metadata, :sidecar_data

  # These readers are intentionally public: the `private` marker that used to
  # precede them was commented out, so outside callers may depend on them.
  attr_reader :solr_hash, :exhibit

  # exhibit   - object responding to #id; its id prefixes every document id.
  # converter - object responding to #convert(mods_record) and #sidecar_hash.
  def initialize(exhibit, converter)
    @solr_hash = {}
    @exhibit = exhibit
    @converter = converter
  end

  # Returns the converted Solr hash with the :id key filled in.
  def to_solr
    add_document_id
    solr_hash
  end

  # Parses the first element of +metadata+ as a MODS record, extracts the id
  # and titles, validates them and runs the converter.
  #
  # Raises InvalidModsRecord when the record has no title and/or no identifier.
  def parse_mods_record
    @modsrecord = Mods::Record.new.from_str(metadata.elements.to_a[0].to_s)
    @id = extract_document_id
    @titles = extract_titles

    validate_record!

    @solr_hash = @converter.convert(@modsrecord)
    @sidecar_data = @converter.sidecar_hash
  end

  # Copies the parsed id (as a String) into the Solr hash.
  def add_document_id
    solr_hash[:id] = @id.to_s
  end

  private

  # Builds "<exhibit id>-<record identifier without periods>".
  # Returns nil when the record carries no usable identifier, so that the
  # missing-identifier validation can actually trigger (previously an absent
  # identifier still produced the non-blank id "<exhibit id>-").
  def extract_document_id
    record_info = @modsrecord.mods_ng_xml.record_info
    identifier = record_info && record_info.recordIdentifier
    return nil unless identifier

    # Strip out all of the periods
    raw_id = identifier.text.to_s.delete('.')
    return nil if raw_id.strip.empty?

    "#{@exhibit.id}-#{raw_id}"
  end

  # Returns the record's full titles, or nil when the MODS record cannot
  # provide them (the lookup raising NoMethodError is treated as "no titles").
  def extract_titles
    @modsrecord.full_titles
  rescue NoMethodError
    nil
  end

  # Raises InvalidModsRecord with a message naming whatever is missing.
  def validate_record!
    if @titles.blank? && @id.blank?
      raise InvalidModsRecord, "A mods record was found that has no title and no identifier."
    elsif @titles.blank?
      raise InvalidModsRecord, "Mods record #{@id} must have a title. This mods record was not updated in Spotlight."
    elsif @id.blank?
      raise InvalidModsRecord, "Mods record #{@titles[0]} must have an identifier. This mods record was not updated in Spotlight."
    end
  end
end
61
+ end
@@ -0,0 +1,180 @@
1
+ include Spotlight::Resources::Exceptions
2
+ module Spotlight::Resources
3
+
4
# Plain value holder for the name of one source Solr field.
class SolrEntry
  # Name of the Solr field to read from a harvested record.
  attr_reader :solr_field
  attr_writer :solr_field
end
7
# One mapping entry: describes how the values of one or more source Solr
# fields are combined into a single Spotlight field value.
class ConverterItem
  # Separator used when several values are joined into one string.
  DEFAULT_DELIMITER = ', '.freeze

  attr_accessor :spotlight_field, :solr_items, :default_value, :delimiter, :multivalue_facets

  def initialize
    # Must assign the instance variable: a bare `delimiter = ...` only creates
    # a local and leaves the attribute nil, which joins values with no separator.
    @delimiter = DEFAULT_DELIMITER
  end

  # Collects the values for this entry from +solrmd+ (anything indexable by
  # Solr field name), removing duplicates.
  #
  # Returns nil when nothing was found, the array of values when
  # +multivalue_facets+ is set, otherwise the values joined with +delimiter+.
  def extract_values(solrmd)
    values = extract_solr_values(solrmd).uniq
    return nil if values.empty?

    # Any truthy setting (true, "yes", or any other non-nil/non-false value)
    # keeps the values as an array so that each one can be faceted on.
    return values if multivalue_facets

    values.join(delimiter)
  end

  private

  # Returns one value per configured Solr item: the value found in +solrmd+,
  # or +default_value+ when the field is blank and a default is configured.
  def extract_solr_values(solrmd)
    return [] if solr_items.nil?

    solr_items.each_with_object([]) do |item, values|
      found = solrmd[item.solr_field]
      if !found.blank?
        values << found
      elsif !default_value.blank?
        values << default_value
      end
    end
  end
end
55
+
56
# Converts a harvested Solr record into the Solr hash and sidecar hash used
# by Spotlight, driven by a YAML mapping file.
class SolrConverter
  # Fields that are stored under their own name; every other field is
  # prefixed with the exhibit slug (see #get_spotlight_field_name).
  STANDARD_SPOTLIGHT_FIELDS = [
    'unique-id_tesim',
    'full_title_tesim',
    'spotlight_upload_description_tesim',
    'thumbnail_url_ssm',
    'full_image_url_ssm',
    'spotlight_upload_date_tesim',
    'spotlight_upload_attribution_tesim'
  ].freeze

  attr_accessor :sidecar_hash

  # Initialize with the name of the set being converted.
  #
  # set          - name of the set being harvested.
  # exhibitslug  - slug used to prefix non-standard field names.
  # mapping_file - file name of a custom mapping, or nil for the default one.
  def initialize(set, exhibitslug, mapping_file)
    @set = set
    @exhibitslug = exhibitslug
    @mapping_file = mapping_file
    @converter_items = []
    @sidecar_hash = {}
  end

  # Converts +solrrecord+ and returns the resulting Solr hash. The matching
  # sidecar data is available from #sidecar_hash afterwards.
  def convert(solrrecord)
    parse_mapping_file(mapping_file) if @converter_items.empty?

    # Start from a fresh sidecar hash for every record: handing out one shared,
    # mutated hash lets values of an earlier record show up in later ones.
    @sidecar_hash = {}

    @converter_items.each_with_object({}) do |item, solr_hash|
      value = item.extract_values(solrrecord)

      # Every mapped field is always written, even when the value is nil.
      solr_hash[get_spotlight_field_name(item.spotlight_field)] = value
      @sidecar_hash[item.spotlight_field] = value
    end
  end

  # Some spotlight fields use the exhibit slug, others do not.
  def get_spotlight_field_name(spotlight_field)
    return spotlight_field if STANDARD_SPOTLIGHT_FIELDS.include?(spotlight_field)

    'exhibit_' + @exhibitslug + '_' + spotlight_field
  end

  # Returns the path of the mapping file: the custom one under
  # public/uploads/solrmapping when a file name was given, otherwise the
  # default mapping shipped with the engine. The path is resolved only once.
  # NOTE(review): the custom file name is joined as-is; confirm callers only
  # pass names taken from the upload directory listing.
  def mapping_file
    @mapping_file_path ||=
      if @mapping_file.nil?
        File.join(Spotlight::Oaipmh::Resources::Engine.root, 'config', 'default_solr_mapping.yml')
      else
        Rails.root.join("public/uploads/solrmapping", @mapping_file)
      end
  end

  # Parses the mapping file into ConverterItem objects and appends them to
  # the converter's item list, which is returned.
  #
  # Raises InvalidMappingFile when the file is not a list of entries or an
  # entry lacks a required key.
  def parse_mapping_file(file)
    mapping_config = YAML.load_file(file)
    unless mapping_config.is_a?(Array)
      raise InvalidMappingFile, "mapping file must contain a list of field entries"
    end

    mapping_config.each do |field|
      item = build_converter_item(field)
      remember_unique_id_field(field, item) if field['spotlight-field'].eql?("unique-id_tesim")
      @converter_items << item
    end
    @converter_items
  end

  # Name of the source field(s) providing the unique id; nil until a mapping
  # containing a unique-id_tesim entry has been parsed.
  def get_unique_id_field
    @unique_id_field
  end

  private

  # Builds a ConverterItem from one mapping entry, validating required keys.
  def build_converter_item(field)
    if field['spotlight-field'].blank?
      raise InvalidMappingFile, "spotlight-field is required for each entry"
    end

    item = ConverterItem.new
    item.spotlight_field = field['spotlight-field']
    item.delimiter = field["delimiter"] if field.key?("delimiter")
    item.default_value = field["default-value"] if field.key?("default-value")
    item.multivalue_facets = field["multivalue-breaks"] if field.key?("multivalue-breaks")

    # must have a solr-field value
    raise InvalidMappingFile, "solr-field is required for each entry" unless field.key?("solr-field")

    item.solr_items = field['solr-field'].map { |solr_field| build_solr_entry(solr_field) }
    item
  end

  # Builds a SolrEntry from one element of an entry's solr-field list.
  def build_solr_entry(solr_field)
    if solr_field['field-name'].blank?
      raise InvalidMappingFile, "field-name is required for each solr-field entry"
    end

    solritem = SolrEntry.new
    solritem.solr_field = solr_field['field-name']
    solritem
  end

  # Records which source field names make up the unique id, joined with the
  # entry's delimiter (empty string when none is configured).
  def remember_unique_id_field(field, item)
    delimiter = field["delimiter"].blank? ? "" : field["delimiter"]
    @unique_id_field = item.solr_items.map(&:solr_field).join(delimiter)
  end
end
180
+ end
@@ -0,0 +1,42 @@
1
+ require 'net/http'
2
+ require 'uri'
3
+
4
+ module Spotlight::Resources
5
# Pages through the documents of a remote Solr core/collection via RSolr.
class SolrHarvester
  # Number of documents requested per page.
  ROW_COUNT = 50

  # base_url - URL prefix of the Solr server.
  # set      - suffix appended to +base_url+ to form the connection URL.
  def initialize(base_url, set)
    @base_url = base_url
    @set = set
    @url = base_url + set
  end

  # Opens a new connection and returns the response for page 0 of a
  # match-all query.
  def get_harvests
    @solr_connection = RSolr.connect(:url => @url)
    @solr_connection.paginate(0, ROW_COUNT, 'select', :params => { :q => '*:*', :wt => 'json' })
  end

  # Returns the response for +page+ of a match-all query, connecting first
  # when no connection exists yet.
  def paginate(page)
    @solr_connection = RSolr.connect(:url => @url) if @solr_connection.nil?
    @solr_connection.paginate(page, ROW_COUNT, 'select', :params => { :q => '*:*', :wt => 'json' })
  end

  # Lists the selectable mapping files: the two fixed choices followed by the
  # entries of public/uploads/solrmapping (relative to the current working
  # directory) when that directory exists.
  def self.mapping_files
    upload_dir = 'public/uploads/solrmapping'
    uploaded = Dir.exist?(upload_dir) ? Dir.entries(upload_dir) - [".", ".."] : []
    ["Default Mapping File", "New Mapping File"] + uploaded
  end
end
42
+ end
@@ -0,0 +1,50 @@
1
+
2
+ include Spotlight::Resources::Exceptions
3
+ module Spotlight::Resources
4
# Wraps one harvested Solr record, derives its document id and exposes the
# Solr hash and sidecar data produced by the supplied converter.
class SolrHarvestingItem
  attr_reader :titles, :id
  attr_accessor :metadata, :sidecar_data

  # These readers are intentionally public: the `private` marker that used to
  # precede them was commented out, so outside callers may depend on them.
  attr_reader :solr_hash, :exhibit

  # exhibit   - object responding to #id; its id prefixes every document id.
  # converter - object responding to #convert(record) and #sidecar_hash.
  def initialize(exhibit, converter)
    @solr_hash = {}
    @exhibit = exhibit
    @converter = converter
  end

  # Returns the converted Solr hash with the :id key filled in.
  def to_solr
    add_document_id
    solr_hash
  end

  # Derives the document id from metadata[unique_id_field] (first element
  # when it is an array) and runs the converter over +metadata+.
  def parse_record(unique_id_field)
    raw_id = metadata[unique_id_field]
    unless raw_id.blank?
      raw_id = raw_id[0] if raw_id.kind_of?(Array)

      # Strip out all of the periods. The value is coerced to a String first
      # because source ids are not necessarily strings (e.g. numeric ids).
      cleaned = raw_id.to_s.delete('.')
      @id = "#{@exhibit.id}-#{cleaned}" unless cleaned.empty?
    end

    @solr_hash = @converter.convert(metadata)
    @sidecar_data = @converter.sidecar_hash
  end

  # Writes the document id into the Solr hash.
  def add_document_id
    if !@id.blank?
      solr_hash[:id] = @id.to_s
    else
      # Generate a random number if no unique id is supplied.
      # NOTE(review): such ids are not stable across harvests.
      solr_hash[:id] = rand.to_s[2..11]
    end
  end
end
50
+ end
@@ -0,0 +1,166 @@
1
+ module Spotlight
2
+ module Resources
3
+ # transforms a OaipmhHarvester into solr documents
4
# Transforms an OaipmhHarvester resource into Solr documents, one per
# harvested record, yielding each document to the caller's block.
class OaipmhBuilder < Spotlight::SolrDocumentBuilder
  # Harvests the resource's set page by page and yields one Solr document
  # per successfully converted record. Without a block an enumerator is
  # returned.
  #
  # Records that fail to parse or index are logged and collected; they do not
  # abort the harvest. When the configured batch maximum is reached and more
  # pages remain, the next batch is scheduled as a job and the loop stops.
  # Any failure outside per-record handling marks the job entry as failed,
  # sends the failure mail and re-raises.
  def to_solr
    begin
      return to_enum(:to_solr) { 0 } unless block_given?

      base_doc = super

      max_batch_count = Spotlight::Oaipmh::Resources::Engine.config.oai_harvest_batch_max

      @oai_mods_converter = OaipmhModsConverter.new(resource.data[:set], resource.exhibit.slug, selected_mapping_file)

      count = 0
      totalrecords = 0
      failed_items = nil

      # If the resumption token was stored, begin there.
      harvests =
        if resource.data.include?(:cursor) && !resource.data[:cursor].blank?
          resource.paginate(resource.data[:cursor])
        else
          resource.harvests
        end
      resumption_token = harvests.resumption_token

      if resource.data.include?(:count) && !resource.data[:count].blank?
        totalrecords = resource.data[:count]
      end

      last_page_evaluated = false
      until last_page_evaluated
        # No token means the page about to be processed is the final one.
        last_page_evaluated = true if resumption_token.nil?

        harvests.each do |record|
          @item = OaipmhModsItem.new(exhibit, @oai_mods_converter)
          @item.metadata = record.metadata
          begin
            # Parsing happens inside the rescue so that a single invalid
            # record is reported as failed instead of aborting the harvest.
            @item.parse_mods_record
            @item_solr = @item.to_solr
            @item_sidecar = @item.sidecar_data

            process_images

            # Add the sidecar info for editing
            sidecar = resource.document_model.new(id: @item.id).sidecar(resource.exhibit)
            sidecar.update(data: @item_sidecar)
            yield base_doc.merge(@item_solr) if @item_solr.present?

            count += 1
            totalrecords += 1
            resource.get_job_entry.update(job_item_count: totalrecords, end_time: Time.zone.now)
          rescue StandardError => e
            label = failed_item_label
            log_error("#{label} did not index successfully", e)
            failed_items ||= []
            failed_items << label
          end
        end

        # Stop harvesting if the batch has reached the maximum allowed value
        unless resumption_token.nil?
          if max_batch_count != -1 && count >= max_batch_count
            schedule_next_batch(resumption_token, totalrecords, failed_items)
            break
          else
            harvests = resource.paginate(resumption_token)
            resumption_token = harvests.resumption_token
          end
        end
      end
    rescue Exception => e # rubocop:disable Lint/RescueException
      # Deliberately broad: this handler only does failure bookkeeping and
      # always re-raises, so nothing is swallowed.
      resource.get_job_entry.failed!
      log_error("#{resource.data[:set]} harvest failed", e)
      Spotlight::HarvestingCompleteMailer.harvest_failed(resource.data[:set], resource.exhibit, resource.data[:user], e.message).deliver_now
      raise
    end

    if last_page_evaluated
      resource.get_job_entry.succeeded!
      # Send job message
      Spotlight::HarvestingCompleteMailer.harvest_indexed(resource.data[:set], resource.exhibit, resource.data[:user], failed_items).deliver_now
    end
  end

  private

  # Returns the custom mapping file name stored on the resource, or nil when
  # one of the two placeholder choices (or nothing) was selected.
  def selected_mapping_file
    choice = resource.data[:mapping_file]
    return nil if choice.eql?("Default Mapping File") || choice.eql?("New Mapping File")

    choice
  end

  # Label used to report the current item as failed: its id when available,
  # otherwise its first title, otherwise a fixed placeholder.
  def failed_item_label
    return @item.id unless @item.id.blank?

    title = Array(@item.titles).first
    title.blank? ? 'Unidentified record' : title.to_s
  end

  # Writes a summary line, the error message and the backtrace to the
  # Delayed::Worker logger at ERROR level.
  def log_error(summary, error)
    logger = Delayed::Worker.logger
    logger.add(Logger::ERROR, summary)
    logger.add(Logger::ERROR, error.message)
    logger.add(Logger::ERROR, Array(error.backtrace).join("\n"))
  end

  # Enqueues the job that continues the harvest from +cursor+.
  def schedule_next_batch(cursor, count, failed_items)
    Spotlight::Resources::PerformHarvestsJob.perform_later(resource.data[:type], resource.data[:base_url], resource.data[:set], resource.data[:mapping_file], resource.exhibit, resource.data[:user], resource.data[:job_entry], cursor, count, failed_items)
  end

  # Rewrites the thumbnail and full-image URLs of the current item in place.
  def process_images
    thumbnail = @item_solr['thumbnail_url_ssm']
    if !thumbnail.blank? && !thumbnail.eql?('null')
      thumburl = fetch_ids_uri(thumbnail)
      if !thumburl.blank? && !thumburl.eql?('null')
        @item_solr['thumbnail_url_ssm'] = transform_ids_uri_to_iiif(thumburl)
      end
    end

    full_image = @item_solr['full_image_url_ssm']
    if !full_image.blank? && !full_image.eql?('null')
      fullurl = fetch_ids_uri(full_image)
      if !fullurl.blank?
        # If it is http, make it https
        fullurl = fullurl.sub(/http:\/\//, "https://") if fullurl.include?('http://')

        # if it is IDS, then add ?buttons=y so that mirador works
        if fullurl.include?('https://ids') && !fullurl.include?('?buttons=y')
          fullurl = fullurl + '?buttons=y'
        end
        @item_solr['full_image_url_ssm'] = fullurl
      end
    end
  end

  # Resolves urn-3 uris by returning the Location header of the HTTP
  # response (nil when there is none); any other URI is returned with its
  # query string removed.
  def fetch_ids_uri(uri_str)
    if uri_str =~ /urn-3/
      # URI.encode was removed in Ruby 3.0; the RFC 2396 parser provides the
      # same escaping.
      escaped = URI::RFC2396_Parser.new.escape(uri_str)
      Net::HTTP.get_response(URI.parse(escaped))['location']
    elsif uri_str.include?('?')
      # Exclusive range: also correct when '?' is the very first character.
      uri_str[0...uri_str.index('?')]
    else
      uri_str
    end
  end

  # Returns the uri for the iiif
  def transform_ids_uri_to_iiif(ids_uri)
    # Strip off parameters
    uri = ids_uri.sub(/\?.+/, "")
    # Change /view/ to /iiif/
    uri = uri.sub(%r|/view/|, "/iiif/")
    # Append the IIIF size/rotation/quality suffix if it doesn't exist
    if !uri.include?('default.jpg') && !uri.include?('native.jpg')
      uri = uri + "/full/180,/0/default.jpg"
    elsif uri.include?("full/,150/")
      uri = uri.sub(/full\/,150\//, "full/180,/")
    end
    uri
  end
end
165
+ end
166
+ end