search_solr_tools 3.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +88 -0
  3. data/COPYING +674 -0
  4. data/README.md +203 -0
  5. data/bin/search_solr_tools +87 -0
  6. data/lib/search_solr_tools.rb +8 -0
  7. data/lib/search_solr_tools/config/environments.rb +12 -0
  8. data/lib/search_solr_tools/config/environments.yaml +73 -0
  9. data/lib/search_solr_tools/harvesters/ade_auto_suggest.rb +43 -0
  10. data/lib/search_solr_tools/harvesters/auto_suggest.rb +61 -0
  11. data/lib/search_solr_tools/harvesters/base.rb +183 -0
  12. data/lib/search_solr_tools/harvesters/bcodmo.rb +55 -0
  13. data/lib/search_solr_tools/harvesters/cisl.rb +63 -0
  14. data/lib/search_solr_tools/harvesters/echo.rb +50 -0
  15. data/lib/search_solr_tools/harvesters/eol.rb +53 -0
  16. data/lib/search_solr_tools/harvesters/ices.rb +55 -0
  17. data/lib/search_solr_tools/harvesters/nmi.rb +32 -0
  18. data/lib/search_solr_tools/harvesters/nodc.rb +72 -0
  19. data/lib/search_solr_tools/harvesters/nsidc_auto_suggest.rb +33 -0
  20. data/lib/search_solr_tools/harvesters/nsidc_json.rb +60 -0
  21. data/lib/search_solr_tools/harvesters/oai.rb +59 -0
  22. data/lib/search_solr_tools/harvesters/pdc.rb +38 -0
  23. data/lib/search_solr_tools/harvesters/rda.rb +33 -0
  24. data/lib/search_solr_tools/harvesters/tdar.rb +57 -0
  25. data/lib/search_solr_tools/harvesters/usgs.rb +74 -0
  26. data/lib/search_solr_tools/helpers/bounding_box_util.rb +37 -0
  27. data/lib/search_solr_tools/helpers/csw_iso_query_builder.rb +30 -0
  28. data/lib/search_solr_tools/helpers/facet_configuration.rb +19 -0
  29. data/lib/search_solr_tools/helpers/iso_namespaces.rb +30 -0
  30. data/lib/search_solr_tools/helpers/iso_to_solr.rb +96 -0
  31. data/lib/search_solr_tools/helpers/iso_to_solr_format.rb +198 -0
  32. data/lib/search_solr_tools/helpers/query_builder.rb +13 -0
  33. data/lib/search_solr_tools/helpers/selectors.rb +20 -0
  34. data/lib/search_solr_tools/helpers/solr_format.rb +260 -0
  35. data/lib/search_solr_tools/helpers/tdar_format.rb +70 -0
  36. data/lib/search_solr_tools/helpers/translate_spatial_coverage.rb +77 -0
  37. data/lib/search_solr_tools/helpers/translate_temporal_coverage.rb +40 -0
  38. data/lib/search_solr_tools/helpers/usgs_format.rb +50 -0
  39. data/lib/search_solr_tools/selectors/cisl.rb +112 -0
  40. data/lib/search_solr_tools/selectors/echo_iso.rb +111 -0
  41. data/lib/search_solr_tools/selectors/ices_iso.rb +107 -0
  42. data/lib/search_solr_tools/selectors/nmi.rb +106 -0
  43. data/lib/search_solr_tools/selectors/nodc_iso.rb +107 -0
  44. data/lib/search_solr_tools/selectors/pdc_iso.rb +108 -0
  45. data/lib/search_solr_tools/selectors/rda.rb +106 -0
  46. data/lib/search_solr_tools/selectors/tdar_opensearch.rb +89 -0
  47. data/lib/search_solr_tools/selectors/usgs_iso.rb +105 -0
  48. data/lib/search_solr_tools/translators/bcodmo_json.rb +69 -0
  49. data/lib/search_solr_tools/translators/eol_to_solr.rb +78 -0
  50. data/lib/search_solr_tools/translators/nsidc_json.rb +190 -0
  51. data/lib/search_solr_tools/version.rb +3 -0
  52. data/search_solr_tools.gemspec +45 -0
  53. metadata +345 -0
@@ -0,0 +1,183 @@
1
+ require 'multi_json'
2
+ require 'nokogiri'
3
+ require 'open-uri'
4
+ require 'rest-client'
5
+ require 'rsolr'
6
+ require 'time'
7
+
8
module SearchSolrTools
  module Harvesters
    # Base class for Solr harvesters.
    #
    # Provides the shared plumbing used by every concrete harvester:
    # fetching provider metadata over HTTP, inserting translated documents
    # into Solr (XML or JSON), and deleting stale documents after a harvest.
    class Base
      attr_accessor :environment

      # Refuse to delete stale documents when they exceed this fraction of a
      # data center's total records — guards against a failed/partial harvest
      # silently wiping out most of a collection.
      DELETE_DOCUMENTS_RATIO = 0.1
      XML_CONTENT_TYPE = 'text/xml; charset=utf-8'
      JSON_CONTENT_TYPE = 'application/json; charset=utf-8'

      # env            - key into SolrEnvironments (e.g. 'development')
      # die_on_failure - when true, provider/network errors are re-raised
      #                  instead of being logged and skipped
      def initialize(env = 'development', die_on_failure = false)
        @environment = env
        @die_on_failure = die_on_failure
      end

      # Base URL of the Solr instance for the configured environment.
      def solr_url
        env = SolrEnvironments[@environment]
        "http://#{env[:host]}:#{env[:port]}/#{env[:collection_path]}"
      end

      # Some data providers require encoding (such as URI.encode),
      # while others barf on encoding. The default is to just
      # return url, override this in the subclass if special
      # encoding is needed.
      def encode_data_provider_url(url)
        url
      end

      # Run the given harvest, then delete any documents matching
      # delete_constraints whose last_update predates the harvest start —
      # i.e. documents the harvest did not refresh.
      def harvest_and_delete(harvest_method, delete_constraints, solr_core = SolrEnvironments[@environment][:collection_name])
        start_time = Time.now.utc.iso8601
        harvest_method.call
        delete_old_documents start_time, delete_constraints, solr_core
      end

      # Delete documents last updated before +timestamp+ that match
      # +constraints+; +force+ bypasses the DELETE_DOCUMENTS_RATIO guard.
      def delete_old_documents(timestamp, constraints, solr_core, force = false)
        constraints = sanitize_data_centers_constraints(constraints)
        delete_query = "last_update:[* TO #{timestamp}] AND #{constraints}"
        solr = RSolr.connect url: solr_url + "/#{solr_core}"
        unchanged_count = (solr.get 'select', params: { q: delete_query, rows: 0 })['response']['numFound'].to_i
        if unchanged_count == 0
          puts "All documents were updated after #{timestamp}, nothing to delete"
        else
          puts "Begin removing documents older than #{timestamp}"
          remove_documents(solr, delete_query, constraints, force, unchanged_count)
        end
      end

      # Strip Lucene special characters from a constraints string while
      # preserving the data_centers: query parameter, and compress whitespace.
      # Returns a new string; the caller's argument is NOT modified (the
      # previous implementation destructively mutated it with gsub!).
      def sanitize_data_centers_constraints(query_string)
        # Remove lucene special characters, preserve the query parameter and compress whitespace
        sanitized = query_string.gsub(/[:&|!~\-\(\)\{\}\[\]\^\*\?\+]+/, ' ')
        sanitized = sanitized.gsub(/data_centers /, 'data_centers:')
        sanitized.squeeze(' ').strip
      end

      # Execute the delete, unless the stale documents are too large a share
      # of the data center's records (see DELETE_DOCUMENTS_RATIO) and +force+
      # was not given.
      def remove_documents(solr, delete_query, constraints, force, numfound)
        all_response_count = (solr.get 'select', params: { q: constraints, rows: 0 })['response']['numFound']
        if force || (numfound / all_response_count.to_f < DELETE_DOCUMENTS_RATIO)
          puts "Deleting #{numfound} documents for #{constraints}"
          solr.delete_by_query delete_query
          solr.commit
        else
          puts "Failed to delete records older than current harvest start because they exceeded #{DELETE_DOCUMENTS_RATIO} of the total records for this data center."
          puts "\tTotal records: #{all_response_count}"
          puts "\tNon-updated records: #{numfound}"
        end
      end

      # Update Solr with an array of Nokogiri xml documents, report number of successfully added documents.
      # Raises if any document failed to insert.
      def insert_solr_docs(docs, content_type = XML_CONTENT_TYPE, core = SolrEnvironments[@environment][:collection_name])
        success = 0
        failure = 0
        docs.each do |doc|
          insert_solr_doc(doc, content_type, core) ? success += 1 : failure += 1
        end
        puts "#{success} document#{success == 1 ? '' : 's'} successfully added to Solr."
        puts "#{failure} document#{failure == 1 ? '' : 's'} not added to Solr."
        fail 'Some documents failed to be inserted into Solr' if failure > 0
      end

      # POST a single document to Solr; returns true on HTTP 200, false otherwise.
      def insert_solr_doc(doc, content_type = XML_CONTENT_TYPE, core = SolrEnvironments[@environment][:collection_name])
        url = solr_url + "/#{core}/update?commit=true"
        success = false

        # Some of the docs will cause Solr to crash - CPU goes to 195% with `top` and it
        # doesn't seem to recover.
        return success if content_type == XML_CONTENT_TYPE && !doc_valid?(doc)

        doc_serialized = get_serialized_doc(doc, content_type)

        # Some docs will cause solr to time out during the POST
        begin
          RestClient.post(url, doc_serialized, content_type: content_type) do |response, _request, _result|
            success = response.code == 200
            puts "Error for #{doc_serialized}\n\n response: #{response.body}" unless success
          end
        rescue => e
          puts "Rest exception while POSTing to Solr: #{e}, for doc: #{doc_serialized}"
        end

        success
      end

      # Serialize +doc+ according to the target content type: XML documents
      # via #to_xml, JSON via MultiJson; anything else passes through as-is.
      def get_serialized_doc(doc, content_type)
        if content_type.eql?(XML_CONTENT_TYPE)
          return doc.respond_to?(:to_xml) ? doc.to_xml : doc
        elsif content_type.eql?(JSON_CONTENT_TYPE)
          return MultiJson.dump(doc)
        else
          return doc
        end
      end

      # Get results from some ISO end point specified in the query string.
      # Retries up to 3 times on HTTP/timeout errors; returns nil (or raises,
      # when die_on_failure) after retries are exhausted. Otherwise returns
      # the Nokogiri node set matching +metadata_path+.
      def get_results(request_url, metadata_path, content_type = 'application/xml')
        timeout = 300
        retries_left = 3

        request_url = encode_data_provider_url(request_url)

        begin
          puts "Request: #{request_url}"
          response = open(request_url, read_timeout: timeout, 'Content-Type' => content_type)
        rescue OpenURI::HTTPError, Timeout::Error => e
          retries_left -= 1
          puts "## REQUEST FAILED ## Retrying #{retries_left} more times..."

          retry if retries_left > 0

          raise e if @die_on_failure
          return
        end
        doc = Nokogiri.XML(response)
        doc.xpath(metadata_path, Helpers::IsoNamespaces.namespaces(doc))
      end

      # returns Nokogiri XML document with content
      # '<?xml version="1.0"?><add/>'
      def create_new_solr_add_doc
        doc = Nokogiri::XML::Document.new
        doc.root = Nokogiri::XML::Node.new('add', doc)
        doc
      end

      # returns a Nokogiri XML document with content
      # '<?xml version="1.0"?><add> <child /> </add>'
      def create_new_solr_add_doc_with_child(child)
        doc = create_new_solr_add_doc
        doc.root.add_child(child)
        doc
      end

      # Make sure that Solr is able to accept this doc in a POST
      def doc_valid?(doc)
        spatial_coverages = doc.xpath(".//field[@name='spatial_coverages']").first
        return true if spatial_coverages.nil?

        spatial_coverages = spatial_coverages.text.split(' ')

        # We've only seen the failure with 4 spatial coverage values
        return true if spatial_coverages.size < 4

        valid_solr_spatial_coverage?(spatial_coverages)
      end

      # spatial_coverages is an array with length 4:
      # [North, East, South, West]
      # A degenerate "polar point" (north == south at +/-90) is only valid
      # when east == west as well.
      def valid_solr_spatial_coverage?(spatial_coverages)
        north, east, south, west = spatial_coverages

        polar_point = (north == south) && (north.to_f.abs == 90)

        (east == west) || !polar_point
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
+ require 'json'
2
+ require 'rest-client'
3
+
4
module SearchSolrTools
  module Harvesters
    # Harvests data from BcoDmo endpoint, translates and adds it to solr
    class BcoDmo < Base
      def initialize(env = 'development', die_on_failure = false)
        super env, die_on_failure
        @translator = Translators::BcodmoJsonToSolr.new
        @wkt_parser = RGeo::WKRep::WKTParser.new(nil, {}) # (factory_generator_=nil,
      end

      def harvest_and_delete
        super(method(:harvest_bcodmo_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:BCODMO][:long_name]}\"")
      end

      # Translate every BCO-DMO record and push the results into Solr;
      # raises when any record could not be harvested.
      def harvest_bcodmo_into_solr
        outcome = translate_bcodmo
        insert_solr_docs outcome[:add_docs], Base::JSON_CONTENT_TYPE
        fail 'Failed to harvest some records from the provider' if outcome[:failure_ids].length > 0
      end

      # Fetch all records from the BCO-DMO endpoint and translate each one,
      # collecting Solr "add" documents and the ids of records that failed.
      def translate_bcodmo
        add_docs = []
        failed_ids = []
        request_json(SolrEnvironments[@environment][:bcodmo_url]).each do |record|
          geometry = request_json(record['geometryUrl'])
          parsed = parse_record(record, geometry)
          add_docs.concat(parsed[:documents])
          failed_ids.concat(parsed[:failure_ids])
        end
        { add_docs: add_docs, failure_ids: failed_ids }
      end

      # GET a URL and parse the response body as JSON.
      def request_json(url)
        JSON.parse(RestClient.get(url))
      end

      # Translate every dataset belonging to +record+; on any error the
      # record's id is reported in :failure_ids instead of raising.
      def parse_record(record, geometry)
        translated = []
        failed_ids = []
        begin
          JSON.parse(RestClient.get(record['datasets'])).each do |dataset|
            translated << { 'add' => { 'doc' => @translator.translate(dataset, record, geometry) } }
          end
        rescue => e
          puts "Failed to add record #{record['id']} with error #{e} (#{e.message}) : #{e.backtrace.join("\n")}"
          failed_ids << record['id']
        end
        { documents: translated, failure_ids: failed_ids }
      end
    end
  end
end
@@ -0,0 +1,63 @@
1
+
2
module SearchSolrTools
  module Harvesters
    # Harvests data from CISL and inserts it into Solr after it has been translated
    class Cisl < Oai
      def initialize(env = 'development', die_on_failure = false)
        super
        @data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:CISL][:long_name]
        @translator = Helpers::IsoToSolr.new :cisl

        # Used in query string params, resumptionToken
        @dataset = '0bdd2d39-3493-4fa2-98f9-6766596bdc50'
      end

      def metadata_url
        SolrEnvironments[@environment][:cisl_url]
      end

      # Fetch one page of OAI records, capturing the resumption token so the
      # harvest loop can request the next page.
      def results
        list_records_oai_response = get_results(request_string, '//oai:ListRecords', '')

        raw_token = list_records_oai_response.xpath('.//oai:resumptionToken', Helpers::IsoNamespaces.namespaces).first.text
        @resumption_token = format_resumption_token(raw_token)

        list_records_oai_response.xpath('.//oai:record', Helpers::IsoNamespaces.namespaces)
      end

      private

      # Query-string parameters for the ListRecords request; nil-valued
      # entries are omitted.
      def request_params
        params = {
          verb: 'ListRecords',
          metadataPrefix: 'dif',
          set: @dataset,
          resumptionToken: @resumption_token
        }
        params.reject { |_key, value| value.nil? }
      end

      # The ruby response is lacking quotes, which the token requires in order to work...
      # Also, the response back seems to be inconsistent - sometimes it adds &quot; instead of '"',
      # which makes the token fail to work.
      # To get around this I'd prefer to make assumptions about the token and let it break if
      # they change the formatting. For now, all fields other than offset should be able to be
      # assumed to remain constant.
      # If the input is empty, then we are done - return an empty string, which is checked for
      # in the harvest loop.
      def format_resumption_token(resumption_token)
        return '' if resumption_token.empty?

        offset = resumption_token[/offset:(\d+)/, 1]

        {
          from: nil,
          until: nil,
          set: @dataset,
          metadataPrefix: 'dif',
          offset: offset
        }.to_json
      end
    end
  end
end
@@ -0,0 +1,50 @@
1
module SearchSolrTools
  module Harvesters
    # Harvests data from ECHO and inserts it into Solr after it has been translated
    class Echo < Base
      def initialize(env = 'development', die_on_failure = false)
        super env, die_on_failure
        @page_size = 1000
        @translator = Helpers::IsoToSolr.new :echo
      end

      def harvest_and_delete
        puts "Running harvest of ECHO catalog from #{echo_url}"
        super(method(:harvest_echo_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:ECHO][:long_name]}\"")
      end

      # get translated entries from ECHO and add them to Solr
      # this is the main entry point for the class
      def harvest_echo_into_solr
        page = 1
        loop do
          entries = get_results_from_echo(page)
          break unless entries && entries.length > 0
          begin
            insert_solr_docs get_docs_with_translated_entries_from_echo(entries)
          rescue => e
            puts "ERROR: #{e}\n\n"
            raise e if @die_on_failure
          end
          page += 1
        end
      end

      def echo_url
        SolrEnvironments[@environment][:echo_url]
      end

      # Fetch one page of ECHO results as XML nodes.
      def get_results_from_echo(page_num)
        get_results build_request(@page_size, page_num), './/results/result', 'application/echo10+xml'
      end

      # Wrap each translated entry in a Solr <add> document.
      def get_docs_with_translated_entries_from_echo(entries)
        entries.map { |entry| create_new_solr_add_doc_with_child(@translator.translate(entry).root) }
      end

      def build_request(max_records = '25', page_num = '1')
        "#{echo_url}?page_size=#{max_records}&page_num=#{page_num}"
      end
    end
  end
end
@@ -0,0 +1,53 @@
1
+ require_relative 'base'
2
+ require 'json'
3
+ require 'rgeo/geo_json'
4
+
5
module SearchSolrTools
  module Harvesters
    # Harvests EOL THREDDS catalogs, translating each referenced dataset into
    # a Solr JSON "add" document.
    class Eol < Base
      def initialize(env = 'development', die_on_failure = false)
        super env, die_on_failure
        @translator = SearchSolrTools::Translators::EolToSolr.new
      end

      def harvest_and_delete
        puts 'Running harvest of EOL catalog using the following configured EOL URLs:'
        SearchSolrTools::SolrEnvironments[:common][:eol].each { |x| puts x }
        super(method(:harvest_eol_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:EOL][:long_name]}\"")
      end

      # Translate every EOL dataset and insert the results into Solr.
      def harvest_eol_into_solr
        solr_add_queries = eol_dataset_urls.map do |dataset|
          begin
            doc = open_xml_document(dataset)
            if doc.xpath('//xmlns:metadata').size > 1
              # THREDDS allows for a dataset of datasets, EOL should not utilize this
              fail "Complex dataset encountered at #{doc.xpath('//xmlns:catalog').to_html}"
            end
            metadata_doc = open_xml_document(doc.xpath('//xmlns:metadata')[0]['xlink:href'])
            { 'add' => { 'doc' => @translator.translate(doc, metadata_doc) } }
          rescue => e
            puts "ERROR: #{e}"
            puts "Failed to translate this record: #{doc} -> #{metadata_doc}"
            raise e if @die_on_failure
            next
          end
        end
        # BUG FIX: `next` in the rescue branch produces nil map entries for
        # failed records; drop them so a bare JSON `null` document is never
        # POSTed to Solr.
        insert_solr_docs solr_add_queries.compact, Base::JSON_CONTENT_TYPE
      end

      # URLs of every dataset referenced by the configured EOL catalogs.
      def eol_dataset_urls
        SearchSolrTools::SolrEnvironments[:common][:eol].flat_map do |endpoint|
          doc = open_xml_document(endpoint)
          doc.xpath('//xmlns:catalogRef').map { |node| node['xlink:href'] }
        end
      end

      # Fetch +url+ and parse it as XML, raising on malformed documents.
      def open_xml_document(url)
        Nokogiri::XML(open(url)) do |config|
          config.strict
        end
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
module SearchSolrTools
  module Harvesters
    # Harvests data from ICES and inserts it into Solr after it has been translated
    class Ices < Base
      def initialize(env = 'development', die_on_failure = false)
        super env, die_on_failure
        @page_size = 100
        @translator = Helpers::IsoToSolr.new :ices
      end

      def harvest_and_delete
        puts "Running harvest of ICES catalog from #{ices_url}"
        super(method(:harvest_ices_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:ICES][:long_name]}\"")
      end

      # get translated entries from ICES and add them to Solr
      # this is the main entry point for the class
      def harvest_ices_into_solr
        offset = 1
        loop do
          entries = get_results_from_ices(offset)
          break unless entries && entries.length > 0
          begin
            insert_solr_docs get_docs_with_translated_entries_from_ices(entries)
          rescue => e
            puts "ERROR: #{e}"
            raise e if @die_on_failure
          end
          offset += @page_size
        end
      end

      def ices_url
        SolrEnvironments[@environment][:ices_url]
      end

      # Fetch one page of ICES CSW results as gmd:MD_Metadata nodes.
      def get_results_from_ices(start_index)
        get_results build_csw_request('results', @page_size, start_index), '//gmd:MD_Metadata'
      end

      # Wrap each translated entry in a Solr <add> document.
      def get_docs_with_translated_entries_from_ices(entries)
        entries.map { |entry| create_new_solr_add_doc_with_child(@translator.translate(entry).root) }
      end

      # Build the CSW GetRecords query string for one page of results.
      def build_csw_request(result_type = 'results', max_records = '25', start_position = '1')
        Helpers::CswIsoQueryBuilder.get_query_string(
          ices_url,
          'resultType' => result_type,
          'maxRecords' => max_records,
          'startPosition' => start_position,
          'constraintLanguage' => 'CQL_TEXT'
        )
      end
    end
  end
end