search_solr_tools 3.1.2

This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in the public registry.
Files changed (53)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +88 -0
  3. data/COPYING +674 -0
  4. data/README.md +203 -0
  5. data/bin/search_solr_tools +87 -0
  6. data/lib/search_solr_tools.rb +8 -0
  7. data/lib/search_solr_tools/config/environments.rb +12 -0
  8. data/lib/search_solr_tools/config/environments.yaml +73 -0
  9. data/lib/search_solr_tools/harvesters/ade_auto_suggest.rb +43 -0
  10. data/lib/search_solr_tools/harvesters/auto_suggest.rb +61 -0
  11. data/lib/search_solr_tools/harvesters/base.rb +183 -0
  12. data/lib/search_solr_tools/harvesters/bcodmo.rb +55 -0
  13. data/lib/search_solr_tools/harvesters/cisl.rb +63 -0
  14. data/lib/search_solr_tools/harvesters/echo.rb +50 -0
  15. data/lib/search_solr_tools/harvesters/eol.rb +53 -0
  16. data/lib/search_solr_tools/harvesters/ices.rb +55 -0
  17. data/lib/search_solr_tools/harvesters/nmi.rb +32 -0
  18. data/lib/search_solr_tools/harvesters/nodc.rb +72 -0
  19. data/lib/search_solr_tools/harvesters/nsidc_auto_suggest.rb +33 -0
  20. data/lib/search_solr_tools/harvesters/nsidc_json.rb +60 -0
  21. data/lib/search_solr_tools/harvesters/oai.rb +59 -0
  22. data/lib/search_solr_tools/harvesters/pdc.rb +38 -0
  23. data/lib/search_solr_tools/harvesters/rda.rb +33 -0
  24. data/lib/search_solr_tools/harvesters/tdar.rb +57 -0
  25. data/lib/search_solr_tools/harvesters/usgs.rb +74 -0
  26. data/lib/search_solr_tools/helpers/bounding_box_util.rb +37 -0
  27. data/lib/search_solr_tools/helpers/csw_iso_query_builder.rb +30 -0
  28. data/lib/search_solr_tools/helpers/facet_configuration.rb +19 -0
  29. data/lib/search_solr_tools/helpers/iso_namespaces.rb +30 -0
  30. data/lib/search_solr_tools/helpers/iso_to_solr.rb +96 -0
  31. data/lib/search_solr_tools/helpers/iso_to_solr_format.rb +198 -0
  32. data/lib/search_solr_tools/helpers/query_builder.rb +13 -0
  33. data/lib/search_solr_tools/helpers/selectors.rb +20 -0
  34. data/lib/search_solr_tools/helpers/solr_format.rb +260 -0
  35. data/lib/search_solr_tools/helpers/tdar_format.rb +70 -0
  36. data/lib/search_solr_tools/helpers/translate_spatial_coverage.rb +77 -0
  37. data/lib/search_solr_tools/helpers/translate_temporal_coverage.rb +40 -0
  38. data/lib/search_solr_tools/helpers/usgs_format.rb +50 -0
  39. data/lib/search_solr_tools/selectors/cisl.rb +112 -0
  40. data/lib/search_solr_tools/selectors/echo_iso.rb +111 -0
  41. data/lib/search_solr_tools/selectors/ices_iso.rb +107 -0
  42. data/lib/search_solr_tools/selectors/nmi.rb +106 -0
  43. data/lib/search_solr_tools/selectors/nodc_iso.rb +107 -0
  44. data/lib/search_solr_tools/selectors/pdc_iso.rb +108 -0
  45. data/lib/search_solr_tools/selectors/rda.rb +106 -0
  46. data/lib/search_solr_tools/selectors/tdar_opensearch.rb +89 -0
  47. data/lib/search_solr_tools/selectors/usgs_iso.rb +105 -0
  48. data/lib/search_solr_tools/translators/bcodmo_json.rb +69 -0
  49. data/lib/search_solr_tools/translators/eol_to_solr.rb +78 -0
  50. data/lib/search_solr_tools/translators/nsidc_json.rb +190 -0
  51. data/lib/search_solr_tools/version.rb +3 -0
  52. data/search_solr_tools.gemspec +45 -0
  53. metadata +345 -0
data/lib/search_solr_tools/harvesters/base.rb
@@ -0,0 +1,183 @@
+require 'multi_json'
+require 'nokogiri'
+require 'open-uri'
+require 'rest-client'
+require 'rsolr'
+require 'time'
+
+module SearchSolrTools
+  module Harvesters
+    # base class for solr harvesters
+    class Base
+      attr_accessor :environment
+
+      DELETE_DOCUMENTS_RATIO = 0.1
+      XML_CONTENT_TYPE = 'text/xml; charset=utf-8'
+      JSON_CONTENT_TYPE = 'application/json; charset=utf-8'
+
+      def initialize(env = 'development', die_on_failure = false)
+        @environment = env
+        @die_on_failure = die_on_failure
+      end
+
+      def solr_url
+        env = SolrEnvironments[@environment]
+        "http://#{env[:host]}:#{env[:port]}/#{env[:collection_path]}"
+      end
+
+      # Some data providers require encoding (such as URI.encode),
+      # while others barf on encoding. The default is to just
+      # return url, override this in the subclass if special
+      # encoding is needed.
+      def encode_data_provider_url(url)
+        url
+      end
+
+      def harvest_and_delete(harvest_method, delete_constraints, solr_core = SolrEnvironments[@environment][:collection_name])
+        start_time = Time.now.utc.iso8601
+        harvest_method.call
+        delete_old_documents start_time, delete_constraints, solr_core
+      end
+
+      def delete_old_documents(timestamp, constraints, solr_core, force = false)
+        constraints = sanitize_data_centers_constraints(constraints)
+        delete_query = "last_update:[* TO #{timestamp}] AND #{constraints}"
+        solr = RSolr.connect url: solr_url + "/#{solr_core}"
+        unchanged_count = (solr.get 'select', params: { q: delete_query, rows: 0 })['response']['numFound'].to_i
+        if unchanged_count == 0
+          puts "All documents were updated after #{timestamp}, nothing to delete"
+        else
+          puts "Begin removing documents older than #{timestamp}"
+          remove_documents(solr, delete_query, constraints, force, unchanged_count)
+        end
+      end
+
+      def sanitize_data_centers_constraints(query_string)
+        # Remove lucene special characters, preserve the query parameter and compress whitespace
+        query_string.gsub!(/[:&|!~\-\(\)\{\}\[\]\^\*\?\+]+/, ' ')
+        query_string.gsub!(/data_centers /, 'data_centers:')
+        query_string.squeeze(' ').strip
+      end
+
+      def remove_documents(solr, delete_query, constraints, force, numfound)
+        all_response_count = (solr.get 'select', params: { q: constraints, rows: 0 })['response']['numFound']
+        if force || (numfound / all_response_count.to_f < DELETE_DOCUMENTS_RATIO)
+          puts "Deleting #{numfound} documents for #{constraints}"
+          solr.delete_by_query delete_query
+          solr.commit
+        else
+          puts "Failed to delete records older than current harvest start because they exceeded #{DELETE_DOCUMENTS_RATIO} of the total records for this data center."
+          puts "\tTotal records: #{all_response_count}"
+          puts "\tNon-updated records: #{numfound}"
+        end
+      end
+
+      # Update Solr with an array of Nokogiri xml documents, report number of successfully added documents
+      def insert_solr_docs(docs, content_type = XML_CONTENT_TYPE, core = SolrEnvironments[@environment][:collection_name])
+        success = 0
+        failure = 0
+        docs.each do |doc|
+          insert_solr_doc(doc, content_type, core) ? success += 1 : failure += 1
+        end
+        puts "#{success} document#{success == 1 ? '' : 's'} successfully added to Solr."
+        puts "#{failure} document#{failure == 1 ? '' : 's'} not added to Solr."
+        fail 'Some documents failed to be inserted into Solr' if failure > 0
+      end
+
+      def insert_solr_doc(doc, content_type = XML_CONTENT_TYPE, core = SolrEnvironments[@environment][:collection_name])
+        url = solr_url + "/#{core}/update?commit=true"
+        success = false
+
+        # Some of the docs will cause Solr to crash - CPU goes to 195% with `top` and it
+        # doesn't seem to recover.
+        return success unless doc_valid?(doc) if content_type == XML_CONTENT_TYPE
+
+        doc_serialized = get_serialized_doc(doc, content_type)
+
+        # Some docs will cause solr to time out during the POST
+        begin
+          RestClient.post(url, doc_serialized, content_type: content_type) do |response, _request, _result|
+            success = response.code == 200
+            puts "Error for #{doc_serialized}\n\n response: #{response.body}" unless success
+          end
+        rescue => e
+          puts "Rest exception while POSTing to Solr: #{e}, for doc: #{doc_serialized}"
+        end
+
+        success
+      end
+
+      def get_serialized_doc(doc, content_type)
+        if content_type.eql?(XML_CONTENT_TYPE)
+          return doc.respond_to?(:to_xml) ? doc.to_xml : doc
+        elsif content_type.eql?(JSON_CONTENT_TYPE)
+          return MultiJson.dump(doc)
+        else
+          return doc
+        end
+      end
+
+      # Get results from some ISO end point specified in the query string
+      def get_results(request_url, metadata_path, content_type = 'application/xml')
+        timeout = 300
+        retries_left = 3
+
+        request_url = encode_data_provider_url(request_url)
+
+        begin
+          puts "Request: #{request_url}"
+          response = open(request_url, read_timeout: timeout, 'Content-Type' => content_type)
+        rescue OpenURI::HTTPError, Timeout::Error => e
+          retries_left -= 1
+          puts "## REQUEST FAILED ## Retrying #{retries_left} more times..."
+
+          retry if retries_left > 0
+
+          raise e if @die_on_failure
+          return
+        end
+        doc = Nokogiri.XML(response)
+        doc.xpath(metadata_path, Helpers::IsoNamespaces.namespaces(doc))
+      end
+
+      # returns Nokogiri XML document with content
+      # '<?xml version="1.0"?><add/>'
+      def create_new_solr_add_doc
+        doc = Nokogiri::XML::Document.new
+        doc.root = Nokogiri::XML::Node.new('add', doc)
+        doc
+      end
+
+      # returns a Nokogiri XML document with content
+      # '<?xml version="1.0"?><add> <child /> </add>'
+      def create_new_solr_add_doc_with_child(child)
+        doc = create_new_solr_add_doc
+        doc.root.add_child(child)
+        doc
+      end
+
+      # Make sure that Solr is able to accept this doc in a POST
+      def doc_valid?(doc)
+        spatial_coverages = doc.xpath(".//field[@name='spatial_coverages']").first
+        return true if spatial_coverages.nil?
+
+        spatial_coverages = spatial_coverages.text.split(' ')
+
+        # We've only seen the failure with 4 spatial coverage values
+        return true if spatial_coverages.size < 4
+
+        valid_solr_spatial_coverage?(spatial_coverages)
+      end
+
+      # spatial_coverages is an array with length 4:
+      # [North, East, South, West]
+      def valid_solr_spatial_coverage?(spatial_coverages)
+        north, east, south, west = spatial_coverages
+
+        polar_point = (north == south) && (north.to_f.abs == 90)
+
+        (east == west) || !polar_point
+      end
+    end
+  end
+end
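
Editor's note: concrete harvesters subclass Base, hand their own harvest step to harvest_and_delete, and scope the delete query to their data center. A minimal hypothetical sketch of that contract (ExampleHarvester, its method name, and its constraint string are invented for illustration and are not part of the gem):

module SearchSolrTools
  module Harvesters
    # Hypothetical subclass showing the Base API in use
    class ExampleHarvester < Base
      def harvest_and_delete
        # Deletes are restricted to this (invented) data center name
        super(method(:harvest_example_into_solr), 'data_centers:"Example Data Center"')
      end

      def harvest_example_into_solr
        # Push a single empty <add/> document through the shared POST helper
        insert_solr_docs [create_new_solr_add_doc]
      end
    end
  end
end

Note the guard in remove_documents: unless force is set, stale documents are only purged when they account for less than DELETE_DOCUMENTS_RATIO (10%) of the data center's records, so a harvest that quietly returned too few documents cannot wipe out an index.
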
data/lib/search_solr_tools/harvesters/bcodmo.rb
@@ -0,0 +1,55 @@
+require 'json'
+require 'rest-client'
+
+module SearchSolrTools
+  module Harvesters
+    # Harvests data from BcoDmo endpoint, translates and adds it to solr
+    class BcoDmo < Base
+      def initialize(env = 'development', die_on_failure = false)
+        super env, die_on_failure
+        @translator = Translators::BcodmoJsonToSolr.new
+        @wkt_parser = RGeo::WKRep::WKTParser.new(nil, {}) # (factory_generator_=nil,
+      end
+
+      def harvest_and_delete
+        super(method(:harvest_bcodmo_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:BCODMO][:long_name]}\"")
+      end
+
+      def harvest_bcodmo_into_solr
+        result = translate_bcodmo
+        insert_solr_docs result[:add_docs], Base::JSON_CONTENT_TYPE
+        fail 'Failed to harvest some records from the provider' if result[:failure_ids].length > 0
+      end
+
+      def translate_bcodmo
+        documents = []
+        failure_ids = []
+        request_json(SolrEnvironments[@environment][:bcodmo_url]).each do |record|
+          geometry = request_json(record['geometryUrl'])
+          results = parse_record(record, geometry)
+          results[:documents].each { |d| documents << d }
+          results[:failure_ids].each { |id| failure_ids << id }
+        end
+        { add_docs: documents, failure_ids: failure_ids }
+      end
+
+      def request_json(url)
+        JSON.parse(RestClient.get(url))
+      end
+
+      def parse_record(record, geometry)
+        documents = []
+        failure_ids = []
+        begin
+          JSON.parse(RestClient.get(record['datasets'])).each do |dataset|
+            documents << { 'add' => { 'doc' => @translator.translate(dataset, record, geometry) } }
+          end
+        rescue => e
+          puts "Failed to add record #{record['id']} with error #{e} (#{e.message}) : #{e.backtrace.join("\n")}"
+          failure_ids << record['id']
+        end
+        { documents: documents, failure_ids: failure_ids }
+      end
+    end
+  end
+end
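
Editor's note: driven from Ruby rather than the gem's search_solr_tools binary, this harvester would be exercised roughly as follows. The environment name is an assumption; it must match an entry in environments.yaml that points at a reachable Solr and the BCO-DMO endpoint:

require 'search_solr_tools'

# Second argument (die_on_failure = true) makes request failures raise
harvester = SearchSolrTools::Harvesters::BcoDmo.new('development', true)
harvester.harvest_and_delete
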
data/lib/search_solr_tools/harvesters/cisl.rb
@@ -0,0 +1,63 @@
+
+module SearchSolrTools
+  module Harvesters
+    # Harvests data from CISL and inserts it into Solr after it has been translated
+    class Cisl < Oai
+      def initialize(env = 'development', die_on_failure = false)
+        super
+        @data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:CISL][:long_name]
+        @translator = Helpers::IsoToSolr.new :cisl
+
+        # Used in query string params, resumptionToken
+        @dataset = '0bdd2d39-3493-4fa2-98f9-6766596bdc50'
+      end
+
+      def metadata_url
+        SolrEnvironments[@environment][:cisl_url]
+      end
+
+      def results
+        list_records_oai_response = get_results(request_string, '//oai:ListRecords', '')
+
+        @resumption_token = list_records_oai_response.xpath('.//oai:resumptionToken', Helpers::IsoNamespaces.namespaces)
+        @resumption_token = format_resumption_token(@resumption_token.first.text)
+
+        list_records_oai_response.xpath('.//oai:record', Helpers::IsoNamespaces.namespaces)
+      end
+
+      private
+
+      def request_params
+        {
+          verb: 'ListRecords',
+          metadataPrefix: 'dif',
+          set: @dataset,
+          resumptionToken: @resumption_token
+        }.delete_if { |_k, v| v.nil? }
+      end
+
+      # The ruby response is lacking quotes, which the token requires in order to work...
+      # Also, the response back seems to be inconsistent - sometimes it adds &quot; instead of '"',
+      # which makes the token fail to work.
+      # To get around this I'd prefer to make assumptions about the token and let it break if
+      # they change the formatting. For now, all fields other than offset should be able to be
+      # assumed to remain constant.
+      # If the input is empty, then we are done - return an empty string, which is checked for
+      # in the harvest loop.
+      def format_resumption_token(resumption_token)
+        return '' if resumption_token.empty?
+
+        resumption_token =~ /offset:(\d+)/
+        offset = Regexp.last_match(1)
+
+        {
+          from: nil,
+          until: nil,
+          set: @dataset,
+          metadataPrefix: 'dif',
+          offset: offset
+        }.to_json
+      end
+    end
+  end
+end
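
Editor's note: to make the token surgery above concrete, here is a sketch of what format_resumption_token does to a hypothetical OAI-PMH resumption token (the offset value and the exact field layout of the incoming token are invented for illustration):

require 'json'

token = 'from:,until:,set:0bdd2d39-3493-4fa2-98f9-6766596bdc50,metadataPrefix:dif,offset:200'
token =~ /offset:(\d+)/
offset = Regexp.last_match(1) # => "200"

# The method re-emits only the offset, assuming the other fields stay constant:
{ from: nil, until: nil, set: '0bdd2d39-3493-4fa2-98f9-6766596bdc50',
  metadataPrefix: 'dif', offset: offset }.to_json
# => '{"from":null,"until":null,"set":"0bdd2d39-3493-4fa2-98f9-6766596bdc50","metadataPrefix":"dif","offset":"200"}'
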
data/lib/search_solr_tools/harvesters/echo.rb
@@ -0,0 +1,50 @@
+module SearchSolrTools
+  module Harvesters
+    # Harvests data from ECHO and inserts it into Solr after it has been translated
+    class Echo < Base
+      def initialize(env = 'development', die_on_failure = false)
+        super env, die_on_failure
+        @page_size = 1000
+        @translator = Helpers::IsoToSolr.new :echo
+      end
+
+      def harvest_and_delete
+        puts "Running harvest of ECHO catalog from #{echo_url}"
+        super(method(:harvest_echo_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:ECHO][:long_name]}\"")
+      end
+
+      # get translated entries from ECHO and add them to Solr
+      # this is the main entry point for the class
+      def harvest_echo_into_solr
+        page_num = 1
+        while (entries = get_results_from_echo(page_num)) && (entries.length > 0)
+          begin
+            insert_solr_docs get_docs_with_translated_entries_from_echo(entries)
+          rescue => e
+            puts "ERROR: #{e}\n\n"
+            raise e if @die_on_failure
+          end
+          page_num += 1
+        end
+      end
+
+      def echo_url
+        SolrEnvironments[@environment][:echo_url]
+      end
+
+      def get_results_from_echo(page_num)
+        get_results build_request(@page_size, page_num), './/results/result', 'application/echo10+xml'
+      end
+
+      def get_docs_with_translated_entries_from_echo(entries)
+        docs = []
+        entries.each { |r| docs.push(create_new_solr_add_doc_with_child(@translator.translate(r).root)) }
+        docs
+      end
+
+      def build_request(max_records = '25', page_num = '1')
+        echo_url + '?page_size=' + max_records.to_s + '&page_num=' + page_num.to_s
+      end
+    end
+  end
+end
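
Editor's note: the paging here is simple string concatenation, and harvest_echo_into_solr keeps requesting pages until one comes back empty. Assuming a 'development' entry in environments.yaml, the first request URL would be built like this (the endpoint itself depends on the configured :echo_url):

harvester = SearchSolrTools::Harvesters::Echo.new('development')
harvester.build_request(1000, 1)
# => "<value of :echo_url>?page_size=1000&page_num=1"
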
data/lib/search_solr_tools/harvesters/eol.rb
@@ -0,0 +1,53 @@
+require_relative 'base'
+require 'json'
+require 'rgeo/geo_json'
+
+module SearchSolrTools
+  module Harvesters
+    class Eol < Base
+      def initialize(env = 'development', die_on_failure = false)
+        super env, die_on_failure
+        @translator = SearchSolrTools::Translators::EolToSolr.new
+      end
+
+      def harvest_and_delete
+        puts 'Running harvest of EOL catalog using the following configured EOL URLs:'
+        SearchSolrTools::SolrEnvironments[:common][:eol].each { |x| puts x }
+        super(method(:harvest_eol_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:EOL][:long_name]}\"")
+      end
+
+      def harvest_eol_into_solr
+        solr_add_queries = eol_dataset_urls.map do |dataset|
+          begin
+            doc = open_xml_document(dataset)
+            if doc.xpath('//xmlns:metadata').size > 1
+              # THREDDS allows for a dataset of datasests, EOL should not utilize this
+              fail "Complex dataset encountered at #{doc.xpath('//xmlns:catalog').to_html}"
+            end
+            metadata_doc = open_xml_document(doc.xpath('//xmlns:metadata')[0]['xlink:href'])
+            { 'add' => { 'doc' => @translator.translate(doc, metadata_doc) } }
+          rescue => e
+            puts "ERROR: #{e}"
+            puts "Failed to translate this record: #{doc} -> #{metadata_doc}"
+            raise e if @die_on_failure
+            next
+          end
+        end
+        insert_solr_docs solr_add_queries, Base::JSON_CONTENT_TYPE
+      end
+
+      def eol_dataset_urls
+        SearchSolrTools::SolrEnvironments[:common][:eol].flat_map do |endpoint|
+          doc = open_xml_document(endpoint)
+          doc.xpath('//xmlns:catalogRef').map { |node| node['xlink:href'] }
+        end
+      end
+
+      def open_xml_document(url)
+        Nokogiri::XML(open(url)) do |config|
+          config.strict
+        end
+      end
+    end
+  end
+end
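
Editor's note: eol_dataset_urls collects every catalogRef link from a THREDDS catalog. A self-contained sketch of that XPath against an invented catalog document (the URLs are made up; the namespace URI is the standard THREDDS InvCatalog 1.0 one):

require 'nokogiri'

catalog = <<-XML
<catalog xmlns="http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0"
         xmlns:xlink="http://www.w3.org/1999/xlink">
  <catalogRef xlink:href="http://data.example.edu/catalog/dataset_a.xml"/>
  <catalogRef xlink:href="http://data.example.edu/catalog/dataset_b.xml"/>
</catalog>
XML

doc = Nokogiri::XML(catalog)
# 'xmlns:' is Nokogiri's prefix for the document's default namespace
doc.xpath('//xmlns:catalogRef').map { |node| node['xlink:href'] }
# => ["http://data.example.edu/catalog/dataset_a.xml",
#     "http://data.example.edu/catalog/dataset_b.xml"]
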
data/lib/search_solr_tools/harvesters/ices.rb
@@ -0,0 +1,55 @@
+module SearchSolrTools
+  module Harvesters
+    # Harvests data from ICES and inserts it into Solr after it has been translated
+    class Ices < Base
+      def initialize(env = 'development', die_on_failure = false)
+        super env, die_on_failure
+        @page_size = 100
+        @translator = Helpers::IsoToSolr.new :ices
+      end
+
+      def harvest_and_delete
+        puts "Running harvest of ICES catalog from #{ices_url}"
+        super(method(:harvest_ices_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:ICES][:long_name]}\"")
+      end
+
+      # get translated entries from ICES and add them to Solr
+      # this is the main entry point for the class
+      def harvest_ices_into_solr
+        start_index = 1
+        while (entries = get_results_from_ices(start_index)) && (entries.length > 0)
+          begin
+            insert_solr_docs get_docs_with_translated_entries_from_ices(entries)
+          rescue => e
+            puts "ERROR: #{e}"
+            raise e if @die_on_failure
+          end
+          start_index += @page_size
+        end
+      end
+
+      def ices_url
+        SolrEnvironments[@environment][:ices_url]
+      end
+
+      def get_results_from_ices(start_index)
+        get_results build_csw_request('results', @page_size, start_index), '//gmd:MD_Metadata'
+      end
+
+      def get_docs_with_translated_entries_from_ices(entries)
+        docs = []
+        entries.each { |r| docs.push(create_new_solr_add_doc_with_child(@translator.translate(r).root)) }
+        docs
+      end
+
+      def build_csw_request(resultType = 'results', maxRecords = '25', startPosition = '1')
+        Helpers::CswIsoQueryBuilder.get_query_string(ices_url,
+                                                     'resultType' => resultType,
+                                                     'maxRecords' => maxRecords,
+                                                     'startPosition' => startPosition,
+                                                     'constraintLanguage' => 'CQL_TEXT'
+                                                    )
+      end
+    end
+  end
+end
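
Editor's note: unlike ECHO's page-number cursor, ICES pages with a CSW startPosition that advances by the page size. A hedged sketch of driving it directly (the environment name is assumed, as above):

# startPosition per iteration with @page_size = 100: 1, 101, 201, ...
# harvest_ices_into_solr stops when a page yields no gmd:MD_Metadata nodes.
harvester = SearchSolrTools::Harvesters::Ices.new('development')
harvester.harvest_and_delete
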