search_solr_tools 3.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +88 -0
  3. data/COPYING +674 -0
  4. data/README.md +203 -0
  5. data/bin/search_solr_tools +87 -0
  6. data/lib/search_solr_tools.rb +8 -0
  7. data/lib/search_solr_tools/config/environments.rb +12 -0
  8. data/lib/search_solr_tools/config/environments.yaml +73 -0
  9. data/lib/search_solr_tools/harvesters/ade_auto_suggest.rb +43 -0
  10. data/lib/search_solr_tools/harvesters/auto_suggest.rb +61 -0
  11. data/lib/search_solr_tools/harvesters/base.rb +183 -0
  12. data/lib/search_solr_tools/harvesters/bcodmo.rb +55 -0
  13. data/lib/search_solr_tools/harvesters/cisl.rb +63 -0
  14. data/lib/search_solr_tools/harvesters/echo.rb +50 -0
  15. data/lib/search_solr_tools/harvesters/eol.rb +53 -0
  16. data/lib/search_solr_tools/harvesters/ices.rb +55 -0
  17. data/lib/search_solr_tools/harvesters/nmi.rb +32 -0
  18. data/lib/search_solr_tools/harvesters/nodc.rb +72 -0
  19. data/lib/search_solr_tools/harvesters/nsidc_auto_suggest.rb +33 -0
  20. data/lib/search_solr_tools/harvesters/nsidc_json.rb +60 -0
  21. data/lib/search_solr_tools/harvesters/oai.rb +59 -0
  22. data/lib/search_solr_tools/harvesters/pdc.rb +38 -0
  23. data/lib/search_solr_tools/harvesters/rda.rb +33 -0
  24. data/lib/search_solr_tools/harvesters/tdar.rb +57 -0
  25. data/lib/search_solr_tools/harvesters/usgs.rb +74 -0
  26. data/lib/search_solr_tools/helpers/bounding_box_util.rb +37 -0
  27. data/lib/search_solr_tools/helpers/csw_iso_query_builder.rb +30 -0
  28. data/lib/search_solr_tools/helpers/facet_configuration.rb +19 -0
  29. data/lib/search_solr_tools/helpers/iso_namespaces.rb +30 -0
  30. data/lib/search_solr_tools/helpers/iso_to_solr.rb +96 -0
  31. data/lib/search_solr_tools/helpers/iso_to_solr_format.rb +198 -0
  32. data/lib/search_solr_tools/helpers/query_builder.rb +13 -0
  33. data/lib/search_solr_tools/helpers/selectors.rb +20 -0
  34. data/lib/search_solr_tools/helpers/solr_format.rb +260 -0
  35. data/lib/search_solr_tools/helpers/tdar_format.rb +70 -0
  36. data/lib/search_solr_tools/helpers/translate_spatial_coverage.rb +77 -0
  37. data/lib/search_solr_tools/helpers/translate_temporal_coverage.rb +40 -0
  38. data/lib/search_solr_tools/helpers/usgs_format.rb +50 -0
  39. data/lib/search_solr_tools/selectors/cisl.rb +112 -0
  40. data/lib/search_solr_tools/selectors/echo_iso.rb +111 -0
  41. data/lib/search_solr_tools/selectors/ices_iso.rb +107 -0
  42. data/lib/search_solr_tools/selectors/nmi.rb +106 -0
  43. data/lib/search_solr_tools/selectors/nodc_iso.rb +107 -0
  44. data/lib/search_solr_tools/selectors/pdc_iso.rb +108 -0
  45. data/lib/search_solr_tools/selectors/rda.rb +106 -0
  46. data/lib/search_solr_tools/selectors/tdar_opensearch.rb +89 -0
  47. data/lib/search_solr_tools/selectors/usgs_iso.rb +105 -0
  48. data/lib/search_solr_tools/translators/bcodmo_json.rb +69 -0
  49. data/lib/search_solr_tools/translators/eol_to_solr.rb +78 -0
  50. data/lib/search_solr_tools/translators/nsidc_json.rb +190 -0
  51. data/lib/search_solr_tools/version.rb +3 -0
  52. data/search_solr_tools.gemspec +45 -0
  53. metadata +345 -0
@@ -0,0 +1,32 @@
module SearchSolrTools
  module Harvesters
    # OAI harvester for the NMI feed; the feed location is read from the
    # :nmi_url setting of the current environment.
    class Nmi < Oai
      def initialize(env = 'development', die_on_failure = false)
        super(env, die_on_failure)
        @data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:NMI][:long_name]
        @translator = Helpers::IsoToSolr.new(:nmi)
      end

      # URL of the NMI feed for the current environment.
      def metadata_url
        SolrEnvironments[@environment][:nmi_url]
      end

      # Requests the feed and returns its oai:record nodes.
      #
      # NMI's feed does not provide any resumption token and delivers all the
      # records in one go, so the token is set to the empty string up front;
      # an empty token is what stops the harvest loop.
      def results
        @resumption_token = ''
        get_results(request_string, '//oai:ListRecords', '')
          .xpath('.//oai:record', Helpers::IsoNamespaces.namespaces)
      end

      private

      # Query parameters for the ListRecords request.
      def request_params
        { verb: 'ListRecords', metadataPrefix: 'dif' }
      end
    end
  end
end
@@ -0,0 +1,72 @@
require 'uri'

module SearchSolrTools
  module Harvesters
    # Harvests data from NODC and inserts it into Solr after it has been translated
    class Nodc < Base
      def initialize(env = 'development', die_on_failure = false)
        super env, die_on_failure
        # Number of records requested per CSW page; also the paging step.
        @page_size = 100
        @translator = Helpers::IsoToSolr.new :nodc
      end

      # Announces the harvest and delegates to the superclass, handing it the
      # harvest method and a data_centers query for NODC's long name.
      # NOTE(review): how Base uses that query is not visible here - confirm.
      def harvest_and_delete
        puts "Running harvest of NODC catalog from #{nodc_url}"
        super(method(:harvest_nodc_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:NODC][:long_name]}\"")
      end

      # get translated entries from NODC and add them to Solr
      # this is the main entry point for the class
      #
      # Pages through the catalog @page_size records at a time until a page
      # comes back nil or empty. A failing page is reported and skipped unless
      # @die_on_failure is set, in which case the error is re-raised.
      def harvest_nodc_into_solr
        start_index = 1
        while (entries = get_results_from_nodc(start_index)) && (entries.length > 0)
          begin
            insert_solr_docs get_docs_with_translated_entries_from_nodc(entries)
          rescue => e
            puts "ERROR: #{e}"
            raise e if @die_on_failure
          end
          start_index += @page_size
        end
      end

      # URL of the NODC catalog for the current environment.
      def nodc_url
        SolrEnvironments[@environment][:nodc_url]
      end

      # Fetches one page of results starting at start_index and selects the
      # //gmi:MI_Metadata nodes from it.
      def get_results_from_nodc(start_index)
        get_results build_csw_request('results', @page_size, start_index), '//gmi:MI_Metadata'
      end

      # Translates each entry and wraps the translated root in a Solr add doc.
      def get_docs_with_translated_entries_from_nodc(entries)
        entries.map { |entry| create_new_solr_add_doc_with_child(@translator.translate(entry).root) }
      end

      # Builds the CSW query string for the NODC endpoint, restricted by the
      # bounding-box constraint below.
      def build_csw_request(resultType = 'results', maxRecords = '25', startPosition = '1')
        Helpers::CswIsoQueryBuilder.get_query_string(nodc_url,
                                                     'resultType' => resultType,
                                                     'maxRecords' => maxRecords,
                                                     'startPosition' => startPosition,
                                                     'constraint' => bbox_constraint
                                                    )
      end

      # Returns the URI-escaped OGC filter limiting results to the envelope
      # west -180, south 45, east 180, north 90.
      def bbox_constraint
        bbox = {
          west: '-180',
          south: '45',
          east: '180',
          north: '90'
        }

        filter = '<Filter xmlns:ogc="http://www.opengis.net/ogc" ' \
                 'xmlns:gml="http://www.opengis.net/gml" ' \
                 'xmlns:apiso="http://www.opengis.net/cat/csw/apiso/1.0">' \
                 '<ogc:BBOX><PropertyName>apiso:BoundingBox</PropertyName><gml:Envelope>' \
                 "<gml:lowerCorner>#{bbox[:west]} #{bbox[:south]}</gml:lowerCorner>" \
                 "<gml:upperCorner>#{bbox[:east]} #{bbox[:north]}</gml:upperCorner>" \
                 '</gml:Envelope></ogc:BBOX></Filter>'

        # URI.encode was removed in Ruby 3.0; the default parser's #escape
        # performs the same escaping and also exists on older Rubies.
        URI::DEFAULT_PARSER.escape(filter)
      end
    end
  end
end
@@ -0,0 +1,33 @@
module SearchSolrTools
  module Harvesters
    # Builds the auto-suggest entries whose source is NSIDC.
    class NsidcAutoSuggest < AutoSuggest
      # Announces the build and delegates to the superclass with the harvest
      # method, the NSIDC source query and the auto-suggest collection name.
      def harvest_and_delete
        puts 'Building auto-suggest indexes for NSIDC'
        suggest_collection = @env_settings[:auto_suggest_collection_name]
        super(method(:harvest), 'source:"NSIDC"', suggest_collection)
      end

      # Builds the facet-only select URL (rows=0) against the main collection,
      # filtered to source NSIDC, and passes it to the superclass together
      # with the field configuration.
      def harvest
        select_url = "#{solr_url}/#{@env_settings[:collection_name]}/select"
        query = 'q=*%3A*&fq=source%3ANSIDC&rows=0&wt=json&indent=true' \
                '&facet=true&facet.mincount=1&facet.sort=count&facet.limit=-1'
        super("#{select_url}?#{query}", fields)
      end

      # Field name => settings (weight, source, creator callback) for every
      # field that feeds the NSIDC auto-suggest entries.
      def fields
        standard = method(:standard_add_creator)
        split = method(:short_full_split_add_creator)

        {
          'authoritative_id' => { weight: 1, source: 'NSIDC', creator: standard },
          'full_title' => { weight: 2, source: 'NSIDC', creator: standard },
          'copy_parameters' => { weight: 5, source: 'NSIDC', creator: standard },
          'full_platforms' => { weight: 2, source: 'NSIDC', creator: split },
          'full_sensors' => { weight: 2, source: 'NSIDC', creator: split },
          'full_authors' => { weight: 1, source: 'NSIDC', creator: standard }
        }
      end

      # Splits value on ' > ' and collects the docs that the standard creator
      # produces for each part.
      def short_full_split_add_creator(value, count, field_weight, source)
        value.split(' > ').each_with_object([]) do |part, collected|
          collected.concat(standard_add_creator(part, count, field_weight, source))
        end
      end
    end
  end
end
@@ -0,0 +1,60 @@
1
+ require 'json'
2
+ require 'rest-client'
3
+
4
+ require 'search_solr_tools'
5
+
module SearchSolrTools
  module Harvesters
    # Harvests NSIDC dataset metadata: identifiers come from the NSIDC OAI
    # feed, each dataset is fetched as JSON, translated and inserted into Solr.
    class NsidcJson < Base
      def initialize(env = 'development', die_on_failure = false)
        super(env, die_on_failure)
        @translator = Translators::NsidcJsonToSolr.new
        Helpers::FacetConfiguration.import_bin_configuration(env)
      end

      # Announces the harvest and delegates to the superclass with the harvest
      # method and a data_centers query for NSIDC's long name.
      def harvest_and_delete
        puts "Running harvest of NSIDC catalog from #{nsidc_json_url}"
        data_center = Helpers::SolrFormat::DATA_CENTER_NAMES[:NSIDC][:long_name]
        super(method(:harvest_nsidc_json_into_solr), "data_centers:\"#{data_center}\"")
      end

      # Main entry point for the class: inserts every translated entry into
      # Solr as JSON, then raises if any identifier could not be processed.
      def harvest_nsidc_json_into_solr
        harvested = docs_with_translated_entries_from_nsidc
        insert_solr_docs(harvested[:add_docs], Base::JSON_CONTENT_TYPE)
        return if harvested[:failure_ids].empty?

        raise 'Failed to harvest and insert some authoritative IDs'
      end

      # Base URL of the dataset metadata service for the current environment.
      def nsidc_json_url
        SolrEnvironments[@environment][:nsidc_dataset_metadata_url]
      end

      # Identifier nodes (//xmlns:identifier) from the OAI identifiers URL.
      def result_ids_from_nsidc
        identifiers_url = SolrEnvironments[@environment][:nsidc_oai_identifiers_url]
        get_results(identifiers_url, '//xmlns:identifier')
      end

      # Downloads "<nsidc_json_url><id>.json" and returns the parsed JSON.
      def fetch_json_from_nsidc(id)
        JSON.parse(RestClient.get(nsidc_json_url + id + '.json'))
      end

      # Fetches and translates every identifier. Returns a hash with
      # :add_docs (Solr add commands) and :failure_ids (ids whose fetch or
      # translation raised; each failure is printed and otherwise skipped).
      def docs_with_translated_entries_from_nsidc
        added = []
        failed = []

        result_ids_from_nsidc.each do |identifier|
          # the id is the last path segment of the identifier text
          id = identifier.text.split('/').last
          begin
            translated = @translator.translate(fetch_json_from_nsidc(id))
            added << { 'add' => { 'doc' => translated } }
          rescue => error
            puts "Failed to fetch #{id} with error #{error}: #{error.backtrace}"
            failed << id
          end
        end

        { add_docs: added, failure_ids: failed }
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
+ require 'json'
2
+
require 'uri'

module SearchSolrTools
  module Harvesters
    # Base class for harvesting Oai feeds into SOLR
    #
    # Subclasses must implement #results, #metadata_url and #request_params.
    # This class reads @data_centers and @translator but never assigns them,
    # so subclasses are expected to set both.
    class Oai < Base
      def initialize(env = 'development', die_on_failure = false)
        super env, die_on_failure
        # Used in query string params (resumptionToken). This is updated when
        # we harvest, based on the response from the server.
        @resumption_token = nil
      end

      # Returns url with unsafe characters percent-escaped.
      def encode_data_provider_url(url)
        # URI.encode was removed in Ruby 3.0; the default parser's #escape
        # performs the same escaping and also exists on older Rubies.
        URI::DEFAULT_PARSER.escape(url)
      end

      # Announces the harvest and delegates to the superclass with the harvest
      # method and a data_centers query built from @data_centers.
      def harvest_and_delete
        puts "Running #{self.class.name} at #{metadata_url}"
        super(method(:harvest), %(data_centers:"#{@data_centers}"))
      end

      # Keeps requesting and inserting pages while the resumption token is nil
      # (nothing requested yet) or non-empty (more pages to come).
      #
      # NOTE(review): when #results raises before changing @resumption_token
      # and @die_on_failure is false, the same request is repeated without
      # bound - consider limiting consecutive failures.
      def harvest
        while @resumption_token.nil? || !@resumption_token.empty?
          begin
            insert_solr_docs(translated_docs(results))
          rescue => e
            puts "ERROR: #{e}"
            raise e if @die_on_failure
          end
        end
      end

      # Must return the records of the next page and update @resumption_token.
      def results
        raise NotImplementedError
      end

      # Must return the URL of the feed.
      def metadata_url
        raise NotImplementedError
      end

      # Translates each entry and wraps the translated root in a Solr add doc.
      def translated_docs(entries)
        entries.map { |e| create_new_solr_add_doc_with_child(@translator.translate(e).root) }
      end

      private

      # Must return the hash of query parameters for the next request.
      def request_params
        raise NotImplementedError
      end

      # Feed URL followed by the query string built from #request_params.
      def request_string
        "#{metadata_url}#{Helpers::QueryBuilder.build(request_params)}"
      end
    end
  end
end
@@ -0,0 +1,38 @@
module SearchSolrTools
  module Harvesters
    # Harvests data from Polar data catalogue and inserts it into
    # Solr after it has been translated
    class Pdc < Oai
      def initialize(env = 'development', die_on_failure = false)
        super
        @data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:PDC][:long_name]
        @translator = Helpers::IsoToSolr.new :pdc
      end

      # URL of the PDC feed for the current environment.
      def metadata_url
        SolrEnvironments[@environment][:pdc_url]
      end

      # Requests the next page, stores the resumption token from the response
      # and returns the page's oai:record nodes.
      def results
        list_records_oai_response = get_results(request_string, '//oai:ListRecords', '')
        namespaces = Helpers::IsoNamespaces.namespaces

        # A response without a resumptionToken element is treated as the end
        # of the list. Calling #text on the missing node used to raise, which
        # dropped this page's records and left the token unchanged, so the
        # harvest loop re-requested the same page.
        token_node = list_records_oai_response.xpath('.//oai:resumptionToken', namespaces).first
        @resumption_token = token_node ? token_node.text : ''

        list_records_oai_response.xpath('.//oai:record', namespaces)
      end

      private

      # Query parameters for the next ListRecords request; nil-valued
      # parameters are dropped.
      def request_params
        # If a 'resumptionToken' is supplied with any arguments other than 'verb',
        # the response from PDC gives a badArgument error, saying "The argument
        # 'resumptionToken' must be supplied without other arguments"
        {
          verb: 'ListRecords',
          metadataPrefix: @resumption_token.nil? ? 'iso' : nil,
          resumptionToken: @resumption_token
        }.delete_if { |_k, v| v.nil? }
      end
    end
  end
end
@@ -0,0 +1,33 @@
module SearchSolrTools
  module Harvesters
    # OAI harvester for the RDA feed; the feed location is read from the
    # :rda_url setting of the current environment.
    class Rda < Oai
      def initialize(env = 'development', die_on_failure = false)
        super(env, die_on_failure)
        @data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:RDA][:long_name]
        @translator = Helpers::IsoToSolr.new(:rda)
      end

      # URL of the RDA feed for the current environment.
      def metadata_url
        SolrEnvironments[@environment][:rda_url]
      end

      # Requests the feed and returns its oai:record nodes.
      #
      # RDA's feed does not provide any resumption token and delivers all the
      # records in one go, so the token is set to the empty string up front;
      # an empty token is what stops the harvest loop.
      def results
        @resumption_token = ''
        get_results(request_string, '//oai:ListRecords', '')
          .xpath('.//oai:record', Helpers::IsoNamespaces.namespaces)
      end

      private

      # Query parameters for the ListRecords request.
      def request_params
        { verb: 'ListRecords', metadataPrefix: 'dif' }
      end
    end
  end
end
@@ -0,0 +1,57 @@
module SearchSolrTools
  module Harvesters
    # Pulls dataset entries from TDAR, translates them and inserts the
    # translated documents into Solr.
    class Tdar < Base
      def initialize(env = 'development', die_on_failure = false)
        super(env, die_on_failure)
        # records requested per page; also the paging step
        @page_size = 100
        @translator = Helpers::IsoToSolr.new(:tdar)
      end

      # Announces the harvest and delegates to the superclass with the harvest
      # method and a data_centers query for TDAR's long name.
      def harvest_and_delete
        puts "Running harvest of TDAR catalog from #{tdar_url}"
        data_center = Helpers::SolrFormat::DATA_CENTER_NAMES[:TDAR][:long_name]
        super(method(:harvest_tdar_into_solr), "data_centers:\"#{data_center}\"")
      end

      # Pages through the TDAR results @page_size records at a time until a
      # page comes back nil or empty. A failing page is reported and skipped
      # unless @die_on_failure is set, in which case the error is re-raised.
      def harvest_tdar_into_solr
        start_record = 1
        page = get_results_from_tdar(start_record)

        while page && (page.length > 0)
          begin
            insert_solr_docs(get_docs_with_translated_entries_from_tdar(page))
          rescue => error
            puts "ERROR: #{error}\n\n"
            raise error if @die_on_failure
          end

          start_record += @page_size
          page = get_results_from_tdar(start_record)
        end
      end

      # URL of the TDAR search endpoint for the current environment.
      def tdar_url
        SolrEnvironments[@environment][:tdar_url]
      end

      # Fetches one page starting at start_record and selects its atom:entry
      # nodes; the request is made with 'application/xml'.
      def get_results_from_tdar(start_record)
        get_results(build_request(@page_size, start_record), './/atom:entry', 'application/xml')
      end

      # Translates each entry and wraps the translated root in a Solr add doc.
      def get_docs_with_translated_entries_from_tdar(entries)
        entries.map do |entry|
          create_new_solr_add_doc_with_child(@translator.translate(entry).root)
        end
      end

      # Builds the search URL: datasets only, restricted to the box
      # longitude -180..180 / latitude 45..90, with the given paging values.
      def build_request(max_records = '25', start_record = '1')
        query = [
          '_tDAR.searchType=ACADIS_RSS',
          'resourceTypes=DATASET',
          'groups[0].latitudeLongitudeBoxes[0].maximumLongitude=180',
          'groups[0].latitudeLongitudeBoxes[0].minimumLatitude=45',
          'groups[0].latitudeLongitudeBoxes[0].minimumLongitude=-180',
          'groups[0].latitudeLongitudeBoxes[0].maximumLatitude=90',
          'geoMode=ENVELOPE',
          "recordsPerPage=#{max_records}",
          "startRecord=#{start_record}"
        ].join('&')

        tdar_url + '?' + query
      end
    end
  end
end
@@ -0,0 +1,74 @@
require 'uri'

module SearchSolrTools
  module Harvesters
    # Harvests data from USGS and inserts it into Solr after it has been translated
    class Usgs < Base
      def initialize(env = 'development', die_on_failure = false)
        super env, die_on_failure
        # Number of records requested per CSW page; also the paging step.
        @page_size = 100
        @translator = Helpers::IsoToSolr.new :usgs
      end

      # Announces the harvest and delegates to the superclass, handing it the
      # harvest method and a data_centers query for USGS's long name.
      # NOTE(review): how Base uses that query is not visible here - confirm.
      def harvest_and_delete
        puts "Running harvest of USGS catalog from #{usgs_url}"
        super(method(:harvest_usgs_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:USGS][:long_name]}\"")
      end

      # get translated entries from USGS and add them to Solr
      # this is the main entry point for the class
      #
      # Pages through the catalog @page_size records at a time until a page
      # comes back nil or empty. A failing page is reported and skipped unless
      # @die_on_failure is set, in which case the error is re-raised.
      def harvest_usgs_into_solr
        start_index = 1
        while (entries = get_results_from_usgs(start_index)) && (entries.length > 0)
          begin
            insert_solr_docs get_docs_with_translated_entries_from_usgs(entries)
          rescue => e
            puts "ERROR: #{e}"
            raise e if @die_on_failure
          end
          start_index += @page_size
        end
      end

      # URL of the USGS catalog for the current environment.
      def usgs_url
        SolrEnvironments[@environment][:usgs_url]
      end

      # Fetches one page of results starting at start_index and selects the
      # //gmd:MD_Metadata nodes from it. The third argument is passed as an
      # empty string; its meaning is defined by get_results, not shown here.
      def get_results_from_usgs(start_index)
        get_results build_csw_request('results', @page_size, start_index), '//gmd:MD_Metadata', ''
      end

      # Translates each entry and wraps the translated root in a Solr add doc.
      def get_docs_with_translated_entries_from_usgs(entries)
        entries.map { |entry| create_new_solr_add_doc_with_child(@translator.translate(entry).root) }
      end

      # Builds the CSW query string for the USGS endpoint, with an empty
      # TypeNames value and the bounding-box constraint below.
      def build_csw_request(resultType = 'results', maxRecords = '25', startPosition = '1')
        Helpers::CswIsoQueryBuilder.get_query_string(usgs_url,
                                                     'resultType' => resultType,
                                                     'maxRecords' => maxRecords,
                                                     'startPosition' => startPosition,
                                                     'TypeNames' => '',
                                                     'constraint' => bbox_constraint
                                                    )
      end

      # Returns the URI-escaped OGC filter limiting results to the envelope
      # west -180, south 45, east 180, north 90.
      def bbox_constraint
        bbox = {
          west: '-180',
          south: '45',
          east: '180',
          north: '90'
        }

        filter = '<Filter xmlns:ogc="http://www.opengis.net/ogc" ' \
                 'xmlns:gml="http://www.opengis.net/gml" ' \
                 'xmlns:apiso="http://www.opengis.net/cat/csw/apiso/1.0">' \
                 '<ogc:BBOX><PropertyName>apiso:BoundingBox</PropertyName><gml:Envelope>' \
                 "<gml:lowerCorner>#{bbox[:west]} #{bbox[:south]}</gml:lowerCorner>" \
                 "<gml:upperCorner>#{bbox[:east]} #{bbox[:north]}</gml:upperCorner>" \
                 '</gml:Envelope></ogc:BBOX></Filter>'

        # URI.encode was removed in Ruby 3.0; the default parser's #escape
        # performs the same escaping and also exists on older Rubies.
        URI::DEFAULT_PARSER.escape(filter)
      end
    end
  end
end