search_solr_tools 3.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +88 -0
  3. data/COPYING +674 -0
  4. data/README.md +203 -0
  5. data/bin/search_solr_tools +87 -0
  6. data/lib/search_solr_tools.rb +8 -0
  7. data/lib/search_solr_tools/config/environments.rb +12 -0
  8. data/lib/search_solr_tools/config/environments.yaml +73 -0
  9. data/lib/search_solr_tools/harvesters/ade_auto_suggest.rb +43 -0
  10. data/lib/search_solr_tools/harvesters/auto_suggest.rb +61 -0
  11. data/lib/search_solr_tools/harvesters/base.rb +183 -0
  12. data/lib/search_solr_tools/harvesters/bcodmo.rb +55 -0
  13. data/lib/search_solr_tools/harvesters/cisl.rb +63 -0
  14. data/lib/search_solr_tools/harvesters/echo.rb +50 -0
  15. data/lib/search_solr_tools/harvesters/eol.rb +53 -0
  16. data/lib/search_solr_tools/harvesters/ices.rb +55 -0
  17. data/lib/search_solr_tools/harvesters/nmi.rb +32 -0
  18. data/lib/search_solr_tools/harvesters/nodc.rb +72 -0
  19. data/lib/search_solr_tools/harvesters/nsidc_auto_suggest.rb +33 -0
  20. data/lib/search_solr_tools/harvesters/nsidc_json.rb +60 -0
  21. data/lib/search_solr_tools/harvesters/oai.rb +59 -0
  22. data/lib/search_solr_tools/harvesters/pdc.rb +38 -0
  23. data/lib/search_solr_tools/harvesters/rda.rb +33 -0
  24. data/lib/search_solr_tools/harvesters/tdar.rb +57 -0
  25. data/lib/search_solr_tools/harvesters/usgs.rb +74 -0
  26. data/lib/search_solr_tools/helpers/bounding_box_util.rb +37 -0
  27. data/lib/search_solr_tools/helpers/csw_iso_query_builder.rb +30 -0
  28. data/lib/search_solr_tools/helpers/facet_configuration.rb +19 -0
  29. data/lib/search_solr_tools/helpers/iso_namespaces.rb +30 -0
  30. data/lib/search_solr_tools/helpers/iso_to_solr.rb +96 -0
  31. data/lib/search_solr_tools/helpers/iso_to_solr_format.rb +198 -0
  32. data/lib/search_solr_tools/helpers/query_builder.rb +13 -0
  33. data/lib/search_solr_tools/helpers/selectors.rb +20 -0
  34. data/lib/search_solr_tools/helpers/solr_format.rb +260 -0
  35. data/lib/search_solr_tools/helpers/tdar_format.rb +70 -0
  36. data/lib/search_solr_tools/helpers/translate_spatial_coverage.rb +77 -0
  37. data/lib/search_solr_tools/helpers/translate_temporal_coverage.rb +40 -0
  38. data/lib/search_solr_tools/helpers/usgs_format.rb +50 -0
  39. data/lib/search_solr_tools/selectors/cisl.rb +112 -0
  40. data/lib/search_solr_tools/selectors/echo_iso.rb +111 -0
  41. data/lib/search_solr_tools/selectors/ices_iso.rb +107 -0
  42. data/lib/search_solr_tools/selectors/nmi.rb +106 -0
  43. data/lib/search_solr_tools/selectors/nodc_iso.rb +107 -0
  44. data/lib/search_solr_tools/selectors/pdc_iso.rb +108 -0
  45. data/lib/search_solr_tools/selectors/rda.rb +106 -0
  46. data/lib/search_solr_tools/selectors/tdar_opensearch.rb +89 -0
  47. data/lib/search_solr_tools/selectors/usgs_iso.rb +105 -0
  48. data/lib/search_solr_tools/translators/bcodmo_json.rb +69 -0
  49. data/lib/search_solr_tools/translators/eol_to_solr.rb +78 -0
  50. data/lib/search_solr_tools/translators/nsidc_json.rb +190 -0
  51. data/lib/search_solr_tools/version.rb +3 -0
  52. data/search_solr_tools.gemspec +45 -0
  53. metadata +345 -0
@@ -0,0 +1,32 @@
1
module SearchSolrTools
  module Harvesters
    # Oai harvester for the NMI feed: supplies the feed URL, the ListRecords
    # request parameters and the record extraction used by Oai#harvest.
    class Nmi < Oai
      # Both arguments are handed to Oai#initialize unchanged.
      def initialize(env = 'development', die_on_failure = false)
        super(env, die_on_failure)
        nmi_names = Helpers::SolrFormat::DATA_CENTER_NAMES[:NMI]
        @data_centers = nmi_names[:long_name]
        @translator = Helpers::IsoToSolr.new(:nmi)
      end

      # URL of the NMI feed configured for the current environment.
      def metadata_url
        environment_settings = SolrEnvironments[@environment]
        environment_settings[:nmi_url]
      end

      # Requests the ListRecords response and selects its oai:record elements.
      #
      # The resumption token is blanked before the request is made: NMI's feed
      # does not provide a resumption token and delivers all records in one go,
      # and an empty token is what stops the harvest loop.
      def results
        @resumption_token = ''
        response = get_results(request_string, '//oai:ListRecords', '')
        response.xpath('.//oai:record', Helpers::IsoNamespaces.namespaces)
      end

      private

      # Query parameters for the single ListRecords request (DIF metadata).
      def request_params
        { verb: 'ListRecords', metadataPrefix: 'dif' }
      end
    end
  end
end
@@ -0,0 +1,72 @@
1
module SearchSolrTools
  module Harvesters
    # Harvests data from NODC and inserts it into Solr after it has been translated
    class Nodc < Base
      # env and die_on_failure are forwarded to Base; records are requested
      # 100 at a time and translated with the :nodc IsoToSolr translator.
      def initialize(env = 'development', die_on_failure = false)
        super env, die_on_failure
        @page_size = 100
        @translator = Helpers::IsoToSolr.new :nodc
      end

      # Runs the NODC harvest through Base#harvest_and_delete, passing the
      # NODC data center name as the Solr constraint.
      def harvest_and_delete
        puts "Running harvest of NODC catalog from #{nodc_url}"
        super(method(:harvest_nodc_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:NODC][:long_name]}\"")
      end

      # get translated entries from NODC and add them to Solr
      # this is the main entry point for the class
      #
      # Pages through the catalog @page_size records at a time until a request
      # yields nothing. A failure while translating or inserting a page is
      # printed and, unless @die_on_failure is set, the next page is tried.
      def harvest_nodc_into_solr
        start_index = 1
        while (entries = get_results_from_nodc(start_index)) && (entries.length > 0)
          begin
            insert_solr_docs get_docs_with_translated_entries_from_nodc(entries)
          rescue => e
            puts "ERROR: #{e}"
            raise e if @die_on_failure
          end
          start_index += @page_size
        end
      end

      # NODC endpoint configured for the current environment.
      def nodc_url
        SolrEnvironments[@environment][:nodc_url]
      end

      # Requests one page of records beginning at start_index and selects the
      # gmi:MI_Metadata elements from the response.
      def get_results_from_nodc(start_index)
        get_results build_csw_request('results', @page_size, start_index), '//gmi:MI_Metadata'
      end

      # Translates every entry and wraps the root of each translation in a
      # Solr add document. Returns the documents as an Array.
      def get_docs_with_translated_entries_from_nodc(entries)
        entries.map { |entry| create_new_solr_add_doc_with_child(@translator.translate(entry).root) }
      end

      # Builds the CSW query string for the NODC endpoint, always restricted
      # by the bounding-box constraint below.
      def build_csw_request(resultType = 'results', maxRecords = '25', startPosition = '1')
        Helpers::CswIsoQueryBuilder.get_query_string(
          nodc_url,
          'resultType' => resultType,
          'maxRecords' => maxRecords,
          'startPosition' => startPosition,
          'constraint' => bbox_constraint
        )
      end

      # URL-escaped OGC filter selecting records whose bounding box falls in
      # longitudes -180..180 and latitudes 45..90.
      #
      # URI.encode no longer exists as of Ruby 3.0; URI::DEFAULT_PARSER.escape
      # performs the same escaping and works on earlier Rubies too.
      def bbox_constraint
        bbox = {
          west: '-180',
          south: '45',
          east: '180',
          north: '90'
        }

        filter = '<Filter xmlns:ogc="http://www.opengis.net/ogc" ' \
                 'xmlns:gml="http://www.opengis.net/gml" ' \
                 'xmlns:apiso="http://www.opengis.net/cat/csw/apiso/1.0">' \
                 '<ogc:BBOX><PropertyName>apiso:BoundingBox</PropertyName><gml:Envelope>' \
                 "<gml:lowerCorner>#{bbox[:west]} #{bbox[:south]}</gml:lowerCorner>" \
                 "<gml:upperCorner>#{bbox[:east]} #{bbox[:north]}</gml:upperCorner>" \
                 '</gml:Envelope></ogc:BBOX></Filter>'

        URI::DEFAULT_PARSER.escape(filter)
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
module SearchSolrTools
  module Harvesters
    # Builds the auto-suggest entries whose source is NSIDC.
    class NsidcAutoSuggest < AutoSuggest
      # Announces the run, then defers to the parent harvest_and_delete with
      # the NSIDC source constraint and the auto-suggest collection name.
      def harvest_and_delete
        puts 'Building auto-suggest indexes for NSIDC'
        harvester = method(:harvest)
        super(harvester, 'source:"NSIDC"', @env_settings[:auto_suggest_collection_name])
      end

      # Builds the facet query URL against the configured collection (no rows,
      # JSON output, facets with at least one hit, unlimited, sorted by count)
      # and hands it to the parent harvest together with the field table.
      def harvest
        query = 'q=*%3A*&fq=source%3ANSIDC&rows=0&wt=json&indent=true' \
                '&facet=true&facet.mincount=1&facet.sort=count&facet.limit=-1'
        url = "#{solr_url}/#{@env_settings[:collection_name]}/select?#{query}"
        super(url, fields)
      end

      # Field table keyed by Solr field name. Each entry carries a weight, the
      # 'NSIDC' source label and the method used to create its documents.
      def fields
        standard = method(:standard_add_creator)
        split = method(:short_full_split_add_creator)

        [
          ['authoritative_id', 1, standard],
          ['full_title', 2, standard],
          ['copy_parameters', 5, standard],
          ['full_platforms', 2, split],
          ['full_sensors', 2, split],
          ['full_authors', 1, standard]
        ].each_with_object({}) do |(field_name, weight, creator), table|
          table[field_name] = { weight: weight, source: 'NSIDC', creator: creator }
        end
      end

      # Splits value on ' > ' and collects the documents standard_add_creator
      # produces for each piece into one array.
      def short_full_split_add_creator(value, count, field_weight, source)
        value.split(' > ').each_with_object([]) do |piece, add_docs|
          add_docs.concat(standard_add_creator(piece, count, field_weight, source))
        end
      end
    end
  end
end
@@ -0,0 +1,60 @@
1
+ require 'json'
2
+ require 'rest-client'
3
+
4
+ require 'search_solr_tools'
5
+
6
module SearchSolrTools
  module Harvesters
    # Collects NSIDC dataset identifiers from the OAI identifiers feed, fetches
    # each dataset's JSON metadata, translates it and inserts it into Solr.
    class NsidcJson < Base
      # Forwards both arguments to Base, creates the JSON translator and
      # imports the facet bin configuration for env.
      def initialize(env = 'development', die_on_failure = false)
        super(env, die_on_failure)
        @translator = Translators::NsidcJsonToSolr.new
        Helpers::FacetConfiguration.import_bin_configuration(env)
      end

      # Runs the NSIDC harvest through Base#harvest_and_delete, passing the
      # NSIDC data center name as the Solr constraint.
      def harvest_and_delete
        puts "Running harvest of NSIDC catalog from #{nsidc_json_url}"
        harvester = method(:harvest_nsidc_json_into_solr)
        nsidc_long_name = Helpers::SolrFormat::DATA_CENTER_NAMES[:NSIDC][:long_name]
        super(harvester, "data_centers:\"#{nsidc_long_name}\"")
      end

      # Main entry point for the class: inserts every document that could be
      # translated, then raises if any identifier failed along the way.
      def harvest_nsidc_json_into_solr
        outcome = docs_with_translated_entries_from_nsidc
        insert_solr_docs(outcome[:add_docs], Base::JSON_CONTENT_TYPE)
        failed = outcome[:failure_ids]
        raise 'Failed to harvest and insert some authoritative IDs' unless failed.empty?
      end

      # Base URL of the dataset metadata service for the current environment.
      def nsidc_json_url
        environment_settings = SolrEnvironments[@environment]
        environment_settings[:nsidc_dataset_metadata_url]
      end

      # Identifier elements selected from the NSIDC OAI identifiers feed.
      def result_ids_from_nsidc
        identifiers_url = SolrEnvironments[@environment][:nsidc_oai_identifiers_url]
        get_results(identifiers_url, '//xmlns:identifier')
      end

      # Downloads "<metadata url><id>.json" and returns the parsed JSON.
      def fetch_json_from_nsidc(id)
        JSON.parse(RestClient.get(nsidc_json_url + id + '.json'))
      end

      # Fetches and translates the metadata for every identifier.
      #
      # Returns a hash with :add_docs (one Solr "add" document per success)
      # and :failure_ids (ids whose fetch or translation raised). Each failure
      # is printed with its backtrace and does not stop the remaining ids.
      def docs_with_translated_entries_from_nsidc
        outcome = { add_docs: [], failure_ids: [] }

        result_ids_from_nsidc.each do |identifier|
          # The id is the last path segment of the identifier text.
          id = identifier.text.split('/').last
          begin
            translated = @translator.translate(fetch_json_from_nsidc(id))
            outcome[:add_docs] << { 'add' => { 'doc' => translated } }
          rescue => e
            puts "Failed to fetch #{id} with error #{e}: #{e.backtrace}"
            outcome[:failure_ids] << id
          end
        end

        outcome
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
+ require 'json'
2
+
3
module SearchSolrTools
  module Harvesters
    # Base class for harvesting Oai feeds into SOLR.
    #
    # Subclasses supply #metadata_url, #results and #request_params, and are
    # expected to set @data_centers and @translator, which #harvest_and_delete
    # and #translated_docs read.
    class Oai < Base
      def initialize(env = 'development', die_on_failure = false)
        super env, die_on_failure
        # Sent as the resumptionToken query parameter by subclasses that page.
        # nil means no request has been made yet; #results implementations
        # update it from the server response, and an empty string ends #harvest.
        @resumption_token = nil
      end

      # Percent-encodes the characters of url that are not allowed in a URI.
      #
      # URI.encode was removed in Ruby 3.0; URI::DEFAULT_PARSER.escape applies
      # the same escaping and is available on older and newer Rubies alike.
      def encode_data_provider_url(url)
        URI::DEFAULT_PARSER.escape(url)
      end

      # Runs #harvest through Base#harvest_and_delete, passing this
      # harvester's data center name as the Solr constraint.
      def harvest_and_delete
        puts "Running #{self.class.name} at #{metadata_url}"
        super(method(:harvest), %(data_centers:"#{@data_centers}"))
      end

      # Requests, translates and inserts batches of records until the
      # resumption token is an empty string.
      #
      # A failure is printed and, when @die_on_failure is set, re-raised.
      # Otherwise the loop carries on with the next batch, unless the failure
      # left the resumption token untouched: the next pass would then issue
      # exactly the same request and fail the same way without end, so the
      # harvest stops instead.
      def harvest
        while @resumption_token.nil? || !@resumption_token.empty?
          token_before_attempt = @resumption_token
          begin
            insert_solr_docs(translated_docs(results))
          rescue => e
            puts "ERROR: #{e}"
            raise e if @die_on_failure
            break if @resumption_token == token_before_attempt
          end
        end
      end

      # Must be overridden: fetch the next batch of records and update
      # @resumption_token.
      def results
        raise NotImplementedError
      end

      # Must be overridden: the URL of the feed to harvest.
      def metadata_url
        raise NotImplementedError
      end

      # Translates every entry and wraps the root of each translation in a
      # Solr add document.
      def translated_docs(entries)
        entries.map { |e| create_new_solr_add_doc_with_child(@translator.translate(e).root) }
      end

      private

      # Must be overridden: the query parameters of the next request.
      def request_params
        raise NotImplementedError
      end

      # Feed URL followed by the query string built from #request_params.
      def request_string
        "#{metadata_url}#{Helpers::QueryBuilder.build(request_params)}"
      end
    end
  end
end
@@ -0,0 +1,38 @@
1
module SearchSolrTools
  module Harvesters
    # Harvests data from Polar data catalogue and inserts it into
    # Solr after it has been translated
    class Pdc < Oai
      # Both arguments are handed to Oai#initialize unchanged.
      def initialize(env = 'development', die_on_failure = false)
        super
        @data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:PDC][:long_name]
        @translator = Helpers::IsoToSolr.new :pdc
      end

      # URL of the PDC feed configured for the current environment.
      def metadata_url
        SolrEnvironments[@environment][:pdc_url]
      end

      # Requests the next ListRecords response, stores its resumption token
      # for the following request and selects its oai:record elements.
      #
      # A response without any resumptionToken element is treated as the end
      # of the list (empty token). Calling .text on the missing element used
      # to raise, which left the previous token in place so that the harvest
      # loop repeated the same request indefinitely.
      def results
        list_records_oai_response = get_results(request_string, '//oai:ListRecords', '')
        namespaces = Helpers::IsoNamespaces.namespaces

        token_node = list_records_oai_response.xpath('.//oai:resumptionToken', namespaces).first
        @resumption_token = token_node.nil? ? '' : token_node.text

        list_records_oai_response.xpath('.//oai:record', namespaces)
      end

      private

      def request_params
        # If a 'resumptionToken' is supplied with any arguments other than 'verb',
        # the response from PDC gives a badArgument error, saying "The argument
        # 'resumptionToken' must be supplied without other arguments"
        #
        # So the first request carries only metadataPrefix and every later one
        # carries only resumptionToken; nil-valued entries are dropped.
        {
          verb: 'ListRecords',
          metadataPrefix: @resumption_token.nil? ? 'iso' : nil,
          resumptionToken: @resumption_token
        }.delete_if { |_k, v| v.nil? }
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
module SearchSolrTools
  module Harvesters
    # Oai harvester for the RDA feed: supplies the feed URL, the ListRecords
    # request parameters and the record extraction used by Oai#harvest.
    class Rda < Oai
      # Both arguments are handed to Oai#initialize unchanged.
      def initialize(env = 'development', die_on_failure = false)
        super(env, die_on_failure)
        rda_names = Helpers::SolrFormat::DATA_CENTER_NAMES[:RDA]
        @data_centers = rda_names[:long_name]
        @translator = Helpers::IsoToSolr.new(:rda)
      end

      # URL of the RDA feed configured for the current environment.
      def metadata_url
        environment_settings = SolrEnvironments[@environment]
        environment_settings[:rda_url]
      end

      # Requests the ListRecords response and selects its oai:record elements.
      #
      # The resumption token is blanked before the request is made: RDA's feed
      # does not provide a resumption token and delivers all records in one go,
      # and an empty token is what stops the harvest loop.
      def results
        @resumption_token = ''
        response = get_results(request_string, '//oai:ListRecords', '')
        response.xpath('.//oai:record', Helpers::IsoNamespaces.namespaces)
      end

      private

      # Query parameters for the single ListRecords request (DIF metadata).
      def request_params
        { verb: 'ListRecords', metadataPrefix: 'dif' }
      end
    end
  end
end
@@ -0,0 +1,57 @@
1
module SearchSolrTools
  module Harvesters
    # Pulls dataset records from TDAR, translates them and inserts them
    # into Solr.
    class Tdar < Base
      # Forwards both arguments to Base; records are requested 100 at a time
      # and translated with the :tdar IsoToSolr translator.
      def initialize(env = 'development', die_on_failure = false)
        super(env, die_on_failure)
        @page_size = 100
        @translator = Helpers::IsoToSolr.new(:tdar)
      end

      # Runs the TDAR harvest through Base#harvest_and_delete, passing the
      # TDAR data center name as the Solr constraint.
      def harvest_and_delete
        puts "Running harvest of TDAR catalog from #{tdar_url}"
        harvester = method(:harvest_tdar_into_solr)
        tdar_long_name = Helpers::SolrFormat::DATA_CENTER_NAMES[:TDAR][:long_name]
        super(harvester, "data_centers:\"#{tdar_long_name}\"")
      end

      # Walks the feed one page at a time until a request yields nothing.
      # A failure while translating or inserting a page is printed and,
      # unless @die_on_failure is set, the next page is tried.
      def harvest_tdar_into_solr
        start_record = 1
        loop do
          entries = get_results_from_tdar(start_record)
          break unless entries && entries.length > 0

          begin
            insert_solr_docs(get_docs_with_translated_entries_from_tdar(entries))
          rescue => e
            puts "ERROR: #{e}\n\n"
            raise e if @die_on_failure
          end
          start_record += @page_size
        end
      end

      # TDAR endpoint configured for the current environment.
      def tdar_url
        environment_settings = SolrEnvironments[@environment]
        environment_settings[:tdar_url]
      end

      # Requests one page beginning at start_record and selects the
      # atom:entry elements from the response.
      def get_results_from_tdar(start_record)
        request_url = build_request(@page_size, start_record)
        get_results(request_url, './/atom:entry', 'application/xml')
      end

      # Translates every entry and wraps the root of each translation in a
      # Solr add document. Returns the documents as an Array.
      def get_docs_with_translated_entries_from_tdar(entries)
        entries.map do |entry|
          create_new_solr_add_doc_with_child(@translator.translate(entry).root)
        end
      end

      # Search URL for datasets inside the box with longitudes -180..180 and
      # latitudes 45..90, paged by max_records / start_record.
      def build_request(max_records = '25', start_record = '1')
        query = [
          '_tDAR.searchType=ACADIS_RSS',
          'resourceTypes=DATASET',
          'groups[0].latitudeLongitudeBoxes[0].maximumLongitude=180',
          'groups[0].latitudeLongitudeBoxes[0].minimumLatitude=45',
          'groups[0].latitudeLongitudeBoxes[0].minimumLongitude=-180',
          'groups[0].latitudeLongitudeBoxes[0].maximumLatitude=90',
          'geoMode=ENVELOPE',
          "recordsPerPage=#{max_records}",
          "startRecord=#{start_record}"
        ].join('&')

        tdar_url + "?#{query}"
      end
    end
  end
end
@@ -0,0 +1,74 @@
1
module SearchSolrTools
  module Harvesters
    # Harvests data from USGS and inserts it into Solr after it has been translated
    class Usgs < Base
      # env and die_on_failure are forwarded to Base; records are requested
      # 100 at a time and translated with the :usgs IsoToSolr translator.
      def initialize(env = 'development', die_on_failure = false)
        super env, die_on_failure
        @page_size = 100
        @translator = Helpers::IsoToSolr.new :usgs
      end

      # Runs the USGS harvest through Base#harvest_and_delete, passing the
      # USGS data center name as the Solr constraint.
      def harvest_and_delete
        puts "Running harvest of USGS catalog from #{usgs_url}"
        super(method(:harvest_usgs_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:USGS][:long_name]}\"")
      end

      # get translated entries from USGS and add them to Solr
      # this is the main entry point for the class
      #
      # Pages through the catalog @page_size records at a time until a request
      # yields nothing. A failure while translating or inserting a page is
      # printed and, unless @die_on_failure is set, the next page is tried.
      def harvest_usgs_into_solr
        start_index = 1
        while (entries = get_results_from_usgs(start_index)) && (entries.length > 0)
          begin
            insert_solr_docs get_docs_with_translated_entries_from_usgs(entries)
          rescue => e
            puts "ERROR: #{e}"
            raise e if @die_on_failure
          end
          start_index += @page_size
        end
      end

      # USGS endpoint configured for the current environment.
      def usgs_url
        SolrEnvironments[@environment][:usgs_url]
      end

      # Requests one page of records beginning at start_index and selects the
      # gmd:MD_Metadata elements from the response.
      def get_results_from_usgs(start_index)
        get_results build_csw_request('results', @page_size, start_index), '//gmd:MD_Metadata', ''
      end

      # Translates every entry and wraps the root of each translation in a
      # Solr add document. Returns the documents as an Array.
      def get_docs_with_translated_entries_from_usgs(entries)
        entries.map { |entry| create_new_solr_add_doc_with_child(@translator.translate(entry).root) }
      end

      # Builds the CSW query string for the USGS endpoint, with an empty
      # TypeNames value and always restricted by the bounding-box constraint.
      def build_csw_request(resultType = 'results', maxRecords = '25', startPosition = '1')
        Helpers::CswIsoQueryBuilder.get_query_string(
          usgs_url,
          'resultType' => resultType,
          'maxRecords' => maxRecords,
          'startPosition' => startPosition,
          'TypeNames' => '',
          'constraint' => bbox_constraint
        )
      end

      # URL-escaped OGC filter selecting records whose bounding box falls in
      # longitudes -180..180 and latitudes 45..90.
      #
      # URI.encode no longer exists as of Ruby 3.0; URI::DEFAULT_PARSER.escape
      # performs the same escaping and works on earlier Rubies too.
      def bbox_constraint
        bbox = {
          west: '-180',
          south: '45',
          east: '180',
          north: '90'
        }

        filter = '<Filter xmlns:ogc="http://www.opengis.net/ogc" ' \
                 'xmlns:gml="http://www.opengis.net/gml" ' \
                 'xmlns:apiso="http://www.opengis.net/cat/csw/apiso/1.0">' \
                 '<ogc:BBOX><PropertyName>apiso:BoundingBox</PropertyName><gml:Envelope>' \
                 "<gml:lowerCorner>#{bbox[:west]} #{bbox[:south]}</gml:lowerCorner>" \
                 "<gml:upperCorner>#{bbox[:east]} #{bbox[:north]}</gml:upperCorner>" \
                 '</gml:Envelope></ogc:BBOX></Filter>'

        URI::DEFAULT_PARSER.escape(filter)
      end
    end
  end
end