search_solr_tools 6.1.0 → 6.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +11 -2
  3. data/bin/search_solr_tools +5 -17
  4. data/lib/search_solr_tools/config/environments.rb +3 -1
  5. data/lib/search_solr_tools/config/environments.yaml +0 -32
  6. data/lib/search_solr_tools/errors/harvest_error.rb +44 -31
  7. data/lib/search_solr_tools/harvesters/auto_suggest.rb +5 -3
  8. data/lib/search_solr_tools/harvesters/base.rb +21 -20
  9. data/lib/search_solr_tools/harvesters/nsidc_auto_suggest.rb +7 -5
  10. data/lib/search_solr_tools/harvesters/nsidc_json.rb +9 -8
  11. data/lib/search_solr_tools/helpers/bounding_box_util.rb +8 -8
  12. data/lib/search_solr_tools/helpers/facet_configuration.rb +3 -1
  13. data/lib/search_solr_tools/helpers/harvest_status.rb +10 -8
  14. data/lib/search_solr_tools/helpers/iso_namespaces.rb +3 -1
  15. data/lib/search_solr_tools/helpers/solr_format.rb +25 -45
  16. data/lib/search_solr_tools/helpers/translate_spatial_coverage.rb +13 -10
  17. data/lib/search_solr_tools/helpers/translate_temporal_coverage.rb +2 -0
  18. data/lib/search_solr_tools/translators/nsidc_json.rb +48 -44
  19. data/lib/search_solr_tools/version.rb +3 -1
  20. data/lib/search_solr_tools.rb +3 -2
  21. metadata +3 -45
  22. data/lib/search_solr_tools/harvesters/adc.rb +0 -49
  23. data/lib/search_solr_tools/harvesters/ade_auto_suggest.rb +0 -46
  24. data/lib/search_solr_tools/harvesters/bcodmo.rb +0 -64
  25. data/lib/search_solr_tools/harvesters/data_one.rb +0 -49
  26. data/lib/search_solr_tools/harvesters/echo.rb +0 -52
  27. data/lib/search_solr_tools/harvesters/eol.rb +0 -51
  28. data/lib/search_solr_tools/harvesters/gtnp.rb +0 -67
  29. data/lib/search_solr_tools/harvesters/ices.rb +0 -58
  30. data/lib/search_solr_tools/harvesters/ncdc_paleo.rb +0 -62
  31. data/lib/search_solr_tools/harvesters/nmi.rb +0 -34
  32. data/lib/search_solr_tools/harvesters/nodc.rb +0 -75
  33. data/lib/search_solr_tools/harvesters/oai.rb +0 -62
  34. data/lib/search_solr_tools/harvesters/pdc.rb +0 -40
  35. data/lib/search_solr_tools/harvesters/r2r.rb +0 -61
  36. data/lib/search_solr_tools/harvesters/rda.rb +0 -35
  37. data/lib/search_solr_tools/harvesters/tdar.rb +0 -71
  38. data/lib/search_solr_tools/harvesters/usgs.rb +0 -76
  39. data/lib/search_solr_tools/helpers/csw_iso_query_builder.rb +0 -29
  40. data/lib/search_solr_tools/helpers/data_one_format.rb +0 -74
  41. data/lib/search_solr_tools/helpers/iso_to_solr.rb +0 -97
  42. data/lib/search_solr_tools/helpers/iso_to_solr_format.rb +0 -197
  43. data/lib/search_solr_tools/helpers/ncdc_paleo_format.rb +0 -61
  44. data/lib/search_solr_tools/helpers/query_builder.rb +0 -13
  45. data/lib/search_solr_tools/helpers/r2r_format.rb +0 -25
  46. data/lib/search_solr_tools/helpers/selectors.rb +0 -22
  47. data/lib/search_solr_tools/helpers/tdar_format.rb +0 -70
  48. data/lib/search_solr_tools/helpers/usgs_format.rb +0 -50
  49. data/lib/search_solr_tools/selectors/adc.rb +0 -96
  50. data/lib/search_solr_tools/selectors/data_one.rb +0 -96
  51. data/lib/search_solr_tools/selectors/echo_iso.rb +0 -112
  52. data/lib/search_solr_tools/selectors/ices_iso.rb +0 -108
  53. data/lib/search_solr_tools/selectors/ncdc_paleo.rb +0 -90
  54. data/lib/search_solr_tools/selectors/nmi.rb +0 -107
  55. data/lib/search_solr_tools/selectors/nodc_iso.rb +0 -108
  56. data/lib/search_solr_tools/selectors/pdc_iso.rb +0 -109
  57. data/lib/search_solr_tools/selectors/r2r.rb +0 -115
  58. data/lib/search_solr_tools/selectors/rda.rb +0 -107
  59. data/lib/search_solr_tools/selectors/tdar_opensearch.rb +0 -91
  60. data/lib/search_solr_tools/selectors/usgs_iso.rb +0 -107
  61. data/lib/search_solr_tools/translators/bcodmo_json.rb +0 -89
  62. data/lib/search_solr_tools/translators/eol_to_solr.rb +0 -84
  63. data/lib/search_solr_tools/translators/gtnp_json.rb +0 -59
@@ -1,64 +0,0 @@
1
- require_relative 'base'
2
- require 'json'
3
- require 'rest-client'
4
-
5
- module SearchSolrTools
6
- module Harvesters
7
- # Harvests data from BcoDmo endpoint, translates and adds it to solr
8
- class BcoDmo < Base
9
- def initialize(env = 'development', die_on_failure = false)
10
- super env, die_on_failure
11
- @translator = Translators::BcodmoJsonToSolr.new
12
- @wkt_parser = RGeo::WKRep::WKTParser.new(nil, {}) # (factory_generator_=nil,
13
- end
14
-
15
- def harvest_and_delete
16
- puts "Running harvest of BCO-DMO catalog from #{bcodmo_url}"
17
- super(method(:harvest_bcodmo_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:BCODMO][:long_name]}\"")
18
- end
19
-
20
- def bcodmo_url
21
- SolrEnvironments[@environment][:bcodmo_url]
22
- end
23
-
24
- def harvest_bcodmo_into_solr
25
- result = translate_bcodmo
26
- insert_solr_docs(result[:add_docs], Base::JSON_CONTENT_TYPE)
27
-
28
- errors_exist = result[:failure_ids].length > 0
29
- fail 'Failed to harvest some records from BCO-DMO' if errors_exist && @die_on_failure
30
- end
31
-
32
- def translate_bcodmo
33
- documents = []
34
- failure_ids = []
35
- request_json(SolrEnvironments[@environment][:bcodmo_url]).each do |record|
36
- geometry = request_json(record['geometryUrl'])
37
- results = parse_record(record, geometry)
38
- results[:documents].each { |d| documents << d }
39
- results[:failure_ids].each { |id| failure_ids << id }
40
- end
41
- { add_docs: documents, failure_ids: failure_ids }
42
- end
43
-
44
- def request_json(url)
45
- puts "Request: #{url}"
46
- JSON.parse(RestClient.get(url))
47
- end
48
-
49
- def parse_record(record, geometry)
50
- documents = []
51
- failure_ids = []
52
- begin
53
- JSON.parse(RestClient.get(record['datasets'])).each do |dataset|
54
- documents << { 'add' => { 'doc' => @translator.translate(dataset, record, geometry) } }
55
- end
56
- rescue => e
57
- puts "Failed to add record #{record['id']} with error #{e} (#{e.message}) : #{e.backtrace.join("\n")}"
58
- failure_ids << record['id']
59
- end
60
- { documents: documents, failure_ids: failure_ids }
61
- end
62
- end
63
- end
64
- end
@@ -1,49 +0,0 @@
1
- require_relative 'base'
2
-
3
- module SearchSolrTools
4
- module Harvesters
5
- class DataOne < Base
6
- def initialize(env = 'development', die_on_failure = false)
7
- super
8
- @page_size = 250
9
- @translator = Helpers::IsoToSolr.new :data_one
10
- end
11
-
12
- def harvest_and_delete
13
- puts "Running harvest of dataONE catalog from #{metadata_url}"
14
- super(method(:harvest_data_one_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:DATA_ONE][:long_name]}\"")
15
- end
16
-
17
- def harvest_data_one_into_solr
18
- start = 0
19
- while (entries = get_results_from_data_one(start)) && (entries.length > 0)
20
- begin
21
- insert_solr_docs(get_docs_with_translated_entries_from_data_one(entries))
22
- rescue => e
23
- puts "ERROR: #{e}\n\n"
24
- raise e if @die_on_failure
25
- end
26
- start += @page_size
27
- end
28
- end
29
-
30
- def get_results_from_data_one(start)
31
- get_results(build_request(start, @page_size), './response/result/doc')
32
- end
33
-
34
- def metadata_url
35
- SolrEnvironments[@environment][:data_one_url]
36
- end
37
-
38
- def get_docs_with_translated_entries_from_data_one(entries)
39
- entries.map do |e|
40
- create_new_solr_add_doc_with_child(@translator.translate(e).root)
41
- end
42
- end
43
-
44
- def build_request(start = 0, max_records = 100)
45
- "#{metadata_url}&start=#{start}&rows=#{max_records}"
46
- end
47
- end
48
- end
49
- end
@@ -1,52 +0,0 @@
1
- require_relative 'base'
2
-
3
- module SearchSolrTools
4
- module Harvesters
5
- # Harvests data from ECHO and inserts it into Solr after it has been translated
6
- class Echo < Base
7
- def initialize(env = 'development', die_on_failure = false)
8
- super env, die_on_failure
9
- @page_size = 100
10
- @translator = Helpers::IsoToSolr.new :echo
11
- end
12
-
13
- def harvest_and_delete
14
- puts "Running harvest of ECHO catalog from #{echo_url}"
15
- super(method(:harvest_echo_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:ECHO][:long_name]}\"")
16
- end
17
-
18
- # get translated entries from ECHO and add them to Solr
19
- # this is the main entry point for the class
20
- def harvest_echo_into_solr
21
- page_num = 1
22
- while (entries = get_results_from_echo(page_num)) && (entries.length > 0)
23
- begin
24
- insert_solr_docs get_docs_with_translated_entries_from_echo(entries)
25
- rescue => e
26
- puts "ERROR: #{e}\n\n"
27
- raise e if @die_on_failure
28
- end
29
- page_num += 1
30
- end
31
- end
32
-
33
- def echo_url
34
- SolrEnvironments[@environment][:echo_url]
35
- end
36
-
37
- def get_results_from_echo(page_num)
38
- get_results build_request(@page_size, page_num), './/results/result', 'application/echo10+xml'
39
- end
40
-
41
- def get_docs_with_translated_entries_from_echo(entries)
42
- entries.map do |entry|
43
- create_new_solr_add_doc_with_child(@translator.translate(entry).root)
44
- end
45
- end
46
-
47
- def build_request(max_records = '25', page_num = '1')
48
- echo_url + '&page_size=' + max_records.to_s + '&page_num=' + page_num.to_s
49
- end
50
- end
51
- end
52
- end
@@ -1,51 +0,0 @@
1
- require_relative 'base'
2
- require 'json'
3
- require 'rgeo/geo_json'
4
-
5
- module SearchSolrTools
6
- module Harvesters
7
- class Eol < Base
8
- def initialize(env = 'development', die_on_failure = false)
9
- super env, die_on_failure
10
- @translator = SearchSolrTools::Translators::EolToSolr.new
11
- end
12
-
13
- def harvest_and_delete
14
- puts 'Running harvest of EOL catalog using the following configured EOL URLs:'
15
- SearchSolrTools::SolrEnvironments[:common][:eol].each { |x| puts x }
16
- super(method(:harvest_eol_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:EOL][:long_name]}\"")
17
- end
18
-
19
- def harvest_eol_into_solr
20
- solr_add_queries = eol_dataset_urls.map do |dataset|
21
- begin
22
- doc = open_xml_document(dataset)
23
- if doc.xpath('//xmlns:metadata').size > 1
24
- # THREDDS allows for a dataset of datasests, EOL should not utilize this
25
- fail "Complex dataset encountered at #{doc.xpath('//xmlns:catalog').to_html}"
26
- end
27
- metadata_doc = open_xml_document(doc.xpath('//xmlns:metadata')[0]['xlink:href'])
28
- { 'add' => { 'doc' => @translator.translate(doc, metadata_doc) } }
29
- rescue => e
30
- puts "ERROR: #{e}"
31
- puts "Failed to translate this record: #{doc} -> #{metadata_doc}"
32
- raise e if @die_on_failure
33
- next
34
- end
35
- end
36
- insert_solr_docs solr_add_queries, Base::JSON_CONTENT_TYPE
37
- end
38
-
39
- def eol_dataset_urls
40
- SearchSolrTools::SolrEnvironments[:common][:eol].flat_map do |endpoint|
41
- doc = open_xml_document(endpoint)
42
- doc.xpath('//xmlns:catalogRef').map { |node| node['xlink:href'] }
43
- end
44
- end
45
-
46
- def open_xml_document(url)
47
- Nokogiri::XML(open(url), &:strict)
48
- end
49
- end
50
- end
51
- end
@@ -1,67 +0,0 @@
1
- require_relative 'base'
2
- require 'json'
3
- require 'rest-client'
4
-
5
- module SearchSolrTools
6
- module Harvesters
7
- # Harvests data from GTN-P endpoints, translates and adds it to solr
8
- class GtnP < Base
9
- def initialize(env = 'development', die_on_failure = false)
10
- super env, die_on_failure
11
- @translator = Translators::GtnpJsonToSolr.new
12
- end
13
-
14
- def gtnp_service_urls
15
- json_records = []
16
- SearchSolrTools::SolrEnvironments[:common][:gtnp].flat_map do |endpoint|
17
- record = request_json(endpoint)
18
- json_records << record
19
- end
20
- json_records
21
- end
22
-
23
- def harvest_and_delete
24
- puts 'Running harvest of GTN-P catalog using the following configured GTN-P URLs:'
25
- SearchSolrTools::SolrEnvironments[:common][:gtnp].each { |x| puts x }
26
- super(method(:harvest_gtnp_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:GTNP][:long_name]}\"")
27
- end
28
-
29
- def harvest_gtnp_into_solr
30
- result = translate_gtnp
31
- insert_solr_docs result[:add_docs], Base::JSON_CONTENT_TYPE
32
- fail 'Failed to harvest some records from the provider' if result[:failure_ids].length > 0
33
- end
34
-
35
- def translate_gtnp
36
- documents = []
37
- failure_ids = []
38
- gtnp_records = gtnp_service_urls
39
- gtnp_records.each do |record|
40
- results = parse_record(record)
41
- results[:documents].each { |d| documents << d }
42
- results[:failure_ids].each { |id| failure_ids << id }
43
- end
44
- { add_docs: documents, failure_ids: failure_ids }
45
- end
46
-
47
- def request_json(url)
48
- JSON.parse(RestClient.get(url))
49
- end
50
-
51
- def parse_record(record)
52
- documents = []
53
- failure_ids = []
54
- begin
55
- record.drop(1).each do |dataset|
56
- trans_doc = @translator.translate(dataset, record[0])
57
- documents << { 'add' => { 'doc' => trans_doc } }
58
- end
59
- rescue => e
60
- puts "Failed to add record #{record[0][:title]} with error #{e} (#{e.message}) : #{e.backtrace.join("\n")}"
61
- failure_ids << record[0][:title]
62
- end
63
- { documents: documents, failure_ids: failure_ids }
64
- end
65
- end
66
- end
67
- end
@@ -1,58 +0,0 @@
1
- require_relative 'base'
2
- require_relative '../helpers/csw_iso_query_builder'
3
-
4
- module SearchSolrTools
5
- module Harvesters
6
- # Harvests data from ICES and inserts it into Solr after it has been translated
7
- class Ices < Base
8
- def initialize(env = 'development', die_on_failure = false)
9
- super env, die_on_failure
10
- @page_size = 100
11
- @translator = Helpers::IsoToSolr.new :ices
12
- end
13
-
14
- def harvest_and_delete
15
- puts "Running harvest of ICES catalog from #{ices_url}"
16
- super(method(:harvest_ices_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:ICES][:long_name]}\"")
17
- end
18
-
19
- # get translated entries from ICES and add them to Solr
20
- # this is the main entry point for the class
21
- def harvest_ices_into_solr
22
- start_index = 1
23
- while (entries = get_results_from_ices(start_index)) && (entries.length > 0)
24
- begin
25
- insert_solr_docs get_docs_with_translated_entries_from_ices(entries)
26
- rescue => e
27
- puts "ERROR: #{e}"
28
- raise e if @die_on_failure
29
- end
30
- start_index += @page_size
31
- end
32
- end
33
-
34
- def ices_url
35
- SolrEnvironments[@environment][:ices_url]
36
- end
37
-
38
- def get_results_from_ices(start_index)
39
- get_results build_csw_request('results', @page_size, start_index), '//gmd:MD_Metadata'
40
- end
41
-
42
- def get_docs_with_translated_entries_from_ices(entries)
43
- entries.map do |entry|
44
- create_new_solr_add_doc_with_child(@translator.translate(entry).root)
45
- end
46
- end
47
-
48
- def build_csw_request(resultType = 'results', maxRecords = '25', startPosition = '1')
49
- Helpers::CswIsoQueryBuilder.get_query_string(ices_url,
50
- 'resultType' => resultType,
51
- 'maxRecords' => maxRecords,
52
- 'startPosition' => startPosition,
53
- 'constraintLanguage' => 'CQL_TEXT',
54
- 'outputSchema' => 'http://www.isotc211.org/2005/gmd')
55
- end
56
- end
57
- end
58
- end
@@ -1,62 +0,0 @@
1
- require_relative 'base'
2
- require_relative '../helpers/csw_iso_query_builder'
3
-
4
- module SearchSolrTools
5
- module Harvesters
6
- # Harvests data from NODC PALEO and inserts it into Solr after it has been translated
7
- class NcdcPaleo < Base
8
- def initialize(env = 'development', die_on_failure = false)
9
- super env, die_on_failure
10
- @page_size = 50
11
- @translator = Helpers::IsoToSolr.new :ncdc_paleo
12
- end
13
-
14
- def harvest_and_delete
15
- puts "Running harvest of NCDC Paleo catalog from #{ncdc_paleo_url}"
16
- super(method(:harvest_ncdc_paleo_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:NCDC_PALEO][:long_name]}\"")
17
- end
18
-
19
- def harvest_ncdc_paleo_into_solr
20
- start_index = 1
21
- while (entries = get_results_from_ncdc_paleo_url(start_index)) && (entries.length > 0)
22
- begin
23
- insert_solr_docs get_docs_with_translated_entries_from_ncdc_paleo(entries)
24
- rescue => e
25
- puts "ERROR: #{e}"
26
- raise e if @die_on_failure
27
- end
28
- start_index += @page_size
29
- end
30
- end
31
-
32
- def ncdc_paleo_url
33
- SolrEnvironments[@environment][:ncdc_paleo_url]
34
- end
35
-
36
- def get_results_from_ncdc_paleo_url(start_index)
37
- get_results build_csw_request('results', @page_size, start_index), '//csw:Record'
38
- end
39
-
40
- def get_docs_with_translated_entries_from_ncdc_paleo(entries)
41
- auth_ids = entries.map { |e| e.xpath("./dc:identifier[@scheme='urn:x-esri:specification:ServiceType:ArcIMS:Metadata:DocID']").text }
42
-
43
- auth_ids.map do |record|
44
- result_xml = get_results("https://gis.ncdc.noaa.gov/gptpaleo/csw?getxml=#{record}",
45
- '/rdf:RDF/rdf:Description').first
46
- solr_doc = create_new_solr_add_doc_with_child(@translator.translate(result_xml).root)
47
- insert_node = solr_doc.at_xpath('//doc')
48
- insert_node.add_child("<field name='authoritative_id'>#{record}</field>")
49
- insert_node.add_child("<field name='dataset_url'>https://gis.ncdc.noaa.gov/gptpaleo/catalog/search/resource/details.page?uuid=#{record}")
50
- solr_doc.root
51
- end
52
- end
53
-
54
- def build_csw_request(resultType = 'results', maxRecords = '1000', startPosition = '1')
55
- Helpers::CswIsoQueryBuilder.get_query_string(ncdc_paleo_url,
56
- 'resultType' => resultType,
57
- 'maxRecords' => maxRecords,
58
- 'startPosition' => startPosition)
59
- end
60
- end
61
- end
62
- end
@@ -1,34 +0,0 @@
1
- require_relative 'oai'
2
-
3
- module SearchSolrTools
4
- module Harvesters
5
- class Nmi < Oai
6
- def initialize(env = 'development', die_on_failure = false)
7
- super
8
- @data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:NMI][:long_name]
9
- @translator = Helpers::IsoToSolr.new :nmi
10
- end
11
-
12
- def metadata_url
13
- SolrEnvironments[@environment][:nmi_url]
14
- end
15
-
16
- # resumption_token must be empty to stop the harvest loop; NMI's feed does not
17
- # provide any resumption token and gets all the records in just one go
18
- def results
19
- @resumption_token = ''
20
- list_records_oai_response = get_results(request_string, '//oai:ListRecords', '')
21
- list_records_oai_response.xpath('.//oai:record', Helpers::IsoNamespaces.namespaces)
22
- end
23
-
24
- private
25
-
26
- def request_params
27
- {
28
- verb: 'ListRecords',
29
- metadataPrefix: 'dif'
30
- }
31
- end
32
- end
33
- end
34
- end
@@ -1,75 +0,0 @@
1
- require_relative 'base'
2
- require_relative '../helpers/csw_iso_query_builder'
3
-
4
- module SearchSolrTools
5
- module Harvesters
6
- # Harvests data from NODC and inserts it into Solr after it has been translated
7
- class Nodc < Base
8
- def initialize(env = 'development', die_on_failure = false)
9
- super env, die_on_failure
10
- @page_size = 50
11
- @translator = Helpers::IsoToSolr.new :nodc
12
- end
13
-
14
- def harvest_and_delete
15
- puts "Running harvest of NODC catalog from #{nodc_url}"
16
- super(method(:harvest_nodc_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:NODC][:long_name]}\"")
17
- end
18
-
19
- # get translated entries from NODC and add them to Solr
20
- # this is the main entry point for the class
21
- def harvest_nodc_into_solr
22
- start_index = 1
23
- while (entries = get_results_from_nodc(start_index)) && (entries.length > 0)
24
- begin
25
- insert_solr_docs get_docs_with_translated_entries_from_nodc(entries)
26
- rescue => e
27
- puts "ERROR: #{e}"
28
- raise e if @die_on_failure
29
- end
30
- start_index += @page_size
31
- end
32
- end
33
-
34
- def nodc_url
35
- SolrEnvironments[@environment][:nodc_url]
36
- end
37
-
38
- def get_results_from_nodc(start_index)
39
- get_results build_csw_request('results', @page_size, start_index), '//gmi:MI_Metadata'
40
- end
41
-
42
- def get_docs_with_translated_entries_from_nodc(entries)
43
- entries.map do |entry|
44
- create_new_solr_add_doc_with_child(@translator.translate(entry).root)
45
- end
46
- end
47
-
48
- def build_csw_request(resultType = 'results', maxRecords = '25', startPosition = '1')
49
- Helpers::CswIsoQueryBuilder.get_query_string(nodc_url,
50
- 'resultType' => resultType,
51
- 'maxRecords' => maxRecords,
52
- 'startPosition' => startPosition,
53
- 'constraint' => bbox_constraint,
54
- 'outputSchema' => 'http://www.isotc211.org/2005/gmd')
55
- end
56
-
57
- def bbox_constraint
58
- bbox = {
59
- west: '-180',
60
- south: '45',
61
- east: '180',
62
- north: '90'
63
- }
64
-
65
- URI.encode '<Filter xmlns:ogc="http://www.opengis.net/ogc" ' \
66
- 'xmlns:gml="http://www.opengis.net/gml" ' \
67
- 'xmlns:apiso="http://www.opengis.net/cat/csw/apiso/1.0">' \
68
- '<ogc:BBOX><PropertyName>apiso:BoundingBox</PropertyName><gml:Envelope>' \
69
- '<gml:lowerCorner>' + bbox[:west] + ' ' + bbox[:south] + '</gml:lowerCorner>' \
70
- '<gml:upperCorner>' + bbox[:east] + ' ' + bbox[:north] + '</gml:upperCorner>' \
71
- '</gml:Envelope></ogc:BBOX></Filter>'
72
- end
73
- end
74
- end
75
- end
@@ -1,62 +0,0 @@
1
- require_relative 'base'
2
- require_relative '../helpers/query_builder'
3
-
4
- require 'json'
5
-
6
- module SearchSolrTools
7
- module Harvesters
8
- # Base class for harvesting Oai feeds into SOLR
9
- class Oai < Base
10
- # Used in query string params, resumptionToken
11
-
12
- def initialize(env = 'development', die_on_failure = false)
13
- super env, die_on_failure
14
- # This is updated when we harvest based on the response
15
- # from the server.
16
- @resumption_token = nil
17
- end
18
-
19
- def encode_data_provider_url(url)
20
- URI.encode(url)
21
- end
22
-
23
- def harvest_and_delete
24
- puts "Running #{self.class.name} at #{metadata_url}"
25
- super(method(:harvest), %(data_centers:"#{@data_centers}"))
26
- end
27
-
28
- def harvest
29
- while @resumption_token.nil? || !@resumption_token.empty?
30
- begin
31
- insert_solr_docs(translated_docs(results))
32
- rescue => e
33
- puts "ERROR: #{e.class} #{e}"
34
- raise e if @die_on_failure
35
- end
36
- end
37
- end
38
-
39
- def results
40
- fail NotImplementedError
41
- end
42
-
43
- def metadata_url
44
- fail NotImplementedError
45
- end
46
-
47
- def translated_docs(entries)
48
- entries.map { |e| create_new_solr_add_doc_with_child(@translator.translate(e).root) }
49
- end
50
-
51
- private
52
-
53
- def request_params
54
- fail NotImplementedError
55
- end
56
-
57
- def request_string
58
- "#{metadata_url}#{Helpers::QueryBuilder.build(request_params)}"
59
- end
60
- end
61
- end
62
- end
@@ -1,40 +0,0 @@
1
- require_relative 'oai'
2
-
3
- module SearchSolrTools
4
- module Harvesters
5
- # Harvests data from Polar data catalogue and inserts it into
6
- # Solr after it has been translated
7
- class Pdc < Oai
8
- def initialize(env = 'development', die_on_failure = false)
9
- super
10
- @data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:PDC][:long_name]
11
- @translator = Helpers::IsoToSolr.new :pdc
12
- end
13
-
14
- def metadata_url
15
- SolrEnvironments[@environment][:pdc_url]
16
- end
17
-
18
- def results
19
- list_records_oai_response = get_results(request_string, '//oai:ListRecords', '')
20
-
21
- @resumption_token = list_records_oai_response.xpath('.//oai:resumptionToken', Helpers::IsoNamespaces.namespaces).first.text
22
-
23
- list_records_oai_response.xpath('.//oai:record', Helpers::IsoNamespaces.namespaces)
24
- end
25
-
26
- private
27
-
28
- def request_params
29
- # If a 'resumptionToken' is supplied with any arguments other than 'verb',
30
- # the response from PDC gives a badArgument error, saying "The argument
31
- # 'resumptionToken' must be supplied without other arguments"
32
- {
33
- verb: 'ListRecords',
34
- metadataPrefix: @resumption_token.nil? ? 'iso' : nil,
35
- resumptionToken: @resumption_token
36
- }.delete_if { |_k, v| v.nil? }
37
- end
38
- end
39
- end
40
- end
@@ -1,61 +0,0 @@
1
- require_relative 'base'
2
-
3
- require 'nokogiri'
4
- require 'rest-client'
5
-
6
- module SearchSolrTools
7
- module Harvesters
8
- class R2R < Base
9
- def initialize(env = 'development', die_on_failure = false)
10
- super
11
- @data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:R2R][:long_name]
12
- @translator = Helpers::IsoToSolr.new :r2r
13
- @metadata_url = SolrEnvironments[@environment][:r2r_url]
14
- end
15
-
16
- def harvest_and_delete
17
- puts "Running #{self.class.name} at #{@metadata_url}"
18
- super(method(:harvest), %(data_centers:"#{@data_centers}"))
19
- end
20
-
21
- # rubocop: disable MethodLength
22
- # rubocop: disable AbcSize
23
- def harvest
24
- # first fetch list of available records at http://get.rvdata.us/services/cruise/
25
- # then loop through each one of those, using the root <gmi:MI_Metadata> tag
26
- puts "Getting list of records from #{@data_centers}"
27
- RestClient.get(@metadata_url) do |resp, _req, _result, &_block|
28
- unless resp.code == 200
29
- puts "Got code #{resp.code} from #{@metadata_url}, skipping R2R harvest."
30
- next
31
- end
32
-
33
- doc = Nokogiri::HTML(resp.body)
34
-
35
- urls = doc.xpath('//a').map do |node|
36
- "#{@metadata_url}#{node.attr('href')}"
37
- end
38
-
39
- urls.each_slice(50) do |url_subset|
40
- # each result is a nokogirii doc with root element
41
- # <gmi:MI_Metadata>
42
- results = url_subset.map do |url|
43
- get_results(url, '//gmi:MI_Metadata').first
44
- end
45
-
46
- begin
47
- translated = results.map do |e|
48
- create_new_solr_add_doc_with_child(@translator.translate(e).root)
49
- end
50
-
51
- insert_solr_docs(translated)
52
- rescue => e
53
- puts "ERROR: #{e}"
54
- raise e if @die_on_failure
55
- end
56
- end
57
- end
58
- end
59
- end
60
- end
61
- end