search_solr_tools 6.1.0 → 6.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +11 -2
  3. data/bin/search_solr_tools +5 -17
  4. data/lib/search_solr_tools/config/environments.rb +3 -1
  5. data/lib/search_solr_tools/config/environments.yaml +0 -32
  6. data/lib/search_solr_tools/errors/harvest_error.rb +44 -31
  7. data/lib/search_solr_tools/harvesters/auto_suggest.rb +5 -3
  8. data/lib/search_solr_tools/harvesters/base.rb +21 -20
  9. data/lib/search_solr_tools/harvesters/nsidc_auto_suggest.rb +7 -5
  10. data/lib/search_solr_tools/harvesters/nsidc_json.rb +9 -8
  11. data/lib/search_solr_tools/helpers/bounding_box_util.rb +8 -8
  12. data/lib/search_solr_tools/helpers/facet_configuration.rb +3 -1
  13. data/lib/search_solr_tools/helpers/harvest_status.rb +10 -8
  14. data/lib/search_solr_tools/helpers/iso_namespaces.rb +3 -1
  15. data/lib/search_solr_tools/helpers/solr_format.rb +25 -45
  16. data/lib/search_solr_tools/helpers/translate_spatial_coverage.rb +13 -10
  17. data/lib/search_solr_tools/helpers/translate_temporal_coverage.rb +2 -0
  18. data/lib/search_solr_tools/translators/nsidc_json.rb +48 -44
  19. data/lib/search_solr_tools/version.rb +3 -1
  20. data/lib/search_solr_tools.rb +3 -2
  21. metadata +3 -45
  22. data/lib/search_solr_tools/harvesters/adc.rb +0 -49
  23. data/lib/search_solr_tools/harvesters/ade_auto_suggest.rb +0 -46
  24. data/lib/search_solr_tools/harvesters/bcodmo.rb +0 -64
  25. data/lib/search_solr_tools/harvesters/data_one.rb +0 -49
  26. data/lib/search_solr_tools/harvesters/echo.rb +0 -52
  27. data/lib/search_solr_tools/harvesters/eol.rb +0 -51
  28. data/lib/search_solr_tools/harvesters/gtnp.rb +0 -67
  29. data/lib/search_solr_tools/harvesters/ices.rb +0 -58
  30. data/lib/search_solr_tools/harvesters/ncdc_paleo.rb +0 -62
  31. data/lib/search_solr_tools/harvesters/nmi.rb +0 -34
  32. data/lib/search_solr_tools/harvesters/nodc.rb +0 -75
  33. data/lib/search_solr_tools/harvesters/oai.rb +0 -62
  34. data/lib/search_solr_tools/harvesters/pdc.rb +0 -40
  35. data/lib/search_solr_tools/harvesters/r2r.rb +0 -61
  36. data/lib/search_solr_tools/harvesters/rda.rb +0 -35
  37. data/lib/search_solr_tools/harvesters/tdar.rb +0 -71
  38. data/lib/search_solr_tools/harvesters/usgs.rb +0 -76
  39. data/lib/search_solr_tools/helpers/csw_iso_query_builder.rb +0 -29
  40. data/lib/search_solr_tools/helpers/data_one_format.rb +0 -74
  41. data/lib/search_solr_tools/helpers/iso_to_solr.rb +0 -97
  42. data/lib/search_solr_tools/helpers/iso_to_solr_format.rb +0 -197
  43. data/lib/search_solr_tools/helpers/ncdc_paleo_format.rb +0 -61
  44. data/lib/search_solr_tools/helpers/query_builder.rb +0 -13
  45. data/lib/search_solr_tools/helpers/r2r_format.rb +0 -25
  46. data/lib/search_solr_tools/helpers/selectors.rb +0 -22
  47. data/lib/search_solr_tools/helpers/tdar_format.rb +0 -70
  48. data/lib/search_solr_tools/helpers/usgs_format.rb +0 -50
  49. data/lib/search_solr_tools/selectors/adc.rb +0 -96
  50. data/lib/search_solr_tools/selectors/data_one.rb +0 -96
  51. data/lib/search_solr_tools/selectors/echo_iso.rb +0 -112
  52. data/lib/search_solr_tools/selectors/ices_iso.rb +0 -108
  53. data/lib/search_solr_tools/selectors/ncdc_paleo.rb +0 -90
  54. data/lib/search_solr_tools/selectors/nmi.rb +0 -107
  55. data/lib/search_solr_tools/selectors/nodc_iso.rb +0 -108
  56. data/lib/search_solr_tools/selectors/pdc_iso.rb +0 -109
  57. data/lib/search_solr_tools/selectors/r2r.rb +0 -115
  58. data/lib/search_solr_tools/selectors/rda.rb +0 -107
  59. data/lib/search_solr_tools/selectors/tdar_opensearch.rb +0 -91
  60. data/lib/search_solr_tools/selectors/usgs_iso.rb +0 -107
  61. data/lib/search_solr_tools/translators/bcodmo_json.rb +0 -89
  62. data/lib/search_solr_tools/translators/eol_to_solr.rb +0 -84
  63. data/lib/search_solr_tools/translators/gtnp_json.rb +0 -59
@@ -1,64 +0,0 @@
1
- require_relative 'base'
2
- require 'json'
3
- require 'rest-client'
4
-
5
- module SearchSolrTools
6
- module Harvesters
7
- # Harvests data from BcoDmo endpoint, translates and adds it to solr
8
- class BcoDmo < Base
9
- def initialize(env = 'development', die_on_failure = false)
10
- super env, die_on_failure
11
- @translator = Translators::BcodmoJsonToSolr.new
12
- @wkt_parser = RGeo::WKRep::WKTParser.new(nil, {}) # (factory_generator_=nil,
13
- end
14
-
15
- def harvest_and_delete
16
- puts "Running harvest of BCO-DMO catalog from #{bcodmo_url}"
17
- super(method(:harvest_bcodmo_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:BCODMO][:long_name]}\"")
18
- end
19
-
20
- def bcodmo_url
21
- SolrEnvironments[@environment][:bcodmo_url]
22
- end
23
-
24
- def harvest_bcodmo_into_solr
25
- result = translate_bcodmo
26
- insert_solr_docs(result[:add_docs], Base::JSON_CONTENT_TYPE)
27
-
28
- errors_exist = result[:failure_ids].length > 0
29
- fail 'Failed to harvest some records from BCO-DMO' if errors_exist && @die_on_failure
30
- end
31
-
32
- def translate_bcodmo
33
- documents = []
34
- failure_ids = []
35
- request_json(SolrEnvironments[@environment][:bcodmo_url]).each do |record|
36
- geometry = request_json(record['geometryUrl'])
37
- results = parse_record(record, geometry)
38
- results[:documents].each { |d| documents << d }
39
- results[:failure_ids].each { |id| failure_ids << id }
40
- end
41
- { add_docs: documents, failure_ids: failure_ids }
42
- end
43
-
44
- def request_json(url)
45
- puts "Request: #{url}"
46
- JSON.parse(RestClient.get(url))
47
- end
48
-
49
- def parse_record(record, geometry)
50
- documents = []
51
- failure_ids = []
52
- begin
53
- JSON.parse(RestClient.get(record['datasets'])).each do |dataset|
54
- documents << { 'add' => { 'doc' => @translator.translate(dataset, record, geometry) } }
55
- end
56
- rescue => e
57
- puts "Failed to add record #{record['id']} with error #{e} (#{e.message}) : #{e.backtrace.join("\n")}"
58
- failure_ids << record['id']
59
- end
60
- { documents: documents, failure_ids: failure_ids }
61
- end
62
- end
63
- end
64
- end
@@ -1,49 +0,0 @@
1
- require_relative 'base'
2
-
3
- module SearchSolrTools
4
- module Harvesters
5
- class DataOne < Base
6
- def initialize(env = 'development', die_on_failure = false)
7
- super
8
- @page_size = 250
9
- @translator = Helpers::IsoToSolr.new :data_one
10
- end
11
-
12
- def harvest_and_delete
13
- puts "Running harvest of dataONE catalog from #{metadata_url}"
14
- super(method(:harvest_data_one_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:DATA_ONE][:long_name]}\"")
15
- end
16
-
17
- def harvest_data_one_into_solr
18
- start = 0
19
- while (entries = get_results_from_data_one(start)) && (entries.length > 0)
20
- begin
21
- insert_solr_docs(get_docs_with_translated_entries_from_data_one(entries))
22
- rescue => e
23
- puts "ERROR: #{e}\n\n"
24
- raise e if @die_on_failure
25
- end
26
- start += @page_size
27
- end
28
- end
29
-
30
- def get_results_from_data_one(start)
31
- get_results(build_request(start, @page_size), './response/result/doc')
32
- end
33
-
34
- def metadata_url
35
- SolrEnvironments[@environment][:data_one_url]
36
- end
37
-
38
- def get_docs_with_translated_entries_from_data_one(entries)
39
- entries.map do |e|
40
- create_new_solr_add_doc_with_child(@translator.translate(e).root)
41
- end
42
- end
43
-
44
- def build_request(start = 0, max_records = 100)
45
- "#{metadata_url}&start=#{start}&rows=#{max_records}"
46
- end
47
- end
48
- end
49
- end
@@ -1,52 +0,0 @@
1
- require_relative 'base'
2
-
3
- module SearchSolrTools
4
- module Harvesters
5
- # Harvests data from ECHO and inserts it into Solr after it has been translated
6
- class Echo < Base
7
- def initialize(env = 'development', die_on_failure = false)
8
- super env, die_on_failure
9
- @page_size = 100
10
- @translator = Helpers::IsoToSolr.new :echo
11
- end
12
-
13
- def harvest_and_delete
14
- puts "Running harvest of ECHO catalog from #{echo_url}"
15
- super(method(:harvest_echo_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:ECHO][:long_name]}\"")
16
- end
17
-
18
- # get translated entries from ECHO and add them to Solr
19
- # this is the main entry point for the class
20
- def harvest_echo_into_solr
21
- page_num = 1
22
- while (entries = get_results_from_echo(page_num)) && (entries.length > 0)
23
- begin
24
- insert_solr_docs get_docs_with_translated_entries_from_echo(entries)
25
- rescue => e
26
- puts "ERROR: #{e}\n\n"
27
- raise e if @die_on_failure
28
- end
29
- page_num += 1
30
- end
31
- end
32
-
33
- def echo_url
34
- SolrEnvironments[@environment][:echo_url]
35
- end
36
-
37
- def get_results_from_echo(page_num)
38
- get_results build_request(@page_size, page_num), './/results/result', 'application/echo10+xml'
39
- end
40
-
41
- def get_docs_with_translated_entries_from_echo(entries)
42
- entries.map do |entry|
43
- create_new_solr_add_doc_with_child(@translator.translate(entry).root)
44
- end
45
- end
46
-
47
- def build_request(max_records = '25', page_num = '1')
48
- echo_url + '&page_size=' + max_records.to_s + '&page_num=' + page_num.to_s
49
- end
50
- end
51
- end
52
- end
@@ -1,51 +0,0 @@
1
- require_relative 'base'
2
- require 'json'
3
- require 'rgeo/geo_json'
4
-
5
- module SearchSolrTools
6
- module Harvesters
7
- class Eol < Base
8
- def initialize(env = 'development', die_on_failure = false)
9
- super env, die_on_failure
10
- @translator = SearchSolrTools::Translators::EolToSolr.new
11
- end
12
-
13
- def harvest_and_delete
14
- puts 'Running harvest of EOL catalog using the following configured EOL URLs:'
15
- SearchSolrTools::SolrEnvironments[:common][:eol].each { |x| puts x }
16
- super(method(:harvest_eol_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:EOL][:long_name]}\"")
17
- end
18
-
19
- def harvest_eol_into_solr
20
- solr_add_queries = eol_dataset_urls.map do |dataset|
21
- begin
22
- doc = open_xml_document(dataset)
23
- if doc.xpath('//xmlns:metadata').size > 1
24
- # THREDDS allows for a dataset of datasests, EOL should not utilize this
25
- fail "Complex dataset encountered at #{doc.xpath('//xmlns:catalog').to_html}"
26
- end
27
- metadata_doc = open_xml_document(doc.xpath('//xmlns:metadata')[0]['xlink:href'])
28
- { 'add' => { 'doc' => @translator.translate(doc, metadata_doc) } }
29
- rescue => e
30
- puts "ERROR: #{e}"
31
- puts "Failed to translate this record: #{doc} -> #{metadata_doc}"
32
- raise e if @die_on_failure
33
- next
34
- end
35
- end
36
- insert_solr_docs solr_add_queries, Base::JSON_CONTENT_TYPE
37
- end
38
-
39
- def eol_dataset_urls
40
- SearchSolrTools::SolrEnvironments[:common][:eol].flat_map do |endpoint|
41
- doc = open_xml_document(endpoint)
42
- doc.xpath('//xmlns:catalogRef').map { |node| node['xlink:href'] }
43
- end
44
- end
45
-
46
- def open_xml_document(url)
47
- Nokogiri::XML(open(url), &:strict)
48
- end
49
- end
50
- end
51
- end
@@ -1,67 +0,0 @@
1
- require_relative 'base'
2
- require 'json'
3
- require 'rest-client'
4
-
5
- module SearchSolrTools
6
- module Harvesters
7
- # Harvests data from GTN-P endpoints, translates and adds it to solr
8
- class GtnP < Base
9
- def initialize(env = 'development', die_on_failure = false)
10
- super env, die_on_failure
11
- @translator = Translators::GtnpJsonToSolr.new
12
- end
13
-
14
- def gtnp_service_urls
15
- json_records = []
16
- SearchSolrTools::SolrEnvironments[:common][:gtnp].flat_map do |endpoint|
17
- record = request_json(endpoint)
18
- json_records << record
19
- end
20
- json_records
21
- end
22
-
23
- def harvest_and_delete
24
- puts 'Running harvest of GTN-P catalog using the following configured GTN-P URLs:'
25
- SearchSolrTools::SolrEnvironments[:common][:gtnp].each { |x| puts x }
26
- super(method(:harvest_gtnp_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:GTNP][:long_name]}\"")
27
- end
28
-
29
- def harvest_gtnp_into_solr
30
- result = translate_gtnp
31
- insert_solr_docs result[:add_docs], Base::JSON_CONTENT_TYPE
32
- fail 'Failed to harvest some records from the provider' if result[:failure_ids].length > 0
33
- end
34
-
35
- def translate_gtnp
36
- documents = []
37
- failure_ids = []
38
- gtnp_records = gtnp_service_urls
39
- gtnp_records.each do |record|
40
- results = parse_record(record)
41
- results[:documents].each { |d| documents << d }
42
- results[:failure_ids].each { |id| failure_ids << id }
43
- end
44
- { add_docs: documents, failure_ids: failure_ids }
45
- end
46
-
47
- def request_json(url)
48
- JSON.parse(RestClient.get(url))
49
- end
50
-
51
- def parse_record(record)
52
- documents = []
53
- failure_ids = []
54
- begin
55
- record.drop(1).each do |dataset|
56
- trans_doc = @translator.translate(dataset, record[0])
57
- documents << { 'add' => { 'doc' => trans_doc } }
58
- end
59
- rescue => e
60
- puts "Failed to add record #{record[0][:title]} with error #{e} (#{e.message}) : #{e.backtrace.join("\n")}"
61
- failure_ids << record[0][:title]
62
- end
63
- { documents: documents, failure_ids: failure_ids }
64
- end
65
- end
66
- end
67
- end
@@ -1,58 +0,0 @@
1
- require_relative 'base'
2
- require_relative '../helpers/csw_iso_query_builder'
3
-
4
- module SearchSolrTools
5
- module Harvesters
6
- # Harvests data from ICES and inserts it into Solr after it has been translated
7
- class Ices < Base
8
- def initialize(env = 'development', die_on_failure = false)
9
- super env, die_on_failure
10
- @page_size = 100
11
- @translator = Helpers::IsoToSolr.new :ices
12
- end
13
-
14
- def harvest_and_delete
15
- puts "Running harvest of ICES catalog from #{ices_url}"
16
- super(method(:harvest_ices_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:ICES][:long_name]}\"")
17
- end
18
-
19
- # get translated entries from ICES and add them to Solr
20
- # this is the main entry point for the class
21
- def harvest_ices_into_solr
22
- start_index = 1
23
- while (entries = get_results_from_ices(start_index)) && (entries.length > 0)
24
- begin
25
- insert_solr_docs get_docs_with_translated_entries_from_ices(entries)
26
- rescue => e
27
- puts "ERROR: #{e}"
28
- raise e if @die_on_failure
29
- end
30
- start_index += @page_size
31
- end
32
- end
33
-
34
- def ices_url
35
- SolrEnvironments[@environment][:ices_url]
36
- end
37
-
38
- def get_results_from_ices(start_index)
39
- get_results build_csw_request('results', @page_size, start_index), '//gmd:MD_Metadata'
40
- end
41
-
42
- def get_docs_with_translated_entries_from_ices(entries)
43
- entries.map do |entry|
44
- create_new_solr_add_doc_with_child(@translator.translate(entry).root)
45
- end
46
- end
47
-
48
- def build_csw_request(resultType = 'results', maxRecords = '25', startPosition = '1')
49
- Helpers::CswIsoQueryBuilder.get_query_string(ices_url,
50
- 'resultType' => resultType,
51
- 'maxRecords' => maxRecords,
52
- 'startPosition' => startPosition,
53
- 'constraintLanguage' => 'CQL_TEXT',
54
- 'outputSchema' => 'http://www.isotc211.org/2005/gmd')
55
- end
56
- end
57
- end
58
- end
@@ -1,62 +0,0 @@
1
- require_relative 'base'
2
- require_relative '../helpers/csw_iso_query_builder'
3
-
4
- module SearchSolrTools
5
- module Harvesters
6
- # Harvests data from NODC PALEO and inserts it into Solr after it has been translated
7
- class NcdcPaleo < Base
8
- def initialize(env = 'development', die_on_failure = false)
9
- super env, die_on_failure
10
- @page_size = 50
11
- @translator = Helpers::IsoToSolr.new :ncdc_paleo
12
- end
13
-
14
- def harvest_and_delete
15
- puts "Running harvest of NCDC Paleo catalog from #{ncdc_paleo_url}"
16
- super(method(:harvest_ncdc_paleo_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:NCDC_PALEO][:long_name]}\"")
17
- end
18
-
19
- def harvest_ncdc_paleo_into_solr
20
- start_index = 1
21
- while (entries = get_results_from_ncdc_paleo_url(start_index)) && (entries.length > 0)
22
- begin
23
- insert_solr_docs get_docs_with_translated_entries_from_ncdc_paleo(entries)
24
- rescue => e
25
- puts "ERROR: #{e}"
26
- raise e if @die_on_failure
27
- end
28
- start_index += @page_size
29
- end
30
- end
31
-
32
- def ncdc_paleo_url
33
- SolrEnvironments[@environment][:ncdc_paleo_url]
34
- end
35
-
36
- def get_results_from_ncdc_paleo_url(start_index)
37
- get_results build_csw_request('results', @page_size, start_index), '//csw:Record'
38
- end
39
-
40
- def get_docs_with_translated_entries_from_ncdc_paleo(entries)
41
- auth_ids = entries.map { |e| e.xpath("./dc:identifier[@scheme='urn:x-esri:specification:ServiceType:ArcIMS:Metadata:DocID']").text }
42
-
43
- auth_ids.map do |record|
44
- result_xml = get_results("https://gis.ncdc.noaa.gov/gptpaleo/csw?getxml=#{record}",
45
- '/rdf:RDF/rdf:Description').first
46
- solr_doc = create_new_solr_add_doc_with_child(@translator.translate(result_xml).root)
47
- insert_node = solr_doc.at_xpath('//doc')
48
- insert_node.add_child("<field name='authoritative_id'>#{record}</field>")
49
- insert_node.add_child("<field name='dataset_url'>https://gis.ncdc.noaa.gov/gptpaleo/catalog/search/resource/details.page?uuid=#{record}")
50
- solr_doc.root
51
- end
52
- end
53
-
54
- def build_csw_request(resultType = 'results', maxRecords = '1000', startPosition = '1')
55
- Helpers::CswIsoQueryBuilder.get_query_string(ncdc_paleo_url,
56
- 'resultType' => resultType,
57
- 'maxRecords' => maxRecords,
58
- 'startPosition' => startPosition)
59
- end
60
- end
61
- end
62
- end
@@ -1,34 +0,0 @@
1
- require_relative 'oai'
2
-
3
- module SearchSolrTools
4
- module Harvesters
5
- class Nmi < Oai
6
- def initialize(env = 'development', die_on_failure = false)
7
- super
8
- @data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:NMI][:long_name]
9
- @translator = Helpers::IsoToSolr.new :nmi
10
- end
11
-
12
- def metadata_url
13
- SolrEnvironments[@environment][:nmi_url]
14
- end
15
-
16
- # resumption_token must be empty to stop the harvest loop; NMI's feed does not
17
- # provide any resumption token and gets all the records in just one go
18
- def results
19
- @resumption_token = ''
20
- list_records_oai_response = get_results(request_string, '//oai:ListRecords', '')
21
- list_records_oai_response.xpath('.//oai:record', Helpers::IsoNamespaces.namespaces)
22
- end
23
-
24
- private
25
-
26
- def request_params
27
- {
28
- verb: 'ListRecords',
29
- metadataPrefix: 'dif'
30
- }
31
- end
32
- end
33
- end
34
- end
@@ -1,75 +0,0 @@
1
- require_relative 'base'
2
- require_relative '../helpers/csw_iso_query_builder'
3
-
4
- module SearchSolrTools
5
- module Harvesters
6
- # Harvests data from NODC and inserts it into Solr after it has been translated
7
- class Nodc < Base
8
- def initialize(env = 'development', die_on_failure = false)
9
- super env, die_on_failure
10
- @page_size = 50
11
- @translator = Helpers::IsoToSolr.new :nodc
12
- end
13
-
14
- def harvest_and_delete
15
- puts "Running harvest of NODC catalog from #{nodc_url}"
16
- super(method(:harvest_nodc_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:NODC][:long_name]}\"")
17
- end
18
-
19
- # get translated entries from NODC and add them to Solr
20
- # this is the main entry point for the class
21
- def harvest_nodc_into_solr
22
- start_index = 1
23
- while (entries = get_results_from_nodc(start_index)) && (entries.length > 0)
24
- begin
25
- insert_solr_docs get_docs_with_translated_entries_from_nodc(entries)
26
- rescue => e
27
- puts "ERROR: #{e}"
28
- raise e if @die_on_failure
29
- end
30
- start_index += @page_size
31
- end
32
- end
33
-
34
- def nodc_url
35
- SolrEnvironments[@environment][:nodc_url]
36
- end
37
-
38
- def get_results_from_nodc(start_index)
39
- get_results build_csw_request('results', @page_size, start_index), '//gmi:MI_Metadata'
40
- end
41
-
42
- def get_docs_with_translated_entries_from_nodc(entries)
43
- entries.map do |entry|
44
- create_new_solr_add_doc_with_child(@translator.translate(entry).root)
45
- end
46
- end
47
-
48
- def build_csw_request(resultType = 'results', maxRecords = '25', startPosition = '1')
49
- Helpers::CswIsoQueryBuilder.get_query_string(nodc_url,
50
- 'resultType' => resultType,
51
- 'maxRecords' => maxRecords,
52
- 'startPosition' => startPosition,
53
- 'constraint' => bbox_constraint,
54
- 'outputSchema' => 'http://www.isotc211.org/2005/gmd')
55
- end
56
-
57
- def bbox_constraint
58
- bbox = {
59
- west: '-180',
60
- south: '45',
61
- east: '180',
62
- north: '90'
63
- }
64
-
65
- URI.encode '<Filter xmlns:ogc="http://www.opengis.net/ogc" ' \
66
- 'xmlns:gml="http://www.opengis.net/gml" ' \
67
- 'xmlns:apiso="http://www.opengis.net/cat/csw/apiso/1.0">' \
68
- '<ogc:BBOX><PropertyName>apiso:BoundingBox</PropertyName><gml:Envelope>' \
69
- '<gml:lowerCorner>' + bbox[:west] + ' ' + bbox[:south] + '</gml:lowerCorner>' \
70
- '<gml:upperCorner>' + bbox[:east] + ' ' + bbox[:north] + '</gml:upperCorner>' \
71
- '</gml:Envelope></ogc:BBOX></Filter>'
72
- end
73
- end
74
- end
75
- end
@@ -1,62 +0,0 @@
1
- require_relative 'base'
2
- require_relative '../helpers/query_builder'
3
-
4
- require 'json'
5
-
6
- module SearchSolrTools
7
- module Harvesters
8
- # Base class for harvesting Oai feeds into SOLR
9
- class Oai < Base
10
- # Used in query string params, resumptionToken
11
-
12
- def initialize(env = 'development', die_on_failure = false)
13
- super env, die_on_failure
14
- # This is updated when we harvest based on the response
15
- # from the server.
16
- @resumption_token = nil
17
- end
18
-
19
- def encode_data_provider_url(url)
20
- URI.encode(url)
21
- end
22
-
23
- def harvest_and_delete
24
- puts "Running #{self.class.name} at #{metadata_url}"
25
- super(method(:harvest), %(data_centers:"#{@data_centers}"))
26
- end
27
-
28
- def harvest
29
- while @resumption_token.nil? || !@resumption_token.empty?
30
- begin
31
- insert_solr_docs(translated_docs(results))
32
- rescue => e
33
- puts "ERROR: #{e.class} #{e}"
34
- raise e if @die_on_failure
35
- end
36
- end
37
- end
38
-
39
- def results
40
- fail NotImplementedError
41
- end
42
-
43
- def metadata_url
44
- fail NotImplementedError
45
- end
46
-
47
- def translated_docs(entries)
48
- entries.map { |e| create_new_solr_add_doc_with_child(@translator.translate(e).root) }
49
- end
50
-
51
- private
52
-
53
- def request_params
54
- fail NotImplementedError
55
- end
56
-
57
- def request_string
58
- "#{metadata_url}#{Helpers::QueryBuilder.build(request_params)}"
59
- end
60
- end
61
- end
62
- end
@@ -1,40 +0,0 @@
1
- require_relative 'oai'
2
-
3
- module SearchSolrTools
4
- module Harvesters
5
- # Harvests data from Polar data catalogue and inserts it into
6
- # Solr after it has been translated
7
- class Pdc < Oai
8
- def initialize(env = 'development', die_on_failure = false)
9
- super
10
- @data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:PDC][:long_name]
11
- @translator = Helpers::IsoToSolr.new :pdc
12
- end
13
-
14
- def metadata_url
15
- SolrEnvironments[@environment][:pdc_url]
16
- end
17
-
18
- def results
19
- list_records_oai_response = get_results(request_string, '//oai:ListRecords', '')
20
-
21
- @resumption_token = list_records_oai_response.xpath('.//oai:resumptionToken', Helpers::IsoNamespaces.namespaces).first.text
22
-
23
- list_records_oai_response.xpath('.//oai:record', Helpers::IsoNamespaces.namespaces)
24
- end
25
-
26
- private
27
-
28
- def request_params
29
- # If a 'resumptionToken' is supplied with any arguments other than 'verb',
30
- # the response from PDC gives a badArgument error, saying "The argument
31
- # 'resumptionToken' must be supplied without other arguments"
32
- {
33
- verb: 'ListRecords',
34
- metadataPrefix: @resumption_token.nil? ? 'iso' : nil,
35
- resumptionToken: @resumption_token
36
- }.delete_if { |_k, v| v.nil? }
37
- end
38
- end
39
- end
40
- end
@@ -1,61 +0,0 @@
1
- require_relative 'base'
2
-
3
- require 'nokogiri'
4
- require 'rest-client'
5
-
6
- module SearchSolrTools
7
- module Harvesters
8
- class R2R < Base
9
- def initialize(env = 'development', die_on_failure = false)
10
- super
11
- @data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:R2R][:long_name]
12
- @translator = Helpers::IsoToSolr.new :r2r
13
- @metadata_url = SolrEnvironments[@environment][:r2r_url]
14
- end
15
-
16
- def harvest_and_delete
17
- puts "Running #{self.class.name} at #{@metadata_url}"
18
- super(method(:harvest), %(data_centers:"#{@data_centers}"))
19
- end
20
-
21
- # rubocop: disable MethodLength
22
- # rubocop: disable AbcSize
23
- def harvest
24
- # first fetch list of available records at http://get.rvdata.us/services/cruise/
25
- # then loop through each one of those, using the root <gmi:MI_Metadata> tag
26
- puts "Getting list of records from #{@data_centers}"
27
- RestClient.get(@metadata_url) do |resp, _req, _result, &_block|
28
- unless resp.code == 200
29
- puts "Got code #{resp.code} from #{@metadata_url}, skipping R2R harvest."
30
- next
31
- end
32
-
33
- doc = Nokogiri::HTML(resp.body)
34
-
35
- urls = doc.xpath('//a').map do |node|
36
- "#{@metadata_url}#{node.attr('href')}"
37
- end
38
-
39
- urls.each_slice(50) do |url_subset|
40
- # each result is a nokogirii doc with root element
41
- # <gmi:MI_Metadata>
42
- results = url_subset.map do |url|
43
- get_results(url, '//gmi:MI_Metadata').first
44
- end
45
-
46
- begin
47
- translated = results.map do |e|
48
- create_new_solr_add_doc_with_child(@translator.translate(e).root)
49
- end
50
-
51
- insert_solr_docs(translated)
52
- rescue => e
53
- puts "ERROR: #{e}"
54
- raise e if @die_on_failure
55
- end
56
- end
57
- end
58
- end
59
- end
60
- end
61
- end