search_solr_tools 6.1.0 → 6.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -2
- data/bin/search_solr_tools +5 -17
- data/lib/search_solr_tools/config/environments.rb +3 -1
- data/lib/search_solr_tools/config/environments.yaml +0 -32
- data/lib/search_solr_tools/errors/harvest_error.rb +44 -31
- data/lib/search_solr_tools/harvesters/auto_suggest.rb +5 -3
- data/lib/search_solr_tools/harvesters/base.rb +21 -20
- data/lib/search_solr_tools/harvesters/nsidc_auto_suggest.rb +7 -5
- data/lib/search_solr_tools/harvesters/nsidc_json.rb +9 -8
- data/lib/search_solr_tools/helpers/bounding_box_util.rb +8 -8
- data/lib/search_solr_tools/helpers/facet_configuration.rb +3 -1
- data/lib/search_solr_tools/helpers/harvest_status.rb +10 -8
- data/lib/search_solr_tools/helpers/iso_namespaces.rb +3 -1
- data/lib/search_solr_tools/helpers/solr_format.rb +25 -45
- data/lib/search_solr_tools/helpers/translate_spatial_coverage.rb +13 -10
- data/lib/search_solr_tools/helpers/translate_temporal_coverage.rb +2 -0
- data/lib/search_solr_tools/translators/nsidc_json.rb +48 -44
- data/lib/search_solr_tools/version.rb +3 -1
- data/lib/search_solr_tools.rb +3 -2
- metadata +3 -45
- data/lib/search_solr_tools/harvesters/adc.rb +0 -49
- data/lib/search_solr_tools/harvesters/ade_auto_suggest.rb +0 -46
- data/lib/search_solr_tools/harvesters/bcodmo.rb +0 -64
- data/lib/search_solr_tools/harvesters/data_one.rb +0 -49
- data/lib/search_solr_tools/harvesters/echo.rb +0 -52
- data/lib/search_solr_tools/harvesters/eol.rb +0 -51
- data/lib/search_solr_tools/harvesters/gtnp.rb +0 -67
- data/lib/search_solr_tools/harvesters/ices.rb +0 -58
- data/lib/search_solr_tools/harvesters/ncdc_paleo.rb +0 -62
- data/lib/search_solr_tools/harvesters/nmi.rb +0 -34
- data/lib/search_solr_tools/harvesters/nodc.rb +0 -75
- data/lib/search_solr_tools/harvesters/oai.rb +0 -62
- data/lib/search_solr_tools/harvesters/pdc.rb +0 -40
- data/lib/search_solr_tools/harvesters/r2r.rb +0 -61
- data/lib/search_solr_tools/harvesters/rda.rb +0 -35
- data/lib/search_solr_tools/harvesters/tdar.rb +0 -71
- data/lib/search_solr_tools/harvesters/usgs.rb +0 -76
- data/lib/search_solr_tools/helpers/csw_iso_query_builder.rb +0 -29
- data/lib/search_solr_tools/helpers/data_one_format.rb +0 -74
- data/lib/search_solr_tools/helpers/iso_to_solr.rb +0 -97
- data/lib/search_solr_tools/helpers/iso_to_solr_format.rb +0 -197
- data/lib/search_solr_tools/helpers/ncdc_paleo_format.rb +0 -61
- data/lib/search_solr_tools/helpers/query_builder.rb +0 -13
- data/lib/search_solr_tools/helpers/r2r_format.rb +0 -25
- data/lib/search_solr_tools/helpers/selectors.rb +0 -22
- data/lib/search_solr_tools/helpers/tdar_format.rb +0 -70
- data/lib/search_solr_tools/helpers/usgs_format.rb +0 -50
- data/lib/search_solr_tools/selectors/adc.rb +0 -96
- data/lib/search_solr_tools/selectors/data_one.rb +0 -96
- data/lib/search_solr_tools/selectors/echo_iso.rb +0 -112
- data/lib/search_solr_tools/selectors/ices_iso.rb +0 -108
- data/lib/search_solr_tools/selectors/ncdc_paleo.rb +0 -90
- data/lib/search_solr_tools/selectors/nmi.rb +0 -107
- data/lib/search_solr_tools/selectors/nodc_iso.rb +0 -108
- data/lib/search_solr_tools/selectors/pdc_iso.rb +0 -109
- data/lib/search_solr_tools/selectors/r2r.rb +0 -115
- data/lib/search_solr_tools/selectors/rda.rb +0 -107
- data/lib/search_solr_tools/selectors/tdar_opensearch.rb +0 -91
- data/lib/search_solr_tools/selectors/usgs_iso.rb +0 -107
- data/lib/search_solr_tools/translators/bcodmo_json.rb +0 -89
- data/lib/search_solr_tools/translators/eol_to_solr.rb +0 -84
- data/lib/search_solr_tools/translators/gtnp_json.rb +0 -59
@@ -1,64 +0,0 @@
|
|
1
|
-
require_relative 'base'
|
2
|
-
require 'json'
|
3
|
-
require 'rest-client'
|
4
|
-
|
5
|
-
module SearchSolrTools
|
6
|
-
module Harvesters
|
7
|
-
# Harvests data from BcoDmo endpoint, translates and adds it to solr
|
8
|
-
class BcoDmo < Base
|
9
|
-
def initialize(env = 'development', die_on_failure = false)
|
10
|
-
super env, die_on_failure
|
11
|
-
@translator = Translators::BcodmoJsonToSolr.new
|
12
|
-
@wkt_parser = RGeo::WKRep::WKTParser.new(nil, {}) # (factory_generator_=nil,
|
13
|
-
end
|
14
|
-
|
15
|
-
def harvest_and_delete
|
16
|
-
puts "Running harvest of BCO-DMO catalog from #{bcodmo_url}"
|
17
|
-
super(method(:harvest_bcodmo_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:BCODMO][:long_name]}\"")
|
18
|
-
end
|
19
|
-
|
20
|
-
def bcodmo_url
|
21
|
-
SolrEnvironments[@environment][:bcodmo_url]
|
22
|
-
end
|
23
|
-
|
24
|
-
def harvest_bcodmo_into_solr
|
25
|
-
result = translate_bcodmo
|
26
|
-
insert_solr_docs(result[:add_docs], Base::JSON_CONTENT_TYPE)
|
27
|
-
|
28
|
-
errors_exist = result[:failure_ids].length > 0
|
29
|
-
fail 'Failed to harvest some records from BCO-DMO' if errors_exist && @die_on_failure
|
30
|
-
end
|
31
|
-
|
32
|
-
def translate_bcodmo
|
33
|
-
documents = []
|
34
|
-
failure_ids = []
|
35
|
-
request_json(SolrEnvironments[@environment][:bcodmo_url]).each do |record|
|
36
|
-
geometry = request_json(record['geometryUrl'])
|
37
|
-
results = parse_record(record, geometry)
|
38
|
-
results[:documents].each { |d| documents << d }
|
39
|
-
results[:failure_ids].each { |id| failure_ids << id }
|
40
|
-
end
|
41
|
-
{ add_docs: documents, failure_ids: failure_ids }
|
42
|
-
end
|
43
|
-
|
44
|
-
def request_json(url)
|
45
|
-
puts "Request: #{url}"
|
46
|
-
JSON.parse(RestClient.get(url))
|
47
|
-
end
|
48
|
-
|
49
|
-
def parse_record(record, geometry)
|
50
|
-
documents = []
|
51
|
-
failure_ids = []
|
52
|
-
begin
|
53
|
-
JSON.parse(RestClient.get(record['datasets'])).each do |dataset|
|
54
|
-
documents << { 'add' => { 'doc' => @translator.translate(dataset, record, geometry) } }
|
55
|
-
end
|
56
|
-
rescue => e
|
57
|
-
puts "Failed to add record #{record['id']} with error #{e} (#{e.message}) : #{e.backtrace.join("\n")}"
|
58
|
-
failure_ids << record['id']
|
59
|
-
end
|
60
|
-
{ documents: documents, failure_ids: failure_ids }
|
61
|
-
end
|
62
|
-
end
|
63
|
-
end
|
64
|
-
end
|
@@ -1,49 +0,0 @@
|
|
1
|
-
require_relative 'base'
|
2
|
-
|
3
|
-
module SearchSolrTools
|
4
|
-
module Harvesters
|
5
|
-
class DataOne < Base
|
6
|
-
def initialize(env = 'development', die_on_failure = false)
|
7
|
-
super
|
8
|
-
@page_size = 250
|
9
|
-
@translator = Helpers::IsoToSolr.new :data_one
|
10
|
-
end
|
11
|
-
|
12
|
-
def harvest_and_delete
|
13
|
-
puts "Running harvest of dataONE catalog from #{metadata_url}"
|
14
|
-
super(method(:harvest_data_one_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:DATA_ONE][:long_name]}\"")
|
15
|
-
end
|
16
|
-
|
17
|
-
def harvest_data_one_into_solr
|
18
|
-
start = 0
|
19
|
-
while (entries = get_results_from_data_one(start)) && (entries.length > 0)
|
20
|
-
begin
|
21
|
-
insert_solr_docs(get_docs_with_translated_entries_from_data_one(entries))
|
22
|
-
rescue => e
|
23
|
-
puts "ERROR: #{e}\n\n"
|
24
|
-
raise e if @die_on_failure
|
25
|
-
end
|
26
|
-
start += @page_size
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
def get_results_from_data_one(start)
|
31
|
-
get_results(build_request(start, @page_size), './response/result/doc')
|
32
|
-
end
|
33
|
-
|
34
|
-
def metadata_url
|
35
|
-
SolrEnvironments[@environment][:data_one_url]
|
36
|
-
end
|
37
|
-
|
38
|
-
def get_docs_with_translated_entries_from_data_one(entries)
|
39
|
-
entries.map do |e|
|
40
|
-
create_new_solr_add_doc_with_child(@translator.translate(e).root)
|
41
|
-
end
|
42
|
-
end
|
43
|
-
|
44
|
-
def build_request(start = 0, max_records = 100)
|
45
|
-
"#{metadata_url}&start=#{start}&rows=#{max_records}"
|
46
|
-
end
|
47
|
-
end
|
48
|
-
end
|
49
|
-
end
|
@@ -1,52 +0,0 @@
|
|
1
|
-
require_relative 'base'
|
2
|
-
|
3
|
-
module SearchSolrTools
|
4
|
-
module Harvesters
|
5
|
-
# Harvests data from ECHO and inserts it into Solr after it has been translated
|
6
|
-
class Echo < Base
|
7
|
-
def initialize(env = 'development', die_on_failure = false)
|
8
|
-
super env, die_on_failure
|
9
|
-
@page_size = 100
|
10
|
-
@translator = Helpers::IsoToSolr.new :echo
|
11
|
-
end
|
12
|
-
|
13
|
-
def harvest_and_delete
|
14
|
-
puts "Running harvest of ECHO catalog from #{echo_url}"
|
15
|
-
super(method(:harvest_echo_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:ECHO][:long_name]}\"")
|
16
|
-
end
|
17
|
-
|
18
|
-
# get translated entries from ECHO and add them to Solr
|
19
|
-
# this is the main entry point for the class
|
20
|
-
def harvest_echo_into_solr
|
21
|
-
page_num = 1
|
22
|
-
while (entries = get_results_from_echo(page_num)) && (entries.length > 0)
|
23
|
-
begin
|
24
|
-
insert_solr_docs get_docs_with_translated_entries_from_echo(entries)
|
25
|
-
rescue => e
|
26
|
-
puts "ERROR: #{e}\n\n"
|
27
|
-
raise e if @die_on_failure
|
28
|
-
end
|
29
|
-
page_num += 1
|
30
|
-
end
|
31
|
-
end
|
32
|
-
|
33
|
-
def echo_url
|
34
|
-
SolrEnvironments[@environment][:echo_url]
|
35
|
-
end
|
36
|
-
|
37
|
-
def get_results_from_echo(page_num)
|
38
|
-
get_results build_request(@page_size, page_num), './/results/result', 'application/echo10+xml'
|
39
|
-
end
|
40
|
-
|
41
|
-
def get_docs_with_translated_entries_from_echo(entries)
|
42
|
-
entries.map do |entry|
|
43
|
-
create_new_solr_add_doc_with_child(@translator.translate(entry).root)
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
def build_request(max_records = '25', page_num = '1')
|
48
|
-
echo_url + '&page_size=' + max_records.to_s + '&page_num=' + page_num.to_s
|
49
|
-
end
|
50
|
-
end
|
51
|
-
end
|
52
|
-
end
|
@@ -1,51 +0,0 @@
|
|
1
|
-
require_relative 'base'
|
2
|
-
require 'json'
|
3
|
-
require 'rgeo/geo_json'
|
4
|
-
|
5
|
-
module SearchSolrTools
|
6
|
-
module Harvesters
|
7
|
-
class Eol < Base
|
8
|
-
def initialize(env = 'development', die_on_failure = false)
|
9
|
-
super env, die_on_failure
|
10
|
-
@translator = SearchSolrTools::Translators::EolToSolr.new
|
11
|
-
end
|
12
|
-
|
13
|
-
def harvest_and_delete
|
14
|
-
puts 'Running harvest of EOL catalog using the following configured EOL URLs:'
|
15
|
-
SearchSolrTools::SolrEnvironments[:common][:eol].each { |x| puts x }
|
16
|
-
super(method(:harvest_eol_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:EOL][:long_name]}\"")
|
17
|
-
end
|
18
|
-
|
19
|
-
def harvest_eol_into_solr
|
20
|
-
solr_add_queries = eol_dataset_urls.map do |dataset|
|
21
|
-
begin
|
22
|
-
doc = open_xml_document(dataset)
|
23
|
-
if doc.xpath('//xmlns:metadata').size > 1
|
24
|
-
# THREDDS allows for a dataset of datasests, EOL should not utilize this
|
25
|
-
fail "Complex dataset encountered at #{doc.xpath('//xmlns:catalog').to_html}"
|
26
|
-
end
|
27
|
-
metadata_doc = open_xml_document(doc.xpath('//xmlns:metadata')[0]['xlink:href'])
|
28
|
-
{ 'add' => { 'doc' => @translator.translate(doc, metadata_doc) } }
|
29
|
-
rescue => e
|
30
|
-
puts "ERROR: #{e}"
|
31
|
-
puts "Failed to translate this record: #{doc} -> #{metadata_doc}"
|
32
|
-
raise e if @die_on_failure
|
33
|
-
next
|
34
|
-
end
|
35
|
-
end
|
36
|
-
insert_solr_docs solr_add_queries, Base::JSON_CONTENT_TYPE
|
37
|
-
end
|
38
|
-
|
39
|
-
def eol_dataset_urls
|
40
|
-
SearchSolrTools::SolrEnvironments[:common][:eol].flat_map do |endpoint|
|
41
|
-
doc = open_xml_document(endpoint)
|
42
|
-
doc.xpath('//xmlns:catalogRef').map { |node| node['xlink:href'] }
|
43
|
-
end
|
44
|
-
end
|
45
|
-
|
46
|
-
def open_xml_document(url)
|
47
|
-
Nokogiri::XML(open(url), &:strict)
|
48
|
-
end
|
49
|
-
end
|
50
|
-
end
|
51
|
-
end
|
@@ -1,67 +0,0 @@
|
|
1
|
-
require_relative 'base'
|
2
|
-
require 'json'
|
3
|
-
require 'rest-client'
|
4
|
-
|
5
|
-
module SearchSolrTools
|
6
|
-
module Harvesters
|
7
|
-
# Harvests data from GTN-P endpoints, translates and adds it to solr
|
8
|
-
class GtnP < Base
|
9
|
-
def initialize(env = 'development', die_on_failure = false)
|
10
|
-
super env, die_on_failure
|
11
|
-
@translator = Translators::GtnpJsonToSolr.new
|
12
|
-
end
|
13
|
-
|
14
|
-
def gtnp_service_urls
|
15
|
-
json_records = []
|
16
|
-
SearchSolrTools::SolrEnvironments[:common][:gtnp].flat_map do |endpoint|
|
17
|
-
record = request_json(endpoint)
|
18
|
-
json_records << record
|
19
|
-
end
|
20
|
-
json_records
|
21
|
-
end
|
22
|
-
|
23
|
-
def harvest_and_delete
|
24
|
-
puts 'Running harvest of GTN-P catalog using the following configured GTN-P URLs:'
|
25
|
-
SearchSolrTools::SolrEnvironments[:common][:gtnp].each { |x| puts x }
|
26
|
-
super(method(:harvest_gtnp_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:GTNP][:long_name]}\"")
|
27
|
-
end
|
28
|
-
|
29
|
-
def harvest_gtnp_into_solr
|
30
|
-
result = translate_gtnp
|
31
|
-
insert_solr_docs result[:add_docs], Base::JSON_CONTENT_TYPE
|
32
|
-
fail 'Failed to harvest some records from the provider' if result[:failure_ids].length > 0
|
33
|
-
end
|
34
|
-
|
35
|
-
def translate_gtnp
|
36
|
-
documents = []
|
37
|
-
failure_ids = []
|
38
|
-
gtnp_records = gtnp_service_urls
|
39
|
-
gtnp_records.each do |record|
|
40
|
-
results = parse_record(record)
|
41
|
-
results[:documents].each { |d| documents << d }
|
42
|
-
results[:failure_ids].each { |id| failure_ids << id }
|
43
|
-
end
|
44
|
-
{ add_docs: documents, failure_ids: failure_ids }
|
45
|
-
end
|
46
|
-
|
47
|
-
def request_json(url)
|
48
|
-
JSON.parse(RestClient.get(url))
|
49
|
-
end
|
50
|
-
|
51
|
-
def parse_record(record)
|
52
|
-
documents = []
|
53
|
-
failure_ids = []
|
54
|
-
begin
|
55
|
-
record.drop(1).each do |dataset|
|
56
|
-
trans_doc = @translator.translate(dataset, record[0])
|
57
|
-
documents << { 'add' => { 'doc' => trans_doc } }
|
58
|
-
end
|
59
|
-
rescue => e
|
60
|
-
puts "Failed to add record #{record[0][:title]} with error #{e} (#{e.message}) : #{e.backtrace.join("\n")}"
|
61
|
-
failure_ids << record[0][:title]
|
62
|
-
end
|
63
|
-
{ documents: documents, failure_ids: failure_ids }
|
64
|
-
end
|
65
|
-
end
|
66
|
-
end
|
67
|
-
end
|
@@ -1,58 +0,0 @@
|
|
1
|
-
require_relative 'base'
|
2
|
-
require_relative '../helpers/csw_iso_query_builder'
|
3
|
-
|
4
|
-
module SearchSolrTools
|
5
|
-
module Harvesters
|
6
|
-
# Harvests data from ICES and inserts it into Solr after it has been translated
|
7
|
-
class Ices < Base
|
8
|
-
def initialize(env = 'development', die_on_failure = false)
|
9
|
-
super env, die_on_failure
|
10
|
-
@page_size = 100
|
11
|
-
@translator = Helpers::IsoToSolr.new :ices
|
12
|
-
end
|
13
|
-
|
14
|
-
def harvest_and_delete
|
15
|
-
puts "Running harvest of ICES catalog from #{ices_url}"
|
16
|
-
super(method(:harvest_ices_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:ICES][:long_name]}\"")
|
17
|
-
end
|
18
|
-
|
19
|
-
# get translated entries from ICES and add them to Solr
|
20
|
-
# this is the main entry point for the class
|
21
|
-
def harvest_ices_into_solr
|
22
|
-
start_index = 1
|
23
|
-
while (entries = get_results_from_ices(start_index)) && (entries.length > 0)
|
24
|
-
begin
|
25
|
-
insert_solr_docs get_docs_with_translated_entries_from_ices(entries)
|
26
|
-
rescue => e
|
27
|
-
puts "ERROR: #{e}"
|
28
|
-
raise e if @die_on_failure
|
29
|
-
end
|
30
|
-
start_index += @page_size
|
31
|
-
end
|
32
|
-
end
|
33
|
-
|
34
|
-
def ices_url
|
35
|
-
SolrEnvironments[@environment][:ices_url]
|
36
|
-
end
|
37
|
-
|
38
|
-
def get_results_from_ices(start_index)
|
39
|
-
get_results build_csw_request('results', @page_size, start_index), '//gmd:MD_Metadata'
|
40
|
-
end
|
41
|
-
|
42
|
-
def get_docs_with_translated_entries_from_ices(entries)
|
43
|
-
entries.map do |entry|
|
44
|
-
create_new_solr_add_doc_with_child(@translator.translate(entry).root)
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
def build_csw_request(resultType = 'results', maxRecords = '25', startPosition = '1')
|
49
|
-
Helpers::CswIsoQueryBuilder.get_query_string(ices_url,
|
50
|
-
'resultType' => resultType,
|
51
|
-
'maxRecords' => maxRecords,
|
52
|
-
'startPosition' => startPosition,
|
53
|
-
'constraintLanguage' => 'CQL_TEXT',
|
54
|
-
'outputSchema' => 'http://www.isotc211.org/2005/gmd')
|
55
|
-
end
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
@@ -1,62 +0,0 @@
|
|
1
|
-
require_relative 'base'
|
2
|
-
require_relative '../helpers/csw_iso_query_builder'
|
3
|
-
|
4
|
-
module SearchSolrTools
|
5
|
-
module Harvesters
|
6
|
-
# Harvests data from NODC PALEO and inserts it into Solr after it has been translated
|
7
|
-
class NcdcPaleo < Base
|
8
|
-
def initialize(env = 'development', die_on_failure = false)
|
9
|
-
super env, die_on_failure
|
10
|
-
@page_size = 50
|
11
|
-
@translator = Helpers::IsoToSolr.new :ncdc_paleo
|
12
|
-
end
|
13
|
-
|
14
|
-
def harvest_and_delete
|
15
|
-
puts "Running harvest of NCDC Paleo catalog from #{ncdc_paleo_url}"
|
16
|
-
super(method(:harvest_ncdc_paleo_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:NCDC_PALEO][:long_name]}\"")
|
17
|
-
end
|
18
|
-
|
19
|
-
def harvest_ncdc_paleo_into_solr
|
20
|
-
start_index = 1
|
21
|
-
while (entries = get_results_from_ncdc_paleo_url(start_index)) && (entries.length > 0)
|
22
|
-
begin
|
23
|
-
insert_solr_docs get_docs_with_translated_entries_from_ncdc_paleo(entries)
|
24
|
-
rescue => e
|
25
|
-
puts "ERROR: #{e}"
|
26
|
-
raise e if @die_on_failure
|
27
|
-
end
|
28
|
-
start_index += @page_size
|
29
|
-
end
|
30
|
-
end
|
31
|
-
|
32
|
-
def ncdc_paleo_url
|
33
|
-
SolrEnvironments[@environment][:ncdc_paleo_url]
|
34
|
-
end
|
35
|
-
|
36
|
-
def get_results_from_ncdc_paleo_url(start_index)
|
37
|
-
get_results build_csw_request('results', @page_size, start_index), '//csw:Record'
|
38
|
-
end
|
39
|
-
|
40
|
-
def get_docs_with_translated_entries_from_ncdc_paleo(entries)
|
41
|
-
auth_ids = entries.map { |e| e.xpath("./dc:identifier[@scheme='urn:x-esri:specification:ServiceType:ArcIMS:Metadata:DocID']").text }
|
42
|
-
|
43
|
-
auth_ids.map do |record|
|
44
|
-
result_xml = get_results("https://gis.ncdc.noaa.gov/gptpaleo/csw?getxml=#{record}",
|
45
|
-
'/rdf:RDF/rdf:Description').first
|
46
|
-
solr_doc = create_new_solr_add_doc_with_child(@translator.translate(result_xml).root)
|
47
|
-
insert_node = solr_doc.at_xpath('//doc')
|
48
|
-
insert_node.add_child("<field name='authoritative_id'>#{record}</field>")
|
49
|
-
insert_node.add_child("<field name='dataset_url'>https://gis.ncdc.noaa.gov/gptpaleo/catalog/search/resource/details.page?uuid=#{record}")
|
50
|
-
solr_doc.root
|
51
|
-
end
|
52
|
-
end
|
53
|
-
|
54
|
-
def build_csw_request(resultType = 'results', maxRecords = '1000', startPosition = '1')
|
55
|
-
Helpers::CswIsoQueryBuilder.get_query_string(ncdc_paleo_url,
|
56
|
-
'resultType' => resultType,
|
57
|
-
'maxRecords' => maxRecords,
|
58
|
-
'startPosition' => startPosition)
|
59
|
-
end
|
60
|
-
end
|
61
|
-
end
|
62
|
-
end
|
@@ -1,34 +0,0 @@
|
|
1
|
-
require_relative 'oai'
|
2
|
-
|
3
|
-
module SearchSolrTools
|
4
|
-
module Harvesters
|
5
|
-
class Nmi < Oai
|
6
|
-
def initialize(env = 'development', die_on_failure = false)
|
7
|
-
super
|
8
|
-
@data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:NMI][:long_name]
|
9
|
-
@translator = Helpers::IsoToSolr.new :nmi
|
10
|
-
end
|
11
|
-
|
12
|
-
def metadata_url
|
13
|
-
SolrEnvironments[@environment][:nmi_url]
|
14
|
-
end
|
15
|
-
|
16
|
-
# resumption_token must be empty to stop the harvest loop; NMI's feed does not
|
17
|
-
# provide any resumption token and gets all the records in just one go
|
18
|
-
def results
|
19
|
-
@resumption_token = ''
|
20
|
-
list_records_oai_response = get_results(request_string, '//oai:ListRecords', '')
|
21
|
-
list_records_oai_response.xpath('.//oai:record', Helpers::IsoNamespaces.namespaces)
|
22
|
-
end
|
23
|
-
|
24
|
-
private
|
25
|
-
|
26
|
-
def request_params
|
27
|
-
{
|
28
|
-
verb: 'ListRecords',
|
29
|
-
metadataPrefix: 'dif'
|
30
|
-
}
|
31
|
-
end
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|
@@ -1,75 +0,0 @@
|
|
1
|
-
require_relative 'base'
|
2
|
-
require_relative '../helpers/csw_iso_query_builder'
|
3
|
-
|
4
|
-
module SearchSolrTools
|
5
|
-
module Harvesters
|
6
|
-
# Harvests data from NODC and inserts it into Solr after it has been translated
|
7
|
-
class Nodc < Base
|
8
|
-
def initialize(env = 'development', die_on_failure = false)
|
9
|
-
super env, die_on_failure
|
10
|
-
@page_size = 50
|
11
|
-
@translator = Helpers::IsoToSolr.new :nodc
|
12
|
-
end
|
13
|
-
|
14
|
-
def harvest_and_delete
|
15
|
-
puts "Running harvest of NODC catalog from #{nodc_url}"
|
16
|
-
super(method(:harvest_nodc_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:NODC][:long_name]}\"")
|
17
|
-
end
|
18
|
-
|
19
|
-
# get translated entries from NODC and add them to Solr
|
20
|
-
# this is the main entry point for the class
|
21
|
-
def harvest_nodc_into_solr
|
22
|
-
start_index = 1
|
23
|
-
while (entries = get_results_from_nodc(start_index)) && (entries.length > 0)
|
24
|
-
begin
|
25
|
-
insert_solr_docs get_docs_with_translated_entries_from_nodc(entries)
|
26
|
-
rescue => e
|
27
|
-
puts "ERROR: #{e}"
|
28
|
-
raise e if @die_on_failure
|
29
|
-
end
|
30
|
-
start_index += @page_size
|
31
|
-
end
|
32
|
-
end
|
33
|
-
|
34
|
-
def nodc_url
|
35
|
-
SolrEnvironments[@environment][:nodc_url]
|
36
|
-
end
|
37
|
-
|
38
|
-
def get_results_from_nodc(start_index)
|
39
|
-
get_results build_csw_request('results', @page_size, start_index), '//gmi:MI_Metadata'
|
40
|
-
end
|
41
|
-
|
42
|
-
def get_docs_with_translated_entries_from_nodc(entries)
|
43
|
-
entries.map do |entry|
|
44
|
-
create_new_solr_add_doc_with_child(@translator.translate(entry).root)
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
def build_csw_request(resultType = 'results', maxRecords = '25', startPosition = '1')
|
49
|
-
Helpers::CswIsoQueryBuilder.get_query_string(nodc_url,
|
50
|
-
'resultType' => resultType,
|
51
|
-
'maxRecords' => maxRecords,
|
52
|
-
'startPosition' => startPosition,
|
53
|
-
'constraint' => bbox_constraint,
|
54
|
-
'outputSchema' => 'http://www.isotc211.org/2005/gmd')
|
55
|
-
end
|
56
|
-
|
57
|
-
def bbox_constraint
|
58
|
-
bbox = {
|
59
|
-
west: '-180',
|
60
|
-
south: '45',
|
61
|
-
east: '180',
|
62
|
-
north: '90'
|
63
|
-
}
|
64
|
-
|
65
|
-
URI.encode '<Filter xmlns:ogc="http://www.opengis.net/ogc" ' \
|
66
|
-
'xmlns:gml="http://www.opengis.net/gml" ' \
|
67
|
-
'xmlns:apiso="http://www.opengis.net/cat/csw/apiso/1.0">' \
|
68
|
-
'<ogc:BBOX><PropertyName>apiso:BoundingBox</PropertyName><gml:Envelope>' \
|
69
|
-
'<gml:lowerCorner>' + bbox[:west] + ' ' + bbox[:south] + '</gml:lowerCorner>' \
|
70
|
-
'<gml:upperCorner>' + bbox[:east] + ' ' + bbox[:north] + '</gml:upperCorner>' \
|
71
|
-
'</gml:Envelope></ogc:BBOX></Filter>'
|
72
|
-
end
|
73
|
-
end
|
74
|
-
end
|
75
|
-
end
|
@@ -1,62 +0,0 @@
|
|
1
|
-
require_relative 'base'
|
2
|
-
require_relative '../helpers/query_builder'
|
3
|
-
|
4
|
-
require 'json'
|
5
|
-
|
6
|
-
module SearchSolrTools
|
7
|
-
module Harvesters
|
8
|
-
# Base class for harvesting Oai feeds into SOLR
|
9
|
-
class Oai < Base
|
10
|
-
# Used in query string params, resumptionToken
|
11
|
-
|
12
|
-
def initialize(env = 'development', die_on_failure = false)
|
13
|
-
super env, die_on_failure
|
14
|
-
# This is updated when we harvest based on the response
|
15
|
-
# from the server.
|
16
|
-
@resumption_token = nil
|
17
|
-
end
|
18
|
-
|
19
|
-
def encode_data_provider_url(url)
|
20
|
-
URI.encode(url)
|
21
|
-
end
|
22
|
-
|
23
|
-
def harvest_and_delete
|
24
|
-
puts "Running #{self.class.name} at #{metadata_url}"
|
25
|
-
super(method(:harvest), %(data_centers:"#{@data_centers}"))
|
26
|
-
end
|
27
|
-
|
28
|
-
def harvest
|
29
|
-
while @resumption_token.nil? || !@resumption_token.empty?
|
30
|
-
begin
|
31
|
-
insert_solr_docs(translated_docs(results))
|
32
|
-
rescue => e
|
33
|
-
puts "ERROR: #{e.class} #{e}"
|
34
|
-
raise e if @die_on_failure
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
def results
|
40
|
-
fail NotImplementedError
|
41
|
-
end
|
42
|
-
|
43
|
-
def metadata_url
|
44
|
-
fail NotImplementedError
|
45
|
-
end
|
46
|
-
|
47
|
-
def translated_docs(entries)
|
48
|
-
entries.map { |e| create_new_solr_add_doc_with_child(@translator.translate(e).root) }
|
49
|
-
end
|
50
|
-
|
51
|
-
private
|
52
|
-
|
53
|
-
def request_params
|
54
|
-
fail NotImplementedError
|
55
|
-
end
|
56
|
-
|
57
|
-
def request_string
|
58
|
-
"#{metadata_url}#{Helpers::QueryBuilder.build(request_params)}"
|
59
|
-
end
|
60
|
-
end
|
61
|
-
end
|
62
|
-
end
|
@@ -1,40 +0,0 @@
|
|
1
|
-
require_relative 'oai'
|
2
|
-
|
3
|
-
module SearchSolrTools
|
4
|
-
module Harvesters
|
5
|
-
# Harvests data from Polar data catalogue and inserts it into
|
6
|
-
# Solr after it has been translated
|
7
|
-
class Pdc < Oai
|
8
|
-
def initialize(env = 'development', die_on_failure = false)
|
9
|
-
super
|
10
|
-
@data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:PDC][:long_name]
|
11
|
-
@translator = Helpers::IsoToSolr.new :pdc
|
12
|
-
end
|
13
|
-
|
14
|
-
def metadata_url
|
15
|
-
SolrEnvironments[@environment][:pdc_url]
|
16
|
-
end
|
17
|
-
|
18
|
-
def results
|
19
|
-
list_records_oai_response = get_results(request_string, '//oai:ListRecords', '')
|
20
|
-
|
21
|
-
@resumption_token = list_records_oai_response.xpath('.//oai:resumptionToken', Helpers::IsoNamespaces.namespaces).first.text
|
22
|
-
|
23
|
-
list_records_oai_response.xpath('.//oai:record', Helpers::IsoNamespaces.namespaces)
|
24
|
-
end
|
25
|
-
|
26
|
-
private
|
27
|
-
|
28
|
-
def request_params
|
29
|
-
# If a 'resumptionToken' is supplied with any arguments other than 'verb',
|
30
|
-
# the response from PDC gives a badArgument error, saying "The argument
|
31
|
-
# 'resumptionToken' must be supplied without other arguments"
|
32
|
-
{
|
33
|
-
verb: 'ListRecords',
|
34
|
-
metadataPrefix: @resumption_token.nil? ? 'iso' : nil,
|
35
|
-
resumptionToken: @resumption_token
|
36
|
-
}.delete_if { |_k, v| v.nil? }
|
37
|
-
end
|
38
|
-
end
|
39
|
-
end
|
40
|
-
end
|
@@ -1,61 +0,0 @@
|
|
1
|
-
require_relative 'base'
|
2
|
-
|
3
|
-
require 'nokogiri'
|
4
|
-
require 'rest-client'
|
5
|
-
|
6
|
-
module SearchSolrTools
|
7
|
-
module Harvesters
|
8
|
-
class R2R < Base
|
9
|
-
def initialize(env = 'development', die_on_failure = false)
|
10
|
-
super
|
11
|
-
@data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:R2R][:long_name]
|
12
|
-
@translator = Helpers::IsoToSolr.new :r2r
|
13
|
-
@metadata_url = SolrEnvironments[@environment][:r2r_url]
|
14
|
-
end
|
15
|
-
|
16
|
-
def harvest_and_delete
|
17
|
-
puts "Running #{self.class.name} at #{@metadata_url}"
|
18
|
-
super(method(:harvest), %(data_centers:"#{@data_centers}"))
|
19
|
-
end
|
20
|
-
|
21
|
-
# rubocop: disable MethodLength
|
22
|
-
# rubocop: disable AbcSize
|
23
|
-
def harvest
|
24
|
-
# first fetch list of available records at http://get.rvdata.us/services/cruise/
|
25
|
-
# then loop through each one of those, using the root <gmi:MI_Metadata> tag
|
26
|
-
puts "Getting list of records from #{@data_centers}"
|
27
|
-
RestClient.get(@metadata_url) do |resp, _req, _result, &_block|
|
28
|
-
unless resp.code == 200
|
29
|
-
puts "Got code #{resp.code} from #{@metadata_url}, skipping R2R harvest."
|
30
|
-
next
|
31
|
-
end
|
32
|
-
|
33
|
-
doc = Nokogiri::HTML(resp.body)
|
34
|
-
|
35
|
-
urls = doc.xpath('//a').map do |node|
|
36
|
-
"#{@metadata_url}#{node.attr('href')}"
|
37
|
-
end
|
38
|
-
|
39
|
-
urls.each_slice(50) do |url_subset|
|
40
|
-
# each result is a nokogirii doc with root element
|
41
|
-
# <gmi:MI_Metadata>
|
42
|
-
results = url_subset.map do |url|
|
43
|
-
get_results(url, '//gmi:MI_Metadata').first
|
44
|
-
end
|
45
|
-
|
46
|
-
begin
|
47
|
-
translated = results.map do |e|
|
48
|
-
create_new_solr_add_doc_with_child(@translator.translate(e).root)
|
49
|
-
end
|
50
|
-
|
51
|
-
insert_solr_docs(translated)
|
52
|
-
rescue => e
|
53
|
-
puts "ERROR: #{e}"
|
54
|
-
raise e if @die_on_failure
|
55
|
-
end
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
59
|
-
end
|
60
|
-
end
|
61
|
-
end
|