search_solr_tools 6.1.0 → 6.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -2
- data/bin/search_solr_tools +5 -17
- data/lib/search_solr_tools/config/environments.rb +3 -1
- data/lib/search_solr_tools/config/environments.yaml +0 -32
- data/lib/search_solr_tools/errors/harvest_error.rb +44 -31
- data/lib/search_solr_tools/harvesters/auto_suggest.rb +5 -3
- data/lib/search_solr_tools/harvesters/base.rb +21 -20
- data/lib/search_solr_tools/harvesters/nsidc_auto_suggest.rb +7 -5
- data/lib/search_solr_tools/harvesters/nsidc_json.rb +9 -8
- data/lib/search_solr_tools/helpers/bounding_box_util.rb +8 -8
- data/lib/search_solr_tools/helpers/facet_configuration.rb +3 -1
- data/lib/search_solr_tools/helpers/harvest_status.rb +10 -8
- data/lib/search_solr_tools/helpers/iso_namespaces.rb +3 -1
- data/lib/search_solr_tools/helpers/solr_format.rb +25 -45
- data/lib/search_solr_tools/helpers/translate_spatial_coverage.rb +13 -10
- data/lib/search_solr_tools/helpers/translate_temporal_coverage.rb +2 -0
- data/lib/search_solr_tools/translators/nsidc_json.rb +48 -44
- data/lib/search_solr_tools/version.rb +3 -1
- data/lib/search_solr_tools.rb +3 -2
- metadata +3 -45
- data/lib/search_solr_tools/harvesters/adc.rb +0 -49
- data/lib/search_solr_tools/harvesters/ade_auto_suggest.rb +0 -46
- data/lib/search_solr_tools/harvesters/bcodmo.rb +0 -64
- data/lib/search_solr_tools/harvesters/data_one.rb +0 -49
- data/lib/search_solr_tools/harvesters/echo.rb +0 -52
- data/lib/search_solr_tools/harvesters/eol.rb +0 -51
- data/lib/search_solr_tools/harvesters/gtnp.rb +0 -67
- data/lib/search_solr_tools/harvesters/ices.rb +0 -58
- data/lib/search_solr_tools/harvesters/ncdc_paleo.rb +0 -62
- data/lib/search_solr_tools/harvesters/nmi.rb +0 -34
- data/lib/search_solr_tools/harvesters/nodc.rb +0 -75
- data/lib/search_solr_tools/harvesters/oai.rb +0 -62
- data/lib/search_solr_tools/harvesters/pdc.rb +0 -40
- data/lib/search_solr_tools/harvesters/r2r.rb +0 -61
- data/lib/search_solr_tools/harvesters/rda.rb +0 -35
- data/lib/search_solr_tools/harvesters/tdar.rb +0 -71
- data/lib/search_solr_tools/harvesters/usgs.rb +0 -76
- data/lib/search_solr_tools/helpers/csw_iso_query_builder.rb +0 -29
- data/lib/search_solr_tools/helpers/data_one_format.rb +0 -74
- data/lib/search_solr_tools/helpers/iso_to_solr.rb +0 -97
- data/lib/search_solr_tools/helpers/iso_to_solr_format.rb +0 -197
- data/lib/search_solr_tools/helpers/ncdc_paleo_format.rb +0 -61
- data/lib/search_solr_tools/helpers/query_builder.rb +0 -13
- data/lib/search_solr_tools/helpers/r2r_format.rb +0 -25
- data/lib/search_solr_tools/helpers/selectors.rb +0 -22
- data/lib/search_solr_tools/helpers/tdar_format.rb +0 -70
- data/lib/search_solr_tools/helpers/usgs_format.rb +0 -50
- data/lib/search_solr_tools/selectors/adc.rb +0 -96
- data/lib/search_solr_tools/selectors/data_one.rb +0 -96
- data/lib/search_solr_tools/selectors/echo_iso.rb +0 -112
- data/lib/search_solr_tools/selectors/ices_iso.rb +0 -108
- data/lib/search_solr_tools/selectors/ncdc_paleo.rb +0 -90
- data/lib/search_solr_tools/selectors/nmi.rb +0 -107
- data/lib/search_solr_tools/selectors/nodc_iso.rb +0 -108
- data/lib/search_solr_tools/selectors/pdc_iso.rb +0 -109
- data/lib/search_solr_tools/selectors/r2r.rb +0 -115
- data/lib/search_solr_tools/selectors/rda.rb +0 -107
- data/lib/search_solr_tools/selectors/tdar_opensearch.rb +0 -91
- data/lib/search_solr_tools/selectors/usgs_iso.rb +0 -107
- data/lib/search_solr_tools/translators/bcodmo_json.rb +0 -89
- data/lib/search_solr_tools/translators/eol_to_solr.rb +0 -84
- data/lib/search_solr_tools/translators/gtnp_json.rb +0 -59
@@ -1,64 +0,0 @@
|
|
1
|
-
require_relative 'base'
|
2
|
-
require 'json'
|
3
|
-
require 'rest-client'
|
4
|
-
|
5
|
-
module SearchSolrTools
  module Harvesters
    # Harvester for the BCO-DMO endpoint: downloads the catalog JSON,
    # translates every dataset and adds the resulting documents to Solr.
    class BcoDmo < Base
      # @param env [String] key used to look up settings in SolrEnvironments
      # @param die_on_failure [Boolean] when true, harvest failures raise
      def initialize(env = 'development', die_on_failure = false)
        super(env, die_on_failure)
        @translator = Translators::BcodmoJsonToSolr.new
        # Arguments are passed positionally: nil, then an empty hash.
        @wkt_parser = RGeo::WKRep::WKTParser.new(nil, {})
      end

      # Announces the harvest and delegates to Base#harvest_and_delete with
      # the harvest method and the Solr query selecting BCO-DMO documents.
      def harvest_and_delete
        puts "Running harvest of BCO-DMO catalog from #{bcodmo_url}"
        data_center = Helpers::SolrFormat::DATA_CENTER_NAMES[:BCODMO][:long_name]
        super(method(:harvest_bcodmo_into_solr), "data_centers:\"#{data_center}\"")
      end

      # URL of the BCO-DMO catalog for the configured environment.
      def bcodmo_url
        SolrEnvironments[@environment][:bcodmo_url]
      end

      # Translates the whole catalog and inserts the documents into Solr.
      # Raises afterwards if any record failed and die_on_failure is set.
      def harvest_bcodmo_into_solr
        translated = translate_bcodmo
        insert_solr_docs(translated[:add_docs], Base::JSON_CONTENT_TYPE)

        return if translated[:failure_ids].empty? || !@die_on_failure

        raise 'Failed to harvest some records from BCO-DMO'
      end

      # Walks every catalog record, fetching its geometry and datasets.
      #
      # @return [Hash] :add_docs (Solr add documents) and :failure_ids
      def translate_bcodmo
        outcome = { add_docs: [], failure_ids: [] }
        request_json(SolrEnvironments[@environment][:bcodmo_url]).each do |record|
          parsed = parse_record(record, request_json(record['geometryUrl']))
          outcome[:add_docs].concat(parsed[:documents])
          outcome[:failure_ids].concat(parsed[:failure_ids])
        end
        outcome
      end

      # Logs the URL, performs a GET and parses the response body as JSON.
      def request_json(url)
        puts "Request: #{url}"
        response = RestClient.get(url)
        JSON.parse(response)
      end

      # Translates every dataset listed at record['datasets'].
      # On error the record id is reported as failed; documents translated
      # before the error are still returned.
      #
      # @return [Hash] :documents and :failure_ids
      def parse_record(record, geometry)
        outcome = { documents: [], failure_ids: [] }
        datasets = JSON.parse(RestClient.get(record['datasets']))
        datasets.each do |dataset|
          outcome[:documents] << { 'add' => { 'doc' => @translator.translate(dataset, record, geometry) } }
        end
        outcome
      rescue StandardError => e
        puts "Failed to add record #{record['id']} with error #{e} (#{e.message}) : #{e.backtrace.join("\n")}"
        outcome[:failure_ids] << record['id']
        outcome
      end
    end
  end
end
|
@@ -1,49 +0,0 @@
|
|
1
|
-
require_relative 'base'
|
2
|
-
|
3
|
-
module SearchSolrTools
  module Harvesters
    # Harvester for the dataONE catalog: pages through the feed, translates
    # each entry and inserts the translated documents into Solr.
    class DataOne < Base
      # @param env [String] key used to look up settings in SolrEnvironments
      # @param die_on_failure [Boolean] when true, page failures are re-raised
      def initialize(env = 'development', die_on_failure = false)
        super
        @page_size = 250
        @translator = Helpers::IsoToSolr.new(:data_one)
      end

      # Announces the harvest and delegates to Base#harvest_and_delete with
      # the harvest method and the Solr query selecting dataONE documents.
      def harvest_and_delete
        puts "Running harvest of dataONE catalog from #{metadata_url}"
        data_center = Helpers::SolrFormat::DATA_CENTER_NAMES[:DATA_ONE][:long_name]
        super(method(:harvest_data_one_into_solr), "data_centers:\"#{data_center}\"")
      end

      # Requests pages of @page_size entries until a page comes back empty.
      # A page that fails to translate or insert is logged and skipped unless
      # die_on_failure is set.
      def harvest_data_one_into_solr
        offset = 0
        while (entries = get_results_from_data_one(offset)) && entries.length > 0
          begin
            docs = get_docs_with_translated_entries_from_data_one(entries)
            insert_solr_docs(docs)
          rescue StandardError => e
            puts "ERROR: #{e}\n\n"
            raise e if @die_on_failure
          end
          offset += @page_size
        end
      end

      # Fetches one page of entries starting at the given offset.
      def get_results_from_data_one(start)
        request = build_request(start, @page_size)
        get_results(request, './response/result/doc')
      end

      # URL of the dataONE feed for the configured environment.
      def metadata_url
        SolrEnvironments[@environment][:data_one_url]
      end

      # Translates each entry and wraps the translated root in a Solr add doc.
      def get_docs_with_translated_entries_from_data_one(entries)
        entries.map do |entry|
          translated = @translator.translate(entry)
          create_new_solr_add_doc_with_child(translated.root)
        end
      end

      # Appends paging parameters to the feed URL.
      def build_request(start = 0, max_records = 100)
        "#{metadata_url}&start=#{start}&rows=#{max_records}"
      end
    end
  end
end
|
@@ -1,52 +0,0 @@
|
|
1
|
-
require_relative 'base'
|
2
|
-
|
3
|
-
module SearchSolrTools
  module Harvesters
    # Pulls ECHO catalog entries page by page, translates them and inserts
    # the translated documents into Solr.
    class Echo < Base
      # @param env [String] key used to look up settings in SolrEnvironments
      # @param die_on_failure [Boolean] when true, page failures are re-raised
      def initialize(env = 'development', die_on_failure = false)
        super(env, die_on_failure)
        @page_size = 100
        @translator = Helpers::IsoToSolr.new(:echo)
      end

      # Announces the harvest and delegates to Base#harvest_and_delete with
      # the harvest method and the Solr query selecting ECHO documents.
      def harvest_and_delete
        puts "Running harvest of ECHO catalog from #{echo_url}"
        data_center = Helpers::SolrFormat::DATA_CENTER_NAMES[:ECHO][:long_name]
        super(method(:harvest_echo_into_solr), "data_centers:\"#{data_center}\"")
      end

      # Main entry point: requests consecutive pages (starting at page 1)
      # until one comes back empty, inserting each translated page into Solr.
      # A failing page is logged and skipped unless die_on_failure is set.
      def harvest_echo_into_solr
        page = 1
        while (entries = get_results_from_echo(page)) && entries.length > 0
          begin
            docs = get_docs_with_translated_entries_from_echo(entries)
            insert_solr_docs(docs)
          rescue StandardError => e
            puts "ERROR: #{e}\n\n"
            raise e if @die_on_failure
          end
          page += 1
        end
      end

      # URL of the ECHO feed for the configured environment.
      def echo_url
        SolrEnvironments[@environment][:echo_url]
      end

      # Fetches one page of results, requesting the echo10 XML content type.
      def get_results_from_echo(page_num)
        request = build_request(@page_size, page_num)
        get_results(request, './/results/result', 'application/echo10+xml')
      end

      # Translates each entry and wraps the translated root in a Solr add doc.
      def get_docs_with_translated_entries_from_echo(entries)
        entries.map do |entry|
          translated = @translator.translate(entry)
          create_new_solr_add_doc_with_child(translated.root)
        end
      end

      # Appends paging parameters to the feed URL.
      def build_request(max_records = '25', page_num = '1')
        echo_url + "&page_size=#{max_records}&page_num=#{page_num}"
      end
    end
  end
end
|
@@ -1,51 +0,0 @@
|
|
1
|
-
require_relative 'base'
|
2
|
-
require 'json'
|
3
|
-
require 'rgeo/geo_json'
|
4
|
-
|
5
|
-
require 'open-uri'

module SearchSolrTools
  module Harvesters
    # Harvester for the EOL catalog: reads the configured catalog endpoints,
    # translates every referenced dataset and adds the documents to Solr.
    class Eol < Base
      # @param env [String] key used to look up settings in SolrEnvironments
      # @param die_on_failure [Boolean] when true, dataset failures are re-raised
      def initialize(env = 'development', die_on_failure = false)
        super env, die_on_failure
        @translator = SearchSolrTools::Translators::EolToSolr.new
      end

      # Prints the configured endpoints and delegates to
      # Base#harvest_and_delete with the harvest method and the Solr query
      # selecting EOL documents.
      def harvest_and_delete
        puts 'Running harvest of EOL catalog using the following configured EOL URLs:'
        SearchSolrTools::SolrEnvironments[:common][:eol].each { |x| puts x }
        data_center = Helpers::SolrFormat::DATA_CENTER_NAMES[:EOL][:long_name]
        super(method(:harvest_eol_into_solr), "data_centers:\"#{data_center}\"")
      end

      # Translates every dataset and inserts the results into Solr.
      # Datasets that failed to translate yield nil and are dropped here so
      # that only real add documents are handed to insert_solr_docs.
      def harvest_eol_into_solr
        solr_add_queries = eol_dataset_urls.map { |dataset| translate_dataset(dataset) }.compact
        insert_solr_docs solr_add_queries, Base::JSON_CONTENT_TYPE
      end

      # Collects the xlink:href of every catalogRef found at each endpoint.
      #
      # @return [Array<String>] dataset catalog URLs
      def eol_dataset_urls
        SearchSolrTools::SolrEnvironments[:common][:eol].flat_map do |endpoint|
          doc = open_xml_document(endpoint)
          doc.xpath('//xmlns:catalogRef').map { |node| node['xlink:href'] }
        end
      end

      # Downloads and strictly parses the XML document at url.
      # URI.open is used because Kernel#open no longer opens URLs as of
      # Ruby 3.0; the block form closes the stream once parsing is done.
      def open_xml_document(url)
        URI.open(url) { |io| Nokogiri::XML(io, &:strict) }
      end

      private

      # Builds the Solr add document for one dataset catalog URL.
      #
      # @return [Hash, nil] the add document, or nil when translation failed
      #   and die_on_failure is not set
      def translate_dataset(dataset)
        doc = open_xml_document(dataset)
        if doc.xpath('//xmlns:metadata').size > 1
          # THREDDS allows for a dataset of datasets, EOL should not utilize this
          raise "Complex dataset encountered at #{doc.xpath('//xmlns:catalog').to_html}"
        end
        metadata_doc = open_xml_document(doc.xpath('//xmlns:metadata')[0]['xlink:href'])
        { 'add' => { 'doc' => @translator.translate(doc, metadata_doc) } }
      rescue StandardError => e
        puts "ERROR: #{e}"
        puts "Failed to translate this record: #{doc} -> #{metadata_doc}"
        raise e if @die_on_failure
        nil
      end
    end
  end
end
|
@@ -1,67 +0,0 @@
|
|
1
|
-
require_relative 'base'
|
2
|
-
require 'json'
|
3
|
-
require 'rest-client'
|
4
|
-
|
5
|
-
module SearchSolrTools
  module Harvesters
    # Harvests data from GTN-P endpoints, translates and adds it to solr
    class GtnP < Base
      # @param env [String] key used to look up settings in SolrEnvironments
      # @param die_on_failure [Boolean] forwarded to Base
      def initialize(env = 'development', die_on_failure = false)
        super env, die_on_failure
        @translator = Translators::GtnpJsonToSolr.new
      end

      # Fetches and parses the JSON served by every configured endpoint.
      #
      # @return [Array] one parsed JSON payload per endpoint
      def gtnp_service_urls
        SearchSolrTools::SolrEnvironments[:common][:gtnp].map { |endpoint| request_json(endpoint) }
      end

      # Prints the configured endpoints and delegates to
      # Base#harvest_and_delete with the harvest method and the Solr query
      # selecting GTN-P documents.
      def harvest_and_delete
        puts 'Running harvest of GTN-P catalog using the following configured GTN-P URLs:'
        SearchSolrTools::SolrEnvironments[:common][:gtnp].each { |x| puts x }
        data_center = Helpers::SolrFormat::DATA_CENTER_NAMES[:GTNP][:long_name]
        super(method(:harvest_gtnp_into_solr), "data_centers:\"#{data_center}\"")
      end

      # Inserts all translated documents, then raises if any record failed.
      # NOTE(review): unlike BcoDmo this raises regardless of
      # @die_on_failure — confirm that is intended.
      def harvest_gtnp_into_solr
        result = translate_gtnp
        insert_solr_docs result[:add_docs], Base::JSON_CONTENT_TYPE
        raise 'Failed to harvest some records from the provider' unless result[:failure_ids].empty?
      end

      # Translates every endpoint payload.
      #
      # @return [Hash] :add_docs (Solr add documents) and :failure_ids
      def translate_gtnp
        outcome = { add_docs: [], failure_ids: [] }
        gtnp_service_urls.each do |record|
          parsed = parse_record(record)
          outcome[:add_docs].concat(parsed[:documents])
          outcome[:failure_ids].concat(parsed[:failure_ids])
        end
        outcome
      end

      # Performs a GET and parses the response body as JSON.
      def request_json(url)
        JSON.parse(RestClient.get(url))
      end

      # Translates one endpoint payload: the first element is passed to the
      # translator alongside each of the remaining elements.
      # On error the record's title is reported as failed; documents
      # translated before the error are still returned.
      #
      # @return [Hash] :documents and :failure_ids
      def parse_record(record)
        documents = []
        failure_ids = []
        begin
          record.drop(1).each do |dataset|
            documents << { 'add' => { 'doc' => @translator.translate(dataset, record[0]) } }
          end
        rescue StandardError => e
          title = record_title(record)
          puts "Failed to add record #{title} with error #{e} (#{e.message}) : #{e.backtrace.join("\n")}"
          # Pushed even when nil so the failure is still counted.
          failure_ids << title
        end
        { documents: documents, failure_ids: failure_ids }
      end

      private

      # Best-effort label for a failed record. JSON.parse yields string keys,
      # so the title must be read with 'title' (a symbol key never matches).
      # Presumably the first element carries a 'title' entry — verify against
      # the GTN-P feed.
      def record_title(record)
        header = record.respond_to?(:first) ? record.first : nil
        header.is_a?(Hash) ? header['title'] : nil
      end
    end
  end
end
|
@@ -1,58 +0,0 @@
|
|
1
|
-
require_relative 'base'
|
2
|
-
require_relative '../helpers/csw_iso_query_builder'
|
3
|
-
|
4
|
-
module SearchSolrTools
  module Harvesters
    # Pulls ICES catalog records through CSW requests, translates them and
    # inserts the translated documents into Solr.
    class Ices < Base
      # @param env [String] key used to look up settings in SolrEnvironments
      # @param die_on_failure [Boolean] when true, page failures are re-raised
      def initialize(env = 'development', die_on_failure = false)
        super(env, die_on_failure)
        @page_size = 100
        @translator = Helpers::IsoToSolr.new(:ices)
      end

      # Announces the harvest and delegates to Base#harvest_and_delete with
      # the harvest method and the Solr query selecting ICES documents.
      def harvest_and_delete
        puts "Running harvest of ICES catalog from #{ices_url}"
        data_center = Helpers::SolrFormat::DATA_CENTER_NAMES[:ICES][:long_name]
        super(method(:harvest_ices_into_solr), "data_centers:\"#{data_center}\"")
      end

      # Main entry point: requests pages of @page_size records (start
      # position begins at 1) until one comes back empty, inserting each
      # translated page into Solr. A failing page is logged and skipped
      # unless die_on_failure is set.
      def harvest_ices_into_solr
        position = 1
        while (entries = get_results_from_ices(position)) && entries.length > 0
          begin
            docs = get_docs_with_translated_entries_from_ices(entries)
            insert_solr_docs(docs)
          rescue StandardError => e
            puts "ERROR: #{e}"
            raise e if @die_on_failure
          end
          position += @page_size
        end
      end

      # URL of the ICES CSW service for the configured environment.
      def ices_url
        SolrEnvironments[@environment][:ices_url]
      end

      # Fetches one page of gmd:MD_Metadata records.
      def get_results_from_ices(start_index)
        request = build_csw_request('results', @page_size, start_index)
        get_results(request, '//gmd:MD_Metadata')
      end

      # Translates each entry and wraps the translated root in a Solr add doc.
      def get_docs_with_translated_entries_from_ices(entries)
        entries.map do |entry|
          translated = @translator.translate(entry)
          create_new_solr_add_doc_with_child(translated.root)
        end
      end

      # Builds the CSW query string for the given paging parameters.
      def build_csw_request(resultType = 'results', maxRecords = '25', startPosition = '1')
        params = {
          'resultType' => resultType,
          'maxRecords' => maxRecords,
          'startPosition' => startPosition,
          'constraintLanguage' => 'CQL_TEXT',
          'outputSchema' => 'http://www.isotc211.org/2005/gmd'
        }
        Helpers::CswIsoQueryBuilder.get_query_string(ices_url, params)
      end
    end
  end
end
|
@@ -1,62 +0,0 @@
|
|
1
|
-
require_relative 'base'
|
2
|
-
require_relative '../helpers/csw_iso_query_builder'
|
3
|
-
|
4
|
-
module SearchSolrTools
  module Harvesters
    # Harvests data from NCDC Paleo and inserts it into Solr after it has been translated
    class NcdcPaleo < Base
      # @param env [String] key used to look up settings in SolrEnvironments
      # @param die_on_failure [Boolean] when true, page failures are re-raised
      def initialize(env = 'development', die_on_failure = false)
        super env, die_on_failure
        @page_size = 50
        @translator = Helpers::IsoToSolr.new :ncdc_paleo
      end

      # Announces the harvest and delegates to Base#harvest_and_delete with
      # the harvest method and the Solr query selecting NCDC Paleo documents.
      def harvest_and_delete
        puts "Running harvest of NCDC Paleo catalog from #{ncdc_paleo_url}"
        data_center = Helpers::SolrFormat::DATA_CENTER_NAMES[:NCDC_PALEO][:long_name]
        super(method(:harvest_ncdc_paleo_into_solr), "data_centers:\"#{data_center}\"")
      end

      # Requests pages of @page_size records (start position begins at 1)
      # until one comes back empty, inserting each translated page into Solr.
      # A failing page is logged and skipped unless die_on_failure is set.
      def harvest_ncdc_paleo_into_solr
        start_index = 1
        while (entries = get_results_from_ncdc_paleo_url(start_index)) && entries.length > 0
          begin
            insert_solr_docs get_docs_with_translated_entries_from_ncdc_paleo(entries)
          rescue StandardError => e
            puts "ERROR: #{e}"
            raise e if @die_on_failure
          end
          start_index += @page_size
        end
      end

      # URL of the NCDC Paleo CSW service for the configured environment.
      def ncdc_paleo_url
        SolrEnvironments[@environment][:ncdc_paleo_url]
      end

      # Fetches one page of csw:Record entries.
      def get_results_from_ncdc_paleo_url(start_index)
        get_results build_csw_request('results', @page_size, start_index), '//csw:Record'
      end

      # For each entry, reads its document id, downloads the full record for
      # that id, translates it and appends authoritative_id and dataset_url
      # fields to the resulting Solr add document.
      #
      # @return [Array] root nodes of the Solr add documents
      def get_docs_with_translated_entries_from_ncdc_paleo(entries)
        auth_ids = entries.map do |entry|
          entry.xpath("./dc:identifier[@scheme='urn:x-esri:specification:ServiceType:ArcIMS:Metadata:DocID']").text
        end

        auth_ids.map do |record|
          result_xml = get_results("https://gis.ncdc.noaa.gov/gptpaleo/csw?getxml=#{record}",
                                   '/rdf:RDF/rdf:Description').first
          solr_doc = create_new_solr_add_doc_with_child(@translator.translate(result_xml).root)
          insert_node = solr_doc.at_xpath('//doc')
          insert_node.add_child("<field name='authoritative_id'>#{record}</field>")
          # The fragment must be well-formed, so the field element is closed explicitly.
          insert_node.add_child("<field name='dataset_url'>https://gis.ncdc.noaa.gov/gptpaleo/catalog/search/resource/details.page?uuid=#{record}</field>")
          solr_doc.root
        end
      end

      # Builds the CSW query string for the given paging parameters.
      def build_csw_request(resultType = 'results', maxRecords = '1000', startPosition = '1')
        Helpers::CswIsoQueryBuilder.get_query_string(ncdc_paleo_url,
                                                     'resultType' => resultType,
                                                     'maxRecords' => maxRecords,
                                                     'startPosition' => startPosition)
      end
    end
  end
end
|
@@ -1,34 +0,0 @@
|
|
1
|
-
require_relative 'oai'
|
2
|
-
|
3
|
-
module SearchSolrTools
  module Harvesters
    # OAI harvester for the NMI feed, requested with the 'dif' metadata prefix.
    class Nmi < Oai
      # @param env [String] key used to look up settings in SolrEnvironments
      # @param die_on_failure [Boolean] forwarded to Oai
      def initialize(env = 'development', die_on_failure = false)
        super
        @data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:NMI][:long_name]
        @translator = Helpers::IsoToSolr.new(:nmi)
      end

      # URL of the NMI feed for the configured environment.
      def metadata_url
        SolrEnvironments[@environment][:nmi_url]
      end

      # Fetches all records in a single request.
      # The resumption token is set to an empty string up front because that
      # is what stops the harvest loop; NMI's feed does not provide any
      # resumption token and returns every record in one go.
      def results
        @resumption_token = ''
        list_records = get_results(request_string, '//oai:ListRecords', '')
        list_records.xpath('.//oai:record', Helpers::IsoNamespaces.namespaces)
      end

      private

      # Query parameters for the ListRecords request.
      def request_params
        { verb: 'ListRecords', metadataPrefix: 'dif' }
      end
    end
  end
end
|
@@ -1,75 +0,0 @@
|
|
1
|
-
require_relative 'base'
|
2
|
-
require_relative '../helpers/csw_iso_query_builder'
|
3
|
-
|
4
|
-
module SearchSolrTools
  module Harvesters
    # Harvests data from NODC and inserts it into Solr after it has been translated
    class Nodc < Base
      # @param env [String] key used to look up settings in SolrEnvironments
      # @param die_on_failure [Boolean] when true, page failures are re-raised
      def initialize(env = 'development', die_on_failure = false)
        super env, die_on_failure
        @page_size = 50
        @translator = Helpers::IsoToSolr.new :nodc
      end

      # Announces the harvest and delegates to Base#harvest_and_delete with
      # the harvest method and the Solr query selecting NODC documents.
      def harvest_and_delete
        puts "Running harvest of NODC catalog from #{nodc_url}"
        data_center = Helpers::SolrFormat::DATA_CENTER_NAMES[:NODC][:long_name]
        super(method(:harvest_nodc_into_solr), "data_centers:\"#{data_center}\"")
      end

      # Main entry point: requests pages of @page_size records (start
      # position begins at 1) until one comes back empty, inserting each
      # translated page into Solr. A failing page is logged and skipped
      # unless die_on_failure is set.
      def harvest_nodc_into_solr
        start_index = 1
        while (entries = get_results_from_nodc(start_index)) && entries.length > 0
          begin
            insert_solr_docs get_docs_with_translated_entries_from_nodc(entries)
          rescue StandardError => e
            puts "ERROR: #{e}"
            raise e if @die_on_failure
          end
          start_index += @page_size
        end
      end

      # URL of the NODC CSW service for the configured environment.
      def nodc_url
        SolrEnvironments[@environment][:nodc_url]
      end

      # Fetches one page of gmi:MI_Metadata records.
      def get_results_from_nodc(start_index)
        get_results build_csw_request('results', @page_size, start_index), '//gmi:MI_Metadata'
      end

      # Translates each entry and wraps the translated root in a Solr add doc.
      def get_docs_with_translated_entries_from_nodc(entries)
        entries.map do |entry|
          create_new_solr_add_doc_with_child(@translator.translate(entry).root)
        end
      end

      # Builds the CSW query string for the given paging parameters,
      # restricted by the bounding-box constraint.
      def build_csw_request(resultType = 'results', maxRecords = '25', startPosition = '1')
        Helpers::CswIsoQueryBuilder.get_query_string(nodc_url,
                                                     'resultType' => resultType,
                                                     'maxRecords' => maxRecords,
                                                     'startPosition' => startPosition,
                                                     'constraint' => bbox_constraint,
                                                     'outputSchema' => 'http://www.isotc211.org/2005/gmd')
      end

      # URL-escaped OGC filter limiting results to the box spanning
      # west -180 / south 45 to east 180 / north 90.
      #
      # URI.encode was removed in Ruby 3.0; URI::DEFAULT_PARSER.escape applies
      # the same escaping and is available before and after that release.
      #
      # @return [String] the escaped filter XML
      def bbox_constraint
        bbox = {
          west: '-180',
          south: '45',
          east: '180',
          north: '90'
        }

        filter = '<Filter xmlns:ogc="http://www.opengis.net/ogc" ' \
                 'xmlns:gml="http://www.opengis.net/gml" ' \
                 'xmlns:apiso="http://www.opengis.net/cat/csw/apiso/1.0">' \
                 '<ogc:BBOX><PropertyName>apiso:BoundingBox</PropertyName><gml:Envelope>' \
                 "<gml:lowerCorner>#{bbox[:west]} #{bbox[:south]}</gml:lowerCorner>" \
                 "<gml:upperCorner>#{bbox[:east]} #{bbox[:north]}</gml:upperCorner>" \
                 '</gml:Envelope></ogc:BBOX></Filter>'

        URI::DEFAULT_PARSER.escape(filter)
      end
    end
  end
end
|
@@ -1,62 +0,0 @@
|
|
1
|
-
require_relative 'base'
|
2
|
-
require_relative '../helpers/query_builder'
|
3
|
-
|
4
|
-
require 'json'
|
5
|
-
|
6
|
-
module SearchSolrTools
  module Harvesters
    # Base class for harvesting OAI feeds into Solr. Subclasses supply
    # #results, #metadata_url and #request_params, and are expected to set
    # @data_centers and @translator.
    class Oai < Base
      # @param env [String] key used to look up settings in SolrEnvironments
      # @param die_on_failure [Boolean] when true, harvest failures are re-raised
      def initialize(env = 'development', die_on_failure = false)
        super env, die_on_failure
        # Used in query string params (resumptionToken). It is updated while
        # harvesting, based on the response from the server.
        @resumption_token = nil
      end

      # Escapes a data provider URL.
      # URI.encode was removed in Ruby 3.0; URI::DEFAULT_PARSER.escape applies
      # the same escaping and is available before and after that release.
      def encode_data_provider_url(url)
        URI::DEFAULT_PARSER.escape(url)
      end

      # Announces the harvest and delegates to Base#harvest_and_delete with
      # the harvest method and the Solr query selecting this data center.
      def harvest_and_delete
        puts "Running #{self.class.name} at #{metadata_url}"
        super(method(:harvest), %(data_centers:"#{@data_centers}"))
      end

      # Keeps requesting and inserting result pages while the resumption
      # token is nil (nothing requested yet) or non-empty (more to fetch).
      # NOTE(review): if #results raises before it updates the token and
      # die_on_failure is false, the same request is repeated with no upper
      # bound — confirm whether a retry limit is wanted.
      def harvest
        while @resumption_token.nil? || !@resumption_token.empty?
          begin
            insert_solr_docs(translated_docs(results))
          rescue StandardError => e
            puts "ERROR: #{e.class} #{e}"
            raise e if @die_on_failure
          end
        end
      end

      # Must be overridden: returns the next batch of records and updates
      # @resumption_token.
      def results
        raise NotImplementedError
      end

      # Must be overridden: returns the feed URL.
      def metadata_url
        raise NotImplementedError
      end

      # Translates each entry and wraps the translated root in a Solr add doc.
      def translated_docs(entries)
        entries.map { |e| create_new_solr_add_doc_with_child(@translator.translate(e).root) }
      end

      private

      # Must be overridden: returns the query parameters for the next request.
      def request_params
        raise NotImplementedError
      end

      # Feed URL followed by the query string built from #request_params.
      def request_string
        "#{metadata_url}#{Helpers::QueryBuilder.build(request_params)}"
      end
    end
  end
end
|
@@ -1,40 +0,0 @@
|
|
1
|
-
require_relative 'oai'
|
2
|
-
|
3
|
-
module SearchSolrTools
  module Harvesters
    # Harvests data from Polar data catalogue and inserts it into
    # Solr after it has been translated
    class Pdc < Oai
      # @param env [String] key used to look up settings in SolrEnvironments
      # @param die_on_failure [Boolean] forwarded to Oai
      def initialize(env = 'development', die_on_failure = false)
        super
        @data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:PDC][:long_name]
        @translator = Helpers::IsoToSolr.new :pdc
      end

      # URL of the PDC feed for the configured environment.
      def metadata_url
        SolrEnvironments[@environment][:pdc_url]
      end

      # Requests the next batch of records and stores the resumption token
      # returned with it.
      #
      # A response without a resumptionToken element is treated as the end of
      # the list (token set to ''). Calling #text on the missing node would
      # raise instead, leaving the token unchanged so the harvest loop would
      # repeat the same request.
      #
      # @return the oai:record nodes of this batch
      def results
        list_records_oai_response = get_results(request_string, '//oai:ListRecords', '')
        namespaces = Helpers::IsoNamespaces.namespaces

        token_node = list_records_oai_response.xpath('.//oai:resumptionToken', namespaces).first
        @resumption_token = token_node ? token_node.text : ''

        list_records_oai_response.xpath('.//oai:record', namespaces)
      end

      private

      # Query parameters for the next ListRecords request.
      #
      # If a 'resumptionToken' is supplied with any arguments other than 'verb',
      # the response from PDC gives a badArgument error, saying "The argument
      # 'resumptionToken' must be supplied without other arguments". So
      # metadataPrefix is only sent on the first request (token still nil),
      # and nil-valued parameters are removed.
      def request_params
        {
          verb: 'ListRecords',
          metadataPrefix: @resumption_token.nil? ? 'iso' : nil,
          resumptionToken: @resumption_token
        }.delete_if { |_k, v| v.nil? }
      end
    end
  end
end
|
@@ -1,61 +0,0 @@
|
|
1
|
-
require_relative 'base'
|
2
|
-
|
3
|
-
require 'nokogiri'
|
4
|
-
require 'rest-client'
|
5
|
-
|
6
|
-
module SearchSolrTools
  module Harvesters
    # Harvester for the R2R feed: reads the listing page at the configured
    # URL, fetches every linked record, translates it and inserts it into Solr.
    class R2R < Base
      # @param env [String] key used to look up settings in SolrEnvironments
      # @param die_on_failure [Boolean] when true, batch failures are re-raised
      def initialize(env = 'development', die_on_failure = false)
        super
        @data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:R2R][:long_name]
        @translator = Helpers::IsoToSolr.new(:r2r)
        @metadata_url = SolrEnvironments[@environment][:r2r_url]
      end

      # Announces the harvest and delegates to Base#harvest_and_delete with
      # the harvest method and the Solr query selecting this data center.
      def harvest_and_delete
        puts "Running #{self.class.name} at #{@metadata_url}"
        super(method(:harvest), %(data_centers:"#{@data_centers}"))
      end

      # rubocop: disable MethodLength
      # rubocop: disable AbcSize
      # Fetches the listing at @metadata_url, builds one URL per anchor on
      # that page and processes the URLs in batches of 50. For each URL the
      # first <gmi:MI_Metadata> element is translated. A batch that fails to
      # translate or insert is logged and skipped unless die_on_failure is
      # set. A non-200 listing response skips the harvest.
      def harvest
        puts "Getting list of records from #{@data_centers}"
        RestClient.get(@metadata_url) do |resp, _req, _result|
          if resp.code != 200
            puts "Got code #{resp.code} from #{@metadata_url}, skipping R2R harvest."
            next
          end

          listing = Nokogiri::HTML(resp.body)
          record_urls = listing.xpath('//a').map { |anchor| "#{@metadata_url}#{anchor.attr('href')}" }

          record_urls.each_slice(50) do |batch|
            # Each element is the first <gmi:MI_Metadata> node found at the URL.
            records = batch.map { |url| get_results(url, '//gmi:MI_Metadata').first }

            begin
              docs = records.map do |record|
                create_new_solr_add_doc_with_child(@translator.translate(record).root)
              end
              insert_solr_docs(docs)
            rescue StandardError => e
              puts "ERROR: #{e}"
              raise e if @die_on_failure
            end
          end
        end
      end
    end
  end
end
|