search_solr_tools 3.1.2
- checksums.yaml +7 -0
- data/CHANGELOG.md +88 -0
- data/COPYING +674 -0
- data/README.md +203 -0
- data/bin/search_solr_tools +87 -0
- data/lib/search_solr_tools.rb +8 -0
- data/lib/search_solr_tools/config/environments.rb +12 -0
- data/lib/search_solr_tools/config/environments.yaml +73 -0
- data/lib/search_solr_tools/harvesters/ade_auto_suggest.rb +43 -0
- data/lib/search_solr_tools/harvesters/auto_suggest.rb +61 -0
- data/lib/search_solr_tools/harvesters/base.rb +183 -0
- data/lib/search_solr_tools/harvesters/bcodmo.rb +55 -0
- data/lib/search_solr_tools/harvesters/cisl.rb +63 -0
- data/lib/search_solr_tools/harvesters/echo.rb +50 -0
- data/lib/search_solr_tools/harvesters/eol.rb +53 -0
- data/lib/search_solr_tools/harvesters/ices.rb +55 -0
- data/lib/search_solr_tools/harvesters/nmi.rb +32 -0
- data/lib/search_solr_tools/harvesters/nodc.rb +72 -0
- data/lib/search_solr_tools/harvesters/nsidc_auto_suggest.rb +33 -0
- data/lib/search_solr_tools/harvesters/nsidc_json.rb +60 -0
- data/lib/search_solr_tools/harvesters/oai.rb +59 -0
- data/lib/search_solr_tools/harvesters/pdc.rb +38 -0
- data/lib/search_solr_tools/harvesters/rda.rb +33 -0
- data/lib/search_solr_tools/harvesters/tdar.rb +57 -0
- data/lib/search_solr_tools/harvesters/usgs.rb +74 -0
- data/lib/search_solr_tools/helpers/bounding_box_util.rb +37 -0
- data/lib/search_solr_tools/helpers/csw_iso_query_builder.rb +30 -0
- data/lib/search_solr_tools/helpers/facet_configuration.rb +19 -0
- data/lib/search_solr_tools/helpers/iso_namespaces.rb +30 -0
- data/lib/search_solr_tools/helpers/iso_to_solr.rb +96 -0
- data/lib/search_solr_tools/helpers/iso_to_solr_format.rb +198 -0
- data/lib/search_solr_tools/helpers/query_builder.rb +13 -0
- data/lib/search_solr_tools/helpers/selectors.rb +20 -0
- data/lib/search_solr_tools/helpers/solr_format.rb +260 -0
- data/lib/search_solr_tools/helpers/tdar_format.rb +70 -0
- data/lib/search_solr_tools/helpers/translate_spatial_coverage.rb +77 -0
- data/lib/search_solr_tools/helpers/translate_temporal_coverage.rb +40 -0
- data/lib/search_solr_tools/helpers/usgs_format.rb +50 -0
- data/lib/search_solr_tools/selectors/cisl.rb +112 -0
- data/lib/search_solr_tools/selectors/echo_iso.rb +111 -0
- data/lib/search_solr_tools/selectors/ices_iso.rb +107 -0
- data/lib/search_solr_tools/selectors/nmi.rb +106 -0
- data/lib/search_solr_tools/selectors/nodc_iso.rb +107 -0
- data/lib/search_solr_tools/selectors/pdc_iso.rb +108 -0
- data/lib/search_solr_tools/selectors/rda.rb +106 -0
- data/lib/search_solr_tools/selectors/tdar_opensearch.rb +89 -0
- data/lib/search_solr_tools/selectors/usgs_iso.rb +105 -0
- data/lib/search_solr_tools/translators/bcodmo_json.rb +69 -0
- data/lib/search_solr_tools/translators/eol_to_solr.rb +78 -0
- data/lib/search_solr_tools/translators/nsidc_json.rb +190 -0
- data/lib/search_solr_tools/version.rb +3 -0
- data/search_solr_tools.gemspec +45 -0
- metadata +345 -0
data/lib/search_solr_tools/harvesters/base.rb
@@ -0,0 +1,183 @@
require 'multi_json'
require 'nokogiri'
require 'open-uri'
require 'rest-client'
require 'rsolr'
require 'time'

module SearchSolrTools
  module Harvesters
    # base class for solr harvesters
    class Base
      attr_accessor :environment

      DELETE_DOCUMENTS_RATIO = 0.1
      XML_CONTENT_TYPE = 'text/xml; charset=utf-8'
      JSON_CONTENT_TYPE = 'application/json; charset=utf-8'

      def initialize(env = 'development', die_on_failure = false)
        @environment = env
        @die_on_failure = die_on_failure
      end

      def solr_url
        env = SolrEnvironments[@environment]
        "http://#{env[:host]}:#{env[:port]}/#{env[:collection_path]}"
      end

      # Some data providers require encoding (such as URI.encode),
      # while others barf on encoding. The default is to just
      # return url, override this in the subclass if special
      # encoding is needed.
      def encode_data_provider_url(url)
        url
      end

      def harvest_and_delete(harvest_method, delete_constraints, solr_core = SolrEnvironments[@environment][:collection_name])
        start_time = Time.now.utc.iso8601
        harvest_method.call
        delete_old_documents start_time, delete_constraints, solr_core
      end

      def delete_old_documents(timestamp, constraints, solr_core, force = false)
        constraints = sanitize_data_centers_constraints(constraints)
        delete_query = "last_update:[* TO #{timestamp}] AND #{constraints}"
        solr = RSolr.connect url: solr_url + "/#{solr_core}"
        unchanged_count = (solr.get 'select', params: { q: delete_query, rows: 0 })['response']['numFound'].to_i
        if unchanged_count == 0
          puts "All documents were updated after #{timestamp}, nothing to delete"
        else
          puts "Begin removing documents older than #{timestamp}"
          remove_documents(solr, delete_query, constraints, force, unchanged_count)
        end
      end

      def sanitize_data_centers_constraints(query_string)
        # Remove lucene special characters, preserve the query parameter and compress whitespace
        query_string.gsub!(/[:&|!~\-\(\)\{\}\[\]\^\*\?\+]+/, ' ')
        query_string.gsub!(/data_centers /, 'data_centers:')
        query_string.squeeze(' ').strip
      end

      def remove_documents(solr, delete_query, constraints, force, numfound)
        all_response_count = (solr.get 'select', params: { q: constraints, rows: 0 })['response']['numFound']
        if force || (numfound / all_response_count.to_f < DELETE_DOCUMENTS_RATIO)
          puts "Deleting #{numfound} documents for #{constraints}"
          solr.delete_by_query delete_query
          solr.commit
        else
          puts "Failed to delete records older than current harvest start because they exceeded #{DELETE_DOCUMENTS_RATIO} of the total records for this data center."
          puts "\tTotal records: #{all_response_count}"
          puts "\tNon-updated records: #{numfound}"
        end
      end

      # Update Solr with an array of Nokogiri xml documents, report number of successfully added documents
      def insert_solr_docs(docs, content_type = XML_CONTENT_TYPE, core = SolrEnvironments[@environment][:collection_name])
        success = 0
        failure = 0
        docs.each do |doc|
          insert_solr_doc(doc, content_type, core) ? success += 1 : failure += 1
        end
        puts "#{success} document#{success == 1 ? '' : 's'} successfully added to Solr."
        puts "#{failure} document#{failure == 1 ? '' : 's'} not added to Solr."
        fail 'Some documents failed to be inserted into Solr' if failure > 0
      end

      def insert_solr_doc(doc, content_type = XML_CONTENT_TYPE, core = SolrEnvironments[@environment][:collection_name])
        url = solr_url + "/#{core}/update?commit=true"
        success = false

        # Some of the docs will cause Solr to crash - CPU goes to 195% with `top` and it
        # doesn't seem to recover.
        return success unless doc_valid?(doc) if content_type == XML_CONTENT_TYPE

        doc_serialized = get_serialized_doc(doc, content_type)

        # Some docs will cause solr to time out during the POST
        begin
          RestClient.post(url, doc_serialized, content_type: content_type) do |response, _request, _result|
            success = response.code == 200
            puts "Error for #{doc_serialized}\n\n response: #{response.body}" unless success
          end
        rescue => e
          puts "Rest exception while POSTing to Solr: #{e}, for doc: #{doc_serialized}"
        end

        success
      end

      def get_serialized_doc(doc, content_type)
        if content_type.eql?(XML_CONTENT_TYPE)
          return doc.respond_to?(:to_xml) ? doc.to_xml : doc
        elsif content_type.eql?(JSON_CONTENT_TYPE)
          return MultiJson.dump(doc)
        else
          return doc
        end
      end

      # Get results from some ISO end point specified in the query string
      def get_results(request_url, metadata_path, content_type = 'application/xml')
        timeout = 300
        retries_left = 3

        request_url = encode_data_provider_url(request_url)

        begin
          puts "Request: #{request_url}"
          response = open(request_url, read_timeout: timeout, 'Content-Type' => content_type)
        rescue OpenURI::HTTPError, Timeout::Error => e
          retries_left -= 1
          puts "## REQUEST FAILED ## Retrying #{retries_left} more times..."

          retry if retries_left > 0

          raise e if @die_on_failure
          return
        end
        doc = Nokogiri.XML(response)
        doc.xpath(metadata_path, Helpers::IsoNamespaces.namespaces(doc))
      end

      # returns Nokogiri XML document with content
      # '<?xml version="1.0"?><add/>'
      def create_new_solr_add_doc
        doc = Nokogiri::XML::Document.new
        doc.root = Nokogiri::XML::Node.new('add', doc)
        doc
      end

      # returns a Nokogiri XML document with content
      # '<?xml version="1.0"?><add> <child /> </add>'
      def create_new_solr_add_doc_with_child(child)
        doc = create_new_solr_add_doc
        doc.root.add_child(child)
        doc
      end

      # Make sure that Solr is able to accept this doc in a POST
      def doc_valid?(doc)
        spatial_coverages = doc.xpath(".//field[@name='spatial_coverages']").first
        return true if spatial_coverages.nil?

        spatial_coverages = spatial_coverages.text.split(' ')

        # We've only seen the failure with 4 spatial coverage values
        return true if spatial_coverages.size < 4

        valid_solr_spatial_coverage?(spatial_coverages)
      end

      # spatial_coverages is an array with length 4:
      # [North, East, South, West]
      def valid_solr_spatial_coverage?(spatial_coverages)
        north, east, south, west = spatial_coverages

        polar_point = (north == south) && (north.to_f.abs == 90)

        (east == west) || !polar_point
      end
    end
  end
end
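For context, a minimal sketch (not part of the diff) of how a subclass is expected to drive Base: the subclass name, constraint value, and harvest body below are illustrative placeholders, and a Solr instance matching the configured environment is assumed to be reachable.

require 'search_solr_tools'

# Hypothetical subclass, for illustration only; the real harvesters appear in the hunks below.
class ExampleHarvester < SearchSolrTools::Harvesters::Base
  def harvest_and_delete
    # Base#harvest_and_delete records a start timestamp, runs the harvest method,
    # then deletes documents matching the constraint that were not re-added.
    super(method(:harvest_example_into_solr), 'data_centers:"Example Data Center"')
  end

  def harvest_example_into_solr
    # Build Solr <add> documents (e.g. via create_new_solr_add_doc_with_child)
    # and push them; XML_CONTENT_TYPE is the default for insert_solr_docs.
    insert_solr_docs []
  end
end

ExampleHarvester.new('development', true).harvest_and_delete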
data/lib/search_solr_tools/harvesters/bcodmo.rb
@@ -0,0 +1,55 @@
require 'json'
require 'rest-client'

module SearchSolrTools
  module Harvesters
    # Harvests data from BcoDmo endpoint, translates and adds it to solr
    class BcoDmo < Base
      def initialize(env = 'development', die_on_failure = false)
        super env, die_on_failure
        @translator = Translators::BcodmoJsonToSolr.new
        @wkt_parser = RGeo::WKRep::WKTParser.new(nil, {}) # (factory_generator_=nil,
      end

      def harvest_and_delete
        super(method(:harvest_bcodmo_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:BCODMO][:long_name]}\"")
      end

      def harvest_bcodmo_into_solr
        result = translate_bcodmo
        insert_solr_docs result[:add_docs], Base::JSON_CONTENT_TYPE
        fail 'Failed to harvest some records from the provider' if result[:failure_ids].length > 0
      end

      def translate_bcodmo
        documents = []
        failure_ids = []
        request_json(SolrEnvironments[@environment][:bcodmo_url]).each do |record|
          geometry = request_json(record['geometryUrl'])
          results = parse_record(record, geometry)
          results[:documents].each { |d| documents << d }
          results[:failure_ids].each { |id| failure_ids << id }
        end
        { add_docs: documents, failure_ids: failure_ids }
      end

      def request_json(url)
        JSON.parse(RestClient.get(url))
      end

      def parse_record(record, geometry)
        documents = []
        failure_ids = []
        begin
          JSON.parse(RestClient.get(record['datasets'])).each do |dataset|
            documents << { 'add' => { 'doc' => @translator.translate(dataset, record, geometry) } }
          end
        rescue => e
          puts "Failed to add record #{record['id']} with error #{e} (#{e.message}) : #{e.backtrace.join("\n")}"
          failure_ids << record['id']
        end
        { documents: documents, failure_ids: failure_ids }
      end
    end
  end
end
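A hedged usage sketch (not part of the diff): running the translate step by hand and then posting the resulting JSON add documents, assuming the gem is installed and both the configured :bcodmo_url endpoint and a 'development' Solr are reachable.

require 'search_solr_tools'

harvester = SearchSolrTools::Harvesters::BcoDmo.new('development', false)
result = harvester.translate_bcodmo   # => { add_docs: [...], failure_ids: [...] }
puts "#{result[:add_docs].length} docs translated, #{result[:failure_ids].length} failures"
harvester.insert_solr_docs(result[:add_docs], SearchSolrTools::Harvesters::Base::JSON_CONTENT_TYPE)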
data/lib/search_solr_tools/harvesters/cisl.rb
@@ -0,0 +1,63 @@
module SearchSolrTools
  module Harvesters
    # Harvests data from CISL and inserts it into Solr after it has been translated
    class Cisl < Oai
      def initialize(env = 'development', die_on_failure = false)
        super
        @data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:CISL][:long_name]
        @translator = Helpers::IsoToSolr.new :cisl

        # Used in query string params, resumptionToken
        @dataset = '0bdd2d39-3493-4fa2-98f9-6766596bdc50'
      end

      def metadata_url
        SolrEnvironments[@environment][:cisl_url]
      end

      def results
        list_records_oai_response = get_results(request_string, '//oai:ListRecords', '')

        @resumption_token = list_records_oai_response.xpath('.//oai:resumptionToken', Helpers::IsoNamespaces.namespaces)
        @resumption_token = format_resumption_token(@resumption_token.first.text)

        list_records_oai_response.xpath('.//oai:record', Helpers::IsoNamespaces.namespaces)
      end

      private

      def request_params
        {
          verb: 'ListRecords',
          metadataPrefix: 'dif',
          set: @dataset,
          resumptionToken: @resumption_token
        }.delete_if { |_k, v| v.nil? }
      end

      # The ruby response is lacking quotes, which the token requires in order to work...
      # Also, the response back seems to be inconsistent - sometimes it adds " instead of '"',
      # which makes the token fail to work.
      # To get around this I'd prefer to make assumptions about the token and let it break if
      # they change the formatting. For now, all fields other than offset should be able to be
      # assumed to remain constant.
      # If the input is empty, then we are done - return an empty string, which is checked for
      # in the harvest loop.
      def format_resumption_token(resumption_token)
        return '' if resumption_token.empty?

        resumption_token =~ /offset:(\d+)/
        offset = Regexp.last_match(1)

        {
          from: nil,
          until: nil,
          set: @dataset,
          metadataPrefix: 'dif',
          offset: offset
        }.to_json
      end
    end
  end
end
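To illustrate what format_resumption_token produces (example only, not part of the diff): the raw token below is a made-up value in the offset:<n> style the regex expects, and the method is private, hence the use of #send.

require 'search_solr_tools'

cisl = SearchSolrTools::Harvesters::Cisl.new('development')
raw_token = 'from:,until:,set:0bdd2d39-3493-4fa2-98f9-6766596bdc50,metadataPrefix:dif,offset:200'
cisl.send(:format_resumption_token, raw_token)
# => '{"from":null,"until":null,"set":"0bdd2d39-3493-4fa2-98f9-6766596bdc50","metadataPrefix":"dif","offset":"200"}'
cisl.send(:format_resumption_token, '')   # => '' signals the harvest loop to stop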
data/lib/search_solr_tools/harvesters/echo.rb
@@ -0,0 +1,50 @@
module SearchSolrTools
  module Harvesters
    # Harvests data from ECHO and inserts it into Solr after it has been translated
    class Echo < Base
      def initialize(env = 'development', die_on_failure = false)
        super env, die_on_failure
        @page_size = 1000
        @translator = Helpers::IsoToSolr.new :echo
      end

      def harvest_and_delete
        puts "Running harvest of ECHO catalog from #{echo_url}"
        super(method(:harvest_echo_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:ECHO][:long_name]}\"")
      end

      # get translated entries from ECHO and add them to Solr
      # this is the main entry point for the class
      def harvest_echo_into_solr
        page_num = 1
        while (entries = get_results_from_echo(page_num)) && (entries.length > 0)
          begin
            insert_solr_docs get_docs_with_translated_entries_from_echo(entries)
          rescue => e
            puts "ERROR: #{e}\n\n"
            raise e if @die_on_failure
          end
          page_num += 1
        end
      end

      def echo_url
        SolrEnvironments[@environment][:echo_url]
      end

      def get_results_from_echo(page_num)
        get_results build_request(@page_size, page_num), './/results/result', 'application/echo10+xml'
      end

      def get_docs_with_translated_entries_from_echo(entries)
        docs = []
        entries.each { |r| docs.push(create_new_solr_add_doc_with_child(@translator.translate(r).root)) }
        docs
      end

      def build_request(max_records = '25', page_num = '1')
        echo_url + '?page_size=' + max_records.to_s + '&page_num=' + page_num.to_s
      end
    end
  end
end
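Paging sketch (example only, not part of the diff): build_request simply appends page_size and page_num to the configured :echo_url, and harvest_echo_into_solr walks pages until a request returns no results. Assuming the gem is installed:

require 'search_solr_tools'

echo = SearchSolrTools::Harvesters::Echo.new('development')
echo.build_request(1000, 1)   # => "<echo_url>?page_size=1000&page_num=1"
echo.build_request(1000, 2)   # => "<echo_url>?page_size=1000&page_num=2"
# Each page is fetched with Base#get_results using the './/results/result' XPath
# and the 'application/echo10+xml' content type, then translated and inserted.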
data/lib/search_solr_tools/harvesters/eol.rb
@@ -0,0 +1,53 @@
require_relative 'base'
require 'json'
require 'rgeo/geo_json'

module SearchSolrTools
  module Harvesters
    class Eol < Base
      def initialize(env = 'development', die_on_failure = false)
        super env, die_on_failure
        @translator = SearchSolrTools::Translators::EolToSolr.new
      end

      def harvest_and_delete
        puts 'Running harvest of EOL catalog using the following configured EOL URLs:'
        SearchSolrTools::SolrEnvironments[:common][:eol].each { |x| puts x }
        super(method(:harvest_eol_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:EOL][:long_name]}\"")
      end

      def harvest_eol_into_solr
        solr_add_queries = eol_dataset_urls.map do |dataset|
          begin
            doc = open_xml_document(dataset)
            if doc.xpath('//xmlns:metadata').size > 1
              # THREDDS allows for a dataset of datasests, EOL should not utilize this
              fail "Complex dataset encountered at #{doc.xpath('//xmlns:catalog').to_html}"
            end
            metadata_doc = open_xml_document(doc.xpath('//xmlns:metadata')[0]['xlink:href'])
            { 'add' => { 'doc' => @translator.translate(doc, metadata_doc) } }
          rescue => e
            puts "ERROR: #{e}"
            puts "Failed to translate this record: #{doc} -> #{metadata_doc}"
            raise e if @die_on_failure
            next
          end
        end
        insert_solr_docs solr_add_queries, Base::JSON_CONTENT_TYPE
      end

      def eol_dataset_urls
        SearchSolrTools::SolrEnvironments[:common][:eol].flat_map do |endpoint|
          doc = open_xml_document(endpoint)
          doc.xpath('//xmlns:catalogRef').map { |node| node['xlink:href'] }
        end
      end

      def open_xml_document(url)
        Nokogiri::XML(open(url)) do |config|
          config.strict
        end
      end
    end
  end
end
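A small self-contained illustration (not part of the diff) of the node extraction that eol_dataset_urls performs on each configured THREDDS catalog; the catalog fragment and its URLs are made up for the example.

require 'nokogiri'

catalog_xml = <<-XML
<catalog xmlns="http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0"
         xmlns:xlink="http://www.w3.org/1999/xlink">
  <catalogRef xlink:href="http://example.org/thredds/dataset1.xml"/>
  <catalogRef xlink:href="http://example.org/thredds/dataset2.xml"/>
</catalog>
XML

doc = Nokogiri::XML(catalog_xml)
# Same XPath and attribute lookup the harvester uses to collect dataset URLs;
# each URL is then opened, its single //xmlns:metadata xlink:href followed, and
# the pair translated into one JSON 'add' document.
doc.xpath('//xmlns:catalogRef').map { |node| node['xlink:href'] }
# => ["http://example.org/thredds/dataset1.xml", "http://example.org/thredds/dataset2.xml"]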
data/lib/search_solr_tools/harvesters/ices.rb
@@ -0,0 +1,55 @@
module SearchSolrTools
  module Harvesters
    # Harvests data from ICES and inserts it into Solr after it has been translated
    class Ices < Base
      def initialize(env = 'development', die_on_failure = false)
        super env, die_on_failure
        @page_size = 100
        @translator = Helpers::IsoToSolr.new :ices
      end

      def harvest_and_delete
        puts "Running harvest of ICES catalog from #{ices_url}"
        super(method(:harvest_ices_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:ICES][:long_name]}\"")
      end

      # get translated entries from ICES and add them to Solr
      # this is the main entry point for the class
      def harvest_ices_into_solr
        start_index = 1
        while (entries = get_results_from_ices(start_index)) && (entries.length > 0)
          begin
            insert_solr_docs get_docs_with_translated_entries_from_ices(entries)
          rescue => e
            puts "ERROR: #{e}"
            raise e if @die_on_failure
          end
          start_index += @page_size
        end
      end

      def ices_url
        SolrEnvironments[@environment][:ices_url]
      end

      def get_results_from_ices(start_index)
        get_results build_csw_request('results', @page_size, start_index), '//gmd:MD_Metadata'
      end

      def get_docs_with_translated_entries_from_ices(entries)
        docs = []
        entries.each { |r| docs.push(create_new_solr_add_doc_with_child(@translator.translate(r).root)) }
        docs
      end

      def build_csw_request(resultType = 'results', maxRecords = '25', startPosition = '1')
        Helpers::CswIsoQueryBuilder.get_query_string(ices_url,
                                                     'resultType' => resultType,
                                                     'maxRecords' => maxRecords,
                                                     'startPosition' => startPosition,
                                                     'constraintLanguage' => 'CQL_TEXT'
        )
      end
    end
  end
end
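Paging sketch (example only, not part of the diff): the ICES harvester walks the CSW endpoint in steps of @page_size, building each GetRecords URL with Helpers::CswIsoQueryBuilder and stopping when a page yields no //gmd:MD_Metadata nodes. Building the URLs does not hit the network; the gem only needs to be installed with an :ices_url configured.

require 'search_solr_tools'

ices = SearchSolrTools::Harvesters::Ices.new('development')
first_page  = ices.build_csw_request('results', 100, 1)    # records 1..100
second_page = ices.build_csw_request('results', 100, 101)  # records 101..200
# Each URL is fetched via Base#get_results with the default 'application/xml'
# content type; the returned gmd:MD_Metadata nodes are translated with the
# :ices IsoToSolr selector and wrapped in <add> docs before being POSTed to Solr.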