search_solr_tools 3.7.1 → 3.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +65 -4
- data/bin/search_solr_tools +4 -6
- data/lib/search_solr_tools/config/environments.yaml +4 -4
- data/lib/search_solr_tools/harvesters/adc.rb +47 -0
- data/lib/search_solr_tools/harvesters/base.rb +1 -1
- data/lib/search_solr_tools/harvesters/bcodmo.rb +7 -1
- data/lib/search_solr_tools/harvesters/echo.rb +4 -4
- data/lib/search_solr_tools/harvesters/eol.rb +1 -1
- data/lib/search_solr_tools/harvesters/gtnp.rb +1 -1
- data/lib/search_solr_tools/harvesters/ices.rb +3 -3
- data/lib/search_solr_tools/harvesters/nodc.rb +3 -3
- data/lib/search_solr_tools/harvesters/nsidc_json.rb +1 -1
- data/lib/search_solr_tools/harvesters/oai.rb +3 -3
- data/lib/search_solr_tools/harvesters/tdar.rb +19 -7
- data/lib/search_solr_tools/harvesters/usgs.rb +3 -3
- data/lib/search_solr_tools/helpers/selectors.rb +1 -1
- data/lib/search_solr_tools/helpers/solr_format.rb +2 -2
- data/lib/search_solr_tools/selectors/adc.rb +95 -0
- data/lib/search_solr_tools/translators/bcodmo_json.rb +23 -6
- data/lib/search_solr_tools/translators/gtnp_json.rb +0 -1
- data/lib/search_solr_tools/version.rb +1 -1
- data/search_solr_tools.gemspec +1 -1
- metadata +7 -7
- data/lib/search_solr_tools/harvesters/cisl.rb +0 -65
- data/lib/search_solr_tools/selectors/cisl.rb +0 -112
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2c338059d0ec2049e415517a2e6b9b89df18cdd1
|
4
|
+
data.tar.gz: 3cb8c2c60e269ec9b66c305730522b7da3ed8243
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5334498934992c587a414e3672f98130b4125d7013035a1d35cbd335eecf248a5438696255da3f456fd51b6f7fef48b3e79ecd199a5af9200cc06c4aca7f563d
|
7
|
+
data.tar.gz: 303c74385390406d79ff379efbc44a22405427d512d8e369a685f851cc50a77d54dc07229eb105529059e06f3b5f4fb4b7a70e1902a3817f9ccad424c2a9d71d
|
data/CHANGELOG.md
CHANGED
@@ -1,24 +1,85 @@
|
|
1
|
-
## v3.
|
1
|
+
## v3.8.0 (2017-03-28)
|
2
|
+
|
3
|
+
Changes
|
4
|
+
|
5
|
+
- Change ECHO harvester to harvest 100 records at a time, rather than 1000 to
|
6
|
+
avoid timeout/hanging issues with the large requests.
|
7
|
+
- Change "CISL"/ACADIS Gateway harvester to "NSF Arctic Data Center";
|
8
|
+
aoncadis.org redirects to another site, and the data center's name was
|
9
|
+
changed. The feed format was also changed; the harvester was updated to
|
10
|
+
consume the new feed.
|
11
|
+
|
12
|
+
Bugfixes
|
13
|
+
|
14
|
+
- Update NODC feed URL to use https.
|
15
|
+
- Update RDA feed URL to use https.
|
16
|
+
- Update handling of geometries to match new format provided by BCO-DMO feed.
|
17
|
+
- Update NMI feed URL; the feed was relocated.
|
18
|
+
- Harvesting tDAR starts from record 0 instead of record 1.
|
19
|
+
- tDAR harvester no longer attempts to obtain another page of records after
|
20
|
+
all the records have been harvested; where other feeds return an empty
|
21
|
+
response that our harvester handles without issue, tDAR throws an error if
|
22
|
+
the "startRecord" parameter is higher than their last record.
|
23
|
+
- Exit with a non-0 status when a problem with the whole feed is encountered,
|
24
|
+
even if `--die-on-failure` is not passed. That flag should only cause
|
25
|
+
failures when there are issues with individual records; we don't want
|
26
|
+
harvesting to stop due to a metadata issue with a small number of
|
27
|
+
records.
|
28
|
+
- Include BCO-DMO URL in the harvester output the same way all the other URLs
|
29
|
+
are displayed.
|
30
|
+
|
31
|
+
## v3.7.1 (2016-05-18)
|
32
|
+
|
33
|
+
- RuboCop fixes.
|
34
|
+
|
35
|
+
## v3.7.0 (2016-05-18)
|
2
36
|
|
3
37
|
New Features
|
4
38
|
|
39
|
+
- Add sponsored programs to NSIDC harvesting.
|
5
40
|
- Add support for ingesting Data Access Links from NSIDC JSON
|
6
41
|
|
42
|
+
Bugfixes
|
43
|
+
|
44
|
+
- Fix dependency issue with gem "listen".
|
45
|
+
- Fix bad configuration for OAI feed URLs.
|
46
|
+
|
47
|
+
## v3.5.1 (2016-02-15)
|
48
|
+
|
49
|
+
Bugfixes
|
50
|
+
|
51
|
+
- Add temporal duration facet for GTN-P data center.
|
52
|
+
|
53
|
+
## v3.5.0 (2016-02-11)
|
54
|
+
|
55
|
+
Changes
|
56
|
+
|
57
|
+
- Update long name for GTN-P data center.
|
58
|
+
|
59
|
+
## v3.4.0 (2016-02-11)
|
60
|
+
|
61
|
+
New Features
|
62
|
+
|
63
|
+
- Add harvester for GTN-P.
|
64
|
+
|
65
|
+
## v3.3.4 (2016-02-08)
|
66
|
+
|
67
|
+
See v3.4.0.
|
7
68
|
|
8
|
-
## v3.3.
|
69
|
+
## v3.3.3 (2016-01-14)
|
9
70
|
|
10
71
|
Bugfix
|
11
72
|
|
12
73
|
- Added quote checking for cisl offset parsing check
|
13
74
|
|
14
|
-
## v3.3.1
|
75
|
+
## v3.3.1 (2015-09-25)
|
15
76
|
|
16
77
|
Bugfix
|
17
78
|
|
18
79
|
- Remove strange facet string for temporal duration from NOAA Paleo search
|
19
80
|
results.
|
20
81
|
|
21
|
-
## v3.3.0
|
82
|
+
## v3.3.0 (2015-09-24)
|
22
83
|
|
23
84
|
New Features
|
24
85
|
|
data/bin/search_solr_tools
CHANGED
@@ -19,11 +19,11 @@ class SolrHarvestCLI < Thor
|
|
19
19
|
puts target
|
20
20
|
begin
|
21
21
|
harvest_class = get_harvester_class(target)
|
22
|
-
harvester = harvest_class.new
|
22
|
+
harvester = harvest_class.new(options[:environment], die_on_failure)
|
23
23
|
harvester.harvest_and_delete
|
24
24
|
rescue => e
|
25
25
|
puts "harvest failed for #{target}: #{e.message}"
|
26
|
-
raise e
|
26
|
+
raise e
|
27
27
|
end
|
28
28
|
end
|
29
29
|
end
|
@@ -62,11 +62,10 @@ class SolrHarvestCLI < Thor
|
|
62
62
|
end
|
63
63
|
|
64
64
|
no_tasks do
|
65
|
-
# rubocop: disable MethodLength
|
66
65
|
def harvester_map
|
67
66
|
{
|
68
67
|
'bco_dmo' => SearchSolrTools::Harvesters::BcoDmo,
|
69
|
-
'
|
68
|
+
'adc' => SearchSolrTools::Harvesters::Adc,
|
70
69
|
'data_one' => SearchSolrTools::Harvesters::DataOne,
|
71
70
|
'echo' => SearchSolrTools::Harvesters::Echo,
|
72
71
|
'eol' => SearchSolrTools::Harvesters::Eol,
|
@@ -85,11 +84,10 @@ class SolrHarvestCLI < Thor
|
|
85
84
|
'ade_auto_suggest' => SearchSolrTools::Harvesters::AdeAutoSuggest
|
86
85
|
}
|
87
86
|
end
|
88
|
-
# rubocop: enable MethodLength
|
89
87
|
|
90
88
|
def get_harvester_class(data_center_name)
|
91
89
|
name = data_center_name.downcase.to_s
|
92
|
-
|
90
|
+
fail("Invalid data center #{name}") unless harvester_map.key?(name)
|
93
91
|
|
94
92
|
harvester_map[name]
|
95
93
|
end
|
@@ -4,7 +4,7 @@
|
|
4
4
|
:collection_path: solr
|
5
5
|
:port: 8983
|
6
6
|
:bcodmo_url: http://www.bco-dmo.org/nsidc/arctic-deployments.json
|
7
|
-
:
|
7
|
+
:adc_url: https://arcticdata.io/metacat/d1/mn/v2/query/solr/
|
8
8
|
:data_one_url: https://cn.dataone.org/cn/v1/query/solr/select?q=northBoundCoord:%5B45.0%20TO%2090.0%5D
|
9
9
|
:echo_url: https://api.echo.nasa.gov/catalog-rest/echo_catalog/datasets.echo10
|
10
10
|
:gtnp:
|
@@ -12,10 +12,10 @@
|
|
12
12
|
- http://www.gtnpdatabase.org/rest/activelayers/json
|
13
13
|
:ices_url: http://geo.ices.dk/geonetwork/srv/en/csw
|
14
14
|
:ncdc_paleo_url: http://gis.ncdc.noaa.gov/gptpaleo/csw
|
15
|
-
:nmi_url: http://
|
16
|
-
:nodc_url:
|
15
|
+
:nmi_url: http://arcticdata.met.no/metamod/oai
|
16
|
+
:nodc_url: https://data.nodc.noaa.gov/geoportal/csw
|
17
17
|
:pdc_url: http://www.polardata.ca/oai/provider
|
18
|
-
:rda_url:
|
18
|
+
:rda_url: https://rda.ucar.edu/cgi-bin/oai
|
19
19
|
:tdar_url: http://core.tdar.org/search/rss
|
20
20
|
:usgs_url: https://www.sciencebase.gov/catalog/item/527cf4ede4b0850ea05182ee/csw
|
21
21
|
:eol:
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module SearchSolrTools
|
2
|
+
module Harvesters
|
3
|
+
class Adc < Base
|
4
|
+
def initialize(env = 'development', die_on_failure = false)
|
5
|
+
super
|
6
|
+
@page_size = 250
|
7
|
+
@translator = Helpers::IsoToSolr.new :adc
|
8
|
+
end
|
9
|
+
|
10
|
+
def harvest_and_delete
|
11
|
+
puts "Running harvest of adc catalog from #{metadata_url}"
|
12
|
+
super(method(:harvest_adc_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:ADC][:long_name]}\"")
|
13
|
+
end
|
14
|
+
|
15
|
+
def harvest_adc_into_solr
|
16
|
+
start = 0
|
17
|
+
while (entries = get_results_from_adc(start)) && (entries.length > 0)
|
18
|
+
begin
|
19
|
+
insert_solr_docs(get_docs_with_translated_entries_from_adc(entries))
|
20
|
+
rescue => e
|
21
|
+
puts "ERROR: #{e}\n\n"
|
22
|
+
raise e if @die_on_failure
|
23
|
+
end
|
24
|
+
start += @page_size
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
def get_results_from_adc(start)
|
29
|
+
get_results(build_request(start, @page_size), './response/result/doc')
|
30
|
+
end
|
31
|
+
|
32
|
+
def metadata_url
|
33
|
+
SolrEnvironments[@environment][:adc_url]
|
34
|
+
end
|
35
|
+
|
36
|
+
def get_docs_with_translated_entries_from_adc(entries)
|
37
|
+
entries.map do |e|
|
38
|
+
create_new_solr_add_doc_with_child(@translator.translate(e).root)
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
def build_request(start = 0, max_records = 100)
|
43
|
+
"#{metadata_url}?q=*:*&start=#{start}&rows=#{max_records}"
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
@@ -81,7 +81,7 @@ module SearchSolrTools
|
|
81
81
|
end
|
82
82
|
puts "#{success} document#{success == 1 ? '' : 's'} successfully added to Solr."
|
83
83
|
puts "#{failure} document#{failure == 1 ? '' : 's'} not added to Solr."
|
84
|
-
|
84
|
+
fail 'Some documents failed to be inserted into Solr' if failure > 0
|
85
85
|
end
|
86
86
|
|
87
87
|
def insert_solr_doc(doc, content_type = XML_CONTENT_TYPE, core = SolrEnvironments[@environment][:collection_name])
|
@@ -12,13 +12,18 @@ module SearchSolrTools
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def harvest_and_delete
|
15
|
+
puts "Running harvest of BCO-DMO catalog from #{bcodmo_url}"
|
15
16
|
super(method(:harvest_bcodmo_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:BCODMO][:long_name]}\"")
|
16
17
|
end
|
17
18
|
|
19
|
+
def bcodmo_url
|
20
|
+
SolrEnvironments[@environment][:bcodmo_url]
|
21
|
+
end
|
22
|
+
|
18
23
|
def harvest_bcodmo_into_solr
|
19
24
|
result = translate_bcodmo
|
20
25
|
insert_solr_docs result[:add_docs], Base::JSON_CONTENT_TYPE
|
21
|
-
|
26
|
+
fail 'Failed to harvest some records from the provider' if result[:failure_ids].length > 0
|
22
27
|
end
|
23
28
|
|
24
29
|
def translate_bcodmo
|
@@ -34,6 +39,7 @@ module SearchSolrTools
|
|
34
39
|
end
|
35
40
|
|
36
41
|
def request_json(url)
|
42
|
+
puts "Request: #{url}"
|
37
43
|
JSON.parse(RestClient.get(url))
|
38
44
|
end
|
39
45
|
|
@@ -4,7 +4,7 @@ module SearchSolrTools
|
|
4
4
|
class Echo < Base
|
5
5
|
def initialize(env = 'development', die_on_failure = false)
|
6
6
|
super env, die_on_failure
|
7
|
-
@page_size =
|
7
|
+
@page_size = 100
|
8
8
|
@translator = Helpers::IsoToSolr.new :echo
|
9
9
|
end
|
10
10
|
|
@@ -37,9 +37,9 @@ module SearchSolrTools
|
|
37
37
|
end
|
38
38
|
|
39
39
|
def get_docs_with_translated_entries_from_echo(entries)
|
40
|
-
|
41
|
-
|
42
|
-
|
40
|
+
entries.map do |entry|
|
41
|
+
create_new_solr_add_doc_with_child(@translator.translate(entry).root)
|
42
|
+
end
|
43
43
|
end
|
44
44
|
|
45
45
|
def build_request(max_records = '25', page_num = '1')
|
@@ -22,7 +22,7 @@ module SearchSolrTools
|
|
22
22
|
doc = open_xml_document(dataset)
|
23
23
|
if doc.xpath('//xmlns:metadata').size > 1
|
24
24
|
# THREDDS allows for a dataset of datasets, EOL should not utilize this
|
25
|
-
|
25
|
+
fail "Complex dataset encountered at #{doc.xpath('//xmlns:catalog').to_html}"
|
26
26
|
end
|
27
27
|
metadata_doc = open_xml_document(doc.xpath('//xmlns:metadata')[0]['xlink:href'])
|
28
28
|
{ 'add' => { 'doc' => @translator.translate(doc, metadata_doc) } }
|
@@ -28,7 +28,7 @@ module SearchSolrTools
|
|
28
28
|
def harvest_gtnp_into_solr
|
29
29
|
result = translate_gtnp
|
30
30
|
insert_solr_docs result[:add_docs], Base::JSON_CONTENT_TYPE
|
31
|
-
|
31
|
+
fail 'Failed to harvest some records from the provider' if result[:failure_ids].length > 0
|
32
32
|
end
|
33
33
|
|
34
34
|
def translate_gtnp
|
@@ -37,9 +37,9 @@ module SearchSolrTools
|
|
37
37
|
end
|
38
38
|
|
39
39
|
def get_docs_with_translated_entries_from_ices(entries)
|
40
|
-
|
41
|
-
|
42
|
-
|
40
|
+
entries.map do |entry|
|
41
|
+
create_new_solr_add_doc_with_child(@translator.translate(entry).root)
|
42
|
+
end
|
43
43
|
end
|
44
44
|
|
45
45
|
def build_csw_request(resultType = 'results', maxRecords = '25', startPosition = '1')
|
@@ -37,9 +37,9 @@ module SearchSolrTools
|
|
37
37
|
end
|
38
38
|
|
39
39
|
def get_docs_with_translated_entries_from_nodc(entries)
|
40
|
-
|
41
|
-
|
42
|
-
|
40
|
+
entries.map do |entry|
|
41
|
+
create_new_solr_add_doc_with_child(@translator.translate(entry).root)
|
42
|
+
end
|
43
43
|
end
|
44
44
|
|
45
45
|
def build_csw_request(resultType = 'results', maxRecords = '25', startPosition = '1')
|
@@ -23,7 +23,7 @@ module SearchSolrTools
|
|
23
23
|
def harvest_nsidc_json_into_solr
|
24
24
|
result = docs_with_translated_entries_from_nsidc
|
25
25
|
insert_solr_docs result[:add_docs], Base::JSON_CONTENT_TYPE
|
26
|
-
|
26
|
+
fail 'Failed to harvest and insert some authoritative IDs' if result[:failure_ids].length > 0
|
27
27
|
end
|
28
28
|
|
29
29
|
def nsidc_json_url
|
@@ -34,11 +34,11 @@ module SearchSolrTools
|
|
34
34
|
end
|
35
35
|
|
36
36
|
def results
|
37
|
-
|
37
|
+
fail NotImplementedError
|
38
38
|
end
|
39
39
|
|
40
40
|
def metadata_url
|
41
|
-
|
41
|
+
fail NotImplementedError
|
42
42
|
end
|
43
43
|
|
44
44
|
def translated_docs(entries)
|
@@ -48,7 +48,7 @@ module SearchSolrTools
|
|
48
48
|
private
|
49
49
|
|
50
50
|
def request_params
|
51
|
-
|
51
|
+
fail NotImplementedError
|
52
52
|
end
|
53
53
|
|
54
54
|
def request_string
|
@@ -14,14 +14,22 @@ module SearchSolrTools
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def harvest_tdar_into_solr
|
17
|
-
start_record =
|
17
|
+
start_record = 0
|
18
|
+
total_harvested = 0
|
19
|
+
total_expected = total_results
|
18
20
|
while (entries = get_results_from_tdar(start_record)) && (entries.length > 0)
|
19
21
|
begin
|
20
|
-
insert_solr_docs
|
22
|
+
insert_solr_docs(get_docs_with_translated_entries_from_tdar(entries))
|
21
23
|
rescue => e
|
22
24
|
puts "ERROR: #{e}\n\n"
|
23
25
|
raise e if @die_on_failure
|
24
26
|
end
|
27
|
+
|
28
|
+
# if we have all the records we expect, don't attempt another request;
|
29
|
+
# it would result in an error
|
30
|
+
total_harvested += entries.length
|
31
|
+
break if total_harvested >= total_expected
|
32
|
+
|
25
33
|
start_record += @page_size
|
26
34
|
end
|
27
35
|
end
|
@@ -31,16 +39,16 @@ module SearchSolrTools
|
|
31
39
|
end
|
32
40
|
|
33
41
|
def get_results_from_tdar(start_record)
|
34
|
-
get_results
|
42
|
+
get_results(build_request(@page_size, start_record), './/atom:entry', 'application/xml')
|
35
43
|
end
|
36
44
|
|
37
45
|
def get_docs_with_translated_entries_from_tdar(entries)
|
38
|
-
|
39
|
-
|
40
|
-
|
46
|
+
entries.map do |entry|
|
47
|
+
create_new_solr_add_doc_with_child(@translator.translate(entry).root)
|
48
|
+
end
|
41
49
|
end
|
42
50
|
|
43
|
-
def build_request(max_records = '25', start_record = '
|
51
|
+
def build_request(max_records = '25', start_record = '0')
|
44
52
|
request_url = tdar_url + '?_tDAR.searchType=ACADIS_RSS&'\
|
45
53
|
'resourceTypes=DATASET&'\
|
46
54
|
'groups[0].latitudeLongitudeBoxes[0].maximumLongitude=180&'\
|
@@ -52,6 +60,10 @@ module SearchSolrTools
|
|
52
60
|
|
53
61
|
request_url
|
54
62
|
end
|
63
|
+
|
64
|
+
def total_results
|
65
|
+
get_results(build_request(0, 0), './/opensearch:totalResults').text.to_i
|
66
|
+
end
|
55
67
|
end
|
56
68
|
end
|
57
69
|
end
|
@@ -37,9 +37,9 @@ module SearchSolrTools
|
|
37
37
|
end
|
38
38
|
|
39
39
|
def get_docs_with_translated_entries_from_usgs(entries)
|
40
|
-
|
41
|
-
|
42
|
-
|
40
|
+
entries.map do |entry|
|
41
|
+
create_new_solr_add_doc_with_child(@translator.translate(entry).root)
|
42
|
+
end
|
43
43
|
end
|
44
44
|
|
45
45
|
def build_csw_request(resultType = 'results', maxRecords = '25', startPosition = '1')
|
@@ -6,7 +6,7 @@ module SearchSolrTools
|
|
6
6
|
# This hash grabs all the selector files inside the selectors directory,
|
7
7
|
# to add a new source we need to create a selector file and add it to this hash.
|
8
8
|
SELECTORS = {
|
9
|
-
|
9
|
+
adc: Selectors::ADC,
|
10
10
|
data_one: Selectors::DATA_ONE,
|
11
11
|
echo: Selectors::ECHO,
|
12
12
|
ices: Selectors::ICES,
|
@@ -10,7 +10,7 @@ module SearchSolrTools
|
|
10
10
|
module SolrFormat
|
11
11
|
DATA_CENTER_NAMES = {
|
12
12
|
BCODMO: { short_name: 'BCO-DMO', long_name: 'Biological and Chemical Oceanography Data Management Office' },
|
13
|
-
|
13
|
+
ADC: { short_name: 'NSF ADC', long_name: 'NSF Arctic Data Center' },
|
14
14
|
DATA_ONE: { short_name: 'DataONE', long_name: 'DataONE' },
|
15
15
|
ECHO: { short_name: 'NASA ECHO', long_name: 'NASA Earth Observing System (EOS) Clearing House (ECHO)' },
|
16
16
|
EOL: { short_name: 'UCAR NCAR EOL', long_name: 'UCAR NCAR - Earth Observing Laboratory' },
|
@@ -134,7 +134,7 @@ module SearchSolrTools
|
|
134
134
|
j = send(find_index_method, resolution['max_resolution'])
|
135
135
|
return resolution_values[i..j]
|
136
136
|
end
|
137
|
-
|
137
|
+
fail "Invalid resolution #{resolution['type']}"
|
138
138
|
end
|
139
139
|
|
140
140
|
def self.resolution_not_specified?(resolution)
|
@@ -0,0 +1,95 @@
|
|
1
|
+
require 'search_solr_tools'
|
2
|
+
|
3
|
+
module SearchSolrTools
|
4
|
+
module Selectors
|
5
|
+
ADC = {
|
6
|
+
authoritative_id: {
|
7
|
+
xpaths: ['.//str[@name="id"]'],
|
8
|
+
multivalue: false
|
9
|
+
},
|
10
|
+
title: {
|
11
|
+
xpaths: ['.//str[@name="title"]'],
|
12
|
+
multivalue: false
|
13
|
+
},
|
14
|
+
summary: {
|
15
|
+
xpaths: ['.//str[@name="abstract"]'],
|
16
|
+
multivalue: false
|
17
|
+
},
|
18
|
+
data_centers: {
|
19
|
+
xpaths: [''],
|
20
|
+
default_values: [Helpers::SolrFormat::DATA_CENTER_NAMES[:ADC][:long_name]],
|
21
|
+
multivalue: false
|
22
|
+
},
|
23
|
+
authors: {
|
24
|
+
xpaths: ['.//str[@name="author"]'],
|
25
|
+
multivalue: false
|
26
|
+
},
|
27
|
+
keywords: {
|
28
|
+
xpaths: ['.//arr[@name="keywords"]/str'],
|
29
|
+
multivalue: true
|
30
|
+
},
|
31
|
+
last_revision_date: {
|
32
|
+
xpaths: ['.//date[@name="updateDate"]'],
|
33
|
+
default_values: [Helpers::SolrFormat.date_str(DateTime.now)], # formats the date into ISO8601 as in http://lucene.apache.org/solr/4_4_0/solr-core/org/apache/solr/schema/DateField.html
|
34
|
+
multivalue: false,
|
35
|
+
format: Helpers::SolrFormat::DATE
|
36
|
+
},
|
37
|
+
dataset_url: {
|
38
|
+
xpaths: ['.//str[@name="dataUrl"]'],
|
39
|
+
default_values: [''],
|
40
|
+
multivalue: false
|
41
|
+
},
|
42
|
+
spatial_coverages: {
|
43
|
+
xpaths: ['.'],
|
44
|
+
multivalue: false,
|
45
|
+
format: Helpers::DataOneFormat.method(:spatial_display)
|
46
|
+
},
|
47
|
+
spatial: {
|
48
|
+
xpaths: ['.'],
|
49
|
+
multivalue: false,
|
50
|
+
format: Helpers::DataOneFormat.method(:spatial_index)
|
51
|
+
},
|
52
|
+
spatial_area: {
|
53
|
+
xpaths: ['.'],
|
54
|
+
multivalue: false,
|
55
|
+
format: Helpers::DataOneFormat.method(:spatial_area)
|
56
|
+
},
|
57
|
+
temporal_coverages: {
|
58
|
+
xpaths: ['.'],
|
59
|
+
multivalue: false,
|
60
|
+
format: Helpers::DataOneFormat.method(:temporal_coverage)
|
61
|
+
},
|
62
|
+
temporal_duration: {
|
63
|
+
xpaths: ['.'],
|
64
|
+
multivalue: false,
|
65
|
+
format: Helpers::DataOneFormat.method(:temporal_duration)
|
66
|
+
},
|
67
|
+
temporal: {
|
68
|
+
xpaths: ['.'],
|
69
|
+
multivalue: false,
|
70
|
+
format: Helpers::DataOneFormat.method(:temporal_index_string)
|
71
|
+
},
|
72
|
+
source: {
|
73
|
+
xpaths: [''],
|
74
|
+
default_values: ['ADE'],
|
75
|
+
multivalue: false
|
76
|
+
},
|
77
|
+
facet_data_center: {
|
78
|
+
xpaths: [''],
|
79
|
+
default_values: ["#{Helpers::SolrFormat::DATA_CENTER_NAMES[:ADC][:long_name]} | #{Helpers::SolrFormat::DATA_CENTER_NAMES[:ADC][:short_name]}"],
|
80
|
+
multivalue: false
|
81
|
+
},
|
82
|
+
facet_spatial_scope: {
|
83
|
+
xpaths: ['.'],
|
84
|
+
multivalue: false,
|
85
|
+
format: Helpers::DataOneFormat.method(:facet_spatial_scope)
|
86
|
+
},
|
87
|
+
facet_temporal_duration: {
|
88
|
+
xpaths: ['.'],
|
89
|
+
default_values: [Helpers::SolrFormat::NOT_SPECIFIED],
|
90
|
+
format: Helpers::DataOneFormat.method(:facet_temporal_duration),
|
91
|
+
multivalue: false
|
92
|
+
}
|
93
|
+
}
|
94
|
+
end
|
95
|
+
end
|
@@ -50,12 +50,29 @@ module SearchSolrTools
|
|
50
50
|
end
|
51
51
|
|
52
52
|
def translate_geometry(wkt_geom)
|
53
|
-
wkt_geom['geometry']
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
53
|
+
if wkt_geom['geometry']['type'] == 'LineString'
|
54
|
+
wkt_geom['geometry']['type'] = 'MultiPoint'
|
55
|
+
end
|
56
|
+
geometry = RGeo::GeoJSON.decode(wkt_geom).geometry
|
57
|
+
geometry = RGeo::Feature.cast(geometry, RGeo::Feature::MultiPoint)
|
58
|
+
|
59
|
+
# This feed sometimes returns MultiLineString but wrongly calls them 'LineString'
|
60
|
+
# If the above fails, we assume this is why. If the feed gets fixed, this code
|
61
|
+
# should still handle that.
|
62
|
+
if geometry.nil? || geometry.num_geometries == 0
|
63
|
+
# Try to decode as an actual MultiLineString.
|
64
|
+
wkt_geom['geometry']['type'] = 'MultiLineString'
|
65
|
+
geometry = RGeo::GeoJSON.decode(wkt_geom).geometry
|
66
|
+
|
67
|
+
# Convert to a MultiPoint, for passing into the helper functions below.
|
68
|
+
coords = geometry.coordinates.flatten
|
69
|
+
coords = coords.each_slice(2).to_a
|
70
|
+
f = RGeo::Geos.factory
|
71
|
+
points = []
|
72
|
+
coords.each { |x, y| points << f.point(x, y) }
|
73
|
+
geometry = f.multi_point(points)
|
74
|
+
end
|
75
|
+
|
59
76
|
{
|
60
77
|
spatial_display: Helpers::TranslateSpatialCoverage.geojson_to_spatial_display_str([geometry]),
|
61
78
|
spatial_index: Helpers::TranslateSpatialCoverage.geojson_to_spatial_index_str([geometry]),
|
@@ -8,7 +8,6 @@ module SearchSolrTools
|
|
8
8
|
module Translators
|
9
9
|
# Translates GTN-P json to solr json format
|
10
10
|
class GtnpJsonToSolr
|
11
|
-
# rubocop:disable Metrics/MethodLength
|
12
11
|
# rubocop:disable AbcSize
|
13
12
|
def translate(json_doc, json_record)
|
14
13
|
json_geo = json_doc['geo'].nil? ? json_doc['coordinates'] : json_doc['geo']['coordinates']
|
data/search_solr_tools.gemspec
CHANGED
@@ -40,7 +40,7 @@ Gem::Specification.new do |spec|
|
|
40
40
|
spec.add_development_dependency 'guard-rubocop', '~> 1.2'
|
41
41
|
spec.add_development_dependency 'rake', '~> 10.4'
|
42
42
|
spec.add_development_dependency 'rspec', '~> 3.2'
|
43
|
-
spec.add_development_dependency 'rubocop', '~> 0.32'
|
43
|
+
spec.add_development_dependency 'rubocop', '~> 0.32.1'
|
44
44
|
spec.add_development_dependency 'webmock', '~> 1.13'
|
45
45
|
spec.add_development_dependency 'listen', '3.0.5'
|
46
46
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: search_solr_tools
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.
|
4
|
+
version: 3.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Chalstrom
|
@@ -12,7 +12,7 @@ authors:
|
|
12
12
|
autorequire:
|
13
13
|
bindir: bin
|
14
14
|
cert_chain: []
|
15
|
-
date:
|
15
|
+
date: 2017-03-28 00:00:00.000000000 Z
|
16
16
|
dependencies:
|
17
17
|
- !ruby/object:Gem::Dependency
|
18
18
|
name: iso8601
|
@@ -230,14 +230,14 @@ dependencies:
|
|
230
230
|
requirements:
|
231
231
|
- - "~>"
|
232
232
|
- !ruby/object:Gem::Version
|
233
|
-
version:
|
233
|
+
version: 0.32.1
|
234
234
|
type: :development
|
235
235
|
prerelease: false
|
236
236
|
version_requirements: !ruby/object:Gem::Requirement
|
237
237
|
requirements:
|
238
238
|
- - "~>"
|
239
239
|
- !ruby/object:Gem::Version
|
240
|
-
version:
|
240
|
+
version: 0.32.1
|
241
241
|
- !ruby/object:Gem::Dependency
|
242
242
|
name: webmock
|
243
243
|
requirement: !ruby/object:Gem::Requirement
|
@@ -288,11 +288,11 @@ files:
|
|
288
288
|
- lib/search_solr_tools.rb
|
289
289
|
- lib/search_solr_tools/config/environments.rb
|
290
290
|
- lib/search_solr_tools/config/environments.yaml
|
291
|
+
- lib/search_solr_tools/harvesters/adc.rb
|
291
292
|
- lib/search_solr_tools/harvesters/ade_auto_suggest.rb
|
292
293
|
- lib/search_solr_tools/harvesters/auto_suggest.rb
|
293
294
|
- lib/search_solr_tools/harvesters/base.rb
|
294
295
|
- lib/search_solr_tools/harvesters/bcodmo.rb
|
295
|
-
- lib/search_solr_tools/harvesters/cisl.rb
|
296
296
|
- lib/search_solr_tools/harvesters/data_one.rb
|
297
297
|
- lib/search_solr_tools/harvesters/echo.rb
|
298
298
|
- lib/search_solr_tools/harvesters/eol.rb
|
@@ -325,7 +325,7 @@ files:
|
|
325
325
|
- lib/search_solr_tools/helpers/translate_spatial_coverage.rb
|
326
326
|
- lib/search_solr_tools/helpers/translate_temporal_coverage.rb
|
327
327
|
- lib/search_solr_tools/helpers/usgs_format.rb
|
328
|
-
- lib/search_solr_tools/selectors/
|
328
|
+
- lib/search_solr_tools/selectors/adc.rb
|
329
329
|
- lib/search_solr_tools/selectors/data_one.rb
|
330
330
|
- lib/search_solr_tools/selectors/echo_iso.rb
|
331
331
|
- lib/search_solr_tools/selectors/ices_iso.rb
|
@@ -363,7 +363,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
363
363
|
version: '0'
|
364
364
|
requirements: []
|
365
365
|
rubyforge_project:
|
366
|
-
rubygems_version: 2.4.
|
366
|
+
rubygems_version: 2.4.8
|
367
367
|
signing_key:
|
368
368
|
specification_version: 4
|
369
369
|
summary: Tools to harvest and manage various scientific dataset feeds in a Solr instance.
|
@@ -1,65 +0,0 @@
|
|
1
|
-
|
2
|
-
module SearchSolrTools
|
3
|
-
module Harvesters
|
4
|
-
# Harvests data from CISL and inserts it into Solr after it has been translated
|
5
|
-
class Cisl < Oai
|
6
|
-
def initialize(env = 'development', die_on_failure = false)
|
7
|
-
super
|
8
|
-
@data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:CISL][:long_name]
|
9
|
-
@translator = Helpers::IsoToSolr.new :cisl
|
10
|
-
|
11
|
-
# Used in query string params, resumptionToken
|
12
|
-
@dataset = '0bdd2d39-3493-4fa2-98f9-6766596bdc50'
|
13
|
-
end
|
14
|
-
|
15
|
-
def metadata_url
|
16
|
-
SolrEnvironments[@environment][:cisl_url]
|
17
|
-
end
|
18
|
-
|
19
|
-
def results
|
20
|
-
list_records_oai_response = get_results(request_string, '//oai:ListRecords', '')
|
21
|
-
|
22
|
-
@resumption_token = list_records_oai_response.xpath('.//oai:resumptionToken', Helpers::IsoNamespaces.namespaces)
|
23
|
-
@resumption_token = format_resumption_token(@resumption_token.first.text)
|
24
|
-
|
25
|
-
list_records_oai_response.xpath('.//oai:record', Helpers::IsoNamespaces.namespaces)
|
26
|
-
end
|
27
|
-
|
28
|
-
private
|
29
|
-
|
30
|
-
def request_params
|
31
|
-
{
|
32
|
-
verb: 'ListRecords',
|
33
|
-
metadataPrefix: 'dif',
|
34
|
-
set: @dataset,
|
35
|
-
resumptionToken: @resumption_token
|
36
|
-
}.delete_if { |_k, v| v.nil? }
|
37
|
-
end
|
38
|
-
|
39
|
-
# The ruby response is lacking quotes, which the token requires in order to work...
|
40
|
-
# Also, the response back seems to be inconsistent - sometimes it adds &quot; instead of '"',
|
41
|
-
# which makes the token fail to work.
|
42
|
-
# To get around this I'd prefer to make assumptions about the token and let it break if
|
43
|
-
# they change the formatting. For now, all fields other than offset should be able to be
|
44
|
-
# assumed to remain constant.
|
45
|
-
# glewis 2016-01-15: It broke, offset has quotes around it, so I updated the regex to account for
|
46
|
-
# the possibility, including '&quot;' or '"'
|
47
|
-
# If the input is empty, then we are done - return an empty string, which is checked for
|
48
|
-
# in the harvest loop.
|
49
|
-
def format_resumption_token(resumption_token)
|
50
|
-
return '' if resumption_token.empty?
|
51
|
-
|
52
|
-
resumption_token =~ /offset(?:"|")?:(\d+)/
|
53
|
-
offset = Regexp.last_match(1)
|
54
|
-
|
55
|
-
{
|
56
|
-
from: nil,
|
57
|
-
until: nil,
|
58
|
-
set: @dataset,
|
59
|
-
metadataPrefix: 'dif',
|
60
|
-
offset: offset
|
61
|
-
}.to_json
|
62
|
-
end
|
63
|
-
end
|
64
|
-
end
|
65
|
-
end
|
@@ -1,112 +0,0 @@
|
|
1
|
-
require 'search_solr_tools'
|
2
|
-
|
3
|
-
module SearchSolrTools
|
4
|
-
module Selectors
|
5
|
-
# The hash contains keys that should map to the fields in the solr schema,
|
6
|
-
# the keys are called selectors and are in charge of selecting the nodes
|
7
|
-
# from the ISO document, applying the default value if none of the xpaths
|
8
|
-
# resolved to a value and formatting the field. xpaths and multivalue are
|
9
|
-
# required, default_value, format, and reduce are optional.
|
10
|
-
#
|
11
|
-
# reduce takes the formatted result of multiple nodes and produces a single
|
12
|
-
# result. This is for fields that are not multivalued, but their value
|
13
|
-
# should consider information from all the nodes (for example, storing
|
14
|
-
# only the maximum duration from multiple temporal coverage fields, taking
|
15
|
-
# the sum of multiple spatial areas)
|
16
|
-
CISL = {
|
17
|
-
authoritative_id: {
|
18
|
-
xpaths: ['.//oai:header/oai:identifier'],
|
19
|
-
multivalue: false
|
20
|
-
},
|
21
|
-
title: {
|
22
|
-
xpaths: ['.//dif:Entry_Title'],
|
23
|
-
multivalue: false
|
24
|
-
},
|
25
|
-
summary: {
|
26
|
-
xpaths: ['.//dif:Summary/dif:Abstract'],
|
27
|
-
multivalue: false
|
28
|
-
},
|
29
|
-
data_centers: {
|
30
|
-
xpaths: [''],
|
31
|
-
default_values: [SearchSolrTools::Helpers::SolrFormat::DATA_CENTER_NAMES[:CISL][:long_name]],
|
32
|
-
multivalue: false
|
33
|
-
},
|
34
|
-
authors: {
|
35
|
-
xpaths: [''],
|
36
|
-
multivalue: true
|
37
|
-
},
|
38
|
-
keywords: {
|
39
|
-
xpaths: [
|
40
|
-
'.//dif:Parameters/dif:Category',
|
41
|
-
'.//dif:Parameters/dif:Topic',
|
42
|
-
'.//dif:Parameters/dif:Term',
|
43
|
-
'.//dif:Parameters/dif:Variable_Level_1'
|
44
|
-
].reverse,
|
45
|
-
multivalue: true
|
46
|
-
},
|
47
|
-
last_revision_date: {
|
48
|
-
xpaths: ['.//dif:Last_DIF_Revision_Date'],
|
49
|
-
default_values: [SearchSolrTools::Helpers::SolrFormat.date_str(DateTime.now)], # formats the date into ISO8601 as in http://lucene.apache.org/solr/4_4_0/solr-core/org/apache/solr/schema/DateField.html
|
50
|
-
multivalue: false,
|
51
|
-
format: SearchSolrTools::Helpers::SolrFormat::DATE
|
52
|
-
},
|
53
|
-
dataset_url: {
|
54
|
-
xpaths: ['.//dif:Related_URL/dif:URL'],
|
55
|
-
multivalue: false
|
56
|
-
},
|
57
|
-
spatial_coverages: {
|
58
|
-
xpaths: ['.//dif:Spatial_Coverage'],
|
59
|
-
multivalue: true,
|
60
|
-
format: SearchSolrTools::Helpers::IsoToSolrFormat::SPATIAL_DISPLAY
|
61
|
-
},
|
62
|
-
spatial: {
|
63
|
-
xpaths: ['.//dif:Spatial_Coverage'],
|
64
|
-
multivalue: true,
|
65
|
-
format: SearchSolrTools::Helpers::IsoToSolrFormat::SPATIAL_INDEX
|
66
|
-
},
|
67
|
-
spatial_area: {
|
68
|
-
xpaths: ['.//dif:Spatial_Coverage'],
|
69
|
-
multivalue: false,
|
70
|
-
reduce: SearchSolrTools::Helpers::IsoToSolrFormat::MAX_SPATIAL_AREA,
|
71
|
-
format: SearchSolrTools::Helpers::IsoToSolrFormat::SPATIAL_AREA
|
72
|
-
},
|
73
|
-
temporal: {
|
74
|
-
xpaths: ['.//dif:Temporal_Coverage'],
|
75
|
-
multivalue: true,
|
76
|
-
format: SearchSolrTools::Helpers::IsoToSolrFormat::TEMPORAL_INDEX_STRING
|
77
|
-
},
|
78
|
-
temporal_coverages: {
|
79
|
-
xpaths: ['.//dif:Temporal_Coverage'],
|
80
|
-
multivalue: true,
|
81
|
-
format: SearchSolrTools::Helpers::IsoToSolrFormat::TEMPORAL_DISPLAY_STRING
|
82
|
-
},
|
83
|
-
temporal_duration: {
|
84
|
-
xpaths: ['.//dif:Temporal_Coverage'],
|
85
|
-
multivalue: false,
|
86
|
-
reduce: SearchSolrTools::Helpers::SolrFormat::REDUCE_TEMPORAL_DURATION,
|
87
|
-
format: SearchSolrTools::Helpers::IsoToSolrFormat::TEMPORAL_DURATION
|
88
|
-
},
|
89
|
-
source: {
|
90
|
-
xpaths: [''],
|
91
|
-
default_values: ['ADE'],
|
92
|
-
multivalue: false
|
93
|
-
},
|
94
|
-
facet_data_center: {
|
95
|
-
xpaths: [''],
|
96
|
-
default_values: ["#{SearchSolrTools::Helpers::SolrFormat::DATA_CENTER_NAMES[:CISL][:long_name]} | #{Helpers::SolrFormat::DATA_CENTER_NAMES[:CISL][:short_name]}"],
|
97
|
-
multivalue: false
|
98
|
-
},
|
99
|
-
facet_spatial_scope: {
|
100
|
-
xpaths: ['.//dif:Spatial_Coverage'],
|
101
|
-
multivalue: true,
|
102
|
-
format: SearchSolrTools::Helpers::IsoToSolrFormat::FACET_SPATIAL_SCOPE
|
103
|
-
},
|
104
|
-
facet_temporal_duration: {
|
105
|
-
xpaths: ['.//dif:Temporal_Coverage'],
|
106
|
-
default_values: [SearchSolrTools::Helpers::SolrFormat::NOT_SPECIFIED],
|
107
|
-
format: SearchSolrTools::Helpers::IsoToSolrFormat::FACET_TEMPORAL_DURATION,
|
108
|
-
multivalue: true
|
109
|
-
}
|
110
|
-
}
|
111
|
-
end
|
112
|
-
end
|