search_solr_tools 3.7.1 → 3.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +65 -4
- data/bin/search_solr_tools +4 -6
- data/lib/search_solr_tools/config/environments.yaml +4 -4
- data/lib/search_solr_tools/harvesters/adc.rb +47 -0
- data/lib/search_solr_tools/harvesters/base.rb +1 -1
- data/lib/search_solr_tools/harvesters/bcodmo.rb +7 -1
- data/lib/search_solr_tools/harvesters/echo.rb +4 -4
- data/lib/search_solr_tools/harvesters/eol.rb +1 -1
- data/lib/search_solr_tools/harvesters/gtnp.rb +1 -1
- data/lib/search_solr_tools/harvesters/ices.rb +3 -3
- data/lib/search_solr_tools/harvesters/nodc.rb +3 -3
- data/lib/search_solr_tools/harvesters/nsidc_json.rb +1 -1
- data/lib/search_solr_tools/harvesters/oai.rb +3 -3
- data/lib/search_solr_tools/harvesters/tdar.rb +19 -7
- data/lib/search_solr_tools/harvesters/usgs.rb +3 -3
- data/lib/search_solr_tools/helpers/selectors.rb +1 -1
- data/lib/search_solr_tools/helpers/solr_format.rb +2 -2
- data/lib/search_solr_tools/selectors/adc.rb +95 -0
- data/lib/search_solr_tools/translators/bcodmo_json.rb +23 -6
- data/lib/search_solr_tools/translators/gtnp_json.rb +0 -1
- data/lib/search_solr_tools/version.rb +1 -1
- data/search_solr_tools.gemspec +1 -1
- metadata +7 -7
- data/lib/search_solr_tools/harvesters/cisl.rb +0 -65
- data/lib/search_solr_tools/selectors/cisl.rb +0 -112
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2c338059d0ec2049e415517a2e6b9b89df18cdd1
|
4
|
+
data.tar.gz: 3cb8c2c60e269ec9b66c305730522b7da3ed8243
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5334498934992c587a414e3672f98130b4125d7013035a1d35cbd335eecf248a5438696255da3f456fd51b6f7fef48b3e79ecd199a5af9200cc06c4aca7f563d
|
7
|
+
data.tar.gz: 303c74385390406d79ff379efbc44a22405427d512d8e369a685f851cc50a77d54dc07229eb105529059e06f3b5f4fb4b7a70e1902a3817f9ccad424c2a9d71d
|
data/CHANGELOG.md
CHANGED
@@ -1,24 +1,85 @@
|
|
1
|
-
## v3.
|
1
|
+
## v3.8.0 (2017-03-28)
|
2
|
+
|
3
|
+
Changes
|
4
|
+
|
5
|
+
- Change ECHO harvester to harvest 100 records at a time, rather than 1000 to
|
6
|
+
avoid timeout/hanging issues with the large requests.
|
7
|
+
- Change "CISL"/ACADIS Gateway harvester to "NSF Arctic Data Center";
|
8
|
+
aoncadis.org redirects to another site, and the data center's name was
|
9
|
+
changed. The feed format was also changed; the harvester was updated to
|
10
|
+
consume the new feed.
|
11
|
+
|
12
|
+
Bugfixes
|
13
|
+
|
14
|
+
- Update NODC feed URL to use https.
|
15
|
+
- Update RDA feed URL to use https.
|
16
|
+
- Update handling of geometries to match new format provided by BCO-DMO feed.
|
17
|
+
- Update NMI feed URL; the feed was relocated.
|
18
|
+
- Harvesting tDAR starts from record 0 instead of record 1.
|
19
|
+
- tDAR harvester no longer attempts to obtain another page of records after
|
20
|
+
all the records have been harvested; where other feeds return an empty
|
21
|
+
response that our harvester handles without issue, tDAR throws an error if
|
22
|
+
the "startRecord" parameter is higher than their last record.
|
23
|
+
- Exit with a non-0 status when a problem with the whole feed is encountered,
|
24
|
+
even if `--die-on-failure` is not passed. That flag should only cause
|
25
|
+
failures when there are issues with individual records; we don't want
|
26
|
+
harvesting to stop due to a metadata issue with a small number of
|
27
|
+
records.
|
28
|
+
- Include BCO-DMO URL in the harvester output the same way all the other URLs
|
29
|
+
are displayed.
|
30
|
+
|
31
|
+
## v3.7.1 (2016-05-18)
|
32
|
+
|
33
|
+
- RuboCop fixes.
|
34
|
+
|
35
|
+
## v3.7.0 (2016-05-18)
|
2
36
|
|
3
37
|
New Features
|
4
38
|
|
39
|
+
- Add sponsored programs to NSIDC harvesting.
|
5
40
|
- Add support for ingesting Data Access Links from NSIDC JSON
|
6
41
|
|
42
|
+
Bugfixes
|
43
|
+
|
44
|
+
- Fix dependency issue with gem "listen".
|
45
|
+
- Fix bad configuration for OAI feed URLs.
|
46
|
+
|
47
|
+
## v3.5.1 (2016-02-15)
|
48
|
+
|
49
|
+
Bugfixes
|
50
|
+
|
51
|
+
- Add temporal duration facet for GTN-P data center.
|
52
|
+
|
53
|
+
## v3.5.0 (2016-02-11)
|
54
|
+
|
55
|
+
Changes
|
56
|
+
|
57
|
+
- Update long name for GTN-P data center.
|
58
|
+
|
59
|
+
## v3.4.0 (2016-02-11)
|
60
|
+
|
61
|
+
New Features
|
62
|
+
|
63
|
+
- Add harvester for GTN-P.
|
64
|
+
|
65
|
+
## v3.3.4 (2016-02-08)
|
66
|
+
|
67
|
+
See v3.4.0.
|
7
68
|
|
8
|
-
## v3.3.
|
69
|
+
## v3.3.3 (2016-01-14)
|
9
70
|
|
10
71
|
Bugfix
|
11
72
|
|
12
73
|
- Added quote handling to the CISL resumption token offset parsing
|
13
74
|
|
14
|
-
## v3.3.1
|
75
|
+
## v3.3.1 (2015-09-25)
|
15
76
|
|
16
77
|
Bugfix
|
17
78
|
|
18
79
|
- Remove strange facet string for temporal duration from NOAA Paleo search
|
19
80
|
results.
|
20
81
|
|
21
|
-
## v3.3.0
|
82
|
+
## v3.3.0 (2015-09-24)
|
22
83
|
|
23
84
|
New Features
|
24
85
|
|
data/bin/search_solr_tools
CHANGED
@@ -19,11 +19,11 @@ class SolrHarvestCLI < Thor
|
|
19
19
|
puts target
|
20
20
|
begin
|
21
21
|
harvest_class = get_harvester_class(target)
|
22
|
-
harvester = harvest_class.new
|
22
|
+
harvester = harvest_class.new(options[:environment], die_on_failure)
|
23
23
|
harvester.harvest_and_delete
|
24
24
|
rescue => e
|
25
25
|
puts "harvest failed for #{target}: #{e.message}"
|
26
|
-
raise e
|
26
|
+
raise e
|
27
27
|
end
|
28
28
|
end
|
29
29
|
end
|
@@ -62,11 +62,10 @@ class SolrHarvestCLI < Thor
|
|
62
62
|
end
|
63
63
|
|
64
64
|
no_tasks do
|
65
|
-
# rubocop: disable MethodLength
|
66
65
|
def harvester_map
|
67
66
|
{
|
68
67
|
'bco_dmo' => SearchSolrTools::Harvesters::BcoDmo,
|
69
|
-
'
|
68
|
+
'adc' => SearchSolrTools::Harvesters::Adc,
|
70
69
|
'data_one' => SearchSolrTools::Harvesters::DataOne,
|
71
70
|
'echo' => SearchSolrTools::Harvesters::Echo,
|
72
71
|
'eol' => SearchSolrTools::Harvesters::Eol,
|
@@ -85,11 +84,10 @@ class SolrHarvestCLI < Thor
|
|
85
84
|
'ade_auto_suggest' => SearchSolrTools::Harvesters::AdeAutoSuggest
|
86
85
|
}
|
87
86
|
end
|
88
|
-
# rubocop: enable MethodLength
|
89
87
|
|
90
88
|
def get_harvester_class(data_center_name)
|
91
89
|
name = data_center_name.downcase.to_s
|
92
|
-
|
90
|
+
fail("Invalid data center #{name}") unless harvester_map.key?(name)
|
93
91
|
|
94
92
|
harvester_map[name]
|
95
93
|
end
|
@@ -4,7 +4,7 @@
|
|
4
4
|
:collection_path: solr
|
5
5
|
:port: 8983
|
6
6
|
:bcodmo_url: http://www.bco-dmo.org/nsidc/arctic-deployments.json
|
7
|
-
:
|
7
|
+
:adc_url: https://arcticdata.io/metacat/d1/mn/v2/query/solr/
|
8
8
|
:data_one_url: https://cn.dataone.org/cn/v1/query/solr/select?q=northBoundCoord:%5B45.0%20TO%2090.0%5D
|
9
9
|
:echo_url: https://api.echo.nasa.gov/catalog-rest/echo_catalog/datasets.echo10
|
10
10
|
:gtnp:
|
@@ -12,10 +12,10 @@
|
|
12
12
|
- http://www.gtnpdatabase.org/rest/activelayers/json
|
13
13
|
:ices_url: http://geo.ices.dk/geonetwork/srv/en/csw
|
14
14
|
:ncdc_paleo_url: http://gis.ncdc.noaa.gov/gptpaleo/csw
|
15
|
-
:nmi_url: http://
|
16
|
-
:nodc_url:
|
15
|
+
:nmi_url: http://arcticdata.met.no/metamod/oai
|
16
|
+
:nodc_url: https://data.nodc.noaa.gov/geoportal/csw
|
17
17
|
:pdc_url: http://www.polardata.ca/oai/provider
|
18
|
-
:rda_url:
|
18
|
+
:rda_url: https://rda.ucar.edu/cgi-bin/oai
|
19
19
|
:tdar_url: http://core.tdar.org/search/rss
|
20
20
|
:usgs_url: https://www.sciencebase.gov/catalog/item/527cf4ede4b0850ea05182ee/csw
|
21
21
|
:eol:
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module SearchSolrTools
|
2
|
+
module Harvesters
|
3
|
+
class Adc < Base
|
4
|
+
def initialize(env = 'development', die_on_failure = false)
|
5
|
+
super
|
6
|
+
@page_size = 250
|
7
|
+
@translator = Helpers::IsoToSolr.new :adc
|
8
|
+
end
|
9
|
+
|
10
|
+
def harvest_and_delete
|
11
|
+
puts "Running harvest of adc catalog from #{metadata_url}"
|
12
|
+
super(method(:harvest_adc_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:ADC][:long_name]}\"")
|
13
|
+
end
|
14
|
+
|
15
|
+
def harvest_adc_into_solr
|
16
|
+
start = 0
|
17
|
+
while (entries = get_results_from_adc(start)) && (entries.length > 0)
|
18
|
+
begin
|
19
|
+
insert_solr_docs(get_docs_with_translated_entries_from_adc(entries))
|
20
|
+
rescue => e
|
21
|
+
puts "ERROR: #{e}\n\n"
|
22
|
+
raise e if @die_on_failure
|
23
|
+
end
|
24
|
+
start += @page_size
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
def get_results_from_adc(start)
|
29
|
+
get_results(build_request(start, @page_size), './response/result/doc')
|
30
|
+
end
|
31
|
+
|
32
|
+
def metadata_url
|
33
|
+
SolrEnvironments[@environment][:adc_url]
|
34
|
+
end
|
35
|
+
|
36
|
+
def get_docs_with_translated_entries_from_adc(entries)
|
37
|
+
entries.map do |e|
|
38
|
+
create_new_solr_add_doc_with_child(@translator.translate(e).root)
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
def build_request(start = 0, max_records = 100)
|
43
|
+
"#{metadata_url}?q=*:*&start=#{start}&rows=#{max_records}"
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
@@ -81,7 +81,7 @@ module SearchSolrTools
|
|
81
81
|
end
|
82
82
|
puts "#{success} document#{success == 1 ? '' : 's'} successfully added to Solr."
|
83
83
|
puts "#{failure} document#{failure == 1 ? '' : 's'} not added to Solr."
|
84
|
-
|
84
|
+
fail 'Some documents failed to be inserted into Solr' if failure > 0
|
85
85
|
end
|
86
86
|
|
87
87
|
def insert_solr_doc(doc, content_type = XML_CONTENT_TYPE, core = SolrEnvironments[@environment][:collection_name])
|
@@ -12,13 +12,18 @@ module SearchSolrTools
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def harvest_and_delete
|
15
|
+
puts "Running harvest of BCO-DMO catalog from #{bcodmo_url}"
|
15
16
|
super(method(:harvest_bcodmo_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:BCODMO][:long_name]}\"")
|
16
17
|
end
|
17
18
|
|
19
|
+
def bcodmo_url
|
20
|
+
SolrEnvironments[@environment][:bcodmo_url]
|
21
|
+
end
|
22
|
+
|
18
23
|
def harvest_bcodmo_into_solr
|
19
24
|
result = translate_bcodmo
|
20
25
|
insert_solr_docs result[:add_docs], Base::JSON_CONTENT_TYPE
|
21
|
-
|
26
|
+
fail 'Failed to harvest some records from the provider' if result[:failure_ids].length > 0
|
22
27
|
end
|
23
28
|
|
24
29
|
def translate_bcodmo
|
@@ -34,6 +39,7 @@ module SearchSolrTools
|
|
34
39
|
end
|
35
40
|
|
36
41
|
def request_json(url)
|
42
|
+
puts "Request: #{url}"
|
37
43
|
JSON.parse(RestClient.get(url))
|
38
44
|
end
|
39
45
|
|
@@ -4,7 +4,7 @@ module SearchSolrTools
|
|
4
4
|
class Echo < Base
|
5
5
|
def initialize(env = 'development', die_on_failure = false)
|
6
6
|
super env, die_on_failure
|
7
|
-
@page_size =
|
7
|
+
@page_size = 100
|
8
8
|
@translator = Helpers::IsoToSolr.new :echo
|
9
9
|
end
|
10
10
|
|
@@ -37,9 +37,9 @@ module SearchSolrTools
|
|
37
37
|
end
|
38
38
|
|
39
39
|
def get_docs_with_translated_entries_from_echo(entries)
|
40
|
-
|
41
|
-
|
42
|
-
|
40
|
+
entries.map do |entry|
|
41
|
+
create_new_solr_add_doc_with_child(@translator.translate(entry).root)
|
42
|
+
end
|
43
43
|
end
|
44
44
|
|
45
45
|
def build_request(max_records = '25', page_num = '1')
|
@@ -22,7 +22,7 @@ module SearchSolrTools
|
|
22
22
|
doc = open_xml_document(dataset)
|
23
23
|
if doc.xpath('//xmlns:metadata').size > 1
|
24
24
|
# THREDDS allows for a dataset of datasets, EOL should not utilize this
|
25
|
-
|
25
|
+
fail "Complex dataset encountered at #{doc.xpath('//xmlns:catalog').to_html}"
|
26
26
|
end
|
27
27
|
metadata_doc = open_xml_document(doc.xpath('//xmlns:metadata')[0]['xlink:href'])
|
28
28
|
{ 'add' => { 'doc' => @translator.translate(doc, metadata_doc) } }
|
@@ -28,7 +28,7 @@ module SearchSolrTools
|
|
28
28
|
def harvest_gtnp_into_solr
|
29
29
|
result = translate_gtnp
|
30
30
|
insert_solr_docs result[:add_docs], Base::JSON_CONTENT_TYPE
|
31
|
-
|
31
|
+
fail 'Failed to harvest some records from the provider' if result[:failure_ids].length > 0
|
32
32
|
end
|
33
33
|
|
34
34
|
def translate_gtnp
|
@@ -37,9 +37,9 @@ module SearchSolrTools
|
|
37
37
|
end
|
38
38
|
|
39
39
|
def get_docs_with_translated_entries_from_ices(entries)
|
40
|
-
|
41
|
-
|
42
|
-
|
40
|
+
entries.map do |entry|
|
41
|
+
create_new_solr_add_doc_with_child(@translator.translate(entry).root)
|
42
|
+
end
|
43
43
|
end
|
44
44
|
|
45
45
|
def build_csw_request(resultType = 'results', maxRecords = '25', startPosition = '1')
|
@@ -37,9 +37,9 @@ module SearchSolrTools
|
|
37
37
|
end
|
38
38
|
|
39
39
|
def get_docs_with_translated_entries_from_nodc(entries)
|
40
|
-
|
41
|
-
|
42
|
-
|
40
|
+
entries.map do |entry|
|
41
|
+
create_new_solr_add_doc_with_child(@translator.translate(entry).root)
|
42
|
+
end
|
43
43
|
end
|
44
44
|
|
45
45
|
def build_csw_request(resultType = 'results', maxRecords = '25', startPosition = '1')
|
@@ -23,7 +23,7 @@ module SearchSolrTools
|
|
23
23
|
def harvest_nsidc_json_into_solr
|
24
24
|
result = docs_with_translated_entries_from_nsidc
|
25
25
|
insert_solr_docs result[:add_docs], Base::JSON_CONTENT_TYPE
|
26
|
-
|
26
|
+
fail 'Failed to harvest and insert some authoritative IDs' if result[:failure_ids].length > 0
|
27
27
|
end
|
28
28
|
|
29
29
|
def nsidc_json_url
|
@@ -34,11 +34,11 @@ module SearchSolrTools
|
|
34
34
|
end
|
35
35
|
|
36
36
|
def results
|
37
|
-
|
37
|
+
fail NotImplementedError
|
38
38
|
end
|
39
39
|
|
40
40
|
def metadata_url
|
41
|
-
|
41
|
+
fail NotImplementedError
|
42
42
|
end
|
43
43
|
|
44
44
|
def translated_docs(entries)
|
@@ -48,7 +48,7 @@ module SearchSolrTools
|
|
48
48
|
private
|
49
49
|
|
50
50
|
def request_params
|
51
|
-
|
51
|
+
fail NotImplementedError
|
52
52
|
end
|
53
53
|
|
54
54
|
def request_string
|
@@ -14,14 +14,22 @@ module SearchSolrTools
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def harvest_tdar_into_solr
|
17
|
-
start_record =
|
17
|
+
start_record = 0
|
18
|
+
total_harvested = 0
|
19
|
+
total_expected = total_results
|
18
20
|
while (entries = get_results_from_tdar(start_record)) && (entries.length > 0)
|
19
21
|
begin
|
20
|
-
insert_solr_docs
|
22
|
+
insert_solr_docs(get_docs_with_translated_entries_from_tdar(entries))
|
21
23
|
rescue => e
|
22
24
|
puts "ERROR: #{e}\n\n"
|
23
25
|
raise e if @die_on_failure
|
24
26
|
end
|
27
|
+
|
28
|
+
# if we have all the records we expect, don't attempt another request;
|
29
|
+
# it would result in an error
|
30
|
+
total_harvested += entries.length
|
31
|
+
break if total_harvested >= total_expected
|
32
|
+
|
25
33
|
start_record += @page_size
|
26
34
|
end
|
27
35
|
end
|
@@ -31,16 +39,16 @@ module SearchSolrTools
|
|
31
39
|
end
|
32
40
|
|
33
41
|
def get_results_from_tdar(start_record)
|
34
|
-
get_results
|
42
|
+
get_results(build_request(@page_size, start_record), './/atom:entry', 'application/xml')
|
35
43
|
end
|
36
44
|
|
37
45
|
def get_docs_with_translated_entries_from_tdar(entries)
|
38
|
-
|
39
|
-
|
40
|
-
|
46
|
+
entries.map do |entry|
|
47
|
+
create_new_solr_add_doc_with_child(@translator.translate(entry).root)
|
48
|
+
end
|
41
49
|
end
|
42
50
|
|
43
|
-
def build_request(max_records = '25', start_record = '
|
51
|
+
def build_request(max_records = '25', start_record = '0')
|
44
52
|
request_url = tdar_url + '?_tDAR.searchType=ACADIS_RSS&'\
|
45
53
|
'resourceTypes=DATASET&'\
|
46
54
|
'groups[0].latitudeLongitudeBoxes[0].maximumLongitude=180&'\
|
@@ -52,6 +60,10 @@ module SearchSolrTools
|
|
52
60
|
|
53
61
|
request_url
|
54
62
|
end
|
63
|
+
|
64
|
+
def total_results
|
65
|
+
get_results(build_request(0, 0), './/opensearch:totalResults').text.to_i
|
66
|
+
end
|
55
67
|
end
|
56
68
|
end
|
57
69
|
end
|
@@ -37,9 +37,9 @@ module SearchSolrTools
|
|
37
37
|
end
|
38
38
|
|
39
39
|
def get_docs_with_translated_entries_from_usgs(entries)
|
40
|
-
|
41
|
-
|
42
|
-
|
40
|
+
entries.map do |entry|
|
41
|
+
create_new_solr_add_doc_with_child(@translator.translate(entry).root)
|
42
|
+
end
|
43
43
|
end
|
44
44
|
|
45
45
|
def build_csw_request(resultType = 'results', maxRecords = '25', startPosition = '1')
|
@@ -6,7 +6,7 @@ module SearchSolrTools
|
|
6
6
|
# This hash grabs all the selector files inside the selectors directory,
|
7
7
|
# to add a new source we need to create a selector file and add it to this hash.
|
8
8
|
SELECTORS = {
|
9
|
-
|
9
|
+
adc: Selectors::ADC,
|
10
10
|
data_one: Selectors::DATA_ONE,
|
11
11
|
echo: Selectors::ECHO,
|
12
12
|
ices: Selectors::ICES,
|
@@ -10,7 +10,7 @@ module SearchSolrTools
|
|
10
10
|
module SolrFormat
|
11
11
|
DATA_CENTER_NAMES = {
|
12
12
|
BCODMO: { short_name: 'BCO-DMO', long_name: 'Biological and Chemical Oceanography Data Management Office' },
|
13
|
-
|
13
|
+
ADC: { short_name: 'NSF ADC', long_name: 'NSF Arctic Data Center' },
|
14
14
|
DATA_ONE: { short_name: 'DataONE', long_name: 'DataONE' },
|
15
15
|
ECHO: { short_name: 'NASA ECHO', long_name: 'NASA Earth Observing System (EOS) Clearing House (ECHO)' },
|
16
16
|
EOL: { short_name: 'UCAR NCAR EOL', long_name: 'UCAR NCAR - Earth Observing Laboratory' },
|
@@ -134,7 +134,7 @@ module SearchSolrTools
|
|
134
134
|
j = send(find_index_method, resolution['max_resolution'])
|
135
135
|
return resolution_values[i..j]
|
136
136
|
end
|
137
|
-
|
137
|
+
fail "Invalid resolution #{resolution['type']}"
|
138
138
|
end
|
139
139
|
|
140
140
|
def self.resolution_not_specified?(resolution)
|
@@ -0,0 +1,95 @@
|
|
1
|
+
require 'search_solr_tools'
|
2
|
+
|
3
|
+
module SearchSolrTools
|
4
|
+
module Selectors
|
5
|
+
ADC = {
|
6
|
+
authoritative_id: {
|
7
|
+
xpaths: ['.//str[@name="id"]'],
|
8
|
+
multivalue: false
|
9
|
+
},
|
10
|
+
title: {
|
11
|
+
xpaths: ['.//str[@name="title"]'],
|
12
|
+
multivalue: false
|
13
|
+
},
|
14
|
+
summary: {
|
15
|
+
xpaths: ['.//str[@name="abstract"]'],
|
16
|
+
multivalue: false
|
17
|
+
},
|
18
|
+
data_centers: {
|
19
|
+
xpaths: [''],
|
20
|
+
default_values: [Helpers::SolrFormat::DATA_CENTER_NAMES[:ADC][:long_name]],
|
21
|
+
multivalue: false
|
22
|
+
},
|
23
|
+
authors: {
|
24
|
+
xpaths: ['.//str[@name="author"]'],
|
25
|
+
multivalue: false
|
26
|
+
},
|
27
|
+
keywords: {
|
28
|
+
xpaths: ['.//arr[@name="keywords"]/str'],
|
29
|
+
multivalue: true
|
30
|
+
},
|
31
|
+
last_revision_date: {
|
32
|
+
xpaths: ['.//date[@name="updateDate"]'],
|
33
|
+
default_values: [Helpers::SolrFormat.date_str(DateTime.now)], # formats the date into ISO8601 as in http://lucene.apache.org/solr/4_4_0/solr-core/org/apache/solr/schema/DateField.html
|
34
|
+
multivalue: false,
|
35
|
+
format: Helpers::SolrFormat::DATE
|
36
|
+
},
|
37
|
+
dataset_url: {
|
38
|
+
xpaths: ['.//str[@name="dataUrl"]'],
|
39
|
+
default_values: [''],
|
40
|
+
multivalue: false
|
41
|
+
},
|
42
|
+
spatial_coverages: {
|
43
|
+
xpaths: ['.'],
|
44
|
+
multivalue: false,
|
45
|
+
format: Helpers::DataOneFormat.method(:spatial_display)
|
46
|
+
},
|
47
|
+
spatial: {
|
48
|
+
xpaths: ['.'],
|
49
|
+
multivalue: false,
|
50
|
+
format: Helpers::DataOneFormat.method(:spatial_index)
|
51
|
+
},
|
52
|
+
spatial_area: {
|
53
|
+
xpaths: ['.'],
|
54
|
+
multivalue: false,
|
55
|
+
format: Helpers::DataOneFormat.method(:spatial_area)
|
56
|
+
},
|
57
|
+
temporal_coverages: {
|
58
|
+
xpaths: ['.'],
|
59
|
+
multivalue: false,
|
60
|
+
format: Helpers::DataOneFormat.method(:temporal_coverage)
|
61
|
+
},
|
62
|
+
temporal_duration: {
|
63
|
+
xpaths: ['.'],
|
64
|
+
multivalue: false,
|
65
|
+
format: Helpers::DataOneFormat.method(:temporal_duration)
|
66
|
+
},
|
67
|
+
temporal: {
|
68
|
+
xpaths: ['.'],
|
69
|
+
multivalue: false,
|
70
|
+
format: Helpers::DataOneFormat.method(:temporal_index_string)
|
71
|
+
},
|
72
|
+
source: {
|
73
|
+
xpaths: [''],
|
74
|
+
default_values: ['ADE'],
|
75
|
+
multivalue: false
|
76
|
+
},
|
77
|
+
facet_data_center: {
|
78
|
+
xpaths: [''],
|
79
|
+
default_values: ["#{Helpers::SolrFormat::DATA_CENTER_NAMES[:ADC][:long_name]} | #{Helpers::SolrFormat::DATA_CENTER_NAMES[:ADC][:short_name]}"],
|
80
|
+
multivalue: false
|
81
|
+
},
|
82
|
+
facet_spatial_scope: {
|
83
|
+
xpaths: ['.'],
|
84
|
+
multivalue: false,
|
85
|
+
format: Helpers::DataOneFormat.method(:facet_spatial_scope)
|
86
|
+
},
|
87
|
+
facet_temporal_duration: {
|
88
|
+
xpaths: ['.'],
|
89
|
+
default_values: [Helpers::SolrFormat::NOT_SPECIFIED],
|
90
|
+
format: Helpers::DataOneFormat.method(:facet_temporal_duration),
|
91
|
+
multivalue: false
|
92
|
+
}
|
93
|
+
}
|
94
|
+
end
|
95
|
+
end
|
@@ -50,12 +50,29 @@ module SearchSolrTools
|
|
50
50
|
end
|
51
51
|
|
52
52
|
def translate_geometry(wkt_geom)
|
53
|
-
wkt_geom['geometry']
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
53
|
+
if wkt_geom['geometry']['type'] == 'LineString'
|
54
|
+
wkt_geom['geometry']['type'] = 'MultiPoint'
|
55
|
+
end
|
56
|
+
geometry = RGeo::GeoJSON.decode(wkt_geom).geometry
|
57
|
+
geometry = RGeo::Feature.cast(geometry, RGeo::Feature::MultiPoint)
|
58
|
+
|
59
|
+
# This feed sometimes returns MultiLineString but wrongly calls them 'LineString'
|
60
|
+
# If the above fails, we assume this is why. If the feed gets fixed, this code
|
61
|
+
# should still handle that.
|
62
|
+
if geometry.nil? || geometry.num_geometries == 0
|
63
|
+
# Try to decode as an actual MultiLineString.
|
64
|
+
wkt_geom['geometry']['type'] = 'MultiLineString'
|
65
|
+
geometry = RGeo::GeoJSON.decode(wkt_geom).geometry
|
66
|
+
|
67
|
+
# Convert to a MultiPoint, for passing into the helper functions below.
|
68
|
+
coords = geometry.coordinates.flatten
|
69
|
+
coords = coords.each_slice(2).to_a
|
70
|
+
f = RGeo::Geos.factory
|
71
|
+
points = []
|
72
|
+
coords.each { |x, y| points << f.point(x, y) }
|
73
|
+
geometry = f.multi_point(points)
|
74
|
+
end
|
75
|
+
|
59
76
|
{
|
60
77
|
spatial_display: Helpers::TranslateSpatialCoverage.geojson_to_spatial_display_str([geometry]),
|
61
78
|
spatial_index: Helpers::TranslateSpatialCoverage.geojson_to_spatial_index_str([geometry]),
|
@@ -8,7 +8,6 @@ module SearchSolrTools
|
|
8
8
|
module Translators
|
9
9
|
# Translates GTN-P json to solr json format
|
10
10
|
class GtnpJsonToSolr
|
11
|
-
# rubocop:disable Metrics/MethodLength
|
12
11
|
# rubocop:disable AbcSize
|
13
12
|
def translate(json_doc, json_record)
|
14
13
|
json_geo = json_doc['geo'].nil? ? json_doc['coordinates'] : json_doc['geo']['coordinates']
|
data/search_solr_tools.gemspec
CHANGED
@@ -40,7 +40,7 @@ Gem::Specification.new do |spec|
|
|
40
40
|
spec.add_development_dependency 'guard-rubocop', '~> 1.2'
|
41
41
|
spec.add_development_dependency 'rake', '~> 10.4'
|
42
42
|
spec.add_development_dependency 'rspec', '~> 3.2'
|
43
|
-
spec.add_development_dependency 'rubocop', '~> 0.32'
|
43
|
+
spec.add_development_dependency 'rubocop', '~> 0.32.1'
|
44
44
|
spec.add_development_dependency 'webmock', '~> 1.13'
|
45
45
|
spec.add_development_dependency 'listen', '3.0.5'
|
46
46
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: search_solr_tools
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.
|
4
|
+
version: 3.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Chalstrom
|
@@ -12,7 +12,7 @@ authors:
|
|
12
12
|
autorequire:
|
13
13
|
bindir: bin
|
14
14
|
cert_chain: []
|
15
|
-
date:
|
15
|
+
date: 2017-03-28 00:00:00.000000000 Z
|
16
16
|
dependencies:
|
17
17
|
- !ruby/object:Gem::Dependency
|
18
18
|
name: iso8601
|
@@ -230,14 +230,14 @@ dependencies:
|
|
230
230
|
requirements:
|
231
231
|
- - "~>"
|
232
232
|
- !ruby/object:Gem::Version
|
233
|
-
version:
|
233
|
+
version: 0.32.1
|
234
234
|
type: :development
|
235
235
|
prerelease: false
|
236
236
|
version_requirements: !ruby/object:Gem::Requirement
|
237
237
|
requirements:
|
238
238
|
- - "~>"
|
239
239
|
- !ruby/object:Gem::Version
|
240
|
-
version:
|
240
|
+
version: 0.32.1
|
241
241
|
- !ruby/object:Gem::Dependency
|
242
242
|
name: webmock
|
243
243
|
requirement: !ruby/object:Gem::Requirement
|
@@ -288,11 +288,11 @@ files:
|
|
288
288
|
- lib/search_solr_tools.rb
|
289
289
|
- lib/search_solr_tools/config/environments.rb
|
290
290
|
- lib/search_solr_tools/config/environments.yaml
|
291
|
+
- lib/search_solr_tools/harvesters/adc.rb
|
291
292
|
- lib/search_solr_tools/harvesters/ade_auto_suggest.rb
|
292
293
|
- lib/search_solr_tools/harvesters/auto_suggest.rb
|
293
294
|
- lib/search_solr_tools/harvesters/base.rb
|
294
295
|
- lib/search_solr_tools/harvesters/bcodmo.rb
|
295
|
-
- lib/search_solr_tools/harvesters/cisl.rb
|
296
296
|
- lib/search_solr_tools/harvesters/data_one.rb
|
297
297
|
- lib/search_solr_tools/harvesters/echo.rb
|
298
298
|
- lib/search_solr_tools/harvesters/eol.rb
|
@@ -325,7 +325,7 @@ files:
|
|
325
325
|
- lib/search_solr_tools/helpers/translate_spatial_coverage.rb
|
326
326
|
- lib/search_solr_tools/helpers/translate_temporal_coverage.rb
|
327
327
|
- lib/search_solr_tools/helpers/usgs_format.rb
|
328
|
-
- lib/search_solr_tools/selectors/
|
328
|
+
- lib/search_solr_tools/selectors/adc.rb
|
329
329
|
- lib/search_solr_tools/selectors/data_one.rb
|
330
330
|
- lib/search_solr_tools/selectors/echo_iso.rb
|
331
331
|
- lib/search_solr_tools/selectors/ices_iso.rb
|
@@ -363,7 +363,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
363
363
|
version: '0'
|
364
364
|
requirements: []
|
365
365
|
rubyforge_project:
|
366
|
-
rubygems_version: 2.4.
|
366
|
+
rubygems_version: 2.4.8
|
367
367
|
signing_key:
|
368
368
|
specification_version: 4
|
369
369
|
summary: Tools to harvest and manage various scientific dataset feeds in a Solr instance.
|
@@ -1,65 +0,0 @@
|
|
1
|
-
|
2
|
-
module SearchSolrTools
|
3
|
-
module Harvesters
|
4
|
-
# Harvests data from CISL and inserts it into Solr after it has been translated
|
5
|
-
class Cisl < Oai
|
6
|
-
def initialize(env = 'development', die_on_failure = false)
|
7
|
-
super
|
8
|
-
@data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:CISL][:long_name]
|
9
|
-
@translator = Helpers::IsoToSolr.new :cisl
|
10
|
-
|
11
|
-
# Used in query string params, resumptionToken
|
12
|
-
@dataset = '0bdd2d39-3493-4fa2-98f9-6766596bdc50'
|
13
|
-
end
|
14
|
-
|
15
|
-
def metadata_url
|
16
|
-
SolrEnvironments[@environment][:cisl_url]
|
17
|
-
end
|
18
|
-
|
19
|
-
def results
|
20
|
-
list_records_oai_response = get_results(request_string, '//oai:ListRecords', '')
|
21
|
-
|
22
|
-
@resumption_token = list_records_oai_response.xpath('.//oai:resumptionToken', Helpers::IsoNamespaces.namespaces)
|
23
|
-
@resumption_token = format_resumption_token(@resumption_token.first.text)
|
24
|
-
|
25
|
-
list_records_oai_response.xpath('.//oai:record', Helpers::IsoNamespaces.namespaces)
|
26
|
-
end
|
27
|
-
|
28
|
-
private
|
29
|
-
|
30
|
-
def request_params
|
31
|
-
{
|
32
|
-
verb: 'ListRecords',
|
33
|
-
metadataPrefix: 'dif',
|
34
|
-
set: @dataset,
|
35
|
-
resumptionToken: @resumption_token
|
36
|
-
}.delete_if { |_k, v| v.nil? }
|
37
|
-
end
|
38
|
-
|
39
|
-
# The ruby response is lacking quotes, which the token requires in order to work...
|
40
|
-
# Also, the response back seems to be inconsistent - sometimes it adds &quot; instead of '"',
|
41
|
-
# which makes the token fail to work.
|
42
|
-
# To get around this I'd prefer to make assumptions about the token and let it break if
|
43
|
-
# they change the formatting. For now, all fields other than offset should be able to be
|
44
|
-
# assumed to remain constant.
|
45
|
-
# glewis 2016-01-15: It broke, offset has quotes around it, so I updated the regex to account for
|
46
|
-
# the possibility, including '"' or '&quot;'
|
47
|
-
# If the input is empty, then we are done - return an empty string, which is checked for
|
48
|
-
# in the harvest loop.
|
49
|
-
def format_resumption_token(resumption_token)
|
50
|
-
return '' if resumption_token.empty?
|
51
|
-
|
52
|
-
resumption_token =~ /offset(?:"|")?:(\d+)/
|
53
|
-
offset = Regexp.last_match(1)
|
54
|
-
|
55
|
-
{
|
56
|
-
from: nil,
|
57
|
-
until: nil,
|
58
|
-
set: @dataset,
|
59
|
-
metadataPrefix: 'dif',
|
60
|
-
offset: offset
|
61
|
-
}.to_json
|
62
|
-
end
|
63
|
-
end
|
64
|
-
end
|
65
|
-
end
|
@@ -1,112 +0,0 @@
|
|
1
|
-
require 'search_solr_tools'
|
2
|
-
|
3
|
-
module SearchSolrTools
|
4
|
-
module Selectors
|
5
|
-
# The hash contains keys that should map to the fields in the solr schema,
|
6
|
-
# the keys are called selectors and are in charge of selecting the nodes
|
7
|
-
# from the ISO document, applying the default value if none of the xpaths
|
8
|
-
# resolved to a value and formatting the field. xpaths and multivalue are
|
9
|
-
# required, default_value, format, and reduce are optional.
|
10
|
-
#
|
11
|
-
# reduce takes the formatted result of multiple nodes and produces a single
|
12
|
-
# result. This is for fields that are not multivalued, but their value
|
13
|
-
# should consider information from all the nodes (for example, storing
|
14
|
-
# only the maximum duration from multiple temporal coverage fields, taking
|
15
|
-
# the sum of multiple spatial areas)
|
16
|
-
CISL = {
|
17
|
-
authoritative_id: {
|
18
|
-
xpaths: ['.//oai:header/oai:identifier'],
|
19
|
-
multivalue: false
|
20
|
-
},
|
21
|
-
title: {
|
22
|
-
xpaths: ['.//dif:Entry_Title'],
|
23
|
-
multivalue: false
|
24
|
-
},
|
25
|
-
summary: {
|
26
|
-
xpaths: ['.//dif:Summary/dif:Abstract'],
|
27
|
-
multivalue: false
|
28
|
-
},
|
29
|
-
data_centers: {
|
30
|
-
xpaths: [''],
|
31
|
-
default_values: [SearchSolrTools::Helpers::SolrFormat::DATA_CENTER_NAMES[:CISL][:long_name]],
|
32
|
-
multivalue: false
|
33
|
-
},
|
34
|
-
authors: {
|
35
|
-
xpaths: [''],
|
36
|
-
multivalue: true
|
37
|
-
},
|
38
|
-
keywords: {
|
39
|
-
xpaths: [
|
40
|
-
'.//dif:Parameters/dif:Category',
|
41
|
-
'.//dif:Parameters/dif:Topic',
|
42
|
-
'.//dif:Parameters/dif:Term',
|
43
|
-
'.//dif:Parameters/dif:Variable_Level_1'
|
44
|
-
].reverse,
|
45
|
-
multivalue: true
|
46
|
-
},
|
47
|
-
last_revision_date: {
|
48
|
-
xpaths: ['.//dif:Last_DIF_Revision_Date'],
|
49
|
-
default_values: [SearchSolrTools::Helpers::SolrFormat.date_str(DateTime.now)], # formats the date into ISO8601 as in http://lucene.apache.org/solr/4_4_0/solr-core/org/apache/solr/schema/DateField.html
|
50
|
-
multivalue: false,
|
51
|
-
format: SearchSolrTools::Helpers::SolrFormat::DATE
|
52
|
-
},
|
53
|
-
dataset_url: {
|
54
|
-
xpaths: ['.//dif:Related_URL/dif:URL'],
|
55
|
-
multivalue: false
|
56
|
-
},
|
57
|
-
spatial_coverages: {
|
58
|
-
xpaths: ['.//dif:Spatial_Coverage'],
|
59
|
-
multivalue: true,
|
60
|
-
format: SearchSolrTools::Helpers::IsoToSolrFormat::SPATIAL_DISPLAY
|
61
|
-
},
|
62
|
-
spatial: {
|
63
|
-
xpaths: ['.//dif:Spatial_Coverage'],
|
64
|
-
multivalue: true,
|
65
|
-
format: SearchSolrTools::Helpers::IsoToSolrFormat::SPATIAL_INDEX
|
66
|
-
},
|
67
|
-
spatial_area: {
|
68
|
-
xpaths: ['.//dif:Spatial_Coverage'],
|
69
|
-
multivalue: false,
|
70
|
-
reduce: SearchSolrTools::Helpers::IsoToSolrFormat::MAX_SPATIAL_AREA,
|
71
|
-
format: SearchSolrTools::Helpers::IsoToSolrFormat::SPATIAL_AREA
|
72
|
-
},
|
73
|
-
temporal: {
|
74
|
-
xpaths: ['.//dif:Temporal_Coverage'],
|
75
|
-
multivalue: true,
|
76
|
-
format: SearchSolrTools::Helpers::IsoToSolrFormat::TEMPORAL_INDEX_STRING
|
77
|
-
},
|
78
|
-
temporal_coverages: {
|
79
|
-
xpaths: ['.//dif:Temporal_Coverage'],
|
80
|
-
multivalue: true,
|
81
|
-
format: SearchSolrTools::Helpers::IsoToSolrFormat::TEMPORAL_DISPLAY_STRING
|
82
|
-
},
|
83
|
-
temporal_duration: {
|
84
|
-
xpaths: ['.//dif:Temporal_Coverage'],
|
85
|
-
multivalue: false,
|
86
|
-
reduce: SearchSolrTools::Helpers::SolrFormat::REDUCE_TEMPORAL_DURATION,
|
87
|
-
format: SearchSolrTools::Helpers::IsoToSolrFormat::TEMPORAL_DURATION
|
88
|
-
},
|
89
|
-
source: {
|
90
|
-
xpaths: [''],
|
91
|
-
default_values: ['ADE'],
|
92
|
-
multivalue: false
|
93
|
-
},
|
94
|
-
facet_data_center: {
|
95
|
-
xpaths: [''],
|
96
|
-
default_values: ["#{SearchSolrTools::Helpers::SolrFormat::DATA_CENTER_NAMES[:CISL][:long_name]} | #{Helpers::SolrFormat::DATA_CENTER_NAMES[:CISL][:short_name]}"],
|
97
|
-
multivalue: false
|
98
|
-
},
|
99
|
-
facet_spatial_scope: {
|
100
|
-
xpaths: ['.//dif:Spatial_Coverage'],
|
101
|
-
multivalue: true,
|
102
|
-
format: SearchSolrTools::Helpers::IsoToSolrFormat::FACET_SPATIAL_SCOPE
|
103
|
-
},
|
104
|
-
facet_temporal_duration: {
|
105
|
-
xpaths: ['.//dif:Temporal_Coverage'],
|
106
|
-
default_values: [SearchSolrTools::Helpers::SolrFormat::NOT_SPECIFIED],
|
107
|
-
format: SearchSolrTools::Helpers::IsoToSolrFormat::FACET_TEMPORAL_DURATION,
|
108
|
-
multivalue: true
|
109
|
-
}
|
110
|
-
}
|
111
|
-
end
|
112
|
-
end
|