search_solr_tools 3.1.2 → 3.1.3.pre2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/bin/search_solr_tools +1 -0
- data/lib/search_solr_tools/config/environments.yaml +1 -0
- data/lib/search_solr_tools/harvesters/r2r.rb +61 -0
- data/lib/search_solr_tools/helpers/iso_namespaces.rb +17 -17
- data/lib/search_solr_tools/helpers/r2r_format.rb +25 -0
- data/lib/search_solr_tools/helpers/selectors.rb +1 -0
- data/lib/search_solr_tools/helpers/solr_format.rb +5 -4
- data/lib/search_solr_tools/selectors/r2r.rb +113 -0
- data/lib/search_solr_tools/version.rb +1 -1
- metadata +7 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 74bf9594b189bbd05ae04fffaff8a431ee8ad73e
|
4
|
+
data.tar.gz: 6739088539526ce22e95c1e5e4de024f02509917
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 01d720a93d61862c78b5494c71b3ed7f09c11ebe4304963b9a8d911643c5f78a8772123babb1ae5316e841be60819190c11c23296b5ecd473f7fb8e0839d6d03
|
7
|
+
data.tar.gz: d0d4909dd34e3cbe940da7aad635fcc6657fc16ddc15a8a64727b79bac15648542abe95d1900597bdfaf5776c7bcf3d57c693338710cb67d50b82ae88330eb40
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
[](https://travis-ci.org/nsidc/search-solr-tools)
|
1
|
+
[](http://badge.fury.io/rb/search_solr_tools) [](https://travis-ci.org/nsidc/search-solr-tools)
|
2
2
|
|
3
3
|
# NSIDC Search Solr Tools
|
4
4
|
|
data/bin/search_solr_tools
CHANGED
@@ -66,6 +66,7 @@ class SolrHarvestCLI < Thor
|
|
66
66
|
'ices' => SearchSolrTools::Harvesters::Ices,
|
67
67
|
'nmi' => SearchSolrTools::Harvesters::Nmi,
|
68
68
|
'nodc' => SearchSolrTools::Harvesters::Nodc,
|
69
|
+
'r2r' => SearchSolrTools::Harvesters::R2R,
|
69
70
|
'rda' => SearchSolrTools::Harvesters::Rda,
|
70
71
|
'usgs' => SearchSolrTools::Harvesters::Usgs,
|
71
72
|
'tdar' => SearchSolrTools::Harvesters::Tdar,
|
@@ -29,6 +29,7 @@
|
|
29
29
|
- http://data.eol.ucar.edu/jedi/catalog/ucar.ncar.eol.project.BARROW.thredds.xml
|
30
30
|
- http://data.eol.ucar.edu/jedi/catalog/ucar.ncar.eol.project.DBO.thredds.xml
|
31
31
|
- http://data.eol.ucar.edu/jedi/catalog/ucar.ncar.eol.project.ITEX.thredds.xml
|
32
|
+
:r2r_url: http://get.rvdata.us/services/cruise/
|
32
33
|
|
33
34
|
:local:
|
34
35
|
:host: localhost
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'rest-client'
|
3
|
+
|
4
|
+
require_relative 'base'
|
5
|
+
|
6
|
+
module SearchSolrTools
|
7
|
+
module Harvesters
|
8
|
+
class R2R < Base
|
9
|
+
def initialize(env = 'development', die_on_failure = false)
|
10
|
+
super
|
11
|
+
@data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:R2R][:long_name]
|
12
|
+
@translator = Helpers::IsoToSolr.new :r2r
|
13
|
+
@metadata_url = SolrEnvironments[@environment][:r2r_url]
|
14
|
+
end
|
15
|
+
|
16
|
+
def harvest_and_delete
|
17
|
+
puts "Running #{self.class.name} at #{@metadata_url}"
|
18
|
+
super(method(:harvest), %(data_centers:"#{@data_centers}"))
|
19
|
+
end
|
20
|
+
|
21
|
+
# rubocop: disable MethodLength
|
22
|
+
# rubocop: disable AbcSize
|
23
|
+
def harvest
|
24
|
+
# first fetch list of available records at http://get.rvdata.us/services/cruise/
|
25
|
+
# then loop through each one of those, using the root <gmi:MI_Metadata> tag
|
26
|
+
puts "Getting list of records from #{@data_centers}"
|
27
|
+
RestClient.get(@metadata_url) do |resp, _req, _result, &_block|
|
28
|
+
unless resp.code == 200
|
29
|
+
puts "Got code #{resp.code} from #{@metadata_url}, skipping R2R harvest."
|
30
|
+
next
|
31
|
+
end
|
32
|
+
|
33
|
+
doc = Nokogiri::HTML(resp.body)
|
34
|
+
|
35
|
+
urls = doc.xpath('//a').map do |node|
|
36
|
+
"#{@metadata_url}#{node.attr('href')}"
|
37
|
+
end
|
38
|
+
|
39
|
+
urls.each_slice(50) do |url_subset|
|
40
|
+
# each result is a nokogirii doc with root element
|
41
|
+
# <gmi:MI_Metadata>
|
42
|
+
results = url_subset.map do |url|
|
43
|
+
get_results(url, '//gmi:MI_Metadata').first
|
44
|
+
end
|
45
|
+
|
46
|
+
begin
|
47
|
+
translated = results.map do |e|
|
48
|
+
create_new_solr_add_doc_with_child(@translator.translate(e).root)
|
49
|
+
end
|
50
|
+
|
51
|
+
insert_solr_docs(translated)
|
52
|
+
rescue => e
|
53
|
+
puts "ERROR: #{e}"
|
54
|
+
raise e if @die_on_failure
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
@@ -7,23 +7,23 @@ module SearchSolrTools
|
|
7
7
|
end
|
8
8
|
|
9
9
|
ISO_NAMESPACES = {
|
10
|
-
'
|
11
|
-
'
|
12
|
-
'
|
13
|
-
'
|
14
|
-
'
|
15
|
-
'
|
16
|
-
'
|
17
|
-
'
|
18
|
-
'
|
19
|
-
'
|
20
|
-
'
|
21
|
-
'
|
22
|
-
'
|
23
|
-
'
|
24
|
-
'
|
25
|
-
'
|
26
|
-
'
|
10
|
+
'atom' => 'http://www.w3.org/2005/Atom',
|
11
|
+
'csw' => 'http://www.opengis.net/cat/csw/2.0.2',
|
12
|
+
'dc' => 'http://purl.org/dc/elements/1.1/',
|
13
|
+
'dif' => 'http://gcmd.gsfc.nasa.gov/Aboutus/xml/dif/',
|
14
|
+
'gco' => 'http://www.isotc211.org/2005/gco',
|
15
|
+
'georss' => 'http://www.georss.org/georss',
|
16
|
+
'gmd' => 'http://www.isotc211.org/2005/gmd',
|
17
|
+
'gmi' => 'http://www.isotc211.org/2005/gmi',
|
18
|
+
'gml' => 'http://www.opengis.net/gml/3.2',
|
19
|
+
'gmx' => 'http://www.isotc211.org/2005/gmx',
|
20
|
+
'gsr' => 'http://www.isotc211.org/2005/gsr',
|
21
|
+
'gss' => 'http://www.isotc211.org/2005/gss',
|
22
|
+
'gts' => 'http://www.isotc211.org/2005/gts',
|
23
|
+
'oai' => 'http://www.openarchives.org/OAI/2.0/',
|
24
|
+
'srv' => 'http://www.isotc211.org/2005/srv',
|
25
|
+
'xlink' => 'http://www.w3.org/1999/xlink',
|
26
|
+
'xsi' => 'http://www.w3.org/2001/XMLSchema-instance'
|
27
27
|
}
|
28
28
|
end
|
29
29
|
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require_relative './iso_namespaces'
|
2
|
+
require_relative './iso_to_solr_format'
|
3
|
+
require_relative './solr_format'
|
4
|
+
|
5
|
+
module SearchSolrTools
|
6
|
+
module Helpers
|
7
|
+
class R2RFormat < IsoToSolrFormat
|
8
|
+
TEMPORAL_INDEX_STRING = proc { |node| R2RFormat.temporal_index_str(node) }
|
9
|
+
TEMPORAL_DISPLAY_STRING = proc { |node| R2RFormat.temporal_display_str(node) }
|
10
|
+
TEMPORAL_DURATION = proc { |node| R2RFormat.get_temporal_duration(node) }
|
11
|
+
FACET_TEMPORAL_DURATION = proc { |node| R2RFormat.get_temporal_duration_facet(node) }
|
12
|
+
|
13
|
+
def self.date_range(temporal_node, _formatted = false)
|
14
|
+
xpath_start = './/gmd:temporalElement/gmd:EX_SpatialTemporalExtent/gmd:extent/'\
|
15
|
+
'gml:TimeInstant[@gml:id="start"]/gml:timePosition'
|
16
|
+
xpath_end = xpath_start.gsub('start', 'end')
|
17
|
+
|
18
|
+
{
|
19
|
+
start: temporal_node.xpath(xpath_start).text,
|
20
|
+
end: temporal_node.xpath(xpath_end).text
|
21
|
+
}
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -9,18 +9,19 @@ module SearchSolrTools
|
|
9
9
|
# rubocop:disable Metrics/ModuleLength
|
10
10
|
module SolrFormat
|
11
11
|
DATA_CENTER_NAMES = {
|
12
|
-
|
12
|
+
BCODMO: { short_name: 'BCO-DMO', long_name: 'Biological and Chemical Oceanography Data Management Office' },
|
13
13
|
CISL: { short_name: 'ACADIS Gateway', long_name: 'Advanced Cooperative Arctic Data and Information Service' },
|
14
14
|
ECHO: { short_name: 'NASA ECHO', long_name: 'NASA Earth Observing System (EOS) Clearing House (ECHO)' },
|
15
15
|
EOL: { short_name: 'UCAR NCAR EOL', long_name: 'UCAR NCAR - Earth Observing Laboratory' },
|
16
16
|
ICES: { short_name: 'ICES', long_name: 'International Council for the Exploration of the Sea' },
|
17
17
|
NMI: { short_name: 'Met.no', long_name: 'Norwegian Meteorological Institute' },
|
18
18
|
NODC: { short_name: 'NOAA NODC', long_name: 'NOAA National Oceanographic Data Center' },
|
19
|
+
NSIDC: { short_name: 'NSIDC', long_name: 'National Snow and Ice Data Center' },
|
20
|
+
PDC: { short_name: 'PDC', long_name: 'Polar Data Catalogue' },
|
21
|
+
R2R: { short_name: 'R2R', long_name: 'Rolling Deck to Repository' },
|
19
22
|
RDA: { short_name: 'UCAR NCAR RDA', long_name: 'UCAR NCAR Research Data Archive' },
|
20
|
-
USGS: { short_name: 'USGS ScienceBase', long_name: 'U.S. Geological Survey ScienceBase' },
|
21
|
-
BCODMO: { short_name: 'BCO-DMO', long_name: 'Biological and Chemical Oceanography Data Management Office' },
|
22
23
|
TDAR: { short_name: 'tDAR', long_name: 'tDAR: The Digital Archaeological Record' },
|
23
|
-
|
24
|
+
USGS: { short_name: 'USGS ScienceBase', long_name: 'U.S. Geological Survey ScienceBase' }
|
24
25
|
}
|
25
26
|
|
26
27
|
NOT_SPECIFIED = 'Not specified'
|
@@ -0,0 +1,113 @@
|
|
1
|
+
require 'search_solr_tools'
|
2
|
+
|
3
|
+
module SearchSolrTools
|
4
|
+
module Selectors
|
5
|
+
# The hash contains keys that should map to the fields in the solr schema,
|
6
|
+
# the keys are called selectors and are in charge of selecting the nodes
|
7
|
+
# from the ISO document, applying the default value if none of the xpaths
|
8
|
+
# resolved to a value and formatting the field. xpaths and multivalue are
|
9
|
+
# required, default_value, format, and reduce are optional.
|
10
|
+
#
|
11
|
+
# reduce takes the formatted result of multiple nodes and produces a single
|
12
|
+
# result. This is for fields that are not multivalued, but their value
|
13
|
+
# should consider information from all the nodes (for example, storing
|
14
|
+
# only the maximum duration from multiple temporal coverage fields, taking
|
15
|
+
# the sum of multiple spatial areas)
|
16
|
+
R2R = {
|
17
|
+
authoritative_id: {
|
18
|
+
xpaths: ['.//gmd:fileIdentifier/gco:CharacterString'],
|
19
|
+
multivalue: false
|
20
|
+
},
|
21
|
+
title: {
|
22
|
+
xpaths: ['.//gmd:identificationInfo/gmd:MD_DataIdentification/gmd:citation/gmd:CI_Citation/gmd:title/gmx:Anchor'],
|
23
|
+
multivalue: false
|
24
|
+
},
|
25
|
+
summary: {
|
26
|
+
xpaths: ['.//gmd:identificationInfo/gmd:MD_DataIdentification/gmd:abstract/gco:CharacterString'],
|
27
|
+
multivalue: false
|
28
|
+
},
|
29
|
+
data_centers: {
|
30
|
+
xpaths: [''],
|
31
|
+
default_values: [Helpers::SolrFormat::DATA_CENTER_NAMES[:R2R][:long_name]],
|
32
|
+
multivalue: false
|
33
|
+
},
|
34
|
+
authors: {
|
35
|
+
xpaths: [".//gmd:CI_ResponsibleParty[./gmd:role/gmd:CI_RoleCode[@codeListValue='contributor']]/gmd:individualName/gmx:Anchor"],
|
36
|
+
multivalue: true
|
37
|
+
},
|
38
|
+
keywords: {
|
39
|
+
xpaths: ['.//gmd:descriptiveKeywords/gmd:MD_Keywords/gmd:keyword/gco:CharacterString',
|
40
|
+
'.//gmd:descriptiveKeywords/gmd:MD_Keywords/gmd:keyword/gmx:Anchor'],
|
41
|
+
multivalue: true
|
42
|
+
},
|
43
|
+
last_revision_date: {
|
44
|
+
xpaths: ['.//gmd:dateStamp/gco:Date', './/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:citation/gmd:CI_Citation/gmd:date/gmd:CI_Date/gmd:date/gco:DateTime'],
|
45
|
+
default_values: [Helpers::SolrFormat.date_str(DateTime.now)], # formats the date into ISO8601 as in http://lucene.apache.org/solr/4_4_0/solr-core/org/apache/solr/schema/DateField.html
|
46
|
+
multivalue: false,
|
47
|
+
format: Helpers::SolrFormat::DATE
|
48
|
+
},
|
49
|
+
dataset_url: {
|
50
|
+
xpaths: ['.//gmd:identificationInfo/gmd:MD_DataIdentification/gmd:citation/gmd:CI_Citation/gmd:title/gmx:Anchor/@xlink:href'],
|
51
|
+
multivalue: false,
|
52
|
+
format: Helpers::IsoToSolrFormat::DATASET_URL
|
53
|
+
},
|
54
|
+
spatial_coverages: {
|
55
|
+
xpaths: ['.//gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:geographicElement/gmd:EX_GeographicBoundingBox'],
|
56
|
+
multivalue: true,
|
57
|
+
format: Helpers::IsoToSolrFormat::SPATIAL_DISPLAY
|
58
|
+
},
|
59
|
+
spatial: {
|
60
|
+
xpaths: ['.//gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:geographicElement/gmd:EX_GeographicBoundingBox'],
|
61
|
+
multivalue: true,
|
62
|
+
format: Helpers::IsoToSolrFormat::SPATIAL_INDEX
|
63
|
+
},
|
64
|
+
spatial_area: {
|
65
|
+
xpaths: ['.//gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:geographicElement/gmd:EX_GeographicBoundingBox'],
|
66
|
+
multivalue: false,
|
67
|
+
reduce: Helpers::IsoToSolrFormat::MAX_SPATIAL_AREA,
|
68
|
+
format: Helpers::IsoToSolrFormat::SPATIAL_AREA
|
69
|
+
},
|
70
|
+
temporal_coverages: {
|
71
|
+
xpaths: ['.//gmd:EX_Extent[@id="temporalExtent"]'],
|
72
|
+
multivalue: false,
|
73
|
+
format: Helpers::R2RFormat::TEMPORAL_DISPLAY_STRING
|
74
|
+
},
|
75
|
+
temporal_duration: {
|
76
|
+
xpaths: ['.//gmd:EX_Extent[@id="temporalExtent"]'],
|
77
|
+
multivalue: false,
|
78
|
+
reduce: Helpers::SolrFormat::REDUCE_TEMPORAL_DURATION,
|
79
|
+
format: Helpers::R2RFormat::TEMPORAL_DURATION
|
80
|
+
},
|
81
|
+
temporal: {
|
82
|
+
xpaths: ['.//gmd:EX_Extent[@id="temporalExtent"]'],
|
83
|
+
multivalue: false,
|
84
|
+
format: Helpers::R2RFormat::TEMPORAL_INDEX_STRING
|
85
|
+
},
|
86
|
+
sensors: {
|
87
|
+
xpaths: ['.//gmi:acquisitionInformation/gmi:MI_AcquisitionInformation/gmi:instrument/gmi:MI_Instrument/gmi:type/gmx:Anchor'],
|
88
|
+
multivalue: true
|
89
|
+
},
|
90
|
+
source: {
|
91
|
+
xpaths: [''],
|
92
|
+
default_values: ['ADE'],
|
93
|
+
multivalue: false
|
94
|
+
},
|
95
|
+
facet_data_center: {
|
96
|
+
xpaths: [''],
|
97
|
+
default_values: ["#{Helpers::SolrFormat::DATA_CENTER_NAMES[:R2R][:long_name]} | #{Helpers::SolrFormat::DATA_CENTER_NAMES[:R2R][:short_name]}"],
|
98
|
+
multivalue: false
|
99
|
+
},
|
100
|
+
facet_spatial_scope: {
|
101
|
+
xpaths: ['.//gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:geographicElement/gmd:EX_GeographicBoundingBox'],
|
102
|
+
multivalue: true,
|
103
|
+
format: Helpers::IsoToSolrFormat::FACET_SPATIAL_SCOPE
|
104
|
+
},
|
105
|
+
facet_temporal_duration: {
|
106
|
+
xpaths: ['.//gmd:EX_Extent[@id="temporalExtent"]'],
|
107
|
+
default_values: [Helpers::SolrFormat::NOT_SPECIFIED],
|
108
|
+
format: Helpers::R2RFormat::FACET_TEMPORAL_DURATION,
|
109
|
+
multivalue: true
|
110
|
+
}
|
111
|
+
}
|
112
|
+
end
|
113
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: search_solr_tools
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.1.
|
4
|
+
version: 3.1.3.pre2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Chalstrom
|
@@ -12,7 +12,7 @@ authors:
|
|
12
12
|
autorequire:
|
13
13
|
bindir: bin
|
14
14
|
cert_chain: []
|
15
|
-
date: 2015-
|
15
|
+
date: 2015-07-01 00:00:00.000000000 Z
|
16
16
|
dependencies:
|
17
17
|
- !ruby/object:Gem::Dependency
|
18
18
|
name: iso8601
|
@@ -288,6 +288,7 @@ files:
|
|
288
288
|
- lib/search_solr_tools/harvesters/nsidc_json.rb
|
289
289
|
- lib/search_solr_tools/harvesters/oai.rb
|
290
290
|
- lib/search_solr_tools/harvesters/pdc.rb
|
291
|
+
- lib/search_solr_tools/harvesters/r2r.rb
|
291
292
|
- lib/search_solr_tools/harvesters/rda.rb
|
292
293
|
- lib/search_solr_tools/harvesters/tdar.rb
|
293
294
|
- lib/search_solr_tools/harvesters/usgs.rb
|
@@ -298,6 +299,7 @@ files:
|
|
298
299
|
- lib/search_solr_tools/helpers/iso_to_solr.rb
|
299
300
|
- lib/search_solr_tools/helpers/iso_to_solr_format.rb
|
300
301
|
- lib/search_solr_tools/helpers/query_builder.rb
|
302
|
+
- lib/search_solr_tools/helpers/r2r_format.rb
|
301
303
|
- lib/search_solr_tools/helpers/selectors.rb
|
302
304
|
- lib/search_solr_tools/helpers/solr_format.rb
|
303
305
|
- lib/search_solr_tools/helpers/tdar_format.rb
|
@@ -310,6 +312,7 @@ files:
|
|
310
312
|
- lib/search_solr_tools/selectors/nmi.rb
|
311
313
|
- lib/search_solr_tools/selectors/nodc_iso.rb
|
312
314
|
- lib/search_solr_tools/selectors/pdc_iso.rb
|
315
|
+
- lib/search_solr_tools/selectors/r2r.rb
|
313
316
|
- lib/search_solr_tools/selectors/rda.rb
|
314
317
|
- lib/search_solr_tools/selectors/tdar_opensearch.rb
|
315
318
|
- lib/search_solr_tools/selectors/usgs_iso.rb
|
@@ -333,9 +336,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
333
336
|
version: '2.0'
|
334
337
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
335
338
|
requirements:
|
336
|
-
- - "
|
339
|
+
- - ">"
|
337
340
|
- !ruby/object:Gem::Version
|
338
|
-
version:
|
341
|
+
version: 1.3.1
|
339
342
|
requirements: []
|
340
343
|
rubyforge_project:
|
341
344
|
rubygems_version: 2.4.8
|