search_solr_tools 3.1.2 → 3.1.3.pre2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/bin/search_solr_tools +1 -0
- data/lib/search_solr_tools/config/environments.yaml +1 -0
- data/lib/search_solr_tools/harvesters/r2r.rb +61 -0
- data/lib/search_solr_tools/helpers/iso_namespaces.rb +17 -17
- data/lib/search_solr_tools/helpers/r2r_format.rb +25 -0
- data/lib/search_solr_tools/helpers/selectors.rb +1 -0
- data/lib/search_solr_tools/helpers/solr_format.rb +5 -4
- data/lib/search_solr_tools/selectors/r2r.rb +113 -0
- data/lib/search_solr_tools/version.rb +1 -1
- metadata +7 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 74bf9594b189bbd05ae04fffaff8a431ee8ad73e
|
4
|
+
data.tar.gz: 6739088539526ce22e95c1e5e4de024f02509917
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 01d720a93d61862c78b5494c71b3ed7f09c11ebe4304963b9a8d911643c5f78a8772123babb1ae5316e841be60819190c11c23296b5ecd473f7fb8e0839d6d03
|
7
|
+
data.tar.gz: d0d4909dd34e3cbe940da7aad635fcc6657fc16ddc15a8a64727b79bac15648542abe95d1900597bdfaf5776c7bcf3d57c693338710cb67d50b82ae88330eb40
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
[![Build Status](https://travis-ci.org/nsidc/search-solr-tools.svg?branch=master)](https://travis-ci.org/nsidc/search-solr-tools)
|
1
|
+
[![Gem Version](https://badge.fury.io/rb/search_solr_tools.svg)](http://badge.fury.io/rb/search_solr_tools) [![Build Status](https://travis-ci.org/nsidc/search-solr-tools.svg?branch=master)](https://travis-ci.org/nsidc/search-solr-tools)
|
2
2
|
|
3
3
|
# NSIDC Search Solr Tools
|
4
4
|
|
data/bin/search_solr_tools
CHANGED
@@ -66,6 +66,7 @@ class SolrHarvestCLI < Thor
|
|
66
66
|
'ices' => SearchSolrTools::Harvesters::Ices,
|
67
67
|
'nmi' => SearchSolrTools::Harvesters::Nmi,
|
68
68
|
'nodc' => SearchSolrTools::Harvesters::Nodc,
|
69
|
+
'r2r' => SearchSolrTools::Harvesters::R2R,
|
69
70
|
'rda' => SearchSolrTools::Harvesters::Rda,
|
70
71
|
'usgs' => SearchSolrTools::Harvesters::Usgs,
|
71
72
|
'tdar' => SearchSolrTools::Harvesters::Tdar,
|
@@ -29,6 +29,7 @@
|
|
29
29
|
- http://data.eol.ucar.edu/jedi/catalog/ucar.ncar.eol.project.BARROW.thredds.xml
|
30
30
|
- http://data.eol.ucar.edu/jedi/catalog/ucar.ncar.eol.project.DBO.thredds.xml
|
31
31
|
- http://data.eol.ucar.edu/jedi/catalog/ucar.ncar.eol.project.ITEX.thredds.xml
|
32
|
+
:r2r_url: http://get.rvdata.us/services/cruise/
|
32
33
|
|
33
34
|
:local:
|
34
35
|
:host: localhost
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'rest-client'
|
3
|
+
|
4
|
+
require_relative 'base'
|
5
|
+
|
6
|
+
module SearchSolrTools
|
7
|
+
module Harvesters
|
8
|
+
class R2R < Base
|
9
|
+
def initialize(env = 'development', die_on_failure = false)
|
10
|
+
super
|
11
|
+
@data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:R2R][:long_name]
|
12
|
+
@translator = Helpers::IsoToSolr.new :r2r
|
13
|
+
@metadata_url = SolrEnvironments[@environment][:r2r_url]
|
14
|
+
end
|
15
|
+
|
16
|
+
def harvest_and_delete
|
17
|
+
puts "Running #{self.class.name} at #{@metadata_url}"
|
18
|
+
super(method(:harvest), %(data_centers:"#{@data_centers}"))
|
19
|
+
end
|
20
|
+
|
21
|
+
# rubocop: disable MethodLength
|
22
|
+
# rubocop: disable AbcSize
|
23
|
+
def harvest
|
24
|
+
# first fetch list of available records at http://get.rvdata.us/services/cruise/
|
25
|
+
# then loop through each one of those, using the root <gmi:MI_Metadata> tag
|
26
|
+
puts "Getting list of records from #{@data_centers}"
|
27
|
+
RestClient.get(@metadata_url) do |resp, _req, _result, &_block|
|
28
|
+
unless resp.code == 200
|
29
|
+
puts "Got code #{resp.code} from #{@metadata_url}, skipping R2R harvest."
|
30
|
+
next
|
31
|
+
end
|
32
|
+
|
33
|
+
doc = Nokogiri::HTML(resp.body)
|
34
|
+
|
35
|
+
urls = doc.xpath('//a').map do |node|
|
36
|
+
"#{@metadata_url}#{node.attr('href')}"
|
37
|
+
end
|
38
|
+
|
39
|
+
urls.each_slice(50) do |url_subset|
|
40
|
+
# each result is a Nokogiri doc with root element
|
41
|
+
# <gmi:MI_Metadata>
|
42
|
+
results = url_subset.map do |url|
|
43
|
+
get_results(url, '//gmi:MI_Metadata').first
|
44
|
+
end
|
45
|
+
|
46
|
+
begin
|
47
|
+
translated = results.map do |e|
|
48
|
+
create_new_solr_add_doc_with_child(@translator.translate(e).root)
|
49
|
+
end
|
50
|
+
|
51
|
+
insert_solr_docs(translated)
|
52
|
+
rescue => e
|
53
|
+
puts "ERROR: #{e}"
|
54
|
+
raise e if @die_on_failure
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
@@ -7,23 +7,23 @@ module SearchSolrTools
|
|
7
7
|
end
|
8
8
|
|
9
9
|
ISO_NAMESPACES = {
|
10
|
-
'
|
11
|
-
'
|
12
|
-
'
|
13
|
-
'
|
14
|
-
'
|
15
|
-
'
|
16
|
-
'
|
17
|
-
'
|
18
|
-
'
|
19
|
-
'
|
20
|
-
'
|
21
|
-
'
|
22
|
-
'
|
23
|
-
'
|
24
|
-
'
|
25
|
-
'
|
26
|
-
'
|
10
|
+
'atom' => 'http://www.w3.org/2005/Atom',
|
11
|
+
'csw' => 'http://www.opengis.net/cat/csw/2.0.2',
|
12
|
+
'dc' => 'http://purl.org/dc/elements/1.1/',
|
13
|
+
'dif' => 'http://gcmd.gsfc.nasa.gov/Aboutus/xml/dif/',
|
14
|
+
'gco' => 'http://www.isotc211.org/2005/gco',
|
15
|
+
'georss' => 'http://www.georss.org/georss',
|
16
|
+
'gmd' => 'http://www.isotc211.org/2005/gmd',
|
17
|
+
'gmi' => 'http://www.isotc211.org/2005/gmi',
|
18
|
+
'gml' => 'http://www.opengis.net/gml/3.2',
|
19
|
+
'gmx' => 'http://www.isotc211.org/2005/gmx',
|
20
|
+
'gsr' => 'http://www.isotc211.org/2005/gsr',
|
21
|
+
'gss' => 'http://www.isotc211.org/2005/gss',
|
22
|
+
'gts' => 'http://www.isotc211.org/2005/gts',
|
23
|
+
'oai' => 'http://www.openarchives.org/OAI/2.0/',
|
24
|
+
'srv' => 'http://www.isotc211.org/2005/srv',
|
25
|
+
'xlink' => 'http://www.w3.org/1999/xlink',
|
26
|
+
'xsi' => 'http://www.w3.org/2001/XMLSchema-instance'
|
27
27
|
}
|
28
28
|
end
|
29
29
|
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require_relative './iso_namespaces'
|
2
|
+
require_relative './iso_to_solr_format'
|
3
|
+
require_relative './solr_format'
|
4
|
+
|
5
|
+
module SearchSolrTools
|
6
|
+
module Helpers
|
7
|
+
class R2RFormat < IsoToSolrFormat
|
8
|
+
TEMPORAL_INDEX_STRING = proc { |node| R2RFormat.temporal_index_str(node) }
|
9
|
+
TEMPORAL_DISPLAY_STRING = proc { |node| R2RFormat.temporal_display_str(node) }
|
10
|
+
TEMPORAL_DURATION = proc { |node| R2RFormat.get_temporal_duration(node) }
|
11
|
+
FACET_TEMPORAL_DURATION = proc { |node| R2RFormat.get_temporal_duration_facet(node) }
|
12
|
+
|
13
|
+
def self.date_range(temporal_node, _formatted = false)
|
14
|
+
xpath_start = './/gmd:temporalElement/gmd:EX_SpatialTemporalExtent/gmd:extent/'\
|
15
|
+
'gml:TimeInstant[@gml:id="start"]/gml:timePosition'
|
16
|
+
xpath_end = xpath_start.gsub('start', 'end')
|
17
|
+
|
18
|
+
{
|
19
|
+
start: temporal_node.xpath(xpath_start).text,
|
20
|
+
end: temporal_node.xpath(xpath_end).text
|
21
|
+
}
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -9,18 +9,19 @@ module SearchSolrTools
|
|
9
9
|
# rubocop:disable Metrics/ModuleLength
|
10
10
|
module SolrFormat
|
11
11
|
DATA_CENTER_NAMES = {
|
12
|
-
|
12
|
+
BCODMO: { short_name: 'BCO-DMO', long_name: 'Biological and Chemical Oceanography Data Management Office' },
|
13
13
|
CISL: { short_name: 'ACADIS Gateway', long_name: 'Advanced Cooperative Arctic Data and Information Service' },
|
14
14
|
ECHO: { short_name: 'NASA ECHO', long_name: 'NASA Earth Observing System (EOS) Clearing House (ECHO)' },
|
15
15
|
EOL: { short_name: 'UCAR NCAR EOL', long_name: 'UCAR NCAR - Earth Observing Laboratory' },
|
16
16
|
ICES: { short_name: 'ICES', long_name: 'International Council for the Exploration of the Sea' },
|
17
17
|
NMI: { short_name: 'Met.no', long_name: 'Norwegian Meteorological Institute' },
|
18
18
|
NODC: { short_name: 'NOAA NODC', long_name: 'NOAA National Oceanographic Data Center' },
|
19
|
+
NSIDC: { short_name: 'NSIDC', long_name: 'National Snow and Ice Data Center' },
|
20
|
+
PDC: { short_name: 'PDC', long_name: 'Polar Data Catalogue' },
|
21
|
+
R2R: { short_name: 'R2R', long_name: 'Rolling Deck to Repository' },
|
19
22
|
RDA: { short_name: 'UCAR NCAR RDA', long_name: 'UCAR NCAR Research Data Archive' },
|
20
|
-
USGS: { short_name: 'USGS ScienceBase', long_name: 'U.S. Geological Survey ScienceBase' },
|
21
|
-
BCODMO: { short_name: 'BCO-DMO', long_name: 'Biological and Chemical Oceanography Data Management Office' },
|
22
23
|
TDAR: { short_name: 'tDAR', long_name: 'tDAR: The Digital Archaeological Record' },
|
23
|
-
|
24
|
+
USGS: { short_name: 'USGS ScienceBase', long_name: 'U.S. Geological Survey ScienceBase' }
|
24
25
|
}
|
25
26
|
|
26
27
|
NOT_SPECIFIED = 'Not specified'
|
@@ -0,0 +1,113 @@
|
|
1
|
+
require 'search_solr_tools'
|
2
|
+
|
3
|
+
module SearchSolrTools
|
4
|
+
module Selectors
|
5
|
+
# The hash contains keys that should map to the fields in the solr schema,
|
6
|
+
# the keys are called selectors and are in charge of selecting the nodes
|
7
|
+
# from the ISO document, applying the default value if none of the xpaths
|
8
|
+
# resolved to a value and formatting the field. xpaths and multivalue are
|
9
|
+
# required, default_value, format, and reduce are optional.
|
10
|
+
#
|
11
|
+
# reduce takes the formatted result of multiple nodes and produces a single
|
12
|
+
# result. This is for fields that are not multivalued, but their value
|
13
|
+
# should consider information from all the nodes (for example, storing
|
14
|
+
# only the maximum duration from multiple temporal coverage fields, taking
|
15
|
+
# the sum of multiple spatial areas)
|
16
|
+
R2R = {
|
17
|
+
authoritative_id: {
|
18
|
+
xpaths: ['.//gmd:fileIdentifier/gco:CharacterString'],
|
19
|
+
multivalue: false
|
20
|
+
},
|
21
|
+
title: {
|
22
|
+
xpaths: ['.//gmd:identificationInfo/gmd:MD_DataIdentification/gmd:citation/gmd:CI_Citation/gmd:title/gmx:Anchor'],
|
23
|
+
multivalue: false
|
24
|
+
},
|
25
|
+
summary: {
|
26
|
+
xpaths: ['.//gmd:identificationInfo/gmd:MD_DataIdentification/gmd:abstract/gco:CharacterString'],
|
27
|
+
multivalue: false
|
28
|
+
},
|
29
|
+
data_centers: {
|
30
|
+
xpaths: [''],
|
31
|
+
default_values: [Helpers::SolrFormat::DATA_CENTER_NAMES[:R2R][:long_name]],
|
32
|
+
multivalue: false
|
33
|
+
},
|
34
|
+
authors: {
|
35
|
+
xpaths: [".//gmd:CI_ResponsibleParty[./gmd:role/gmd:CI_RoleCode[@codeListValue='contributor']]/gmd:individualName/gmx:Anchor"],
|
36
|
+
multivalue: true
|
37
|
+
},
|
38
|
+
keywords: {
|
39
|
+
xpaths: ['.//gmd:descriptiveKeywords/gmd:MD_Keywords/gmd:keyword/gco:CharacterString',
|
40
|
+
'.//gmd:descriptiveKeywords/gmd:MD_Keywords/gmd:keyword/gmx:Anchor'],
|
41
|
+
multivalue: true
|
42
|
+
},
|
43
|
+
last_revision_date: {
|
44
|
+
xpaths: ['.//gmd:dateStamp/gco:Date', './/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:citation/gmd:CI_Citation/gmd:date/gmd:CI_Date/gmd:date/gco:DateTime'],
|
45
|
+
default_values: [Helpers::SolrFormat.date_str(DateTime.now)], # formats the date into ISO8601 as in http://lucene.apache.org/solr/4_4_0/solr-core/org/apache/solr/schema/DateField.html
|
46
|
+
multivalue: false,
|
47
|
+
format: Helpers::SolrFormat::DATE
|
48
|
+
},
|
49
|
+
dataset_url: {
|
50
|
+
xpaths: ['.//gmd:identificationInfo/gmd:MD_DataIdentification/gmd:citation/gmd:CI_Citation/gmd:title/gmx:Anchor/@xlink:href'],
|
51
|
+
multivalue: false,
|
52
|
+
format: Helpers::IsoToSolrFormat::DATASET_URL
|
53
|
+
},
|
54
|
+
spatial_coverages: {
|
55
|
+
xpaths: ['.//gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:geographicElement/gmd:EX_GeographicBoundingBox'],
|
56
|
+
multivalue: true,
|
57
|
+
format: Helpers::IsoToSolrFormat::SPATIAL_DISPLAY
|
58
|
+
},
|
59
|
+
spatial: {
|
60
|
+
xpaths: ['.//gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:geographicElement/gmd:EX_GeographicBoundingBox'],
|
61
|
+
multivalue: true,
|
62
|
+
format: Helpers::IsoToSolrFormat::SPATIAL_INDEX
|
63
|
+
},
|
64
|
+
spatial_area: {
|
65
|
+
xpaths: ['.//gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:geographicElement/gmd:EX_GeographicBoundingBox'],
|
66
|
+
multivalue: false,
|
67
|
+
reduce: Helpers::IsoToSolrFormat::MAX_SPATIAL_AREA,
|
68
|
+
format: Helpers::IsoToSolrFormat::SPATIAL_AREA
|
69
|
+
},
|
70
|
+
temporal_coverages: {
|
71
|
+
xpaths: ['.//gmd:EX_Extent[@id="temporalExtent"]'],
|
72
|
+
multivalue: false,
|
73
|
+
format: Helpers::R2RFormat::TEMPORAL_DISPLAY_STRING
|
74
|
+
},
|
75
|
+
temporal_duration: {
|
76
|
+
xpaths: ['.//gmd:EX_Extent[@id="temporalExtent"]'],
|
77
|
+
multivalue: false,
|
78
|
+
reduce: Helpers::SolrFormat::REDUCE_TEMPORAL_DURATION,
|
79
|
+
format: Helpers::R2RFormat::TEMPORAL_DURATION
|
80
|
+
},
|
81
|
+
temporal: {
|
82
|
+
xpaths: ['.//gmd:EX_Extent[@id="temporalExtent"]'],
|
83
|
+
multivalue: false,
|
84
|
+
format: Helpers::R2RFormat::TEMPORAL_INDEX_STRING
|
85
|
+
},
|
86
|
+
sensors: {
|
87
|
+
xpaths: ['.//gmi:acquisitionInformation/gmi:MI_AcquisitionInformation/gmi:instrument/gmi:MI_Instrument/gmi:type/gmx:Anchor'],
|
88
|
+
multivalue: true
|
89
|
+
},
|
90
|
+
source: {
|
91
|
+
xpaths: [''],
|
92
|
+
default_values: ['ADE'],
|
93
|
+
multivalue: false
|
94
|
+
},
|
95
|
+
facet_data_center: {
|
96
|
+
xpaths: [''],
|
97
|
+
default_values: ["#{Helpers::SolrFormat::DATA_CENTER_NAMES[:R2R][:long_name]} | #{Helpers::SolrFormat::DATA_CENTER_NAMES[:R2R][:short_name]}"],
|
98
|
+
multivalue: false
|
99
|
+
},
|
100
|
+
facet_spatial_scope: {
|
101
|
+
xpaths: ['.//gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:geographicElement/gmd:EX_GeographicBoundingBox'],
|
102
|
+
multivalue: true,
|
103
|
+
format: Helpers::IsoToSolrFormat::FACET_SPATIAL_SCOPE
|
104
|
+
},
|
105
|
+
facet_temporal_duration: {
|
106
|
+
xpaths: ['.//gmd:EX_Extent[@id="temporalExtent"]'],
|
107
|
+
default_values: [Helpers::SolrFormat::NOT_SPECIFIED],
|
108
|
+
format: Helpers::R2RFormat::FACET_TEMPORAL_DURATION,
|
109
|
+
multivalue: true
|
110
|
+
}
|
111
|
+
}
|
112
|
+
end
|
113
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: search_solr_tools
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.1.
|
4
|
+
version: 3.1.3.pre2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Chalstrom
|
@@ -12,7 +12,7 @@ authors:
|
|
12
12
|
autorequire:
|
13
13
|
bindir: bin
|
14
14
|
cert_chain: []
|
15
|
-
date: 2015-
|
15
|
+
date: 2015-07-01 00:00:00.000000000 Z
|
16
16
|
dependencies:
|
17
17
|
- !ruby/object:Gem::Dependency
|
18
18
|
name: iso8601
|
@@ -288,6 +288,7 @@ files:
|
|
288
288
|
- lib/search_solr_tools/harvesters/nsidc_json.rb
|
289
289
|
- lib/search_solr_tools/harvesters/oai.rb
|
290
290
|
- lib/search_solr_tools/harvesters/pdc.rb
|
291
|
+
- lib/search_solr_tools/harvesters/r2r.rb
|
291
292
|
- lib/search_solr_tools/harvesters/rda.rb
|
292
293
|
- lib/search_solr_tools/harvesters/tdar.rb
|
293
294
|
- lib/search_solr_tools/harvesters/usgs.rb
|
@@ -298,6 +299,7 @@ files:
|
|
298
299
|
- lib/search_solr_tools/helpers/iso_to_solr.rb
|
299
300
|
- lib/search_solr_tools/helpers/iso_to_solr_format.rb
|
300
301
|
- lib/search_solr_tools/helpers/query_builder.rb
|
302
|
+
- lib/search_solr_tools/helpers/r2r_format.rb
|
301
303
|
- lib/search_solr_tools/helpers/selectors.rb
|
302
304
|
- lib/search_solr_tools/helpers/solr_format.rb
|
303
305
|
- lib/search_solr_tools/helpers/tdar_format.rb
|
@@ -310,6 +312,7 @@ files:
|
|
310
312
|
- lib/search_solr_tools/selectors/nmi.rb
|
311
313
|
- lib/search_solr_tools/selectors/nodc_iso.rb
|
312
314
|
- lib/search_solr_tools/selectors/pdc_iso.rb
|
315
|
+
- lib/search_solr_tools/selectors/r2r.rb
|
313
316
|
- lib/search_solr_tools/selectors/rda.rb
|
314
317
|
- lib/search_solr_tools/selectors/tdar_opensearch.rb
|
315
318
|
- lib/search_solr_tools/selectors/usgs_iso.rb
|
@@ -333,9 +336,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
333
336
|
version: '2.0'
|
334
337
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
335
338
|
requirements:
|
336
|
-
- - "
|
339
|
+
- - ">"
|
337
340
|
- !ruby/object:Gem::Version
|
338
|
-
version:
|
341
|
+
version: 1.3.1
|
339
342
|
requirements: []
|
340
343
|
rubyforge_project:
|
341
344
|
rubygems_version: 2.4.8
|