search_solr_tools 3.1.2 → 3.1.3.pre2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1c7f4c152205a4ab6781f37a0c94994c8c38b49d
4
- data.tar.gz: a3d55a95e66ca26f077344798de686ed35c17182
3
+ metadata.gz: 74bf9594b189bbd05ae04fffaff8a431ee8ad73e
4
+ data.tar.gz: 6739088539526ce22e95c1e5e4de024f02509917
5
5
  SHA512:
6
- metadata.gz: ae1e563fc38f3ffed03eda53c1f7a2281757a0c1853ed474191b10323a0558b66c8db2cd9711ec2db327fc892a100860769a1b7d0b46a64f127a7914f5d7ee62
7
- data.tar.gz: b3762ea1d52947abebc6eaff5e5a75bb54509e0f7fbe304eb4a1c91ea3fd73df25c82c9d6112a21a85fcc0c34a81b04b7346865f4c103319a11b462e7d171ed7
6
+ metadata.gz: 01d720a93d61862c78b5494c71b3ed7f09c11ebe4304963b9a8d911643c5f78a8772123babb1ae5316e841be60819190c11c23296b5ecd473f7fb8e0839d6d03
7
+ data.tar.gz: d0d4909dd34e3cbe940da7aad635fcc6657fc16ddc15a8a64727b79bac15648542abe95d1900597bdfaf5776c7bcf3d57c693338710cb67d50b82ae88330eb40
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- [![Build Status](https://travis-ci.org/nsidc/search-solr-tools.svg?branch=master)](https://travis-ci.org/nsidc/search-solr-tools)
1
+ [![Gem Version](https://badge.fury.io/rb/search_solr_tools.svg)](http://badge.fury.io/rb/search_solr_tools) [![Build Status](https://travis-ci.org/nsidc/search-solr-tools.svg?branch=master)](https://travis-ci.org/nsidc/search-solr-tools)
2
2
 
3
3
  # NSIDC Search Solr Tools
4
4
 
@@ -66,6 +66,7 @@ class SolrHarvestCLI < Thor
66
66
  'ices' => SearchSolrTools::Harvesters::Ices,
67
67
  'nmi' => SearchSolrTools::Harvesters::Nmi,
68
68
  'nodc' => SearchSolrTools::Harvesters::Nodc,
69
+ 'r2r' => SearchSolrTools::Harvesters::R2R,
69
70
  'rda' => SearchSolrTools::Harvesters::Rda,
70
71
  'usgs' => SearchSolrTools::Harvesters::Usgs,
71
72
  'tdar' => SearchSolrTools::Harvesters::Tdar,
@@ -29,6 +29,7 @@
29
29
  - http://data.eol.ucar.edu/jedi/catalog/ucar.ncar.eol.project.BARROW.thredds.xml
30
30
  - http://data.eol.ucar.edu/jedi/catalog/ucar.ncar.eol.project.DBO.thredds.xml
31
31
  - http://data.eol.ucar.edu/jedi/catalog/ucar.ncar.eol.project.ITEX.thredds.xml
32
+ :r2r_url: http://get.rvdata.us/services/cruise/
32
33
 
33
34
  :local:
34
35
  :host: localhost
@@ -0,0 +1,61 @@
1
+ require 'nokogiri'
2
+ require 'rest-client'
3
+
4
+ require_relative 'base'
5
+
6
+ module SearchSolrTools
7
+ module Harvesters
8
+ class R2R < Base
9
+ def initialize(env = 'development', die_on_failure = false)
10
+ super
11
+ @data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:R2R][:long_name]
12
+ @translator = Helpers::IsoToSolr.new :r2r
13
+ @metadata_url = SolrEnvironments[@environment][:r2r_url]
14
+ end
15
+
16
+ def harvest_and_delete
17
+ puts "Running #{self.class.name} at #{@metadata_url}"
18
+ super(method(:harvest), %(data_centers:"#{@data_centers}"))
19
+ end
20
+
21
+ # rubocop: disable MethodLength
22
+ # rubocop: disable AbcSize
23
+ def harvest
24
+ # first fetch list of available records at http://get.rvdata.us/services/cruise/
25
+ # then loop through each one of those, using the root <gmi:MI_Metadata> tag
26
+ puts "Getting list of records from #{@data_centers}"
27
+ RestClient.get(@metadata_url) do |resp, _req, _result, &_block|
28
+ unless resp.code == 200
29
+ puts "Got code #{resp.code} from #{@metadata_url}, skipping R2R harvest."
30
+ next
31
+ end
32
+
33
+ doc = Nokogiri::HTML(resp.body)
34
+
35
+ urls = doc.xpath('//a').map do |node|
36
+ "#{@metadata_url}#{node.attr('href')}"
37
+ end
38
+
39
+ urls.each_slice(50) do |url_subset|
40
+ # each result is a Nokogiri doc with root element
41
+ # <gmi:MI_Metadata>
42
+ results = url_subset.map do |url|
43
+ get_results(url, '//gmi:MI_Metadata').first
44
+ end
45
+
46
+ begin
47
+ translated = results.map do |e|
48
+ create_new_solr_add_doc_with_child(@translator.translate(e).root)
49
+ end
50
+
51
+ insert_solr_docs(translated)
52
+ rescue => e
53
+ puts "ERROR: #{e}"
54
+ raise e if @die_on_failure
55
+ end
56
+ end
57
+ end
58
+ end
59
+ end
60
+ end
61
+ end
@@ -7,23 +7,23 @@ module SearchSolrTools
7
7
  end
8
8
 
9
9
  ISO_NAMESPACES = {
10
- 'csw' => 'http://www.opengis.net/cat/csw/2.0.2',
11
- 'gmd' => 'http://www.isotc211.org/2005/gmd',
12
- 'gco' => 'http://www.isotc211.org/2005/gco',
13
- 'gml' => 'http://www.opengis.net/gml/3.2',
14
- 'gmi' => 'http://www.isotc211.org/2005/gmi',
15
- 'gmx' => 'http://www.isotc211.org/2005/gmx',
16
- 'gsr' => 'http://www.isotc211.org/2005/gsr',
17
- 'gss' => 'http://www.isotc211.org/2005/gss',
18
- 'gts' => 'http://www.isotc211.org/2005/gts',
19
- 'srv' => 'http://www.isotc211.org/2005/srv',
20
- 'xlink' => 'http://www.w3.org/1999/xlink',
21
- 'xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
22
- 'oai' => 'http://www.openarchives.org/OAI/2.0/',
23
- 'dif' => 'http://gcmd.gsfc.nasa.gov/Aboutus/xml/dif/',
24
- 'atom' => 'http://www.w3.org/2005/Atom',
25
- 'dc' => 'http://purl.org/dc/elements/1.1/',
26
- 'georss' => 'http://www.georss.org/georss'
10
+ 'atom' => 'http://www.w3.org/2005/Atom',
11
+ 'csw' => 'http://www.opengis.net/cat/csw/2.0.2',
12
+ 'dc' => 'http://purl.org/dc/elements/1.1/',
13
+ 'dif' => 'http://gcmd.gsfc.nasa.gov/Aboutus/xml/dif/',
14
+ 'gco' => 'http://www.isotc211.org/2005/gco',
15
+ 'georss' => 'http://www.georss.org/georss',
16
+ 'gmd' => 'http://www.isotc211.org/2005/gmd',
17
+ 'gmi' => 'http://www.isotc211.org/2005/gmi',
18
+ 'gml' => 'http://www.opengis.net/gml/3.2',
19
+ 'gmx' => 'http://www.isotc211.org/2005/gmx',
20
+ 'gsr' => 'http://www.isotc211.org/2005/gsr',
21
+ 'gss' => 'http://www.isotc211.org/2005/gss',
22
+ 'gts' => 'http://www.isotc211.org/2005/gts',
23
+ 'oai' => 'http://www.openarchives.org/OAI/2.0/',
24
+ 'srv' => 'http://www.isotc211.org/2005/srv',
25
+ 'xlink' => 'http://www.w3.org/1999/xlink',
26
+ 'xsi' => 'http://www.w3.org/2001/XMLSchema-instance'
27
27
  }
28
28
  end
29
29
  end
@@ -0,0 +1,25 @@
1
+ require_relative './iso_namespaces'
2
+ require_relative './iso_to_solr_format'
3
+ require_relative './solr_format'
4
+
5
+ module SearchSolrTools
6
+ module Helpers
7
+ class R2RFormat < IsoToSolrFormat
8
+ TEMPORAL_INDEX_STRING = proc { |node| R2RFormat.temporal_index_str(node) }
9
+ TEMPORAL_DISPLAY_STRING = proc { |node| R2RFormat.temporal_display_str(node) }
10
+ TEMPORAL_DURATION = proc { |node| R2RFormat.get_temporal_duration(node) }
11
+ FACET_TEMPORAL_DURATION = proc { |node| R2RFormat.get_temporal_duration_facet(node) }
12
+
13
+ def self.date_range(temporal_node, _formatted = false)
14
+ xpath_start = './/gmd:temporalElement/gmd:EX_SpatialTemporalExtent/gmd:extent/'\
15
+ 'gml:TimeInstant[@gml:id="start"]/gml:timePosition'
16
+ xpath_end = xpath_start.gsub('start', 'end')
17
+
18
+ {
19
+ start: temporal_node.xpath(xpath_start).text,
20
+ end: temporal_node.xpath(xpath_end).text
21
+ }
22
+ end
23
+ end
24
+ end
25
+ end
@@ -12,6 +12,7 @@ module SearchSolrTools
12
12
  nmi: Selectors::NMI,
13
13
  nodc: Selectors::NODC,
14
14
  pdc: Selectors::PDC,
15
+ r2r: Selectors::R2R,
15
16
  rda: Selectors::RDA,
16
17
  tdar: Selectors::TDAR,
17
18
  usgs: Selectors::USGS
@@ -9,18 +9,19 @@ module SearchSolrTools
9
9
  # rubocop:disable Metrics/ModuleLength
10
10
  module SolrFormat
11
11
  DATA_CENTER_NAMES = {
12
- NSIDC: { short_name: 'NSIDC', long_name: 'National Snow and Ice Data Center' },
12
+ BCODMO: { short_name: 'BCO-DMO', long_name: 'Biological and Chemical Oceanography Data Management Office' },
13
13
  CISL: { short_name: 'ACADIS Gateway', long_name: 'Advanced Cooperative Arctic Data and Information Service' },
14
14
  ECHO: { short_name: 'NASA ECHO', long_name: 'NASA Earth Observing System (EOS) Clearing House (ECHO)' },
15
15
  EOL: { short_name: 'UCAR NCAR EOL', long_name: 'UCAR NCAR - Earth Observing Laboratory' },
16
16
  ICES: { short_name: 'ICES', long_name: 'International Council for the Exploration of the Sea' },
17
17
  NMI: { short_name: 'Met.no', long_name: 'Norwegian Meteorological Institute' },
18
18
  NODC: { short_name: 'NOAA NODC', long_name: 'NOAA National Oceanographic Data Center' },
19
+ NSIDC: { short_name: 'NSIDC', long_name: 'National Snow and Ice Data Center' },
20
+ PDC: { short_name: 'PDC', long_name: 'Polar Data Catalogue' },
21
+ R2R: { short_name: 'R2R', long_name: 'Rolling Deck to Repository' },
19
22
  RDA: { short_name: 'UCAR NCAR RDA', long_name: 'UCAR NCAR Research Data Archive' },
20
- USGS: { short_name: 'USGS ScienceBase', long_name: 'U.S. Geological Survey ScienceBase' },
21
- BCODMO: { short_name: 'BCO-DMO', long_name: 'Biological and Chemical Oceanography Data Management Office' },
22
23
  TDAR: { short_name: 'tDAR', long_name: 'tDAR: The Digital Archaeological Record' },
23
- PDC: { short_name: 'PDC', long_name: 'Polar Data Catalogue' }
24
+ USGS: { short_name: 'USGS ScienceBase', long_name: 'U.S. Geological Survey ScienceBase' }
24
25
  }
25
26
 
26
27
  NOT_SPECIFIED = 'Not specified'
@@ -0,0 +1,113 @@
1
+ require 'search_solr_tools'
2
+
3
+ module SearchSolrTools
4
+ module Selectors
5
+ # The hash contains keys that should map to the fields in the solr schema,
6
+ # the keys are called selectors and are in charge of selecting the nodes
7
+ # from the ISO document, applying the default value if none of the xpaths
8
+ # resolved to a value and formatting the field. xpaths and multivalue are
9
+ # required; default_values, format, and reduce are optional.
10
+ #
11
+ # reduce takes the formatted result of multiple nodes and produces a single
12
+ # result. This is for fields that are not multivalued, but their value
13
+ # should consider information from all the nodes (for example, storing
14
+ # only the maximum duration from multiple temporal coverage fields, taking
15
+ # the sum of multiple spatial areas)
16
+ R2R = {
17
+ authoritative_id: {
18
+ xpaths: ['.//gmd:fileIdentifier/gco:CharacterString'],
19
+ multivalue: false
20
+ },
21
+ title: {
22
+ xpaths: ['.//gmd:identificationInfo/gmd:MD_DataIdentification/gmd:citation/gmd:CI_Citation/gmd:title/gmx:Anchor'],
23
+ multivalue: false
24
+ },
25
+ summary: {
26
+ xpaths: ['.//gmd:identificationInfo/gmd:MD_DataIdentification/gmd:abstract/gco:CharacterString'],
27
+ multivalue: false
28
+ },
29
+ data_centers: {
30
+ xpaths: [''],
31
+ default_values: [Helpers::SolrFormat::DATA_CENTER_NAMES[:R2R][:long_name]],
32
+ multivalue: false
33
+ },
34
+ authors: {
35
+ xpaths: [".//gmd:CI_ResponsibleParty[./gmd:role/gmd:CI_RoleCode[@codeListValue='contributor']]/gmd:individualName/gmx:Anchor"],
36
+ multivalue: true
37
+ },
38
+ keywords: {
39
+ xpaths: ['.//gmd:descriptiveKeywords/gmd:MD_Keywords/gmd:keyword/gco:CharacterString',
40
+ './/gmd:descriptiveKeywords/gmd:MD_Keywords/gmd:keyword/gmx:Anchor'],
41
+ multivalue: true
42
+ },
43
+ last_revision_date: {
44
+ xpaths: ['.//gmd:dateStamp/gco:Date', './/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:citation/gmd:CI_Citation/gmd:date/gmd:CI_Date/gmd:date/gco:DateTime'],
45
+ default_values: [Helpers::SolrFormat.date_str(DateTime.now)], # formats the date into ISO8601 as in http://lucene.apache.org/solr/4_4_0/solr-core/org/apache/solr/schema/DateField.html
46
+ multivalue: false,
47
+ format: Helpers::SolrFormat::DATE
48
+ },
49
+ dataset_url: {
50
+ xpaths: ['.//gmd:identificationInfo/gmd:MD_DataIdentification/gmd:citation/gmd:CI_Citation/gmd:title/gmx:Anchor/@xlink:href'],
51
+ multivalue: false,
52
+ format: Helpers::IsoToSolrFormat::DATASET_URL
53
+ },
54
+ spatial_coverages: {
55
+ xpaths: ['.//gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:geographicElement/gmd:EX_GeographicBoundingBox'],
56
+ multivalue: true,
57
+ format: Helpers::IsoToSolrFormat::SPATIAL_DISPLAY
58
+ },
59
+ spatial: {
60
+ xpaths: ['.//gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:geographicElement/gmd:EX_GeographicBoundingBox'],
61
+ multivalue: true,
62
+ format: Helpers::IsoToSolrFormat::SPATIAL_INDEX
63
+ },
64
+ spatial_area: {
65
+ xpaths: ['.//gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:geographicElement/gmd:EX_GeographicBoundingBox'],
66
+ multivalue: false,
67
+ reduce: Helpers::IsoToSolrFormat::MAX_SPATIAL_AREA,
68
+ format: Helpers::IsoToSolrFormat::SPATIAL_AREA
69
+ },
70
+ temporal_coverages: {
71
+ xpaths: ['.//gmd:EX_Extent[@id="temporalExtent"]'],
72
+ multivalue: false,
73
+ format: Helpers::R2RFormat::TEMPORAL_DISPLAY_STRING
74
+ },
75
+ temporal_duration: {
76
+ xpaths: ['.//gmd:EX_Extent[@id="temporalExtent"]'],
77
+ multivalue: false,
78
+ reduce: Helpers::SolrFormat::REDUCE_TEMPORAL_DURATION,
79
+ format: Helpers::R2RFormat::TEMPORAL_DURATION
80
+ },
81
+ temporal: {
82
+ xpaths: ['.//gmd:EX_Extent[@id="temporalExtent"]'],
83
+ multivalue: false,
84
+ format: Helpers::R2RFormat::TEMPORAL_INDEX_STRING
85
+ },
86
+ sensors: {
87
+ xpaths: ['.//gmi:acquisitionInformation/gmi:MI_AcquisitionInformation/gmi:instrument/gmi:MI_Instrument/gmi:type/gmx:Anchor'],
88
+ multivalue: true
89
+ },
90
+ source: {
91
+ xpaths: [''],
92
+ default_values: ['ADE'],
93
+ multivalue: false
94
+ },
95
+ facet_data_center: {
96
+ xpaths: [''],
97
+ default_values: ["#{Helpers::SolrFormat::DATA_CENTER_NAMES[:R2R][:long_name]} | #{Helpers::SolrFormat::DATA_CENTER_NAMES[:R2R][:short_name]}"],
98
+ multivalue: false
99
+ },
100
+ facet_spatial_scope: {
101
+ xpaths: ['.//gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:geographicElement/gmd:EX_GeographicBoundingBox'],
102
+ multivalue: true,
103
+ format: Helpers::IsoToSolrFormat::FACET_SPATIAL_SCOPE
104
+ },
105
+ facet_temporal_duration: {
106
+ xpaths: ['.//gmd:EX_Extent[@id="temporalExtent"]'],
107
+ default_values: [Helpers::SolrFormat::NOT_SPECIFIED],
108
+ format: Helpers::R2RFormat::FACET_TEMPORAL_DURATION,
109
+ multivalue: true
110
+ }
111
+ }
112
+ end
113
+ end
@@ -1,3 +1,3 @@
1
1
  module SearchSolrTools
2
- VERSION = '3.1.2'
2
+ VERSION = '3.1.3.pre2'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: search_solr_tools
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.1.2
4
+ version: 3.1.3.pre2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Chalstrom
@@ -12,7 +12,7 @@ authors:
12
12
  autorequire:
13
13
  bindir: bin
14
14
  cert_chain: []
15
- date: 2015-06-30 00:00:00.000000000 Z
15
+ date: 2015-07-01 00:00:00.000000000 Z
16
16
  dependencies:
17
17
  - !ruby/object:Gem::Dependency
18
18
  name: iso8601
@@ -288,6 +288,7 @@ files:
288
288
  - lib/search_solr_tools/harvesters/nsidc_json.rb
289
289
  - lib/search_solr_tools/harvesters/oai.rb
290
290
  - lib/search_solr_tools/harvesters/pdc.rb
291
+ - lib/search_solr_tools/harvesters/r2r.rb
291
292
  - lib/search_solr_tools/harvesters/rda.rb
292
293
  - lib/search_solr_tools/harvesters/tdar.rb
293
294
  - lib/search_solr_tools/harvesters/usgs.rb
@@ -298,6 +299,7 @@ files:
298
299
  - lib/search_solr_tools/helpers/iso_to_solr.rb
299
300
  - lib/search_solr_tools/helpers/iso_to_solr_format.rb
300
301
  - lib/search_solr_tools/helpers/query_builder.rb
302
+ - lib/search_solr_tools/helpers/r2r_format.rb
301
303
  - lib/search_solr_tools/helpers/selectors.rb
302
304
  - lib/search_solr_tools/helpers/solr_format.rb
303
305
  - lib/search_solr_tools/helpers/tdar_format.rb
@@ -310,6 +312,7 @@ files:
310
312
  - lib/search_solr_tools/selectors/nmi.rb
311
313
  - lib/search_solr_tools/selectors/nodc_iso.rb
312
314
  - lib/search_solr_tools/selectors/pdc_iso.rb
315
+ - lib/search_solr_tools/selectors/r2r.rb
313
316
  - lib/search_solr_tools/selectors/rda.rb
314
317
  - lib/search_solr_tools/selectors/tdar_opensearch.rb
315
318
  - lib/search_solr_tools/selectors/usgs_iso.rb
@@ -333,9 +336,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
333
336
  version: '2.0'
334
337
  required_rubygems_version: !ruby/object:Gem::Requirement
335
338
  requirements:
336
- - - ">="
339
+ - - ">"
337
340
  - !ruby/object:Gem::Version
338
- version: '0'
341
+ version: 1.3.1
339
342
  requirements: []
340
343
  rubyforge_project:
341
344
  rubygems_version: 2.4.8