search_solr_tools 3.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +88 -0
  3. data/COPYING +674 -0
  4. data/README.md +203 -0
  5. data/bin/search_solr_tools +87 -0
  6. data/lib/search_solr_tools.rb +8 -0
  7. data/lib/search_solr_tools/config/environments.rb +12 -0
  8. data/lib/search_solr_tools/config/environments.yaml +73 -0
  9. data/lib/search_solr_tools/harvesters/ade_auto_suggest.rb +43 -0
  10. data/lib/search_solr_tools/harvesters/auto_suggest.rb +61 -0
  11. data/lib/search_solr_tools/harvesters/base.rb +183 -0
  12. data/lib/search_solr_tools/harvesters/bcodmo.rb +55 -0
  13. data/lib/search_solr_tools/harvesters/cisl.rb +63 -0
  14. data/lib/search_solr_tools/harvesters/echo.rb +50 -0
  15. data/lib/search_solr_tools/harvesters/eol.rb +53 -0
  16. data/lib/search_solr_tools/harvesters/ices.rb +55 -0
  17. data/lib/search_solr_tools/harvesters/nmi.rb +32 -0
  18. data/lib/search_solr_tools/harvesters/nodc.rb +72 -0
  19. data/lib/search_solr_tools/harvesters/nsidc_auto_suggest.rb +33 -0
  20. data/lib/search_solr_tools/harvesters/nsidc_json.rb +60 -0
  21. data/lib/search_solr_tools/harvesters/oai.rb +59 -0
  22. data/lib/search_solr_tools/harvesters/pdc.rb +38 -0
  23. data/lib/search_solr_tools/harvesters/rda.rb +33 -0
  24. data/lib/search_solr_tools/harvesters/tdar.rb +57 -0
  25. data/lib/search_solr_tools/harvesters/usgs.rb +74 -0
  26. data/lib/search_solr_tools/helpers/bounding_box_util.rb +37 -0
  27. data/lib/search_solr_tools/helpers/csw_iso_query_builder.rb +30 -0
  28. data/lib/search_solr_tools/helpers/facet_configuration.rb +19 -0
  29. data/lib/search_solr_tools/helpers/iso_namespaces.rb +30 -0
  30. data/lib/search_solr_tools/helpers/iso_to_solr.rb +96 -0
  31. data/lib/search_solr_tools/helpers/iso_to_solr_format.rb +198 -0
  32. data/lib/search_solr_tools/helpers/query_builder.rb +13 -0
  33. data/lib/search_solr_tools/helpers/selectors.rb +20 -0
  34. data/lib/search_solr_tools/helpers/solr_format.rb +260 -0
  35. data/lib/search_solr_tools/helpers/tdar_format.rb +70 -0
  36. data/lib/search_solr_tools/helpers/translate_spatial_coverage.rb +77 -0
  37. data/lib/search_solr_tools/helpers/translate_temporal_coverage.rb +40 -0
  38. data/lib/search_solr_tools/helpers/usgs_format.rb +50 -0
  39. data/lib/search_solr_tools/selectors/cisl.rb +112 -0
  40. data/lib/search_solr_tools/selectors/echo_iso.rb +111 -0
  41. data/lib/search_solr_tools/selectors/ices_iso.rb +107 -0
  42. data/lib/search_solr_tools/selectors/nmi.rb +106 -0
  43. data/lib/search_solr_tools/selectors/nodc_iso.rb +107 -0
  44. data/lib/search_solr_tools/selectors/pdc_iso.rb +108 -0
  45. data/lib/search_solr_tools/selectors/rda.rb +106 -0
  46. data/lib/search_solr_tools/selectors/tdar_opensearch.rb +89 -0
  47. data/lib/search_solr_tools/selectors/usgs_iso.rb +105 -0
  48. data/lib/search_solr_tools/translators/bcodmo_json.rb +69 -0
  49. data/lib/search_solr_tools/translators/eol_to_solr.rb +78 -0
  50. data/lib/search_solr_tools/translators/nsidc_json.rb +190 -0
  51. data/lib/search_solr_tools/version.rb +3 -0
  52. data/search_solr_tools.gemspec +45 -0
  53. metadata +345 -0
@@ -0,0 +1,37 @@
1
+ require_relative './iso_namespaces'
2
+
3
+ module SearchSolrTools
4
+ module Helpers
5
# Helpers for deriving and classifying geographic bounding boxes.
#
# A box is a hash keyed by :west, :south, :east and :north. Values are read
# through #to_s / #to_f, so both strings and numerics work.
module BoundingBoxUtil
  SOUTHERN_GLOBAL_BOUNDARY = -85.0
  NORTHERN_GLOBAL_BOUNDARY = 85.0

  class << self
    # Builds a bounding box hash (string values) for +geometry+.
    # A point yields a box whose opposite edges coincide; any other geometry
    # is measured with RGeo's cartesian bounding box.
    def bounding_box_hash_from_geo_json(geometry)
      unless geometry_is_point?(geometry)
        extent = RGeo::Cartesian::BoundingBox.create_from_geometry(geometry)
        return {
          west: extent.min_x.to_s,
          south: extent.min_y.to_s,
          east: extent.max_x.to_s,
          north: extent.max_y.to_s
        }
      end

      {
        west: geometry.x.to_s,
        south: geometry.y.to_s,
        east: geometry.x.to_s,
        north: geometry.y.to_s
      }
    end

    # True when the geometry's type name is "point", ignoring case.
    def geometry_is_point?(geometry)
      type_name = geometry.geometry_type.to_s.downcase
      type_name.eql?('point')
    end

    # True when the box reaches beyond both global latitude boundaries.
    def box_global?(box)
      return false unless box[:south].to_f < SOUTHERN_GLOBAL_BOUNDARY

      box[:north].to_f > NORTHERN_GLOBAL_BOUNDARY
    end

    # True when the box's north/south extent is less than 1.
    def box_local?(box)
      (box[:north].to_f - box[:south].to_f) < 1
    end

    # True when any of the four edges is missing or blank.
    def box_invalid?(box)
      %i(north south east west).any? { |edge| box[edge].to_s.empty? }
    end
  end
end
36
+ end
37
+ end
@@ -0,0 +1,30 @@
1
+ require 'search_solr_tools/helpers/query_builder'
2
+
3
+ module SearchSolrTools
4
+ module Helpers
5
# Builds the full request URL used to query a CSW endpoint for ISO records.
class CswIsoQueryBuilder
  # Parameters sent with every request unless overridden by the caller.
  DEFAULT_PARAMS = {
    service: 'CSW',
    version: '2.0.2',
    request: 'GetRecords',
    'TypeNames' => 'gmd:MD_Metadata',
    'ElementSetName' => 'full',
    'resultType' => 'results',
    'outputFormat' => 'application/xml',
    'maxRecords' => '25',
    'startPosition' => '1',
    'outputSchema' => 'http://www.isotc211.org/2005/gmd'
  }

  class << self
    # Returns +url+ followed by the query string built from the defaults
    # merged with +query_params+.
    def get_query_string(url, query_params = {})
      merged = self.query_params(query_params)
      query = QueryBuilder.build(merged)
      query.prepend(url)
    end

    # Returns a new hash of DEFAULT_PARAMS with +query_params+ layered on top.
    def query_params(query_params = {})
      DEFAULT_PARAMS.merge(query_params)
    end
  end
end
29
+ end
30
+ end
@@ -0,0 +1,19 @@
1
+ require 'json'
2
+ require 'rest_client'
3
+ require 'singleton'
4
+
5
+ module SearchSolrTools
6
+ module Helpers
7
# Holds the facet binning configuration fetched from the catalog services
# endpoint. The parsed configuration is kept in a class-level instance
# variable and is only downloaded once.
class FacetConfiguration
  include Singleton

  class << self
    # Fetches and parses "<nsidc_dataset_metadata_url>/binConfiguration" for
    # +env+, unless a configuration has already been stored.
    def import_bin_configuration(env)
      return unless @bin_configuration.nil?

      response = RestClient.get(SolrEnvironments[env][:nsidc_dataset_metadata_url] + '/binConfiguration')
      @bin_configuration = JSON.parse(response)
    end

    # Returns the bins whose 'facet_name' equals +facet_name+, ordered by
    # their 'order_value'. Requires the configuration to have been imported.
    def get_facet_bin(facet_name)
      matching = @bin_configuration.select { |bin| bin['facet_name'] == facet_name }
      matching.sort_by { |bin| bin['order_value'] }
    end
  end
end
18
+ end
19
+ end
@@ -0,0 +1,30 @@
1
+ module SearchSolrTools
2
+ module Helpers
3
# Supplies the default prefix => URI namespace map used when evaluating
# XPath expressions against XML documents.
class IsoNamespaces
  ISO_NAMESPACES = {
    'csw' => 'http://www.opengis.net/cat/csw/2.0.2',
    'gmd' => 'http://www.isotc211.org/2005/gmd',
    'gco' => 'http://www.isotc211.org/2005/gco',
    'gml' => 'http://www.opengis.net/gml/3.2',
    'gmi' => 'http://www.isotc211.org/2005/gmi',
    'gmx' => 'http://www.isotc211.org/2005/gmx',
    'gsr' => 'http://www.isotc211.org/2005/gsr',
    'gss' => 'http://www.isotc211.org/2005/gss',
    'gts' => 'http://www.isotc211.org/2005/gts',
    'srv' => 'http://www.isotc211.org/2005/srv',
    'xlink' => 'http://www.w3.org/1999/xlink',
    'xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
    'oai' => 'http://www.openarchives.org/OAI/2.0/',
    'dif' => 'http://gcmd.gsfc.nasa.gov/Aboutus/xml/dif/',
    'atom' => 'http://www.w3.org/2005/Atom',
    'dc' => 'http://purl.org/dc/elements/1.1/',
    'georss' => 'http://www.georss.org/georss'
  }

  # Returns a new hash of the default namespaces. When +doc+ is given, the
  # namespaces it reports are merged on top and win on conflicting keys.
  def self.namespaces(doc = nil)
    document_namespaces = if doc.nil?
                            {}
                          else
                            doc.namespaces
                          end
    ISO_NAMESPACES.merge(document_namespaces)
  end
end
29
+ end
30
+ end
@@ -0,0 +1,96 @@
1
+ require 'nokogiri'
2
+
3
+ module SearchSolrTools
4
+ module Helpers
5
# Translates ISO Nokogiri documents into Solr Nokogiri documents, driven by a
# selector hash.
#
# Construct an instance with the key of an entry in SELECTORS (see
# selectors.rb), then call #translate with a Nokogiri ISO document.
class IsoToSolr
  # Runs of two or more whitespace characters; compiled once and reused by
  # #format_field.
  MULTIPLE_WHITESPACE = /\s{2,}/

  # selector - key into SELECTORS choosing the field definitions to apply.
  def initialize(selector)
    @fields = SELECTORS[selector]
    @multiple_whitespace = MULTIPLE_WHITESPACE # kept for code that reads the ivar
  end

  # Returns an array with the elements matched by +xpath+. Only the first
  # match is kept when +multivalue+ is false and no +reduce+ is given.
  # Any error raised while evaluating the xpath yields an empty array.
  def eval_xpath(iso_xml_doc, xpath, multivalue, reduce)
    fields = []
    begin
      iso_xml_doc.xpath(xpath, IsoNamespaces.namespaces(iso_xml_doc)).each do |f|
        fields.push(f)
        break if multivalue == false && reduce.nil?
      end
    rescue StandardError
      # Deliberately best effort: a failing xpath counts as "no data".
      fields = []
    end
    fields
  end

  # Returns the selector's :default_values, or [''] when none are configured.
  def get_default_values(selector)
    selector.key?(:default_values) ? selector[:default_values] : ['']
  end

  # Returns field.text when the field responds to #text, else the field itself.
  def format_text(field)
    field.respond_to?(:text) ? field.text : field
  end

  # Formats a single field: applies the selector's :format callback when
  # present (falling back to the plain text if the callback raises), removes
  # invalid bytes, trims, and collapses repeated whitespace.
  #
  # The input is never modified in place, so frozen strings and shared
  # selector default values are safe to pass.
  def format_field(selector, field)
    formatted = begin
      selector.key?(:format) ? selector[:format].call(field) : format_text(field)
    rescue StandardError
      format_text(field)
    end
    formatted = strip_invalid_utf8_bytes(formatted)
    formatted = formatted.strip if formatted.respond_to?(:strip)
    formatted = formatted.gsub(MULTIPLE_WHITESPACE, ' ') if formatted.respond_to?(:gsub)
    formatted
  end

  # Formats every field, flattens the result, optionally folds it into a
  # single value with +reduce+, and de-duplicates when selector[:unique] is set.
  def format_fields(selector, fields, reduce = nil)
    formatted = fields.map { |f| format_field(selector, f) }.flatten
    formatted = [reduce.call(formatted)] unless reduce.nil?
    selector[:unique] ? formatted.uniq : formatted
  end

  # Evaluates the selector's xpaths in order and returns the formatted values
  # of the first one that yields non-blank text; otherwise returns the
  # formatted default values.
  def create_solr_fields(iso_xml_doc, selector)
    selector[:xpaths].each do |xpath|
      fields = eval_xpath(iso_xml_doc, xpath, selector[:multivalue], selector[:reduce])

      # stop evaluating xpaths once we find data in one of them
      next unless fields.any? { |f| !strip_invalid_utf8_bytes(f.text).strip.empty? }

      return format_fields(selector, fields, selector[:reduce])
    end
    format_fields(selector, get_default_values(selector))
  end

  # Builds and returns the Solr document (<doc> with <field> children) for
  # the given ISO document.
  def translate(iso_xml_doc)
    solr_xml_doc = Nokogiri::XML::Builder.new do |xml|
      xml.doc_ do
        build_fields(xml, iso_xml_doc)
      end
    end
    solr_xml_doc.doc
  end

  # Emits one <field name="..."> element per non-empty value of every
  # configured field. Array values are expanded into one element each.
  def build_fields(xml, iso_xml_doc)
    @fields.each do |field_name, selector|
      create_solr_fields(iso_xml_doc, selector).each do |value|
        if value.is_a? Array
          value.each do |v|
            xml.field_({ name: field_name }, v) unless v.nil? || v.eql?('')
          end
        else
          xml.field_({ name: field_name }, value) unless value.nil? || value.eql?('')
        end
      end
    end
  end

  # Returns +text+ with undecodable bytes and U+00BF characters removed.
  # Strings with an invalid encoding are re-encoded from binary to UTF-8,
  # dropping every byte that cannot be converted. Non-string values are
  # returned untouched. A new string is returned; +text+ is not mutated.
  def strip_invalid_utf8_bytes(text)
    if text.respond_to?(:encode) && !text.valid_encoding?
      text = text.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
    end

    text = text.gsub("\u00BF", '') if text.respond_to?(:gsub)

    text
  end
end
95
+ end
96
+ end
@@ -0,0 +1,198 @@
1
+ require 'date'
2
+
3
+ require_relative './iso_namespaces'
4
+ require_relative './solr_format'
5
+
6
+ module SearchSolrTools
7
+ module Helpers
8
# Formatting helpers that turn ISO XML nodes into values (mostly strings)
# that can be indexed by Solr.
#
# Each constant below wraps one class method in a proc so it can be handed
# around as a callback.
# rubocop:disable ClassLength
class IsoToSolrFormat
  KEYWORDS = proc { |keywords| build_keyword_list keywords }

  SPATIAL_DISPLAY = proc { |node| IsoToSolrFormat.spatial_display_str(node) }
  SPATIAL_INDEX = proc { |node| IsoToSolrFormat.spatial_index_str(node) }
  SPATIAL_AREA = proc { |node| IsoToSolrFormat.spatial_area_str(node) }
  MAX_SPATIAL_AREA = proc { |values| IsoToSolrFormat.get_max_spatial_area(values) }

  FACET_SPONSORED_PROGRAM = proc { |node| IsoToSolrFormat.sponsored_program_facet node }
  FACET_SPATIAL_COVERAGE = proc { |node| IsoToSolrFormat.get_spatial_facet(node) }
  FACET_SPATIAL_SCOPE = proc { |node| IsoToSolrFormat.get_spatial_scope_facet(node) }
  FACET_TEMPORAL_DURATION = proc { |node| IsoToSolrFormat.get_temporal_duration_facet(node) }

  TEMPORAL_DURATION = proc { |node| IsoToSolrFormat.get_temporal_duration(node) }
  TEMPORAL_INDEX_STRING = proc { |node| IsoToSolrFormat.temporal_index_str node }
  TEMPORAL_DISPLAY_STRING = proc { |node| IsoToSolrFormat.temporal_display_str node }
  TEMPORAL_DISPLAY_STRING_FORMATTED = proc { |node| IsoToSolrFormat.temporal_display_str(node, true) }

  DATASET_URL = proc { |node| IsoToSolrFormat.dataset_url(node) }
  ICES_DATASET_URL = proc { |node| IsoToSolrFormat.ices_dataset_url(node) }
  EOL_AUTHOR_FORMAT = proc { |node| IsoToSolrFormat.eol_author_format(node) }

  class << self
    # Box edges joined as "south west north east".
    def spatial_display_str(box_node)
      bounding_box(box_node).values_at(:south, :west, :north, :east).join(' ')
    end

    # Box edges joined as "west south east north"; a degenerate box (both
    # pairs of opposite edges equal) is reduced to "west south".
    def spatial_index_str(box_node)
      box = bounding_box(box_node)
      degenerate = box[:west] == box[:east] && box[:south] == box[:north]
      edges = degenerate ? [:west, :south] : [:west, :south, :east, :north]
      box.values_at(*edges).join(' ')
    end

    # North minus south, as a Float.
    def spatial_area_str(box_node)
      box = bounding_box(box_node)
      box[:north].to_f - box[:south].to_f
    end

    # Largest of +values+ after converting each with #to_f.
    def get_max_spatial_area(values)
      values.map { |value| value.to_f }.max
    end

    # nil for an invalid box, otherwise 'Global' or 'Non Global'.
    def get_spatial_facet(box_node)
      box = bounding_box(box_node)
      return nil if BoundingBoxUtil.box_invalid?(box)

      BoundingBoxUtil.box_global?(box) ? 'Global' : 'Non Global'
    end

    # Delegates to SolrFormat with the box extracted from +box_node+.
    def get_spatial_scope_facet(box_node)
      SolrFormat.get_spatial_scope_facet_with_bounding_box(bounding_box(box_node))
    end

    # Delegates to SolrFormat with the node's date range.
    def temporal_display_str(temporal_node, formatted = false)
      range = date_range(temporal_node, formatted)
      SolrFormat.temporal_display_str(range)
    end

    # Duration between the node's start and end dates as computed by
    # SolrFormat. The current time stands in for a missing end date; a
    # missing start date yields nil.
    def get_temporal_duration(temporal_node)
      range = date_range(temporal_node)
      finish = range[:end].to_s.empty? ? Time.now : Time.parse(range[:end])
      return nil if range[:start].to_s.empty?

      SolrFormat.get_temporal_duration(Time.parse(range[:start]), finish)
    end

    # Delegates to SolrFormat with the node's temporal duration.
    def get_temporal_duration_facet(temporal_node)
      SolrFormat.get_temporal_duration_facet(get_temporal_duration(temporal_node))
    end

    # Delegates to SolrFormat with the node's (unformatted) date range.
    def temporal_index_str(temporal_node)
      SolrFormat.temporal_index_str(date_range(temporal_node))
    end

    # "<organisation name> | <organisation short name>", each stripped.
    def sponsored_program_facet(node)
      long_name, short_name = ['.//gmd:organisationName', './/gmd:organisationShortName'].map do |path|
        node.xpath(path, IsoNamespaces.namespaces(node)).text.strip
      end

      "#{long_name} | #{short_name}"
    end

    # "<category> > <topic> > <term>" built from the keyword node's children.
    def build_keyword_list(keywords)
      levels = ['.//CategoryKeyword', './/TopicKeyword', './/TermKeyword'].map do |path|
        keywords.xpath(path).text
      end
      levels.join(' > ')
    end

    # Returns { start:, end: } for the node. Values SolrFormat does not
    # recognise as dates become ''; with +formatted+ each value is then passed
    # through SolrFormat.date_str.
    def date_range(temporal_node, formatted = false)
      extract = lambda do |paths|
        value = get_first_matching_child(temporal_node, paths)
        value = '' unless SolrFormat.date?(value)
        formatted ? SolrFormat.date_str(value) : value
      end

      first = extract.call(['.//gml:beginPosition', './/BeginningDateTime', './/gco:Date', './/dif:Start_Date'])
      last = extract.call(['.//gml:endPosition', './/EndingDateTime', './/gco:Date', './/dif:Stop_Date'])

      { start: first, end: last }
    end

    # Met.no sometimes has bad metadata, such as <gmd:URL>SU-1 (planned activity)</gmd:URL>
    # so only text containing an http(s) scheme is returned; anything else becomes ''.
    def dataset_url(url_node)
      if url_node.text.strip =~ %r{http[s]?://}
        url_node.text.strip
      else
        ''
      end
    end

    # ICES GeoNetwork landing page URL for the given identifier.
    def ices_dataset_url(auth_id)
      prefix = 'http://geo.ices.dk/geonetwork/srv/en/main.home?uuid='
      prefix + auth_id
    end

    # Text of the node returned by at_xpath for the union of +paths+, or ''
    # when nothing matches.
    def get_first_matching_child(node, paths)
      match = node.at_xpath(paths.join(' | '), IsoNamespaces.namespaces(node))
      return '' if match.nil?

      match.text
    end

    # Hash of the four bounds (west, south, east, north) for +box_node+.
    def bounding_box(box_node)
      [:west, :south, :east, :north].each_with_object({}) do |direction, box|
        box[direction] = get_bound(box_node, direction)
      end
    end

    # 'Latitude' for :north/:south, 'Longitude' for :east/:west, else nil.
    def axis_label(direction)
      case direction
      when :north, :south then 'Latitude'
      when :east, :west then 'Longitude'
      end
    end

    # Largest absolute value allowed on the given axis.
    def coordinate_boundary(lat_lon)
      limits = { 'Latitude' => 90, 'Longitude' => 180 }
      limits[lat_lon]
    end

    # Whitespace-separated tokens of the bound's text for +direction+, looked
    # up under each of the supported element names.
    def node_values(box_node, direction, lat_lon)
      lower = direction.to_s.downcase
      capitalized = direction.to_s.capitalize
      candidates = [
        "./gmd:#{lower}Bounding#{lat_lon}/gco:Decimal",
        "./gmd:#{lower}Bound#{lat_lon}/gco:Decimal",
        "./#{capitalized}BoundingCoordinate",
        "./dif:#{capitalized}ernmost_#{lat_lon}"
      ]
      get_first_matching_child(box_node, candidates).split(' ')
    end

    # Bound for +direction+ as a float string. Returns '' when the value is
    # missing or exceeds the axis limit; a trailing "West"/"South" token
    # negates the number.
    def get_bound(box_node, direction)
      axis = axis_label(direction)
      tokens = node_values(box_node, direction, axis)
      number = tokens.first.to_f
      too_large = coordinate_boundary(axis) < number.abs

      return '' if tokens.empty? || too_large

      number = -number if %w(West South).include?(tokens.last)
      number.to_s
    end
  end
end
197
+ end
198
+ end
@@ -0,0 +1,13 @@
1
+ module SearchSolrTools
2
+ module Helpers
3
# Builds a URL query string from a hash of parameters.
class QueryBuilder
  # Renders every pair as "key=value", joins the pairs with "&" and prefixes
  # the result with "?". Keys and values are interpolated as they are; no URL
  # escaping is applied.
  def self.build(params)
    pairs = params.map { |name, value| "#{name}=#{value}" }
    "?#{pairs.join('&')}"
  end
end
12
+ end
13
+ end
@@ -0,0 +1,20 @@
1
+ require 'require_all'
2
+ require_rel '../selectors'
3
+
4
+ module SearchSolrTools
5
+ module Helpers
6
# Registry of selector definitions keyed by source name.
# To support a new source, create its selector file in the selectors
# directory and add an entry for it here.
SELECTORS = {
  cisl: Selectors::CISL,
  echo: Selectors::ECHO,
  ices: Selectors::ICES,
  nmi: Selectors::NMI,
  nodc: Selectors::NODC,
  pdc: Selectors::PDC,
  rda: Selectors::RDA,
  tdar: Selectors::TDAR,
  usgs: Selectors::USGS
}
19
+ end
20
+ end