search_solr_tools 3.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +88 -0
  3. data/COPYING +674 -0
  4. data/README.md +203 -0
  5. data/bin/search_solr_tools +87 -0
  6. data/lib/search_solr_tools.rb +8 -0
  7. data/lib/search_solr_tools/config/environments.rb +12 -0
  8. data/lib/search_solr_tools/config/environments.yaml +73 -0
  9. data/lib/search_solr_tools/harvesters/ade_auto_suggest.rb +43 -0
  10. data/lib/search_solr_tools/harvesters/auto_suggest.rb +61 -0
  11. data/lib/search_solr_tools/harvesters/base.rb +183 -0
  12. data/lib/search_solr_tools/harvesters/bcodmo.rb +55 -0
  13. data/lib/search_solr_tools/harvesters/cisl.rb +63 -0
  14. data/lib/search_solr_tools/harvesters/echo.rb +50 -0
  15. data/lib/search_solr_tools/harvesters/eol.rb +53 -0
  16. data/lib/search_solr_tools/harvesters/ices.rb +55 -0
  17. data/lib/search_solr_tools/harvesters/nmi.rb +32 -0
  18. data/lib/search_solr_tools/harvesters/nodc.rb +72 -0
  19. data/lib/search_solr_tools/harvesters/nsidc_auto_suggest.rb +33 -0
  20. data/lib/search_solr_tools/harvesters/nsidc_json.rb +60 -0
  21. data/lib/search_solr_tools/harvesters/oai.rb +59 -0
  22. data/lib/search_solr_tools/harvesters/pdc.rb +38 -0
  23. data/lib/search_solr_tools/harvesters/rda.rb +33 -0
  24. data/lib/search_solr_tools/harvesters/tdar.rb +57 -0
  25. data/lib/search_solr_tools/harvesters/usgs.rb +74 -0
  26. data/lib/search_solr_tools/helpers/bounding_box_util.rb +37 -0
  27. data/lib/search_solr_tools/helpers/csw_iso_query_builder.rb +30 -0
  28. data/lib/search_solr_tools/helpers/facet_configuration.rb +19 -0
  29. data/lib/search_solr_tools/helpers/iso_namespaces.rb +30 -0
  30. data/lib/search_solr_tools/helpers/iso_to_solr.rb +96 -0
  31. data/lib/search_solr_tools/helpers/iso_to_solr_format.rb +198 -0
  32. data/lib/search_solr_tools/helpers/query_builder.rb +13 -0
  33. data/lib/search_solr_tools/helpers/selectors.rb +20 -0
  34. data/lib/search_solr_tools/helpers/solr_format.rb +260 -0
  35. data/lib/search_solr_tools/helpers/tdar_format.rb +70 -0
  36. data/lib/search_solr_tools/helpers/translate_spatial_coverage.rb +77 -0
  37. data/lib/search_solr_tools/helpers/translate_temporal_coverage.rb +40 -0
  38. data/lib/search_solr_tools/helpers/usgs_format.rb +50 -0
  39. data/lib/search_solr_tools/selectors/cisl.rb +112 -0
  40. data/lib/search_solr_tools/selectors/echo_iso.rb +111 -0
  41. data/lib/search_solr_tools/selectors/ices_iso.rb +107 -0
  42. data/lib/search_solr_tools/selectors/nmi.rb +106 -0
  43. data/lib/search_solr_tools/selectors/nodc_iso.rb +107 -0
  44. data/lib/search_solr_tools/selectors/pdc_iso.rb +108 -0
  45. data/lib/search_solr_tools/selectors/rda.rb +106 -0
  46. data/lib/search_solr_tools/selectors/tdar_opensearch.rb +89 -0
  47. data/lib/search_solr_tools/selectors/usgs_iso.rb +105 -0
  48. data/lib/search_solr_tools/translators/bcodmo_json.rb +69 -0
  49. data/lib/search_solr_tools/translators/eol_to_solr.rb +78 -0
  50. data/lib/search_solr_tools/translators/nsidc_json.rb +190 -0
  51. data/lib/search_solr_tools/version.rb +3 -0
  52. data/search_solr_tools.gemspec +45 -0
  53. metadata +345 -0
@@ -0,0 +1,37 @@
1
+ require_relative './iso_namespaces'
2
+
3
+ module SearchSolrTools
4
+ module Helpers
5
# Utility methods for dealing with bounding boxes.
#
# A "box" here is a hash with :west, :south, :east and :north entries; the
# predicates coerce the values they read with #to_f / #to_s.
module BoundingBoxUtil
  # A box must extend strictly beyond both of these latitudes to count as global.
  SOUTHERN_GLOBAL_BOUNDARY = -85.0
  NORTHERN_GLOBAL_BOUNDARY = 85.0

  class << self
    # Builds a box hash (all values as strings) from a geometry object.
    #
    # A point geometry yields a degenerate box taken from its x/y; any other
    # geometry is delegated to RGeo::Cartesian::BoundingBox.
    def bounding_box_hash_from_geo_json(geometry)
      unless geometry_is_point?(geometry)
        envelope = RGeo::Cartesian::BoundingBox.create_from_geometry(geometry)
        return {
          west: envelope.min_x.to_s,
          south: envelope.min_y.to_s,
          east: envelope.max_x.to_s,
          north: envelope.max_y.to_s
        }
      end

      {
        west: geometry.x.to_s,
        south: geometry.y.to_s,
        east: geometry.x.to_s,
        north: geometry.y.to_s
      }
    end

    # True when the geometry's type name is "point" (case-insensitive).
    def geometry_is_point?(geometry)
      type_name = geometry.geometry_type.to_s.downcase
      type_name == 'point'
    end

    # True when the box reaches past both global latitude boundaries.
    def box_global?(box)
      southern_edge = box[:south].to_f
      northern_edge = box[:north].to_f
      southern_edge < SOUTHERN_GLOBAL_BOUNDARY && northern_edge > NORTHERN_GLOBAL_BOUNDARY
    end

    # True when the box spans less than one unit between :south and :north.
    # Only the north/south extent is considered.
    def box_local?(box)
      (box[:north].to_f - box[:south].to_f) < 1
    end

    # True when any of the four edges is missing or blank.
    def box_invalid?(box)
      [:north, :south, :east, :west].any? do |edge|
        box[edge].to_s.empty?
      end
    end
  end
end
36
+ end
37
+ end
@@ -0,0 +1,30 @@
1
+ require 'search_solr_tools/helpers/query_builder'
2
+
3
+ module SearchSolrTools
4
+ module Helpers
5
# Constructs the string to query a CSW endpoint
class CswIsoQueryBuilder
  # Parameters sent with every request unless the caller overrides them.
  # Frozen so the shared defaults cannot be modified in place; .query_params
  # always hands back a fresh, unfrozen hash.
  DEFAULT_PARAMS = {
    service: 'CSW',
    version: '2.0.2',
    request: 'GetRecords',
    'TypeNames' => 'gmd:MD_Metadata',
    'ElementSetName' => 'full',
    'resultType' => 'results',
    'outputFormat' => 'application/xml',
    'maxRecords' => '25',
    'startPosition' => '1',
    'outputSchema' => 'http://www.isotc211.org/2005/gmd'
  }.freeze

  # Returns +url+ followed by the query string QueryBuilder produces for the
  # defaults merged with +query_params+.
  #
  # url          - String base URL; it is prepended verbatim.
  # query_params - Hash of overrides/additions (default: none).
  def self.get_query_string(url, query_params = {})
    # The parameter shadows the class method of the same name, so call the
    # method through an explicit receiver.
    all_params = self.query_params(query_params)
    QueryBuilder.build(all_params).prepend(url)
  end

  # Returns a new hash: DEFAULT_PARAMS with +query_params+ merged on top.
  # Keys are matched exactly, so :service and 'service' are different keys.
  def self.query_params(query_params = {})
    DEFAULT_PARAMS.merge(query_params)
  end
end
29
+ end
30
+ end
@@ -0,0 +1,19 @@
1
+ require 'json'
2
+ require 'rest_client'
3
+ require 'singleton'
4
+
5
+ module SearchSolrTools
6
+ module Helpers
7
## Singleton configuration class to get and parse the binning configuration from the catalog services endpoint
#
# The parsed configuration is kept in a class-level instance variable and is
# fetched at most once.
class FacetConfiguration
  include Singleton

  class << self
    # Downloads and parses "<nsidc_dataset_metadata_url>/binConfiguration" for
    # the given environment key. Does nothing (and returns nil) when a
    # configuration has already been stored.
    def import_bin_configuration(env)
      return unless @bin_configuration.nil?

      endpoint = SolrEnvironments[env][:nsidc_dataset_metadata_url] + '/binConfiguration'
      @bin_configuration = JSON.parse(RestClient.get(endpoint))
    end

    # Returns the stored bins whose 'facet_name' equals +facet_name+, ordered
    # by their 'order_value'. import_bin_configuration must have run first.
    def get_facet_bin(facet_name)
      matching_bins = @bin_configuration.select { |bin| bin['facet_name'] == facet_name }
      matching_bins.sort_by { |bin| bin['order_value'] }
    end
  end
end
18
+ end
19
+ end
@@ -0,0 +1,30 @@
1
+ module SearchSolrTools
2
+ module Helpers
3
# Helper class to provide default namespaces for XML document parsing.
class IsoNamespaces
  # Default prefix => namespace URI map. Defined before the method that reads
  # it and frozen so the shared defaults cannot be modified in place.
  ISO_NAMESPACES = {
    'csw' => 'http://www.opengis.net/cat/csw/2.0.2',
    'gmd' => 'http://www.isotc211.org/2005/gmd',
    'gco' => 'http://www.isotc211.org/2005/gco',
    'gml' => 'http://www.opengis.net/gml/3.2',
    'gmi' => 'http://www.isotc211.org/2005/gmi',
    'gmx' => 'http://www.isotc211.org/2005/gmx',
    'gsr' => 'http://www.isotc211.org/2005/gsr',
    'gss' => 'http://www.isotc211.org/2005/gss',
    'gts' => 'http://www.isotc211.org/2005/gts',
    'srv' => 'http://www.isotc211.org/2005/srv',
    'xlink' => 'http://www.w3.org/1999/xlink',
    'xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
    'oai' => 'http://www.openarchives.org/OAI/2.0/',
    'dif' => 'http://gcmd.gsfc.nasa.gov/Aboutus/xml/dif/',
    'atom' => 'http://www.w3.org/2005/Atom',
    'dc' => 'http://purl.org/dc/elements/1.1/',
    'georss' => 'http://www.georss.org/georss'
  }.freeze

  # Returns a new hash of the default namespaces, overlaid with whatever
  # +doc.namespaces+ returns when a document is given (the document's entries
  # win on key collisions).
  #
  # doc - object responding to #namespaces, or nil for the defaults only.
  def self.namespaces(doc = nil)
    ISO_NAMESPACES.merge(doc.nil? ? {} : doc.namespaces)
  end
end
29
+ end
30
+ end
@@ -0,0 +1,96 @@
1
+ require 'nokogiri'
2
+
3
+ module SearchSolrTools
4
+ module Helpers
5
# Translates ISO nokogiri documents into solr nokogiri documents using a hash driver object.
# This class should be constructed passing the key of a selector set in SELECTORS (see selectors.rb);
# after creating an instance, call #translate with a nokogiri ISO document as a parameter.
#
# Each selector read by this class is a hash with:
#   :xpaths         - list of xpath expressions, tried in order
#   :multivalue     - when false (and no :reduce), only the first match is kept
#   :reduce         - optional callable applied to the list of formatted values
#   :format         - optional callable applied to each matched node
#   :default_values - optional values used when no xpath yields text
#   :unique         - when truthy, duplicate values are removed
class IsoToSolr
  # Runs of two or more whitespace characters; collapsed to one space by #format_field.
  MULTIPLE_WHITESPACE = /\s{2,}/

  # selector - key into SELECTORS identifying the field definitions to use.
  def initialize(selector)
    @fields = SELECTORS[selector]
  end

  # Returns an array with the nodes that matched the xpath. Only the first
  # match is returned when +multivalue+ is false and no +reduce+ is given.
  # Any StandardError raised during evaluation is swallowed and an empty
  # array is returned.
  def eval_xpath(iso_xml_doc, xpath, multivalue, reduce)
    first_match_only = multivalue == false && reduce.nil?
    matches = []
    iso_xml_doc.xpath(xpath, IsoNamespaces.namespaces(iso_xml_doc)).each do |node|
      matches.push(node)
      break if first_match_only
    end
    matches
  rescue StandardError
    []
  end

  # Returns the selector's :default_values, or [''] when the key is absent.
  def get_default_values(selector)
    selector.fetch(:default_values, [''])
  end

  # Returns field.text when the field responds to #text, else the field itself.
  def format_text(field)
    field.respond_to?(:text) ? field.text : field
  end

  # Formats one matched value: applies the selector's :format callable (falling
  # back to the plain text when there is none or when it raises), removes
  # unwanted characters, trims, and collapses repeated whitespace.
  #
  # The input is never modified in place, so shared or frozen strings (for
  # example selector default values) are safe to pass. Non-string results are
  # returned as produced.
  def format_field(selector, field)
    formatted =
      begin
        selector.key?(:format) ? selector[:format].call(field) : format_text(field)
      rescue StandardError
        format_text(field)
      end

    formatted = strip_invalid_utf8_bytes(formatted)
    formatted = formatted.strip if formatted.respond_to?(:strip)
    formatted = formatted.gsub(MULTIPLE_WHITESPACE, ' ') if formatted.respond_to?(:gsub)
    formatted
  end

  # Formats every field, flattens the results, optionally reduces them to a
  # single value with +reduce+, and de-duplicates when selector[:unique] is set.
  def format_fields(selector, fields, reduce = nil)
    formatted = fields.map { |f| format_field(selector, f) }.flatten
    formatted = [reduce.call(formatted)] unless reduce.nil?
    selector[:unique] ? formatted.uniq : formatted
  end

  # Returns the formatted values for one selector: those of the first xpath
  # that yields any non-blank text, or the selector's defaults when none does.
  def create_solr_fields(iso_xml_doc, selector)
    selector[:xpaths].each do |xpath|
      fields = eval_xpath(iso_xml_doc, xpath, selector[:multivalue], selector[:reduce])

      # stop evaluating xpaths once we find data in one of them
      has_text = fields.any? { |f| !strip_invalid_utf8_bytes(f.text).strip.empty? }
      return format_fields(selector, fields, selector[:reduce]) if has_text
    end
    format_fields(selector, get_default_values(selector))
  end

  # Builds and returns the solr document (<doc> with one <field> per value).
  def translate(iso_xml_doc)
    solr_xml_doc = Nokogiri::XML::Builder.new do |xml|
      xml.doc_ do
        build_fields(xml, iso_xml_doc)
      end
    end
    solr_xml_doc.doc
  end

  # Emits a <field name="..."> element for every non-nil, non-empty value of
  # every configured field. A value that is itself an array (as a :reduce
  # callable may return) contributes one element per entry.
  def build_fields(xml, iso_xml_doc)
    @fields.each do |field_name, selector|
      create_solr_fields(iso_xml_doc, selector).each do |value|
        entries = value.is_a?(Array) ? value : [value]
        entries.each do |entry|
          xml.field_({ name: field_name }, entry) unless entry.nil? || entry.eql?('')
        end
      end
    end
  end

  # Returns a cleaned copy of +text+; the argument itself is left untouched.
  #
  # When the string's encoding is invalid it is re-encoded from binary to
  # UTF-8 with unconvertible bytes dropped, which removes every non-ASCII
  # byte from that string. Inverted question marks (U+00BF) are always
  # removed. Values that are not string-like are returned unchanged.
  def strip_invalid_utf8_bytes(text)
    if text.respond_to?(:encode) && !text.valid_encoding?
      text = text.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
    end

    text = text.gsub("\u00BF", '') if text.respond_to?(:gsub)

    text
  end
end
95
+ end
96
+ end
@@ -0,0 +1,198 @@
1
+ require 'date'
2
+
3
+ require_relative './iso_namespaces'
4
+ require_relative './solr_format'
5
+
6
+ module SearchSolrTools
7
+ module Helpers
8
# Methods for generating formatted strings from ISO xml nodes that can be indexed by SOLR
#
# The proc constants wrap the class methods below so that they can be passed
# around as callables (presumably as :format / :reduce entries of the selector
# hashes -- confirm against the selector files).
# rubocop:disable ClassLength
class IsoToSolrFormat
  KEYWORDS = proc { |keywords| IsoToSolrFormat.build_keyword_list(keywords) }

  SPATIAL_DISPLAY = proc { |node| IsoToSolrFormat.spatial_display_str(node) }
  SPATIAL_INDEX = proc { |node| IsoToSolrFormat.spatial_index_str(node) }
  SPATIAL_AREA = proc { |node| IsoToSolrFormat.spatial_area_str(node) }
  MAX_SPATIAL_AREA = proc { |values| IsoToSolrFormat.get_max_spatial_area(values) }

  FACET_SPONSORED_PROGRAM = proc { |node| IsoToSolrFormat.sponsored_program_facet(node) }
  FACET_SPATIAL_COVERAGE = proc { |node| IsoToSolrFormat.get_spatial_facet(node) }
  FACET_SPATIAL_SCOPE = proc { |node| IsoToSolrFormat.get_spatial_scope_facet(node) }
  FACET_TEMPORAL_DURATION = proc { |node| IsoToSolrFormat.get_temporal_duration_facet(node) }

  TEMPORAL_DURATION = proc { |node| IsoToSolrFormat.get_temporal_duration(node) }
  TEMPORAL_INDEX_STRING = proc { |node| IsoToSolrFormat.temporal_index_str(node) }
  TEMPORAL_DISPLAY_STRING = proc { |node| IsoToSolrFormat.temporal_display_str(node) }
  TEMPORAL_DISPLAY_STRING_FORMATTED = proc { |node| IsoToSolrFormat.temporal_display_str(node, true) }

  DATASET_URL = proc { |node| IsoToSolrFormat.dataset_url(node) }
  ICES_DATASET_URL = proc { |node| IsoToSolrFormat.ices_dataset_url(node) }
  EOL_AUTHOR_FORMAT = proc { |node| IsoToSolrFormat.eol_author_format(node) }

  # xpaths searched for the start / end of a temporal extent.
  START_DATE_PATHS = ['.//gml:beginPosition', './/BeginningDateTime', './/gco:Date', './/dif:Start_Date'].freeze
  END_DATE_PATHS = ['.//gml:endPosition', './/EndingDateTime', './/gco:Date', './/dif:Stop_Date'].freeze

  # Axis name used in the element names for each compass direction.
  AXIS_LABELS = {
    north: 'Latitude',
    south: 'Latitude',
    east: 'Longitude',
    west: 'Longitude'
  }.freeze

  # Largest absolute coordinate accepted on each axis.
  COORDINATE_BOUNDARIES = {
    'Latitude' => 90,
    'Longitude' => 180
  }.freeze

  # Direction words that, when they trail a coordinate, flip its sign.
  NEGATIVE_HEMISPHERES = %w(West South).freeze

  # Returns "south west north east" for the box described by +box_node+.
  def self.spatial_display_str(box_node)
    box = bounding_box(box_node)
    "#{box[:south]} #{box[:west]} #{box[:north]} #{box[:east]}"
  end

  # Returns "west south" when the box collapses to a single point, otherwise
  # "west south east north".
  def self.spatial_index_str(box_node)
    box = bounding_box(box_node)
    coordinates = [box[:west], box[:south]]
    is_point = box[:west] == box[:east] && box[:south] == box[:north]
    coordinates += [box[:east], box[:north]] unless is_point
    coordinates.join(' ')
  end

  # Returns the north-south extent of the box as a Float (despite the name,
  # not a String, and only the latitude span is measured).
  def self.spatial_area_str(box_node)
    box = bounding_box(box_node)
    box[:north].to_f - box[:south].to_f
  end

  # Returns the largest of +values+ after converting each with #to_f.
  def self.get_max_spatial_area(values)
    values.map(&:to_f).max
  end

  # Returns nil for an invalid box, 'Global' for a global one, and
  # 'Non Global' otherwise (as judged by BoundingBoxUtil).
  def self.get_spatial_facet(box_node)
    box = bounding_box(box_node)
    return nil if BoundingBoxUtil.box_invalid?(box)

    BoundingBoxUtil.box_global?(box) ? 'Global' : 'Non Global'
  end

  def self.get_spatial_scope_facet(box_node)
    box = bounding_box(box_node)
    SolrFormat.get_spatial_scope_facet_with_bounding_box(box)
  end

  def self.temporal_display_str(temporal_node, formatted = false)
    SolrFormat.temporal_display_str(date_range(temporal_node, formatted))
  end

  # Returns the duration between the node's start and end dates as computed by
  # SolrFormat.get_temporal_duration, or nil when there is no start date. A
  # missing end date is treated as "now".
  #
  # NOTE(review): Time.parse comes from the 'time' standard library, which the
  # require list visible for this file does not load itself -- confirm it is
  # loaded elsewhere.
  def self.get_temporal_duration(temporal_node)
    dr = date_range(temporal_node)
    return nil if dr[:start].to_s.empty?

    end_time = dr[:end].to_s.empty? ? Time.now : Time.parse(dr[:end])
    SolrFormat.get_temporal_duration(Time.parse(dr[:start]), end_time)
  end

  def self.get_temporal_duration_facet(temporal_node)
    duration = get_temporal_duration(temporal_node)
    SolrFormat.get_temporal_duration_facet(duration)
  end

  def self.temporal_index_str(temporal_node)
    dr = date_range(temporal_node)
    SolrFormat.temporal_index_str(dr)
  end

  # Returns "<organisationName> | <organisationShortName>" with both parts
  # stripped of surrounding whitespace.
  def self.sponsored_program_facet(node)
    long_name = node.xpath('.//gmd:organisationName', IsoNamespaces.namespaces(node)).text.strip
    short_name = node.xpath('.//gmd:organisationShortName', IsoNamespaces.namespaces(node)).text.strip

    [long_name, short_name].join(' | ')
  end

  # Returns "Category > Topic > Term" built from the keyword node's children.
  def self.build_keyword_list(keywords)
    category = keywords.xpath('.//CategoryKeyword').text
    topic = keywords.xpath('.//TopicKeyword').text
    term = keywords.xpath('.//TermKeyword').text
    "#{category} > #{topic} > #{term}"
  end

  # Returns { start:, end: } for the temporal node. A value SolrFormat.date?
  # rejects becomes ''; when +formatted+ is true each value is additionally
  # passed through SolrFormat.date_str.
  def self.date_range(temporal_node, formatted = false)
    {
      start: first_valid_date(temporal_node, START_DATE_PATHS, formatted),
      end: first_valid_date(temporal_node, END_DATE_PATHS, formatted)
    }
  end

  # Helper for date_range: extracts one date from +paths+, blanking it when it
  # is not a date and formatting it when requested.
  def self.first_valid_date(temporal_node, paths, formatted)
    date = get_first_matching_child(temporal_node, paths)
    date = '' unless SolrFormat.date?(date)
    date = SolrFormat.date_str(date) if formatted
    date
  end

  # Met.no sometimes has bad metadata, such as <gmd:URL>SU-1 (planned activity)</gmd:URL>
  # Returns the stripped node text when it contains "http://" or "https://",
  # otherwise ''.
  def self.dataset_url(url_node)
    url = url_node.text.strip
    url =~ %r{http[s]?://} ? url : ''
  end

  # Returns the ICES geonetwork URL for the given id. +auth_id+ is appended
  # with String#+, so it must be a String or implicitly convertible to one
  # (#to_str); interpolation is deliberately not used because it would call
  # #to_s instead.
  def self.ices_dataset_url(auth_id)
    'http://geo.ices.dk/geonetwork/srv/en/main.home?uuid=' + auth_id
  end

  # Returns the text of the single node at_xpath finds for the union of
  # +paths+, or '' when nothing matches.
  def self.get_first_matching_child(node, paths)
    match = node.at_xpath(paths.join(' | '), IsoNamespaces.namespaces(node))
    match.nil? ? '' : match.text
  end

  # Returns { west:, south:, east:, north: } with each edge resolved by get_bound.
  def self.bounding_box(box_node)
    {
      west: get_bound(box_node, :west),
      south: get_bound(box_node, :south),
      east: get_bound(box_node, :east),
      north: get_bound(box_node, :north)
    }
  end

  # Returns 'Latitude' or 'Longitude' for a direction symbol (nil if unknown).
  def self.axis_label(direction)
    AXIS_LABELS[direction]
  end

  # Returns the maximum absolute coordinate for 'Latitude' / 'Longitude'.
  def self.coordinate_boundary(lat_lon)
    COORDINATE_BOUNDARIES[lat_lon]
  end

  # Returns the whitespace-separated tokens of the first element found for the
  # direction among the supported element spellings.
  def self.node_values(box_node, direction, lat_lon)
    get_first_matching_child(
      box_node,
      [
        "./gmd:#{direction.to_s.downcase}Bounding#{lat_lon}/gco:Decimal",
        "./gmd:#{direction.to_s.downcase}Bound#{lat_lon}/gco:Decimal",
        "./#{direction.to_s.capitalize}BoundingCoordinate",
        "./dif:#{direction.to_s.capitalize}ernmost_#{lat_lon}"
      ]
    ).split(' ')
  end

  # Returns one edge of the box as a float-formatted string (e.g. "-45.0").
  # Returns '' when the node has no value for the direction or when the value
  # lies outside the axis boundary. A trailing 'West' or 'South' token in the
  # node text negates the value.
  def self.get_bound(box_node, direction)
    lat_lon = axis_label(direction)
    vals = node_values(box_node, direction, lat_lon)
    return '' if vals.empty?

    val = vals.first
    return '' if coordinate_boundary(lat_lon) < val.to_f.abs

    val = -val.to_f if NEGATIVE_HEMISPHERES.include?(vals.last)

    val.to_f.to_s
  end
end
197
+ end
198
+ end
@@ -0,0 +1,13 @@
1
+ module SearchSolrTools
2
+ module Helpers
3
# Class to build a query string based on a hash of params
class QueryBuilder
  # Returns "?" followed by "key=value" pairs joined with "&". Keys and values
  # are interpolated exactly as given; no URL-encoding is applied. An empty
  # collection yields "?".
  def self.build(params)
    pairs = params.map { |name, value| "#{name}=#{value}" }
    '?' + pairs.join('&')
  end
end
12
+ end
13
+ end
@@ -0,0 +1,20 @@
1
+ require 'require_all'
2
+ require_rel '../selectors'
3
+
4
+ module SearchSolrTools
5
+ module Helpers
6
# This hash maps each source key to its selector definition from the
# selectors directory. To add a new source, create a selector file and add
# an entry for it here.
#
# Frozen so the registry cannot be modified in place at runtime (the selector
# definitions it points to are not themselves frozen by this).
SELECTORS = {
  cisl: Selectors::CISL,
  echo: Selectors::ECHO,
  ices: Selectors::ICES,
  nmi: Selectors::NMI,
  nodc: Selectors::NODC,
  pdc: Selectors::PDC,
  rda: Selectors::RDA,
  tdar: Selectors::TDAR,
  usgs: Selectors::USGS
}.freeze
19
+ end
20
+ end