search_solr_tools 3.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +88 -0
  3. data/COPYING +674 -0
  4. data/README.md +203 -0
  5. data/bin/search_solr_tools +87 -0
  6. data/lib/search_solr_tools.rb +8 -0
  7. data/lib/search_solr_tools/config/environments.rb +12 -0
  8. data/lib/search_solr_tools/config/environments.yaml +73 -0
  9. data/lib/search_solr_tools/harvesters/ade_auto_suggest.rb +43 -0
  10. data/lib/search_solr_tools/harvesters/auto_suggest.rb +61 -0
  11. data/lib/search_solr_tools/harvesters/base.rb +183 -0
  12. data/lib/search_solr_tools/harvesters/bcodmo.rb +55 -0
  13. data/lib/search_solr_tools/harvesters/cisl.rb +63 -0
  14. data/lib/search_solr_tools/harvesters/echo.rb +50 -0
  15. data/lib/search_solr_tools/harvesters/eol.rb +53 -0
  16. data/lib/search_solr_tools/harvesters/ices.rb +55 -0
  17. data/lib/search_solr_tools/harvesters/nmi.rb +32 -0
  18. data/lib/search_solr_tools/harvesters/nodc.rb +72 -0
  19. data/lib/search_solr_tools/harvesters/nsidc_auto_suggest.rb +33 -0
  20. data/lib/search_solr_tools/harvesters/nsidc_json.rb +60 -0
  21. data/lib/search_solr_tools/harvesters/oai.rb +59 -0
  22. data/lib/search_solr_tools/harvesters/pdc.rb +38 -0
  23. data/lib/search_solr_tools/harvesters/rda.rb +33 -0
  24. data/lib/search_solr_tools/harvesters/tdar.rb +57 -0
  25. data/lib/search_solr_tools/harvesters/usgs.rb +74 -0
  26. data/lib/search_solr_tools/helpers/bounding_box_util.rb +37 -0
  27. data/lib/search_solr_tools/helpers/csw_iso_query_builder.rb +30 -0
  28. data/lib/search_solr_tools/helpers/facet_configuration.rb +19 -0
  29. data/lib/search_solr_tools/helpers/iso_namespaces.rb +30 -0
  30. data/lib/search_solr_tools/helpers/iso_to_solr.rb +96 -0
  31. data/lib/search_solr_tools/helpers/iso_to_solr_format.rb +198 -0
  32. data/lib/search_solr_tools/helpers/query_builder.rb +13 -0
  33. data/lib/search_solr_tools/helpers/selectors.rb +20 -0
  34. data/lib/search_solr_tools/helpers/solr_format.rb +260 -0
  35. data/lib/search_solr_tools/helpers/tdar_format.rb +70 -0
  36. data/lib/search_solr_tools/helpers/translate_spatial_coverage.rb +77 -0
  37. data/lib/search_solr_tools/helpers/translate_temporal_coverage.rb +40 -0
  38. data/lib/search_solr_tools/helpers/usgs_format.rb +50 -0
  39. data/lib/search_solr_tools/selectors/cisl.rb +112 -0
  40. data/lib/search_solr_tools/selectors/echo_iso.rb +111 -0
  41. data/lib/search_solr_tools/selectors/ices_iso.rb +107 -0
  42. data/lib/search_solr_tools/selectors/nmi.rb +106 -0
  43. data/lib/search_solr_tools/selectors/nodc_iso.rb +107 -0
  44. data/lib/search_solr_tools/selectors/pdc_iso.rb +108 -0
  45. data/lib/search_solr_tools/selectors/rda.rb +106 -0
  46. data/lib/search_solr_tools/selectors/tdar_opensearch.rb +89 -0
  47. data/lib/search_solr_tools/selectors/usgs_iso.rb +105 -0
  48. data/lib/search_solr_tools/translators/bcodmo_json.rb +69 -0
  49. data/lib/search_solr_tools/translators/eol_to_solr.rb +78 -0
  50. data/lib/search_solr_tools/translators/nsidc_json.rb +190 -0
  51. data/lib/search_solr_tools/version.rb +3 -0
  52. data/search_solr_tools.gemspec +45 -0
  53. metadata +345 -0
@@ -0,0 +1,260 @@
1
+ require 'date'
2
+ require 'iso8601'
3
+
4
+ require_relative 'bounding_box_util'
5
+
6
+ module SearchSolrTools
7
+ module Helpers
8
+ # Methods for generating formatted values that can be indexed by SOLR
9
+ # rubocop:disable Metrics/ModuleLength
10
+ module SolrFormat
11
# Short and long display names for each data center, keyed by identifier.
# Frozen so the shared lookup table cannot be modified by accident.
DATA_CENTER_NAMES = {
  NSIDC: { short_name: 'NSIDC', long_name: 'National Snow and Ice Data Center' },
  CISL: { short_name: 'ACADIS Gateway', long_name: 'Advanced Cooperative Arctic Data and Information Service' },
  ECHO: { short_name: 'NASA ECHO', long_name: 'NASA Earth Observing System (EOS) Clearing House (ECHO)' },
  EOL: { short_name: 'UCAR NCAR EOL', long_name: 'UCAR NCAR - Earth Observing Laboratory' },
  ICES: { short_name: 'ICES', long_name: 'International Council for the Exploration of the Sea' },
  NMI: { short_name: 'Met.no', long_name: 'Norwegian Meteorological Institute' },
  NODC: { short_name: 'NOAA NODC', long_name: 'NOAA National Oceanographic Data Center' },
  RDA: { short_name: 'UCAR NCAR RDA', long_name: 'UCAR NCAR Research Data Archive' },
  USGS: { short_name: 'USGS ScienceBase', long_name: 'U.S. Geological Survey ScienceBase' },
  BCODMO: { short_name: 'BCO-DMO', long_name: 'Biological and Chemical Oceanography Data Management Office' },
  TDAR: { short_name: 'tDAR', long_name: 'tDAR: The Digital Archaeological Record' },
  PDC: { short_name: 'PDC', long_name: 'Polar Data Catalogue' }
}.freeze

# Facet value used when a resolution or duration cannot be determined.
NOT_SPECIFIED = 'Not specified'

# Temporal resolution facet labels. Each *_INDEX constant below is the position
# of the matching label in this array.
TEMPORAL_RESOLUTION_FACET_VALUES = %w(Subhourly Hourly Subdaily Daily Weekly Submonthly Monthly Subyearly Yearly Multiyearly).freeze
SUBHOURLY_INDEX = 0
HOURLY_INDEX = 1
SUBDAILY_INDEX = 2
DAILY_INDEX = 3
WEEKLY_INDEX = 4
SUBMONTHLY_INDEX = 5
MONTHLY_INDEX = 6
SUBYEARLY_INDEX = 7
YEARLY_INDEX = 8
MULTIYEARLY_INDEX = 9

# Spatial resolution facet labels. Each SPATIAL_*_INDEX constant below is the
# position of the matching label in this array.
SPATIAL_RESOLUTION_FACET_VALUES = ['0 - 500 m', '501 m - 1 km', '2 - 5 km', '6 - 15 km', '16 - 30 km', '>30 km'].freeze
SPATIAL_0_500_INDEX = 0
SPATIAL_501_1_INDEX = 1
SPATIAL_2_5_INDEX = 2
SPATIAL_6_15_INDEX = 3
SPATIAL_16_30_INDEX = 4
SPATIAL_GREATER_30_INDEX = 5

# Formatter procs that delegate to the helper methods of this module.
REDUCE_TEMPORAL_DURATION = proc { |values| reduce_temporal_duration(values) }
DATE = proc { |date| date_str date.text }

# Returns the node's text unchanged when it already contains '//', otherwise
# prefixes it with 'http://'.
HTTP_URL_FORMAT = proc do |url_node|
  url = url_node.text
  url =~ %r{//} ? url : "http://#{url}"
end
55
+
56
# Builds the display form of a temporal range: the start value, followed by a
# comma and the end value when an end is present.
#
# date_range - hash with :start and (optionally nil) :end entries.
def self.temporal_display_str(date_range)
  pieces = [date_range[:start].to_s]
  pieces << date_range[:end].to_s unless date_range[:end].nil?
  pieces.join(',')
end
61
+
62
# Computes the temporal duration in whole days, counting both endpoints.
#
# start_time - start of the coverage; nil or an empty value means there is no
#              valid start.
# end_time   - end of the coverage; when nil or empty, the current time is used.
#              (Both values are assumed to support subtraction yielding
#              seconds, e.g. Time objects - confirm against callers.)
#
# Returns the number of days as an Integer, or nil when there is no valid
# start time.
def self.get_temporal_duration(start_time, end_time)
  return nil if start_time.to_s.empty?

  end_time = Time.now if end_time.to_s.empty?
  # Datasets that cover just one day have end - start = 0, so add 1 to get the
  # actual number of days. If the start and end are flipped in the metadata a
  # negative duration makes no sense, hence the absolute value.
  Integer((end_time - start_time).abs / 86_400) + 1
end
77
+
78
# Converts a duration in days into its facet value(s).
#
# Returns NOT_SPECIFIED when the duration is nil; otherwise the result of
# temporal_duration_range for the number of whole 365-day years.
def self.get_temporal_duration_facet(duration)
  if duration.nil?
    NOT_SPECIFIED
  else
    temporal_duration_range(duration.to_i / 365)
  end
end
83
+
84
# Date ranges are indexed as spatial coordinates, so each date is converted
# to the form CC.YYMMDD, which can be stored in the standard lat/long space.
# For example: 2013-01-01T00:00:00Z to 2013-01-31T00:00:00Z is converted to
# "20.130101 20.130131".
# The start falls back to MIN_DATE and the end to MAX_DATE when the value is
# not a usable date.
# See http://wiki.apache.org/solr/SpatialForTimeDurations
def self.temporal_index_str(date_range)
  range_start = format_date_for_index(date_range[:start], MIN_DATE)
  range_end = format_date_for_index(date_range[:end], MAX_DATE)
  "#{range_start} #{range_end}"
end
91
+
92
# Returns the largest of the given values after converting each one with
# Integer(); values that cannot be converted are ignored. Returns nil when no
# value converts.
def self.reduce_temporal_duration(values)
  durations = values.map do |value|
    begin
      Integer(value)
    rescue StandardError
      nil
    end
  end
  durations.compact.max
end
95
+
96
# Maps a facet value to its configured bin.
#
# type          - facet type passed to FacetConfiguration.get_facet_bin.
# format_string - the raw facet value to be binned.
#
# Returns the raw value when no mapping matches, nil when the matching bin is
# 'exclude', and the mapped bin otherwise.
def self.facet_binning(type, format_string)
  binned_facet = bin(FacetConfiguration.get_facet_bin(type), format_string)
  return format_string if binned_facet.nil?
  return nil if binned_facet.eql?('exclude')

  binned_facet
end
108
+
109
# Maps a parameter string to its configured 'parameter' bin.
#
# When no mapping exists, variable_level_1 is used instead: the fourth
# '>'-separated segment of the string, stripped. Returns nil when there is no
# mapping and the string has fewer than four segments.
def self.parameter_binning(parameter_string)
  binned_parameter = bin(FacetConfiguration.get_facet_bin('parameter'), parameter_string)
  return binned_parameter unless binned_parameter.nil?

  segments = parameter_string.split '>'
  segments.length >= 4 ? segments[3].strip : nil
end
121
+
122
# Translates a resolution hash into facet label(s).
#
# resolution        - hash with a 'type' of 'single' (uses 'resolution') or
#                     'range' (uses 'min_resolution' and 'max_resolution').
# find_index_method - name of the method, invoked via send, that turns one
#                     resolution value into an index into resolution_values.
# resolution_values - array of facet labels.
#
# Returns NOT_SPECIFIED when resolution_not_specified? is true, one label for
# a single resolution, or the slice of labels between the min and max indexes
# for a range. Raises when the type is neither 'single' nor 'range'.
def self.resolution_value(resolution, find_index_method, resolution_values)
  return NOT_SPECIFIED if resolution_not_specified?(resolution)

  case resolution['type']
  when 'single'
    resolution_values[send(find_index_method, resolution['resolution'])]
  when 'range'
    lowest = send(find_index_method, resolution['min_resolution'])
    highest = send(find_index_method, resolution['max_resolution'])
    resolution_values[lowest..highest]
  else
    fail "Invalid resolution #{resolution['type']}"
  end
end
135
+
136
# Tells whether a resolution hash lacks the values needed to bin it.
#
# A resolution counts as not specified when it is nil/empty, when its 'type'
# is neither 'single' nor 'range', when a 'single' has no 'resolution', or
# when a 'range' is missing either 'min_resolution' or 'max_resolution'
# (resolution_value looks up an index for both ends of a range).
#
# Always returns true or false.
def self.resolution_not_specified?(resolution)
  return true if resolution.to_s.empty?

  case resolution['type']
  when 'single'
    resolution['resolution'].to_s.empty?
  when 'range'
    resolution['min_resolution'].to_s.empty? || resolution['max_resolution'].to_s.empty?
  else
    true
  end
end
142
+
143
# Classifies a bounding box into a spatial scope facet value.
#
# Returns nil when the box is nil or BoundingBoxUtil reports it invalid;
# otherwise the Global, Local or Regional facet string, checked in that order.
def self.get_spatial_scope_facet_with_bounding_box(bbox)
  return nil if bbox.nil? || BoundingBoxUtil.box_invalid?(bbox)
  return 'Coverage from over 85 degrees North to -85 degrees South | Global' if BoundingBoxUtil.box_global?(bbox)
  return 'Less than 1 degree of latitude change | Local' if BoundingBoxUtil.box_local?(bbox)

  'Between 1 and 170 degrees of latitude change | Regional'
end
155
+
156
# Formats a date as a 'YYYY-MM-DDTHH:MM:SSZ' string.
#
# date - a String (stripped and parsed with DateTime.parse) or a date/time
#        object responding to strftime (Date, DateTime, Time).
#
# Returns the formatted string, or nil when date is nil or a String that
# cannot be parsed.
#
# NOTE(review): the wall-clock fields are emitted as-is and suffixed with 'Z';
# a non-UTC offset in the input is dropped, not converted. This matches the
# previous behaviour for DateTime/String input - confirm it is intended.
def self.date_str(date)
  parsed = date
  if date.is_a? String
    parsed = begin
      DateTime.parse(date.strip)
    rescue StandardError
      nil
    end
  end
  # strftime works for Date, DateTime and Time alike, whereas slicing the
  # iso8601 string only works when it ends in a "+HH:MM" offset.
  parsed.strftime('%Y-%m-%dT%H:%M:%SZ') unless parsed.nil?
end
164
+
165
# NOTE(review): a bare `private` only changes the visibility of instance
# methods defined with a plain `def`; the constants and `def self.` methods
# that follow it are not made private by this line.
+ private
166
+
167
# Fallback start date (YYYYMMDD) that temporal_index_str passes to
# format_date_for_index.
+ MIN_DATE = '00010101'
168
# Fallback end date (YYYYMMDD). Evaluated once, when this file is loaded, so
# it holds the load date rather than the date of each call.
+ MAX_DATE = Time.now.strftime('%Y%m%d')
169
+
170
# Looks up the bin for a term.
#
# mappings - collection of hashes with 'pattern' and 'mapping' entries.
# term     - string matched against each pattern in order.
#
# Returns the 'mapping' of the first entry whose pattern matches the term, or
# nil when none matches.
def self.bin(mappings, term)
  hit = mappings.find { |candidate| term.match(candidate['pattern']) }
  hit && hit['mapping']
end
178
+
179
# rubocop:disable CyclomaticComplexity
# Finds the index of the temporal resolution facet for an ISO 8601 duration.
#
# string_duration - duration string handed to ISO8601::Duration.new.
#
# Returns one of the *_INDEX constants for TEMPORAL_RESOLUTION_FACET_VALUES.
# Durations outside every listed range (including negative ones) yield
# MULTIYEARLY_INDEX.
def self.find_index_for_single_temporal_resolution_value(string_duration)
  dur_sec = ISO8601::Duration.new(string_duration).to_seconds

  # Adjacent bins share their boundary value and `case` takes the first
  # match, so whole-second durations land in the same bin as before while
  # fractional durations (e.g. 3599.5 s) no longer slip between two integer
  # ranges and fall through to the multiyearly default.
  case dur_sec
  when 0...3_600 then SUBHOURLY_INDEX
  when 3_600 then HOURLY_INDEX
  when 3_600...86_400 then SUBDAILY_INDEX
  when 86_400..172_800 then DAILY_INDEX
  when 172_800..691_200 then WEEKLY_INDEX
  when 691_200..1_728_000 then SUBMONTHLY_INDEX
  when 1_728_000..2_678_400 then MONTHLY_INDEX
  when 2_678_400...31_536_000 then SUBYEARLY_INDEX
  when 31_536_000 then YEARLY_INDEX
  else
    MULTIYEARLY_INDEX
  end
end
# rubocop:enable CyclomaticComplexity
200
+
201
# Finds the index of the spatial resolution facet for a "<value> <units>"
# string.
#
# Units of 'deg' are handled by spatial_resolution_index_degrees and 'm' by
# spatial_resolution_index_meters; any other units return nil.
def self.find_index_for_single_spatial_resolution_value(string_duration)
  value, units = string_duration.split(' ')

  case units
  when 'deg' then spatial_resolution_index_degrees(value)
  when 'm' then spatial_resolution_index_meters(value)
  end
end
210
+
211
# Maps a resolution in degrees (converted with to_f) to a spatial facet index:
# up to and including 0.05 gives SPATIAL_2_5_INDEX, below 0.5 gives
# SPATIAL_16_30_INDEX, anything else SPATIAL_GREATER_30_INDEX.
def self.spatial_resolution_index_degrees(degrees)
  size = degrees.to_f
  return SPATIAL_2_5_INDEX if size <= 0.05
  return SPATIAL_16_30_INDEX if size < 0.5

  SPATIAL_GREATER_30_INDEX
end
220
+
221
# Maps a resolution in meters (converted with to_f) to a spatial facet index.
#
# The value is placed in the first bin whose upper bound it does not exceed
# (500, 1 000, 5 000, 15 000, 30 000 m). Negative values and values above
# 30 000 m yield SPATIAL_GREATER_30_INDEX.
def self.spatial_resolution_index_meters(meters)
  size = meters.to_f
  return SPATIAL_GREATER_30_INDEX if size < 0

  upper_bounds = [
    [500, SPATIAL_0_500_INDEX],
    [1_000, SPATIAL_501_1_INDEX],
    [5_000, SPATIAL_2_5_INDEX],
    [15_000, SPATIAL_6_15_INDEX],
    [30_000, SPATIAL_16_30_INDEX]
  ]
  bin = upper_bounds.find { |limit, _index| size <= limit }
  bin ? bin.last : SPATIAL_GREATER_30_INDEX
end
232
+
233
# Takes a temporal duration in years and returns the list of facet labels it
# belongs to: '< 1 year' for 0 up to (not including) 1, plus every 'N+ years'
# label whose threshold the duration reaches. Negative durations give an
# empty list.
def self.temporal_duration_range(years)
  labels = []
  labels << '< 1 year' if years >= 0 && years < 1

  [[1, '1+ years'], [5, '5+ years'], [10, '10+ years']].each do |minimum, label|
    labels << label if years >= minimum
  end

  labels
end
245
+
246
# Tells whether the given value is a String that DateTime.parse accepts
# (after stripping surrounding whitespace).
#
# Returns true or false; anything that is not a String is never a date.
def self.date?(date)
  return false unless date.is_a? String

  # A successful parse already guarantees a valid calendar date, so no
  # separate validity check is needed.
  DateTime.parse(date.strip)
  true
rescue StandardError
  false
end
253
+
254
# Formats a date string for the temporal index as 'CC.YYMMDD'
# (century, then two-digit year, month and day).
#
# date_str - the date to format; replaced by default when date? rejects it.
# default  - date string used in that case.
def self.format_date_for_index(date_str, default)
  source = date?(date_str) ? date_str : default
  DateTime.parse(source).strftime('%C.%y%m%d')
end
258
+ end
259
+ end
260
+ end
@@ -0,0 +1,70 @@
1
+ require_relative './iso_namespaces'
2
+ require_relative './iso_to_solr_format'
3
+ require_relative './solr_format'
4
+
5
+ module SearchSolrTools
6
+ module Helpers
7
+ # Special formatter for dealing with temporal metadata issues in the TDAR feed
8
+ class TdarFormat < IsoToSolrFormat
9
# Formatter procs: each takes a node and delegates to the TdarFormat class
# method named in its body. Methods that are not defined in this class are
# presumably inherited from IsoToSolrFormat - verify against that class.
+ SPATIAL_DISPLAY = proc { |node| TdarFormat.spatial_display_str(node) }
10
+ SPATIAL_INDEX = proc { |node| TdarFormat.spatial_index_str(node) }
11
+ FACET_SPATIAL_SCOPE = proc { |node| TdarFormat.get_spatial_scope_facet(node) }
12
+
13
+ TEMPORAL_INDEX_STRING = proc { |node| TdarFormat.temporal_index_str(node) }
14
+ TEMPORAL_DISPLAY_STRING = proc { |node| TdarFormat.temporal_display_str(node) }
15
# Same as TEMPORAL_DISPLAY_STRING, but passes true as the second argument.
+ TEMPORAL_DISPLAY_STRING_FORMATTED = proc { |node| TdarFormat.temporal_display_str(node, true) }
16
+ TEMPORAL_DURATION = proc { |node| TdarFormat.get_temporal_duration(node) }
17
+ FACET_TEMPORAL_DURATION = proc { |node| TdarFormat.get_temporal_duration_facet(node) }
18
+
19
# Returns the spatial scope facet for a node by building its bounding box and
# handing it to SolrFormat.get_spatial_scope_facet_with_bounding_box.
def self.get_spatial_scope_facet(node)
  SolrFormat.get_spatial_scope_facet_with_bounding_box(bounding_box(node))
end
23
+
24
# Converts a temporal node into a { start:, end: } date range, handling the
# two date shapes special-cased for the TDAR feed.
#
# temporal_node - node responding to xpath, at_xpath and text (presumably a
#                 Nokogiri node - confirm against callers).
# formatted     - not used here; forwarded to the parent implementation by
#                 the bare `super` calls.
#
# A text consisting solely of a four-digit year becomes the range covering
# that year; a text consisting solely of a 'YYYY-MM-DDTHH:MM:SSZ' timestamp
# becomes a range starting and ending on it. Everything else, including the
# case where the xpath does not match exactly one node, is delegated to the
# parent implementation.
def self.date_range(temporal_node, formatted = false)
  xpath = '.'
  namespaces = IsoNamespaces.namespaces(temporal_node)

  # `return` is required: without it the parent's result is thrown away and
  # execution continues below.
  return super if temporal_node.xpath(xpath, namespaces).size != 1

  date_str = temporal_node.at_xpath(xpath, namespaces).text

  # \A and \z anchor the whole text; ^ and $ would accept a matching line
  # inside a multi-line value.
  case date_str
  when /\A[0-9]{4}\z/
    year_to_range(date_str)
  when /\A[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z\z/
    single_date_to_range(date_str)
  else
    super
  end
end
42
+
43
# Builds a range that starts and ends on the same given date.
def self.single_date_to_range(date)
  [:start, :end].each_with_object({}) { |boundary, range| range[boundary] = date }
end
49
+
50
# Builds the range from January 1st to December 31st of the given year,
# as 'YYYY-MM-DD' strings.
def self.year_to_range(year)
  first_day = "#{year}-01-01"
  last_day = "#{year}-12-31"
  { start: first_day, end: last_day }
end
56
+
57
# Builds a bounding box hash from the node's text, which is split on spaces
# and read in the order: south, west, north, east.
# Entries for tokens missing from the text are nil.
def self.bounding_box(node)
  south, west, north, east = node.text.split(' ')
  { west: west, south: south, east: east, north: north }
end
68
+ end
69
+ end
70
+ end
@@ -0,0 +1,77 @@
1
+ require 'rgeo/geo_json'
2
+
3
+ require 'search_solr_tools/helpers/bounding_box_util'
4
+ require 'search_solr_tools/helpers/iso_to_solr_format'
5
+
6
+ module SearchSolrTools
7
+ module Helpers
8
+ # Methods to translate list of geoJson objects to solr format values
9
+ module TranslateSpatialCoverage
10
# Returns one display string per geometry (multipoints are first split into
# their points), made of the bounding box's min_y, min_x, max_y and max_x
# separated by single spaces.
def self.geojson_to_spatial_display_str(spatial_coverage_geom)
  convert_multipoint_to_point(spatial_coverage_geom).map do |geometry|
    box = RGeo::Cartesian::BoundingBox.create_from_geometry(geometry)
    "#{box.min_y} #{box.min_x} #{box.max_y} #{box.max_x}"
  end
end
17
+
18
# Returns a new list in which every geometry whose type is 'multipoint'
# (case-insensitive) is replaced by the points it yields from each; all other
# geometries are kept as they are, in order.
def self.convert_multipoint_to_point(spatial_coverage_geom)
  spatial_coverage_geom.each_with_object([]) do |geom, flattened|
    if geom.geometry_type.to_s.downcase.eql?('multipoint')
      geom.each { |point| flattened << point }
    else
      flattened << geom
    end
  end
end
31
+
32
# Returns one index string per geometry (multipoints are first split into
# their points): "x y" for a point, otherwise the bounding box as
# "min_x min_y max_x max_y".
def self.geojson_to_spatial_index_str(spatial_coverage_geom)
  convert_multipoint_to_point(spatial_coverage_geom).map do |geometry|
    next "#{geometry.x} #{geometry.y}" if geometry.geometry_type.to_s.downcase.eql?('point')

    box = RGeo::Cartesian::BoundingBox.create_from_geometry(geometry)
    "#{box.min_x} #{box.min_y} #{box.max_x} #{box.max_y}"
  end
end
43
+
44
# Returns the largest value among the geometries, where a point counts as 0.0
# and any other geometry as the height of its bounding box (max_y - min_y).
# Returns nil for an empty list.
def self.geojson_to_spatial_area(spatial_coverage_geom)
  heights = spatial_coverage_geom.map do |geometry|
    next 0.0 if %w(point).include?(geometry.geometry_type.to_s.downcase)

    box = RGeo::Cartesian::BoundingBox.create_from_geometry(geometry)
    box.max_y - box.min_y
  end
  heights.empty? ? nil : heights.sort.last
end
56
+
57
# Returns 'Show Global Only' when at least one geometry has a bounding box
# that BoundingBoxUtil considers global; nil otherwise (including when the
# list itself is nil).
def self.geojson_to_global_facet(spatial_coverage_geom)
  return nil if spatial_coverage_geom.nil?

  any_global = spatial_coverage_geom.any? do |geo_json|
    BoundingBoxUtil.box_global?(BoundingBoxUtil.bounding_box_hash_from_geo_json(geo_json))
  end
  any_global ? 'Show Global Only' : nil
end
65
+
66
# Returns the distinct spatial scope facets of the given geometries, in order
# of first appearance, or nil when the list is nil.
#
# NOTE(review): a geometry whose scope is nil contributes a nil entry to the
# result (the original per-item nil check did not filter anything) - confirm
# whether nil entries should be dropped.
def self.geojson_to_spatial_scope_facet(spatial_coverage_geom)
  return nil if spatial_coverage_geom.nil?

  scopes = spatial_coverage_geom.map do |geo_json|
    box = BoundingBoxUtil.bounding_box_hash_from_geo_json(geo_json)
    SolrFormat.get_spatial_scope_facet_with_bounding_box(box)
  end
  scopes.uniq
end
75
+ end
76
+ end
77
+ end
@@ -0,0 +1,40 @@
1
+ require 'rgeo/geo_json'
2
+
3
+ require 'search_solr_tools/helpers/solr_format'
4
+
5
+ module SearchSolrTools
6
+ module Helpers
7
+ # Methods to translate temporal coverage object to solr format values
8
+ module TranslateTemporalCoverage
9
# Translates a list of temporal coverages into the temporal Solr fields.
#
# temporal_coverages_json - collection of coverages (nil is treated as
#                           empty); each one is read through time_string with
#                           the 'start' and 'end' keys.
#
# Returns a hash with:
#   'temporal_coverages'      - display string per coverage
#   'temporal_duration'       - largest duration among the coverages
#   'temporal'                - index string per coverage
#   'facet_temporal_duration' - facet for the largest duration
def self.translate_coverages(temporal_coverages_json)
  index_strings = []
  display_strings = []
  durations = []

  temporal_coverages_json.to_a.each do |coverage|
    start_time = time_string(coverage, 'start')
    end_time = time_string(coverage, 'end')

    index_strings << SolrFormat.temporal_index_str(start: start_time.to_s, end: end_time.to_s)
    display_strings << SolrFormat.temporal_display_str(start: format_string(start_time), end: format_string(end_time))
    durations << SolrFormat.get_temporal_duration(start_time, end_time)
  end

  longest = SolrFormat.reduce_temporal_duration(durations)

  {
    'temporal_coverages' => display_strings,
    'temporal_duration' => longest,
    'temporal' => index_strings,
    'facet_temporal_duration' => SolrFormat.get_temporal_duration_facet(longest)
  }
end
30
+
31
# Formats a time value as 'YYYY-MM-DD'; returns nil when the value is nil or
# its string form is empty.
def self.format_string(value)
  return nil if value.to_s.empty?

  value.strftime('%Y-%m-%d')
end
34
+
35
# Parses the coverage entry stored under key into a Time; returns nil when
# the entry is missing or empty.
#
# NOTE(review): Time.parse is provided by the 'time' standard library, which
# this file does not require itself - confirm it is loaded elsewhere.
def self.time_string(coverage, key)
  raw = coverage[key]
  return nil if raw.to_s.empty?

  Time.parse(raw)
end
38
+ end
39
+ end
40
+ end