search_solr_tools 3.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +88 -0
  3. data/COPYING +674 -0
  4. data/README.md +203 -0
  5. data/bin/search_solr_tools +87 -0
  6. data/lib/search_solr_tools.rb +8 -0
  7. data/lib/search_solr_tools/config/environments.rb +12 -0
  8. data/lib/search_solr_tools/config/environments.yaml +73 -0
  9. data/lib/search_solr_tools/harvesters/ade_auto_suggest.rb +43 -0
  10. data/lib/search_solr_tools/harvesters/auto_suggest.rb +61 -0
  11. data/lib/search_solr_tools/harvesters/base.rb +183 -0
  12. data/lib/search_solr_tools/harvesters/bcodmo.rb +55 -0
  13. data/lib/search_solr_tools/harvesters/cisl.rb +63 -0
  14. data/lib/search_solr_tools/harvesters/echo.rb +50 -0
  15. data/lib/search_solr_tools/harvesters/eol.rb +53 -0
  16. data/lib/search_solr_tools/harvesters/ices.rb +55 -0
  17. data/lib/search_solr_tools/harvesters/nmi.rb +32 -0
  18. data/lib/search_solr_tools/harvesters/nodc.rb +72 -0
  19. data/lib/search_solr_tools/harvesters/nsidc_auto_suggest.rb +33 -0
  20. data/lib/search_solr_tools/harvesters/nsidc_json.rb +60 -0
  21. data/lib/search_solr_tools/harvesters/oai.rb +59 -0
  22. data/lib/search_solr_tools/harvesters/pdc.rb +38 -0
  23. data/lib/search_solr_tools/harvesters/rda.rb +33 -0
  24. data/lib/search_solr_tools/harvesters/tdar.rb +57 -0
  25. data/lib/search_solr_tools/harvesters/usgs.rb +74 -0
  26. data/lib/search_solr_tools/helpers/bounding_box_util.rb +37 -0
  27. data/lib/search_solr_tools/helpers/csw_iso_query_builder.rb +30 -0
  28. data/lib/search_solr_tools/helpers/facet_configuration.rb +19 -0
  29. data/lib/search_solr_tools/helpers/iso_namespaces.rb +30 -0
  30. data/lib/search_solr_tools/helpers/iso_to_solr.rb +96 -0
  31. data/lib/search_solr_tools/helpers/iso_to_solr_format.rb +198 -0
  32. data/lib/search_solr_tools/helpers/query_builder.rb +13 -0
  33. data/lib/search_solr_tools/helpers/selectors.rb +20 -0
  34. data/lib/search_solr_tools/helpers/solr_format.rb +260 -0
  35. data/lib/search_solr_tools/helpers/tdar_format.rb +70 -0
  36. data/lib/search_solr_tools/helpers/translate_spatial_coverage.rb +77 -0
  37. data/lib/search_solr_tools/helpers/translate_temporal_coverage.rb +40 -0
  38. data/lib/search_solr_tools/helpers/usgs_format.rb +50 -0
  39. data/lib/search_solr_tools/selectors/cisl.rb +112 -0
  40. data/lib/search_solr_tools/selectors/echo_iso.rb +111 -0
  41. data/lib/search_solr_tools/selectors/ices_iso.rb +107 -0
  42. data/lib/search_solr_tools/selectors/nmi.rb +106 -0
  43. data/lib/search_solr_tools/selectors/nodc_iso.rb +107 -0
  44. data/lib/search_solr_tools/selectors/pdc_iso.rb +108 -0
  45. data/lib/search_solr_tools/selectors/rda.rb +106 -0
  46. data/lib/search_solr_tools/selectors/tdar_opensearch.rb +89 -0
  47. data/lib/search_solr_tools/selectors/usgs_iso.rb +105 -0
  48. data/lib/search_solr_tools/translators/bcodmo_json.rb +69 -0
  49. data/lib/search_solr_tools/translators/eol_to_solr.rb +78 -0
  50. data/lib/search_solr_tools/translators/nsidc_json.rb +190 -0
  51. data/lib/search_solr_tools/version.rb +3 -0
  52. data/search_solr_tools.gemspec +45 -0
  53. metadata +345 -0
@@ -0,0 +1,260 @@
1
+ require 'date'
2
+ require 'iso8601'
3
+
4
+ require_relative 'bounding_box_util'
5
+
6
+ module SearchSolrTools
7
+ module Helpers
8
+ # Methods for generating formatted values that can be indexed by SOLR
9
+ # rubocop:disable Metrics/ModuleLength
10
+ module SolrFormat
11
# Short and long display names for each data center, keyed by data center
# symbol. Frozen so the shared lookup table cannot be modified by accident.
DATA_CENTER_NAMES = {
  NSIDC: { short_name: 'NSIDC', long_name: 'National Snow and Ice Data Center' },
  CISL: { short_name: 'ACADIS Gateway', long_name: 'Advanced Cooperative Arctic Data and Information Service' },
  ECHO: { short_name: 'NASA ECHO', long_name: 'NASA Earth Observing System (EOS) Clearing House (ECHO)' },
  EOL: { short_name: 'UCAR NCAR EOL', long_name: 'UCAR NCAR - Earth Observing Laboratory' },
  ICES: { short_name: 'ICES', long_name: 'International Council for the Exploration of the Sea' },
  NMI: { short_name: 'Met.no', long_name: 'Norwegian Meteorological Institute' },
  NODC: { short_name: 'NOAA NODC', long_name: 'NOAA National Oceanographic Data Center' },
  RDA: { short_name: 'UCAR NCAR RDA', long_name: 'UCAR NCAR Research Data Archive' },
  USGS: { short_name: 'USGS ScienceBase', long_name: 'U.S. Geological Survey ScienceBase' },
  BCODMO: { short_name: 'BCO-DMO', long_name: 'Biological and Chemical Oceanography Data Management Office' },
  TDAR: { short_name: 'tDAR', long_name: 'tDAR: The Digital Archaeological Record' },
  PDC: { short_name: 'PDC', long_name: 'Polar Data Catalogue' }
}.freeze

# Facet value returned when a resolution or duration cannot be determined.
NOT_SPECIFIED = 'Not specified'

# Temporal resolution facet labels; each *_INDEX constant below is the
# position of the corresponding label in this list.
TEMPORAL_RESOLUTION_FACET_VALUES = %w(Subhourly Hourly Subdaily Daily Weekly Submonthly Monthly Subyearly Yearly Multiyearly).freeze
SUBHOURLY_INDEX = 0
HOURLY_INDEX = 1
SUBDAILY_INDEX = 2
DAILY_INDEX = 3
WEEKLY_INDEX = 4
SUBMONTHLY_INDEX = 5
MONTHLY_INDEX = 6
SUBYEARLY_INDEX = 7
YEARLY_INDEX = 8
MULTIYEARLY_INDEX = 9

# Spatial resolution facet labels; each SPATIAL_*_INDEX constant below is the
# position of the corresponding label in this list.
SPATIAL_RESOLUTION_FACET_VALUES = ['0 - 500 m', '501 m - 1 km', '2 - 5 km', '6 - 15 km', '16 - 30 km', '>30 km'].freeze
SPATIAL_0_500_INDEX = 0
SPATIAL_501_1_INDEX = 1
SPATIAL_2_5_INDEX = 2
SPATIAL_6_15_INDEX = 3
SPATIAL_16_30_INDEX = 4
SPATIAL_GREATER_30_INDEX = 5

# Reducer proc: collapses a list of duration values via reduce_temporal_duration.
REDUCE_TEMPORAL_DURATION = proc { |values| reduce_temporal_duration(values) }
# Formatter proc: formats the text of a node (anything responding to #text) via date_str.
DATE = proc { |date| date_str date.text }

# Formatter proc: returns the node's text unchanged when it contains "//",
# otherwise prefixes it with "http://".
HTTP_URL_FORMAT = proc do |url_node|
  url = url_node.text
  url =~ %r{//} ? url : "http://#{url}"
end
55
+
56
# Builds the display string for a date range hash with :start and :end keys:
# "start,end", or just "start" when :end is nil.
def self.temporal_display_str(date_range)
  pieces = [date_range[:start].to_s]
  pieces << date_range[:end].to_s unless date_range[:end].nil?
  pieces.join(',')
end
61
+
62
+ # returns the temporal duration in days; returns nil if there is not a valid
63
+ # start date
64
# Returns the inclusive number of days covered by start_time..end_time, or nil
# when start_time is blank. A blank end_time is treated as "now".
def self.get_temporal_duration(start_time, end_time)
  return nil if start_time.to_s.empty?

  finish = end_time.to_s.empty? ? Time.now : end_time
  # The absolute value tolerates metadata whose start and end are flipped;
  # the +1 makes a single-day dataset count as one day rather than zero.
  elapsed_seconds = (finish - start_time).abs
  Integer(elapsed_seconds / 86_400) + 1
end
77
+
78
# Maps a duration in days to its facet value(s): NOT_SPECIFIED for nil,
# otherwise the temporal_duration_range of the whole years (duration / 365).
def self.get_temporal_duration_facet(duration)
  if duration.nil?
    NOT_SPECIFIED
  else
    temporal_duration_range(duration.to_i / 365)
  end
end
83
+
84
+ # We are indexing date ranges as spatial coordinates.
85
+ # This means we have to convert dates into the format YY.YYMMDD which can be stored in the standard lat/long space
86
+ # For example: 2013-01-01T00:00:00Z to 2013-01-31T00:00:00Z will be converted to 20.130101, 20.130131.
87
+ # See http://wiki.apache.org/solr/SpatialForTimeDurations
88
# Returns "<start> <end>" where each side is the index form produced by
# format_date_for_index; MIN_DATE / MAX_DATE are the fallbacks for the start
# and end respectively.
def self.temporal_index_str(date_range)
  start_index = format_date_for_index(date_range[:start], MIN_DATE)
  end_index = format_date_for_index(date_range[:end], MAX_DATE)
  [start_index, end_index].join(' ')
end
91
+
92
# Returns the largest value that Integer() can convert, or nil when no value
# converts. Unconvertible entries (nil, non-numeric strings, ...) are skipped.
def self.reduce_temporal_duration(values)
  integers = values.each_with_object([]) do |value, kept|
    begin
      kept << Integer(value)
    rescue StandardError
      next
    end
  end
  integers.max
end
95
+
96
# Bins format_string using the facet bin configuration for the given type.
#
# Returns format_string itself when no mapping matches, nil when the matching
# mapping is the literal 'exclude', and the mapped value otherwise.
def self.facet_binning(type, format_string)
  binned_facet = bin(FacetConfiguration.get_facet_bin(type), format_string)
  return format_string if binned_facet.nil?
  return nil if binned_facet.eql?('exclude')

  binned_facet
end
108
+
109
# Bins a parameter string using the 'parameter' facet bin configuration.
# When no mapping matches, falls back to the fourth '>'-separated segment
# (variable_level_1), or nil if the string has fewer than four segments.
def self.parameter_binning(parameter_string)
  mapped = bin(FacetConfiguration.get_facet_bin('parameter'), parameter_string)
  return mapped unless mapped.nil?

  levels = parameter_string.split('>')
  levels.length >= 4 ? levels[3].strip : nil
end
121
+
122
# Translates a resolution hash into facet value(s) taken from resolution_values.
#
# find_index_method names the method (invoked via send) that converts one raw
# resolution value into an index. A 'single' resolution yields one value, a
# 'range' yields the slice between the min and max indexes, and an
# unspecified resolution yields NOT_SPECIFIED.
def self.resolution_value(resolution, find_index_method, resolution_values)
  return NOT_SPECIFIED if resolution_not_specified?(resolution)

  case resolution['type']
  when 'single'
    resolution_values[send(find_index_method, resolution['resolution'])]
  when 'range'
    lower = send(find_index_method, resolution['min_resolution'])
    upper = send(find_index_method, resolution['max_resolution'])
    resolution_values[lower..upper]
  else
    fail "Invalid resolution #{resolution['type']}"
  end
end
135
+
136
# Returns true when the resolution carries no usable value: it is blank, its
# 'type' is neither 'single' nor 'range', a 'single' lacks 'resolution', or a
# 'range' lacks either bound. Returns false otherwise.
#
# A 'range' needs both bounds because resolution_value looks up an index for
# 'min_resolution' and 'max_resolution' alike.
def self.resolution_not_specified?(resolution)
  return true if resolution.to_s.empty?

  case resolution['type']
  when 'single'
    resolution['resolution'].to_s.empty?
  when 'range'
    resolution['min_resolution'].to_s.empty? || resolution['max_resolution'].to_s.empty?
  else
    true
  end
end
142
+
143
# Classifies a bounding box as Global, Local or Regional using the
# BoundingBoxUtil predicates. Returns nil for a nil or invalid box.
def self.get_spatial_scope_facet_with_bounding_box(bbox)
  return nil if bbox.nil? || BoundingBoxUtil.box_invalid?(bbox)
  return 'Coverage from over 85 degrees North to -85 degrees South | Global' if BoundingBoxUtil.box_global?(bbox)
  return 'Less than 1 degree of latitude change | Local' if BoundingBoxUtil.box_local?(bbox)

  'Between 1 and 170 degrees of latitude change | Regional'
end
155
+
156
# Formats a date as "YYYY-MM-DDTHH:MM:SSZ".
#
# Strings are stripped and parsed with DateTime.parse; an unparsable string
# yields nil. Any other non-nil value is formatted directly and must respond
# to #strftime. nil yields nil.
#
# The wall-clock fields are emitted as-is and any UTC offset is dropped, as
# before. Formatting with strftime instead of slicing the iso8601 string
# keeps the output well-formed for values whose iso8601 form does not end in
# a six-character offset (Date, or a "Z"-suffixed Time).
def self.date_str(date)
  parsed = if date.is_a?(String)
             begin
               DateTime.parse(date.strip)
             rescue StandardError
               nil
             end
           else
             date
           end
  parsed.strftime('%Y-%m-%dT%H:%M:%SZ') unless parsed.nil?
end
164
+
165
# NOTE(review): a bare `private` does not change the visibility of the
# constants or the `def self.` methods that follow it; they remain publicly
# reachable. Kept as-is so that existing callers are unaffected.
private

# Fallback start date (YYYYMMDD) passed to format_date_for_index.
MIN_DATE = '00010101'.freeze
# Fallback end date (YYYYMMDD). Evaluated once, when this file is loaded.
MAX_DATE = Time.now.strftime('%Y%m%d').freeze
169
+
170
# Returns the 'mapping' of the first entry in mappings whose 'pattern'
# matches term, or nil when nothing matches.
def self.bin(mappings, term)
  hit = mappings.find { |mapping| term.match(mapping['pattern']) }
  hit.nil? ? nil : hit['mapping']
end
178
+
179
+ # rubocop:disable CyclomaticComplexity
180
# Returns the index into TEMPORAL_RESOLUTION_FACET_VALUES for an ISO 8601
# duration string, based on its length in seconds.
#
# The buckets are contiguous and the first matching branch wins, so a shared
# boundary belongs to the earlier bucket (e.g. 172_800 is Daily). Contiguous
# ranges also keep fractional durations such as 3599.5 seconds in the correct
# bucket instead of falling through to the Multiyearly default. Negative
# durations still fall through to Multiyearly, as before.
def self.find_index_for_single_temporal_resolution_value(string_duration)
  dur_sec = ISO8601::Duration.new(string_duration).to_seconds

  case dur_sec
  when 0...3_600 then SUBHOURLY_INDEX
  when 3_600 then HOURLY_INDEX
  when 3_600...86_400 then SUBDAILY_INDEX
  when 86_400..172_800 then DAILY_INDEX
  when 172_800..691_200 then WEEKLY_INDEX
  when 691_200..1_728_000 then SUBMONTHLY_INDEX
  when 1_728_000..2_678_400 then MONTHLY_INDEX
  when 2_678_400...31_536_000 then SUBYEARLY_INDEX
  when 31_536_000 then YEARLY_INDEX
  else
    MULTIYEARLY_INDEX
  end
end
199
+ # rubocop:enable CyclomaticComplexity
200
+
201
# Splits a "<value> <units>" string and dispatches on the units: 'deg' goes to
# spatial_resolution_index_degrees, 'm' to spatial_resolution_index_meters.
# Any other (or missing) unit yields nil.
def self.find_index_for_single_spatial_resolution_value(string_duration)
  value, units = string_duration.split(' ')

  case units
  when 'deg' then spatial_resolution_index_degrees(value)
  when 'm' then spatial_resolution_index_meters(value)
  end
end
210
+
211
# Maps a resolution in degrees (converted with to_f) to a spatial facet index:
# <= 0.05 -> SPATIAL_2_5_INDEX, < 0.5 -> SPATIAL_16_30_INDEX, otherwise
# SPATIAL_GREATER_30_INDEX.
def self.spatial_resolution_index_degrees(degrees)
  size = degrees.to_f
  return SPATIAL_2_5_INDEX if size <= 0.05
  return SPATIAL_16_30_INDEX if size < 0.5

  SPATIAL_GREATER_30_INDEX
end
220
+
221
# Maps a resolution in meters (converted with to_f) to a spatial facet index.
# Each bucket includes its upper bound; negative values and anything above
# 30_000 map to SPATIAL_GREATER_30_INDEX.
def self.spatial_resolution_index_meters(meters)
  size = meters.to_f
  return SPATIAL_GREATER_30_INDEX if size < 0

  upper_bounds = [
    [500, SPATIAL_0_500_INDEX],
    [1_000, SPATIAL_501_1_INDEX],
    [5_000, SPATIAL_2_5_INDEX],
    [15_000, SPATIAL_6_15_INDEX],
    [30_000, SPATIAL_16_30_INDEX]
  ]
  bucket = upper_bounds.find { |limit, _index| size <= limit }
  bucket ? bucket.last : SPATIAL_GREATER_30_INDEX
end
232
+
233
+ # takes a temporal_duration in years, returns a string representing the range
234
+ # for faceting
235
# Returns the facet labels that apply to a duration in years. The "N+ years"
# labels are cumulative (10 years yields all three); durations in [0, 1)
# yield only '< 1 year'; negative durations yield an empty list.
def self.temporal_duration_range(years)
  minimum_years = { 1 => '1+ years', 5 => '5+ years', 10 => '10+ years' }
  labels = minimum_years.select { |minimum, _label| years >= minimum }.values
  labels.unshift('< 1 year') if years >= 0 && years < 1
  labels
end
245
+
246
# Returns true when date is a String that DateTime.parse accepts (after
# stripping). Returns nil for non-strings and for unparsable strings.
def self.date?(date)
  return nil unless date.is_a?(String)

  begin
    parsed = DateTime.parse(date.strip)
  rescue StandardError
    return nil
  end
  DateTime.valid_date?(parsed.year, parsed.mon, parsed.day)
end
253
+
254
# Formats date_str as CC.YYMMDD (strftime '%C.%y%m%d') for the temporal
# index. default is formatted instead when date? rejects date_str.
def self.format_date_for_index(date_str, default)
  effective = date?(date_str) ? date_str : default
  DateTime.parse(effective).strftime('%C.%y%m%d')
end
258
+ end
259
+ end
260
+ end
@@ -0,0 +1,70 @@
1
+ require_relative './iso_namespaces'
2
+ require_relative './iso_to_solr_format'
3
+ require_relative './solr_format'
4
+
5
+ module SearchSolrTools
6
+ module Helpers
7
+ # Special formatter for dealing with temporal metadata issues in the TDAR feed
8
+ class TdarFormat < IsoToSolrFormat
9
# Formatter procs that forward a node to the matching TdarFormat class method.
# NOTE(review): methods not defined in this class (spatial_display_str,
# spatial_index_str and the temporal ones) are presumably inherited from
# IsoToSolrFormat - confirm against that class.

# Spatial formatters
SPATIAL_DISPLAY = proc { |spatial_node| TdarFormat.spatial_display_str(spatial_node) }
SPATIAL_INDEX = proc { |spatial_node| TdarFormat.spatial_index_str(spatial_node) }
FACET_SPATIAL_SCOPE = proc { |spatial_node| TdarFormat.get_spatial_scope_facet(spatial_node) }

# Temporal formatters
TEMPORAL_INDEX_STRING = proc { |temporal_node| TdarFormat.temporal_index_str(temporal_node) }
TEMPORAL_DISPLAY_STRING = proc { |temporal_node| TdarFormat.temporal_display_str(temporal_node) }
TEMPORAL_DISPLAY_STRING_FORMATTED = proc { |temporal_node| TdarFormat.temporal_display_str(temporal_node, true) }
TEMPORAL_DURATION = proc { |temporal_node| TdarFormat.get_temporal_duration(temporal_node) }
FACET_TEMPORAL_DURATION = proc { |temporal_node| TdarFormat.get_temporal_duration_facet(temporal_node) }
18
+
19
# Returns the spatial scope facet for the bounding box derived from node.
def self.get_spatial_scope_facet(node)
  SolrFormat.get_spatial_scope_facet_with_bounding_box(bounding_box(node))
end
23
+
24
# Returns a { start:, end: } range for a tDAR temporal node.
#
# A node whose text is a bare four-digit year becomes the whole-year range,
# and a full "YYYY-MM-DDTHH:MM:SSZ" timestamp becomes a single-instant range.
# Everything else is delegated to the parent implementation, including the
# case where the '.' XPath does not resolve to exactly one node.
#
# `formatted` is not used here; it is forwarded to the parent through the
# implicit-argument `super`.
def self.date_range(temporal_node, formatted = false)
  xpath = '.'
  namespaces = IsoNamespaces.namespaces(temporal_node)

  # Return the parent's result directly. Previously the result was discarded
  # and execution fell through to the case statement below. Checking the
  # count first also avoids calling #text on a missing node.
  return super if temporal_node.xpath(xpath, namespaces).size != 1

  date_str = temporal_node.at_xpath(xpath, namespaces).text

  case date_str
  when /^[0-9]{4}$/
    year_to_range(date_str)
  when /^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z$/
    single_date_to_range(date_str)
  else
    super
  end
end
42
+
43
# Wraps a single date in a range hash whose :start and :end are both that date.
def self.single_date_to_range(date)
  range = {}
  range[:start] = date
  range[:end] = date
  range
end
49
+
50
# Expands a year into the range from its January 1st to its December 31st,
# as "YYYY-MM-DD" strings.
def self.year_to_range(year)
  first_day, last_day = %w(01-01 12-31).map { |month_day| "#{year}-#{month_day}" }
  { start: first_day, end: last_day }
end
56
+
57
+ # Bounding box is defined by two coordinates to create a point.
58
+ # Create a bounding box from this point.
59
# Splits the node text on spaces and maps the first four tokens, in order, to
# south, west, north and east. Missing tokens become nil.
def self.bounding_box(node)
  south, west, north, east = node.text.split(' ')
  { west: west, south: south, east: east, north: north }
end
68
+ end
69
+ end
70
+ end
@@ -0,0 +1,77 @@
1
+ require 'rgeo/geo_json'
2
+
3
+ require 'search_solr_tools/helpers/bounding_box_util'
4
+ require 'search_solr_tools/helpers/iso_to_solr_format'
5
+
6
+ module SearchSolrTools
7
+ module Helpers
8
+ # Methods to translate list of geoJson objects to solr format values
9
+ module TranslateSpatialCoverage
10
# Returns one "min_y min_x max_y max_x" display string per geometry, after
# multipoints have been expanded into their individual points.
def self.geojson_to_spatial_display_str(spatial_coverage_geom)
  convert_multipoint_to_point(spatial_coverage_geom).map do |geometry|
    box = RGeo::Cartesian::BoundingBox.create_from_geometry(geometry)
    [box.min_y, box.min_x, box.max_y, box.max_x].join(' ')
  end
end
17
+
18
# Returns a flat list of geometries in which every multipoint geometry is
# replaced by the points it yields from #each; other geometries are kept
# as they are, in their original order.
def self.convert_multipoint_to_point(spatial_coverage_geom)
  spatial_coverage_geom.each_with_object([]) do |geometry, flattened|
    if geometry.geometry_type.to_s.downcase.eql?('multipoint')
      geometry.each { |point| flattened << point }
    else
      flattened << geometry
    end
  end
end
31
+
32
# Returns one index string per geometry, after multipoints have been expanded:
# "x y" for a point, "min_x min_y max_x max_y" of the bounding box otherwise.
def self.geojson_to_spatial_index_str(spatial_coverage_geom)
  convert_multipoint_to_point(spatial_coverage_geom).map do |geometry|
    next "#{geometry.x} #{geometry.y}" if geometry.geometry_type.to_s.downcase.eql?('point')

    box = RGeo::Cartesian::BoundingBox.create_from_geometry(geometry)
    [box.min_x, box.min_y, box.max_x, box.max_y].join(' ')
  end
end
43
+
44
# Returns the largest value among the geometries, or nil for an empty list.
# A point counts as 0.0; any other geometry counts as the height of its
# bounding box (max_y - min_y).
def self.geojson_to_spatial_area(spatial_coverage_geom)
  extents = spatial_coverage_geom.map do |geometry|
    next 0.0 if geometry.geometry_type.to_s.downcase == 'point'

    box = RGeo::Cartesian::BoundingBox.create_from_geometry(geometry)
    box.max_y - box.min_y
  end
  extents.empty? ? nil : extents.sort.last
end
56
+
57
# Returns 'Show Global Only' when at least one geometry has a bounding box
# that BoundingBoxUtil considers global; nil otherwise (including nil input).
def self.geojson_to_global_facet(spatial_coverage_geom)
  return nil if spatial_coverage_geom.nil?

  has_global = spatial_coverage_geom.any? do |geometry|
    BoundingBoxUtil.box_global?(BoundingBoxUtil.bounding_box_hash_from_geo_json(geometry))
  end
  has_global ? 'Show Global Only' : nil
end
65
+
66
# Returns the distinct spatial scope facets for a list of geometries, or nil
# for nil input.
#
# Geometries that yield no scope (SolrFormat returns nil for them) are dropped
# from the result. The earlier `scope unless scope.nil?` guard had no effect,
# so those nil entries used to be left in the returned list.
def self.geojson_to_spatial_scope_facet(spatial_coverage_geom)
  return nil if spatial_coverage_geom.nil?

  scopes = spatial_coverage_geom.map do |geo_json|
    bbox_hash = BoundingBoxUtil.bounding_box_hash_from_geo_json(geo_json)
    SolrFormat.get_spatial_scope_facet_with_bounding_box(bbox_hash)
  end
  scopes.compact.uniq
end
75
+ end
76
+ end
77
+ end
@@ -0,0 +1,40 @@
1
+ require 'rgeo/geo_json'
2
+
3
+ require 'search_solr_tools/helpers/solr_format'
4
+
5
+ module SearchSolrTools
6
+ module Helpers
7
+ # Methods to translate temporal coverage object to solr format values
8
+ module TranslateTemporalCoverage
9
# Translates a list of temporal coverage hashes (with 'start' / 'end' keys)
# into the Solr temporal fields.
#
# Returns a hash with 'temporal_coverages' (display strings), 'temporal'
# (index strings), 'temporal_duration' (the reduced duration across all
# coverages) and 'facet_temporal_duration' (the facet for that duration).
# nil input is treated as an empty list.
def self.translate_coverages(temporal_coverages_json)
  index_strings = []
  display_strings = []
  durations = []

  temporal_coverages_json.to_a.each do |coverage|
    start_time = time_string(coverage, 'start')
    end_time = time_string(coverage, 'end')

    index_strings << SolrFormat.temporal_index_str(start: start_time.to_s, end: end_time.to_s)
    display_strings << SolrFormat.temporal_display_str(start: format_string(start_time), end: format_string(end_time))
    durations << SolrFormat.get_temporal_duration(start_time, end_time)
  end

  longest = SolrFormat.reduce_temporal_duration(durations)
  duration_facet = SolrFormat.get_temporal_duration_facet(longest)

  {
    'temporal_coverages' => display_strings,
    'temporal_duration' => longest,
    'temporal' => index_strings,
    'facet_temporal_duration' => duration_facet
  }
end
30
+
31
# Formats a time value as "YYYY-MM-DD"; returns nil when the value is blank.
def self.format_string(value)
  return nil if value.to_s.empty?

  value.strftime('%Y-%m-%d')
end
34
+
35
# Parses coverage[key] into a Time; returns nil when the value is blank.
# NOTE(review): Time.parse is provided by the 'time' standard library, which
# this file does not require itself - presumably it is loaded by one of the
# required files; confirm.
def self.time_string(coverage, key)
  raw = coverage[key]
  return nil if raw.to_s.empty?

  Time.parse(raw)
end
38
+ end
39
+ end
40
+ end