geo_combine 0.2.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,5 +2,7 @@ module GeoCombine
   module Exceptions
     class InvalidDCTReferences < StandardError
     end
+    class InvalidGeometry < StandardError
+    end
   end
 end
@@ -15,7 +15,7 @@ module GeoCombine
     # @param [String] text
     # @return [String]
     def remove_lines(text)
-      text.gsub(/\n/, '')
+      text.delete("\n")
     end

     ##
@@ -25,5 +25,10 @@ module GeoCombine
     def sanitize_and_remove_lines(text)
       remove_lines(sanitize(text))
     end
+
+    # slugs should be lowercase and only have a-z, A-Z, 0-9, and -
+    def sluggify(slug)
+      slug.gsub(/[^a-zA-Z0-9\-]/, '-').gsub(/[\-]+/, '-').downcase
+    end
   end
 end
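
The new sluggify helper, added to the GeoCombine::Formatting module, replaces every character outside a-z, A-Z, 0-9, and '-' with a hyphen, collapses runs of hyphens, and lowercases the result. A minimal usage sketch (the host class and input strings are invented for illustration):

    require 'geo_combine'

    # Hypothetical class that mixes in the formatting helpers
    class Slugger
      include GeoCombine::Formatting
    end

    helper = Slugger.new
    helper.sluggify('Stanford University: Census Data 2010')
    # => "stanford-university-census-data-2010"
    helper.remove_lines("multi\nline\ntext")
    # => "multilinetext"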
@@ -0,0 +1,204 @@
+# frozen_string_literal: true
+
+module GeoCombine
+  ##
+  # A class to harvest and index results from GeoBlacklight sites
+  # You can configure the sites to be harvested via a configure command.
+  # GeoCombine::GeoBlacklightHarvester.configure do
+  #   {
+  #     SITE: { host: 'https://example.com', params: { f: { dct_provenance_s: ['SITE'] } } }
+  #   }
+  # end
+  # The class configuration also allows for various other things to be configured:
+  #  - A debug parameter to print out details of what is being harvested and indexed
+  #  - crawl delays for each page of results (globally or on a per site basis)
+  #  - Solr's commitWithin parameter (defaults to 5000)
+  #  - A document transformer proc to modify a document before indexing (defaults to removing _version_, score, and timestamp)
+  # Example: GeoCombine::GeoBlacklightHarvester.new('SITE').index
+  class GeoBlacklightHarvester
+    require 'active_support/core_ext/object/to_query'
+
+    class << self
+      attr_writer :document_transformer
+
+      def configure(&block)
+        @config = yield block
+      end
+
+      def config
+        @config || {}
+      end
+
+      def document_transformer
+        @document_transformer || ->(document) do
+          document.delete('_version_')
+          document.delete('score')
+          document.delete('timestamp')
+          document
+        end
+      end
+    end
+
+
+    attr_reader :site, :site_key
+    def initialize(site_key)
+      @site_key = site_key
+      @site = self.class.config[site_key]
+
+      raise ArgumentError, "Site key #{@site_key.inspect} is not configured for #{self.class.name}" unless @site
+    end
+
+    def index
+      puts "Fetching page 1 @ #{base_url}&page=1" if self.class.config[:debug]
+      response = JSON.parse(Net::HTTP.get(URI("#{base_url}&page=1")))
+      response_class = BlacklightResponseVersionFactory.call(response)
+
+      response_class.new(response: response, base_url: base_url).documents.each do |docs|
+        docs.map! do |document|
+          self.class.document_transformer.call(document) if self.class.document_transformer
+        end.compact
+
+        puts "Adding #{docs.count} documents to solr" if self.class.config[:debug]
+        solr_connection.update params: { commitWithin: commit_within, overwrite: true },
+                               data: docs.to_json,
+                               headers: { 'Content-Type' => 'application/json' }
+
+        sleep(crawl_delay.to_i) if crawl_delay
+      end
+    end
+
+    ##
+    # A "factory" class to determine the blacklight response version to use
+    class BlacklightResponseVersionFactory
+      def self.call(json)
+        keys = json.keys
+        if keys.include?('response')
+          LegacyBlacklightResponse
+        elsif keys.any? && %w[links data].all? { |param| keys.include?(param) }
+          ModernBlacklightResponse
+        else
+          raise NotImplementedError, "The following json response was not able to be parsed by the GeoBlacklightHarvester\n#{json}"
+        end
+      end
+    end
+
+    class LegacyBlacklightResponse
+      attr_reader :base_url
+      attr_accessor :response, :page
+      def initialize(response:, base_url:)
+        @base_url = base_url
+        @response = response
+        @page = 1
+      end
+
+      def documents
+        return enum_for(:documents) unless block_given?
+
+        while current_page && total_pages && (current_page <= total_pages) do
+          yield response.dig('response', 'docs')
+
+          break if current_page == total_pages
+          self.page += 1
+          puts "Fetching page #{page} @ #{url}" if GeoCombine::GeoBlacklightHarvester.config[:debug]
+
+          begin
+            self.response = JSON.parse(Net::HTTP.get(URI(url)))
+          rescue => e
+            puts "Request for #{url} failed with #{e}"
+            self.response = nil
+          end
+        end
+      end
+
+      private
+
+      def url
+        "#{base_url}&page=#{page}"
+      end
+
+      def current_page
+        response.dig('response', 'pages', 'current_page')
+      end
+
+      def total_pages
+        response.dig('response', 'pages', 'total_pages')
+      end
+    end
+
+    ##
+    # Class to return documents from the Blacklight API (v7 and above)
+    class ModernBlacklightResponse
+      attr_reader :base_url
+      attr_accessor :response, :page
+      def initialize(response:, base_url:)
+        @base_url = base_url
+        @response = response
+        @page = 1
+      end
+
+      def documents
+        return enum_for(:documents) unless block_given?
+
+        while response && response['data'].any?
+          document_urls = response['data'].collect { |data| data.dig('links', 'self') }.compact
+
+          yield documents_from_urls(document_urls)
+
+          url = response.dig('links', 'next')
+          break unless url
+          url = "#{url}&format=json"
+          self.page += 1
+          puts "Fetching page #{page} @ #{url}" if GeoCombine::GeoBlacklightHarvester.config[:debug]
+          begin
+            self.response = JSON.parse(Net::HTTP.get(URI(url)))
+          rescue => e
+            puts "Request for #{url} failed with #{e}"
+            self.response = nil
+          end
+        end
+      end
+
+      private
+
+      def documents_from_urls(urls)
+        puts "Fetching #{urls.count} documents for page #{page}" if GeoCombine::GeoBlacklightHarvester.config[:debug]
+        urls.map do |url|
+          begin
+            JSON.parse(Net::HTTP.get(URI("#{url}/raw")))
+          rescue => e
+            puts "Fetching \"#{url}/raw\" failed with #{e}"
+
+            nil
+          end
+        end.compact
+      end
+    end
+
+    private
+
+    def base_url
+      "#{site[:host]}?#{default_params.to_query}"
+    end
+
+    def solr_connection
+      solr_url = ENV['SOLR_URL'] || 'http://127.0.0.1:8983/solr/blacklight-core'
+
+      RSolr.connect url: solr_url, adapter: :net_http_persistent
+    end
+
+    def commit_within
+      self.class.config[:commit_within] || '5000'
+    end
+
+    def crawl_delay
+      site[:crawl_delay] || self.class.config[:crawl_delay]
+    end
+
+    def default_params
+      {
+        per_page: 100,
+        format: :json
+      }.merge(site[:params])
+    end
+  end
+end
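
Put together, the new harvester can be driven from a short script like the sketch below. The site key, host, crawl delay, and Solr URL are invented for illustration; only the configure/index API itself comes from the code above, and the rsolr gem must be available for indexing.

    require 'geo_combine/geo_blacklight_harvester'

    GeoCombine::GeoBlacklightHarvester.configure do
      {
        debug: true,            # print each page and batch as it is harvested
        crawl_delay: 1,         # seconds to sleep between result pages
        commit_within: '10000',
        EXAMPLE: {
          host: 'https://geoblacklight.example.edu',
          params: { f: { dct_provenance_s: ['Example'] } }
        }
      }
    end

    # Pages through the configured site's JSON API and posts each batch of
    # documents to the Solr instance named by ENV['SOLR_URL'].
    GeoCombine::GeoBlacklightHarvester.new(:EXAMPLE).index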
@@ -1,3 +1,5 @@
+require 'active_support/core_ext/object/blank'
+require 'active_support/core_ext/hash/except'
 require 'open-uri'

 module GeoCombine
@@ -9,6 +11,16 @@ module GeoCombine
     attr_reader :metadata

     GEOBLACKLIGHT_VERSION = 'v1.1.0'
+    SCHEMA_JSON_URL = "https://raw.githubusercontent.com/geoblacklight/geoblacklight/#{GEOBLACKLIGHT_VERSION}/schema/geoblacklight-schema.json".freeze
+    DEPRECATED_KEYS_V1 = %w[
+      uuid
+      georss_polygon_s
+      georss_point_s
+      georss_box_s
+      dc_relation_sm
+      solr_issued_i
+      solr_bbox
+    ].freeze

     ##
     # Initializes a GeoBlacklight object
@@ -24,7 +36,9 @@ module GeoCombine
     # Calls metadata enhancement methods for each key, value pair in the
     # metadata hash
     def enhance_metadata
-      @metadata.each do |key, value|
+      upgrade_to_v1 if metadata['geoblacklight_version'].blank?
+
+      metadata.each do |key, value|
         translate_formats(key, value)
         enhance_subjects(key, value)
         format_proper_date(key, value)
@@ -36,23 +50,25 @@ module GeoCombine
     ##
     # Returns a string of JSON from a GeoBlacklight hash
     # @return (String)
-    def to_json
-      @metadata.to_json
+    def to_json(options = {})
+      metadata.to_json(options)
     end

     ##
     # Validates a GeoBlacklight-Schema json document
     # @return [Boolean]
     def valid?
-      @schema ||= JSON.parse(open("https://raw.githubusercontent.com/geoblacklight/geoblacklight/#{GEOBLACKLIGHT_VERSION}/schema/geoblacklight-schema.json").read)
-      JSON::Validator.validate!(@schema, to_json, fragment: '#/properties/layer') && dct_references_validate!
+      @schema ||= JSON.parse(open(SCHEMA_JSON_URL).read)
+      JSON::Validator.validate!(@schema, to_json, fragment: '#/properties/layer') &&
+        dct_references_validate! &&
+        spatial_validate!
     end

     ##
     # Validate dct_references_s
     # @return [Boolean]
     def dct_references_validate!
-      return true unless metadata.key?('dct_references_s')
+      return true unless metadata.key?('dct_references_s') # TODO: shouldn't we require this field?
       begin
         ref = JSON.parse(metadata['dct_references_s'])
         raise GeoCombine::Exceptions::InvalidDCTReferences, 'dct_references must be parsed to a Hash' unless ref.is_a?(Hash)
@@ -62,49 +78,82 @@ module GeoCombine
       end
     end

+    def spatial_validate!
+      GeoCombine::BoundingBox.from_envelope(metadata['solr_geom']).valid?
+    end
+
     private

     ##
     # Enhances the 'dc_format_s' field by translating a format type to a valid
     # GeoBlacklight-Schema format
     def translate_formats(key, value)
-      @metadata[key] = formats[value] if key == 'dc_format_s' && formats.include?(value)
+      return unless key == 'dc_format_s' && formats.include?(value)
+      metadata[key] = formats[value]
     end

     ##
     # Enhances the 'layer_geom_type_s' field by translating from known types
     def translate_geometry_type(key, value)
-      @metadata[key] = geometry_types[value] if key == 'layer_geom_type_s' && geometry_types.include?(value)
+      return unless key == 'layer_geom_type_s' && geometry_types.include?(value)
+      metadata[key] = geometry_types[value]
     end

     ##
     # Enhances the 'dc_subject_sm' field by translating subjects to ISO topic
     # categories
     def enhance_subjects(key, value)
-      @metadata[key] = value.map do |val|
+      return unless key == 'dc_subject_sm'
+      metadata[key] = value.map do |val|
         if subjects.include?(val)
           subjects[val]
         else
           val
         end
-      end if key == 'dc_subject_sm'
+      end
     end

     ##
     # Formats the 'layer_modified_dt' to a valid valid RFC3339 date/time string
     # and ISO8601 (for indexing into Solr)
     def format_proper_date(key, value)
-      @metadata[key] = Time.parse(value).utc.iso8601 if key == 'layer_modified_dt'
+      return unless key == 'layer_modified_dt'
+      metadata[key] = Time.parse(value).utc.iso8601
     end

     def fields_should_be_array(key, value)
-      @metadata[key] = [value] if should_be_array.include?(key) && !value.kind_of?(Array)
+      return unless should_be_array.include?(key) && !value.is_a?(Array)
+      metadata[key] = [value]
     end

     ##
     # GeoBlacklight-Schema fields that should be type Array
     def should_be_array
-      ['dc_creator_sm', 'dc_subject_sm', 'dct_spatial_sm', 'dct_temporal_sm', 'dct_isPartOf_sm']
+      %w[
+        dc_creator_sm
+        dc_subject_sm
+        dct_spatial_sm
+        dct_temporal_sm
+        dct_isPartOf_sm
+      ].freeze
+    end
+
+    ##
+    # Converts a pre-v1.0 schema into a compliant v1.0 schema
+    def upgrade_to_v1
+      metadata['geoblacklight_version'] = '1.0'
+
+      # ensure required fields
+      metadata['dc_identifier_s'] = metadata['uuid'] if metadata['dc_identifier_s'].blank?
+
+      # normalize to alphanum and - only
+      metadata['layer_slug_s'].gsub!(/[^[[:alnum:]]]+/, '-') if metadata['layer_slug_s'].present?
+
+      # remove deprecated fields
+      metadata.except!(*DEPRECATED_KEYS_V1)
+
+      # ensure we have a proper v1 record
+      valid?
    end
   end
 end
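
Two behavioral changes above are worth calling out: valid? now also checks the solr_geom envelope via the new GeoCombine::BoundingBox, and enhance_metadata upgrades records that lack a geoblacklight_version before running the per-field enhancements. A rough sketch from calling code (the envelope value and file path are invented; note that upgrade_to_v1 ends by calling valid?, which downloads the schema, so it needs network access and an otherwise complete record):

    require 'geo_combine'

    # Envelope order is west, east, north, south; out-of-range or inverted
    # coordinates raise the new GeoCombine::Exceptions::InvalidGeometry.
    GeoCombine::BoundingBox.from_envelope('ENVELOPE(-122.4, -122.1, 37.9, 37.6)').valid?
    # => true

    record = GeoCombine::Geoblacklight.new(File.read('pre_v1_record.json')) # hypothetical file
    record.enhance_metadata  # runs upgrade_to_v1 first when geoblacklight_version is blank
    record.metadata['geoblacklight_version']
    # => "1.0"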
@@ -0,0 +1,229 @@
+require 'active_support/core_ext/object/blank'
+require 'cgi'
+
+module GeoCombine
+  # Data model for OpenGeoPortal metadata
+  class OGP
+    class InvalidMetadata < RuntimeError; end
+    include GeoCombine::Formatting
+    attr_reader :metadata
+
+    ##
+    # Initializes an OGP object for parsing
+    # @param [String] metadata a valid serialized JSON string from OGP instance
+    # @raise [InvalidMetadata]
+    def initialize(metadata)
+      @metadata = JSON.parse(metadata)
+      raise InvalidMetadata unless valid?
+    end
+
+    OGP_REQUIRED_FIELDS = %w[
+      Access
+      Institution
+      LayerDisplayName
+      LayerId
+      MaxX
+      MaxY
+      MinX
+      MinY
+      Name
+    ].freeze
+
+    ##
+    # Runs validity checks on OGP metadata to ensure fields are present
+    def valid?
+      OGP_REQUIRED_FIELDS.all? { |k| metadata[k].present? }
+    end
+
+    ##
+    # Creates and returns a Geoblacklight schema object from this metadata
+    # @return [GeoCombine::Geoblacklight]
+    def to_geoblacklight
+      GeoCombine::Geoblacklight.new(geoblacklight_terms.to_json)
+    end
+
+    ##
+    # Builds a Geoblacklight Schema type hash from Esri Open Data portal
+    # metadata
+    # @return [Hash]
+    def geoblacklight_terms
+      {
+        # Required fields
+        dc_identifier_s: identifier,
+        layer_slug_s: slug,
+        dc_title_s: metadata['LayerDisplayName'],
+        solr_geom: envelope,
+        dct_provenance_s: institution,
+        dc_rights_s: metadata['Access'],
+        geoblacklight_version: '1.0',
+
+        # Recommended fields
+        dc_description_s: metadata['Abstract'],
+        layer_geom_type_s: ogp_geom,
+        dct_references_s: references,
+        layer_id_s: "#{metadata['WorkspaceName']}:#{metadata['Name']}",
+
+        # Optional
+        dct_temporal_sm: [metadata['ContentDate']],
+        dc_format_s: ogp_formats,
+        # dct_issued_dt
+        # dc_language_s
+        dct_spatial_sm: placenames,
+        solr_year_i: year,
+        dc_publisher_s: metadata['Publisher'],
+        dc_subject_sm: subjects,
+        dc_type_s: 'Dataset'
+      }.delete_if { |_k, v| v.nil? }
+    end
+
+    def date
+      begin
+        DateTime.rfc3339(metadata['ContentDate'])
+      rescue
+        nil
+      end
+    end
+
+    def year
+      date.year unless date.nil?
+    end
+
+    ##
+    # Convert "Paper Map" to Raster, assumes all OGP "Paper Maps" have WMS
+    def ogp_geom
+      case metadata['DataType']
+      when 'Paper Map'
+        'Raster'
+      else
+        metadata['DataType']
+      end
+    end
+
+    ##
+    # OGP doesn't ship format types, so we just try and be clever here.
+    def ogp_formats
+      case metadata['DataType']
+      when 'Paper Map', 'Raster'
+        return 'GeoTIFF'
+      when 'Polygon', 'Point', 'Line'
+        return 'Shapefile'
+      else
+        raise ArgumentError, metadata['DataType']
+      end
+    end
+
+    ##
+    # Converts references to json
+    # @return [String]
+    def references
+      references_hash.to_json
+    end
+
+    ##
+    # Builds a Solr Envelope using CQL syntax
+    # @return [String]
+    def envelope
+      raise ArgumentError unless west >= -180 && west <= 180 &&
+                                 east >= -180 && east <= 180 &&
+                                 north >= -90 && north <= 90 &&
+                                 south >= -90 && south <= 90 &&
+                                 west <= east && south <= north
+      "ENVELOPE(#{west}, #{east}, #{north}, #{south})"
+    end
+
+    def subjects
+      fgdc.metadata.xpath('//themekey').map(&:text) if fgdc
+    end
+
+    def placenames
+      fgdc.metadata.xpath('//placekey').map(&:text) if fgdc
+    end
+
+    def fgdc
+      GeoCombine::Fgdc.new(metadata['FgdcText']) if metadata['FgdcText']
+    end
+
+    private
+
+    ##
+    # Builds references used for dct_references
+    # @return [Hash]
+    def references_hash
+      results = {
+        'http://www.opengis.net/def/serviceType/ogc/wfs' => location['wfs'],
+        'http://www.opengis.net/def/serviceType/ogc/wms' => location['wms'],
+        'http://schema.org/url' => location['url'],
+        download_uri => location['download']
+      }
+
+      # Handle null, "", and [""]
+      results.map { |k, v| { k => ([] << v).flatten.first } if v }
+             .flatten
+             .compact
+             .reduce({}, :merge)
+    end
+
+    def download_uri
+      return 'http://schema.org/DownloadAction' if institution == 'Harvard'
+      'http://schema.org/downloadUrl'
+    end
+
+    ##
+    # OGP "Location" field parsed
+    def location
+      JSON.parse(metadata['Location'])
+    end
+
+    def north
+      metadata['MaxY'].to_f
+    end
+
+    def south
+      metadata['MinY'].to_f
+    end
+
+    def east
+      metadata['MaxX'].to_f
+    end
+
+    def west
+      metadata['MinX'].to_f
+    end
+
+    def institution
+      metadata['Institution']
+    end
+
+    def identifier
+      CGI.escape(metadata['LayerId']) # TODO: why are we using CGI.escape?
+    end
+
+    def slug
+      name = metadata['LayerId'] || metadata['Name'] || ''
+      name = [institution, name].join('-') if institution.present? &&
+                                              !name.downcase.start_with?(institution.downcase)
+      sluggify(filter_name(name))
+    end
+
+    SLUG_BLACKLIST = %w[
+      SDE_DATA.
+      SDE.
+      SDE2.
+      GISPORTAL.GISOWNER01.
+      GISDATA.
+      MORIS.
+    ].freeze
+
+    def filter_name(name)
+      # strip out schema and usernames
+      SLUG_BLACKLIST.each do |blacklisted|
+        name.sub!(blacklisted, '')
+      end
+      unless name.size > 1
+        # use first word of title is empty name
+        name = metadata['LayerDisplayName'].split.first
+      end
+      name
+    end
+  end
+end
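
The new OGP class rounds this out by mapping a serialized OpenGeoPortal record onto the GeoBlacklight schema. A sketch with a minimal, invented record (a real record would also carry FgdcText, ContentDate, and richer Location data); only the class API itself comes from the code above:

    require 'json'
    require 'geo_combine'

    ogp_json = {
      'Access' => 'Public',
      'Institution' => 'Example',
      'LayerDisplayName' => 'Sample Layer',
      'LayerId' => 'EXAMPLE.SAMPLE_LAYER',
      'Name' => 'SAMPLE_LAYER',
      'MinX' => -122.4, 'MinY' => 37.6, 'MaxX' => -122.1, 'MaxY' => 37.9,
      'DataType' => 'Polygon',
      'Location' => '{"wms": "https://geoserver.example.edu/wms"}'
    }.to_json

    ogp = GeoCombine::OGP.new(ogp_json)   # raises InvalidMetadata if a required field is missing
    ogp.geoblacklight_terms[:solr_geom]   # => "ENVELOPE(-122.4, -122.1, 37.9, 37.6)"
    record = ogp.to_geoblacklight         # GeoCombine::Geoblacklight ready for enhancement/indexing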