geo_combine 0.2.0 → 0.5.1

@@ -2,5 +2,7 @@ module GeoCombine
   module Exceptions
     class InvalidDCTReferences < StandardError
     end
+    class InvalidGeometry < StandardError
+    end
   end
 end
@@ -15,7 +15,7 @@ module GeoCombine
     # @param [String] text
     # @return [String]
     def remove_lines(text)
-      text.gsub(/\n/, '')
+      text.delete("\n")
    end
 
     ##
@@ -25,5 +25,10 @@ module GeoCombine
     def sanitize_and_remove_lines(text)
       remove_lines(sanitize(text))
     end
+
+    # Slugs should be lowercase and contain only a-z, 0-9, and -
+    def sluggify(slug)
+      slug.gsub(/[^a-zA-Z0-9\-]/, '-').gsub(/[\-]+/, '-').downcase
+    end
   end
 end
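
The new sluggify helper replaces every disallowed character with a hyphen, collapses runs of hyphens, and downcases the result. A quick sketch of its behavior (the demo class and input string are invented for illustration):

    require 'geo_combine'

    class SlugDemo
      include GeoCombine::Formatting
    end

    # Disallowed characters become hyphens, runs collapse, then downcase
    SlugDemo.new.sluggify('STANFORD Aerial Imagery_2014') # => "stanford-aerial-imagery-2014"
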
@@ -0,0 +1,204 @@
+# frozen_string_literal: true
+
+module GeoCombine
+  ##
+  # A class to harvest and index results from GeoBlacklight sites.
+  # You can configure the sites to be harvested via a configure command:
+  #   GeoCombine::GeoBlacklightHarvester.configure do
+  #     {
+  #       SITE: { host: 'https://example.com', params: { f: { dct_provenance_s: ['SITE'] } } }
+  #     }
+  #   end
+  # The class configuration also allows various other things to be configured:
+  #   - a debug parameter to print out details of what is being harvested and indexed
+  #   - crawl delays for each page of results (globally or on a per-site basis)
+  #   - Solr's commitWithin parameter (defaults to 5000)
+  #   - a document transformer proc to modify a document before indexing (defaults to removing _version_, score, and timestamp)
+  # Example: GeoCombine::GeoBlacklightHarvester.new('SITE').index
+  class GeoBlacklightHarvester
+    require 'active_support/core_ext/object/to_query'
+
+    class << self
+      attr_writer :document_transformer
+
+      def configure(&block)
+        @config = yield block
+      end
+
+      def config
+        @config || {}
+      end
+
+      def document_transformer
+        @document_transformer || ->(document) do
+          document.delete('_version_')
+          document.delete('score')
+          document.delete('timestamp')
+          document
+        end
+      end
+    end
+
+
+    attr_reader :site, :site_key
+    def initialize(site_key)
+      @site_key = site_key
+      @site = self.class.config[site_key]
+
+      raise ArgumentError, "Site key #{@site_key.inspect} is not configured for #{self.class.name}" unless @site
+    end
+
+    def index
+      puts "Fetching page 1 @ #{base_url}&page=1" if self.class.config[:debug]
+      response = JSON.parse(Net::HTTP.get(URI("#{base_url}&page=1")))
+      response_class = BlacklightResponseVersionFactory.call(response)
+
+      response_class.new(response: response, base_url: base_url).documents.each do |docs|
+        docs.map! do |document|
+          self.class.document_transformer.call(document) if self.class.document_transformer
+        end.compact
+
+        puts "Adding #{docs.count} documents to solr" if self.class.config[:debug]
+        solr_connection.update params: { commitWithin: commit_within, overwrite: true },
+                               data: docs.to_json,
+                               headers: { 'Content-Type' => 'application/json' }
+
+        sleep(crawl_delay.to_i) if crawl_delay
+      end
+    end
+
+    ##
+    # A "factory" class to determine the Blacklight response version to use
+    class BlacklightResponseVersionFactory
+      def self.call(json)
+        keys = json.keys
+        if keys.include?('response')
+          LegacyBlacklightResponse
+        elsif keys.any? && %w[links data].all? { |param| keys.include?(param) }
+          ModernBlacklightResponse
+        else
+          raise NotImplementedError, "The following json response was not able to be parsed by the GeoBlacklightHarvester\n#{json}"
+        end
+      end
+    end
+
+    class LegacyBlacklightResponse
+      attr_reader :base_url
+      attr_accessor :response, :page
+      def initialize(response:, base_url:)
+        @base_url = base_url
+        @response = response
+        @page = 1
+      end
+
+      def documents
+        return enum_for(:documents) unless block_given?
+
+        while current_page && total_pages && (current_page <= total_pages)
+          yield response.dig('response', 'docs')
+
+          break if current_page == total_pages
+          self.page += 1
+          puts "Fetching page #{page} @ #{url}" if GeoCombine::GeoBlacklightHarvester.config[:debug]
+
+          begin
+            self.response = JSON.parse(Net::HTTP.get(URI(url)))
+          rescue => e
+            puts "Request for #{url} failed with #{e}"
+            self.response = nil
+          end
+        end
+      end
+
+      private
+
+      def url
+        "#{base_url}&page=#{page}"
+      end
+
+      def current_page
+        response.dig('response', 'pages', 'current_page')
+      end
+
+      def total_pages
+        response.dig('response', 'pages', 'total_pages')
+      end
+    end
+
+    ##
+    # Class to return documents from the Blacklight API (v7 and above)
+    class ModernBlacklightResponse
+      attr_reader :base_url
+      attr_accessor :response, :page
+      def initialize(response:, base_url:)
+        @base_url = base_url
+        @response = response
+        @page = 1
+      end
+
+      def documents
+        return enum_for(:documents) unless block_given?
+
+        while response && response['data'].any?
+          document_urls = response['data'].collect { |data| data.dig('links', 'self') }.compact
+
+          yield documents_from_urls(document_urls)
+
+          url = response.dig('links', 'next')
+          break unless url
+          url = "#{url}&format=json"
+          self.page += 1
+          puts "Fetching page #{page} @ #{url}" if GeoCombine::GeoBlacklightHarvester.config[:debug]
+          begin
+            self.response = JSON.parse(Net::HTTP.get(URI(url)))
+          rescue => e
+            puts "Request for #{url} failed with #{e}"
+            self.response = nil
+          end
+        end
+      end
+
+      private
+
+      def documents_from_urls(urls)
+        puts "Fetching #{urls.count} documents for page #{page}" if GeoCombine::GeoBlacklightHarvester.config[:debug]
+        urls.map do |url|
+          begin
+            JSON.parse(Net::HTTP.get(URI("#{url}/raw")))
+          rescue => e
+            puts "Fetching \"#{url}/raw\" failed with #{e}"
+
+            nil
+          end
+        end.compact
+      end
+    end
+
+    private
+
+    def base_url
+      "#{site[:host]}?#{default_params.to_query}"
+    end
+
+    def solr_connection
+      solr_url = ENV['SOLR_URL'] || 'http://127.0.0.1:8983/solr/blacklight-core'
+
+      RSolr.connect url: solr_url, adapter: :net_http_persistent
+    end
+
+    def commit_within
+      self.class.config[:commit_within] || '5000'
+    end
+
+    def crawl_delay
+      site[:crawl_delay] || self.class.config[:crawl_delay]
+    end
+
+    def default_params
+      {
+        per_page: 100,
+        format: :json
+      }.merge(site[:params])
+    end
+  end
+end
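
Taken together, a harvest run against the new class might look like the following. This is a minimal sketch: the site key, host, and facet value are placeholders, the require path is assumed from the class name, and the SOLR_URL environment variable controls where documents are indexed.

    require 'geo_combine/geo_blacklight_harvester' # path assumed from the class name

    GeoCombine::GeoBlacklightHarvester.configure do
      {
        debug: true,
        crawl_delay: 1, # seconds to sleep between pages, applies to every site
        SITE: { host: 'https://example.com', params: { f: { dct_provenance_s: ['SITE'] } } }
      }
    end

    # Pages through the site's JSON API and posts each batch of documents to Solr
    GeoCombine::GeoBlacklightHarvester.new(:SITE).index
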
@@ -1,3 +1,5 @@
+require 'active_support/core_ext/object/blank'
+require 'active_support/core_ext/hash/except'
 require 'open-uri'
 
 module GeoCombine
@@ -9,6 +11,16 @@ module GeoCombine
     attr_reader :metadata
 
     GEOBLACKLIGHT_VERSION = 'v1.1.0'
+    SCHEMA_JSON_URL = "https://raw.githubusercontent.com/geoblacklight/geoblacklight/#{GEOBLACKLIGHT_VERSION}/schema/geoblacklight-schema.json".freeze
+    DEPRECATED_KEYS_V1 = %w[
+      uuid
+      georss_polygon_s
+      georss_point_s
+      georss_box_s
+      dc_relation_sm
+      solr_issued_i
+      solr_bbox
+    ].freeze
 
     ##
     # Initializes a GeoBlacklight object
@@ -24,7 +36,9 @@ module GeoCombine
     # Calls metadata enhancement methods for each key, value pair in the
     # metadata hash
     def enhance_metadata
-      @metadata.each do |key, value|
+      upgrade_to_v1 if metadata['geoblacklight_version'].blank?
+
+      metadata.each do |key, value|
         translate_formats(key, value)
         enhance_subjects(key, value)
         format_proper_date(key, value)
@@ -36,23 +50,25 @@ module GeoCombine
     ##
     # Returns a string of JSON from a GeoBlacklight hash
     # @return [String]
-    def to_json
-      @metadata.to_json
+    def to_json(options = {})
+      metadata.to_json(options)
     end
 
     ##
     # Validates a GeoBlacklight-Schema json document
     # @return [Boolean]
     def valid?
-      @schema ||= JSON.parse(open("https://raw.githubusercontent.com/geoblacklight/geoblacklight/#{GEOBLACKLIGHT_VERSION}/schema/geoblacklight-schema.json").read)
-      JSON::Validator.validate!(@schema, to_json, fragment: '#/properties/layer') && dct_references_validate!
+      @schema ||= JSON.parse(open(SCHEMA_JSON_URL).read)
+      JSON::Validator.validate!(@schema, to_json, fragment: '#/properties/layer') &&
+        dct_references_validate! &&
+        spatial_validate!
     end
 
     ##
     # Validate dct_references_s
     # @return [Boolean]
     def dct_references_validate!
-      return true unless metadata.key?('dct_references_s')
+      return true unless metadata.key?('dct_references_s') # TODO: shouldn't we require this field?
       begin
         ref = JSON.parse(metadata['dct_references_s'])
         raise GeoCombine::Exceptions::InvalidDCTReferences, 'dct_references must be parsed to a Hash' unless ref.is_a?(Hash)
@@ -62,49 +78,82 @@ module GeoCombine
       end
     end
 
+    def spatial_validate!
+      GeoCombine::BoundingBox.from_envelope(metadata['solr_geom']).valid?
+    end
+
     private
 
     ##
     # Enhances the 'dc_format_s' field by translating a format type to a valid
     # GeoBlacklight-Schema format
     def translate_formats(key, value)
-      @metadata[key] = formats[value] if key == 'dc_format_s' && formats.include?(value)
+      return unless key == 'dc_format_s' && formats.include?(value)
+      metadata[key] = formats[value]
     end
 
     ##
     # Enhances the 'layer_geom_type_s' field by translating from known types
     def translate_geometry_type(key, value)
-      @metadata[key] = geometry_types[value] if key == 'layer_geom_type_s' && geometry_types.include?(value)
+      return unless key == 'layer_geom_type_s' && geometry_types.include?(value)
+      metadata[key] = geometry_types[value]
     end
 
     ##
     # Enhances the 'dc_subject_sm' field by translating subjects to ISO topic
     # categories
     def enhance_subjects(key, value)
-      @metadata[key] = value.map do |val|
+      return unless key == 'dc_subject_sm'
+      metadata[key] = value.map do |val|
         if subjects.include?(val)
           subjects[val]
         else
           val
         end
-      end if key == 'dc_subject_sm'
+      end
     end
 
     ##
     # Formats the 'layer_modified_dt' to a valid RFC3339 date/time string
     # and ISO8601 (for indexing into Solr)
     def format_proper_date(key, value)
-      @metadata[key] = Time.parse(value).utc.iso8601 if key == 'layer_modified_dt'
+      return unless key == 'layer_modified_dt'
+      metadata[key] = Time.parse(value).utc.iso8601
     end
 
     def fields_should_be_array(key, value)
-      @metadata[key] = [value] if should_be_array.include?(key) && !value.kind_of?(Array)
+      return unless should_be_array.include?(key) && !value.is_a?(Array)
+      metadata[key] = [value]
     end
 
     ##
     # GeoBlacklight-Schema fields that should be type Array
     def should_be_array
-      ['dc_creator_sm', 'dc_subject_sm', 'dct_spatial_sm', 'dct_temporal_sm', 'dct_isPartOf_sm']
+      %w[
+        dc_creator_sm
+        dc_subject_sm
+        dct_spatial_sm
+        dct_temporal_sm
+        dct_isPartOf_sm
+      ].freeze
+    end
+
+    ##
+    # Converts a pre-v1.0 schema into a compliant v1.0 schema
+    def upgrade_to_v1
+      metadata['geoblacklight_version'] = '1.0'
+
+      # ensure required fields
+      metadata['dc_identifier_s'] = metadata['uuid'] if metadata['dc_identifier_s'].blank?
+
+      # normalize to alphanumerics and - only
+      metadata['layer_slug_s'].gsub!(/[^[[:alnum:]]]+/, '-') if metadata['layer_slug_s'].present?
+
+      # remove deprecated fields
+      metadata.except!(*DEPRECATED_KEYS_V1)
+
+      # ensure we have a proper v1 record
+      valid?
     end
   end
 end
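
The new spatial_validate! hook delegates to GeoCombine::BoundingBox, which is defined elsewhere in the gem and not shown in this diff. A minimal sketch, assuming from_envelope parses Solr's ENVELOPE(west, east, north, south) syntax and valid? raises the new GeoCombine::Exceptions::InvalidGeometry for out-of-range coordinates:

    require 'geo_combine'

    record = GeoCombine::Geoblacklight.new(
      { 'solr_geom' => 'ENVELOPE(-120.0, -119.0, 38.0, 37.0)' }.to_json
    )
    record.spatial_validate! # => true

    bad = GeoCombine::Geoblacklight.new(
      { 'solr_geom' => 'ENVELOPE(-320.0, -119.0, 38.0, 37.0)' }.to_json
    )
    bad.spatial_validate! # raises GeoCombine::Exceptions::InvalidGeometry
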
@@ -0,0 +1,229 @@
+require 'active_support/core_ext/object/blank'
+require 'cgi'
+
+module GeoCombine
+  # Data model for OpenGeoPortal metadata
+  class OGP
+    class InvalidMetadata < RuntimeError; end
+    include GeoCombine::Formatting
+    attr_reader :metadata
+
+    ##
+    # Initializes an OGP object for parsing
+    # @param [String] metadata a valid serialized JSON string from an OGP instance
+    # @raise [InvalidMetadata]
+    def initialize(metadata)
+      @metadata = JSON.parse(metadata)
+      raise InvalidMetadata unless valid?
+    end
+
+    OGP_REQUIRED_FIELDS = %w[
+      Access
+      Institution
+      LayerDisplayName
+      LayerId
+      MaxX
+      MaxY
+      MinX
+      MinY
+      Name
+    ].freeze
+
+    ##
+    # Runs validity checks on OGP metadata to ensure fields are present
+    def valid?
+      OGP_REQUIRED_FIELDS.all? { |k| metadata[k].present? }
+    end
+
+    ##
+    # Creates and returns a Geoblacklight schema object from this metadata
+    # @return [GeoCombine::Geoblacklight]
+    def to_geoblacklight
+      GeoCombine::Geoblacklight.new(geoblacklight_terms.to_json)
+    end
+
+    ##
+    # Builds a Geoblacklight Schema type hash from OpenGeoPortal
+    # metadata
+    # @return [Hash]
+    def geoblacklight_terms
+      {
+        # Required fields
+        dc_identifier_s: identifier,
+        layer_slug_s: slug,
+        dc_title_s: metadata['LayerDisplayName'],
+        solr_geom: envelope,
+        dct_provenance_s: institution,
+        dc_rights_s: metadata['Access'],
+        geoblacklight_version: '1.0',
+
+        # Recommended fields
+        dc_description_s: metadata['Abstract'],
+        layer_geom_type_s: ogp_geom,
+        dct_references_s: references,
+        layer_id_s: "#{metadata['WorkspaceName']}:#{metadata['Name']}",
+
+        # Optional
+        dct_temporal_sm: [metadata['ContentDate']],
+        dc_format_s: ogp_formats,
+        # dct_issued_dt
+        # dc_language_s
+        dct_spatial_sm: placenames,
+        solr_year_i: year,
+        dc_publisher_s: metadata['Publisher'],
+        dc_subject_sm: subjects,
+        dc_type_s: 'Dataset'
+      }.delete_if { |_k, v| v.nil? }
+    end
+
+    def date
+      begin
+        DateTime.rfc3339(metadata['ContentDate'])
+      rescue
+        nil
+      end
+    end
+
+    def year
+      date.year unless date.nil?
+    end
+
+    ##
+    # Convert "Paper Map" to Raster; assumes all OGP "Paper Maps" have WMS
+    def ogp_geom
+      case metadata['DataType']
+      when 'Paper Map'
+        'Raster'
+      else
+        metadata['DataType']
+      end
+    end
+
+    ##
+    # OGP doesn't ship format types, so we just try and be clever here.
+    def ogp_formats
+      case metadata['DataType']
+      when 'Paper Map', 'Raster'
+        return 'GeoTIFF'
+      when 'Polygon', 'Point', 'Line'
+        return 'Shapefile'
+      else
+        raise ArgumentError, metadata['DataType']
+      end
+    end
+
+    ##
+    # Converts references to JSON
+    # @return [String]
+    def references
+      references_hash.to_json
+    end
+
+    ##
+    # Builds a Solr Envelope using CQL syntax
+    # @return [String]
+    def envelope
+      raise ArgumentError unless west >= -180 && west <= 180 &&
+                                 east >= -180 && east <= 180 &&
+                                 north >= -90 && north <= 90 &&
+                                 south >= -90 && south <= 90 &&
+                                 west <= east && south <= north
+      "ENVELOPE(#{west}, #{east}, #{north}, #{south})"
+    end
+
+    def subjects
+      fgdc.metadata.xpath('//themekey').map(&:text) if fgdc
+    end
+
+    def placenames
+      fgdc.metadata.xpath('//placekey').map(&:text) if fgdc
+    end
+
+    def fgdc
+      GeoCombine::Fgdc.new(metadata['FgdcText']) if metadata['FgdcText']
+    end
+
+    private
+
+    ##
+    # Builds references used for dct_references
+    # @return [Hash]
+    def references_hash
+      results = {
+        'http://www.opengis.net/def/serviceType/ogc/wfs' => location['wfs'],
+        'http://www.opengis.net/def/serviceType/ogc/wms' => location['wms'],
+        'http://schema.org/url' => location['url'],
+        download_uri => location['download']
+      }
+
+      # Handle null, "", and [""]
+      results.map { |k, v| { k => ([] << v).flatten.first } if v }
+             .flatten
+             .compact
+             .reduce({}, :merge)
+    end
+
+    def download_uri
+      return 'http://schema.org/DownloadAction' if institution == 'Harvard'
+      'http://schema.org/downloadUrl'
+    end
+
+    ##
+    # OGP "Location" field parsed
+    def location
+      JSON.parse(metadata['Location'])
+    end
+
+    def north
+      metadata['MaxY'].to_f
+    end
+
+    def south
+      metadata['MinY'].to_f
+    end
+
+    def east
+      metadata['MaxX'].to_f
+    end
+
+    def west
+      metadata['MinX'].to_f
+    end
+
+    def institution
+      metadata['Institution']
+    end
+
+    def identifier
+      CGI.escape(metadata['LayerId']) # TODO: why are we using CGI.escape?
+    end
+
+    def slug
+      name = metadata['LayerId'] || metadata['Name'] || ''
+      name = [institution, name].join('-') if institution.present? &&
+                                              !name.downcase.start_with?(institution.downcase)
+      sluggify(filter_name(name))
+    end
+
+    SLUG_BLACKLIST = %w[
+      SDE_DATA.
+      SDE.
+      SDE2.
+      GISPORTAL.GISOWNER01.
+      GISDATA.
+      MORIS.
+    ].freeze
+
+    def filter_name(name)
+      # strip out schema and usernames
+      SLUG_BLACKLIST.each do |blacklisted|
+        name.sub!(blacklisted, '')
+      end
+      unless name.size > 1
+        # use the first word of the title if the name is empty
+        name = metadata['LayerDisplayName'].split.first
+      end
+      name
+    end
+  end
+end
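
End to end, the OGP class takes a serialized OpenGeoPortal record and produces a GeoBlacklight record; note that envelope reorders the bounds into Solr's ENVELOPE(west, east, north, south) form. A hedged sketch with invented field values:

    require 'geo_combine'

    ogp_json = {
      'Access' => 'Public',
      'Institution' => 'Tufts',
      'LayerDisplayName' => 'Cambridge Parcels',
      'LayerId' => 'Tufts.CambridgeParcels',
      'Name' => 'GISPORTAL.GISOWNER01.CAMBRIDGEPARCELS',
      'DataType' => 'Polygon',
      'MinX' => '-71.163', 'MinY' => '42.347',
      'MaxX' => '-71.052', 'MaxY' => '42.408',
      'Location' => '{"wms": ["https://geoserver.example.edu/wms"]}'
    }.to_json

    record = GeoCombine::OGP.new(ogp_json).to_geoblacklight
    record.metadata['solr_geom']   # => "ENVELOPE(-71.163, -71.052, 42.408, 42.347)"
    record.metadata['dc_format_s'] # => "Shapefile" (inferred from DataType)
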