geo_combine 0.4.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.github/workflows/ruby.yml +53 -0
- data/.gitignore +2 -0
- data/.rubocop.yml +20 -0
- data/.rubocop_todo.yml +165 -0
- data/Gemfile +3 -1
- data/README.md +80 -1
- data/Rakefile +4 -2
- data/bin/geocombine +1 -0
- data/geo_combine.gemspec +5 -0
- data/lib/geo_combine/bounding_box.rb +7 -1
- data/lib/geo_combine/ckan_metadata.rb +10 -8
- data/lib/geo_combine/cli.rb +3 -1
- data/lib/geo_combine/esri_open_data.rb +2 -0
- data/lib/geo_combine/exceptions.rb +3 -0
- data/lib/geo_combine/fgdc.rb +2 -2
- data/lib/geo_combine/formats.rb +2 -0
- data/lib/geo_combine/formatting.rb +3 -1
- data/lib/geo_combine/geo_blacklight_harvester.rb +211 -0
- data/lib/geo_combine/geoblacklight.rb +20 -6
- data/lib/geo_combine/geometry_types.rb +2 -0
- data/lib/geo_combine/iso19139.rb +2 -1
- data/lib/geo_combine/ogp.rb +13 -11
- data/lib/geo_combine/railtie.rb +2 -0
- data/lib/geo_combine/subjects.rb +2 -0
- data/lib/geo_combine/version.rb +3 -1
- data/lib/geo_combine.rb +7 -3
- data/lib/tasks/geo_combine.rake +57 -26
- data/lib/xslt/fgdc2html.xsl +38 -9
- data/lib/xslt/iso2html.xsl +1107 -1070
- data/spec/features/fgdc2html_spec.rb +53 -1
- data/spec/features/iso2html_spec.rb +17 -2
- data/spec/fixtures/docs/princeton_fgdc.xml +374 -0
- data/spec/fixtures/docs/repos.json +3224 -0
- data/spec/fixtures/docs/simple_xml.xml +10 -0
- data/spec/fixtures/docs/simple_xslt.xsl +11 -0
- data/spec/fixtures/docs/stanford_iso.xml +652 -0
- data/spec/fixtures/docs/tufts_fgdc.xml +977 -0
- data/spec/fixtures/indexing/basic_geoblacklight.json +27 -0
- data/spec/fixtures/indexing/geoblacklight.json +33 -0
- data/spec/fixtures/indexing/layers.json +16119 -0
- data/spec/fixtures/indexing/test.txt +1 -0
- data/spec/fixtures/json_docs.rb +2 -0
- data/spec/fixtures/xml_docs.rb +9 -1659
- data/spec/helpers.rb +7 -7
- data/spec/lib/geo_combine/bounding_box_spec.rb +18 -0
- data/spec/lib/geo_combine/ckan_metadata_spec.rb +34 -11
- data/spec/lib/geo_combine/esri_open_data_spec.rb +23 -2
- data/spec/lib/geo_combine/fgdc_spec.rb +41 -10
- data/spec/lib/geo_combine/formatting_spec.rb +13 -5
- data/spec/lib/geo_combine/geo_blacklight_harvester_spec.rb +194 -0
- data/spec/lib/geo_combine/geoblacklight_spec.rb +41 -11
- data/spec/lib/geo_combine/iso19139_spec.rb +26 -14
- data/spec/lib/geo_combine/ogp_spec.rb +28 -8
- data/spec/lib/geo_combine_spec.rb +7 -4
- data/spec/lib/tasks/geo_combine_spec.rb +45 -0
- data/spec/spec_helper.rb +19 -84
- data/spec/support/fixtures.rb +9 -0
- metadata +103 -6
- data/.coveralls.yml +0 -1
- data/.travis.yml +0 -7
@@ -0,0 +1,211 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module GeoCombine
|
4
|
+
##
|
5
|
+
# A class to harvest and index results from GeoBlacklight sites
|
6
|
+
# You can configure the sites to be harvested via a configure command.
|
7
|
+
# GeoCombine::GeoBlacklightHarvester.configure do
|
8
|
+
# {
|
9
|
+
# SITE: { host: 'https://example.com', params: { f: { dct_provenance_s: ['SITE'] } } }
|
10
|
+
# }
|
11
|
+
# end
|
12
|
+
# The class configuration also allows for various other things to be configured:
|
13
|
+
# - A debug parameter to print out details of what is being harvested and indexed
|
14
|
+
# - crawl delays for each page of results (globally or on a per site basis)
|
15
|
+
# - Solr's commitWithin parameter (defaults to 5000)
|
16
|
+
# - A document transformer proc to modify a document before indexing (defaults to removing _version_, score, timestamp, and the solr_bboxtype__* fields)
|
17
|
+
# Example: GeoCombine::GeoBlacklightHarvester.new('SITE').index
|
18
|
+
class GeoBlacklightHarvester
|
19
|
+
require 'active_support/core_ext/object/to_query'
|
20
|
+
|
21
|
+
# Class-level configuration shared by every harvester instance.
class << self
|
22
|
+
# Lets callers replace the default transformer; the assigned object is
# invoked with #call once per harvested document.
attr_writer :document_transformer
|
23
|
+
|
24
|
+
# Stores whatever the given block returns as the class-level configuration
# (read back through .config).
# NOTE(review): `yield block` hands the block to itself as its only
# argument — presumably unintended; confirm before relying on it.
def configure(&block)
|
25
|
+
@config = yield block
|
26
|
+
end
|
27
|
+
|
28
|
+
# Returns the stored configuration, or an empty Hash when nothing (or a
# nil/false value) has been stored.
def config
|
29
|
+
@config || {}
|
30
|
+
end
|
31
|
+
|
32
|
+
# Returns the custom transformer when one was assigned. Otherwise returns a
# lambda that deletes the '_version_', 'score', 'timestamp' and
# 'solr_bboxtype__*' keys from the document and returns the document.
def document_transformer
|
33
|
+
@document_transformer || lambda do |document|
|
34
|
+
document.delete('_version_')
|
35
|
+
document.delete('score')
|
36
|
+
document.delete('timestamp')
|
37
|
+
document.delete('solr_bboxtype__minX')
|
38
|
+
document.delete('solr_bboxtype__minY')
|
39
|
+
document.delete('solr_bboxtype__maxX')
|
40
|
+
document.delete('solr_bboxtype__maxY')
|
41
|
+
document
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
attr_reader :site, :site_key
|
47
|
+
|
48
|
+
# Looks up the settings for one configured site.
# @param site_key key used to fetch the site's entry from the class-level config
# @raise [ArgumentError] when the config has no (truthy) entry for site_key
def initialize(site_key)
|
49
|
+
@site_key = site_key
|
50
|
+
@site = self.class.config[site_key]
|
51
|
+
|
52
|
+
raise ArgumentError, "Site key #{@site_key.inspect} is not configured for #{self.class.name}" unless @site
|
53
|
+
end
|
54
|
+
|
55
|
+
# Harvests every page of results from the configured site and sends each
# page of documents to Solr.
#
# The first page is fetched here to detect the response format; the chosen
# response class then pages through the remaining results. Every document is
# passed through the class-level document transformer, and documents the
# transformer turns into nil are dropped before the page is sent.
def index
  puts "Fetching page 1 @ #{base_url}&page=1" if self.class.config[:debug]
  response = JSON.parse(Net::HTTP.get(URI("#{base_url}&page=1")))
  response_class = BlacklightResponseVersionFactory.call(response)

  response_class.new(response: response, base_url: base_url).documents.each do |docs|
    docs.map! do |document|
      self.class.document_transformer&.call(document)
    end
    # compact! (not compact) so the nil entries are really removed from the
    # array that is serialized below.
    docs.compact!

    puts "Adding #{docs.count} documents to solr" if self.class.config[:debug]
    solr_connection.update params: { commitWithin: commit_within, overwrite: true },
                           data: docs.to_json,
                           headers: { 'Content-Type' => 'application/json' }

    # Optional politeness delay between pages.
    sleep(crawl_delay.to_i) if crawl_delay
  end
end
|
73
|
+
|
74
|
+
##
|
75
|
+
# A "factory" class to determine the blacklight response version to use
|
76
|
+
class BlacklightResponseVersionFactory
  # Picks the response class matching the shape of a parsed JSON response.
  #
  # @param json [Hash] parsed JSON of the first page of results
  # @return [Class] LegacyBlacklightResponse when a top-level "response" key
  #   is present, ModernBlacklightResponse when both "links" and "data" are
  # @raise [NotImplementedError] for any other payload, including JSON that
  #   did not parse to a Hash (e.g. an Array or nil)
  def self.call(json)
    if json.is_a?(Hash)
      return LegacyBlacklightResponse if json.key?('response')
      return ModernBlacklightResponse if %w[links data].all? { |param| json.key?(param) }
    end

    raise NotImplementedError,
          "The following json response was not able to be parsed by the GeoBlacklightHarvester\n#{json}"
  end
end
|
89
|
+
|
90
|
+
##
# Class to return documents from Blacklight JSON responses that carry their
# results under a top-level "response" key
class LegacyBlacklightResponse
  attr_reader :base_url
  attr_accessor :response, :page

  # @param response [Hash] parsed JSON of the first page of results
  # @param base_url [String] search URL; "&page=N" is appended for later pages
  def initialize(response:, base_url:)
    @base_url = base_url
    @response = response
    @page = 1
  end

  # Yields the "docs" Array of each page of results, fetching the following
  # pages as it goes. Returns an Enumerator when called without a block.
  #
  # Paging stops after the last page, or as soon as a page cannot be fetched
  # or parsed (the failure is printed and the remaining pages are skipped).
  def documents
    return enum_for(:documents) unless block_given?

    while current_page && total_pages && (current_page <= total_pages)
      yield response.dig('response', 'docs')

      break if current_page == total_pages

      self.page += 1
      puts "Fetching page #{page} @ #{url}" if GeoCombine::GeoBlacklightHarvester.config[:debug]

      begin
        self.response = JSON.parse(Net::HTTP.get(URI(url)))
      rescue StandardError => e
        puts "Request for #{url} failed with #{e}"
        # A nil response makes current_page nil, which ends the loop above.
        self.response = nil
      end
    end
  end

  private

  # URL of the page currently being requested.
  def url
    "#{base_url}&page=#{page}"
  end

  # Page number reported by the current response; nil when there is no
  # response (failed request) or the pagination info is missing.
  def current_page
    response&.dig('response', 'pages', 'current_page')
  end

  # Total number of pages reported by the current response; nil when there
  # is no response or the pagination info is missing.
  def total_pages
    response&.dig('response', 'pages', 'total_pages')
  end
end
|
134
|
+
|
135
|
+
##
|
136
|
+
# Class to return documents from the Blacklight API (v7 and above)
|
137
|
+
class ModernBlacklightResponse
  attr_reader :base_url
  attr_accessor :response, :page

  # @param response [Hash] parsed JSON of the first page of results
  # @param base_url [String] search URL the first page was fetched from
  def initialize(response:, base_url:)
    @base_url = base_url
    @response = response
    @page = 1
  end

  # Yields one Array of raw documents per page of results. Each document is
  # fetched individually from the "self" link of an entry in the page's
  # "data" list. Returns an Enumerator when called without a block.
  #
  # Paging follows the response's "next" link and stops when there is none,
  # when a page has no "data" entries, or when a page cannot be fetched or
  # parsed (the failure is printed and the remaining pages are skipped).
  def documents
    return enum_for(:documents) unless block_given?

    while (records = page_records).any?
      document_urls = records.map { |data| data.dig('links', 'self') }.compact

      yield documents_from_urls(document_urls)

      url = response.dig('links', 'next')
      break unless url

      url = "#{url}&format=json"
      self.page += 1
      puts "Fetching page #{page} @ #{url}" if GeoCombine::GeoBlacklightHarvester.config[:debug]
      begin
        self.response = JSON.parse(Net::HTTP.get(URI(url)))
      rescue StandardError => e
        puts "Request for #{url} failed with #{e}"
        self.response = nil
      end
    end
  end

  private

  # Entries of the current page's "data" list. Returns an empty Array when
  # there is no response (failed request) or the payload has no "data" Array
  # (e.g. an error document), so the paging loop ends instead of raising.
  def page_records
    data = response.is_a?(Hash) ? response['data'] : nil
    data.is_a?(Array) ? data : []
  end

  # Fetches "<url>/raw" for every given document URL and returns the parsed
  # documents; documents that fail to download or parse are reported and
  # left out.
  def documents_from_urls(urls)
    puts "Fetching #{urls.count} documents for page #{page}" if GeoCombine::GeoBlacklightHarvester.config[:debug]
    urls.map do |url|
      JSON.parse(Net::HTTP.get(URI("#{url}/raw")))
    rescue StandardError => e
      puts "Fetching \"#{url}/raw\" failed with #{e}"

      nil
    end.compact
  end
end
|
183
|
+
|
184
|
+
private
|
185
|
+
|
186
|
+
# Search URL for the site: its :host followed by "?" and default_params
# serialized with #to_query. Callers append "&page=N" to this value.
def base_url
|
187
|
+
"#{site[:host]}?#{default_params.to_query}"
|
188
|
+
end
|
189
|
+
|
190
|
+
# Connects to the Solr core at ENV['SOLR_URL'], falling back to the local
# blacklight-core URL below. Nothing is memoized here, so every call asks
# RSolr for a connection again.
def solr_connection
|
191
|
+
solr_url = ENV['SOLR_URL'] || 'http://127.0.0.1:8983/solr/blacklight-core'
|
192
|
+
|
193
|
+
RSolr.connect url: solr_url, adapter: :net_http_persistent
|
194
|
+
end
|
195
|
+
|
196
|
+
# Value sent as Solr's commitWithin parameter: the configured
# :commit_within, or the String '5000' when none is configured.
def commit_within
|
197
|
+
self.class.config[:commit_within] || '5000'
|
198
|
+
end
|
199
|
+
|
200
|
+
# Delay applied after each page of results: the site's own :crawl_delay
# takes precedence over the global config[:crawl_delay]; nil when neither
# is set. The caller converts the value with #to_i before sleeping.
def crawl_delay
|
201
|
+
site[:crawl_delay] || self.class.config[:crawl_delay]
|
202
|
+
end
|
203
|
+
|
204
|
+
# Query parameters sent with every search request: 100 results per page in
# JSON format, overridden/extended by the site's own :params.
# A site configured without :params simply gets the defaults.
# @return [Hash]
def default_params
  {
    per_page: 100,
    format: :json
  }.merge(site[:params] || {})
end
|
210
|
+
end
|
211
|
+
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'active_support/core_ext/object/blank'
|
2
4
|
require 'active_support/core_ext/hash/except'
|
3
5
|
require 'open-uri'
|
@@ -10,8 +12,8 @@ module GeoCombine
|
|
10
12
|
|
11
13
|
attr_reader :metadata
|
12
14
|
|
13
|
-
GEOBLACKLIGHT_VERSION = '
|
14
|
-
SCHEMA_JSON_URL = "https://raw.githubusercontent.com/geoblacklight/geoblacklight
|
15
|
+
GEOBLACKLIGHT_VERSION = '1.0'
|
16
|
+
SCHEMA_JSON_URL = "https://raw.githubusercontent.com/geoblacklight/geoblacklight/main/schema/geoblacklight-schema-#{GEOBLACKLIGHT_VERSION}.json"
|
15
17
|
DEPRECATED_KEYS_V1 = %w[
|
16
18
|
uuid
|
17
19
|
georss_polygon_s
|
@@ -29,7 +31,6 @@ module GeoCombine
|
|
29
31
|
# @param [Hash] fields enhancements to metadata that are merged with @metadata
|
30
32
|
def initialize(metadata, fields = {})
|
31
33
|
@metadata = JSON.parse(metadata).merge(fields)
|
32
|
-
@schema = nil
|
33
34
|
end
|
34
35
|
|
35
36
|
##
|
@@ -58,8 +59,7 @@ module GeoCombine
|
|
58
59
|
# Validates a GeoBlacklight-Schema json document
|
59
60
|
# @return [Boolean]
|
60
61
|
def valid?
|
61
|
-
|
62
|
-
JSON::Validator.validate!(@schema, to_json, fragment: '#/properties/layer') &&
|
62
|
+
JSON::Validator.validate!(schema, to_json, fragment: '#/definitions/layer') &&
|
63
63
|
dct_references_validate! &&
|
64
64
|
spatial_validate!
|
65
65
|
end
|
@@ -69,9 +69,14 @@ module GeoCombine
|
|
69
69
|
# @return [Boolean]
|
70
70
|
def dct_references_validate!
|
71
71
|
return true unless metadata.key?('dct_references_s') # TODO: shouldn't we require this field?
|
72
|
+
|
72
73
|
begin
|
73
74
|
ref = JSON.parse(metadata['dct_references_s'])
|
74
|
-
|
75
|
+
unless ref.is_a?(Hash)
|
76
|
+
raise GeoCombine::Exceptions::InvalidDCTReferences,
|
77
|
+
'dct_references must be parsed to a Hash'
|
78
|
+
end
|
79
|
+
|
75
80
|
true
|
76
81
|
rescue JSON::ParserError => e
|
77
82
|
raise e, "Invalid JSON in dct_references_s: #{e.message}"
|
@@ -89,6 +94,7 @@ module GeoCombine
|
|
89
94
|
# GeoBlacklight-Schema format
|
90
95
|
def translate_formats(key, value)
|
91
96
|
return unless key == 'dc_format_s' && formats.include?(value)
|
97
|
+
|
92
98
|
metadata[key] = formats[value]
|
93
99
|
end
|
94
100
|
|
@@ -96,6 +102,7 @@ module GeoCombine
|
|
96
102
|
# Enhances the 'layer_geom_type_s' field by translating from known types
|
97
103
|
def translate_geometry_type(key, value)
|
98
104
|
return unless key == 'layer_geom_type_s' && geometry_types.include?(value)
|
105
|
+
|
99
106
|
metadata[key] = geometry_types[value]
|
100
107
|
end
|
101
108
|
|
@@ -104,6 +111,7 @@ module GeoCombine
|
|
104
111
|
# categories
|
105
112
|
def enhance_subjects(key, value)
|
106
113
|
return unless key == 'dc_subject_sm'
|
114
|
+
|
107
115
|
metadata[key] = value.map do |val|
|
108
116
|
if subjects.include?(val)
|
109
117
|
subjects[val]
|
@@ -118,11 +126,13 @@ module GeoCombine
|
|
118
126
|
# and ISO8601 (for indexing into Solr)
|
119
127
|
def format_proper_date(key, value)
|
120
128
|
return unless key == 'layer_modified_dt'
|
129
|
+
|
121
130
|
metadata[key] = Time.parse(value).utc.iso8601
|
122
131
|
end
|
123
132
|
|
124
133
|
def fields_should_be_array(key, value)
|
125
134
|
return unless should_be_array.include?(key) && !value.is_a?(Array)
|
135
|
+
|
126
136
|
metadata[key] = [value]
|
127
137
|
end
|
128
138
|
|
@@ -155,5 +165,9 @@ module GeoCombine
|
|
155
165
|
# ensure we have a proper v1 record
|
156
166
|
valid?
|
157
167
|
end
|
168
|
+
|
169
|
+
def schema
|
170
|
+
@schema ||= JSON.parse(URI.open(SCHEMA_JSON_URL).read)
|
171
|
+
end
|
158
172
|
end
|
159
173
|
end
|
data/lib/geo_combine/iso19139.rb
CHANGED
data/lib/geo_combine/ogp.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'active_support/core_ext/object/blank'
|
2
4
|
require 'cgi'
|
3
5
|
|
@@ -77,15 +79,13 @@ module GeoCombine
|
|
77
79
|
end
|
78
80
|
|
79
81
|
def date
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
nil
|
84
|
-
end
|
82
|
+
DateTime.rfc3339(metadata['ContentDate'])
|
83
|
+
rescue StandardError
|
84
|
+
nil
|
85
85
|
end
|
86
86
|
|
87
87
|
def year
|
88
|
-
date
|
88
|
+
date&.year
|
89
89
|
end
|
90
90
|
|
91
91
|
##
|
@@ -104,9 +104,9 @@ module GeoCombine
|
|
104
104
|
def ogp_formats
|
105
105
|
case metadata['DataType']
|
106
106
|
when 'Paper Map', 'Raster'
|
107
|
-
|
107
|
+
'GeoTIFF'
|
108
108
|
when 'Polygon', 'Point', 'Line'
|
109
|
-
|
109
|
+
'Shapefile'
|
110
110
|
else
|
111
111
|
raise ArgumentError, metadata['DataType']
|
112
112
|
end
|
@@ -128,6 +128,7 @@ module GeoCombine
|
|
128
128
|
north >= -90 && north <= 90 &&
|
129
129
|
south >= -90 && south <= 90 &&
|
130
130
|
west <= east && south <= north
|
131
|
+
|
131
132
|
"ENVELOPE(#{west}, #{east}, #{north}, #{south})"
|
132
133
|
end
|
133
134
|
|
@@ -165,6 +166,7 @@ module GeoCombine
|
|
165
166
|
|
166
167
|
def download_uri
|
167
168
|
return 'http://schema.org/DownloadAction' if institution == 'Harvard'
|
169
|
+
|
168
170
|
'http://schema.org/downloadUrl'
|
169
171
|
end
|
170
172
|
|
@@ -205,7 +207,7 @@ module GeoCombine
|
|
205
207
|
sluggify(filter_name(name))
|
206
208
|
end
|
207
209
|
|
208
|
-
|
210
|
+
SLUG_STRIP_VALUES = %w[
|
209
211
|
SDE_DATA.
|
210
212
|
SDE.
|
211
213
|
SDE2.
|
@@ -216,8 +218,8 @@ module GeoCombine
|
|
216
218
|
|
217
219
|
def filter_name(name)
|
218
220
|
# strip out schema and usernames
|
219
|
-
|
220
|
-
name.sub!(
|
221
|
+
SLUG_STRIP_VALUES.each do |strip_val|
|
222
|
+
name.sub!(strip_val, '')
|
221
223
|
end
|
222
224
|
unless name.size > 1
|
223
225
|
# use first word of title if name is empty
|
data/lib/geo_combine/railtie.rb
CHANGED
data/lib/geo_combine/subjects.rb
CHANGED
data/lib/geo_combine/version.rb
CHANGED
data/lib/geo_combine.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'nokogiri'
|
2
4
|
require 'json'
|
3
5
|
require 'json-schema'
|
4
6
|
require 'sanitize'
|
5
7
|
|
6
8
|
module GeoCombine
|
7
|
-
|
8
9
|
##
|
9
10
|
# TODO: Create a parse method that can interpret the type of metadata being
|
10
11
|
# passed in.
|
@@ -23,7 +24,7 @@ module GeoCombine
|
|
23
24
|
# @param [String] metadata can be a File path
|
24
25
|
# "./tmp/edu.stanford.purl/bb/338/jh/0716/iso19139.xml" or a String of XML
|
25
26
|
# metadata
|
26
|
-
def initialize
|
27
|
+
def initialize(metadata)
|
27
28
|
metadata = File.read metadata if File.readable? metadata
|
28
29
|
metadata = Nokogiri::XML(metadata) if metadata.instance_of? String
|
29
30
|
@metadata = metadata
|
@@ -35,7 +36,7 @@ module GeoCombine
|
|
35
36
|
# GeoCombine::Geoblacklight on its instantiation
|
36
37
|
# @return [GeoCombine::Geoblacklight] the data transformed into
|
37
38
|
# geoblacklight schema, returned as a GeoCombine::Geoblacklight
|
38
|
-
def to_geoblacklight
|
39
|
+
def to_geoblacklight(fields = {})
|
39
40
|
GeoCombine::Geoblacklight.new(xsl_geoblacklight.apply_to(@metadata), fields)
|
40
41
|
end
|
41
42
|
|
@@ -68,6 +69,9 @@ require 'geo_combine/esri_open_data'
|
|
68
69
|
require 'geo_combine/ckan_metadata'
|
69
70
|
require 'geo_combine/ogp'
|
70
71
|
|
72
|
+
# Require harvesting/indexing files
|
73
|
+
require 'geo_combine/geo_blacklight_harvester'
|
74
|
+
|
71
75
|
# Require gem files
|
72
76
|
require 'geo_combine/version'
|
73
77
|
require 'geo_combine/railtie' if defined?(Rails)
|
data/lib/tasks/geo_combine.rake
CHANGED
@@ -1,16 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'net/http'
|
2
4
|
require 'json'
|
3
5
|
require 'rsolr'
|
4
6
|
require 'find'
|
7
|
+
require 'geo_combine/geo_blacklight_harvester'
|
5
8
|
|
6
9
|
namespace :geocombine do
|
7
|
-
commit_within = (ENV['SOLR_COMMIT_WITHIN'] || 5000).to_i
|
8
|
-
ogm_path = ENV['OGM_PATH'] || 'tmp/opengeometadata'
|
9
|
-
solr_url = ENV['SOLR_URL'] || 'http://127.0.0.1:8983/solr/blacklight-core'
|
10
|
-
whitelist = %w[
|
11
|
-
https://github.com/OpenGeoMetadata/big-ten.git
|
12
|
-
]
|
13
|
-
|
14
10
|
desc 'Clone OpenGeoMetadata repositories'
|
15
11
|
task :clone, [:repo] do |_t, args|
|
16
12
|
if args.repo
|
@@ -18,46 +14,81 @@ namespace :geocombine do
|
|
18
14
|
else
|
19
15
|
ogm_api_uri = URI('https://api.github.com/orgs/opengeometadata/repos')
|
20
16
|
ogm_repos = JSON.parse(Net::HTTP.get(ogm_api_uri)).map do |repo|
|
21
|
-
repo['clone_url'] if repo['size']
|
17
|
+
repo['clone_url'] if (repo['size']).positive?
|
22
18
|
end.compact
|
23
|
-
ogm_repos.
|
19
|
+
ogm_repos.reject! { |repo| GeoCombineRake.denylist.include?(repo) }
|
24
20
|
end
|
25
21
|
ogm_repos.each do |repo|
|
26
|
-
system "echo #{repo} && mkdir -p #{ogm_path} && cd #{ogm_path} && git clone --depth 1 #{repo}"
|
22
|
+
Kernel.system "echo #{repo} && mkdir -p #{GeoCombineRake.ogm_path} && cd #{GeoCombineRake.ogm_path} && git clone --depth 1 #{repo}"
|
27
23
|
end
|
28
24
|
end
|
29
25
|
|
30
26
|
desc '"git pull" OpenGeoMetadata repositories'
|
31
27
|
task :pull, [:repo] do |_t, args|
|
32
28
|
paths = if args.repo
|
33
|
-
[File.join(ogm_path, args.repo)]
|
29
|
+
[File.join(GeoCombineRake.ogm_path, args.repo)]
|
34
30
|
else
|
35
|
-
Dir.glob("#{ogm_path}/*")
|
31
|
+
Dir.glob("#{GeoCombineRake.ogm_path}/*")
|
36
32
|
end
|
37
33
|
paths.each do |path|
|
38
34
|
next unless File.directory?(path)
|
39
|
-
|
35
|
+
|
36
|
+
Kernel.system "echo #{path} && cd #{path} && git pull origin"
|
40
37
|
end
|
41
38
|
end
|
42
39
|
|
43
|
-
desc 'Index all
|
40
|
+
desc 'Index all JSON documents except Layers.json'
|
44
41
|
task :index do
|
45
|
-
puts "Indexing #{ogm_path} into #{solr_url}"
|
46
|
-
solr = RSolr.connect url: solr_url, adapter: :net_http_persistent
|
47
|
-
Find.find(ogm_path) do |path|
|
48
|
-
next unless File.basename(path)
|
42
|
+
puts "Indexing #{GeoCombineRake.ogm_path} into #{GeoCombineRake.solr_url}"
|
43
|
+
solr = RSolr.connect url: GeoCombineRake.solr_url, adapter: :net_http_persistent
|
44
|
+
Find.find(GeoCombineRake.ogm_path) do |path|
|
45
|
+
next unless File.basename(path).include?('.json') && File.basename(path) != 'layers.json'
|
46
|
+
|
49
47
|
doc = JSON.parse(File.read(path))
|
50
48
|
[doc].flatten.each do |record|
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
puts error
|
58
|
-
end
|
49
|
+
puts "Indexing #{record['layer_slug_s']}: #{path}" if $DEBUG
|
50
|
+
solr.update params: { commitWithin: GeoCombineRake.commit_within, overwrite: true },
|
51
|
+
data: [record].to_json,
|
52
|
+
headers: { 'Content-Type' => 'application/json' }
|
53
|
+
rescue RSolr::Error::Http => e
|
54
|
+
puts e
|
59
55
|
end
|
60
56
|
end
|
61
57
|
solr.commit
|
62
58
|
end
|
59
|
+
|
60
|
+
namespace :geoblacklight_harvester do
|
61
|
+
desc 'Harvest documents from a configured GeoBlacklight instance'
|
62
|
+
task :index, [:site] => [:environment] do |_t, args|
|
63
|
+
raise ArgumentError, 'A site argument is required' unless args.site
|
64
|
+
|
65
|
+
GeoCombine::GeoBlacklightHarvester.new(args.site.to_sym).index
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
# Class to hold helper methods for use in GeoCombine rake tasks
|
71
|
+
class GeoCombineRake
|
72
|
+
# Solr commitWithin value as an Integer: ENV['SOLR_COMMIT_WITHIN'] converted
# with #to_i, or 5000 when the variable is unset.
def self.commit_within
|
73
|
+
(ENV['SOLR_COMMIT_WITHIN'] || 5000).to_i
|
74
|
+
end
|
75
|
+
|
76
|
+
# Clone URLs of OpenGeoMetadata repositories that the clone task removes
# from the list fetched from the GitHub API.
def self.denylist
|
77
|
+
[
|
78
|
+
'https://github.com/OpenGeoMetadata/GeoCombine.git',
|
79
|
+
'https://github.com/OpenGeoMetadata/aardvark.git',
|
80
|
+
'https://github.com/OpenGeoMetadata/metadata-issues.git',
|
81
|
+
'https://github.com/OpenGeoMetadata/ogm_utils-python.git',
|
82
|
+
'https://github.com/OpenGeoMetadata/opengeometadata.github.io.git',
|
83
|
+
'https://github.com/OpenGeoMetadata/opengeometadata-rails.git'
|
84
|
+
]
|
85
|
+
end
|
86
|
+
|
87
|
+
# Directory the repositories are cloned into and indexed from:
# ENV['OGM_PATH'], or 'tmp/opengeometadata' when unset.
def self.ogm_path
|
88
|
+
ENV['OGM_PATH'] || 'tmp/opengeometadata'
|
89
|
+
end
|
90
|
+
|
91
|
+
# URL of the Solr core to index into: ENV['SOLR_URL'], or the local
# blacklight-core URL below when unset.
def self.solr_url
|
92
|
+
ENV['SOLR_URL'] || 'http://127.0.0.1:8983/solr/blacklight-core'
|
93
|
+
end
|
63
94
|
end
|