geo_combine 0.4.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.github/workflows/ruby.yml +53 -0
- data/.gitignore +2 -0
- data/.rubocop.yml +20 -0
- data/.rubocop_todo.yml +165 -0
- data/Gemfile +3 -1
- data/README.md +80 -1
- data/Rakefile +4 -2
- data/bin/geocombine +1 -0
- data/geo_combine.gemspec +5 -0
- data/lib/geo_combine/bounding_box.rb +7 -1
- data/lib/geo_combine/ckan_metadata.rb +10 -8
- data/lib/geo_combine/cli.rb +3 -1
- data/lib/geo_combine/esri_open_data.rb +2 -0
- data/lib/geo_combine/exceptions.rb +3 -0
- data/lib/geo_combine/fgdc.rb +2 -2
- data/lib/geo_combine/formats.rb +2 -0
- data/lib/geo_combine/formatting.rb +3 -1
- data/lib/geo_combine/geo_blacklight_harvester.rb +211 -0
- data/lib/geo_combine/geoblacklight.rb +20 -6
- data/lib/geo_combine/geometry_types.rb +2 -0
- data/lib/geo_combine/iso19139.rb +2 -1
- data/lib/geo_combine/ogp.rb +13 -11
- data/lib/geo_combine/railtie.rb +2 -0
- data/lib/geo_combine/subjects.rb +2 -0
- data/lib/geo_combine/version.rb +3 -1
- data/lib/geo_combine.rb +7 -3
- data/lib/tasks/geo_combine.rake +57 -26
- data/lib/xslt/fgdc2html.xsl +38 -9
- data/lib/xslt/iso2html.xsl +1107 -1070
- data/spec/features/fgdc2html_spec.rb +53 -1
- data/spec/features/iso2html_spec.rb +17 -2
- data/spec/fixtures/docs/princeton_fgdc.xml +374 -0
- data/spec/fixtures/docs/repos.json +3224 -0
- data/spec/fixtures/docs/simple_xml.xml +10 -0
- data/spec/fixtures/docs/simple_xslt.xsl +11 -0
- data/spec/fixtures/docs/stanford_iso.xml +652 -0
- data/spec/fixtures/docs/tufts_fgdc.xml +977 -0
- data/spec/fixtures/indexing/basic_geoblacklight.json +27 -0
- data/spec/fixtures/indexing/geoblacklight.json +33 -0
- data/spec/fixtures/indexing/layers.json +16119 -0
- data/spec/fixtures/indexing/test.txt +1 -0
- data/spec/fixtures/json_docs.rb +2 -0
- data/spec/fixtures/xml_docs.rb +9 -1659
- data/spec/helpers.rb +7 -7
- data/spec/lib/geo_combine/bounding_box_spec.rb +18 -0
- data/spec/lib/geo_combine/ckan_metadata_spec.rb +34 -11
- data/spec/lib/geo_combine/esri_open_data_spec.rb +23 -2
- data/spec/lib/geo_combine/fgdc_spec.rb +41 -10
- data/spec/lib/geo_combine/formatting_spec.rb +13 -5
- data/spec/lib/geo_combine/geo_blacklight_harvester_spec.rb +194 -0
- data/spec/lib/geo_combine/geoblacklight_spec.rb +41 -11
- data/spec/lib/geo_combine/iso19139_spec.rb +26 -14
- data/spec/lib/geo_combine/ogp_spec.rb +28 -8
- data/spec/lib/geo_combine_spec.rb +7 -4
- data/spec/lib/tasks/geo_combine_spec.rb +45 -0
- data/spec/spec_helper.rb +19 -84
- data/spec/support/fixtures.rb +9 -0
- metadata +103 -6
- data/.coveralls.yml +0 -1
- data/.travis.yml +0 -7
data/lib/geo_combine/geo_blacklight_harvester.rb ADDED
```diff
@@ -0,0 +1,211 @@
+# frozen_string_literal: true
+
+module GeoCombine
+  ##
+  # A class to harvest and index results from GeoBlacklight sites
+  # You can configure the sites to be harvested via a configure command.
+  # GeoCombine::GeoBlacklightHarvester.configure do
+  #   {
+  #     SITE: { host: 'https://example.com', params: { f: { dct_provenance_s: ['SITE'] } } }
+  #   }
+  # end
+  # The class configuration also allows for various other things to be configured:
+  # - A debug parameter to print out details of what is being harvested and indexed
+  # - crawl delays for each page of results (globally or on a per site basis)
+  # - Solr's commitWithin parameter (defaults to 5000)
+  # - A document transformer proc to modify a document before indexing (defaults to removing _version_, score, and timestamp)
+  # Example: GeoCombine::GeoBlacklightHarvester.new('SITE').index
+  class GeoBlacklightHarvester
+    require 'active_support/core_ext/object/to_query'
+
+    class << self
+      attr_writer :document_transformer
+
+      def configure(&block)
+        @config = yield block
+      end
+
+      def config
+        @config || {}
+      end
+
+      def document_transformer
+        @document_transformer || lambda do |document|
+          document.delete('_version_')
+          document.delete('score')
+          document.delete('timestamp')
+          document.delete('solr_bboxtype__minX')
+          document.delete('solr_bboxtype__minY')
+          document.delete('solr_bboxtype__maxX')
+          document.delete('solr_bboxtype__maxY')
+          document
+        end
+      end
+    end
+
+    attr_reader :site, :site_key
+
+    def initialize(site_key)
+      @site_key = site_key
+      @site = self.class.config[site_key]
+
+      raise ArgumentError, "Site key #{@site_key.inspect} is not configured for #{self.class.name}" unless @site
+    end
+
+    def index
+      puts "Fetching page 1 @ #{base_url}&page=1" if self.class.config[:debug]
+      response = JSON.parse(Net::HTTP.get(URI("#{base_url}&page=1")))
+      response_class = BlacklightResponseVersionFactory.call(response)
+
+      response_class.new(response: response, base_url: base_url).documents.each do |docs|
+        docs.map! do |document|
+          self.class.document_transformer&.call(document)
+        end.compact
+
+        puts "Adding #{docs.count} documents to solr" if self.class.config[:debug]
+        solr_connection.update params: { commitWithin: commit_within, overwrite: true },
+                               data: docs.to_json,
+                               headers: { 'Content-Type' => 'application/json' }
+
+        sleep(crawl_delay.to_i) if crawl_delay
+      end
+    end
+
+    ##
+    # A "factory" class to determine the blacklight response version to use
+    class BlacklightResponseVersionFactory
+      def self.call(json)
+        keys = json.keys
+        if keys.include?('response')
+          LegacyBlacklightResponse
+        elsif keys.any? && %w[links data].all? { |param| keys.include?(param) }
+          ModernBlacklightResponse
+        else
+          raise NotImplementedError,
+                "The following json response was not able to be parsed by the GeoBlacklightHarvester\n#{json}"
+        end
+      end
+    end
+
+    class LegacyBlacklightResponse
+      attr_reader :base_url
+      attr_accessor :response, :page
+
+      def initialize(response:, base_url:)
+        @base_url = base_url
+        @response = response
+        @page = 1
+      end
+
+      def documents
+        return enum_for(:documents) unless block_given?
+
+        while current_page && total_pages && (current_page <= total_pages)
+          yield response.dig('response', 'docs')
+
+          break if current_page == total_pages
+
+          self.page += 1
+          puts "Fetching page #{page} @ #{url}" if GeoCombine::GeoBlacklightHarvester.config[:debug]
+
+          begin
+            self.response = JSON.parse(Net::HTTP.get(URI(url)))
+          rescue StandardError => e
+            puts "Request for #{url} failed with #{e}"
+            self.response = nil
+          end
+        end
+      end
+
+      private
+
+      def url
+        "#{base_url}&page=#{page}"
+      end
+
+      def current_page
+        response.dig('response', 'pages', 'current_page')
+      end
+
+      def total_pages
+        response.dig('response', 'pages', 'total_pages')
+      end
+    end
+
+    ##
+    # Class to return documents from the Blacklight API (v7 and above)
+    class ModernBlacklightResponse
+      attr_reader :base_url
+      attr_accessor :response, :page
+
+      def initialize(response:, base_url:)
+        @base_url = base_url
+        @response = response
+        @page = 1
+      end
+
+      def documents
+        return enum_for(:documents) unless block_given?
+
+        while response && response['data'].any?
+          document_urls = response['data'].collect { |data| data.dig('links', 'self') }.compact
+
+          yield documents_from_urls(document_urls)
+
+          url = response.dig('links', 'next')
+          break unless url
+
+          url = "#{url}&format=json"
+          self.page += 1
+          puts "Fetching page #{page} @ #{url}" if GeoCombine::GeoBlacklightHarvester.config[:debug]
+          begin
+            self.response = JSON.parse(Net::HTTP.get(URI(url)))
+          rescue StandardError => e
+            puts "Request for #{url} failed with #{e}"
+            self.response = nil
+          end
+        end
+      end
+
+      private
+
+      def documents_from_urls(urls)
+        puts "Fetching #{urls.count} documents for page #{page}" if GeoCombine::GeoBlacklightHarvester.config[:debug]
+        urls.map do |url|
+          JSON.parse(Net::HTTP.get(URI("#{url}/raw")))
+        rescue StandardError => e
+          puts "Fetching \"#{url}/raw\" failed with #{e}"
+
+          nil
+        end.compact
+      end
+    end
+
+    private
+
+    def base_url
+      "#{site[:host]}?#{default_params.to_query}"
+    end
+
+    def solr_connection
+      solr_url = ENV['SOLR_URL'] || 'http://127.0.0.1:8983/solr/blacklight-core'
+
+      RSolr.connect url: solr_url, adapter: :net_http_persistent
+    end
+
+    def commit_within
+      self.class.config[:commit_within] || '5000'
+    end
+
+    def crawl_delay
+      site[:crawl_delay] || self.class.config[:crawl_delay]
+    end
+
+    def default_params
+      {
+        per_page: 100,
+        format: :json
+      }.merge(site[:params])
+    end
+  end
+end
```
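The class comments above double as usage documentation. A minimal sketch of configuring and harvesting one site, assuming a GeoBlacklight instance at example.com and the default local Solr core (the `:SITE` key, host, and facet params are placeholders):

```ruby
require 'geo_combine/geo_blacklight_harvester'

# Placeholder site configuration; the block's return value becomes the
# class-level config, so top-level keys like :debug apply across sites.
GeoCombine::GeoBlacklightHarvester.configure do
  {
    SITE: {
      host: 'https://example.com',
      params: { f: { dct_provenance_s: ['SITE'] } },
      crawl_delay: 1 # optional per-site delay (seconds) between page fetches
    },
    debug: true,          # log each page fetch and Solr batch
    commit_within: '5000' # Solr commitWithin in milliseconds (the default)
  }
end

# Pages through the site's JSON API and posts each batch of documents to
# the Solr core named by ENV['SOLR_URL'].
GeoCombine::GeoBlacklightHarvester.new(:SITE).index
```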
data/lib/geo_combine/geoblacklight.rb CHANGED
```diff
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 require 'active_support/core_ext/object/blank'
 require 'active_support/core_ext/hash/except'
 require 'open-uri'
@@ -10,8 +12,8 @@ module GeoCombine
 
     attr_reader :metadata
 
-    GEOBLACKLIGHT_VERSION = '
-    SCHEMA_JSON_URL = "https://raw.githubusercontent.com/geoblacklight/geoblacklight
+    GEOBLACKLIGHT_VERSION = '1.0'
+    SCHEMA_JSON_URL = "https://raw.githubusercontent.com/geoblacklight/geoblacklight/main/schema/geoblacklight-schema-#{GEOBLACKLIGHT_VERSION}.json"
     DEPRECATED_KEYS_V1 = %w[
       uuid
       georss_polygon_s
@@ -29,7 +31,6 @@ module GeoCombine
     # @param [Hash] fields enhancements to metadata that are merged with @metadata
     def initialize(metadata, fields = {})
       @metadata = JSON.parse(metadata).merge(fields)
-      @schema = nil
     end
 
     ##
@@ -58,8 +59,7 @@ module GeoCombine
     # Validates a GeoBlacklight-Schema json document
     # @return [Boolean]
     def valid?
-
-      JSON::Validator.validate!(@schema, to_json, fragment: '#/properties/layer') &&
+      JSON::Validator.validate!(schema, to_json, fragment: '#/definitions/layer') &&
         dct_references_validate! &&
         spatial_validate!
     end
@@ -69,9 +69,14 @@ module GeoCombine
     # @return [Boolean]
     def dct_references_validate!
       return true unless metadata.key?('dct_references_s') # TODO: shouldn't we require this field?
+
       begin
         ref = JSON.parse(metadata['dct_references_s'])
-
+        unless ref.is_a?(Hash)
+          raise GeoCombine::Exceptions::InvalidDCTReferences,
+                'dct_references must be parsed to a Hash'
+        end
+
         true
       rescue JSON::ParserError => e
         raise e, "Invalid JSON in dct_references_s: #{e.message}"
@@ -89,6 +94,7 @@ module GeoCombine
     # GeoBlacklight-Schema format
     def translate_formats(key, value)
       return unless key == 'dc_format_s' && formats.include?(value)
+
       metadata[key] = formats[value]
     end
 
@@ -96,6 +102,7 @@ module GeoCombine
     # Enhances the 'layer_geom_type_s' field by translating from known types
     def translate_geometry_type(key, value)
       return unless key == 'layer_geom_type_s' && geometry_types.include?(value)
+
       metadata[key] = geometry_types[value]
     end
 
@@ -104,6 +111,7 @@ module GeoCombine
     # categories
     def enhance_subjects(key, value)
       return unless key == 'dc_subject_sm'
+
       metadata[key] = value.map do |val|
         if subjects.include?(val)
           subjects[val]
@@ -118,11 +126,13 @@ module GeoCombine
     # and ISO8601 (for indexing into Solr)
     def format_proper_date(key, value)
       return unless key == 'layer_modified_dt'
+
       metadata[key] = Time.parse(value).utc.iso8601
     end
 
     def fields_should_be_array(key, value)
       return unless should_be_array.include?(key) && !value.is_a?(Array)
+
       metadata[key] = [value]
     end
 
@@ -155,5 +165,9 @@ module GeoCombine
       # ensure we have a proper v1 record
       valid?
     end
+
+    def schema
+      @schema ||= JSON.parse(URI.open(SCHEMA_JSON_URL).read)
+    end
   end
 end
```
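Two behavioral changes are worth noting here: the schema is no longer reset in the constructor but fetched lazily (and memoized) by the new `schema` method, and validation now targets the `#/definitions/layer` fragment of schema 1.0. A sketch of the resulting flow, using an illustrative record:

```ruby
require 'geo_combine'

# Illustrative, incomplete metadata; a real record carries the full
# GeoBlacklight 1.0 fields.
json = { 'layer_slug_s' => 'example-layer' }.to_json

# The fields hash is merged into the parsed metadata (see #initialize above).
record = GeoCombine::Geoblacklight.new(json, 'dct_provenance_s' => 'Example')

# The first #valid? call downloads the schema from SCHEMA_JSON_URL once,
# then validates against the '#/definitions/layer' fragment; note it raises
# (rather than returning false) on validation errors.
record.valid?
```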
data/lib/geo_combine/iso19139.rb CHANGED
data/lib/geo_combine/ogp.rb CHANGED
```diff
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 require 'active_support/core_ext/object/blank'
 require 'cgi'
 
@@ -77,15 +79,13 @@ module GeoCombine
     end
 
     def date
-
-
-
-        nil
-      end
+      DateTime.rfc3339(metadata['ContentDate'])
+    rescue StandardError
+      nil
     end
 
     def year
-      date
+      date&.year
     end
 
     ##
@@ -104,9 +104,9 @@ module GeoCombine
     def ogp_formats
       case metadata['DataType']
       when 'Paper Map', 'Raster'
-
+        'GeoTIFF'
       when 'Polygon', 'Point', 'Line'
-
+        'Shapefile'
       else
         raise ArgumentError, metadata['DataType']
       end
@@ -128,6 +128,7 @@ module GeoCombine
              north >= -90 && north <= 90 &&
              south >= -90 && south <= 90 &&
              west <= east && south <= north
+
       "ENVELOPE(#{west}, #{east}, #{north}, #{south})"
     end
 
@@ -165,6 +166,7 @@ module GeoCombine
 
     def download_uri
       return 'http://schema.org/DownloadAction' if institution == 'Harvard'
+
       'http://schema.org/downloadUrl'
     end
 
@@ -205,7 +207,7 @@ module GeoCombine
       sluggify(filter_name(name))
     end
 
-
+    SLUG_STRIP_VALUES = %w[
       SDE_DATA.
       SDE.
      SDE2.
@@ -216,8 +218,8 @@ module GeoCombine
 
     def filter_name(name)
       # strip out schema and usernames
-
-      name.sub!(
+      SLUG_STRIP_VALUES.each do |strip_val|
+        name.sub!(strip_val, '')
       end
       unless name.size > 1
         # use first word of title is empty name
```
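The rewritten #date is now an inline-rescue method: an unparseable or missing ContentDate yields nil rather than raising, and #year follows suit through safe navigation. A standalone equivalent of the new behavior:

```ruby
require 'date'

# Mirrors the new #date/#year pair: nil on bad input instead of an exception.
def parse_content_date(value)
  DateTime.rfc3339(value)
rescue StandardError
  nil
end

parse_content_date('2001-01-01T00:00:00Z')&.year # => 2001
parse_content_date('not-a-date')&.year           # => nil
```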
data/lib/geo_combine/railtie.rb CHANGED
data/lib/geo_combine/subjects.rb CHANGED
data/lib/geo_combine/version.rb CHANGED
data/lib/geo_combine.rb CHANGED
```diff
@@ -1,10 +1,11 @@
+# frozen_string_literal: true
+
 require 'nokogiri'
 require 'json'
 require 'json-schema'
 require 'sanitize'
 
 module GeoCombine
-
   ##
   # TODO: Create a parse method that can interpret the type of metadata being
   # passed in.
@@ -23,7 +24,7 @@ module GeoCombine
     # @param [String] metadata can be a File path
     # "./tmp/edu.stanford.purl/bb/338/jh/0716/iso19139.xml" or a String of XML
     # metadata
-    def initialize
+    def initialize(metadata)
      metadata = File.read metadata if File.readable? metadata
      metadata = Nokogiri::XML(metadata) if metadata.instance_of? String
      @metadata = metadata
@@ -35,7 +36,7 @@ module GeoCombine
     # GeoCombine::Geoblacklight on its instantiation
     # @return [GeoCombine::Geoblacklight] the data transformed into
     # geoblacklight schema, returned as a GeoCombine::Geoblacklight
-    def to_geoblacklight
+    def to_geoblacklight(fields = {})
       GeoCombine::Geoblacklight.new(xsl_geoblacklight.apply_to(@metadata), fields)
     end
 
@@ -68,6 +69,9 @@ require 'geo_combine/esri_open_data'
 require 'geo_combine/ckan_metadata'
 require 'geo_combine/ogp'
 
+# Require harvesting/indexing files
+require 'geo_combine/geo_blacklight_harvester'
+
 # Require gem files
 require 'geo_combine/version'
 require 'geo_combine/railtie' if defined?(Rails)
```
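The fields parameter added to #to_geoblacklight flows straight into GeoCombine::Geoblacklight.new, so callers can inject or override keys during conversion. A sketch reusing the file path from the doc comment above (the provenance value is illustrative):

```ruby
require 'geo_combine'

# Any readable ISO 19139 file works; this path comes from the doc comment.
iso = GeoCombine::Iso19139.new('./tmp/edu.stanford.purl/bb/338/jh/0716/iso19139.xml')

# Extra fields are merged into the transformed metadata before any
# enhancement or validation happens.
record = iso.to_geoblacklight('dct_provenance_s' => 'Stanford')
record.enhance_metadata
```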
data/lib/tasks/geo_combine.rake CHANGED
```diff
@@ -1,16 +1,12 @@
+# frozen_string_literal: true
+
 require 'net/http'
 require 'json'
 require 'rsolr'
 require 'find'
+require 'geo_combine/geo_blacklight_harvester'
 
 namespace :geocombine do
-  commit_within = (ENV['SOLR_COMMIT_WITHIN'] || 5000).to_i
-  ogm_path = ENV['OGM_PATH'] || 'tmp/opengeometadata'
-  solr_url = ENV['SOLR_URL'] || 'http://127.0.0.1:8983/solr/blacklight-core'
-  whitelist = %w[
-    https://github.com/OpenGeoMetadata/big-ten.git
-  ]
-
   desc 'Clone OpenGeoMetadata repositories'
   task :clone, [:repo] do |_t, args|
     if args.repo
@@ -18,46 +14,81 @@ namespace :geocombine do
     else
       ogm_api_uri = URI('https://api.github.com/orgs/opengeometadata/repos')
       ogm_repos = JSON.parse(Net::HTTP.get(ogm_api_uri)).map do |repo|
-        repo['clone_url'] if repo['size']
+        repo['clone_url'] if (repo['size']).positive?
       end.compact
-      ogm_repos.
+      ogm_repos.reject! { |repo| GeoCombineRake.denylist.include?(repo) }
     end
     ogm_repos.each do |repo|
-      system "echo #{repo} && mkdir -p #{ogm_path} && cd #{ogm_path} && git clone --depth 1 #{repo}"
+      Kernel.system "echo #{repo} && mkdir -p #{GeoCombineRake.ogm_path} && cd #{GeoCombineRake.ogm_path} && git clone --depth 1 #{repo}"
     end
   end
 
   desc '"git pull" OpenGeoMetadata repositories'
   task :pull, [:repo] do |_t, args|
     paths = if args.repo
-              [File.join(ogm_path, args.repo)]
+              [File.join(GeoCombineRake.ogm_path, args.repo)]
             else
-              Dir.glob("#{ogm_path}/*")
+              Dir.glob("#{GeoCombineRake.ogm_path}/*")
             end
     paths.each do |path|
       next unless File.directory?(path)
-
+
+      Kernel.system "echo #{path} && cd #{path} && git pull origin"
     end
   end
 
-  desc 'Index all
+  desc 'Index all JSON documents except Layers.json'
   task :index do
-    puts "Indexing #{ogm_path} into #{solr_url}"
-    solr = RSolr.connect url: solr_url, adapter: :net_http_persistent
-    Find.find(ogm_path) do |path|
-      next unless File.basename(path)
+    puts "Indexing #{GeoCombineRake.ogm_path} into #{GeoCombineRake.solr_url}"
+    solr = RSolr.connect url: GeoCombineRake.solr_url, adapter: :net_http_persistent
+    Find.find(GeoCombineRake.ogm_path) do |path|
+      next unless File.basename(path).include?('.json') && File.basename(path) != 'layers.json'
+
       doc = JSON.parse(File.read(path))
       [doc].flatten.each do |record|
-
-
-
-
-
-
-        puts error
-      end
+        puts "Indexing #{record['layer_slug_s']}: #{path}" if $DEBUG
+        solr.update params: { commitWithin: GeoCombineRake.commit_within, overwrite: true },
+                    data: [record].to_json,
+                    headers: { 'Content-Type' => 'application/json' }
+      rescue RSolr::Error::Http => e
+        puts e
       end
     end
     solr.commit
   end
+
+  namespace :geoblacklight_harvester do
+    desc 'Harvest documents from a configured GeoBlacklight instance'
+    task :index, [:site] => [:environment] do |_t, args|
+      raise ArgumentError, 'A site argument is required' unless args.site
+
+      GeoCombine::GeoBlacklightHarvester.new(args.site.to_sym).index
+    end
+  end
+end
+
+# Class to hold helper methods for use in GeoCombine rake tasks
+class GeoCombineRake
+  def self.commit_within
+    (ENV['SOLR_COMMIT_WITHIN'] || 5000).to_i
+  end
+
+  def self.denylist
+    [
+      'https://github.com/OpenGeoMetadata/GeoCombine.git',
+      'https://github.com/OpenGeoMetadata/aardvark.git',
+      'https://github.com/OpenGeoMetadata/metadata-issues.git',
+      'https://github.com/OpenGeoMetadata/ogm_utils-python.git',
+      'https://github.com/OpenGeoMetadata/opengeometadata.github.io.git',
+      'https://github.com/OpenGeoMetadata/opengeometadata-rails.git'
+    ]
+  end
+
+  def self.ogm_path
+    ENV['OGM_PATH'] || 'tmp/opengeometadata'
+  end
+
+  def self.solr_url
+    ENV['SOLR_URL'] || 'http://127.0.0.1:8983/solr/blacklight-core'
+  end
 end
```
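The former namespace-local variables (and the short whitelist) are replaced by the GeoCombineRake helper class, so each task call reads fresh values from the environment, and the new harvester task is invoked as geocombine:geoblacklight_harvester:index[SITE] (note its :environment prerequisite, a Rails convention). A sketch of the helpers' behavior, assuming the rake file has been loaded via a Rakefile:

```ruby
# Assumes `load 'lib/tasks/geo_combine.rake'` has already defined
# GeoCombineRake; the comments show the helpers' fallback behavior.
ENV['SOLR_COMMIT_WITHIN'] = '10000'
ENV['OGM_PATH'] = '/data/opengeometadata'

GeoCombineRake.commit_within # => 10000 (an Integer, via #to_i)
GeoCombineRake.ogm_path      # => "/data/opengeometadata"
GeoCombineRake.solr_url      # => ENV['SOLR_URL'] or http://127.0.0.1:8983/solr/blacklight-core
GeoCombineRake.denylist      # => the six OpenGeoMetadata repos skipped by geocombine:clone
```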