geo_combine 0.8.0 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +3 -6
- data/.rubocop.yml +4 -1
- data/.rubocop_todo.yml +16 -19
- data/README.md +47 -22
- data/geo_combine.gemspec +1 -0
- data/lib/geo_combine/ckan_metadata.rb +5 -4
- data/lib/geo_combine/geo_blacklight_harvester.rb +17 -12
- data/lib/geo_combine/geoblacklight.rb +1 -1
- data/lib/geo_combine/harvester.rb +33 -16
- data/lib/geo_combine/indexer.rb +104 -25
- data/lib/geo_combine/logger.rb +16 -0
- data/lib/geo_combine/migrators/v1_aardvark_migrator.rb +76 -10
- data/lib/geo_combine/ogp.rb +1 -1
- data/lib/geo_combine/version.rb +1 -1
- data/lib/tasks/geo_combine.rake +3 -7
- data/spec/fixtures/docs/full_geoblacklight.json +8 -1
- data/spec/fixtures/docs/full_geoblacklight_aardvark.json +26 -8
- data/spec/lib/geo_combine/geo_blacklight_harvester_spec.rb +5 -4
- data/spec/lib/geo_combine/harvester_spec.rb +8 -22
- data/spec/lib/geo_combine/indexer_spec.rb +92 -21
- data/spec/lib/geo_combine/migrators/v1_aardvark_migrator_spec.rb +29 -5
- data/spec/lib/geo_combine_spec.rb +20 -17
- data/spec/spec_helper.rb +1 -1
- metadata +17 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5b168ca81c2b6d5ff2fa0ce75d18154ad9806f5a872cc6f96e41ab0cea628864
|
4
|
+
data.tar.gz: 9ba88c0cca642ebe79301182f992f2dfe223bc1cf4abaa9b00ac4687293b78f4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8c54ead8b591bd20fc3b62fddbfec2b0ac6830972cd85bf49fd31f230c84c8cf1a7bd4c0b56de702a235eca130fc643e4c796cb84adf15fd93f0b73075161fe2
|
7
|
+
data.tar.gz: 28fe9a5209dd77c2f8b60e87fddec8c7892ddf33794afd75522cf09c6827496c445c6c2f52445ad7fa46508ac1a4dcedd0c5068e161d0e176ec7ab232b21d296
|
data/.github/workflows/ruby.yml
CHANGED
@@ -10,7 +10,7 @@ jobs:
|
|
10
10
|
- name: Set up Ruby and install dependencies
|
11
11
|
uses: ruby/setup-ruby@v1
|
12
12
|
with:
|
13
|
-
ruby-version:
|
13
|
+
ruby-version: 3.1
|
14
14
|
bundler-cache: true
|
15
15
|
- name: Run linter
|
16
16
|
run: bundle exec rubocop
|
@@ -19,11 +19,8 @@ jobs:
|
|
19
19
|
runs-on: ubuntu-latest
|
20
20
|
strategy:
|
21
21
|
matrix:
|
22
|
-
ruby: [
|
23
|
-
faraday_version: [''] # Defaults to whatever's the most recent version.
|
24
|
-
include:
|
25
|
-
- ruby: 2.7
|
26
|
-
faraday_version: '~> 1.0'
|
22
|
+
ruby: [3.1, 3.2, 3.3]
|
23
|
+
faraday_version: ['', '~> 1.0'] # Defaults to whatever's the most recent version.
|
27
24
|
steps:
|
28
25
|
- uses: actions/checkout@v2
|
29
26
|
|
data/.rubocop.yml
CHANGED
@@ -5,7 +5,7 @@ require:
|
|
5
5
|
inherit_from: .rubocop_todo.yml
|
6
6
|
|
7
7
|
AllCops:
|
8
|
-
TargetRubyVersion:
|
8
|
+
TargetRubyVersion: 3.1
|
9
9
|
DisplayCopNames: true
|
10
10
|
NewCops: enable
|
11
11
|
Exclude:
|
@@ -16,6 +16,9 @@ AllCops:
|
|
16
16
|
RSpec/DescribeClass:
|
17
17
|
Enabled: false
|
18
18
|
|
19
|
+
RSpec/MultipleMemoizedHelpers:
|
20
|
+
Enabled: false
|
21
|
+
|
19
22
|
RSpec/BeforeAfterAll:
|
20
23
|
Exclude:
|
21
24
|
- 'spec/lib/tasks/geo_combine_spec.rb'
|
data/.rubocop_todo.yml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# This configuration was generated by
|
2
2
|
# `rubocop --auto-gen-config`
|
3
|
-
# on 2023-
|
3
|
+
# on 2023-09-13 18:53:11 UTC using RuboCop version 1.56.3.
|
4
4
|
# The point is for the user to remove these configuration records
|
5
5
|
# one by one as the offenses are removed from the code base.
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
@@ -12,6 +12,7 @@ Lint/RescueException:
|
|
12
12
|
- 'spec/helpers.rb'
|
13
13
|
|
14
14
|
# Offense count: 1
|
15
|
+
# This cop supports unsafe autocorrection (--autocorrect-all).
|
15
16
|
Lint/UselessAssignment:
|
16
17
|
Exclude:
|
17
18
|
- 'spec/helpers.rb'
|
@@ -32,17 +33,17 @@ Metrics/BlockLength:
|
|
32
33
|
Metrics/ClassLength:
|
33
34
|
Max: 152
|
34
35
|
|
35
|
-
# Offense count:
|
36
|
+
# Offense count: 5
|
36
37
|
# Configuration parameters: AllowedMethods, AllowedPatterns.
|
37
38
|
Metrics/CyclomaticComplexity:
|
38
39
|
Max: 11
|
39
40
|
|
40
|
-
# Offense count:
|
41
|
+
# Offense count: 13
|
41
42
|
# Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
|
42
43
|
Metrics/MethodLength:
|
43
44
|
Max: 21
|
44
45
|
|
45
|
-
# Offense count:
|
46
|
+
# Offense count: 2
|
46
47
|
# Configuration parameters: AllowedMethods, AllowedPatterns.
|
47
48
|
Metrics/PerceivedComplexity:
|
48
49
|
Max: 11
|
@@ -66,13 +67,6 @@ RSpec/ExpectInHook:
|
|
66
67
|
- 'spec/lib/geo_combine/geo_blacklight_harvester_spec.rb'
|
67
68
|
- 'spec/lib/geo_combine/geoblacklight_spec.rb'
|
68
69
|
|
69
|
-
# Offense count: 1
|
70
|
-
# Configuration parameters: Include, CustomTransform, IgnoreMethods, SpecSuffixOnly.
|
71
|
-
# Include: **/*_spec*rb*, **/spec/**/*
|
72
|
-
RSpec/FilePath:
|
73
|
-
Exclude:
|
74
|
-
- 'spec/lib/geo_combine_spec.rb'
|
75
|
-
|
76
70
|
# Offense count: 23
|
77
71
|
# Configuration parameters: EnforcedStyle.
|
78
72
|
# SupportedStyles: have_received, receive
|
@@ -110,16 +104,19 @@ RSpec/OverwritingSetup:
|
|
110
104
|
Exclude:
|
111
105
|
- 'spec/lib/geo_combine/geoblacklight_spec.rb'
|
112
106
|
|
113
|
-
# Offense count: 1
|
114
|
-
RSpec/PendingWithoutReason:
|
115
|
-
Exclude:
|
116
|
-
- 'spec/lib/geo_combine/migrators/v1_aardvark_migrator_spec.rb'
|
117
|
-
|
118
107
|
# Offense count: 2
|
119
108
|
RSpec/RepeatedExampleGroupBody:
|
120
109
|
Exclude:
|
121
110
|
- 'spec/lib/geo_combine/iso19139_spec.rb'
|
122
111
|
|
112
|
+
# Offense count: 1
|
113
|
+
# Configuration parameters: Include, CustomTransform, IgnoreMethods, IgnoreMetadata.
|
114
|
+
# Include: **/*_spec.rb
|
115
|
+
RSpec/SpecFilePathFormat:
|
116
|
+
Exclude:
|
117
|
+
- '**/spec/routing/**/*'
|
118
|
+
- 'spec/lib/geo_combine_spec.rb'
|
119
|
+
|
123
120
|
# Offense count: 19
|
124
121
|
RSpec/StubbedMock:
|
125
122
|
Exclude:
|
@@ -145,7 +142,7 @@ Security/Open:
|
|
145
142
|
Exclude:
|
146
143
|
- 'lib/geo_combine/geoblacklight.rb'
|
147
144
|
|
148
|
-
# Offense count:
|
145
|
+
# Offense count: 6
|
149
146
|
# Configuration parameters: AllowedConstants.
|
150
147
|
Style/Documentation:
|
151
148
|
Exclude:
|
@@ -158,9 +155,9 @@ Style/Documentation:
|
|
158
155
|
- 'lib/geo_combine/geometry_types.rb'
|
159
156
|
- 'lib/geo_combine/iso19139.rb'
|
160
157
|
|
161
|
-
# Offense count:
|
158
|
+
# Offense count: 12
|
162
159
|
# This cop supports safe autocorrection (--autocorrect).
|
163
160
|
# Configuration parameters: AllowHeredoc, AllowURI, URISchemes, IgnoreCopDirectives, AllowedPatterns.
|
164
161
|
# URISchemes: http, https
|
165
162
|
Layout/LineLength:
|
166
|
-
Max:
|
163
|
+
Max: 198
|
data/README.md
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
# GeoCombine
|
2
2
|
|
3
|
-
|
3
|
+
![CI](https://github.com/OpenGeoMetadata/GeoCombine/actions/workflows/ruby.yml/badge.svg)
|
4
4
|
| [![Coverage Status](https://img.shields.io/badge/coverage-95%25-brightgreen)]()
|
5
5
|
| [![Gem Version](https://img.shields.io/gem/v/geo_combine.svg)](https://github.com/OpenGeoMetadata/GeoCombine/releases)
|
6
6
|
|
7
|
-
|
8
7
|
A Ruby toolkit for managing geospatial metadata, including:
|
9
|
-
|
8
|
+
|
9
|
+
- tasks for cloning, updating, and indexing OpenGeoMetadata metadata
|
10
10
|
- library for converting metadata between standards
|
11
11
|
|
12
12
|
## Installation
|
@@ -19,11 +19,15 @@ gem 'geo_combine'
|
|
19
19
|
|
20
20
|
And then execute:
|
21
21
|
|
22
|
-
|
22
|
+
```sh
|
23
|
+
$ bundle install
|
24
|
+
```
|
23
25
|
|
24
26
|
Or install it yourself as:
|
25
27
|
|
26
|
-
|
28
|
+
```sh
|
29
|
+
$ gem install geo_combine
|
30
|
+
```
|
27
31
|
|
28
32
|
## Usage
|
29
33
|
|
@@ -43,8 +47,42 @@ Or install it yourself as:
|
|
43
47
|
> iso_metadata.to_html
|
44
48
|
```
|
45
49
|
|
50
|
+
### Migrating metadata
|
51
|
+
|
52
|
+
You can use the `GeoCombine::Migrators` to migrate metadata from one schema to another.
|
53
|
+
|
54
|
+
Currently, the only migrator is `GeoCombine::Migrators::V1AardvarkMigrator`, which migrates from the [GeoBlacklight v1 schema](https://github.com/OpenGeoMetadata/opengeometadata.github.io/blob/main/docs/gbl-1.0.md) to the [Aardvark schema](https://github.com/OpenGeoMetadata/opengeometadata.github.io/blob/main/docs/ogm-aardvark.md).
|
55
|
+
|
56
|
+
```ruby
|
57
|
+
# Load a record in geoblacklight v1 schema
|
58
|
+
record = JSON.parse(File.read('./spec/fixtures/docs/full_geoblacklight.json'))
|
59
|
+
|
60
|
+
# Migrate it to Aardvark schema
|
61
|
+
GeoCombine::Migrators::V1AardvarkMigrator.new(v1_hash: record).run
|
62
|
+
```
|
63
|
+
|
64
|
+
Some fields cannot be migrated automatically. To handle the migration of collection names to IDs when migrating from v1 to Aardvark, you can provide a mapping of collection names to IDs to the migrator:
|
65
|
+
|
66
|
+
```ruby
|
67
|
+
# You can store this mapping as a JSON or CSV file and load it into a hash
|
68
|
+
id_map = {
|
69
|
+
'My Collection 1' => 'institution:my-collection-1',
|
70
|
+
'My Collection 2' => 'institution:my-collection-2'
|
71
|
+
}
|
72
|
+
|
73
|
+
GeoCombine::Migrators::V1AardvarkMigrator.new(v1_hash: record, collection_id_map: id_map).run
|
74
|
+
```
|
75
|
+
|
46
76
|
### OpenGeoMetadata
|
47
77
|
|
78
|
+
#### Logging
|
79
|
+
|
80
|
+
Some of the tools and scripts in this gem use Ruby's `Logger` class to print information to `$stderr`. By default, the log level is set to `Logger::INFO`. For more verbose information, you can set the `LOG_LEVEL` environment variable to `DEBUG`:
|
81
|
+
|
82
|
+
```sh
|
83
|
+
$ LOG_LEVEL=DEBUG bundle exec rake geocombine:clone
|
84
|
+
```
|
85
|
+
|
48
86
|
#### Clone OpenGeoMetadata repositories locally
|
49
87
|
|
50
88
|
```sh
|
@@ -63,7 +101,7 @@ You can also specify a single repository:
|
|
63
101
|
$ bundle exec rake geocombine:clone[edu.stanford.purl]
|
64
102
|
```
|
65
103
|
|
66
|
-
|
104
|
+
_Note: If you are using zsh, you will need to use escape characters in front of the brackets:_
|
67
105
|
|
68
106
|
```sh
|
69
107
|
$ bundle exec rake geocombine:clone\[edu.stanford.purl\]
|
@@ -83,7 +121,7 @@ You can also specify a single repository:
|
|
83
121
|
$ bundle exec rake geocombine:pull[edu.stanford.purl]
|
84
122
|
```
|
85
123
|
|
86
|
-
|
124
|
+
_Note: If you are using zsh, you will need to use escape characters in front of the brackets:_
|
87
125
|
|
88
126
|
```sh
|
89
127
|
$ bundle exec rake geocombine:pull\[edu.stanford.purl\]
|
@@ -98,23 +136,14 @@ To index into Solr, GeoCombine requires a Solr instance that is running the
|
|
98
136
|
$ bundle exec rake geocombine:index
|
99
137
|
```
|
100
138
|
|
101
|
-
|
139
|
+
If Blacklight is installed in the ruby environment and a solr index is configured, the rake task will use the solr index configured in the Blacklight application (this is the case when invoking GeoCombine from your GeoBlacklight installation). If Blacklight is unavailable, the rake task will try to find a Solr instance running at `http://localhost:8983/solr/blacklight-core`.
|
102
140
|
|
103
|
-
|
104
|
-
|
105
|
-
Solr location can also be specified by an environment variable `SOLR_URL`.
|
141
|
+
You can also set the Solr instance URL using the `SOLR_URL` environment variable:
|
106
142
|
|
107
143
|
```sh
|
108
144
|
$ SOLR_URL=http://www.example.com:1234/solr/collection bundle exec rake geocombine:index
|
109
145
|
```
|
110
146
|
|
111
|
-
Depending on your Solr instance's performance characteristics, you may want to
|
112
|
-
change the [`commitWithin` parameter](https://lucene.apache.org/solr/guide/6_6/updatehandlers-in-solrconfig.html) (in milliseconds):
|
113
|
-
|
114
|
-
```sh
|
115
|
-
$ SOLR_COMMIT_WITHIN=100 bundle exec rake geocombine:index
|
116
|
-
```
|
117
|
-
|
118
147
|
### Harvesting and indexing documents from GeoBlacklight sites
|
119
148
|
|
120
149
|
GeoCombine provides a Harvester class and rake task to harvest and index content from GeoBlacklight sites (or any site that follows the Blacklight API format). Given that the configurations can change from consumer to consumer and site to site, the class provides a relatively simple configuration API. This can be configured in an initializer, a wrapping rake task, or any other ruby context where the rake task or class would be invoked.
|
@@ -160,10 +189,6 @@ Crawl delays can be configured (in seconds) either globally for all sites or on
|
|
160
189
|
|
161
190
|
Solr's commitWithin option can be configured (in milliseconds) by passing a value under the commit_within key.
|
162
191
|
|
163
|
-
##### Debugging (default: false)
|
164
|
-
|
165
|
-
The harvester and indexer will only `puts` content when errors happen. It is possible to see some progress information by setting the debug configuration option.
|
166
|
-
|
167
192
|
#### Transforming Documents
|
168
193
|
|
169
194
|
You may need to transform documents that are harvested for various purposes (removing fields, adding fields, omitting a document altogether, etc.). You can configure some ruby code (a proc) that will take the document in, transform it, and return the transformed document. By default the indexer will remove the `score`, `timestamp`, and `_version_` fields from the documents harvested. If you provide your own transformer, you'll likely want to remove these fields in addition to the other transformations you provide.
|
data/geo_combine.gemspec
CHANGED
@@ -26,6 +26,7 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.add_dependency 'thor'
|
27
27
|
spec.add_dependency 'faraday-net_http_persistent', '~> 2.0'
|
28
28
|
spec.add_dependency 'git'
|
29
|
+
spec.add_dependency 'faraday-retry', '~> 2.2'
|
29
30
|
|
30
31
|
spec.add_development_dependency 'bundler'
|
31
32
|
spec.add_development_dependency 'rake'
|
@@ -44,7 +44,8 @@ module GeoCombine
|
|
44
44
|
def envelope
|
45
45
|
return envelope_from_bbox unless envelope_from_bbox.nil?
|
46
46
|
return envelope_from_spatial(',') unless envelope_from_spatial(',').nil?
|
47
|
-
|
47
|
+
|
48
|
+
envelope_from_spatial(' ') unless envelope_from_spatial(' ').nil?
|
48
49
|
end
|
49
50
|
|
50
51
|
def envelope_from_bbox
|
@@ -55,7 +56,7 @@ module GeoCombine
|
|
55
56
|
north: extras('bbox-north-lat')
|
56
57
|
)
|
57
58
|
begin
|
58
|
-
|
59
|
+
bbox.to_envelope if bbox.valid?
|
59
60
|
rescue GeoCombine::Exceptions::InvalidGeometry
|
60
61
|
nil
|
61
62
|
end
|
@@ -64,10 +65,10 @@ module GeoCombine
|
|
64
65
|
def envelope_from_spatial(delimiter)
|
65
66
|
bbox = GeoCombine::BoundingBox.from_string_delimiter(
|
66
67
|
extras('spatial'),
|
67
|
-
delimiter:
|
68
|
+
delimiter:
|
68
69
|
)
|
69
70
|
begin
|
70
|
-
|
71
|
+
bbox.to_envelope if bbox.valid?
|
71
72
|
rescue GeoCombine::Exceptions::InvalidGeometry
|
72
73
|
nil
|
73
74
|
end
|
@@ -1,5 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require 'geo_combine/logger'
|
4
|
+
|
3
5
|
module GeoCombine
|
4
6
|
##
|
5
7
|
# A class to harvest and index results from GeoBlacklight sites
|
@@ -45,24 +47,25 @@ module GeoCombine
|
|
45
47
|
|
46
48
|
attr_reader :site, :site_key
|
47
49
|
|
48
|
-
def initialize(site_key)
|
50
|
+
def initialize(site_key, logger: GeoCombine::Logger.logger)
|
49
51
|
@site_key = site_key
|
50
52
|
@site = self.class.config[site_key]
|
53
|
+
@logger = logger
|
51
54
|
|
52
55
|
raise ArgumentError, "Site key #{@site_key.inspect} is not configured for #{self.class.name}" unless @site
|
53
56
|
end
|
54
57
|
|
55
58
|
def index
|
56
|
-
|
59
|
+
@logger.debug "fetching page 1 @ #{base_url}&page=1"
|
57
60
|
response = JSON.parse(Net::HTTP.get(URI("#{base_url}&page=1")))
|
58
61
|
response_class = BlacklightResponseVersionFactory.call(response)
|
59
62
|
|
60
|
-
response_class.new(response
|
63
|
+
response_class.new(response:, base_url:, logger: @logger).documents.each do |docs|
|
61
64
|
docs.map! do |document|
|
62
65
|
self.class.document_transformer&.call(document)
|
63
66
|
end.compact
|
64
67
|
|
65
|
-
|
68
|
+
@logger.debug "adding #{docs.count} documents to solr"
|
66
69
|
solr_connection.update params: { commitWithin: commit_within, overwrite: true },
|
67
70
|
data: docs.to_json,
|
68
71
|
headers: { 'Content-Type' => 'application/json' }
|
@@ -91,10 +94,11 @@ module GeoCombine
|
|
91
94
|
attr_reader :base_url
|
92
95
|
attr_accessor :response, :page
|
93
96
|
|
94
|
-
def initialize(response:, base_url:)
|
97
|
+
def initialize(response:, base_url:, logger: GeoCombine::Logger.logger)
|
95
98
|
@base_url = base_url
|
96
99
|
@response = response
|
97
100
|
@page = 1
|
101
|
+
@logger = logger
|
98
102
|
end
|
99
103
|
|
100
104
|
def documents
|
@@ -106,12 +110,12 @@ module GeoCombine
|
|
106
110
|
break if current_page == total_pages
|
107
111
|
|
108
112
|
self.page += 1
|
109
|
-
|
113
|
+
@logger.debug "fetching page #{page} @ #{url}"
|
110
114
|
|
111
115
|
begin
|
112
116
|
self.response = JSON.parse(Net::HTTP.get(URI(url)))
|
113
117
|
rescue StandardError => e
|
114
|
-
|
118
|
+
@logger.error "request for #{url} failed with #{e}"
|
115
119
|
self.response = nil
|
116
120
|
end
|
117
121
|
end
|
@@ -138,10 +142,11 @@ module GeoCombine
|
|
138
142
|
attr_reader :base_url
|
139
143
|
attr_accessor :response, :page
|
140
144
|
|
141
|
-
def initialize(response:, base_url:)
|
145
|
+
def initialize(response:, base_url:, logger: GeoCombine::Logger.logger)
|
142
146
|
@base_url = base_url
|
143
147
|
@response = response
|
144
148
|
@page = 1
|
149
|
+
@logger = logger
|
145
150
|
end
|
146
151
|
|
147
152
|
def documents
|
@@ -157,11 +162,11 @@ module GeoCombine
|
|
157
162
|
|
158
163
|
url = "#{url}&format=json"
|
159
164
|
self.page += 1
|
160
|
-
|
165
|
+
@logger.debug "fetching page #{page} @ #{url}"
|
161
166
|
begin
|
162
167
|
self.response = JSON.parse(Net::HTTP.get(URI(url)))
|
163
168
|
rescue StandardError => e
|
164
|
-
|
169
|
+
@logger.error "Request for #{url} failed with #{e}"
|
165
170
|
self.response = nil
|
166
171
|
end
|
167
172
|
end
|
@@ -170,11 +175,11 @@ module GeoCombine
|
|
170
175
|
private
|
171
176
|
|
172
177
|
def documents_from_urls(urls)
|
173
|
-
|
178
|
+
@logger.debug "fetching #{urls.count} documents for page #{page}"
|
174
179
|
urls.map do |url|
|
175
180
|
JSON.parse(Net::HTTP.get(URI("#{url}/raw")))
|
176
181
|
rescue StandardError => e
|
177
|
-
|
182
|
+
@logger.error "fetching \"#{url}/raw\" failed with #{e}"
|
178
183
|
|
179
184
|
nil
|
180
185
|
end.compact
|
@@ -13,7 +13,7 @@ module GeoCombine
|
|
13
13
|
attr_reader :metadata
|
14
14
|
|
15
15
|
GEOBLACKLIGHT_VERSION = '1.0'
|
16
|
-
SCHEMA_JSON_URL = "https://raw.githubusercontent.com/OpenGeoMetadata/opengeometadata.github.io/main/docs/schema/geoblacklight-schema-#{GEOBLACKLIGHT_VERSION}.json"
|
16
|
+
SCHEMA_JSON_URL = "https://raw.githubusercontent.com/OpenGeoMetadata/opengeometadata.github.io/main/docs/schema/geoblacklight-schema-#{GEOBLACKLIGHT_VERSION}.json".freeze
|
17
17
|
DEPRECATED_KEYS_V1 = %w[
|
18
18
|
uuid
|
19
19
|
georss_polygon_s
|
@@ -3,6 +3,8 @@
|
|
3
3
|
require 'json'
|
4
4
|
require 'find'
|
5
5
|
require 'git'
|
6
|
+
require 'net/http'
|
7
|
+
require 'geo_combine/logger'
|
6
8
|
|
7
9
|
module GeoCombine
|
8
10
|
# Harvests Geoblacklight documents from OpenGeoMetadata for indexing
|
@@ -29,26 +31,37 @@ module GeoCombine
|
|
29
31
|
|
30
32
|
def initialize(
|
31
33
|
ogm_path: ENV.fetch('OGM_PATH', 'tmp/opengeometadata'),
|
32
|
-
schema_version: ENV.fetch('SCHEMA_VERSION', '1.0')
|
34
|
+
schema_version: ENV.fetch('SCHEMA_VERSION', '1.0'),
|
35
|
+
logger: GeoCombine::Logger.logger
|
33
36
|
)
|
34
37
|
@ogm_path = ogm_path
|
35
38
|
@schema_version = schema_version
|
39
|
+
@logger = logger
|
36
40
|
end
|
37
41
|
|
38
42
|
# Enumerable of docs to index, for passing to an indexer
|
39
43
|
def docs_to_index
|
40
44
|
return to_enum(:docs_to_index) unless block_given?
|
41
45
|
|
46
|
+
@logger.info "loading documents from #{ogm_path}"
|
42
47
|
Find.find(@ogm_path) do |path|
|
43
48
|
# skip non-json and layers.json files
|
44
|
-
|
49
|
+
if File.basename(path) == 'layers.json' || !File.basename(path).end_with?('.json')
|
50
|
+
@logger.debug "skipping #{path}; not a geoblacklight JSON document"
|
51
|
+
next
|
52
|
+
end
|
45
53
|
|
46
54
|
doc = JSON.parse(File.read(path))
|
47
55
|
[doc].flatten.each do |record|
|
48
56
|
# skip indexing if this record has a different schema version than what we want
|
49
57
|
record_schema = record['gbl_mdVersion_s'] || record['geoblacklight_version']
|
50
|
-
|
58
|
+
record_id = record['layer_slug_s'] || record['dc_identifier_s']
|
59
|
+
if record_schema != @schema_version
|
60
|
+
@logger.debug "skipping #{record_id}; schema version #{record_schema} doesn't match #{@schema_version}"
|
61
|
+
next
|
62
|
+
end
|
51
63
|
|
64
|
+
@logger.debug "found record #{record_id} at #{path}"
|
52
65
|
yield record, path
|
53
66
|
end
|
54
67
|
end
|
@@ -61,14 +74,16 @@ module GeoCombine
|
|
61
74
|
clone(repo) unless File.directory? repo_path
|
62
75
|
|
63
76
|
Git.open(repo_path).pull
|
64
|
-
|
65
|
-
|
77
|
+
@logger.info "updated #{repo}"
|
78
|
+
repo
|
66
79
|
end
|
67
80
|
|
68
81
|
# Update all repositories
|
69
|
-
# Return the
|
82
|
+
# Return the names of repositories updated
|
70
83
|
def pull_all
|
71
|
-
repositories.map(&method(:pull)).
|
84
|
+
updated = repositories.map(&method(:pull)).compact
|
85
|
+
@logger.info "updated #{updated.size} repositories"
|
86
|
+
updated
|
72
87
|
end
|
73
88
|
|
74
89
|
# Clone a repository via git
|
@@ -76,25 +91,27 @@ module GeoCombine
|
|
76
91
|
def clone(repo)
|
77
92
|
repo_path = File.join(@ogm_path, repo)
|
78
93
|
repo_info = repository_info(repo)
|
94
|
+
repo_url = "https://github.com/OpenGeoMetadata/#{repo}.git"
|
79
95
|
|
80
96
|
# Skip if exists; warn if archived or empty
|
81
97
|
if File.directory? repo_path
|
82
|
-
|
83
|
-
return
|
98
|
+
@logger.warn "skipping clone to #{repo_path}; directory exists"
|
99
|
+
return nil
|
84
100
|
end
|
85
|
-
|
86
|
-
|
101
|
+
@logger.warn "repository is archived: #{repo_url}" if repo_info['archived']
|
102
|
+
@logger.warn "repository is empty: #{repo_url}" if repo_info['size'].zero?
|
87
103
|
|
88
|
-
repo_url = "https://github.com/OpenGeoMetadata/#{repo}.git"
|
89
104
|
Git.clone(repo_url, nil, path: ogm_path, depth: 1)
|
90
|
-
|
91
|
-
|
105
|
+
@logger.info "cloned #{repo_url} to #{repo_path}"
|
106
|
+
repo
|
92
107
|
end
|
93
108
|
|
94
109
|
# Clone all repositories via git
|
95
|
-
# Return the
|
110
|
+
# Return the names of repositories cloned.
|
96
111
|
def clone_all
|
97
|
-
repositories.map(&method(:clone)).
|
112
|
+
cloned = repositories.map(&method(:clone)).compact
|
113
|
+
@logger.info "cloned #{cloned.size} repositories"
|
114
|
+
cloned
|
98
115
|
end
|
99
116
|
|
100
117
|
private
|