geo_combine 0.9.0 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +26 -5
- data/lib/geo_combine/harvester.rb +1 -1
- data/lib/geo_combine/indexer.rb +3 -9
- data/lib/geo_combine/migrators/v1_aardvark_migrator.rb +6 -1
- data/lib/geo_combine/version.rb +1 -1
- data/spec/fixtures/docs/full_geoblacklight_aardvark.json +1 -0
- data/spec/lib/geo_combine/harvester_spec.rb +1 -1
- data/spec/lib/geo_combine/indexer_spec.rb +12 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f3486b76e27b3a26ad35ed97c2ed196890ea7ea0a4f707f8cfebca3967cd1472
|
4
|
+
data.tar.gz: 0c2d1e07f52144bd96faec67b83facb0a379fbc17f69fcaa36fd584da5b32773
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 71a0cc0aa25959868ef0eb370d8b53561d3505ee3bdf3c8dbd3d8a1553ff66c1049eed1521ea9ba4e665a615c36712a6dd3d025488ab8e46e1c91b2b0536e4f3
|
7
|
+
data.tar.gz: 95422cfd7054c883a2fad7b31083cd514eb4cecb4ebd978f6df692d35e801484d71b78f4e316a358ad18f67ceb0e13bb1fcb52f6cfee5b10b5c48747294d693b
|
data/README.md
CHANGED
@@ -32,18 +32,32 @@ $ gem install geo_combine
|
|
32
32
|
## Usage
|
33
33
|
|
34
34
|
### Converting metadata
|
35
|
-
|
35
|
+
#### Converting metadata into GeoBlacklight JSON
|
36
|
+
GeoCombine provides several classes representing different metadata standards that implement the `#to_geoblacklight` method for generating records in the [GeoBlacklight JSON format](https://opengeometadata.org/reference/):
|
37
|
+
```ruby
|
38
|
+
GeoCombine::Iso19139 # ISO 19139 XML
|
39
|
+
GeoCombine::OGP # OpenGeoPortal JSON
|
40
|
+
GeoCombine::Fgdc # FGDC XML
|
41
|
+
GeoCombine::EsriOpenData # Esri Open Data Portal JSON
|
42
|
+
GeoCombine::CkanMetadata # CKAN JSON
|
43
|
+
```
|
44
|
+
An example for converting an ISO 19139 XML record:
|
36
45
|
```ruby
|
37
46
|
# Create a new ISO19139 object
|
38
47
|
> iso_metadata = GeoCombine::Iso19139.new('./tmp/opengeometadata/edu.stanford.purl/bb/338/jh/0716/iso19139.xml')
|
39
48
|
|
40
|
-
# Convert
|
49
|
+
# Convert to GeoBlacklight's metadata format
|
41
50
|
> iso_metadata.to_geoblacklight
|
42
51
|
|
43
|
-
#
|
52
|
+
# Output it as JSON instead of a Ruby hash
|
44
53
|
> iso_metadata.to_geoblacklight.to_json
|
54
|
+
```
|
55
|
+
Some formats also support conversion into HTML for display in a web browser:
|
56
|
+
```ruby
|
57
|
+
# Create a new ISO19139 object
|
58
|
+
> iso_metadata = GeoCombine::Iso19139.new('./tmp/opengeometadata/edu.stanford.purl/bb/338/jh/0716/iso19139.xml')
|
45
59
|
|
46
|
-
# Convert ISO
|
60
|
+
# Convert ISO to HTML
|
47
61
|
> iso_metadata.to_html
|
48
62
|
```
|
49
63
|
|
@@ -73,7 +87,7 @@ id_map = {
|
|
73
87
|
GeoCombine::Migrators::V1AardvarkMigrator.new(v1_hash: record, collection_id_map: id_map).run
|
74
88
|
```
|
75
89
|
|
76
|
-
### OpenGeoMetadata
|
90
|
+
### Downloading metadata from OpenGeoMetadata
|
77
91
|
|
78
92
|
#### Logging
|
79
93
|
|
@@ -144,6 +158,13 @@ You can also set a the Solr instance URL using `SOLR_URL`:
|
|
144
158
|
$ SOLR_URL=http://www.example.com:1234/solr/collection bundle exec rake geocombine:index
|
145
159
|
```
|
146
160
|
|
161
|
+
By default, GeoCombine will index only records using the Aardvark metadata format. If you instead want to index records using an older format (e.g. because your GeoBlacklight instance is version 3 or older), you can set the `SCHEMA_VERSION` environment variable:
|
162
|
+
|
163
|
+
```sh
|
164
|
+
# Only index schema version 1.0 records
|
165
|
+
$ SCHEMA_VERSION=1.0 bundle exec rake geocombine:index
|
166
|
+
```
|
167
|
+
|
147
168
|
### Harvesting and indexing documents from GeoBlacklight sites
|
148
169
|
|
149
170
|
GeoCombine provides a Harvester class and rake task to harvest and index content from GeoBlacklight sites (or any site that follows the Blacklight API format). Given that the configurations can change from consumer to consumer and site to site, the class provides a relatively simple configuration API. This can be configured in an initializer, a wrapping rake task, or any other ruby context where the rake task or class would be invoked.
|
@@ -31,7 +31,7 @@ module GeoCombine
|
|
31
31
|
|
32
32
|
def initialize(
|
33
33
|
ogm_path: ENV.fetch('OGM_PATH', 'tmp/opengeometadata'),
|
34
|
-
schema_version: ENV.fetch('SCHEMA_VERSION', '
|
34
|
+
schema_version: ENV.fetch('SCHEMA_VERSION', 'Aardvark'),
|
35
35
|
logger: GeoCombine::Logger.logger
|
36
36
|
)
|
37
37
|
@ogm_path = ogm_path
|
data/lib/geo_combine/indexer.rb
CHANGED
@@ -35,16 +35,10 @@ module GeoCombine
|
|
35
35
|
start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
36
36
|
|
37
37
|
# Index in batches; set batch size via BATCH_SIZE
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
batch << [doc, path]
|
42
|
-
else
|
43
|
-
total_indexed += index_batch(batch)
|
44
|
-
batch = []
|
45
|
-
end
|
38
|
+
docs.each_slice(@batch_size) do |slice|
|
39
|
+
batch = slice.map { |doc, path| [doc, path] }
|
40
|
+
total_indexed += index_batch(batch)
|
46
41
|
end
|
47
|
-
total_indexed += index_batch(batch) unless batch.empty?
|
48
42
|
|
49
43
|
# Issue a commit to make sure all documents are indexed
|
50
44
|
@solr.commit
|
@@ -50,6 +50,7 @@ module GeoCombine
|
|
50
50
|
end
|
51
51
|
|
52
52
|
# Convert non-crosswalked fields via lookup tables
|
53
|
+
# rubocop:disable Metrics/PerceivedComplexity
|
53
54
|
def convert_non_crosswalked_fields
|
54
55
|
# Keys may or may not include whitespace, so we normalize them.
|
55
56
|
# Resource class is required so we default to "Other"; resource type is not required.
|
@@ -57,6 +58,9 @@ module GeoCombine
|
|
57
58
|
resource_type = RESOURCE_TYPE_MAP[@v1_hash['layer_geom_type_s']&.gsub(/\s+/, '')]
|
58
59
|
@v2_hash['gbl_resourceType_sm'] = resource_type unless resource_type.nil?
|
59
60
|
|
61
|
+
# If locn_geometry is in the ENVELOPE format, also add it as dcat_bbox
|
62
|
+
@v2_hash['dcat_bbox'] = @v2_hash['locn_geometry'] if @v2_hash['locn_geometry']&.match?(/ENVELOPE/)
|
63
|
+
|
60
64
|
# If the user specified a collection id map, use it to convert the collection names to ids
|
61
65
|
is_part_of = @v1_hash['dct_isPartOf_sm']&.map { |name| @collection_id_map[name] }&.compact
|
62
66
|
if is_part_of.present?
|
@@ -65,6 +69,7 @@ module GeoCombine
|
|
65
69
|
@v2_hash.delete('dct_isPartOf_sm')
|
66
70
|
end
|
67
71
|
end
|
72
|
+
# rubocop:enable Metrics/PerceivedComplexity
|
68
73
|
|
69
74
|
# Remove fields that are no longer used
|
70
75
|
def remove_deprecated_fields
|
@@ -80,7 +85,7 @@ module GeoCombine
|
|
80
85
|
'dc_publisher_s' => 'dct_publisher_sm', # new namespace; single to multi-valued
|
81
86
|
'dct_provenance_s' => 'schema_provider_s', # new URI name
|
82
87
|
'dc_subject_sm' => 'dct_subject_sm', # new namespace
|
83
|
-
'solr_geom' => '
|
88
|
+
'solr_geom' => 'locn_geometry', # new URI name
|
84
89
|
'solr_year_i' => 'gbl_indexYear_im', # new URI name; single to multi-valued
|
85
90
|
'dc_source_sm' => 'dct_source_sm', # new namespace
|
86
91
|
'dc_rights_s' => 'dct_accessRights_s', # new URI name
|
data/lib/geo_combine/version.rb
CHANGED
@@ -5,7 +5,7 @@ require 'geo_combine/harvester'
|
|
5
5
|
require 'spec_helper'
|
6
6
|
|
7
7
|
RSpec.describe GeoCombine::Harvester do
|
8
|
-
subject(:harvester) { described_class.new(ogm_path: 'spec/fixtures/indexing',
|
8
|
+
subject(:harvester) { described_class.new(ogm_path: 'spec/fixtures/indexing', schema_version: '1.0') }
|
9
9
|
|
10
10
|
let(:logger) { instance_double(Logger, warn: nil, info: nil, error: nil, debug: nil) }
|
11
11
|
let(:repo_name) { 'my-institution' }
|
@@ -96,6 +96,18 @@ RSpec.describe GeoCombine::Indexer do
|
|
96
96
|
)
|
97
97
|
end
|
98
98
|
|
99
|
+
context 'when the number of docs is greater than batch size' do
|
100
|
+
before do
|
101
|
+
stub_const('ENV', 'SOLR_BATCH_SIZE' => 10)
|
102
|
+
end
|
103
|
+
|
104
|
+
let(:docs) { (1..40).map { |n| [{ 'id' => n }, "path/to/record#{n}.json"] } }
|
105
|
+
|
106
|
+
it 'indexes the correct number of documents' do
|
107
|
+
expect(indexer.index(docs)).to eq 40
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
99
111
|
it 'commits changes to solr after indexing' do
|
100
112
|
indexer.index(docs)
|
101
113
|
expect(solr).to have_received(:commit).once
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: geo_combine
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.0
|
4
|
+
version: 0.9.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jack Reed
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-08-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -376,7 +376,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
376
376
|
- !ruby/object:Gem::Version
|
377
377
|
version: '0'
|
378
378
|
requirements: []
|
379
|
-
rubygems_version: 3.
|
379
|
+
rubygems_version: 3.4.19
|
380
380
|
signing_key:
|
381
381
|
specification_version: 4
|
382
382
|
summary: A Ruby toolkit for managing geospatial metadata
|