geo_combine 0.8.0 → 0.9.0
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +3 -6
- data/.rubocop.yml +4 -1
- data/.rubocop_todo.yml +16 -19
- data/README.md +47 -22
- data/geo_combine.gemspec +1 -0
- data/lib/geo_combine/ckan_metadata.rb +5 -4
- data/lib/geo_combine/geo_blacklight_harvester.rb +17 -12
- data/lib/geo_combine/geoblacklight.rb +1 -1
- data/lib/geo_combine/harvester.rb +33 -16
- data/lib/geo_combine/indexer.rb +104 -25
- data/lib/geo_combine/logger.rb +16 -0
- data/lib/geo_combine/migrators/v1_aardvark_migrator.rb +76 -10
- data/lib/geo_combine/ogp.rb +1 -1
- data/lib/geo_combine/version.rb +1 -1
- data/lib/tasks/geo_combine.rake +3 -7
- data/spec/fixtures/docs/full_geoblacklight.json +8 -1
- data/spec/fixtures/docs/full_geoblacklight_aardvark.json +26 -8
- data/spec/lib/geo_combine/geo_blacklight_harvester_spec.rb +5 -4
- data/spec/lib/geo_combine/harvester_spec.rb +8 -22
- data/spec/lib/geo_combine/indexer_spec.rb +92 -21
- data/spec/lib/geo_combine/migrators/v1_aardvark_migrator_spec.rb +29 -5
- data/spec/lib/geo_combine_spec.rb +20 -17
- data/spec/spec_helper.rb +1 -1
- metadata +17 -2
data/lib/geo_combine/indexer.rb
CHANGED
@@ -1,47 +1,126 @@
 # frozen_string_literal: true

 require 'rsolr'
+require 'faraday/retry'
 require 'faraday/net_http_persistent'
+require 'geo_combine/logger'

 module GeoCombine
   # Indexes Geoblacklight documents into Solr
   class Indexer
     attr_reader :solr

-    def
-
+    def initialize(solr: nil, logger: GeoCombine::Logger.logger)
+      @logger = logger
+      @batch_size = ENV.fetch('SOLR_BATCH_SIZE', 100).to_i
+
+      # If SOLR_URL is set, use it; if in a Geoblacklight app, use its solr core
+      solr_url = ENV.fetch('SOLR_URL', nil)
+      solr_url ||= Blacklight.default_index.connection.base_uri.to_s if defined? Blacklight
+
+      # If neither, warn and try to use local Blacklight default solr core
+      if solr_url.nil?
+        @logger.warn 'SOLR_URL not set; using Blacklight default'
+        solr_url = 'http://localhost:8983/solr/blacklight-core'
+      end
+
+      @solr = solr || RSolr.connect(client, url: solr_url)
     end

-
-
+    # Index everything and return the number of docs successfully indexed
+    def index(docs)
+      # Track total indexed and time spent
+      @logger.info "indexing into #{solr_url}"
+      total_indexed = 0
+      start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+
+      # Index in batches; set batch size via BATCH_SIZE
+      batch = []
+      docs.each do |doc, path|
+        if batch.size < @batch_size
+          batch << [doc, path]
+        else
+          total_indexed += index_batch(batch)
+          batch = []
+        end
+      end
+      total_indexed += index_batch(batch) unless batch.empty?
+
+      # Issue a commit to make sure all documents are indexed
+      @solr.commit
+      end_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+      sec = end_time - start_time
+      @logger.info format('indexed %<total_indexed>d documents in %<sec>.2f seconds', total_indexed:, sec:)
+      total_indexed
     end

+    # URL to the solr instance being used
     def solr_url
       @solr.options[:url]
     end

-
-    def index(docs, commit_within: ENV.fetch('SOLR_COMMIT_WITHIN', 5000).to_i)
-      indexed_count = 0
-
-      docs.each do |record, path|
-        # log the unique identifier for the record for debugging
-        id = record['id'] || record['dc_identifier_s']
-        puts "Indexing #{id}: #{path}" if $DEBUG
-
-        # index the record into solr
-        @solr.update params: { commitWithin: commit_within, overwrite: true },
-                     data: [record].to_json,
-                     headers: { 'Content-Type' => 'application/json' }
-
-        # count the number of records successfully indexed
-        indexed_count += 1
-      rescue RSolr::Error::Http => e
-        puts e
-      end
+    private

-
-
+    # Index a batch of documents; if we fail, index them all individually
+    def index_batch(batch)
+      docs = batch.map(&:first)
+      @solr.update(data: batch_json(docs), params:, headers:)
+      @logger.debug "indexed batch (#{batch.size} docs)"
+      batch.size
+    rescue RSolr::Error::Http => e
+      @logger.error "error indexing batch (#{batch.size} docs): #{format_error(e)}"
+      @logger.warn 'retrying documents individually'
+      batch.map { |doc, path| index_single(doc, path) }.compact.size
+    end
+
+    # Index a single document; if it fails, log the error and continue
+    def index_single(doc, path)
+      @solr.add(doc, params:, headers:)
+      @logger.debug "indexed #{path}"
+      doc
+    rescue RSolr::Error::Http => e
+      @logger.error "error indexing #{path}: #{format_error(e)}"
+      nil
+    end
+
+    # Generate a JSON string to send to solr update API for a batch of documents
+    def batch_json(batch)
+      batch.map { |doc| "add: { doc: #{doc.to_json} }" }.join(",\n").prepend('{ ').concat(' }')
+    end
+
+    # Generate a friendly error message for logging including status code and message
+    def format_error(error)
+      code = error.response[:status]
+      status_info = "#{code} #{RSolr::Error::Http::STATUS_CODES[code.to_i]}"
+      error_info = parse_solr_error(error)
+      [status_info, error_info].compact.join(' - ')
+    end
+
+    # Extract the specific error message from a solr JSON error response, if any
+    def parse_solr_error(error)
+      JSON.parse(error.response[:body]).dig('error', 'msg')
+    rescue StandardError
+      nil
+    end
+
+    def headers
+      { 'Content-Type' => 'application/json' }
+    end
+
+    def params
+      { overwrite: true }
+    end
+
+    def client
+      @client ||= Faraday.new do |conn|
+        conn.request :retry, max: 3, interval: 1, backoff_factor: 2, exceptions: [
+          Faraday::TimeoutError,
+          Faraday::ConnectionFailed,
+          Faraday::TooManyRequestsError
+        ]
+        conn.response :raise_error
+        conn.adapter :net_http_persistent
+      end
     end
   end
 end
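The new indexer reads its configuration from the environment and sends documents to Solr in batches, falling back to per-document indexing when a batch fails. A minimal usage sketch based on the code above; the Solr core, batch size, record ids, and paths are illustrative:

    require 'geo_combine/indexer'

    # Both settings are read from the environment in #initialize
    ENV['SOLR_URL'] = 'http://localhost:8983/solr/blacklight-core' # illustrative core
    ENV['SOLR_BATCH_SIZE'] = '50'

    indexer = GeoCombine::Indexer.new

    # #index takes an enumerable of [document_hash, source_path] pairs and
    # returns the number of documents successfully indexed
    docs = [
      [{ 'id' => 'example-1' }, 'docs/example-1.json'],
      [{ 'id' => 'example-2' }, 'docs/example-2.json']
    ]
    puts indexer.index(docs)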
data/lib/geo_combine/logger.rb
ADDED
@@ -0,0 +1,16 @@
+# frozen_string_literal: true
+
+require 'logger'
+
+module GeoCombine
+  # Logger for gem
+  class Logger
+    def self.logger
+      @logger ||= ::Logger.new(
+        $stderr,
+        progname: 'GeoCombine',
+        level: ENV.fetch('LOG_LEVEL', 'info').to_sym
+      )
+    end
+  end
+end
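The harvester and indexer now log through this shared logger, which writes to standard error. A short sketch; the LOG_LEVEL value is illustrative and must be set before the logger is first used, since it is memoized:

    require 'geo_combine/logger'

    ENV['LOG_LEVEL'] = 'debug' # follows ruby's Logger: debug, info, warn, error, fatal

    GeoCombine::Logger.logger.debug 'now visible'

    # Classes that log accept an injected logger, so a custom one can be swapped in
    indexer = GeoCombine::Indexer.new(logger: GeoCombine::Logger.logger)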
data/lib/geo_combine/migrators/v1_aardvark_migrator.rb
CHANGED
@@ -1,41 +1,86 @@
 # frozen_string_literal: true

+require 'active_support'
+
 module GeoCombine
   module Migrators
-    # TODO: WARNING! This class is not fully implemented and should not be used in
-    # production. See https://github.com/OpenGeoMetadata/GeoCombine/issues/121
-    # for remaining work.
-    #
     # migrates the v1 schema to the aardvark schema
     class V1AardvarkMigrator
       attr_reader :v1_hash

       # @param v1_hash [Hash] parsed json in the v1 schema
-
+      # @param collection_id_map [Hash] a hash mapping collection names to ids for converting dct_isPartOf_sm
+      def initialize(v1_hash:, collection_id_map: {})
         @v1_hash = v1_hash
+        @v2_hash = v1_hash
+        @collection_id_map = collection_id_map
       end

       def run
-
-        v2_hash['gbl_mdVersion_s']
-
+        # Return unchanged if already in the aardvark schema
+        return @v2_hash if @v2_hash['gbl_mdVersion_s'] == 'Aardvark'
+
+        # Convert the record
+        convert_keys
+        convert_single_to_multi_valued_fields
+        convert_non_crosswalked_fields
+        remove_deprecated_fields
+
+        # Mark the record as converted and return it
+        @v2_hash['gbl_mdVersion_s'] = 'Aardvark'
+        @v2_hash
       end

+      # Namespace and URI changes to fields
       def convert_keys
-
+        @v2_hash.transform_keys! do |k|
           SCHEMA_FIELD_MAP[k] || k
         end
       end

+      # Fields that need to be converted from single to multi-valued
+      def convert_single_to_multi_valued_fields
+        @v2_hash = @v2_hash.each_with_object({}) do |(k, v), h|
+          h[k] = if !v.is_a?(Array) && k.match?(/.*_[s|i]m/)
+                   [v]
+                 else
+                   v
+                 end
+        end
+      end
+
+      # Convert non-crosswalked fields via lookup tables
+      def convert_non_crosswalked_fields
+        # Keys may or may not include whitespace, so we normalize them.
+        # Resource class is required so we default to "Other"; resource type is not required.
+        @v2_hash['gbl_resourceClass_sm'] = RESOURCE_CLASS_MAP[@v1_hash['dc_type_s']&.gsub(/\s+/, '')] || ['Other']
+        resource_type = RESOURCE_TYPE_MAP[@v1_hash['layer_geom_type_s']&.gsub(/\s+/, '')]
+        @v2_hash['gbl_resourceType_sm'] = resource_type unless resource_type.nil?
+
+        # If the user specified a collection id map, use it to convert the collection names to ids
+        is_part_of = @v1_hash['dct_isPartOf_sm']&.map { |name| @collection_id_map[name] }&.compact
+        if is_part_of.present?
+          @v2_hash['dct_isPartOf_sm'] = is_part_of
+        else
+          @v2_hash.delete('dct_isPartOf_sm')
+        end
+      end
+
+      # Remove fields that are no longer used
+      def remove_deprecated_fields
+        @v2_hash = @v2_hash.except(*SCHEMA_FIELD_MAP.keys, 'dc_type_s', 'layer_geom_type_s')
+      end
+
       SCHEMA_FIELD_MAP = {
         'dc_title_s' => 'dct_title_s', # new namespace
         'dc_description_s' => 'dct_description_sm', # new namespace; single to multi-valued
         'dc_language_s' => 'dct_language_sm', # new namespace; single to multi-valued
-        'dc_language_sm' => 'dct_language_sm', # new namespace
+        'dc_language_sm' => 'dct_language_sm', # new namespace
         'dc_creator_sm' => 'dct_creator_sm', # new namespace
         'dc_publisher_s' => 'dct_publisher_sm', # new namespace; single to multi-valued
         'dct_provenance_s' => 'schema_provider_s', # new URI name
         'dc_subject_sm' => 'dct_subject_sm', # new namespace
+        'solr_geom' => 'dcat_bbox', # new URI name
         'solr_year_i' => 'gbl_indexYear_im', # new URI name; single to multi-valued
         'dc_source_sm' => 'dct_source_sm', # new namespace
         'dc_rights_s' => 'dct_accessRights_s', # new URI name
@@ -47,6 +92,27 @@ module GeoCombine
         'geoblacklight_version' => 'gbl_mdVersion_s', # new URI name
         'suppressed_b' => 'gbl_suppressed_b' # new namespace
       }.freeze
+
+      # Map Dublin Core types to Aardvark resource class sets
+      # See: https://github.com/OpenGeoMetadata/opengeometadata.github.io/blob/main/docs/ogm-aardvark/resource-class.md
+      RESOURCE_CLASS_MAP = {
+        'Collection' => ['Collections'],
+        'Dataset' => ['Datasets'],
+        'Image' => ['Imagery'],
+        'InteractiveResource' => ['Websites'],
+        'Service' => ['Web services'],
+        'StillImage' => ['Imagery']
+      }.freeze
+
+      # Map geometry types to Aardvark resource type sets
+      # See: https://github.com/OpenGeoMetadata/opengeometadata.github.io/blob/main/docs/ogm-aardvark/resource-type.md
+      RESOURCE_TYPE_MAP = {
+        'Point' => ['Point data'],
+        'Line' => ['Line data'],
+        'Polygon' => ['Polygon data'],
+        'Raster' => ['Raster data'],
+        'Table' => ['Table data']
+      }.freeze
     end
   end
 end
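With the TODO warning removed, the migrator is now the supported way to crosswalk v1 records. A sketch of the entry point; the input file and collection map are illustrative:

    require 'json'
    require 'geo_combine/migrators/v1_aardvark_migrator'

    v1_record = JSON.parse(File.read('record.json')) # illustrative v1 document

    # Optional: map dct_isPartOf_sm collection names to record ids;
    # names missing from the map are dropped from the output
    id_map = { 'Uganda GIS Maps and Data, 2000-2010' => 'stanford-rb371kw9607' }

    aardvark = GeoCombine::Migrators::V1AardvarkMigrator.new(
      v1_hash: v1_record,
      collection_id_map: id_map
    ).run

    aardvark['gbl_mdVersion_s'] # => "Aardvark"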
data/lib/geo_combine/ogp.rb
CHANGED
data/lib/geo_combine/version.rb
CHANGED
data/lib/tasks/geo_combine.rake
CHANGED
@@ -12,24 +12,20 @@ namespace :geocombine do
   desc 'Clone OpenGeoMetadata repositories'
   task :clone, [:repo] do |_t, args|
     harvester = GeoCombine::Harvester.new
-
-    puts "Cloned #{total} repositories"
+    args[:repo] ? harvester.clone(args.repo) : harvester.clone_all
   end

   desc '"git pull" OpenGeoMetadata repositories'
   task :pull, [:repo] do |_t, args|
     harvester = GeoCombine::Harvester.new
-
-    puts "Updated #{total} repositories"
+    args[:repo] ? harvester.pull(args.repo) : harvester.pull_all
   end

   desc 'Index all JSON documents except Layers.json'
   task :index do
     harvester = GeoCombine::Harvester.new
     indexer = GeoCombine::Indexer.new
-
-    total = indexer.index(harvester.docs_to_index)
-    puts "Indexed #{total} documents"
+    indexer.index(harvester.docs_to_index)
   end

   namespace :geoblacklight_harvester do
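The tasks now rely on the shared logger for progress output instead of printing totals. A sketch of the equivalent calls from Ruby; the repository name is illustrative:

    require 'geo_combine/harvester'
    require 'geo_combine/indexer'

    harvester = GeoCombine::Harvester.new
    indexer = GeoCombine::Indexer.new

    harvester.clone_all                      # rake geocombine:clone
    harvester.pull('my-institution')         # rake geocombine:pull[my-institution]
    indexer.index(harvester.docs_to_index)   # rake geocombine:index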
data/spec/fixtures/docs/full_geoblacklight.json
CHANGED
@@ -28,6 +28,13 @@
   "dct_spatial_sm":[
     "Uganda"
   ],
+  "dct_isPartOf_sm":[
+    "Uganda GIS Maps and Data, 2000-2010"
+  ],
+  "dc_source_sm": [
+    "stanford-rb371kw9607"
+  ],
   "solr_geom":"ENVELOPE(29.572742, 35.000308, 4.234077, -1.478794)",
-  "solr_year_i":2005
+  "solr_year_i":2005,
+  "suppressed_b":false
 }
data/spec/fixtures/docs/full_geoblacklight_aardvark.json
CHANGED
@@ -1,19 +1,31 @@
 {
   "gbl_mdVersion_s":"Aardvark",
-  "dct_identifier_sm":
+  "dct_identifier_sm":[
+    "http://purl.stanford.edu/cz128vq0535"
+  ],
   "dct_title_s":"2005 Rural Poverty GIS Database: Uganda",
-  "dct_description_sm":
+  "dct_description_sm":[
+    "This polygon shapefile contains 2005 poverty data for 855 rural subcounties in Uganda. These data are intended for researchers, students, policy makers and the general public for reference and mapping purposes, and may be used for basic applications such as viewing, querying, and map output production."
+  ],
   "dct_accessRights_s":"Public",
   "schema_provider_s":"Stanford",
   "dct_references_s":"{\"http://schema.org/url\":\"http://purl.stanford.edu/cz128vq0535\",\"http://schema.org/downloadUrl\":\"http://stacks.stanford.edu/file/druid:cz128vq0535/data.zip\",\"http://www.loc.gov/mods/v3\":\"http://purl.stanford.edu/cz128vq0535.mods\",\"http://www.isotc211.org/schemas/2005/gmd/\":\"http://opengeometadata.stanford.edu/metadata/edu.stanford.purl/druid:cz128vq0535/iso19139.xml\",\"http://www.w3.org/1999/xhtml\":\"http://opengeometadata.stanford.edu/metadata/edu.stanford.purl/druid:cz128vq0535/default.html\",\"http://www.opengis.net/def/serviceType/ogc/wfs\":\"https://geowebservices.stanford.edu/geoserver/wfs\",\"http://www.opengis.net/def/serviceType/ogc/wms\":\"https://geowebservices.stanford.edu/geoserver/wms\"}",
   "gbl_wxsIdentifier_s":"druid:cz128vq0535",
   "id":"stanford-cz128vq0535",
-  "
+  "gbl_resourceType_sm": [
+    "Polygon data"
+  ],
   "gbl_mdModified_dt":"2015-01-13T18:46:38Z",
   "dct_format_s":"Shapefile",
-  "dct_language_sm":
-
-
+  "dct_language_sm":[
+    "English"
+  ],
+  "gbl_resourceClass_sm":[
+    "Datasets"
+  ],
+  "dct_publisher_sm":[
+    "Uganda Bureau of Statistics"
+  ],
   "dct_creator_sm":[
     "Uganda Bureau of Statistics"
   ],
@@ -28,6 +40,12 @@
   "dct_spatial_sm":[
     "Uganda"
   ],
-  "
-
+  "dct_source_sm": [
+    "stanford-rb371kw9607"
+  ],
+  "dcat_bbox":"ENVELOPE(29.572742, 35.000308, 4.234077, -1.478794)",
+  "gbl_indexYear_im":[
+    2005
+  ],
+  "gbl_suppressed_b":false
 }
data/spec/lib/geo_combine/geo_blacklight_harvester_spec.rb
CHANGED
@@ -5,8 +5,9 @@ require 'spec_helper'
 require 'rsolr'

 RSpec.describe GeoCombine::GeoBlacklightHarvester do
-  subject(:harvester) { described_class.new(site_key) }
+  subject(:harvester) { described_class.new(site_key, logger:) }

+  let(:logger) { instance_double(Logger, warn: nil, info: nil, error: nil, debug: nil) }
   let(:site_key) { :INSTITUTION }
   let(:stub_json_response) { '{}' }
   let(:stub_solr_connection) { double('RSolr::Connection') }
@@ -40,7 +41,7 @@ RSpec.describe GeoCombine::GeoBlacklightHarvester do

     let(:docs) { [{ layer_slug_s: 'abc-123' }, { layer_slug_s: 'abc-321' }] }
     let(:stub_json_response) do
-      { response: { docs
+      { response: { docs:, pages: { current_page: 1, total_pages: 1 } } }.to_json
     end

     it 'adds documents returned to solr' do
@@ -142,7 +143,7 @@ RSpec.describe GeoCombine::GeoBlacklightHarvester do
       ).and_return(stub_second_response.to_json)
       base_url = 'https://example.com?f%5Bdct_provenance_s%5D%5B%5D=INSTITUTION&format=json&per_page=100'
       docs = described_class::LegacyBlacklightResponse.new(response: stub_first_response,
-                                                           base_url:
+                                                           base_url:).documents

       expect(docs.to_a).to eq([first_docs, second_docs])
     end
@@ -182,7 +183,7 @@ RSpec.describe GeoCombine::GeoBlacklightHarvester do

       base_url = 'https://example.com?f%5Bdct_provenance_s%5D%5B%5D=INSTITUTION&format=json&per_page=100'
       docs = described_class::ModernBlacklightResponse.new(response: first_results_response,
-                                                           base_url:
+                                                           base_url:).documents

       expect(docs.to_a).to eq([
         [{ 'layer_slug_s' => 'abc-123' }, { 'layer_slug_s' => 'abc-321' }],
data/spec/lib/geo_combine/harvester_spec.rb
CHANGED
@@ -5,8 +5,9 @@ require 'geo_combine/harvester'
 require 'spec_helper'

 RSpec.describe GeoCombine::Harvester do
-  subject(:harvester) { described_class.new(ogm_path: 'spec/fixtures/indexing') }
+  subject(:harvester) { described_class.new(ogm_path: 'spec/fixtures/indexing', logger:) }

+  let(:logger) { instance_double(Logger, warn: nil, info: nil, error: nil, debug: nil) }
   let(:repo_name) { 'my-institution' }
   let(:repo_path) { File.join(harvester.ogm_path, repo_name) }
   let(:repo_url) { "https://github.com/OpenGeoMetadata/#{repo_name}.git" }
@@ -34,8 +35,7 @@ RSpec.describe GeoCombine::Harvester do
     end

     # stub git commands
-    allow(Git).to
-    allow(Git).to receive(:clone).and_return(stub_repo)
+    allow(Git).to receive_messages(open: stub_repo, clone: stub_repo)
     allow(stub_repo).to receive(:pull).and_return(stub_repo)
   end

@@ -48,7 +48,7 @@ RSpec.describe GeoCombine::Harvester do
     end

     it 'skips records with a different schema version' do
-      harvester = described_class.new(ogm_path: 'spec/fixtures/indexing/', schema_version: 'Aardvark')
+      harvester = described_class.new(ogm_path: 'spec/fixtures/indexing/', schema_version: 'Aardvark', logger:)
      expect { |b| harvester.docs_to_index(&b) }.to yield_successive_args(
        [JSON.parse(File.read('spec/fixtures/indexing/aardvark.json')), 'spec/fixtures/indexing/aardvark.json']
      )
@@ -75,8 +75,8 @@ RSpec.describe GeoCombine::Harvester do
       expect(stub_repo).to have_received(:pull).exactly(2).times
     end

-    it 'returns the
-      expect(harvester.pull_all).to eq(
+    it 'returns the names of repositories pulled' do
+      expect(harvester.pull_all).to eq(%w[my-institution another-institution])
     end

     it 'skips repositories in the denylist' do
@@ -107,20 +107,6 @@ RSpec.describe GeoCombine::Harvester do
       harvester.clone(repo_name)
       expect(Git).not_to have_received(:clone)
     end
-
-    it 'warns if a repository is empty' do
-      allow(Net::HTTP).to receive(:get).with('https://api.github.com/repos/opengeometadata/empty').and_return('{"size": 0}')
-      expect do
-        harvester.clone('empty')
-      end.to output(/repository 'empty' is empty/).to_stdout
-    end
-
-    it 'warns if a repository is archived' do
-      allow(Net::HTTP).to receive(:get).with('https://api.github.com/repos/opengeometadata/empty').and_return('{"archived": true}')
-      expect do
-        harvester.clone('outdated-institution')
-      end.to output(/repository 'outdated-institution' is archived/).to_stdout
-    end
   end

   describe '#clone_all' do
@@ -134,8 +120,8 @@ RSpec.describe GeoCombine::Harvester do
       expect(Git).not_to have_received(:clone).with('https://github.com/OpenGeoMetadata/aardvark.git')
     end

-    it 'returns the
-      expect(harvester.clone_all).to eq(
+    it 'returns the names of repositories cloned' do
+      expect(harvester.clone_all).to eq(%w[my-institution another-institution])
     end
   end

data/spec/lib/geo_combine/indexer_spec.rb
CHANGED
@@ -3,10 +3,22 @@
 require 'geo_combine/indexer'
 require 'spec_helper'

+# Mock an available Blacklight installation
+class FakeBlacklight
+  def self.default_index
+    Repository
+  end
+
+  class Repository
+    def self.connection; end
+  end
+end
+
 RSpec.describe GeoCombine::Indexer do
-  subject(:indexer) { described_class.new(solr:
+  subject(:indexer) { described_class.new(solr:, logger:) }

-  let(:
+  let(:logger) { instance_double(Logger, warn: nil, info: nil, error: nil, debug: nil) }
+  let(:solr) { instance_double(RSolr::Client, options: { url: 'TEST' }) }
   let(:docs) do
     [
       [{ 'id' => '1' }, 'path/to/record1.json'], # v1.0 schema
@@ -21,36 +33,69 @@ RSpec.describe GeoCombine::Indexer do

   describe '#initialize' do
     before do
-      stub_const('ENV', 'SOLR_URL' => 'http://localhost:8983/solr/geoblacklight')
       allow(RSolr).to receive(:connect).and_return(solr)
     end

-
-
-
-
-
-
+    context 'when solr url is set in the environment' do
+      before do
+        stub_const('ENV', 'SOLR_URL' => 'http://localhost:8983/solr/geoblacklight')
+      end
+
+      it 'connects to the solr instance' do
+        described_class.new(logger:)
+        expect(RSolr).to have_received(:connect).with(
+          be_a(Faraday::Connection),
+          url: 'http://localhost:8983/solr/geoblacklight'
+        )
+      end
+    end
+
+    context 'when there is a configured Blacklight connection' do
+      before do
+        stub_const('Blacklight', FakeBlacklight)
+        allow(FakeBlacklight::Repository).to receive(:connection).and_return(
+          instance_double(RSolr::Client, base_uri: URI('http://localhost:8983/solr/geoblacklight'))
+        )
+      end
+
+      it 'connects to the solr instance' do
+        described_class.new(logger:)
+        expect(RSolr).to have_received(:connect).with(
+          be_a(Faraday::Connection),
+          url: 'http://localhost:8983/solr/geoblacklight'
+        )
+      end
+    end
+
+    context 'when solr url is not set' do
+      before do
+        stub_const('ENV', {})
+      end
+
+      it 'falls back to the Blacklight default' do
+        described_class.new(logger:)
+        expect(RSolr).to have_received(:connect).with(
+          be_a(Faraday::Connection),
+          url: 'http://localhost:8983/solr/blacklight-core'
+        )
+      end
     end
   end

   describe '#index' do
-
-
+    let(:solr_error_msg) { { error: { msg: 'error message' } }.to_json }
+    let(:solr_response) { { status: '400', body: solr_error_msg } }
+    let(:error) { RSolr::Error::Http.new({ uri: URI('') }, solr_response) }
+
+    it 'sends records in batches to solr' do
+      indexer.index(docs)
       expect(solr).to have_received(:update).with(
-
-
-
+        data: "{ add: { doc: {\"id\":\"1\"} },\nadd: { doc: {\"dc_identifier_s\":\"2\"} } }",
+        headers: { 'Content-Type' => 'application/json' },
+        params: { overwrite: true }
       )
     end

-    it 'prints the id and path of each record in debug mode' do
-      $DEBUG = true
-      expect { indexer.index([docs[0]]) }.to output("Indexing 1: path/to/record1.json\n").to_stdout
-      expect { indexer.index([docs[1]]) }.to output("Indexing 2: path/to/record2.json\n").to_stdout
-      $DEBUG = false
-    end
-
     it 'commits changes to solr after indexing' do
       indexer.index(docs)
       expect(solr).to have_received(:commit).once
@@ -59,5 +104,31 @@ RSpec.describe GeoCombine::Indexer do
     it 'returns the count of records successfully indexed' do
       expect(indexer.index(docs)).to eq 2
     end
+
+    context 'when an error occurs during batch indexing' do
+      before do
+        allow(solr).to receive(:update).and_raise(error)
+        allow(solr).to receive(:add)
+      end
+
+      it 'attempts to index records individually' do
+        total = indexer.index(docs)
+        expect(solr).to have_received(:add).twice
+        expect(total).to eq 2
+      end
+    end
+
+    context 'when an error occurs during individual indexing' do
+      before do
+        allow(solr).to receive(:update).and_raise(error)
+        allow(solr).to receive(:add).with(docs[0][0], anything).and_raise(error)
+        allow(solr).to receive(:add).with(docs[1][0], anything)
+      end
+
+      it 'continues indexing' do
+        total = indexer.index(docs)
+        expect(total).to eq 1
+      end
+    end
   end
 end