geo_combine 0.8.0 → 0.9.0

lib/geo_combine/indexer.rb
@@ -1,47 +1,126 @@
  # frozen_string_literal: true

  require 'rsolr'
+ require 'faraday/retry'
  require 'faraday/net_http_persistent'
+ require 'geo_combine/logger'

  module GeoCombine
    # Indexes Geoblacklight documents into Solr
    class Indexer
      attr_reader :solr

-     def self.solr(url: ENV.fetch('SOLR_URL', 'http://127.0.0.1:8983/solr/blacklight-core'))
-       RSolr.connect url: url, adapter: :net_http_persistent
+     def initialize(solr: nil, logger: GeoCombine::Logger.logger)
+       @logger = logger
+       @batch_size = ENV.fetch('SOLR_BATCH_SIZE', 100).to_i
+
+       # If SOLR_URL is set, use it; if in a Geoblacklight app, use its solr core
+       solr_url = ENV.fetch('SOLR_URL', nil)
+       solr_url ||= Blacklight.default_index.connection.base_uri.to_s if defined? Blacklight
+
+       # If neither, warn and fall back to the local Blacklight default solr core
+       if solr_url.nil?
+         @logger.warn 'SOLR_URL not set; using Blacklight default'
+         solr_url = 'http://localhost:8983/solr/blacklight-core'
+       end
+
+       @solr = solr || RSolr.connect(client, url: solr_url)
      end

-     def initialize(solr: GeoCombine::Indexer.solr)
-       @solr = solr
+     # Index everything and return the number of docs successfully indexed
+     def index(docs)
+       # Track total indexed and time spent
+       @logger.info "indexing into #{solr_url}"
+       total_indexed = 0
+       start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+
+       # Index in batches; set batch size via SOLR_BATCH_SIZE
+       batch = []
+       docs.each do |doc, path|
+         if batch.size < @batch_size
+           batch << [doc, path]
+         else
+           total_indexed += index_batch(batch)
+           batch = []
+         end
+       end
+       total_indexed += index_batch(batch) unless batch.empty?
+
+       # Issue a commit to make sure all documents are indexed
+       @solr.commit
+       end_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+       sec = end_time - start_time
+       @logger.info format('indexed %<total_indexed>d documents in %<sec>.2f seconds', total_indexed:, sec:)
+       total_indexed
      end

+     # URL to the solr instance being used
      def solr_url
        @solr.options[:url]
      end

-     # Index everything and return the number of docs successfully indexed
-     def index(docs, commit_within: ENV.fetch('SOLR_COMMIT_WITHIN', 5000).to_i)
-       indexed_count = 0
-
-       docs.each do |record, path|
-         # log the unique identifier for the record for debugging
-         id = record['id'] || record['dc_identifier_s']
-         puts "Indexing #{id}: #{path}" if $DEBUG
-
-         # index the record into solr
-         @solr.update params: { commitWithin: commit_within, overwrite: true },
-                      data: [record].to_json,
-                      headers: { 'Content-Type' => 'application/json' }
-
-         # count the number of records successfully indexed
-         indexed_count += 1
-       rescue RSolr::Error::Http => e
-         puts e
-       end
+     private

-       @solr.commit
-       indexed_count
+     # Index a batch of documents; if the batch fails, index them all individually
+     def index_batch(batch)
+       docs = batch.map(&:first)
+       @solr.update(data: batch_json(docs), params:, headers:)
+       @logger.debug "indexed batch (#{batch.size} docs)"
+       batch.size
+     rescue RSolr::Error::Http => e
+       @logger.error "error indexing batch (#{batch.size} docs): #{format_error(e)}"
+       @logger.warn 'retrying documents individually'
+       batch.map { |doc, path| index_single(doc, path) }.compact.size
+     end
+
+     # Index a single document; if it fails, log the error and continue
+     def index_single(doc, path)
+       @solr.add(doc, params:, headers:)
+       @logger.debug "indexed #{path}"
+       doc
+     rescue RSolr::Error::Http => e
+       @logger.error "error indexing #{path}: #{format_error(e)}"
+       nil
+     end
+
+     # Generate a JSON string to send to the solr update API for a batch of documents
+     def batch_json(batch)
+       batch.map { |doc| "add: { doc: #{doc.to_json} }" }.join(",\n").prepend('{ ').concat(' }')
+     end
+
+     # Generate a friendly error message for logging, including status code and message
+     def format_error(error)
+       code = error.response[:status]
+       status_info = "#{code} #{RSolr::Error::Http::STATUS_CODES[code.to_i]}"
+       error_info = parse_solr_error(error)
+       [status_info, error_info].compact.join(' - ')
+     end
+
+     # Extract the specific error message from a solr JSON error response, if any
+     def parse_solr_error(error)
+       JSON.parse(error.response[:body]).dig('error', 'msg')
+     rescue StandardError
+       nil
+     end
+
+     def headers
+       { 'Content-Type' => 'application/json' }
+     end
+
+     def params
+       { overwrite: true }
+     end
+
+     def client
+       @client ||= Faraday.new do |conn|
+         conn.request :retry, max: 3, interval: 1, backoff_factor: 2, exceptions: [
+           Faraday::TimeoutError,
+           Faraday::ConnectionFailed,
+           Faraday::TooManyRequestsError
+         ]
+         conn.response :raise_error
+         conn.adapter :net_http_persistent
+       end
      end
    end
  end
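The new indexer replaces the old one-document-at-a-time updates: it buffers [document, path] pairs into batches of SOLR_BATCH_SIZE (default 100), posts each batch as a single update (Solr's JSON update syntax accepts the repeated add keys that batch_json produces), and falls back to adding documents individually when a batch fails. A minimal usage sketch; the Solr URL and sample documents here are illustrative, not part of the gem:

    require 'geo_combine/indexer'

    ENV['SOLR_URL'] = 'http://localhost:8983/solr/geoblacklight' # assumed core
    ENV['SOLR_BATCH_SIZE'] = '50'                                # default is 100

    # #index consumes enumerable [document_hash, source_path] pairs
    docs = [
      [{ 'id' => 'example-1' }, 'docs/example-1.json'],
      [{ 'id' => 'example-2' }, 'docs/example-2.json']
    ]

    GeoCombine::Indexer.new.index(docs) # => number of documents successfully indexed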
lib/geo_combine/logger.rb (new file)
@@ -0,0 +1,16 @@
+ # frozen_string_literal: true
+
+ require 'logger'
+
+ module GeoCombine
+   # Logger for gem
+   class Logger
+     def self.logger
+       @logger ||= ::Logger.new(
+         $stderr,
+         progname: 'GeoCombine',
+         level: ENV.fetch('LOG_LEVEL', 'info').to_sym
+       )
+     end
+   end
+ end
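Everything the gem used to print with puts now flows through this shared, memoized logger, so verbosity is controlled in one place via LOG_LEVEL. A small sketch, with an illustrative message:

    ENV['LOG_LEVEL'] = 'debug' # read once, when the logger is first built

    require 'geo_combine/logger'

    log = GeoCombine::Logger.logger
    log.debug 'only visible at debug level' # lines carry the GeoCombine progname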
lib/geo_combine/migrators/v1_aardvark_migrator.rb
@@ -1,41 +1,86 @@
  # frozen_string_literal: true

+ require 'active_support'
+
  module GeoCombine
    module Migrators
-     # TODO: WARNING! This class is not fully implemented and should not be used in
-     # production. See https://github.com/OpenGeoMetadata/GeoCombine/issues/121
-     # for remaining work.
-     #
      # migrates the v1 schema to the aardvark schema
      class V1AardvarkMigrator
        attr_reader :v1_hash

        # @param v1_hash [Hash] parsed json in the v1 schema
-       def initialize(v1_hash:)
+       # @param collection_id_map [Hash] a hash mapping collection names to ids for converting dct_isPartOf_sm
+       def initialize(v1_hash:, collection_id_map: {})
          @v1_hash = v1_hash
+         @v2_hash = v1_hash
+         @collection_id_map = collection_id_map
        end

        def run
-         v2_hash = convert_keys
-         v2_hash['gbl_mdVersion_s'] = 'Aardvark'
-         v2_hash
+         # Return unchanged if already in the aardvark schema
+         return @v2_hash if @v2_hash['gbl_mdVersion_s'] == 'Aardvark'
+
+         # Convert the record
+         convert_keys
+         convert_single_to_multi_valued_fields
+         convert_non_crosswalked_fields
+         remove_deprecated_fields
+
+         # Mark the record as converted and return it
+         @v2_hash['gbl_mdVersion_s'] = 'Aardvark'
+         @v2_hash
        end

+       # Namespace and URI changes to fields
        def convert_keys
-         v1_hash.transform_keys do |k|
+         @v2_hash.transform_keys! do |k|
            SCHEMA_FIELD_MAP[k] || k
          end
        end

+       # Fields that need to be converted from single to multi-valued
+       def convert_single_to_multi_valued_fields
+         @v2_hash = @v2_hash.each_with_object({}) do |(k, v), h|
+           h[k] = if !v.is_a?(Array) && k.match?(/.*_[s|i]m/)
+                    [v]
+                  else
+                    v
+                  end
+         end
+       end
+
+       # Convert non-crosswalked fields via lookup tables
+       def convert_non_crosswalked_fields
+         # Keys may or may not include whitespace, so we normalize them.
+         # Resource class is required so we default to "Other"; resource type is not required.
+         @v2_hash['gbl_resourceClass_sm'] = RESOURCE_CLASS_MAP[@v1_hash['dc_type_s']&.gsub(/\s+/, '')] || ['Other']
+         resource_type = RESOURCE_TYPE_MAP[@v1_hash['layer_geom_type_s']&.gsub(/\s+/, '')]
+         @v2_hash['gbl_resourceType_sm'] = resource_type unless resource_type.nil?
+
+         # If the user specified a collection id map, use it to convert the collection names to ids
+         is_part_of = @v1_hash['dct_isPartOf_sm']&.map { |name| @collection_id_map[name] }&.compact
+         if is_part_of.present?
+           @v2_hash['dct_isPartOf_sm'] = is_part_of
+         else
+           @v2_hash.delete('dct_isPartOf_sm')
+         end
+       end
+
+       # Remove fields that are no longer used
+       def remove_deprecated_fields
+         @v2_hash = @v2_hash.except(*SCHEMA_FIELD_MAP.keys, 'dc_type_s', 'layer_geom_type_s')
+       end
+
        SCHEMA_FIELD_MAP = {
          'dc_title_s' => 'dct_title_s', # new namespace
          'dc_description_s' => 'dct_description_sm', # new namespace; single to multi-valued
          'dc_language_s' => 'dct_language_sm', # new namespace; single to multi-valued
-         'dc_language_sm' => 'dct_language_sm', # new namespace; single to multi-valued
+         'dc_language_sm' => 'dct_language_sm', # new namespace
          'dc_creator_sm' => 'dct_creator_sm', # new namespace
          'dc_publisher_s' => 'dct_publisher_sm', # new namespace; single to multi-valued
          'dct_provenance_s' => 'schema_provider_s', # new URI name
          'dc_subject_sm' => 'dct_subject_sm', # new namespace
+         'solr_geom' => 'dcat_bbox', # new URI name
          'solr_year_i' => 'gbl_indexYear_im', # new URI name; single to multi-valued
          'dc_source_sm' => 'dct_source_sm', # new namespace
          'dc_rights_s' => 'dct_accessRights_s', # new URI name
@@ -47,6 +92,27 @@ module GeoCombine
          'geoblacklight_version' => 'gbl_mdVersion_s', # new URI name
          'suppressed_b' => 'gbl_suppressed_b' # new namespace
        }.freeze
+
+       # Map Dublin Core types to Aardvark resource class sets
+       # See: https://github.com/OpenGeoMetadata/opengeometadata.github.io/blob/main/docs/ogm-aardvark/resource-class.md
+       RESOURCE_CLASS_MAP = {
+         'Collection' => ['Collections'],
+         'Dataset' => ['Datasets'],
+         'Image' => ['Imagery'],
+         'InteractiveResource' => ['Websites'],
+         'Service' => ['Web services'],
+         'StillImage' => ['Imagery']
+       }.freeze
+
+       # Map geometry types to Aardvark resource type sets
+       # See: https://github.com/OpenGeoMetadata/opengeometadata.github.io/blob/main/docs/ogm-aardvark/resource-type.md
+       RESOURCE_TYPE_MAP = {
+         'Point' => ['Point data'],
+         'Line' => ['Line data'],
+         'Polygon' => ['Polygon data'],
+         'Raster' => ['Raster data'],
+         'Table' => ['Table data']
+       }.freeze
      end
    end
  end
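The former production warning is gone: the migrator now performs the full crosswalk and is safe to rerun, since records already marked Aardvark are returned unchanged. A sketch of migrating one parsed v1 record, assuming the gem's top-level require loads the migrator; the file path and the name-to-id mapping are hypothetical:

    require 'json'
    require 'geo_combine'

    v1_record = JSON.parse(File.read('uganda_poverty.json')) # hypothetical path

    migrator = GeoCombine::Migrators::V1AardvarkMigrator.new(
      v1_hash: v1_record,
      # Optional: names in dct_isPartOf_sm are swapped for these ids; names
      # with no mapping are dropped, and if none map the field is removed.
      collection_id_map: { 'Uganda GIS Maps and Data, 2000-2010' => 'stanford-rb371kw9607' }
    )

    aardvark = migrator.run
    aardvark['gbl_mdVersion_s'] # => "Aardvark"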
@@ -75,7 +75,7 @@ module GeoCombine
        dc_publisher_s: metadata['Publisher'],
        dc_subject_sm: subjects,
        dc_type_s: 'Dataset'
-     }.delete_if { |_k, v| v.nil? }
+     }.compact
    end

    def date
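Hash#compact returns a copy with nil-valued entries removed, so it states the intent of delete_if { |_k, v| v.nil? } more directly and without mutating the receiver:

    { dc_publisher_s: nil, dc_type_s: 'Dataset' }.compact
    # => { dc_type_s: 'Dataset' }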
lib/geo_combine/version.rb
@@ -1,5 +1,5 @@
  # frozen_string_literal: true

  module GeoCombine
-   VERSION = '0.8.0'
+   VERSION = '0.9.0'
  end
lib/tasks/geo_combine.rake
@@ -12,24 +12,20 @@ namespace :geocombine do
    desc 'Clone OpenGeoMetadata repositories'
    task :clone, [:repo] do |_t, args|
      harvester = GeoCombine::Harvester.new
-     total = args[:repo] ? harvester.clone(args.repo) : harvester.clone_all
-     puts "Cloned #{total} repositories"
+     args[:repo] ? harvester.clone(args.repo) : harvester.clone_all
    end

    desc '"git pull" OpenGeoMetadata repositories'
    task :pull, [:repo] do |_t, args|
      harvester = GeoCombine::Harvester.new
-     total = args[:repo] ? harvester.pull(args.repo) : harvester.pull_all
-     puts "Updated #{total} repositories"
+     args[:repo] ? harvester.pull(args.repo) : harvester.pull_all
    end

    desc 'Index all JSON documents except Layers.json'
    task :index do
      harvester = GeoCombine::Harvester.new
      indexer = GeoCombine::Indexer.new
-     puts "Indexing #{harvester.ogm_path} into #{indexer.solr_url}"
-     total = indexer.index(harvester.docs_to_index)
-     puts "Indexed #{total} documents"
+     indexer.index(harvester.docs_to_index)
    end

    namespace :geoblacklight_harvester do
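With the puts calls removed, these tasks report progress solely through GeoCombine::Logger, and the values they used to print remain available as return values (repository names from clone_all and pull_all, the document count from index). An invocation sketch; the repository name is illustrative:

    # From a shell:
    #   LOG_LEVEL=debug bundle exec rake geocombine:clone[my-institution]
    #   bundle exec rake geocombine:index
    #
    # Or programmatically, mirroring the :index task body:
    require 'geo_combine/harvester'
    require 'geo_combine/indexer'

    GeoCombine::Indexer.new.index(GeoCombine::Harvester.new.docs_to_index)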
@@ -28,6 +28,13 @@
    "dct_spatial_sm":[
      "Uganda"
    ],
+   "dct_isPartOf_sm":[
+     "Uganda GIS Maps and Data, 2000-2010"
+   ],
+   "dc_source_sm": [
+     "stanford-rb371kw9607"
+   ],
    "solr_geom":"ENVELOPE(29.572742, 35.000308, 4.234077, -1.478794)",
-   "solr_year_i":2005
+   "solr_year_i":2005,
+   "suppressed_b":false
  }
spec/fixtures/indexing/aardvark.json
@@ -1,19 +1,31 @@
  {
    "gbl_mdVersion_s":"Aardvark",
-   "dct_identifier_sm":"http://purl.stanford.edu/cz128vq0535",
+   "dct_identifier_sm":[
+     "http://purl.stanford.edu/cz128vq0535"
+   ],
    "dct_title_s":"2005 Rural Poverty GIS Database: Uganda",
-   "dct_description_sm":"This polygon shapefile contains 2005 poverty data for 855 rural subcounties in Uganda. These data are intended for researchers, students, policy makers and the general public for reference and mapping purposes, and may be used for basic applications such as viewing, querying, and map output production.",
+   "dct_description_sm":[
+     "This polygon shapefile contains 2005 poverty data for 855 rural subcounties in Uganda. These data are intended for researchers, students, policy makers and the general public for reference and mapping purposes, and may be used for basic applications such as viewing, querying, and map output production."
+   ],
    "dct_accessRights_s":"Public",
    "schema_provider_s":"Stanford",
    "dct_references_s":"{\"http://schema.org/url\":\"http://purl.stanford.edu/cz128vq0535\",\"http://schema.org/downloadUrl\":\"http://stacks.stanford.edu/file/druid:cz128vq0535/data.zip\",\"http://www.loc.gov/mods/v3\":\"http://purl.stanford.edu/cz128vq0535.mods\",\"http://www.isotc211.org/schemas/2005/gmd/\":\"http://opengeometadata.stanford.edu/metadata/edu.stanford.purl/druid:cz128vq0535/iso19139.xml\",\"http://www.w3.org/1999/xhtml\":\"http://opengeometadata.stanford.edu/metadata/edu.stanford.purl/druid:cz128vq0535/default.html\",\"http://www.opengis.net/def/serviceType/ogc/wfs\":\"https://geowebservices.stanford.edu/geoserver/wfs\",\"http://www.opengis.net/def/serviceType/ogc/wms\":\"https://geowebservices.stanford.edu/geoserver/wms\"}",
    "gbl_wxsIdentifier_s":"druid:cz128vq0535",
    "id":"stanford-cz128vq0535",
-   "layer_geom_type_s":"Polygon",
+   "gbl_resourceType_sm": [
+     "Polygon data"
+   ],
    "gbl_mdModified_dt":"2015-01-13T18:46:38Z",
    "dct_format_s":"Shapefile",
-   "dct_language_sm":"English",
-   "dc_type_s":"Dataset",
-   "dct_publisher_sm":"Uganda Bureau of Statistics",
+   "dct_language_sm":[
+     "English"
+   ],
+   "gbl_resourceClass_sm":[
+     "Datasets"
+   ],
+   "dct_publisher_sm":[
+     "Uganda Bureau of Statistics"
+   ],
    "dct_creator_sm":[
      "Uganda Bureau of Statistics"
    ],
@@ -28,6 +40,12 @@
    "dct_spatial_sm":[
      "Uganda"
    ],
-   "solr_geom":"ENVELOPE(29.572742, 35.000308, 4.234077, -1.478794)",
-   "gbl_indexYear_im":2005
+   "dct_source_sm": [
+     "stanford-rb371kw9607"
+   ],
+   "dcat_bbox":"ENVELOPE(29.572742, 35.000308, 4.234077, -1.478794)",
+   "gbl_indexYear_im":[
+     2005
+   ],
+   "gbl_suppressed_b":false
  }
spec/lib/geo_combine/geo_blacklight_harvester_spec.rb
@@ -5,8 +5,9 @@ require 'spec_helper'
  require 'rsolr'

  RSpec.describe GeoCombine::GeoBlacklightHarvester do
-   subject(:harvester) { described_class.new(site_key) }
+   subject(:harvester) { described_class.new(site_key, logger:) }

+   let(:logger) { instance_double(Logger, warn: nil, info: nil, error: nil, debug: nil) }
    let(:site_key) { :INSTITUTION }
    let(:stub_json_response) { '{}' }
    let(:stub_solr_connection) { double('RSolr::Connection') }
@@ -40,7 +41,7 @@ RSpec.describe GeoCombine::GeoBlacklightHarvester do

      let(:docs) { [{ layer_slug_s: 'abc-123' }, { layer_slug_s: 'abc-321' }] }
      let(:stub_json_response) do
-       { response: { docs: docs, pages: { current_page: 1, total_pages: 1 } } }.to_json
+       { response: { docs:, pages: { current_page: 1, total_pages: 1 } } }.to_json
      end

      it 'adds documents returned to solr' do
@@ -142,7 +143,7 @@ RSpec.describe GeoCombine::GeoBlacklightHarvester do
      ).and_return(stub_second_response.to_json)
      base_url = 'https://example.com?f%5Bdct_provenance_s%5D%5B%5D=INSTITUTION&format=json&per_page=100'
      docs = described_class::LegacyBlacklightResponse.new(response: stub_first_response,
-                                                          base_url: base_url).documents
+                                                          base_url:).documents

      expect(docs.to_a).to eq([first_docs, second_docs])
    end
@@ -182,7 +183,7 @@ RSpec.describe GeoCombine::GeoBlacklightHarvester do

      base_url = 'https://example.com?f%5Bdct_provenance_s%5D%5B%5D=INSTITUTION&format=json&per_page=100'
      docs = described_class::ModernBlacklightResponse.new(response: first_results_response,
-                                                          base_url: base_url).documents
+                                                          base_url:).documents

      expect(docs.to_a).to eq([
        [{ 'layer_slug_s' => 'abc-123' }, { 'layer_slug_s' => 'abc-321' }],
spec/lib/geo_combine/harvester_spec.rb
@@ -5,8 +5,9 @@ require 'geo_combine/harvester'
  require 'spec_helper'

  RSpec.describe GeoCombine::Harvester do
-   subject(:harvester) { described_class.new(ogm_path: 'spec/fixtures/indexing') }
+   subject(:harvester) { described_class.new(ogm_path: 'spec/fixtures/indexing', logger:) }

+   let(:logger) { instance_double(Logger, warn: nil, info: nil, error: nil, debug: nil) }
    let(:repo_name) { 'my-institution' }
    let(:repo_path) { File.join(harvester.ogm_path, repo_name) }
    let(:repo_url) { "https://github.com/OpenGeoMetadata/#{repo_name}.git" }
@@ -34,8 +35,7 @@ RSpec.describe GeoCombine::Harvester do
      end

      # stub git commands
-     allow(Git).to receive(:open).and_return(stub_repo)
-     allow(Git).to receive(:clone).and_return(stub_repo)
+     allow(Git).to receive_messages(open: stub_repo, clone: stub_repo)
      allow(stub_repo).to receive(:pull).and_return(stub_repo)
    end

@@ -48,7 +48,7 @@ RSpec.describe GeoCombine::Harvester do
    end

    it 'skips records with a different schema version' do
-     harvester = described_class.new(ogm_path: 'spec/fixtures/indexing/', schema_version: 'Aardvark')
+     harvester = described_class.new(ogm_path: 'spec/fixtures/indexing/', schema_version: 'Aardvark', logger:)
      expect { |b| harvester.docs_to_index(&b) }.to yield_successive_args(
        [JSON.parse(File.read('spec/fixtures/indexing/aardvark.json')), 'spec/fixtures/indexing/aardvark.json']
      )
@@ -75,8 +75,8 @@ RSpec.describe GeoCombine::Harvester do
        expect(stub_repo).to have_received(:pull).exactly(2).times
      end

-     it 'returns the count of repositories pulled' do
-       expect(harvester.pull_all).to eq(2)
+     it 'returns the names of repositories pulled' do
+       expect(harvester.pull_all).to eq(%w[my-institution another-institution])
      end

      it 'skips repositories in the denylist' do
@@ -107,20 +107,6 @@ RSpec.describe GeoCombine::Harvester do
        harvester.clone(repo_name)
        expect(Git).not_to have_received(:clone)
      end
-
-     it 'warns if a repository is empty' do
-       allow(Net::HTTP).to receive(:get).with('https://api.github.com/repos/opengeometadata/empty').and_return('{"size": 0}')
-       expect do
-         harvester.clone('empty')
-       end.to output(/repository 'empty' is empty/).to_stdout
-     end
-
-     it 'warns if a repository is archived' do
-       allow(Net::HTTP).to receive(:get).with('https://api.github.com/repos/opengeometadata/empty').and_return('{"archived": true}')
-       expect do
-         harvester.clone('outdated-institution')
-       end.to output(/repository 'outdated-institution' is archived/).to_stdout
-     end
    end

    describe '#clone_all' do
@@ -134,8 +120,8 @@ RSpec.describe GeoCombine::Harvester do
        expect(Git).not_to have_received(:clone).with('https://github.com/OpenGeoMetadata/aardvark.git')
      end

-     it 'returns the count of repositories cloned' do
-       expect(harvester.clone_all).to eq(2)
+     it 'returns the names of repositories cloned' do
+       expect(harvester.clone_all).to eq(%w[my-institution another-institution])
      end
    end

spec/lib/geo_combine/indexer_spec.rb
@@ -3,10 +3,22 @@
  require 'geo_combine/indexer'
  require 'spec_helper'

+ # Mock an available Blacklight installation
+ class FakeBlacklight
+   def self.default_index
+     Repository
+   end
+
+   class Repository
+     def self.connection; end
+   end
+ end
+
  RSpec.describe GeoCombine::Indexer do
-   subject(:indexer) { described_class.new(solr: solr) }
+   subject(:indexer) { described_class.new(solr:, logger:) }

-   let(:solr) { instance_double(RSolr::Client) }
+   let(:logger) { instance_double(Logger, warn: nil, info: nil, error: nil, debug: nil) }
+   let(:solr) { instance_double(RSolr::Client, options: { url: 'TEST' }) }
    let(:docs) do
      [
        [{ 'id' => '1' }, 'path/to/record1.json'], # v1.0 schema
@@ -21,36 +33,69 @@ RSpec.describe GeoCombine::Indexer do

    describe '#initialize' do
      before do
-       stub_const('ENV', 'SOLR_URL' => 'http://localhost:8983/solr/geoblacklight')
        allow(RSolr).to receive(:connect).and_return(solr)
      end

-     it 'connects to a solr instance if set in the environment' do
-       described_class.new
-       expect(RSolr).to have_received(:connect).with(
-         url: 'http://localhost:8983/solr/geoblacklight',
-         adapter: :net_http_persistent
-       )
+     context 'when solr url is set in the environment' do
+       before do
+         stub_const('ENV', 'SOLR_URL' => 'http://localhost:8983/solr/geoblacklight')
+       end
+
+       it 'connects to the solr instance' do
+         described_class.new(logger:)
+         expect(RSolr).to have_received(:connect).with(
+           be_a(Faraday::Connection),
+           url: 'http://localhost:8983/solr/geoblacklight'
+         )
+       end
+     end
+
+     context 'when there is a configured Blacklight connection' do
+       before do
+         stub_const('Blacklight', FakeBlacklight)
+         allow(FakeBlacklight::Repository).to receive(:connection).and_return(
+           instance_double(RSolr::Client, base_uri: URI('http://localhost:8983/solr/geoblacklight'))
+         )
+       end
+
+       it 'connects to the solr instance' do
+         described_class.new(logger:)
+         expect(RSolr).to have_received(:connect).with(
+           be_a(Faraday::Connection),
+           url: 'http://localhost:8983/solr/geoblacklight'
+         )
+       end
+     end
+
+     context 'when solr url is not set' do
+       before do
+         stub_const('ENV', {})
+       end
+
+       it 'falls back to the Blacklight default' do
+         described_class.new(logger:)
+         expect(RSolr).to have_received(:connect).with(
+           be_a(Faraday::Connection),
+           url: 'http://localhost:8983/solr/blacklight-core'
+         )
+       end
      end
    end

    describe '#index' do
-     it 'posts each record to solr as JSON' do
-       indexer.index([docs[0]], commit_within: 1)
+     let(:solr_error_msg) { { error: { msg: 'error message' } }.to_json }
+     let(:solr_response) { { status: '400', body: solr_error_msg } }
+     let(:error) { RSolr::Error::Http.new({ uri: URI('') }, solr_response) }
+
+     it 'sends records in batches to solr' do
+       indexer.index(docs)
        expect(solr).to have_received(:update).with(
-         params: { commitWithin: 1, overwrite: true },
-         data: [docs[0][0]].to_json,
-         headers: { 'Content-Type' => 'application/json' }
+         data: "{ add: { doc: {\"id\":\"1\"} },\nadd: { doc: {\"dc_identifier_s\":\"2\"} } }",
+         headers: { 'Content-Type' => 'application/json' },
+         params: { overwrite: true }
        )
      end

-     it 'prints the id and path of each record in debug mode' do
-       $DEBUG = true
-       expect { indexer.index([docs[0]]) }.to output("Indexing 1: path/to/record1.json\n").to_stdout
-       expect { indexer.index([docs[1]]) }.to output("Indexing 2: path/to/record2.json\n").to_stdout
-       $DEBUG = false
-     end
-
      it 'commits changes to solr after indexing' do
        indexer.index(docs)
        expect(solr).to have_received(:commit).once
@@ -59,5 +104,31 @@
      it 'returns the count of records successfully indexed' do
        expect(indexer.index(docs)).to eq 2
      end
+
+     context 'when an error occurs during batch indexing' do
+       before do
+         allow(solr).to receive(:update).and_raise(error)
+         allow(solr).to receive(:add)
+       end
+
+       it 'attempts to index records individually' do
+         total = indexer.index(docs)
+         expect(solr).to have_received(:add).twice
+         expect(total).to eq 2
+       end
+     end
+
+     context 'when an error occurs during individual indexing' do
+       before do
+         allow(solr).to receive(:update).and_raise(error)
+         allow(solr).to receive(:add).with(docs[0][0], anything).and_raise(error)
+         allow(solr).to receive(:add).with(docs[1][0], anything)
+       end
+
+       it 'continues indexing' do
+         total = indexer.index(docs)
+         expect(total).to eq 1
+       end
+     end
    end
  end