search_solr_tools 4.2.0 → 5.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -1
- data/README.md +6 -6
- data/bin/search_solr_tools +58 -2
- data/lib/search_solr_tools.rb +9 -7
- data/lib/search_solr_tools/errors/harvest_error.rb +88 -0
- data/lib/search_solr_tools/harvesters/adc.rb +2 -0
- data/lib/search_solr_tools/harvesters/ade_auto_suggest.rb +2 -0
- data/lib/search_solr_tools/harvesters/auto_suggest.rb +5 -1
- data/lib/search_solr_tools/harvesters/base.rb +65 -11
- data/lib/search_solr_tools/harvesters/bcodmo.rb +1 -0
- data/lib/search_solr_tools/harvesters/data_one.rb +2 -0
- data/lib/search_solr_tools/harvesters/echo.rb +2 -0
- data/lib/search_solr_tools/harvesters/gtnp.rb +1 -0
- data/lib/search_solr_tools/harvesters/ices.rb +3 -0
- data/lib/search_solr_tools/harvesters/ncdc_paleo.rb +3 -0
- data/lib/search_solr_tools/harvesters/nmi.rb +2 -0
- data/lib/search_solr_tools/harvesters/nodc.rb +3 -0
- data/lib/search_solr_tools/harvesters/nsidc_auto_suggest.rb +2 -0
- data/lib/search_solr_tools/harvesters/nsidc_json.rb +32 -5
- data/lib/search_solr_tools/harvesters/oai.rb +3 -0
- data/lib/search_solr_tools/harvesters/pdc.rb +2 -0
- data/lib/search_solr_tools/harvesters/r2r.rb +2 -2
- data/lib/search_solr_tools/harvesters/rda.rb +2 -0
- data/lib/search_solr_tools/harvesters/tdar.rb +2 -0
- data/lib/search_solr_tools/harvesters/usgs.rb +3 -0
- data/lib/search_solr_tools/helpers/data_one_format.rb +3 -3
- data/lib/search_solr_tools/helpers/harvest_status.rb +44 -0
- data/lib/search_solr_tools/helpers/iso_to_solr.rb +1 -0
- data/lib/search_solr_tools/helpers/iso_to_solr_format.rb +2 -2
- data/lib/search_solr_tools/helpers/ncdc_paleo_format.rb +3 -3
- data/lib/search_solr_tools/helpers/r2r_format.rb +3 -3
- data/lib/search_solr_tools/helpers/selectors.rb +1 -2
- data/lib/search_solr_tools/helpers/solr_format.rb +1 -0
- data/lib/search_solr_tools/helpers/tdar_format.rb +3 -3
- data/lib/search_solr_tools/helpers/translate_spatial_coverage.rb +2 -2
- data/lib/search_solr_tools/helpers/translate_temporal_coverage.rb +1 -1
- data/lib/search_solr_tools/helpers/usgs_format.rb +2 -2
- data/lib/search_solr_tools/selectors/adc.rb +2 -1
- data/lib/search_solr_tools/selectors/data_one.rb +2 -1
- data/lib/search_solr_tools/selectors/echo_iso.rb +2 -1
- data/lib/search_solr_tools/selectors/ices_iso.rb +2 -1
- data/lib/search_solr_tools/selectors/ncdc_paleo.rb +2 -1
- data/lib/search_solr_tools/selectors/nmi.rb +2 -1
- data/lib/search_solr_tools/selectors/nodc_iso.rb +2 -1
- data/lib/search_solr_tools/selectors/pdc_iso.rb +2 -1
- data/lib/search_solr_tools/selectors/r2r.rb +3 -1
- data/lib/search_solr_tools/selectors/rda.rb +2 -1
- data/lib/search_solr_tools/selectors/tdar_opensearch.rb +3 -1
- data/lib/search_solr_tools/selectors/usgs_iso.rb +3 -1
- data/lib/search_solr_tools/translators/bcodmo_json.rb +3 -0
- data/lib/search_solr_tools/translators/eol_to_solr.rb +6 -0
- data/lib/search_solr_tools/translators/gtnp_json.rb +3 -0
- data/lib/search_solr_tools/translators/nsidc_json.rb +3 -0
- data/lib/search_solr_tools/version.rb +1 -1
- data/search_solr_tools.gemspec +22 -23
- metadata +47 -55
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a961c7f8cdb3a126f31ba685db351337dda8816f64de94b82aad774f53b49a0c
|
4
|
+
data.tar.gz: bcadb76963b19f66567c3e5a233dd599a66d642073eb5c043b3fc5653ce2502f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1a238ac9143e968252c4f37474c99ed38a6920160f9f3d8ba8753b72848dbf7152ba0bc0b142298141eae6693102481a34f20d7c461e869b1a1606b7a3fcb471
|
7
|
+
data.tar.gz: 0ca3f51c62c0683d58652928ea2bf8a09987cee68d8af691ee523186e1d3ce7dc0f4bec0e294cdb82206eb8ebb5d2b6ad6085bed651b6f0944f979cecb0875bb
|
data/CHANGELOG.md
CHANGED
@@ -1,9 +1,35 @@
|
|
1
|
-
##
|
1
|
+
## v5.1.0 (2020-07-23)
|
2
|
+
|
3
|
+
- Added a CLI method to "ping" the Solr and Source servers for a given
|
4
|
+
data center.
|
5
|
+
- Added a CLI method "errcode" to get information about the various
|
6
|
+
error codes that may be returned during harvest
|
7
|
+
- Updated the CLI harvest to return more useful error codes on failure.
|
8
|
+
|
9
|
+
## v5.0.1 (2020-07-02)
|
10
|
+
|
11
|
+
- Bug fix: some requires weren't included that needed to be.
|
12
|
+
|
13
|
+
## v5.0.0 (2020-07-02)
|
14
|
+
|
15
|
+
- Update Ruby to 2.6.5, update gem dependencies to more recent version.
|
16
|
+
- Updates to correspond with an update to Solr 8.5.2
|
17
|
+
|
18
|
+
## v4.2.1 (2019-08-13)
|
19
|
+
|
20
|
+
- Patch release to include updated CHANGELOG.
|
21
|
+
|
22
|
+
## v4.2.0 (2019-08-12)
|
2
23
|
|
3
24
|
- Update dataset-catalog-services URL to only fetch current (*not* retired)
|
4
25
|
metadata records.
|
5
26
|
- Add a few more gem release notes to README.
|
6
27
|
|
28
|
+
Note: v4.1.0 was prematurely released and, in theory, yanked. However, on the
|
29
|
+
second try at publishing 4.1.0, Rubygems complained about the attempt to
|
30
|
+
republish a gem. The version was therefore bumped again to 4.2.0 as the path of
|
31
|
+
least resistance to a successful publish. v4.1.0 should not be used.
|
32
|
+
|
7
33
|
## v4.0.1 (2019-07-08)
|
8
34
|
|
9
35
|
- Update CHANGELOG and release instructions.
|
data/README.md
CHANGED
@@ -61,7 +61,7 @@ Once you have the gem built in the project directory, install the utility:
|
|
61
61
|
|
62
62
|
### Requirements
|
63
63
|
|
64
|
-
* Ruby > 2.
|
64
|
+
* Ruby > 2.6.5
|
65
65
|
* [Bundler](http://bundler.io/)
|
66
66
|
* Requirements for nokogiri:
|
67
67
|
* [libxml2/libxml2-dev](http://xmlsoft.org/)
|
@@ -105,7 +105,7 @@ Please be sure to run them in the `bundle exec` context if you're utilizing bund
|
|
105
105
|
|
106
106
|
Requirements:
|
107
107
|
|
108
|
-
* Ruby > 2.
|
108
|
+
* Ruby > 2.6.5
|
109
109
|
* [Bundler](http://bundler.io/)
|
110
110
|
* [Gem Release](https://github.com/svenfuchs/gem-release)
|
111
111
|
* [Rake](https://github.com/ruby/rake)
|
@@ -124,9 +124,9 @@ tagging, and publishing to RubyGems.
|
|
124
124
|
|---------------------------|-------------|
|
125
125
|
| `rake release:pre[false]` | Increase the current prerelease version number, push changes |
|
126
126
|
| `rake release:pre[true]` | Increase the current prerelease version number, publish release\* |
|
127
|
-
| `rake release:none` | Drop the prerelease version, publish release
|
128
|
-
| `rake release:minor` | Increase the minor version number, publish release
|
129
|
-
| `rake release:major` | Increase the major version number, publish release
|
127
|
+
| `rake release:none` | Drop the prerelease version, publish release\*, then `pre[false]` (does a patch release) |
|
128
|
+
| `rake release:minor` | Increase the minor version number, publish release\*, then `pre[false]` |
|
129
|
+
| `rake release:major` | Increase the major version number, publish release\*, then `pre[false]` |
|
130
130
|
|
131
131
|
\*"publish release" means each of the following occurs:
|
132
132
|
|
@@ -150,7 +150,7 @@ order to publish a new version of the gem to Rubygems. To get the lastest API ke
|
|
150
150
|
### SOLR
|
151
151
|
|
152
152
|
To harvest data utilizing the gem, you will need an installed instance of [Solr
|
153
|
-
8.
|
153
|
+
8.5.3](https://lucene.apache.org/solr/guide/)
|
154
154
|
|
155
155
|
#### NSIDC
|
156
156
|
|
data/bin/search_solr_tools
CHANGED
@@ -4,6 +4,7 @@
|
|
4
4
|
require 'search_solr_tools'
|
5
5
|
require 'thor'
|
6
6
|
|
7
|
+
# rubocop:disable Metrics/AbcSize
|
7
8
|
class SolrHarvestCLI < Thor
|
8
9
|
map %w[--version -v] => :__print_version
|
9
10
|
|
@@ -12,6 +13,48 @@ class SolrHarvestCLI < Thor
|
|
12
13
|
puts SearchSolrTools::VERSION
|
13
14
|
end
|
14
15
|
|
16
|
+
desc 'errcode CODE', 'Print all exit codes bundled in CODE. Omit CODE to print all codes'
|
17
|
+
def errcode(code = -1)
|
18
|
+
codes = SearchSolrTools::Errors::HarvestError.describe_exit_code(code)
|
19
|
+
|
20
|
+
puts 'CODE | DESCRIPTION'
|
21
|
+
puts '-----+------------'
|
22
|
+
codes.each do |c, text|
|
23
|
+
puts format('%4<code>d | %<text>s', code: c, text: text)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
desc 'ping', 'Ping the solr and harvesting endpoints related to the specified data center(s)'
|
28
|
+
option :data_center, type: :array, required: true
|
29
|
+
option :environment, required: true
|
30
|
+
def ping
|
31
|
+
solr_success = true
|
32
|
+
source_success = true
|
33
|
+
options[:data_center].each do |target|
|
34
|
+
begin
|
35
|
+
harvest_class = get_harvester_class(target)
|
36
|
+
harvester = harvest_class.new(options[:environment])
|
37
|
+
solr_status = harvester.ping_solr
|
38
|
+
source_status = harvester.ping_source
|
39
|
+
rescue StandardError => e
|
40
|
+
solr_status = false
|
41
|
+
source_status = false
|
42
|
+
puts "Error trying to ping for #{target}: #{e}"
|
43
|
+
end
|
44
|
+
solr_success &&= solr_status
|
45
|
+
source_success &&= source_status
|
46
|
+
puts "Target: #{target}, Solr ping OK? #{solr_status}, data center ping OK? #{source_status}"
|
47
|
+
end
|
48
|
+
|
49
|
+
ping_status = SearchSolrTools::Helpers::HarvestStatus.new(
|
50
|
+
SearchSolrTools::Helpers::HarvestStatus::PING_SOLR => solr_success,
|
51
|
+
SearchSolrTools::Helpers::HarvestStatus::PING_SOURCE => source_success
|
52
|
+
)
|
53
|
+
raise SearchSolrTools::Errors::HarvestError, ping_status unless ping_status.ok?
|
54
|
+
rescue SearchSolrTools::Errors::HarvestError => e
|
55
|
+
exit e.exit_code
|
56
|
+
end
|
57
|
+
|
15
58
|
desc 'harvest', 'Harvest from the specified data centers'
|
16
59
|
option :data_center, type: :array, required: true
|
17
60
|
option :environment, required: true
|
@@ -22,10 +65,21 @@ class SolrHarvestCLI < Thor
|
|
22
65
|
begin
|
23
66
|
harvest_class = get_harvester_class(target)
|
24
67
|
harvester = harvest_class.new(options[:environment], die_on_failure)
|
68
|
+
ping_status = SearchSolrTools::Helpers::HarvestStatus.new(
|
69
|
+
SearchSolrTools::Helpers::HarvestStatus::PING_SOLR => harvester.ping_solr,
|
70
|
+
SearchSolrTools::Helpers::HarvestStatus::PING_SOURCE => harvester.ping_source
|
71
|
+
)
|
72
|
+
raise SearchSolrTools::Errors::HarvestError, ping_status unless ping_status.ok?
|
73
|
+
|
25
74
|
harvester.harvest_and_delete
|
75
|
+
rescue SearchSolrTools::Errors::HarvestError => e
|
76
|
+
puts "THERE WERE HARVEST STATUS ERRORS:\n#{e.message}"
|
77
|
+
exit e.exit_code
|
26
78
|
rescue StandardError => e
|
79
|
+
# If it gets here, there is an error that we aren't expecting.
|
27
80
|
puts "harvest failed for #{target}: #{e.message}"
|
28
|
-
|
81
|
+
puts e.backtrace
|
82
|
+
exit SearchSolrTools::Errors::HarvestError::ERRCODE_OTHER
|
29
83
|
end
|
30
84
|
end
|
31
85
|
end
|
@@ -85,10 +139,12 @@ class SolrHarvestCLI < Thor
|
|
85
139
|
|
86
140
|
def get_harvester_class(data_center_name)
|
87
141
|
name = data_center_name.downcase.to_s
|
88
|
-
raise("Invalid data center #{name}") unless harvester_map.key?(name)
|
142
|
+
raise SearchSolrTools::Errors::HarvestError.new(nil, "Invalid data center #{name}") unless harvester_map.key?(name)
|
89
143
|
|
90
144
|
harvester_map[name]
|
91
145
|
end
|
92
146
|
end
|
93
147
|
end
|
148
|
+
# rubocop:enable Metrics/AbcSize
|
149
|
+
|
94
150
|
SolrHarvestCLI.start(ARGV)
|
data/lib/search_solr_tools.rb
CHANGED
@@ -1,8 +1,10 @@
|
|
1
|
-
|
2
|
-
require_relative '
|
3
|
-
require_relative './search_solr_tools/version'
|
1
|
+
require_relative 'search_solr_tools/config/environments'
|
2
|
+
require_relative 'search_solr_tools/version'
|
4
3
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
4
|
+
require_relative 'search_solr_tools/helpers/selectors'
|
5
|
+
require_relative 'search_solr_tools/helpers/harvest_status'
|
6
|
+
require_relative 'search_solr_tools/errors/harvest_error'
|
7
|
+
|
8
|
+
%w( selectors harvesters translators ).each do |subdir|
|
9
|
+
Dir[File.join(__dir__, 'search_solr_tools', subdir, '*.rb')].each { |file| require file }
|
10
|
+
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
module SearchSolrTools
|
2
|
+
module Errors
|
3
|
+
class HarvestError < StandardError
|
4
|
+
ERRCODE_SOLR_PING = 1
|
5
|
+
ERRCODE_SOURCE_PING = 2
|
6
|
+
ERRCODE_SOURCE_NO_RESULTS = 4
|
7
|
+
ERRCODE_SOURCE_HARVEST_ERROR = 8
|
8
|
+
ERRCODE_DOCUMENT_INVALID = 16
|
9
|
+
ERRCODE_INGEST_ERROR = 32
|
10
|
+
ERRCODE_OTHER = 128
|
11
|
+
|
12
|
+
ERRCODE_DESC = {
|
13
|
+
ERRCODE_SOLR_PING => 'Solr instance did not return a successful ping',
|
14
|
+
ERRCODE_SOURCE_PING => 'Source to be harvested did not return a successful ping',
|
15
|
+
ERRCODE_SOURCE_NO_RESULTS => 'Source to be harvested returned no documents matching query',
|
16
|
+
ERRCODE_SOURCE_HARVEST_ERROR => 'One or more source documents returned an error when trying to retrieve or translate',
|
17
|
+
ERRCODE_DOCUMENT_INVALID => 'One or more documents to be harvested was invalid (malformed)',
|
18
|
+
ERRCODE_INGEST_ERROR => 'Solr returned an error trying to ingest one or more harvested documents',
|
19
|
+
ERRCODE_OTHER => 'General error code for non-harvest related issues'
|
20
|
+
}.freeze
|
21
|
+
|
22
|
+
PING_ERRCODE_MAP = {
|
23
|
+
'ping_solr' => ERRCODE_SOLR_PING,
|
24
|
+
'ping_source' => ERRCODE_SOURCE_PING,
|
25
|
+
}
|
26
|
+
|
27
|
+
STATUS_ERRCODE_MAP = {
|
28
|
+
Helpers::HarvestStatus::HARVEST_NO_DOCS => ERRCODE_SOURCE_NO_RESULTS,
|
29
|
+
Helpers::HarvestStatus::HARVEST_FAILURE => ERRCODE_SOURCE_HARVEST_ERROR,
|
30
|
+
Helpers::HarvestStatus::INGEST_ERR_INVALID_DOC => ERRCODE_DOCUMENT_INVALID,
|
31
|
+
Helpers::HarvestStatus::INGEST_ERR_SOLR_ERROR => ERRCODE_INGEST_ERROR,
|
32
|
+
Helpers::HarvestStatus::OTHER_ERROR => ERRCODE_OTHER
|
33
|
+
}.freeze
|
34
|
+
|
35
|
+
# If code is -1, it means display all error codes
|
36
|
+
def self.describe_exit_code(code = -1)
|
37
|
+
code = code.to_i
|
38
|
+
code_list = []
|
39
|
+
|
40
|
+
# Loop through all bit-flag values
|
41
|
+
[128, 64, 32, 16, 8, 4, 2, 1].each do |k|
|
42
|
+
if code >= k || code == -1
|
43
|
+
code_list.prepend k
|
44
|
+
code -= k unless code == -1
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
codes = {}
|
49
|
+
code_list.each do |k|
|
50
|
+
next if code == -1 && !ERRCODE_DESC.keys.include?(k) # skip INVALID CODE if showing all codes
|
51
|
+
codes[k] = ERRCODE_DESC.keys.include?(k) ? ERRCODE_DESC[k] : 'INVALID CODE NUMBER'
|
52
|
+
end
|
53
|
+
|
54
|
+
codes
|
55
|
+
end
|
56
|
+
|
57
|
+
def initialize(status, message=nil)
|
58
|
+
@status_data = status
|
59
|
+
@other_message = message
|
60
|
+
end
|
61
|
+
|
62
|
+
def exit_code
|
63
|
+
if @status_data.nil?
|
64
|
+
puts "OTHER ERROR REPORTED: #{@other_message}"
|
65
|
+
return ERRCODE_OTHER
|
66
|
+
end
|
67
|
+
|
68
|
+
puts "EXIT CODE STATUS:\n#{@status_data.status}"
|
69
|
+
|
70
|
+
code = 0
|
71
|
+
code += ERRCODE_SOLR_PING unless @status_data.ping_solr
|
72
|
+
code += ERRCODE_SOURCE_PING unless @status_data.ping_source
|
73
|
+
code += ERRCODE_SOURCE_NO_RESULTS if @status_data.status[Helpers::HarvestStatus::HARVEST_NO_DOCS] > 0
|
74
|
+
code += ERRCODE_SOURCE_HARVEST_ERROR if @status_data.status[Helpers::HarvestStatus::HARVEST_FAILURE] > 0
|
75
|
+
code += ERRCODE_DOCUMENT_INVALID if @status_data.status[Helpers::HarvestStatus::INGEST_ERR_INVALID_DOC] > 0
|
76
|
+
code += ERRCODE_INGEST_ERROR if @status_data.status[Helpers::HarvestStatus::INGEST_ERR_SOLR_ERROR] > 0
|
77
|
+
|
78
|
+
code = ERRCODE_OTHER if code == 0
|
79
|
+
|
80
|
+
code
|
81
|
+
end
|
82
|
+
|
83
|
+
def message
|
84
|
+
self.class.describe_exit_code(exit_code).map{|c,v| v}.join("\n")
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
end
|
@@ -1,3 +1,4 @@
|
|
1
|
+
require_relative 'base'
|
1
2
|
require 'json'
|
2
3
|
require 'rest-client'
|
3
4
|
|
@@ -45,8 +46,11 @@ module SearchSolrTools
|
|
45
46
|
end
|
46
47
|
|
47
48
|
def add_documents_to_solr(add_docs)
|
48
|
-
|
49
|
+
status = insert_solr_doc add_docs, Base::JSON_CONTENT_TYPE, @env_settings[:auto_suggest_collection_name]
|
50
|
+
|
51
|
+
if status == Helpers::HarvestStatus::INGEST_OK
|
49
52
|
puts "Added #{add_docs.size} auto suggest documents in one commit"
|
53
|
+
return Helpers::HarvestStatus.new(Helpers::HarvestStatus::INGEST_OK => add_docs)
|
50
54
|
else
|
51
55
|
puts "Failed adding #{add_docs.size} documents in single commit, retrying one by one"
|
52
56
|
new_add_docs = []
|
@@ -5,6 +5,12 @@ require 'rest-client'
|
|
5
5
|
require 'rsolr'
|
6
6
|
require 'time'
|
7
7
|
|
8
|
+
require 'search_solr_tools'
|
9
|
+
require_relative '../helpers/iso_namespaces'
|
10
|
+
require_relative '../helpers/solr_format'
|
11
|
+
require_relative '../helpers/iso_to_solr'
|
12
|
+
|
13
|
+
|
8
14
|
module SearchSolrTools
|
9
15
|
module Harvesters
|
10
16
|
# base class for solr harvesters
|
@@ -33,17 +39,47 @@ module SearchSolrTools
|
|
33
39
|
url
|
34
40
|
end
|
35
41
|
|
42
|
+
# Ping the Solr instance to ensure that it's running.
|
43
|
+
# The ping query is specified to manually check the title, as it's possible
|
44
|
+
# there is no "default" query in the solr instance.
|
45
|
+
def ping_solr(core = SolrEnvironments[@environment][:collection_name])
|
46
|
+
url = solr_url + "/#{core}/admin/ping?df=title"
|
47
|
+
success = false
|
48
|
+
|
49
|
+
# Some docs will cause solr to time out during the POST
|
50
|
+
begin
|
51
|
+
RestClient.get(url) do |response, _request, _result|
|
52
|
+
success = response.code == 200
|
53
|
+
puts "Error in ping request: #{response.body}" unless success
|
54
|
+
end
|
55
|
+
rescue => e
|
56
|
+
puts "Rest exception while pinging Solr: #{e}"
|
57
|
+
end
|
58
|
+
success
|
59
|
+
end
|
60
|
+
|
61
|
+
# This should be overridden by child classes to implement the ability
|
62
|
+
# to "ping" the data center. Returns true if the ping is successful (or, as
|
63
|
+
# in this default, no ping method was defined)
|
64
|
+
def ping_source
|
65
|
+
puts "Harvester does not have ping method defined, assuming true"
|
66
|
+
true
|
67
|
+
end
|
68
|
+
|
36
69
|
def harvest_and_delete(harvest_method, delete_constraints, solr_core = SolrEnvironments[@environment][:collection_name])
|
37
70
|
start_time = Time.now.utc.iso8601
|
38
|
-
|
71
|
+
|
72
|
+
harvest_status = harvest_method.call
|
39
73
|
delete_old_documents start_time, delete_constraints, solr_core
|
74
|
+
|
75
|
+
harvest_status
|
40
76
|
end
|
41
77
|
|
42
78
|
def delete_old_documents(timestamp, constraints, solr_core, force = false)
|
43
79
|
constraints = sanitize_data_centers_constraints(constraints)
|
44
80
|
delete_query = "last_update:[* TO #{timestamp}] AND #{constraints}"
|
45
81
|
solr = RSolr.connect url: solr_url + "/#{solr_core}"
|
46
|
-
unchanged_count = (solr.get 'select', params: { q: delete_query, rows: 0 })['response']['numFound'].to_i
|
82
|
+
unchanged_count = (solr.get 'select', params: { wt: :ruby, q: delete_query, rows: 0 })['response']['numFound'].to_i
|
47
83
|
if unchanged_count == 0
|
48
84
|
puts "All documents were updated after #{timestamp}, nothing to delete"
|
49
85
|
else
|
@@ -61,7 +97,7 @@ module SearchSolrTools
|
|
61
97
|
end
|
62
98
|
|
63
99
|
def remove_documents(solr, delete_query, constraints, force, numfound)
|
64
|
-
all_response_count = (solr.get 'select', params: { q: constraints, rows: 0 })['response']['numFound']
|
100
|
+
all_response_count = (solr.get 'select', params: { wt: :ruby, q: constraints, rows: 0 })['response']['numFound']
|
65
101
|
if force || (numfound / all_response_count.to_f < DELETE_DOCUMENTS_RATIO)
|
66
102
|
puts "Deleting #{numfound} documents for #{constraints}"
|
67
103
|
solr.delete_by_query delete_query
|
@@ -77,21 +113,31 @@ module SearchSolrTools
|
|
77
113
|
def insert_solr_docs(docs, content_type = XML_CONTENT_TYPE, core = SolrEnvironments[@environment][:collection_name])
|
78
114
|
success = 0
|
79
115
|
failure = 0
|
116
|
+
|
117
|
+
status = Helpers::HarvestStatus.new
|
118
|
+
|
80
119
|
docs.each do |doc|
|
81
|
-
insert_solr_doc(doc, content_type, core)
|
120
|
+
doc_status = insert_solr_doc(doc, content_type, core)
|
121
|
+
status.record_status doc_status
|
122
|
+
doc_status == Helpers::HarvestStatus::INGEST_OK ? success += 1 : failure += 1
|
82
123
|
end
|
83
124
|
puts "#{success} document#{success == 1 ? '' : 's'} successfully added to Solr."
|
84
125
|
puts "#{failure} document#{failure == 1 ? '' : 's'} not added to Solr."
|
85
|
-
|
126
|
+
|
127
|
+
status
|
86
128
|
end
|
87
129
|
|
130
|
+
# TODO Need to return a specific type of failure:
|
131
|
+
# - Bad record content identified and no ingest attempted
|
132
|
+
# - Solr tries to ingest document and fails (bad content not detected prior to ingest)
|
133
|
+
# - Solr cannot insert document for reasons other than the document structure and content.
|
88
134
|
def insert_solr_doc(doc, content_type = XML_CONTENT_TYPE, core = SolrEnvironments[@environment][:collection_name])
|
89
135
|
url = solr_url + "/#{core}/update?commit=true"
|
90
|
-
|
136
|
+
status = Helpers::HarvestStatus::INGEST_OK
|
91
137
|
|
92
138
|
# Some of the docs will cause Solr to crash - CPU goes to 195% with `top` and it
|
93
139
|
# doesn't seem to recover.
|
94
|
-
return
|
140
|
+
return Helpers::HarvestStatus::INGEST_ERR_INVALID_DOC if content_type == XML_CONTENT_TYPE && !doc_valid?(doc)
|
95
141
|
|
96
142
|
doc_serialized = get_serialized_doc(doc, content_type)
|
97
143
|
|
@@ -99,13 +145,18 @@ module SearchSolrTools
|
|
99
145
|
begin
|
100
146
|
RestClient.post(url, doc_serialized, content_type: content_type) do |response, _request, _result|
|
101
147
|
success = response.code == 200
|
102
|
-
|
148
|
+
unless success
|
149
|
+
puts "Error for #{doc_serialized}\n\n response: #{response.body}"
|
150
|
+
status = Helpers::HarvestStatus::INGEST_ERR_SOLR_ERROR
|
151
|
+
end
|
103
152
|
end
|
104
153
|
rescue => e
|
154
|
+
# TODO Need to provide more detail re: this failure so we know whether to
|
155
|
+
# exit the job with a status != 0
|
105
156
|
puts "Rest exception while POSTing to Solr: #{e}, for doc: #{doc_serialized}"
|
157
|
+
status = Helpers::HarvestStatus::INGEST_ERR_SOLR_ERROR
|
106
158
|
end
|
107
|
-
|
108
|
-
success
|
159
|
+
status
|
109
160
|
end
|
110
161
|
|
111
162
|
def get_serialized_doc(doc, content_type)
|
@@ -118,7 +169,7 @@ module SearchSolrTools
|
|
118
169
|
end
|
119
170
|
end
|
120
171
|
|
121
|
-
# Get results from
|
172
|
+
# Get results from an end point specified in the request_url
|
122
173
|
def get_results(request_url, metadata_path, content_type = 'application/xml')
|
123
174
|
timeout = 300
|
124
175
|
retries_left = 3
|
@@ -134,6 +185,9 @@ module SearchSolrTools
|
|
134
185
|
|
135
186
|
retry if retries_left > 0
|
136
187
|
|
188
|
+
# TODO - Do we really need this "die_on_failure" anymore? The empty return
|
189
|
+
# will cause the "No Documents" error to be thrown in the harvester class
|
190
|
+
# now, so it will pretty much always "die on failure"
|
137
191
|
raise e if @die_on_failure
|
138
192
|
return
|
139
193
|
end
|