search_solr_tools 4.2.0 → 5.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +27 -1
  3. data/README.md +6 -6
  4. data/bin/search_solr_tools +58 -2
  5. data/lib/search_solr_tools.rb +9 -7
  6. data/lib/search_solr_tools/errors/harvest_error.rb +88 -0
  7. data/lib/search_solr_tools/harvesters/adc.rb +2 -0
  8. data/lib/search_solr_tools/harvesters/ade_auto_suggest.rb +2 -0
  9. data/lib/search_solr_tools/harvesters/auto_suggest.rb +5 -1
  10. data/lib/search_solr_tools/harvesters/base.rb +65 -11
  11. data/lib/search_solr_tools/harvesters/bcodmo.rb +1 -0
  12. data/lib/search_solr_tools/harvesters/data_one.rb +2 -0
  13. data/lib/search_solr_tools/harvesters/echo.rb +2 -0
  14. data/lib/search_solr_tools/harvesters/gtnp.rb +1 -0
  15. data/lib/search_solr_tools/harvesters/ices.rb +3 -0
  16. data/lib/search_solr_tools/harvesters/ncdc_paleo.rb +3 -0
  17. data/lib/search_solr_tools/harvesters/nmi.rb +2 -0
  18. data/lib/search_solr_tools/harvesters/nodc.rb +3 -0
  19. data/lib/search_solr_tools/harvesters/nsidc_auto_suggest.rb +2 -0
  20. data/lib/search_solr_tools/harvesters/nsidc_json.rb +32 -5
  21. data/lib/search_solr_tools/harvesters/oai.rb +3 -0
  22. data/lib/search_solr_tools/harvesters/pdc.rb +2 -0
  23. data/lib/search_solr_tools/harvesters/r2r.rb +2 -2
  24. data/lib/search_solr_tools/harvesters/rda.rb +2 -0
  25. data/lib/search_solr_tools/harvesters/tdar.rb +2 -0
  26. data/lib/search_solr_tools/harvesters/usgs.rb +3 -0
  27. data/lib/search_solr_tools/helpers/data_one_format.rb +3 -3
  28. data/lib/search_solr_tools/helpers/harvest_status.rb +44 -0
  29. data/lib/search_solr_tools/helpers/iso_to_solr.rb +1 -0
  30. data/lib/search_solr_tools/helpers/iso_to_solr_format.rb +2 -2
  31. data/lib/search_solr_tools/helpers/ncdc_paleo_format.rb +3 -3
  32. data/lib/search_solr_tools/helpers/r2r_format.rb +3 -3
  33. data/lib/search_solr_tools/helpers/selectors.rb +1 -2
  34. data/lib/search_solr_tools/helpers/solr_format.rb +1 -0
  35. data/lib/search_solr_tools/helpers/tdar_format.rb +3 -3
  36. data/lib/search_solr_tools/helpers/translate_spatial_coverage.rb +2 -2
  37. data/lib/search_solr_tools/helpers/translate_temporal_coverage.rb +1 -1
  38. data/lib/search_solr_tools/helpers/usgs_format.rb +2 -2
  39. data/lib/search_solr_tools/selectors/adc.rb +2 -1
  40. data/lib/search_solr_tools/selectors/data_one.rb +2 -1
  41. data/lib/search_solr_tools/selectors/echo_iso.rb +2 -1
  42. data/lib/search_solr_tools/selectors/ices_iso.rb +2 -1
  43. data/lib/search_solr_tools/selectors/ncdc_paleo.rb +2 -1
  44. data/lib/search_solr_tools/selectors/nmi.rb +2 -1
  45. data/lib/search_solr_tools/selectors/nodc_iso.rb +2 -1
  46. data/lib/search_solr_tools/selectors/pdc_iso.rb +2 -1
  47. data/lib/search_solr_tools/selectors/r2r.rb +3 -1
  48. data/lib/search_solr_tools/selectors/rda.rb +2 -1
  49. data/lib/search_solr_tools/selectors/tdar_opensearch.rb +3 -1
  50. data/lib/search_solr_tools/selectors/usgs_iso.rb +3 -1
  51. data/lib/search_solr_tools/translators/bcodmo_json.rb +3 -0
  52. data/lib/search_solr_tools/translators/eol_to_solr.rb +6 -0
  53. data/lib/search_solr_tools/translators/gtnp_json.rb +3 -0
  54. data/lib/search_solr_tools/translators/nsidc_json.rb +3 -0
  55. data/lib/search_solr_tools/version.rb +1 -1
  56. data/search_solr_tools.gemspec +22 -23
  57. metadata +47 -55
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5929841e7bf672bf499a73bbfd49a4f7b13db0238cfd57c3705d59513351fc30
4
- data.tar.gz: f4a8809af8058bb43f24a0a0842b89e5158ca96ae16145d65f63dd8a6030f8bd
3
+ metadata.gz: a961c7f8cdb3a126f31ba685db351337dda8816f64de94b82aad774f53b49a0c
4
+ data.tar.gz: bcadb76963b19f66567c3e5a233dd599a66d642073eb5c043b3fc5653ce2502f
5
5
  SHA512:
6
- metadata.gz: 867c37d6e1b39d2a7de7a001c0aec4cc0891338f0d3504e7de8062b3ff88aeef1e8794f312f97b038bd4b1a4936e58e5629ea97907c417d13e9ead01d502cfb4
7
- data.tar.gz: d0eb4c0cdd073d4c34c8460a72e767347d52dcc0d03de007c829e7d6b2a20075e8013d1764db39e74e4d62a62e43537599b28d98b8ab0cbe163499663b578e64
6
+ metadata.gz: 1a238ac9143e968252c4f37474c99ed38a6920160f9f3d8ba8753b72848dbf7152ba0bc0b142298141eae6693102481a34f20d7c461e869b1a1606b7a3fcb471
7
+ data.tar.gz: 0ca3f51c62c0683d58652928ea2bf8a09987cee68d8af691ee523186e1d3ce7dc0f4bec0e294cdb82206eb8ebb5d2b6ad6085bed651b6f0944f979cecb0875bb
@@ -1,9 +1,35 @@
1
- ## v4.1.0 (2019-08-12)
1
+ ## v5.1.0 (2020-07-23)
2
+
3
+ - Added a CLI method to "ping" the Solr and Source servers for a given
4
+ data center.
5
+ - Added a CLI method "errcode" to get information about the various
6
+ error codes that may be returned during harvest.
7
+ - Updated the CLI harvest to return more useful error codes on failure.
8
+
9
+ ## v5.0.1 (2020-07-02)
10
+
11
+ - Bug fix: some requires weren't included that needed to be.
12
+
13
+ ## v5.0.0 (2020-07-02)
14
+
15
+ - Update Ruby to 2.6.5, update gem dependencies to more recent versions.
16
+ - Updates to correspond with an update to Solr 8.5.2
17
+
18
+ ## v4.2.1 (2019-08-13)
19
+
20
+ - Patch release to include updated CHANGELOG.
21
+
22
+ ## v4.2.0 (2019-08-12)
2
23
 
3
24
  - Update dataset-catalog-services URL to only fetch current (*not* retired)
4
25
  metadata records.
5
26
  - Add a few more gem release notes to README.
6
27
 
28
+ Note: v4.1.0 was prematurely released and, in theory, yanked. However, on the
29
+ second try at publishing 4.1.0, Rubygems complained about the attempt to
30
+ republish a gem. The version was therefore bumped again to 4.2.0 as the path of
31
+ least resistance to a successful publish. v4.1.0 should not be used.
32
+
7
33
  ## v4.0.1 (2019-07-08)
8
34
 
9
35
  - Update CHANGELOG and release instructions.
data/README.md CHANGED
@@ -61,7 +61,7 @@ Once you have the gem built in the project directory, install the utility:
61
61
 
62
62
  ### Requirements
63
63
 
64
- * Ruby > 2.0.0
64
+ * Ruby >= 2.6.5
65
65
  * [Bundler](http://bundler.io/)
66
66
  * Requirements for nokogiri:
67
67
  * [libxml2/libxml2-dev](http://xmlsoft.org/)
@@ -105,7 +105,7 @@ Please be sure to run them in the `bundle exec` context if you're utilizing bund
105
105
 
106
106
  Requirements:
107
107
 
108
- * Ruby > 2.0.0
108
+ * Ruby >= 2.6.5
109
109
  * [Bundler](http://bundler.io/)
110
110
  * [Gem Release](https://github.com/svenfuchs/gem-release)
111
111
  * [Rake](https://github.com/ruby/rake)
@@ -124,9 +124,9 @@ tagging, and publishing to RubyGems.
124
124
  |---------------------------|-------------|
125
125
  | `rake release:pre[false]` | Increase the current prerelease version number, push changes |
126
126
  | `rake release:pre[true]` | Increase the current prerelease version number, publish release\* |
127
- | `rake release:none` | Drop the prerelease version, publish release, then `pre[false]` (does a patch release) |
128
- | `rake release:minor` | Increase the minor version number, publish release, then `pre[false]` |
129
- | `rake release:major` | Increase the major version number, publish release, then `pre[false]` |
127
+ | `rake release:none` | Drop the prerelease version, publish release\*, then `pre[false]` (does a patch release) |
128
+ | `rake release:minor` | Increase the minor version number, publish release\*, then `pre[false]` |
129
+ | `rake release:major` | Increase the major version number, publish release\*, then `pre[false]` |
130
130
 
131
131
  \*"publish release" means each of the following occurs:
132
132
 
@@ -150,7 +150,7 @@ order to publish a new version of the gem to Rubygems. To get the latest API ke
150
150
  ### SOLR
151
151
 
152
152
  To harvest data utilizing the gem, you will need an installed instance of [Solr
153
- 8.1.1](https://lucene.apache.org/solr/guide/)
153
+ 8.5.3](https://lucene.apache.org/solr/guide/)
154
154
 
155
155
  #### NSIDC
156
156
 
@@ -4,6 +4,7 @@
4
4
  require 'search_solr_tools'
5
5
  require 'thor'
6
6
 
7
+ # rubocop:disable Metrics/AbcSize
7
8
  class SolrHarvestCLI < Thor
8
9
  map %w[--version -v] => :__print_version
9
10
 
@@ -12,6 +13,48 @@ class SolrHarvestCLI < Thor
12
13
  puts SearchSolrTools::VERSION
13
14
  end
14
15
 
16
+ desc 'errcode CODE', 'Print all exit codes bundled in CODE. Omit CODE to print all codes'
17
+ def errcode(code = -1)
18
+ codes = SearchSolrTools::Errors::HarvestError.describe_exit_code(code)
19
+
20
+ puts 'CODE | DESCRIPTION'
21
+ puts '-----+------------'
22
+ codes.each do |c, text|
23
+ puts format('%4<code>d | %<text>s', code: c, text: text)
24
+ end
25
+ end
26
+
27
+ desc 'ping', 'Ping the solr and harvesting endpoints related to the specified data center(s)'
28
+ option :data_center, type: :array, required: true
29
+ option :environment, required: true
30
+ def ping
31
+ solr_success = true
32
+ source_success = true
33
+ options[:data_center].each do |target|
34
+ begin
35
+ harvest_class = get_harvester_class(target)
36
+ harvester = harvest_class.new(options[:environment])
37
+ solr_status = harvester.ping_solr
38
+ source_status = harvester.ping_source
39
+ rescue StandardError => e
40
+ solr_status = false
41
+ source_status = false
42
+ puts "Error trying to ping for #{target}: #{e}"
43
+ end
44
+ solr_success &&= solr_status
45
+ source_success &&= source_status
46
+ puts "Target: #{target}, Solr ping OK? #{solr_status}, data center ping OK? #{source_status}"
47
+ end
48
+
49
+ ping_status = SearchSolrTools::Helpers::HarvestStatus.new(
50
+ SearchSolrTools::Helpers::HarvestStatus::PING_SOLR => solr_success,
51
+ SearchSolrTools::Helpers::HarvestStatus::PING_SOURCE => source_success
52
+ )
53
+ raise SearchSolrTools::Errors::HarvestError, ping_status unless ping_status.ok?
54
+ rescue SearchSolrTools::Errors::HarvestError => e
55
+ exit e.exit_code
56
+ end
57
+
15
58
  desc 'harvest', 'Harvest from the specified data centers'
16
59
  option :data_center, type: :array, required: true
17
60
  option :environment, required: true
@@ -22,10 +65,21 @@ class SolrHarvestCLI < Thor
22
65
  begin
23
66
  harvest_class = get_harvester_class(target)
24
67
  harvester = harvest_class.new(options[:environment], die_on_failure)
68
+ ping_status = SearchSolrTools::Helpers::HarvestStatus.new(
69
+ SearchSolrTools::Helpers::HarvestStatus::PING_SOLR => harvester.ping_solr,
70
+ SearchSolrTools::Helpers::HarvestStatus::PING_SOURCE => harvester.ping_source
71
+ )
72
+ raise SearchSolrTools::Errors::HarvestError, ping_status unless ping_status.ok?
73
+
25
74
  harvester.harvest_and_delete
75
+ rescue SearchSolrTools::Errors::HarvestError => e
76
+ puts "THERE WERE HARVEST STATUS ERRORS:\n#{e.message}"
77
+ exit e.exit_code
26
78
  rescue StandardError => e
79
+ # If it gets here, there is an error that we aren't expecting.
27
80
  puts "harvest failed for #{target}: #{e.message}"
28
- raise e
81
+ puts e.backtrace
82
+ exit SearchSolrTools::Errors::HarvestError::ERRCODE_OTHER
29
83
  end
30
84
  end
31
85
  end
@@ -85,10 +139,12 @@ class SolrHarvestCLI < Thor
85
139
 
86
140
  def get_harvester_class(data_center_name)
87
141
  name = data_center_name.downcase.to_s
88
- raise("Invalid data center #{name}") unless harvester_map.key?(name)
142
+ raise SearchSolrTools::Errors::HarvestError.new(nil, "Invalid data center #{name}") unless harvester_map.key?(name)
89
143
 
90
144
  harvester_map[name]
91
145
  end
92
146
  end
93
147
  end
148
+ # rubocop:enable Metrics/AbcSize
149
+
94
150
  SolrHarvestCLI.start(ARGV)
@@ -1,8 +1,10 @@
1
- require 'require_all'
2
- require_relative './search_solr_tools/config/environments'
3
- require_relative './search_solr_tools/version'
1
+ require_relative 'search_solr_tools/config/environments'
2
+ require_relative 'search_solr_tools/version'
4
3
 
5
- require_rel './search_solr_tools/helpers'
6
- require_rel './search_solr_tools/selectors'
7
- require_rel './search_solr_tools/harvesters'
8
- require_rel './search_solr_tools/translators'
4
+ require_relative 'search_solr_tools/helpers/selectors'
5
+ require_relative 'search_solr_tools/helpers/harvest_status'
6
+ require_relative 'search_solr_tools/errors/harvest_error'
7
+
8
+ %w( selectors harvesters translators ).each do |subdir|
9
+ Dir[File.join(__dir__, 'search_solr_tools', subdir, '*.rb')].each { |file| require file }
10
+ end
@@ -0,0 +1,88 @@
1
+ module SearchSolrTools
2
+ module Errors
3
+ class HarvestError < StandardError
4
+ ERRCODE_SOLR_PING = 1
5
+ ERRCODE_SOURCE_PING = 2
6
+ ERRCODE_SOURCE_NO_RESULTS = 4
7
+ ERRCODE_SOURCE_HARVEST_ERROR = 8
8
+ ERRCODE_DOCUMENT_INVALID = 16
9
+ ERRCODE_INGEST_ERROR = 32
10
+ ERRCODE_OTHER = 128
11
+
12
+ ERRCODE_DESC = {
13
+ ERRCODE_SOLR_PING => 'Solr instance did not return a successful ping',
14
+ ERRCODE_SOURCE_PING => 'Source to be harvested did not return a successful ping',
15
+ ERRCODE_SOURCE_NO_RESULTS => 'Source to be harvested returned no documents matching query',
16
+ ERRCODE_SOURCE_HARVEST_ERROR => 'One or more source documents returned an error when trying to retrieve or translate',
17
+ ERRCODE_DOCUMENT_INVALID => 'One or more documents to be harvested was invalid (malformed)',
18
+ ERRCODE_INGEST_ERROR => 'Solr returned an error trying to ingest one or more harvested documents',
19
+ ERRCODE_OTHER => 'General error code for non-harvest related issues'
20
+ }.freeze
21
+
22
+ PING_ERRCODE_MAP = {
23
+ 'ping_solr' => ERRCODE_SOLR_PING,
24
+ 'ping_source' => ERRCODE_SOURCE_PING,
25
+ }
26
+
27
+ STATUS_ERRCODE_MAP = {
28
+ Helpers::HarvestStatus::HARVEST_NO_DOCS => ERRCODE_SOURCE_NO_RESULTS,
29
+ Helpers::HarvestStatus::HARVEST_FAILURE => ERRCODE_SOURCE_HARVEST_ERROR,
30
+ Helpers::HarvestStatus::INGEST_ERR_INVALID_DOC => ERRCODE_DOCUMENT_INVALID,
31
+ Helpers::HarvestStatus::INGEST_ERR_SOLR_ERROR => ERRCODE_INGEST_ERROR,
32
+ Helpers::HarvestStatus::OTHER_ERROR => ERRCODE_OTHER
33
+ }.freeze
34
+
35
+ # If code is -1, it means display all error codes
36
+ def self.describe_exit_code(code = -1)
37
+ code = code.to_i
38
+ code_list = []
39
+
40
+ # Loop through all bit-flag values
41
+ [128, 64, 32, 16, 8, 4, 2, 1].each do |k|
42
+ if code >= k || code == -1
43
+ code_list.prepend k
44
+ code -= k unless code == -1
45
+ end
46
+ end
47
+
48
+ codes = {}
49
+ code_list.each do |k|
50
+ next if code == -1 && !ERRCODE_DESC.keys.include?(k) # skip INVALID CODE if showing all codes
51
+ codes[k] = ERRCODE_DESC.keys.include?(k) ? ERRCODE_DESC[k] : 'INVALID CODE NUMBER'
52
+ end
53
+
54
+ codes
55
+ end
56
+
57
+ def initialize(status, message=nil)
58
+ @status_data = status
59
+ @other_message = message
60
+ end
61
+
62
+ def exit_code
63
+ if @status_data.nil?
64
+ puts "OTHER ERROR REPORTED: #{@other_message}"
65
+ return ERRCODE_OTHER
66
+ end
67
+
68
+ puts "EXIT CODE STATUS:\n#{@status_data.status}"
69
+
70
+ code = 0
71
+ code += ERRCODE_SOLR_PING unless @status_data.ping_solr
72
+ code += ERRCODE_SOURCE_PING unless @status_data.ping_source
73
+ code += ERRCODE_SOURCE_NO_RESULTS if @status_data.status[Helpers::HarvestStatus::HARVEST_NO_DOCS] > 0
74
+ code += ERRCODE_SOURCE_HARVEST_ERROR if @status_data.status[Helpers::HarvestStatus::HARVEST_FAILURE] > 0
75
+ code += ERRCODE_DOCUMENT_INVALID if @status_data.status[Helpers::HarvestStatus::INGEST_ERR_INVALID_DOC] > 0
76
+ code += ERRCODE_INGEST_ERROR if @status_data.status[Helpers::HarvestStatus::INGEST_ERR_SOLR_ERROR] > 0
77
+
78
+ code = ERRCODE_OTHER if code == 0
79
+
80
+ code
81
+ end
82
+
83
+ def message
84
+ self.class.describe_exit_code(exit_code).map{|c,v| v}.join("\n")
85
+ end
86
+ end
87
+ end
88
+ end
@@ -1,3 +1,5 @@
1
+ require_relative 'base'
2
+
1
3
  module SearchSolrTools
2
4
  module Harvesters
3
5
  class Adc < Base
@@ -1,3 +1,5 @@
1
+ require_relative 'auto_suggest'
2
+
1
3
  module SearchSolrTools
2
4
  module Harvesters
3
5
  class AdeAutoSuggest < AutoSuggest
@@ -1,3 +1,4 @@
1
+ require_relative 'base'
1
2
  require 'json'
2
3
  require 'rest-client'
3
4
 
@@ -45,8 +46,11 @@ module SearchSolrTools
45
46
  end
46
47
 
47
48
  def add_documents_to_solr(add_docs)
48
- if insert_solr_doc add_docs, Base::JSON_CONTENT_TYPE, @env_settings[:auto_suggest_collection_name]
49
+ status = insert_solr_doc add_docs, Base::JSON_CONTENT_TYPE, @env_settings[:auto_suggest_collection_name]
50
+
51
+ if status == Helpers::HarvestStatus::INGEST_OK
49
52
  puts "Added #{add_docs.size} auto suggest documents in one commit"
53
+ return Helpers::HarvestStatus.new(Helpers::HarvestStatus::INGEST_OK => add_docs)
50
54
  else
51
55
  puts "Failed adding #{add_docs.size} documents in single commit, retrying one by one"
52
56
  new_add_docs = []
@@ -5,6 +5,12 @@ require 'rest-client'
5
5
  require 'rsolr'
6
6
  require 'time'
7
7
 
8
+ require 'search_solr_tools'
9
+ require_relative '../helpers/iso_namespaces'
10
+ require_relative '../helpers/solr_format'
11
+ require_relative '../helpers/iso_to_solr'
12
+
13
+
8
14
  module SearchSolrTools
9
15
  module Harvesters
10
16
  # base class for solr harvesters
@@ -33,17 +39,47 @@ module SearchSolrTools
33
39
  url
34
40
  end
35
41
 
42
+ # Ping the Solr instance to ensure that it's running.
43
+ # The ping query is specified to manually check the title, as it's possible
44
+ # there is no "default" query in the solr instance.
45
+ def ping_solr(core = SolrEnvironments[@environment][:collection_name])
46
+ url = solr_url + "/#{core}/admin/ping?df=title"
47
+ success = false
48
+
49
+ # Some docs will cause solr to time out during the POST
50
+ begin
51
+ RestClient.get(url) do |response, _request, _result|
52
+ success = response.code == 200
53
+ puts "Error in ping request: #{response.body}" unless success
54
+ end
55
+ rescue => e
56
+ puts "Rest exception while pinging Solr: #{e}"
57
+ end
58
+ success
59
+ end
60
+
61
+ # This should be overridden by child classes to implement the ability
62
+ # to "ping" the data center. Returns true if the ping is successful (or, as
63
+ # in this default, no ping method was defined)
64
+ def ping_source
65
+ puts "Harvester does not have ping method defined, assuming true"
66
+ true
67
+ end
68
+
36
69
  def harvest_and_delete(harvest_method, delete_constraints, solr_core = SolrEnvironments[@environment][:collection_name])
37
70
  start_time = Time.now.utc.iso8601
38
- harvest_method.call
71
+
72
+ harvest_status = harvest_method.call
39
73
  delete_old_documents start_time, delete_constraints, solr_core
74
+
75
+ harvest_status
40
76
  end
41
77
 
42
78
  def delete_old_documents(timestamp, constraints, solr_core, force = false)
43
79
  constraints = sanitize_data_centers_constraints(constraints)
44
80
  delete_query = "last_update:[* TO #{timestamp}] AND #{constraints}"
45
81
  solr = RSolr.connect url: solr_url + "/#{solr_core}"
46
- unchanged_count = (solr.get 'select', params: { q: delete_query, rows: 0 })['response']['numFound'].to_i
82
+ unchanged_count = (solr.get 'select', params: { wt: :ruby, q: delete_query, rows: 0 })['response']['numFound'].to_i
47
83
  if unchanged_count == 0
48
84
  puts "All documents were updated after #{timestamp}, nothing to delete"
49
85
  else
@@ -61,7 +97,7 @@ module SearchSolrTools
61
97
  end
62
98
 
63
99
  def remove_documents(solr, delete_query, constraints, force, numfound)
64
- all_response_count = (solr.get 'select', params: { q: constraints, rows: 0 })['response']['numFound']
100
+ all_response_count = (solr.get 'select', params: { wt: :ruby, q: constraints, rows: 0 })['response']['numFound']
65
101
  if force || (numfound / all_response_count.to_f < DELETE_DOCUMENTS_RATIO)
66
102
  puts "Deleting #{numfound} documents for #{constraints}"
67
103
  solr.delete_by_query delete_query
@@ -77,21 +113,31 @@ module SearchSolrTools
77
113
  def insert_solr_docs(docs, content_type = XML_CONTENT_TYPE, core = SolrEnvironments[@environment][:collection_name])
78
114
  success = 0
79
115
  failure = 0
116
+
117
+ status = Helpers::HarvestStatus.new
118
+
80
119
  docs.each do |doc|
81
- insert_solr_doc(doc, content_type, core) ? success += 1 : failure += 1
120
+ doc_status = insert_solr_doc(doc, content_type, core)
121
+ status.record_status doc_status
122
+ doc_status == Helpers::HarvestStatus::INGEST_OK ? success += 1 : failure += 1
82
123
  end
83
124
  puts "#{success} document#{success == 1 ? '' : 's'} successfully added to Solr."
84
125
  puts "#{failure} document#{failure == 1 ? '' : 's'} not added to Solr."
85
- fail 'Some documents failed to be inserted into Solr' if failure > 0
126
+
127
+ status
86
128
  end
87
129
 
130
+ # TODO Need to return a specific type of failure:
131
+ # - Bad record content identified and no ingest attempted
132
+ # - Solr tries to ingest document and fails (bad content not detected prior to ingest)
133
+ # - Solr cannot insert document for reasons other than the document structure and content.
88
134
  def insert_solr_doc(doc, content_type = XML_CONTENT_TYPE, core = SolrEnvironments[@environment][:collection_name])
89
135
  url = solr_url + "/#{core}/update?commit=true"
90
- success = false
136
+ status = Helpers::HarvestStatus::INGEST_OK
91
137
 
92
138
  # Some of the docs will cause Solr to crash - CPU goes to 195% with `top` and it
93
139
  # doesn't seem to recover.
94
- return success if content_type == XML_CONTENT_TYPE && !doc_valid?(doc)
140
+ return Helpers::HarvestStatus::INGEST_ERR_INVALID_DOC if content_type == XML_CONTENT_TYPE && !doc_valid?(doc)
95
141
 
96
142
  doc_serialized = get_serialized_doc(doc, content_type)
97
143
 
@@ -99,13 +145,18 @@ module SearchSolrTools
99
145
  begin
100
146
  RestClient.post(url, doc_serialized, content_type: content_type) do |response, _request, _result|
101
147
  success = response.code == 200
102
- puts "Error for #{doc_serialized}\n\n response: #{response.body}" unless success
148
+ unless success
149
+ puts "Error for #{doc_serialized}\n\n response: #{response.body}"
150
+ status = Helpers::HarvestStatus::INGEST_ERR_SOLR_ERROR
151
+ end
103
152
  end
104
153
  rescue => e
154
+ # TODO Need to provide more detail re: this failure so we know whether to
155
+ # exit the job with a status != 0
105
156
  puts "Rest exception while POSTing to Solr: #{e}, for doc: #{doc_serialized}"
157
+ status = Helpers::HarvestStatus::INGEST_ERR_SOLR_ERROR
106
158
  end
107
-
108
- success
159
+ status
109
160
  end
110
161
 
111
162
  def get_serialized_doc(doc, content_type)
@@ -118,7 +169,7 @@ module SearchSolrTools
118
169
  end
119
170
  end
120
171
 
121
- # Get results from some ISO end point specified in the query string
172
+ # Get results from an end point specified in the request_url
122
173
  def get_results(request_url, metadata_path, content_type = 'application/xml')
123
174
  timeout = 300
124
175
  retries_left = 3
@@ -134,6 +185,9 @@ module SearchSolrTools
134
185
 
135
186
  retry if retries_left > 0
136
187
 
188
+ # TODO - Do we really need this "die_on_failure" anymore? The empty return
189
+ # will cause the "No Documents" error to be thrown in the harvester class
190
+ # now, so it will pretty much always "die on failure"
137
191
  raise e if @die_on_failure
138
192
  return
139
193
  end