traject 1.1.0 → 2.0.0.rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +20 -0
  3. data/README.md +85 -73
  4. data/doc/batch_execution.md +2 -6
  5. data/doc/other_commands.md +3 -5
  6. data/doc/settings.md +27 -38
  7. data/lib/traject/command_line.rb +1 -1
  8. data/lib/traject/csv_writer.rb +34 -0
  9. data/lib/traject/delimited_writer.rb +110 -0
  10. data/lib/traject/indexer.rb +29 -11
  11. data/lib/traject/indexer/settings.rb +39 -13
  12. data/lib/traject/line_writer.rb +10 -6
  13. data/lib/traject/marc_reader.rb +2 -1
  14. data/lib/traject/solr_json_writer.rb +277 -0
  15. data/lib/traject/thread_pool.rb +38 -48
  16. data/lib/traject/translation_map.rb +3 -0
  17. data/lib/traject/util.rb +13 -51
  18. data/lib/traject/version.rb +1 -1
  19. data/lib/translation_maps/marc_geographic.yaml +2 -2
  20. data/test/delimited_writer_test.rb +104 -0
  21. data/test/indexer/read_write_test.rb +0 -22
  22. data/test/indexer/settings_test.rb +24 -0
  23. data/test/solr_json_writer_test.rb +248 -0
  24. data/test/test_helper.rb +5 -3
  25. data/test/test_support/demo_config.rb +0 -5
  26. data/test/translation_map_test.rb +9 -0
  27. data/traject.gemspec +18 -5
  28. metadata +77 -87
  29. data/lib/traject/marc4j_reader.rb +0 -153
  30. data/lib/traject/solrj_writer.rb +0 -351
  31. data/test/marc4j_reader_test.rb +0 -136
  32. data/test/solrj_writer_test.rb +0 -209
  33. data/vendor/solrj/README +0 -8
  34. data/vendor/solrj/build.xml +0 -39
  35. data/vendor/solrj/ivy.xml +0 -16
  36. data/vendor/solrj/lib/commons-codec-1.7.jar +0 -0
  37. data/vendor/solrj/lib/commons-io-2.1.jar +0 -0
  38. data/vendor/solrj/lib/httpclient-4.2.3.jar +0 -0
  39. data/vendor/solrj/lib/httpcore-4.2.2.jar +0 -0
  40. data/vendor/solrj/lib/httpmime-4.2.3.jar +0 -0
  41. data/vendor/solrj/lib/jcl-over-slf4j-1.6.6.jar +0 -0
  42. data/vendor/solrj/lib/jul-to-slf4j-1.6.6.jar +0 -0
  43. data/vendor/solrj/lib/log4j-1.2.16.jar +0 -0
  44. data/vendor/solrj/lib/noggit-0.5.jar +0 -0
  45. data/vendor/solrj/lib/slf4j-api-1.6.6.jar +0 -0
  46. data/vendor/solrj/lib/slf4j-log4j12-1.6.6.jar +0 -0
  47. data/vendor/solrj/lib/solr-solrj-4.3.1-javadoc.jar +0 -0
  48. data/vendor/solrj/lib/solr-solrj-4.3.1-sources.jar +0 -0
  49. data/vendor/solrj/lib/solr-solrj-4.3.1.jar +0 -0
  50. data/vendor/solrj/lib/wstx-asl-3.2.7.jar +0 -0
  51. data/vendor/solrj/lib/zookeeper-3.4.5.jar +0 -0
data/lib/traject/command_line.rb
@@ -12,7 +12,7 @@ module Traject
   #
   # A CommandLine object has a single persistent Indexer object it uses
   class CommandLine
-    # orig_argv is origina one passed in, remaining_argv is after destructive
+    # orig_argv is original one passed in, remaining_argv is after destructive
     # processing by slop, still has file args in it etc.
     attr_accessor :orig_argv, :remaining_argv
     attr_accessor :slop, :options
data/lib/traject/csv_writer.rb
@@ -0,0 +1,34 @@
+require 'traject/delimited_writer'
+require 'csv'
+
+# A CSV-writer, for folks who like that sort of thing.
+# Use DelimitedWriter for non-CSV lines (e.g., tab-delimited)
+#
+#
+
+class Traject::CSVWriter < Traject::DelimitedWriter
+
+  def initialize(*args)
+    super
+    self.delimiter = nil # Let CSV take care of it
+  end
+
+  def _write(data)
+    @output_file << data
+  end
+
+  # Turn the output file into a CSV writer
+  def open_output_file
+    of = super
+    CSV.new(of)
+  end
+
+  # Let CSV take care of the comma escaping
+  def escape(x)
+    x = x.to_s
+    x.gsub! internal_delimiter, @eidelim
+    x
+  end
+
+
+end
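
For orientation, here's a minimal configuration sketch showing how the new writer can be selected (the `id` and `title` fields are hypothetical, defined elsewhere in the config with `to_field`); it relies on the `delimited_writer.*` settings documented in delimited_writer.rb below:

    # traject_config.rb -- a sketch, not shipped with the gem
    settings do
      provide "writer_class_name", "Traject::CSVWriter"
      provide "output_file", "records.csv"
      provide "delimited_writer.fields", "id,title"
    end

Run it the usual way, e.g. `traject -c traject_config.rb records.mrc`.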
data/lib/traject/delimited_writer.rb
@@ -0,0 +1,110 @@
+require 'traject/line_writer'
+
+# A simple line writer that uses configuration to determine
+# how to produce a tab-delimited file
+#
+# Apropos settings:
+#
+# * output_file -- the file to write to
+# * output_stream -- the stream to write to, if defined and output_file is not
+# * delimited_writer.delimiter -- What to separate fields with; default is tab
+# * delimited_writer.internal_delimiter -- Delimiter _within_ a field, for multiple
+#   values. Default is pipe ( | )
+# * delimited_writer.fields -- comma-separated list of the fields to output
+# * delimited_writer.header (true/false) -- boolean that determines if we should output a header row. Default is true
+# * delimited_writer.escape -- If a value actually contains the delimiter or internal_delimiter, what to do?
+#   If unset, will follow the procedure below. If set, will turn it into the character(s) given
+#
+#
+# If `delimited_writer.escape` is not set, the writer will automatically
+# escape delimiters/internal_delimiters in the following way:
+# * If the delimiter is a tab, replace tabs in values with a single space
+# * If the delimiter is anything else, prefix it with a backslash
+
+class Traject::DelimitedWriter < Traject::LineWriter
+
+  attr_reader :delimiter, :internal_delimiter, :edelim, :eidelim
+  attr_accessor :header
+
+  def initialize(settings)
+    super
+
+    # fields to output
+
+    begin
+      @fields = settings['delimited_writer.fields'].split(",")
+    rescue NoMethodError => e
+    end
+
+    if e or @fields.empty?
+      raise ArgumentError.new("#{self.class.name} must have a comma-delimited list of field names to output set in setting 'delimited_writer.fields'")
+    end
+
+    self.delimiter = settings['delimited_writer.delimiter'] || "\t"
+    self.internal_delimiter = settings['delimited_writer.internal_delimiter'] || '|'
+    self.header = settings['delimited_writer.header'].to_s != 'false'
+
+    # Output the header if need be
+    write_header if @header
+  end
+
+  def escaped_delimiter(d)
+    return nil if d.nil?
+    d == "\t" ? ' ' : '\\' + d
+  end
+
+  def delimiter=(d)
+    @delimiter = d
+    @edelim = escaped_delimiter(d)
+    self
+  end
+
+  def internal_delimiter=(d)
+    @internal_delimiter = d
+    @eidelim = escaped_delimiter(d)
+  end
+
+
+
+
+  def write_header
+    _write(@fields)
+  end
+
+  def _write(data)
+    output_file.puts(data.join(delimiter))
+  end
+
+  # Get the output values out of the context
+  def raw_output_values(context)
+    context.output_hash.values_at(*@fields)
+  end
+
+  # Escape the delimiters in whatever way has been defined
+  def escape(x)
+    x = x.to_s
+    x.gsub! @delimiter, @edelim if @delimiter
+    x.gsub! @internal_delimiter, @eidelim
+    x
+  end
+
+
+  # Derive actual output field values from the raw values
+  def output_values(raw)
+    raw.map do |x|
+      if x.is_a? Array
+        x.map!{|s| escape(s)}
+        x.join(@internal_delimiter)
+      else
+        escape(x)
+      end
+    end
+  end
+
+  # Spit out the escaped values joined by the delimiter
+  def serialize(context)
+    output_values(raw_output_values(context))
+  end
+
+
+end
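
To make the escaping rules above concrete, a standalone sketch (normally the indexer instantiates the writer; the field names and file name here are hypothetical):

    require 'traject'
    require 'traject/delimited_writer'

    w = Traject::DelimitedWriter.new(
      "delimited_writer.fields" => "id,author",
      "output_file"             => "out.tsv"
    )
    w.escape("a|b")   # => "a\\|b" -- internal delimiter gets a backslash
    w.escape("a\tb")  # => "a b"   -- the tab delimiter is replaced by a space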
data/lib/traject/indexer.rb
@@ -6,13 +6,16 @@ require 'traject/thread_pool'
 
 require 'traject/indexer/settings'
 require 'traject/marc_reader'
-require 'traject/marc4j_reader'
 require 'traject/json_writer'
-require 'traject/solrj_writer'
+require 'traject/solr_json_writer'
 
 require 'traject/macros/marc21'
 require 'traject/macros/basic'
 
+if defined? JRUBY_VERSION
+  require 'traject/marc4j_reader'
+end
+
 # This class does indexing for traject: Getting input records from a Reader
 # class, mapping the input records to an output hash, and then sending the output
 # hash off somewhere (usually Solr) with a Writer class.
@@ -53,8 +56,9 @@ require 'traject/macros/basic'
 # 2) Responds to the usual ruby #each, returning a source record from each #each.
 #    (Including Enumerable is prob a good idea too)
 #
-# The default reader is the Traject::Marc4JReader, who's behavior is
-# further customized by several settings in the Settings hash.
+# The default reader is the Traject::MarcReader, whose behavior is
+# further customized by several settings in the Settings hash. JRuby users
+# with specialized needs may want to look at the gem traject-marc4j_reader.
 #
 # Alternate readers can be set directly with the #reader_class= method, or
 # with the "reader_class_name" Setting, a String name of a class
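
A sketch of overriding the reader via settings (Traject::MarcReader is now the default, so this only matters if you want non-default behavior, such as XML input):

    settings do
      provide "reader_class_name", "Traject::MarcReader"
      provide "marc_source.type", "xml"   # MARC-XML rather than the binary default
    end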
@@ -72,14 +76,22 @@ require 'traject/macros/basic'
 # 4) Optionally implements a #skipped_record_count method, returning int count of records
 #    that were skipped due to errors (and presumably logged)
 #
-# The default writer is the SolrJWriter, using Java SolrJ to
-# write to a Solr. A few other built-in writers are available,
-# but it's anticipated more will be created as plugins or local
-# code for special purposes.
+# Traject packages one solr writer: traject/solr_json_writer, which sends
+# in json format and works under both ruby and jruby, but only with solr version
+# >= 3.2. To index to an older solr installation, you'll need to use jruby and
+# install the gem traject-solrj_writer, which uses the solrj .jar underneath.
 #
 # You can set alternate writers by setting a Class object directly
 # with the #writer_class method, or by the 'writer_class_name' Setting,
-# with a String name of class meeting the Writer contract.
+# with a String name of class meeting the Writer contract. There are several
+# that ship with traject itself:
+#
+# * traject/json_writer (Traject::JsonWriter) -- write newline-delimited json files.
+# * traject/yaml_writer (Traject::YamlWriter) -- write pretty yaml file; very human-readable
+# * traject/debug_writer (Traject::DebugWriter) -- write a tab-delimited file where
+#   each line consists of the id, field, and value(s).
+# * traject/delimited_writer and traject/csv_writer -- write character-delimited files
+#   (default is tab-delimited) or comma-separated-value files.
 #
 class Traject::Indexer
 
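
For example, a sketch of swapping in one of the bundled writers via settings:

    settings do
      provide "writer_class_name", "Traject::JsonWriter"   # newline-delimited json
      provide "output_file", "records.json"
    end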
@@ -310,7 +322,13 @@ class Traject::Indexer
     reader = self.reader!(io_stream)
     writer = self.writer!
 
-    thread_pool = Traject::ThreadPool.new(settings["processing_thread_pool"].to_i)
+
+    processing_threads = settings["processing_thread_pool"].to_i
+    if processing_threads > 0 and !(defined? JRuby)
+      processing_threads = 0
+      logger.warn "Processing threads set to 0 because we're not running under JRuby"
+    end
+    thread_pool = Traject::ThreadPool.new(processing_threads)
 
     logger.info "   Indexer with reader: #{reader.class.name} and writer: #{writer.class.name}"
 
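
The practical upshot of the new guard, as a sketch: a positive `processing_thread_pool` is only honored under JRuby; on MRI the indexer now logs a warning and processes records in the calling thread.

    settings do
      # Used as-is under JRuby; forced to 0 (with a warning) under MRI
      provide "processing_thread_pool", 3
    end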
@@ -326,7 +344,7 @@
       thread_pool.raise_collected_exception!
 
       if settings["debug_ascii_progress"].to_s == "true"
-        $stderr.write "." if count % settings["solrj_writer.batch_size"] == 0
+        $stderr.write "." if count % settings["solr_writer.batch_size"].to_i == 0
       end
 
       if log_batch_size && (count % log_batch_size == 0)
data/lib/traject/indexer/settings.rb
@@ -1,4 +1,5 @@
 require 'hashie'
+require 'concurrent'
 
 class Traject::Indexer
 
@@ -22,9 +23,6 @@ class Traject::Indexer
     include Hashie::Extensions::MergeInitializer # can init with hash
     include Hashie::Extensions::IndifferentAccess
 
-    # Hashie bug Issue #100 https://github.com/intridea/hashie/pull/100
-    alias_method :store, :indifferent_writer
-
     def initialize(*args)
       super
       self.default_proc = lambda do |hash, key|
@@ -59,19 +57,37 @@ class Traject::Indexer
     def fill_in_defaults!
       self.reverse_merge!(self.class.defaults)
     end
+
+
+    def self.mri_defaults
+      {
+        "reader_class_name" => "Traject::MarcReader",
+        "writer_class_name" => "Traject::SolrJsonWriter",
+        "marc_source.type" => "binary",
+        "solrj_writer.batch_size" => 200,
+        "solrj_writer.thread_pool" => 1,
+        "processing_thread_pool" => self.default_processing_thread_pool,
+        "log.batch_size.severity" => "info"
+      }
+    end
 
-    def self.defaults
-      @@defaults ||= {
-        "reader_class_name" => "Traject::MarcReader",
-        "writer_class_name" => "Traject::SolrJWriter",
-        "marc_source.type" => "binary",
-        "marc4j_reader.permissive" => true,
-        "solrj_writer.batch_size" => 200,
-        "solrj_writer.thread_pool" => 1,
-        "processing_thread_pool" => 3,
-        "log.batch_size.severity" => "info"
+    def self.jruby_defaults
+      {
+        'reader_class_name' => "Traject::Marc4JReader",
+        'marc4j_reader.permissive' => true
       }
     end
+
+
+    def self.defaults
+      return @@defaults if defined? @@defaults
+      default_settings = self.mri_defaults
+      if defined? JRUBY_VERSION
+        default_settings.merge! self.jruby_defaults
+      end
+
+      @@defaults = default_settings
+    end
 
     def inspect
       # Keep any key ending in password out of the inspect
@@ -80,5 +96,15 @@ class Traject::Indexer
         hash
       end.inspect
     end
+
+    protected
+    def self.default_processing_thread_pool
+      if ["jruby", "rbx"].include? ENV["RUBY_ENGINE"]
+        [1, Concurrent.processor_count - 1].max
+      else
+        1
+      end
+    end
+
   end
 end
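
A sketch of how the split defaults now resolve at runtime (the solr.url is a placeholder):

    settings = Traject::Indexer::Settings.new("solr.url" => "http://localhost:8983/solr/core1")
    settings.fill_in_defaults!

    settings["writer_class_name"]  # => "Traject::SolrJsonWriter" on all platforms
    settings["reader_class_name"]  # => "Traject::Marc4JReader" under JRuby,
                                   #    "Traject::MarcReader" otherwise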
data/lib/traject/line_writer.rb
@@ -16,14 +16,18 @@ require 'thread'
 #   method. For instance, see JsonWriter.
 class Traject::LineWriter
   attr_reader :settings
-  attr_reader :write_mutex
+  attr_reader :write_mutex, :output_file
 
   def initialize(argSettings)
     @settings = argSettings
     @write_mutex = Mutex.new
 
     # trigger lazy loading now for thread-safety
-    output_file
+    @output_file = open_output_file
+  end
+
+  def _write(data)
+    output_file.puts(data)
   end
 
 
@@ -34,13 +38,13 @@ class Traject::LineWriter
   def put(context)
     serialized = serialize(context)
     write_mutex.synchronize do
-      output_file.puts(serialized)
+      _write(serialized)
    end
   end
 
-  def output_file
+  def open_output_file
     unless defined? @output_file
-      @output_file =
+      of =
         if settings["output_file"]
           File.open(settings["output_file"], 'w:UTF-8')
         elsif settings["output_stream"]
@@ -49,7 +53,7 @@ class Traject::LineWriter
           $stdout
         end
     end
-    return @output_file
+    return of
  end
 
   def close
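
With #open_output_file and #_write split out as hooks, a custom writer only has to supply #serialize (and may override either hook), which is exactly how DelimitedWriter and CSVWriter above plug in. A hypothetical minimal subclass:

    require 'traject/line_writer'

    # Writes one line per record containing just the id field(s)
    class IdOnlyWriter < Traject::LineWriter
      def serialize(context)
        Array(context.output_hash["id"]).join(",")
      end
    end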
data/lib/traject/marc_reader.rb
@@ -5,7 +5,8 @@ require 'traject/ndj_reader'
 # can read MARC ISO 2709 ('binary'), MARC-XML, and Marc-in-json (newline-delimited-json).
 #
 # Marc4JReader is an alternative to this class, powered by Marc4J. You may be interested
-# in comparing for performance, under your particular use case.
+# in comparing for performance, under your particular use case. To use it, you'll need
+# the gem traject-marc4j_reader.
 #
 # By default assumes binary MARC encoding, please set marc_source.type setting
 # for XML or json. If binary, please set marc_source.encoding with char encoding.
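
For instance, a sketch of the settings this comment refers to, for binary MARC with an explicit encoding (the encoding value is an assumption about your data):

    settings do
      provide "marc_source.type", "binary"
      provide "marc_source.encoding", "UTF-8"   # assumes your records are UTF-8
    end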
data/lib/traject/solr_json_writer.rb
@@ -0,0 +1,277 @@
+require 'yell'
+
+require 'traject'
+require 'traject/util'
+require 'traject/qualified_const_get'
+require 'traject/thread_pool'
+
+require 'json'
+require 'httpclient'
+
+require 'uri'
+require 'thread' # for Mutex/Queue
+require 'concurrent' # for atomic_fixnum
+
+# Write to Solr using the JSON interface; only works for Solr >= 3.2
+#
+# This should work under both MRI and JRuby, with JRuby getting much
+# better performance due to the threading model.
+#
+# Relevant settings:
+#
+# * solr.url (optional if solr.update_url is set) The URL to the solr core to index into
+#
+# * solr.update_url: The actual update url. If unset, we'll first see if
+#   "#{solr.url}/update/json" exists, and if not use "#{solr.url}/update"
+#
+# * solr_writer.batch_size: How big a batch to send to solr. Default is 100.
+#   My tests indicate that this setting doesn't change overall index speed by a ton.
+#
+# * solr_writer.thread_pool: How many threads to use for the writer. Default is 1.
+#   Likely useful even under MRI since thread will be waiting on Solr for some time.
+#
+# * solr_writer.max_skipped: How many records skipped due to errors before we
+#   bail out with a fatal error? Set to -1 for unlimited skips. Default 0,
+#   raise and abort on a single record that could not be added to Solr.
+#
+# * solr_writer.commit_on_close: Set to true (or "true") if you want to commit at the
+#   end of the indexing run. (Old "solrj_writer.commit_on_close" supported for backwards
+#   compat only.)
+#
+# * solr_writer.commit_timeout: If commit_on_close, how long to wait for Solr before
+#   giving up as a timeout. Default 10 minutes. Solr can be slow.
+#
+# * solr_json_writer.http_client: Mainly intended for testing, set your own HTTPClient
+#   or mock object to be used for HTTP.
+
+
+class Traject::SolrJsonWriter
+  include Traject::QualifiedConstGet
+
+  DEFAULT_MAX_SKIPPED = 0
+  DEFAULT_BATCH_SIZE = 100
+
+  # The passed-in settings
+  attr_reader :settings, :thread_pool_size
+
+  # A queue to hold documents before sending to solr
+  attr_reader :batched_queue
+
+  def initialize(argSettings)
+    @settings = Traject::Indexer::Settings.new(argSettings)
+
+    # Set max errors
+    @max_skipped = (@settings['solr_writer.max_skipped'] || DEFAULT_MAX_SKIPPED).to_i
+    if @max_skipped < 0
+      @max_skipped = nil
+    end
+
+    @http_client = @settings["solr_json_writer.http_client"] || HTTPClient.new
+
+    @batch_size = (settings["solr_writer.batch_size"] || DEFAULT_BATCH_SIZE).to_i
+    @batch_size = 1 if @batch_size < 1
+
+    # Store error count in an AtomicInteger, so multi threads can increment
+    # it safely, if we're threaded.
+    @skipped_record_incrementer = Concurrent::AtomicFixnum.new(0)
+
+
+    # How many threads to use for the writer?
+    # if our thread pool settings are 0, it'll just create a null threadpool that
+    # executes in calling context.
+    @thread_pool_size = (@settings["solr_writer.thread_pool"] || 1).to_i
+
+    @batched_queue = Queue.new
+    @thread_pool = Traject::ThreadPool.new(@thread_pool_size)
+
+    # old setting solrj_writer supported for backwards compat, as we make
+    # this the new default writer.
+    @commit_on_close = (settings["solr_writer.commit_on_close"] || settings["solrj_writer.commit_on_close"]).to_s == "true"
+
+    # Figure out where to send updates
+    @solr_update_url = self.determine_solr_update_url
+
+    logger.info("   #{self.class.name} writing to '#{@solr_update_url}' in batches of #{@batch_size} with #{@thread_pool_size} bg threads")
+  end
+
+
+  # Add a single context to the queue, ready to be sent to solr
+  def put(context)
+    @thread_pool.raise_collected_exception!
+
+    @batched_queue << context
+    if @batched_queue.size >= @batch_size
+      batch = Traject::Util.drain_queue(@batched_queue)
+      @thread_pool.maybe_in_thread_pool(batch) {|batch_arg| send_batch(batch_arg) }
+    end
+  end
+
+  # Send the given batch of contexts. If something goes wrong, send
+  # them one at a time.
+  # @param [Array<Traject::Indexer::Context>] batch an array of contexts
+  def send_batch(batch)
+    return if batch.empty?
+    json_package = JSON.generate(batch.map { |c| c.output_hash })
+    begin
+      resp = @http_client.post @solr_update_url, json_package, "Content-type" => "application/json"
+    rescue StandardError => exception
+    end
+
+    if exception || resp.status != 200
+      error_message = exception ?
+        Traject::Util.exception_to_log_message(exception) :
+        "Solr response: #{resp.status}: #{resp.body}"
+
+      logger.error "Error in Solr batch add. Will retry documents individually at performance penalty: #{error_message}"
+
+      batch.each do |c|
+        send_single(c)
+      end
+    end
+  end
+
+
+  # Send a single context to Solr, logging an error if need be
+  # @param [Traject::Indexer::Context] c The context whose document you want to send
+  def send_single(c)
+    json_package = JSON.generate([c.output_hash])
+    begin
+      resp = @http_client.post @solr_update_url, json_package, "Content-type" => "application/json"
+      # Catch Timeouts and network errors as skipped records, but otherwise
+      # allow unexpected errors to propagate up.
+    rescue HTTPClient::TimeoutError, SocketError, Errno::ECONNREFUSED => exception
+    end
+
+    if exception || resp.status != 200
+      if exception
+        msg = Traject::Util.exception_to_log_message(exception)
+      else
+        msg = "Solr error response: #{resp.status}: #{resp.body}"
+      end
+      logger.error "Could not add record #{record_id_from_context c} at source file position #{c.position}: #{msg}"
+      logger.debug(c.source_record.to_s)
+
+      @skipped_record_incrementer.increment
+      if @max_skipped and skipped_record_count > @max_skipped
+        raise RuntimeError.new("#{self.class.name}: Exceeded maximum number of skipped records (#{@max_skipped}): aborting")
+      end
+
+    end
+
+  end
+
+
+  # Get the logger from the settings, or default to an effectively null logger
+  def logger
+    settings["logger"] ||= Yell.new(STDERR, :level => "gt.fatal") # null logger
+  end
+
+  # Returns MARC 001, then a slash, then output_hash["id"] -- if both
+  # are present. Otherwise may return just one, or even an empty string.
+  def record_id_from_context(context)
+    marc_id = context.source_record && context.source_record['001'] && context.source_record['001'].value
+    output_id = context.output_hash["id"]
+
+    return [marc_id, output_id].compact.join("/")
+  end
+
+
+  # On close, we need to (a) raise any exceptions we might have, (b) send off
+  # the last (possibly empty) batch, and (c) commit if instructed to do so
+  # via the solr_writer.commit_on_close setting.
+  def close
+    @thread_pool.raise_collected_exception!
+
+    # Finish off whatever's left. Do it in the thread pool for
+    # consistency, and to ensure expected order of operations, so
+    # it goes to the end of the queue behind any other work.
+    batch = Traject::Util.drain_queue(@batched_queue)
+    if batch.length > 0
+      @thread_pool.maybe_in_thread_pool { send_batch(batch) }
+    end
+
+    # Wait for shutdown, and time it.
+    logger.debug "#{self.class.name}: Shutting down thread pool, waiting if needed..."
+    elapsed = @thread_pool.shutdown_and_wait
+    if elapsed > 60
+      logger.warn "Waited #{elapsed} seconds for all threads, you may want to increase solr_writer.thread_pool (currently #{@settings["solr_writer.thread_pool"]})"
+    end
+    logger.debug "#{self.class.name}: Thread pool shutdown complete"
+    logger.warn "#{self.class.name}: #{skipped_record_count} skipped records" if skipped_record_count > 0
+
+    # check again now that we've waited, there could still be some
+    # that didn't show up before.
+    @thread_pool.raise_collected_exception!
+
+    # Commit if we're supposed to
+    if @commit_on_close
+      commit
+    end
+  end
+
+
+  # Send a commit
+  def commit
+    logger.info "#{self.class.name} sending commit to solr at url #{@solr_update_url}..."
+
+    original_timeout = @http_client.receive_timeout
+
+    @http_client.receive_timeout = (settings["solr_writer.commit_timeout"] || (10 * 60)).to_i
+
+    resp = @http_client.get(@solr_update_url, {"commit" => 'true'})
+    unless resp.status == 200
+      raise RuntimeError.new("Could not commit to Solr: #{resp.status} #{resp.body}")
+    end
+
+    @http_client.receive_timeout = original_timeout
+  end
+
+
+  # Return count of encountered skipped records. Most accurate to call
+  # it after #close, in which case it should include full count, even
+  # under async thread_pool.
+  def skipped_record_count
+    @skipped_record_incrementer.value
+  end
+
+
+  # Relatively complex logic to determine if we have a valid URL and what it is
+  def determine_solr_update_url
+    if settings['solr.update_url']
+      check_solr_update_url(settings['solr.update_url'])
+    else
+      derive_solr_update_url_from_solr_url(settings['solr.url'])
+    end
+  end
+
+
+  # If we've got a solr.update_url, make sure it's ok
+  def check_solr_update_url(url)
+    unless url =~ /^#{URI::regexp}$/
+      raise ArgumentError.new("#{self.class.name} setting `solr.update_url` doesn't look like a URL: `#{url}`")
+    end
+    url
+  end
+
+  def derive_solr_update_url_from_solr_url(url)
+    # Nil? Then we bail
+    if url.nil?
+      raise ArgumentError.new("#{self.class.name}: Neither solr.update_url nor solr.url set; need at least one")
+    end
+
+    # Not a URL? Bail
+    unless url =~ /^#{URI::regexp}$/
+      raise ArgumentError.new("#{self.class.name} setting `solr.url` doesn't look like a URL: `#{url}`")
+    end
+
+    # First, try the /update/json handler
+    candidate = [url.chomp('/'), 'update', 'json'].join('/')
+    resp = @http_client.get(candidate)
+    if resp.status == 404
+      candidate = [url.chomp('/'), 'update'].join('/')
+    end
+    candidate
+  end
+
+
+end
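
Pulling the documented settings together, a sketch of a typical SolrJsonWriter configuration (the Solr URL is a placeholder):

    # traject_config.rb
    settings do
      provide "writer_class_name", "Traject::SolrJsonWriter"
      provide "solr.url", "http://localhost:8983/solr/my_core"
      provide "solr_writer.batch_size", 100
      provide "solr_writer.thread_pool", 1
      provide "solr_writer.max_skipped", 100      # tolerate up to 100 bad records
      provide "solr_writer.commit_on_close", "true"
    end

If solr.update_url is not set, the writer probes "#{solr.url}/update/json" and falls back to "#{solr.url}/update", per determine_solr_update_url above.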