traject 1.1.0 → 2.0.0.rc.1

Files changed (51)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +20 -0
  3. data/README.md +85 -73
  4. data/doc/batch_execution.md +2 -6
  5. data/doc/other_commands.md +3 -5
  6. data/doc/settings.md +27 -38
  7. data/lib/traject/command_line.rb +1 -1
  8. data/lib/traject/csv_writer.rb +34 -0
  9. data/lib/traject/delimited_writer.rb +110 -0
  10. data/lib/traject/indexer.rb +29 -11
  11. data/lib/traject/indexer/settings.rb +39 -13
  12. data/lib/traject/line_writer.rb +10 -6
  13. data/lib/traject/marc_reader.rb +2 -1
  14. data/lib/traject/solr_json_writer.rb +277 -0
  15. data/lib/traject/thread_pool.rb +38 -48
  16. data/lib/traject/translation_map.rb +3 -0
  17. data/lib/traject/util.rb +13 -51
  18. data/lib/traject/version.rb +1 -1
  19. data/lib/translation_maps/marc_geographic.yaml +2 -2
  20. data/test/delimited_writer_test.rb +104 -0
  21. data/test/indexer/read_write_test.rb +0 -22
  22. data/test/indexer/settings_test.rb +24 -0
  23. data/test/solr_json_writer_test.rb +248 -0
  24. data/test/test_helper.rb +5 -3
  25. data/test/test_support/demo_config.rb +0 -5
  26. data/test/translation_map_test.rb +9 -0
  27. data/traject.gemspec +18 -5
  28. metadata +77 -87
  29. data/lib/traject/marc4j_reader.rb +0 -153
  30. data/lib/traject/solrj_writer.rb +0 -351
  31. data/test/marc4j_reader_test.rb +0 -136
  32. data/test/solrj_writer_test.rb +0 -209
  33. data/vendor/solrj/README +0 -8
  34. data/vendor/solrj/build.xml +0 -39
  35. data/vendor/solrj/ivy.xml +0 -16
  36. data/vendor/solrj/lib/commons-codec-1.7.jar +0 -0
  37. data/vendor/solrj/lib/commons-io-2.1.jar +0 -0
  38. data/vendor/solrj/lib/httpclient-4.2.3.jar +0 -0
  39. data/vendor/solrj/lib/httpcore-4.2.2.jar +0 -0
  40. data/vendor/solrj/lib/httpmime-4.2.3.jar +0 -0
  41. data/vendor/solrj/lib/jcl-over-slf4j-1.6.6.jar +0 -0
  42. data/vendor/solrj/lib/jul-to-slf4j-1.6.6.jar +0 -0
  43. data/vendor/solrj/lib/log4j-1.2.16.jar +0 -0
  44. data/vendor/solrj/lib/noggit-0.5.jar +0 -0
  45. data/vendor/solrj/lib/slf4j-api-1.6.6.jar +0 -0
  46. data/vendor/solrj/lib/slf4j-log4j12-1.6.6.jar +0 -0
  47. data/vendor/solrj/lib/solr-solrj-4.3.1-javadoc.jar +0 -0
  48. data/vendor/solrj/lib/solr-solrj-4.3.1-sources.jar +0 -0
  49. data/vendor/solrj/lib/solr-solrj-4.3.1.jar +0 -0
  50. data/vendor/solrj/lib/wstx-asl-3.2.7.jar +0 -0
  51. data/vendor/solrj/lib/zookeeper-3.4.5.jar +0 -0
data/lib/traject/command_line.rb
@@ -12,7 +12,7 @@ module Traject
   #
   # A CommandLine object has a single persistent Indexer object it uses
   class CommandLine
-   # orig_argv is origina one passed in, remaining_argv is after destructive
+   # orig_argv is original one passed in, remaining_argv is after destructive
    # processing by slop, still has file args in it etc.
    attr_accessor :orig_argv, :remaining_argv
    attr_accessor :slop, :options
data/lib/traject/csv_writer.rb
@@ -0,0 +1,34 @@
+ require 'traject/delimited_writer'
+ require 'csv'
+
+ # A CSV writer, for folks who like that sort of thing.
+ # Use DelimitedWriter for non-CSV lines (e.g., tab-delimited)
+
+ class Traject::CSVWriter < Traject::DelimitedWriter
+
+   def initialize(*args)
+     super
+     self.delimiter = nil # Let CSV take care of it
+   end
+
+   def _write(data)
+     @output_file << data
+   end
+
+   # Turn the output file into a CSV writer
+   def open_output_file
+     of = super
+     CSV.new(of)
+   end
+
+   # Let CSV take care of the comma escaping
+   def escape(x)
+     x = x.to_s
+     x.gsub! internal_delimiter, @eidelim
+     x
+   end
+
+ end
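Annotation (not part of the diff): a minimal sketch of driving the new Traject::CSVWriter from a traject configuration file. The output path and field names (id, title, author) are hypothetical; use whatever fields your `to_field` steps actually produce.

```ruby
# Hypothetical traject configuration exercising Traject::CSVWriter.
settings do
  provide "writer_class_name", "Traject::CSVWriter"
  provide "output_file", "records.csv"
  provide "delimited_writer.fields", "id,title,author"
end
```

Run like any other config: `traject -c this_config.rb records.mrc`.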
data/lib/traject/delimited_writer.rb
@@ -0,0 +1,110 @@
+ require 'traject/line_writer'
+
+ # A simple line writer that uses configuration to determine
+ # how to produce a delimited file (tab-delimited by default).
+ #
+ # Apropos settings:
+ #
+ # * output_file -- the file to write to
+ # * output_stream -- the stream to write to, if defined and output_file is not
+ # * delimited_writer.delimiter -- what to separate fields with; default is tab
+ # * delimited_writer.internal_delimiter -- delimiter _within_ a field, for multiple
+ #   values. Default is pipe ( | )
+ # * delimited_writer.fields -- comma-separated list of the fields to output
+ # * delimited_writer.header (true/false) -- whether to output a header row; default is true
+ # * delimited_writer.escape -- if a value actually contains the delimiter or
+ #   internal_delimiter, what to do? If unset, the procedure below is followed.
+ #   If set, the offending delimiter is turned into the character(s) given.
+ #
+ # If `delimited_writer.escape` is not set, the writer will automatically
+ # escape delimiters/internal_delimiters in the following way:
+ #
+ # * If the delimiter is a tab, replace tabs in values with a single space
+ # * If the delimiter is anything else, prefix it with a backslash
+
+ class Traject::DelimitedWriter < Traject::LineWriter
+
+   attr_reader :delimiter, :internal_delimiter, :edelim, :eidelim
+   attr_accessor :header
+
+   def initialize(settings)
+     super
+
+     # fields to output
+     begin
+       @fields = settings['delimited_writer.fields'].split(",")
+     rescue NoMethodError => e
+     end
+
+     if e or @fields.empty?
+       raise ArgumentError.new("#{self.class.name} must have a comma-delimited list of field names to output set in setting 'delimited_writer.fields'")
+     end
+
+     self.delimiter          = settings['delimited_writer.delimiter'] || "\t"
+     self.internal_delimiter = settings['delimited_writer.internal_delimiter'] || '|'
+     self.header             = settings['delimited_writer.header'].to_s != 'false'
+
+     # Output the header if need be
+     write_header if @header
+   end
+
+   def escaped_delimiter(d)
+     return nil if d.nil?
+     d == "\t" ? ' ' : '\\' + d
+   end
+
+   def delimiter=(d)
+     @delimiter = d
+     @edelim    = escaped_delimiter(d)
+     self
+   end
+
+   def internal_delimiter=(d)
+     @internal_delimiter = d
+     @eidelim            = escaped_delimiter(d)
+   end
+
+   def write_header
+     _write(@fields)
+   end
+
+   def _write(data)
+     output_file.puts(data.join(delimiter))
+   end
+
+   # Get the output values out of the context
+   def raw_output_values(context)
+     context.output_hash.values_at(*@fields)
+   end
+
+   # Escape the delimiters in whatever way has been defined
+   def escape(x)
+     x = x.to_s
+     x.gsub! @delimiter, @edelim if @delimiter
+     x.gsub! @internal_delimiter, @eidelim
+     x
+   end
+
+   # Derive actual output field values from the raw values
+   def output_values(raw)
+     raw.map do |x|
+       if x.is_a? Array
+         x.map! { |s| escape(s) }
+         x.join(@internal_delimiter)
+       else
+         escape(x)
+       end
+     end
+   end
+
+   # Spit out the escaped values joined by the delimiter
+   def serialize(context)
+     output_values(raw_output_values(context))
+   end
+
+ end
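Annotation (not part of the diff): a sketch of the default escaping rules in action, assuming the code above. The settings hash here is illustrative; the delimiter defaults to tab and the internal delimiter to pipe.

```ruby
require 'stringio'
require 'traject'
require 'traject/delimited_writer'

# Minimal settings: a field list is required, and writing to a StringIO
# keeps the header row (written at construction time) off of stdout.
settings = Traject::Indexer::Settings.new(
  "output_stream"           => StringIO.new,
  "delimited_writer.fields" => "id,title"
)
dw = Traject::DelimitedWriter.new(settings)

dw.escape("has\ttab") # => "has tab"  (tab delimiter becomes a single space)
dw.escape("a|b")      # => "a\\|b"    (internal delimiter gets a backslash prefix)
```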
data/lib/traject/indexer.rb
@@ -6,13 +6,16 @@ require 'traject/thread_pool'
 
  require 'traject/indexer/settings'
  require 'traject/marc_reader'
- require 'traject/marc4j_reader'
  require 'traject/json_writer'
- require 'traject/solrj_writer'
+ require 'traject/solr_json_writer'
 
  require 'traject/macros/marc21'
  require 'traject/macros/basic'
 
+ if defined? JRUBY_VERSION
+   require 'traject/marc4j_reader'
+ end
+
  # This class does indexing for traject: Getting input records from a Reader
  # class, mapping the input records to an output hash, and then sending the output
  # hash off somewhere (usually Solr) with a Writer class.
@@ -53,8 +56,9 @@ require 'traject/macros/basic'
  # 2) Responds to the usual ruby #each, returning a source record from each #each.
  #    (Including Enumerable is prob a good idea too)
  #
- # The default reader is the Traject::Marc4JReader, who's behavior is
- # further customized by several settings in the Settings hash.
+ # The default reader is the Traject::MarcReader, whose behavior is
+ # further customized by several settings in the Settings hash. JRuby users
+ # with specialized needs may want to look at the gem traject-marc4j_reader.
  #
  # Alternate readers can be set directly with the #reader_class= method, or
  # with the "reader_class_name" Setting, a String name of a class
@@ -72,14 +76,22 @@ require 'traject/macros/basic'
  # 4) Optionally implements a #skipped_record_count method, returning int count of records
  #    that were skipped due to errors (and presumably logged)
  #
- # The default writer is the SolrJWriter, using Java SolrJ to
- # write to a Solr. A few other built-in writers are available,
- # but it's anticipated more will be created as plugins or local
- # code for special purposes.
+ # Traject packages one solr writer: traject/solr_json_writer, which sends
+ # documents in JSON format and works under both ruby and jruby, but only with
+ # solr version >= 3.2. To index to an older solr installation, you'll need to use
+ # jruby and install the gem traject-solrj_writer, which uses the solrj .jar underneath.
  #
  # You can set alternate writers by setting a Class object directly
  # with the #writer_class method, or by the 'writer_class_name' Setting,
- # with a String name of class meeting the Writer contract.
+ # with a String name of a class meeting the Writer contract. There are several
+ # that ship with traject itself:
+ #
+ # * traject/json_writer (Traject::JsonWriter) -- write newline-delimited json files.
+ # * traject/yaml_writer (Traject::YamlWriter) -- write a pretty yaml file; very human-readable
+ # * traject/debug_writer (Traject::DebugWriter) -- write a tab-delimited file where
+ #   each line consists of the id, field, and value(s).
+ # * traject/delimited_writer and traject/csv_writer -- write character-delimited files
+ #   (default is tab-delimited) or comma-separated-value files.
  #
  class Traject::Indexer
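Annotation (not part of the diff): a sketch of the two mechanisms described above for picking a writer; the output file name is hypothetical.

```ruby
require 'traject'
require 'traject/debug_writer'

indexer = Traject::Indexer.new

# 1. Set a Class object directly:
indexer.writer_class = Traject::DebugWriter

# 2. ...or name the class in settings (as you would in a config file):
indexer.settings("writer_class_name" => "Traject::JsonWriter",
                 "output_file"       => "out.json")
```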
@@ -310,7 +322,13 @@ class Traject::Indexer
    reader = self.reader!(io_stream)
    writer = self.writer!
 
-   thread_pool = Traject::ThreadPool.new(settings["processing_thread_pool"].to_i)
+
+   processing_threads = settings["processing_thread_pool"].to_i
+   if processing_threads > 0 and !(defined? JRuby)
+     processing_threads = 0
+     logger.warn "Processing threads set to 0 because we're not running under JRuby"
+   end
+   thread_pool = Traject::ThreadPool.new(processing_threads)
 
    logger.info "   Indexer with reader: #{reader.class.name} and writer: #{writer.class.name}"
@@ -326,7 +344,7 @@ class Traject::Indexer
    thread_pool.raise_collected_exception!
 
    if settings["debug_ascii_progress"].to_s == "true"
-     $stderr.write "." if count % settings["solrj_writer.batch_size"] == 0
+     $stderr.write "." if count % settings["solr_writer.batch_size"].to_i == 0
    end
 
    if log_batch_size && (count % log_batch_size == 0)
data/lib/traject/indexer/settings.rb
@@ -1,4 +1,5 @@
  require 'hashie'
+ require 'concurrent'
 
  class Traject::Indexer
 
@@ -22,9 +23,6 @@ class Traject::Indexer
    include Hashie::Extensions::MergeInitializer # can init with hash
    include Hashie::Extensions::IndifferentAccess
 
-   # Hashie bug Issue #100 https://github.com/intridea/hashie/pull/100
-   alias_method :store, :indifferent_writer
-
    def initialize(*args)
      super
      self.default_proc = lambda do |hash, key|
@@ -59,19 +57,37 @@ class Traject::Indexer
    def fill_in_defaults!
      self.reverse_merge!(self.class.defaults)
    end
+
+   def self.mri_defaults
+     {
+       "reader_class_name"        => "Traject::MarcReader",
+       "writer_class_name"        => "Traject::SolrJsonWriter",
+       "marc_source.type"         => "binary",
+       "solrj_writer.batch_size"  => 200,
+       "solrj_writer.thread_pool" => 1,
+       "processing_thread_pool"   => self.default_processing_thread_pool,
+       "log.batch_size.severity"  => "info"
+     }
+   end
 
-   def self.defaults
-     @@defaults ||= {
-       "reader_class_name" => "Traject::MarcReader",
-       "writer_class_name" => "Traject::SolrJWriter",
-       "marc_source.type" => "binary",
-       "marc4j_reader.permissive" => true,
-       "solrj_writer.batch_size" => 200,
-       "solrj_writer.thread_pool" => 1,
-       "processing_thread_pool" => 3,
-       "log.batch_size.severity" => "info"
+   def self.jruby_defaults
+     {
+       'reader_class_name'        => "Traject::Marc4JReader",
+       'marc4j_reader.permissive' => true
      }
    end
+
+   def self.defaults
+     return @@defaults if defined? @@defaults
+     default_settings = self.mri_defaults
+     if defined? JRUBY_VERSION
+       default_settings.merge! self.jruby_defaults
+     end
+
+     @@defaults = default_settings
+   end
 
    def inspect
      # Keep any key ending in password out of the inspect
@@ -80,5 +96,15 @@ class Traject::Indexer
        hash
      end.inspect
    end
+
+   protected
+
+   def self.default_processing_thread_pool
+     if ["jruby", "rbx"].include? RUBY_ENGINE
+       [1, Concurrent.processor_count - 1].max
+     else
+       1
+     end
+   end
+
  end
 end
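Annotation (not part of the diff): a sketch of how the platform-dependent defaults assembled above resolve, assuming the code as shown.

```ruby
require 'traject'

defaults = Traject::Indexer::Settings.defaults

defaults["writer_class_name"]      # => "Traject::SolrJsonWriter" everywhere
defaults["reader_class_name"]      # => "Traject::MarcReader" on MRI,
                                   #    "Traject::Marc4JReader" under JRuby
defaults["processing_thread_pool"] # => 1 on MRI; processor_count - 1 on jruby/rbx
```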
data/lib/traject/line_writer.rb
@@ -16,14 +16,18 @@ require 'thread'
  # method. For instance, see JsonWriter.
  class Traject::LineWriter
    attr_reader :settings
-   attr_reader :write_mutex
+   attr_reader :write_mutex, :output_file
 
    def initialize(argSettings)
      @settings    = argSettings
      @write_mutex = Mutex.new
 
      # trigger lazy loading now for thread-safety
-     output_file
+     @output_file = open_output_file
+   end
+
+   def _write(data)
+     output_file.puts(data)
    end
 
 
@@ -34,13 +38,13 @@ class Traject::LineWriter
    def put(context)
      serialized = serialize(context)
      write_mutex.synchronize do
-      output_file.puts(serialized)
+      _write(serialized)
      end
    end
 
-   def output_file
+   def open_output_file
      unless defined? @output_file
-       @output_file =
+       of =
        if settings["output_file"]
          File.open(settings["output_file"], 'w:UTF-8')
        elsif settings["output_stream"]
@@ -49,7 +53,7 @@ class Traject::LineWriter
          $stdout
        end
      end
-     return @output_file
+     return of
    end
 
    def close
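Annotation (not part of the diff): the refactor above splits line production into `serialize` / `_write` / `open_output_file` hooks, which is the seam the new CSVWriter and DelimitedWriter use. A hypothetical subclass, as a sketch:

```ruby
require 'traject/line_writer'

# Override serialize (what a record becomes) and/or _write (how a line
# is emitted) without touching the mutex or file handling in LineWriter.
class TitleOnlyWriter < Traject::LineWriter
  def serialize(context)
    Array(context.output_hash["title"]).join("; ")
  end
end

# Used via: provide "writer_class_name", "TitleOnlyWriter"
```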
data/lib/traject/marc_reader.rb
@@ -5,7 +5,8 @@ require 'traject/ndj_reader'
  # can read MARC ISO 2709 ('binary'), MARC-XML, and Marc-in-json (newline-delimited-json).
  #
  # Marc4JReader is an alternative to this class, powered by Marc4J. You may be interested
- # in comparing for performance, under your particular use case.
+ # in comparing for performance, under your particular use case. To use it, you'll need
+ # the gem traject-marc4j_reader.
  #
  # By default assumes binary MARC encoding, please set marc_source.type setting
  # for XML or json. If binary, please set marc_source.encoding with char encoding.
data/lib/traject/solr_json_writer.rb
@@ -0,0 +1,277 @@
+ require 'yell'
+
+ require 'traject'
+ require 'traject/util'
+ require 'traject/qualified_const_get'
+ require 'traject/thread_pool'
+
+ require 'json'
+ require 'httpclient'
+
+ require 'uri'
+ require 'thread'     # for Mutex/Queue
+ require 'concurrent' # for atomic_fixnum
+
+ # Write to Solr using the JSON interface; only works for Solr >= 3.2.
+ #
+ # This should work under both MRI and JRuby, with JRuby getting much
+ # better performance due to the threading model.
+ #
+ # Relevant settings:
+ #
+ # * solr.url (optional if solr.update_url is set): The URL of the solr core to index into
+ #
+ # * solr.update_url: The actual update url. If unset, we'll first see if
+ #   "#{solr.url}/update/json" exists, and if not use "#{solr.url}/update"
+ #
+ # * solr_writer.batch_size: How big a batch to send to solr. Default is 100.
+ #   My tests indicate that this setting doesn't change overall index speed by a ton.
+ #
+ # * solr_writer.thread_pool: How many threads to use for the writer. Default is 1.
+ #   Likely useful even under MRI since the thread will be waiting on Solr for some time.
+ #
+ # * solr_writer.max_skipped: How many records skipped due to errors before we
+ #   bail out with a fatal error? Set to -1 for unlimited skips. Default is 0:
+ #   raise and abort on a single record that could not be added to Solr.
+ #
+ # * solr_writer.commit_on_close: Set to true (or "true") if you want to commit at the
+ #   end of the indexing run. (The old "solrj_writer.commit_on_close" is supported for
+ #   backwards compat only.)
+ #
+ # * solr_writer.commit_timeout: If commit_on_close, how long to wait for Solr before
+ #   giving up as a timeout. Default is 10 minutes. Solr can be slow.
+ #
+ # * solr_json_writer.http_client: Mainly intended for testing; set your own HTTPClient
+ #   or mock object to be used for HTTP.
+ class Traject::SolrJsonWriter
+   include Traject::QualifiedConstGet
+
+   DEFAULT_MAX_SKIPPED = 0
+   DEFAULT_BATCH_SIZE  = 100
+
+   # The passed-in settings
+   attr_reader :settings, :thread_pool_size
+
+   # A queue to hold documents before sending to solr
+   attr_reader :batched_queue
+
+   def initialize(argSettings)
+     @settings = Traject::Indexer::Settings.new(argSettings)
+
+     # Set max errors
+     @max_skipped = (@settings['solr_writer.max_skipped'] || DEFAULT_MAX_SKIPPED).to_i
+     if @max_skipped < 0
+       @max_skipped = nil
+     end
+
+     @http_client = @settings["solr_json_writer.http_client"] || HTTPClient.new
+
+     @batch_size = (settings["solr_writer.batch_size"] || DEFAULT_BATCH_SIZE).to_i
+     @batch_size = 1 if @batch_size < 1
+
+     # Store error count in an AtomicFixnum, so multiple threads can increment
+     # it safely, if we're threaded.
+     @skipped_record_incrementer = Concurrent::AtomicFixnum.new(0)
+
+     # How many threads to use for the writer?
+     # If our thread pool setting is 0, it'll just create a null threadpool that
+     # executes in the calling context.
+     @thread_pool_size = (@settings["solr_writer.thread_pool"] || 1).to_i
+
+     @batched_queue = Queue.new
+     @thread_pool   = Traject::ThreadPool.new(@thread_pool_size)
+
+     # The old solrj_writer setting is supported for backwards compat, as we make
+     # this the new default writer.
+     @commit_on_close = (settings["solr_writer.commit_on_close"] || settings["solrj_writer.commit_on_close"]).to_s == "true"
+
+     # Figure out where to send updates
+     @solr_update_url = self.determine_solr_update_url
+
+     logger.info("   #{self.class.name} writing to '#{@solr_update_url}' in batches of #{@batch_size} with #{@thread_pool_size} bg threads")
+   end
+
+   # Add a single context to the queue, ready to be sent to solr
+   def put(context)
+     @thread_pool.raise_collected_exception!
+
+     @batched_queue << context
+     if @batched_queue.size >= @batch_size
+       batch = Traject::Util.drain_queue(@batched_queue)
+       @thread_pool.maybe_in_thread_pool(batch) { |batch_arg| send_batch(batch_arg) }
+     end
+   end
+
+   # Send the given batch of contexts. If something goes wrong, send
+   # them one at a time.
+   # @param [Array<Traject::Indexer::Context>] batch an array of contexts
+   def send_batch(batch)
+     return if batch.empty?
+     json_package = JSON.generate(batch.map { |c| c.output_hash })
+     begin
+       resp = @http_client.post @solr_update_url, json_package, "Content-type" => "application/json"
+     rescue StandardError => exception
+     end
+
+     if exception || resp.status != 200
+       error_message = exception ?
+         Traject::Util.exception_to_log_message(exception) :
+         "Solr response: #{resp.status}: #{resp.body}"
+
+       logger.error "Error in Solr batch add. Will retry documents individually at performance penalty: #{error_message}"
+
+       batch.each do |c|
+         send_single(c)
+       end
+     end
+   end
+
+   # Send a single context to Solr, logging an error if need be
+   # @param [Traject::Indexer::Context] c The context whose document you want to send
+   def send_single(c)
+     json_package = JSON.generate([c.output_hash])
+     begin
+       resp = @http_client.post @solr_update_url, json_package, "Content-type" => "application/json"
+     # Catch Timeouts and network errors as skipped records, but otherwise
+     # allow unexpected errors to propagate up.
+     rescue HTTPClient::TimeoutError, SocketError, Errno::ECONNREFUSED => exception
+     end
+
+     if exception || resp.status != 200
+       if exception
+         msg = Traject::Util.exception_to_log_message(exception)
+       else
+         msg = "Solr error response: #{resp.status}: #{resp.body}"
+       end
+       logger.error "Could not add record #{record_id_from_context c} at source file position #{c.position}: #{msg}"
+       logger.debug(c.source_record.to_s)
+
+       @skipped_record_incrementer.increment
+       if @max_skipped and skipped_record_count > @max_skipped
+         raise RuntimeError.new("#{self.class.name}: Exceeded maximum number of skipped records (#{@max_skipped}): aborting")
+       end
+     end
+   end
+
+   # Get the logger from the settings, or default to an effectively null logger
+   def logger
+     settings["logger"] ||= Yell.new(STDERR, :level => "gt.fatal") # null logger
+   end
+
+   # Returns MARC 001, then a slash, then output_hash["id"] -- if both
+   # are present. Otherwise may return just one, or even an empty string.
+   def record_id_from_context(context)
+     marc_id   = context.source_record && context.source_record['001'] && context.source_record['001'].value
+     output_id = context.output_hash["id"]
+
+     return [marc_id, output_id].compact.join("/")
+   end
+
+   # On close, we need to (a) raise any exceptions we might have, (b) send off
+   # the last (possibly empty) batch, and (c) commit if instructed to do so
+   # via the solr_writer.commit_on_close setting.
+   def close
+     @thread_pool.raise_collected_exception!
+
+     # Finish off whatever's left. Do it in the thread pool for
+     # consistency, and to ensure expected order of operations, so
+     # it goes to the end of the queue behind any other work.
+     batch = Traject::Util.drain_queue(@batched_queue)
+     if batch.length > 0
+       @thread_pool.maybe_in_thread_pool { send_batch(batch) }
+     end
+
+     # Wait for shutdown, and time it.
+     logger.debug "#{self.class.name}: Shutting down thread pool, waiting if needed..."
+     elapsed = @thread_pool.shutdown_and_wait
+     if elapsed > 60
+       logger.warn "Waited #{elapsed} seconds for all threads, you may want to increase solr_writer.thread_pool (currently #{@settings["solr_writer.thread_pool"]})"
+     end
+     logger.debug "#{self.class.name}: Thread pool shutdown complete"
+     logger.warn  "#{self.class.name}: #{skipped_record_count} skipped records" if skipped_record_count > 0
+
+     # Check again now that we've waited; there could still be some
+     # exceptions that didn't show up before.
+     @thread_pool.raise_collected_exception!
+
+     # Commit if we're supposed to
+     if @commit_on_close
+       commit
+     end
+   end
+
+   # Send a commit
+   def commit
+     logger.info "#{self.class.name} sending commit to solr at url #{@solr_update_url}..."
+
+     original_timeout = @http_client.receive_timeout
+
+     @http_client.receive_timeout = (settings["commit_timeout"] || (10 * 60)).to_i
+
+     resp = @http_client.get(@solr_update_url, { "commit" => 'true' })
+     unless resp.status == 200
+       raise RuntimeError.new("Could not commit to Solr: #{resp.status} #{resp.body}")
+     end
+
+     @http_client.receive_timeout = original_timeout
+   end
+
+   # Return count of encountered skipped records. Most accurate to call
+   # it after #close, in which case it should include the full count, even
+   # under an async thread_pool.
+   def skipped_record_count
+     @skipped_record_incrementer.value
+   end
+
+   # Relatively complex logic to determine if we have a valid URL and what it is
+   def determine_solr_update_url
+     if settings['solr.update_url']
+       check_solr_update_url(settings['solr.update_url'])
+     else
+       derive_solr_update_url_from_solr_url(settings['solr.url'])
+     end
+   end
+
+   # If we've got a solr.update_url, make sure it's ok
+   def check_solr_update_url(url)
+     unless url =~ /^#{URI::regexp}$/
+       raise ArgumentError.new("#{self.class.name} setting `solr.update_url` doesn't look like a URL: `#{url}`")
+     end
+     url
+   end
+
+   def derive_solr_update_url_from_solr_url(url)
+     # Nil? Then we bail
+     if url.nil?
+       raise ArgumentError.new("#{self.class.name}: Neither solr.update_url nor solr.url set; need at least one")
+     end
+
+     # Not a URL? Bail
+     unless url =~ /^#{URI::regexp}$/
+       raise ArgumentError.new("#{self.class.name} setting `solr.url` doesn't look like a URL: `#{url}`")
+     end
+
+     # First, try the /update/json handler
+     candidate = [url.chomp('/'), 'update', 'json'].join('/')
+     resp      = @http_client.get(candidate)
+     if resp.status == 404
+       candidate = [url.chomp('/'), 'update'].join('/')
+     end
+     candidate
+   end
+
+ end
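Annotation (not part of the diff): a hypothetical traject configuration exercising the new writer and the settings documented above; the Solr URL and values are illustrative.

```ruby
settings do
  provide "writer_class_name", "Traject::SolrJsonWriter"
  provide "solr.url", "http://localhost:8983/solr/collection1"
  provide "solr_writer.batch_size", 100
  provide "solr_writer.thread_pool", 2
  provide "solr_writer.commit_on_close", "true"
  provide "solr_writer.max_skipped", 100
end
```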