traject 2.3.0 → 2.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 29f78e1693c756df82614f418d07bfce01d6ee9e
4
- data.tar.gz: 9992b817165971d2e3f7f75f18d2a51d5e26c544
3
+ metadata.gz: dee0f24c94b13285ce7e8408c7d8ccbd796e0c7a
4
+ data.tar.gz: 9ff0d8523063acaec87aaba83fae96ae5ce46217
5
5
  SHA512:
6
- metadata.gz: 416140d76e5bb04bf5de09d2fa04ff19e6161bc8adb8305c525e8ef76b1f7fb075a10b8f8bb6e1b9518fe246fcf22e3b9966f4356d89333ff61b3f6967c66525
7
- data.tar.gz: bbe192b157398efad350b13c791193082085dc7518a755f694e7c56ae3a951cb0288878a0e2d90fc491a1654b5d962d17c7120b5060a6c6bcb84425a86217a63
6
+ metadata.gz: 557a5f61241a4dc3f02a0ab0029a75b429690ac7b0f2901ffdfbf9d6312ce27f3b8744a44f9f87c1ef18367b66a081051d8b0671b560173446643971a467f7d0
7
+ data.tar.gz: d4bb78212b5861447df4f42de5099a03e654eeb37e671e753743cb737a1a159b03e14d8b8c75f5b2259d3c50ee1ffa9d8ff541db9cdff740fdc8d42fe39781e4
data/.travis.yml CHANGED
@@ -10,6 +10,7 @@ rvm:
10
10
  - rbx-2
11
11
  before_install:
12
12
  - gem update --system
13
- - gem install bundler
13
+ - gem uninstall bundler
14
+ - gem update bundler
14
15
  jdk:
15
16
  - oraclejdk8
@@ -1,6 +1,5 @@
1
1
  require 'yell'
2
2
 
3
- require 'traject'
4
3
  require 'traject/qualified_const_get'
5
4
  require 'traject/thread_pool'
6
5
 
@@ -238,6 +238,10 @@ module Traject::Macros
238
238
  # single square bracket characters if they are the start and/or end
239
239
  # chars and there are no internal square brackets.
240
240
  str = str.sub(/\A\[?([^\[\]]+)\]?\Z/, '\1')
241
+
242
+ # trim any leading or trailing whitespace
243
+ str.strip!
244
+
241
245
  return str
242
246
  end
243
247
 
@@ -170,7 +170,7 @@ module Traject
170
170
  hash = Hash.new
171
171
 
172
172
  # Split the string(s) given on colon
173
- spec_strings = spec_string.is_a?(Array) ? spec_string.map { |s| s.split(/\s*:\s*/) }.flatten : spec_string.split(/s*:\s*/)
173
+ spec_strings = spec_string.is_a?(Array) ? spec_string.map { |s| s.split(/\s*:\s*/) }.flatten : spec_string.split(/\s*:\s*/)
174
174
 
175
175
  spec_strings.each do |part|
176
176
  if m = DATAFIELD_PATTERN.match(part)
@@ -1,6 +1,5 @@
1
1
  require 'yell'
2
2
 
3
- require 'traject'
4
3
  require 'traject/util'
5
4
  require 'traject/qualified_const_get'
6
5
  require 'traject/thread_pool'
@@ -28,21 +27,21 @@ require 'concurrent' # for atomic_fixnum
28
27
  # My tests indicate that this setting doesn't change overall index speed by a ton.
29
28
  #
30
29
  # * solr_writer.thread_pool: How many threads to use for the writer. Default is 1.
31
- # Likely useful even under MRI since thread will be waiting on Solr for some time.
30
+ # Likely useful even under MRI since thread will be waiting on Solr for some time.
32
31
  #
33
- # * solr_writer.max_skipped: How many records skipped due to errors before we
34
- # bail out with a fatal error? Set to -1 for unlimited skips. Default 0,
35
- # raise and abort on a single record that could not be added to Solr.
32
+ # * solr_writer.max_skipped: How many records skipped due to errors before we
33
+ # bail out with a fatal error? Set to -1 for unlimited skips. Default 0,
34
+ # raise and abort on a single record that could not be added to Solr.
36
35
  #
37
36
  # * solr_writer.commit_on_close: Set to true (or "true") if you want to commit at the
38
37
  # end of the indexing run. (Old "solrj_writer.commit_on_close" supported for backwards
39
38
  # compat only.)
40
39
  #
41
40
  # * solr_writer.commit_timeout: If commit_on_close, how long to wait for Solr before
42
- # giving up as a timeout. Default 10 minutes. Solr can be slow.
41
+ # giving up as a timeout. Default 10 minutes. Solr can be slow.
43
42
  #
44
43
  # * solr_json_writer.http_client Mainly intended for testing, set your own HTTPClient
45
- # or mock object to be used for HTTP.
44
+ # or mock object to be used for HTTP.
46
45
 
47
46
 
48
47
  class Traject::SolrJsonWriter
@@ -85,7 +84,7 @@ class Traject::SolrJsonWriter
85
84
  @thread_pool = Traject::ThreadPool.new(@thread_pool_size)
86
85
 
87
86
  # old setting solrj_writer supported for backwards compat, as we make
88
- # this the new default writer.
87
+ # this the new default writer.
89
88
  @commit_on_close = (settings["solr_writer.commit_on_close"] || settings["solrj_writer.commit_on_close"]).to_s == "true"
90
89
 
91
90
  # Figure out where to send updates
@@ -118,12 +117,12 @@ class Traject::SolrJsonWriter
118
117
  end
119
118
 
120
119
  if exception || resp.status != 200
121
- error_message = exception ?
122
- Traject::Util.exception_to_log_message(exception) :
120
+ error_message = exception ?
121
+ Traject::Util.exception_to_log_message(exception) :
123
122
  "Solr response: #{resp.status}: #{resp.body}"
124
123
 
125
124
  logger.error "Error in Solr batch add. Will retry documents individually at performance penalty: #{error_message}"
126
-
125
+
127
126
  batch.each do |c|
128
127
  send_single(c)
129
128
  end
@@ -138,7 +137,7 @@ class Traject::SolrJsonWriter
138
137
  begin
139
138
  resp = @http_client.post @solr_update_url, json_package, "Content-type" => "application/json"
140
139
  # Catch Timeouts and network errors as skipped records, but otherwise
141
- # allow unexpected errors to propagate up.
140
+ # allow unexpected errors to propagate up.
142
141
  rescue HTTPClient::TimeoutError, SocketError, Errno::ECONNREFUSED => exception
143
142
  end
144
143
 
@@ -13,7 +13,7 @@ module Traject
13
13
  # be created, and work sent to the Traject::ThreadPool will just be executed
14
14
  # in the caller thread. We call this a nil threadpool. One situation it can be useful
15
15
  # is if you are running under MRI, where multi-core parallelism isn't available, so
16
- # an actual threadpool may not be useful. (Although in some cases a thread pool,
16
+ # an actual threadpool may not be useful. (Although in some cases a thread pool,
17
17
  # especially one with size 1, can be useful in MRI for I/O blocking operations)
18
18
  #
19
19
  # 3) Use the #maybe_in_threadpool method to send blocks to thread pool for
@@ -40,7 +40,7 @@ module Traject
40
40
  # to complete, then return. You can not give any more work to the pool
41
41
  # after you do this. By default it'll wait pretty much forever, which should
42
42
  # be fine. If you never call shutdown, then queued or in-progress work
43
- # may be abandoned when the program ends, which would be bad.
43
+ # may be abandoned when the program ends, which would be bad.
44
44
  #
45
45
  # 7) We will keep track of total times a block is run in thread pool, and
46
46
  # total elapsed (wall) time of running all blocks, so an average_execution_ms
@@ -51,24 +51,26 @@ module Traject
51
51
  attr_reader :pool_size, :queue_capacity
52
52
 
53
53
  # First arg is pool size, 0 or nil and we'll be a null/no-op pool which executes
54
- # work in caller thread.
54
+ # work in caller thread.
55
55
  def initialize(pool_size)
56
+ @thread_pool = nil # assume we don't have one
57
+ @exceptions_caught_queue = [] # start off without exceptions
56
58
  unless pool_size.nil? || pool_size == 0
57
- @pool_size = pool_size.to_i
59
+ @pool_size = pool_size.to_i
58
60
  @queue_capacity = pool_size * 3
59
61
 
60
- @thread_pool = Concurrent::ThreadPoolExecutor.new(
61
- :min_threads => @pool_size,
62
- :max_threads => @pool_size,
63
- :max_queue => @queue_capacity,
64
- :fallback_policy => :caller_runs
62
+ @thread_pool = Concurrent::ThreadPoolExecutor.new(
63
+ :min_threads => @pool_size,
64
+ :max_threads => @pool_size,
65
+ :max_queue => @queue_capacity,
66
+ :fallback_policy => :caller_runs
65
67
  )
66
68
 
67
69
  # A thread-safe queue to collect exceptions cross-threads.
68
70
  # We really only need to save the first exception, but a queue
69
71
  # is a convenient way to store a value concurrency-safely, and
70
- # might as well store all of them.
71
- @exceptions_caught_queue = Queue.new
72
+ # might as well store all of them.
73
+ @exceptions_caught_queue = Queue.new
72
74
  end
73
75
  end
74
76
 
@@ -133,7 +135,7 @@ module Traject
133
135
  # as a non-functioning threadpool -- then this method is just
134
136
  # a no-op.
135
137
  def raise_collected_exception!
136
- if @exceptions_caught_queue && (! @exceptions_caught_queue.empty?)
138
+ unless @exceptions_caught_queue.empty?
137
139
  e = @exceptions_caught_queue.pop
138
140
  raise e
139
141
  end
@@ -1,5 +1,3 @@
1
- require 'traject'
2
-
3
1
  require 'yaml'
4
2
  require 'dot-properties'
5
3
 
@@ -131,13 +129,13 @@ module Traject
131
129
  yaml_file = File.join( base, "translation_maps", "#{path}.yaml" )
132
130
  prop_file = File.join(base, "translation_maps", "#{path}.properties" )
133
131
 
134
- if File.exists? rb_file
132
+ if File.exist? rb_file
135
133
  found = eval( File.open(rb_file).read , binding, rb_file )
136
134
  break
137
- elsif File.exists? yaml_file
135
+ elsif File.exist? yaml_file
138
136
  found = YAML.load_file(yaml_file)
139
137
  break
140
- elsif File.exists? prop_file
138
+ elsif File.exist? prop_file
141
139
  found = Traject::TranslationMap.read_properties(prop_file)
142
140
  break
143
141
  end
@@ -231,21 +229,21 @@ module Traject
231
229
  array.replace( self.translate_array(array))
232
230
  end
233
231
 
234
- # Return a new TranslationMap that results from merging argument on top of self.
232
+ # Return a new TranslationMap that results from merging argument on top of self.
235
233
  # Can be useful for taking an existing translation map, but merging a few
236
- # overrides on top.
234
+ # overrides on top.
237
235
  #
238
236
  # merged_map = TranslationMap.new(something).merge TranslationMap.new(else)
239
237
  # #...
240
238
  # merged_map.translate_array(something) # etc
241
239
  #
242
- # If a default is set in the second map, it will merge over the first too.
240
+ # If a default is set in the second map, it will merge over the first too.
243
241
  #
244
242
  # You can also pass in a plain hash as an arg, instead of an existing TranslationMap:
245
243
  #
246
244
  # TranslationMap.new(something).merge("overridden_key" => "value", "a" => "")
247
245
  def merge(other_map)
248
- default = other_map.default || self.default
246
+ default = other_map.default || self.default
249
247
  TranslationMap.new(self.to_hash.merge(other_map.to_hash), :default => default)
250
248
  end
251
249
 
@@ -258,9 +256,9 @@ module Traject
258
256
  protected
259
257
 
260
258
  # We use dot-properties gem for reading .properties files,
261
- # return a hash.
262
- def self.read_properties(file_name)
263
- return DotProperties.load(file_name).to_h
259
+ # return a hash.
260
+ def self.read_properties(file_name)
261
+ return DotProperties.load(file_name).to_h
264
262
  end
265
263
 
266
264
  end
data/lib/traject/util.rb CHANGED
@@ -1,5 +1,3 @@
1
- require 'traject'
2
-
3
1
  module Traject
4
2
  # Just some internal utility methods
5
3
  module Util
@@ -27,17 +25,17 @@ module Traject
27
25
  end
28
26
 
29
27
  # Provide a config source file path, and an exception.
30
- #
28
+ #
31
29
  # Returns the line number from the first line in the stack
32
- # trace of the exception that matches your file path.
30
+ # trace of the exception that matches your file path.
33
31
  # of the first line in the backtrace matching that file_path.
34
- #
35
- # Returns `nil` if no suitable backtrace line can be found.
36
32
  #
37
- # Has special logic to try and grep the info out of a SyntaxError, bah.
33
+ # Returns `nil` if no suitable backtrace line can be found.
34
+ #
35
+ # Has special logic to try and grep the info out of a SyntaxError, bah.
38
36
  def self.backtrace_lineno_for_config(file_path, exception)
39
37
  # For a SyntaxError, we really need to grep it from the
40
- # exception message, it really appears to be nowhere else. Ugh.
38
+ # exception message, it really appears to be nowhere else. Ugh.
41
39
  if exception.kind_of? SyntaxError
42
40
  if exception.message =~ /:(\d+):/
43
41
  return $1.to_i
@@ -45,13 +43,13 @@ module Traject
45
43
  end
46
44
 
47
45
  # Otherwise we try to fish it out of the backtrace, first
48
- # line matching the config file path.
46
+ # line matching the config file path.
49
47
 
50
48
  # exception.backtrace_locations exists in MRI 2.1+, which makes
51
49
  # our task a lot easier. But not yet in JRuby 1.7.x, so we got to
52
- # handle the old way of having to parse the strings in backtrace too.
53
- if ( exception.respond_to?(:backtrace_locations) &&
54
- exception.backtrace_locations &&
50
+ # handle the old way of having to parse the strings in backtrace too.
51
+ if ( exception.respond_to?(:backtrace_locations) &&
52
+ exception.backtrace_locations &&
55
53
  exception.backtrace_locations.length > 0 )
56
54
  location = exception.backtrace_locations.find do |bt|
57
55
  bt.path == file_path
@@ -71,19 +69,19 @@ module Traject
71
69
 
72
70
  # Extract just the part of the backtrace that is "below"
73
71
  # the config file mentioned. If we can't find the config file
74
- # in the stack trace, we might return empty array.
72
+ # in the stack trace, we might return empty array.
75
73
  #
76
74
  # If the ruby supports Exception#backtrace_locations, the
77
- # returned array will actually be of Thread::Backtrace::Location elements.
75
+ # returned array will actually be of Thread::Backtrace::Location elements.
78
76
  def self.backtrace_from_config(file_path, exception)
79
77
  filtered_trace = []
80
78
  found = false
81
79
 
82
80
  # MRI 2.1+ has exception.backtrace_locations which makes
83
- # this a lot easier, but JRuby 1.7.x doesn't yet, so we
84
- # need to do it both ways.
85
- if ( exception.respond_to?(:backtrace_locations) &&
86
- exception.backtrace_locations &&
81
+ # this a lot easier, but JRuby 1.7.x doesn't yet, so we
82
+ # need to do it both ways.
83
+ if ( exception.respond_to?(:backtrace_locations) &&
84
+ exception.backtrace_locations &&
87
85
  exception.backtrace_locations.length > 0 )
88
86
 
89
87
  exception.backtrace_locations.each do |location|
@@ -1,3 +1,3 @@
1
1
  module Traject
2
- VERSION = "2.3.0"
2
+ VERSION = "2.3.1"
3
3
  end
@@ -118,6 +118,8 @@ describe "Traject::Macros::Marc21" do
118
118
  assert_equal "one two three", Marc21.trim_punctuation("one two three:")
119
119
  assert_equal "one two three .", Marc21.trim_punctuation("one two three .")
120
120
  assert_equal "one two three", Marc21.trim_punctuation("one two three.")
121
+ assert_equal "one two three...", Marc21.trim_punctuation("one two three...")
122
+ assert_equal "one two three", Marc21.trim_punctuation(" one two three.")
121
123
 
122
124
  assert_equal "one two [three]", Marc21.trim_punctuation("one two [three]")
123
125
  assert_equal "one two three", Marc21.trim_punctuation("one two three]")
@@ -35,7 +35,7 @@ describe "Traject::MarcExtractor" do
35
35
  end
36
36
 
37
37
  it "parses a mixed bag" do
38
- parsed = Traject::MarcExtractor::Spec.hash_from_string("245abcde:810:700|*4|bcd")
38
+ parsed = Traject::MarcExtractor::Spec.hash_from_string("245abcdes:810:700|*4|bcd")
39
39
  spec245 = parsed['245'].first
40
40
  spec810 = parsed['810'].first
41
41
  spec700 = parsed['700'].first
@@ -46,7 +46,7 @@ describe "Traject::MarcExtractor" do
46
46
  assert spec245
47
47
  assert_nil spec245.indicator1
48
48
  assert_nil spec245.indicator2
49
- assert_equal %w{a b c d e}, spec245.subfields
49
+ assert_equal %w{a b c d e s}, spec245.subfields
50
50
 
51
51
  #810
52
52
  assert spec810
data/traject.gemspec CHANGED
@@ -4,13 +4,13 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
4
  require 'traject/version'
5
5
 
6
6
  Gem::Specification.new do |spec|
7
- spec.name = "traject"
8
- spec.version = Traject::VERSION
9
- spec.authors = ["Jonathan Rochkind", "Bill Dueber"]
10
- spec.email = ["none@nowhere.org"]
11
- spec.summary = %q{Index MARC to Solr; or generally process source records to hash-like structures}
12
- spec.homepage = "http://github.com/traject/traject"
13
- spec.license = "MIT"
7
+ spec.name = "traject"
8
+ spec.version = Traject::VERSION
9
+ spec.authors = ["Jonathan Rochkind", "Bill Dueber"]
10
+ spec.email = ["none@nowhere.org"]
11
+ spec.summary = %q{Index MARC to Solr; or generally process source records to hash-like structures}
12
+ spec.homepage = "http://github.com/traject/traject"
13
+ spec.license = "MIT"
14
14
 
15
15
  spec.files = `git ls-files`.split($/)
16
16
  spec.executables = ["traject"]
@@ -23,10 +23,10 @@ Gem::Specification.new do |spec|
23
23
  spec.add_dependency "concurrent-ruby", ">= 0.8.0"
24
24
  spec.add_dependency "marc", "~> 1.0"
25
25
 
26
- spec.add_dependency "hashie", "~> 3.1" # used for Indexer#settings
27
- spec.add_dependency "slop", ">= 3.4.5", "< 4.0" # command line parsing
28
- spec.add_dependency "yell" # logging
29
- spec.add_dependency "dot-properties", ">= 0.1.1" # reading java style .properties
26
+ spec.add_dependency "hashie", "~> 3.1" # used for Indexer#settings
27
+ spec.add_dependency "slop", ">= 3.4.5", "< 4.0" # command line parsing
28
+ spec.add_dependency "yell" # logging
29
+ spec.add_dependency "dot-properties", ">= 0.1.1" # reading java style .properties
30
30
  spec.add_dependency "httpclient", "~> 2.5"
31
31
  spec.add_dependency 'marc-fastxmlwriter', '~>1.0' # fast marc->xml
32
32
 
@@ -40,8 +40,8 @@ Gem::Specification.new do |spec|
40
40
  spec.platform = "ruby"
41
41
  end
42
42
 
43
+ spec.add_development_dependency "bundler", '~> 1.7'
43
44
 
44
- spec.add_development_dependency "bundler", "~> 1.7"
45
45
  spec.add_development_dependency "rake"
46
46
  spec.add_development_dependency "minitest"
47
47
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: traject
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.3.0
4
+ version: 2.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Rochkind
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2016-02-24 00:00:00.000000000 Z
12
+ date: 2016-04-04 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: concurrent-ruby