traject 2.3.0 → 2.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +2 -1
- data/lib/traject/indexer.rb +0 -1
- data/lib/traject/macros/marc21.rb +4 -0
- data/lib/traject/marc_extractor_spec.rb +1 -1
- data/lib/traject/solr_json_writer.rb +11 -12
- data/lib/traject/thread_pool.rb +14 -12
- data/lib/traject/translation_map.rb +10 -12
- data/lib/traject/util.rb +16 -18
- data/lib/traject/version.rb +1 -1
- data/test/indexer/macros_marc21_test.rb +2 -0
- data/test/marc_extractor_test.rb +2 -2
- data/traject.gemspec +12 -12
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dee0f24c94b13285ce7e8408c7d8ccbd796e0c7a
|
4
|
+
data.tar.gz: 9ff0d8523063acaec87aaba83fae96ae5ce46217
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 557a5f61241a4dc3f02a0ab0029a75b429690ac7b0f2901ffdfbf9d6312ce27f3b8744a44f9f87c1ef18367b66a081051d8b0671b560173446643971a467f7d0
|
7
|
+
data.tar.gz: d4bb78212b5861447df4f42de5099a03e654eeb37e671e753743cb737a1a159b03e14d8b8c75f5b2259d3c50ee1ffa9d8ff541db9cdff740fdc8d42fe39781e4
|
data/.travis.yml
CHANGED
data/lib/traject/indexer.rb
CHANGED
data/lib/traject/macros/marc21.rb
CHANGED
@@ -238,6 +238,10 @@ module Traject::Macros
|
|
238
238
|
# single square bracket characters if they are the start and/or end
|
239
239
|
# chars and there are no internal square brackets.
|
240
240
|
str = str.sub(/\A\[?([^\[\]]+)\]?\Z/, '\1')
|
241
|
+
|
242
|
+
# trim any leading or trailing whitespace
|
243
|
+
str.strip!
|
244
|
+
|
241
245
|
return str
|
242
246
|
end
|
243
247
|
|
data/lib/traject/marc_extractor_spec.rb
CHANGED
@@ -170,7 +170,7 @@ module Traject
|
|
170
170
|
hash = Hash.new
|
171
171
|
|
172
172
|
# Split the string(s) given on colon
|
173
|
-
spec_strings = spec_string.is_a?(Array) ? spec_string.map { |s| s.split(/\s*:\s*/) }.flatten : spec_string.split(
|
173
|
+
spec_strings = spec_string.is_a?(Array) ? spec_string.map { |s| s.split(/\s*:\s*/) }.flatten : spec_string.split(/\s*:\s*/)
|
174
174
|
|
175
175
|
spec_strings.each do |part|
|
176
176
|
if m = DATAFIELD_PATTERN.match(part)
|
data/lib/traject/solr_json_writer.rb
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
require 'yell'
|
2
2
|
|
3
|
-
require 'traject'
|
4
3
|
require 'traject/util'
|
5
4
|
require 'traject/qualified_const_get'
|
6
5
|
require 'traject/thread_pool'
|
@@ -28,21 +27,21 @@ require 'concurrent' # for atomic_fixnum
|
|
28
27
|
# My tests indicate that this setting doesn't change overall index speed by a ton.
|
29
28
|
#
|
30
29
|
# * solr_writer.thread_pool: How many threads to use for the writer. Default is 1.
|
31
|
-
# Likely useful even under MRI since thread will be waiting on Solr for some time.
|
30
|
+
# Likely useful even under MRI since thread will be waiting on Solr for some time.
|
32
31
|
#
|
33
|
-
# * solr_writer.max_skipped: How many records skipped due to errors before we
|
34
|
-
# bail out with a fatal error? Set to -1 for unlimited skips. Default 0,
|
35
|
-
# raise and abort on a single record that could not be added to Solr.
|
32
|
+
# * solr_writer.max_skipped: How many records skipped due to errors before we
|
33
|
+
# bail out with a fatal error? Set to -1 for unlimited skips. Default 0,
|
34
|
+
# raise and abort on a single record that could not be added to Solr.
|
36
35
|
#
|
37
36
|
# * solr_writer.commit_on_close: Set to true (or "true") if you want to commit at the
|
38
37
|
# end of the indexing run. (Old "solrj_writer.commit_on_close" supported for backwards
|
39
38
|
# compat only.)
|
40
39
|
#
|
41
40
|
# * solr_writer.commit_timeout: If commit_on_close, how long to wait for Solr before
|
42
|
-
# giving up as a timeout. Default 10 minutes. Solr can be slow.
|
41
|
+
# giving up as a timeout. Default 10 minutes. Solr can be slow.
|
43
42
|
#
|
44
43
|
# * solr_json_writer.http_client Mainly intended for testing, set your own HTTPClient
|
45
|
-
# or mock object to be used for HTTP.
|
44
|
+
# or mock object to be used for HTTP.
|
46
45
|
|
47
46
|
|
48
47
|
class Traject::SolrJsonWriter
|
@@ -85,7 +84,7 @@ class Traject::SolrJsonWriter
|
|
85
84
|
@thread_pool = Traject::ThreadPool.new(@thread_pool_size)
|
86
85
|
|
87
86
|
# old setting solrj_writer supported for backwards compat, as we make
|
88
|
-
# this the new default writer.
|
87
|
+
# this the new default writer.
|
89
88
|
@commit_on_close = (settings["solr_writer.commit_on_close"] || settings["solrj_writer.commit_on_close"]).to_s == "true"
|
90
89
|
|
91
90
|
# Figure out where to send updates
|
@@ -118,12 +117,12 @@ class Traject::SolrJsonWriter
|
|
118
117
|
end
|
119
118
|
|
120
119
|
if exception || resp.status != 200
|
121
|
-
error_message = exception ?
|
122
|
-
Traject::Util.exception_to_log_message(exception) :
|
120
|
+
error_message = exception ?
|
121
|
+
Traject::Util.exception_to_log_message(exception) :
|
123
122
|
"Solr response: #{resp.status}: #{resp.body}"
|
124
123
|
|
125
124
|
logger.error "Error in Solr batch add. Will retry documents individually at performance penalty: #{error_message}"
|
126
|
-
|
125
|
+
|
127
126
|
batch.each do |c|
|
128
127
|
send_single(c)
|
129
128
|
end
|
@@ -138,7 +137,7 @@ class Traject::SolrJsonWriter
|
|
138
137
|
begin
|
139
138
|
resp = @http_client.post @solr_update_url, json_package, "Content-type" => "application/json"
|
140
139
|
# Catch Timeouts and network errors as skipped records, but otherwise
|
141
|
-
# allow unexpected errors to propagate up.
|
140
|
+
# allow unexpected errors to propagate up.
|
142
141
|
rescue HTTPClient::TimeoutError, SocketError, Errno::ECONNREFUSED => exception
|
143
142
|
end
|
144
143
|
|
data/lib/traject/thread_pool.rb
CHANGED
@@ -13,7 +13,7 @@ module Traject
|
|
13
13
|
# be created, and work sent to the Traject::ThreadPool will just be executed
|
14
14
|
# in the caller thread. We call this a nil threadpool. One situation it can be useful
|
15
15
|
# is if you are running under MRI, where multi-core parallelism isn't available, so
|
16
|
-
# an actual threadpool may not be useful. (Although in some cases a thread pool,
|
16
|
+
# an actual threadpool may not be useful. (Although in some cases a thread pool,
|
17
17
|
# especially one with size 1, can be useful in MRI for I/O blocking operations)
|
18
18
|
#
|
19
19
|
# 3) Use the #maybe_in_threadpool method to send blocks to thread pool for
|
@@ -40,7 +40,7 @@ module Traject
|
|
40
40
|
# to complete, then return. You can not give any more work to the pool
|
41
41
|
# after you do this. By default it'll wait pretty much forever, which should
|
42
42
|
# be fine. If you never call shutdown, then queued or in-progress work
|
43
|
-
# may be abandoned when the program ends, which would be bad.
|
43
|
+
# may be abandoned when the program ends, which would be bad.
|
44
44
|
#
|
45
45
|
# 7) We will keep track of total times a block is run in thread pool, and
|
46
46
|
# total elapsed (wall) time of running all blocks, so an average_execution_ms
|
@@ -51,24 +51,26 @@ module Traject
|
|
51
51
|
attr_reader :pool_size, :queue_capacity
|
52
52
|
|
53
53
|
# First arg is pool size, 0 or nil and we'll be a null/no-op pool which executes
|
54
|
-
# work in caller thread.
|
54
|
+
# work in caller thread.
|
55
55
|
def initialize(pool_size)
|
56
|
+
@thread_pool = nil # assume we don't have one
|
57
|
+
@exceptions_caught_queue = [] # start off without exceptions
|
56
58
|
unless pool_size.nil? || pool_size == 0
|
57
|
-
@pool_size
|
59
|
+
@pool_size = pool_size.to_i
|
58
60
|
@queue_capacity = pool_size * 3
|
59
61
|
|
60
|
-
@thread_pool
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
62
|
+
@thread_pool = Concurrent::ThreadPoolExecutor.new(
|
63
|
+
:min_threads => @pool_size,
|
64
|
+
:max_threads => @pool_size,
|
65
|
+
:max_queue => @queue_capacity,
|
66
|
+
:fallback_policy => :caller_runs
|
65
67
|
)
|
66
68
|
|
67
69
|
# A thread-safe queue to collect exceptions cross-threads.
|
68
70
|
# We really only need to save the first exception, but a queue
|
69
71
|
# is a convenient way to store a value concurrency-safely, and
|
70
|
-
# might as well store all of them.
|
71
|
-
@exceptions_caught_queue
|
72
|
+
# might as well store all of them.
|
73
|
+
@exceptions_caught_queue = Queue.new
|
72
74
|
end
|
73
75
|
end
|
74
76
|
|
@@ -133,7 +135,7 @@ module Traject
|
|
133
135
|
# as a non-functioning threadpool -- then this method is just
|
134
136
|
# a no-op.
|
135
137
|
def raise_collected_exception!
|
136
|
-
|
138
|
+
unless @exceptions_caught_queue.empty?
|
137
139
|
e = @exceptions_caught_queue.pop
|
138
140
|
raise e
|
139
141
|
end
|
data/lib/traject/translation_map.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
require 'traject'
|
2
|
-
|
3
1
|
require 'yaml'
|
4
2
|
require 'dot-properties'
|
5
3
|
|
@@ -131,13 +129,13 @@ module Traject
|
|
131
129
|
yaml_file = File.join( base, "translation_maps", "#{path}.yaml" )
|
132
130
|
prop_file = File.join(base, "translation_maps", "#{path}.properties" )
|
133
131
|
|
134
|
-
if File.
|
132
|
+
if File.exist? rb_file
|
135
133
|
found = eval( File.open(rb_file).read , binding, rb_file )
|
136
134
|
break
|
137
|
-
elsif File.
|
135
|
+
elsif File.exist? yaml_file
|
138
136
|
found = YAML.load_file(yaml_file)
|
139
137
|
break
|
140
|
-
elsif File.
|
138
|
+
elsif File.exist? prop_file
|
141
139
|
found = Traject::TranslationMap.read_properties(prop_file)
|
142
140
|
break
|
143
141
|
end
|
@@ -231,21 +229,21 @@ module Traject
|
|
231
229
|
array.replace( self.translate_array(array))
|
232
230
|
end
|
233
231
|
|
234
|
-
# Return a new TranslationMap that results from merging argument on top of self.
|
232
|
+
# Return a new TranslationMap that results from merging argument on top of self.
|
235
233
|
# Can be useful for taking an existing translation map, but merging a few
|
236
|
-
# overrides on top.
|
234
|
+
# overrides on top.
|
237
235
|
#
|
238
236
|
# merged_map = TranslationMap.new(something).merge TranslationMap.new(else)
|
239
237
|
# #...
|
240
238
|
# merged_map.translate_array(something) # etc
|
241
239
|
#
|
242
|
-
# If a default is set in the second map, it will merge over the first too.
|
240
|
+
# If a default is set in the second map, it will merge over the first too.
|
243
241
|
#
|
244
242
|
# You can also pass in a plain hash as an arg, instead of an existing TranslationMap:
|
245
243
|
#
|
246
244
|
# TranslationMap.new(something).merge("overridden_key" => "value", "a" => "")
|
247
245
|
def merge(other_map)
|
248
|
-
default = other_map.default || self.default
|
246
|
+
default = other_map.default || self.default
|
249
247
|
TranslationMap.new(self.to_hash.merge(other_map.to_hash), :default => default)
|
250
248
|
end
|
251
249
|
|
@@ -258,9 +256,9 @@ module Traject
|
|
258
256
|
protected
|
259
257
|
|
260
258
|
# We use dot-properties gem for reading .properties files,
|
261
|
-
# return a hash.
|
262
|
-
def self.read_properties(file_name)
|
263
|
-
return DotProperties.load(file_name).to_h
|
259
|
+
# return a hash.
|
260
|
+
def self.read_properties(file_name)
|
261
|
+
return DotProperties.load(file_name).to_h
|
264
262
|
end
|
265
263
|
|
266
264
|
end
|
data/lib/traject/util.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
require 'traject'
|
2
|
-
|
3
1
|
module Traject
|
4
2
|
# Just some internal utility methods
|
5
3
|
module Util
|
@@ -27,17 +25,17 @@ module Traject
|
|
27
25
|
end
|
28
26
|
|
29
27
|
# Provide a config source file path, and an exception.
|
30
|
-
#
|
28
|
+
#
|
31
29
|
# Returns the line number from the first line in the stack
|
32
|
-
# trace of the exception that matches your file path.
|
30
|
+
# trace of the exception that matches your file path.
|
33
31
|
# of the first line in the backtrace matching that file_path.
|
34
|
-
#
|
35
|
-
# Returns `nil` if no suitable backtrace line can be found.
|
36
32
|
#
|
37
|
-
#
|
33
|
+
# Returns `nil` if no suitable backtrace line can be found.
|
34
|
+
#
|
35
|
+
# Has special logic to try and grep the info out of a SyntaxError, bah.
|
38
36
|
def self.backtrace_lineno_for_config(file_path, exception)
|
39
37
|
# For a SyntaxError, we really need to grep it from the
|
40
|
-
# exception message, it really appears to be nowhere else. Ugh.
|
38
|
+
# exception message, it really appears to be nowhere else. Ugh.
|
41
39
|
if exception.kind_of? SyntaxError
|
42
40
|
if exception.message =~ /:(\d+):/
|
43
41
|
return $1.to_i
|
@@ -45,13 +43,13 @@ module Traject
|
|
45
43
|
end
|
46
44
|
|
47
45
|
# Otherwise we try to fish it out of the backtrace, first
|
48
|
-
# line matching the config file path.
|
46
|
+
# line matching the config file path.
|
49
47
|
|
50
48
|
# exception.backtrace_locations exists in MRI 2.1+, which makes
|
51
49
|
# our task a lot easier. But not yet in JRuby 1.7.x, so we got to
|
52
|
-
# handle the old way of having to parse the strings in backtrace too.
|
53
|
-
if ( exception.respond_to?(:backtrace_locations) &&
|
54
|
-
exception.backtrace_locations &&
|
50
|
+
# handle the old way of having to parse the strings in backtrace too.
|
51
|
+
if ( exception.respond_to?(:backtrace_locations) &&
|
52
|
+
exception.backtrace_locations &&
|
55
53
|
exception.backtrace_locations.length > 0 )
|
56
54
|
location = exception.backtrace_locations.find do |bt|
|
57
55
|
bt.path == file_path
|
@@ -71,19 +69,19 @@ module Traject
|
|
71
69
|
|
72
70
|
# Extract just the part of the backtrace that is "below"
|
73
71
|
# the config file mentioned. If we can't find the config file
|
74
|
-
# in the stack trace, we might return empty array.
|
72
|
+
# in the stack trace, we might return empty array.
|
75
73
|
#
|
76
74
|
# If the ruby supports Exception#backtrace_locations, the
|
77
|
-
# returned array will actually be of Thread::Backtrace::Location elements.
|
75
|
+
# returned array will actually be of Thread::Backtrace::Location elements.
|
78
76
|
def self.backtrace_from_config(file_path, exception)
|
79
77
|
filtered_trace = []
|
80
78
|
found = false
|
81
79
|
|
82
80
|
# MRI 2.1+ has exception.backtrace_locations which makes
|
83
|
-
# this a lot easier, but JRuby 1.7.x doesn't yet, so we
|
84
|
-
# need to do it both ways.
|
85
|
-
if ( exception.respond_to?(:backtrace_locations) &&
|
86
|
-
exception.backtrace_locations &&
|
81
|
+
# this a lot easier, but JRuby 1.7.x doesn't yet, so we
|
82
|
+
# need to do it both ways.
|
83
|
+
if ( exception.respond_to?(:backtrace_locations) &&
|
84
|
+
exception.backtrace_locations &&
|
87
85
|
exception.backtrace_locations.length > 0 )
|
88
86
|
|
89
87
|
exception.backtrace_locations.each do |location|
|
data/lib/traject/version.rb
CHANGED
data/test/indexer/macros_marc21_test.rb
CHANGED
@@ -118,6 +118,8 @@ describe "Traject::Macros::Marc21" do
|
|
118
118
|
assert_equal "one two three", Marc21.trim_punctuation("one two three:")
|
119
119
|
assert_equal "one two three .", Marc21.trim_punctuation("one two three .")
|
120
120
|
assert_equal "one two three", Marc21.trim_punctuation("one two three.")
|
121
|
+
assert_equal "one two three...", Marc21.trim_punctuation("one two three...")
|
122
|
+
assert_equal "one two three", Marc21.trim_punctuation(" one two three.")
|
121
123
|
|
122
124
|
assert_equal "one two [three]", Marc21.trim_punctuation("one two [three]")
|
123
125
|
assert_equal "one two three", Marc21.trim_punctuation("one two three]")
|
data/test/marc_extractor_test.rb
CHANGED
@@ -35,7 +35,7 @@ describe "Traject::MarcExtractor" do
|
|
35
35
|
end
|
36
36
|
|
37
37
|
it "parses a mixed bag" do
|
38
|
-
parsed = Traject::MarcExtractor::Spec.hash_from_string("
|
38
|
+
parsed = Traject::MarcExtractor::Spec.hash_from_string("245abcdes:810:700|*4|bcd")
|
39
39
|
spec245 = parsed['245'].first
|
40
40
|
spec810 = parsed['810'].first
|
41
41
|
spec700 = parsed['700'].first
|
@@ -46,7 +46,7 @@ describe "Traject::MarcExtractor" do
|
|
46
46
|
assert spec245
|
47
47
|
assert_nil spec245.indicator1
|
48
48
|
assert_nil spec245.indicator2
|
49
|
-
assert_equal %w{a b c d e}, spec245.subfields
|
49
|
+
assert_equal %w{a b c d e s}, spec245.subfields
|
50
50
|
|
51
51
|
#810
|
52
52
|
assert spec810
|
data/traject.gemspec
CHANGED
@@ -4,13 +4,13 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
4
|
require 'traject/version'
|
5
5
|
|
6
6
|
Gem::Specification.new do |spec|
|
7
|
-
spec.name
|
8
|
-
spec.version
|
9
|
-
spec.authors
|
10
|
-
spec.email
|
11
|
-
spec.summary
|
12
|
-
spec.homepage
|
13
|
-
spec.license
|
7
|
+
spec.name = "traject"
|
8
|
+
spec.version = Traject::VERSION
|
9
|
+
spec.authors = ["Jonathan Rochkind", "Bill Dueber"]
|
10
|
+
spec.email = ["none@nowhere.org"]
|
11
|
+
spec.summary = %q{Index MARC to Solr; or generally process source records to hash-like structures}
|
12
|
+
spec.homepage = "http://github.com/traject/traject"
|
13
|
+
spec.license = "MIT"
|
14
14
|
|
15
15
|
spec.files = `git ls-files`.split($/)
|
16
16
|
spec.executables = ["traject"]
|
@@ -23,10 +23,10 @@ Gem::Specification.new do |spec|
|
|
23
23
|
spec.add_dependency "concurrent-ruby", ">= 0.8.0"
|
24
24
|
spec.add_dependency "marc", "~> 1.0"
|
25
25
|
|
26
|
-
spec.add_dependency "hashie", "~> 3.1"
|
27
|
-
spec.add_dependency "slop", ">= 3.4.5", "< 4.0"
|
28
|
-
spec.add_dependency "yell"
|
29
|
-
spec.add_dependency "dot-properties", ">= 0.1.1"
|
26
|
+
spec.add_dependency "hashie", "~> 3.1" # used for Indexer#settings
|
27
|
+
spec.add_dependency "slop", ">= 3.4.5", "< 4.0" # command line parsing
|
28
|
+
spec.add_dependency "yell" # logging
|
29
|
+
spec.add_dependency "dot-properties", ">= 0.1.1" # reading java style .properties
|
30
30
|
spec.add_dependency "httpclient", "~> 2.5"
|
31
31
|
spec.add_dependency 'marc-fastxmlwriter', '~>1.0' # fast marc->xml
|
32
32
|
|
@@ -40,8 +40,8 @@ Gem::Specification.new do |spec|
|
|
40
40
|
spec.platform = "ruby"
|
41
41
|
end
|
42
42
|
|
43
|
+
spec.add_development_dependency "bundler", '~> 1.7'
|
43
44
|
|
44
|
-
spec.add_development_dependency "bundler", "~> 1.7"
|
45
45
|
spec.add_development_dependency "rake"
|
46
46
|
spec.add_development_dependency "minitest"
|
47
47
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: traject
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.3.
|
4
|
+
version: 2.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Rochkind
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2016-
|
12
|
+
date: 2016-04-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: concurrent-ruby
|