traject 2.3.0-java → 2.3.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +2 -1
- data/lib/traject/indexer.rb +0 -1
- data/lib/traject/macros/marc21.rb +4 -0
- data/lib/traject/marc_extractor_spec.rb +1 -1
- data/lib/traject/solr_json_writer.rb +11 -12
- data/lib/traject/thread_pool.rb +14 -12
- data/lib/traject/translation_map.rb +10 -12
- data/lib/traject/util.rb +16 -18
- data/lib/traject/version.rb +1 -1
- data/test/indexer/macros_marc21_test.rb +2 -0
- data/test/marc_extractor_test.rb +2 -2
- data/traject.gemspec +12 -12
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b482969bfad4936f4bab36ebbc6b0a2584f06457
+  data.tar.gz: e58bd672a66565f3dca63e4f3c59ad9eda457625
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ea57b4e0b1fb2050793215097786ae3f12e819ecd8d1616964eeba29be24aa53e10c2b47a4d1fd16be70d0c2e4973e3e30a01bdbf9bc5a28f09060197f446ee4
+  data.tar.gz: dc6ea5a40377aff2b9ca33c9a84deb295c972db31b159da7f3b3526a339c80f3aed5b3a3d01088102a749aa3e69d8c91d53c9e5f5395b2f15f5d755eb9daf9c6
data/.travis.yml
CHANGED
data/lib/traject/indexer.rb
CHANGED
data/lib/traject/macros/marc21.rb
CHANGED
@@ -238,6 +238,10 @@ module Traject::Macros
       # single square bracket characters if they are the start and/or end
       # chars and there are no internal square brackets.
       str = str.sub(/\A\[?([^\[\]]+)\]?\Z/, '\1')
+
+      # trim any leading or trailing whitespace
+      str.strip!
+
       return str
     end
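The added str.strip! means Marc21.trim_punctuation now also removes leading and trailing whitespace. A minimal sketch of the resulting behavior, mirroring the assertions added to data/test/indexer/macros_marc21_test.rb later in this diff:

    require 'traject/macros/marc21'

    # behavior after this change, per the new test assertions below
    Traject::Macros::Marc21.trim_punctuation(" one two three.")   # => "one two three"
    Traject::Macros::Marc21.trim_punctuation("one two three...")  # => "one two three..."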
data/lib/traject/marc_extractor_spec.rb
CHANGED
@@ -170,7 +170,7 @@ module Traject
       hash = Hash.new

       # Split the string(s) given on colon
-      spec_strings = spec_string.is_a?(Array) ? spec_string.map { |s| s.split(/\s*:\s*/) }.flatten : spec_string.split(
+      spec_strings = spec_string.is_a?(Array) ? spec_string.map { |s| s.split(/\s*:\s*/) }.flatten : spec_string.split(/\s*:\s*/)

       spec_strings.each do |part|
         if m = DATAFIELD_PATTERN.match(part)
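With the restored regexp argument, a plain (non-array) spec string is again split on colons into individual field specs. A small sketch of parsing such a string, using the same call exercised in data/test/marc_extractor_test.rb below:

    require 'traject'

    parsed = Traject::MarcExtractor::Spec.hash_from_string("245abcdes:810:700|*4|bcd")
    parsed.keys                    # => ["245", "810", "700"]
    parsed["245"].first.subfields  # => ["a", "b", "c", "d", "e", "s"]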
data/lib/traject/solr_json_writer.rb
CHANGED
@@ -1,6 +1,5 @@
 require 'yell'

-require 'traject'
 require 'traject/util'
 require 'traject/qualified_const_get'
 require 'traject/thread_pool'
@@ -28,21 +27,21 @@ require 'concurrent' # for atomic_fixnum
 # My tests indicate that this setting doesn't change overall index speed by a ton.
 #
 # * solr_writer.thread_pool: How many threads to use for the writer. Default is 1.
-#   Likely useful even under MRI since thread will be waiting on Solr for some time.
+#   Likely useful even under MRI since thread will be waiting on Solr for some time.
 #
-# * solr_writer.max_skipped: How many records skipped due to errors before we
-#   bail out with a fatal error? Set to -1 for unlimited skips. Default 0,
-#   raise and abort on a single record that could not be added to Solr.
+# * solr_writer.max_skipped: How many records skipped due to errors before we
+#   bail out with a fatal error? Set to -1 for unlimited skips. Default 0,
+#   raise and abort on a single record that could not be added to Solr.
 #
 # * solr_writer.commit_on_close: Set to true (or "true") if you want to commit at the
 #   end of the indexing run. (Old "solrj_writer.commit_on_close" supported for backwards
 #   compat only.)
 #
 # * solr_writer.commit_timeout: If commit_on_close, how long to wait for Solr before
-#   giving up as a timeout. Default 10 minutes. Solr can be slow.
+#   giving up as a timeout. Default 10 minutes. Solr can be slow.
 #
 # * solr_json_writer.http_client Mainly intended for testing, set your own HTTPClient
-#   or mock object to be used for HTTP.
+#   or mock object to be used for HTTP.


 class Traject::SolrJsonWriter
@@ -85,7 +84,7 @@ class Traject::SolrJsonWriter
     @thread_pool = Traject::ThreadPool.new(@thread_pool_size)

     # old setting solrj_writer supported for backwards compat, as we make
-    # this the new default writer.
+    # this the new default writer.
     @commit_on_close = (settings["solr_writer.commit_on_close"] || settings["solrj_writer.commit_on_close"]).to_s == "true"

     # Figure out where to send updates
@@ -118,12 +117,12 @@ class Traject::SolrJsonWriter
     end

     if exception || resp.status != 200
-      error_message = exception ?
-        Traject::Util.exception_to_log_message(exception) :
+      error_message = exception ?
+        Traject::Util.exception_to_log_message(exception) :
         "Solr response: #{resp.status}: #{resp.body}"

       logger.error "Error in Solr batch add. Will retry documents individually at performance penalty: #{error_message}"
-
+
       batch.each do |c|
         send_single(c)
       end
@@ -138,7 +137,7 @@ class Traject::SolrJsonWriter
     begin
       resp = @http_client.post @solr_update_url, json_package, "Content-type" => "application/json"
     # Catch Timeouts and network errors as skipped records, but otherwise
-    # allow unexpected errors to propagate up.
+    # allow unexpected errors to propagate up.
     rescue HTTPClient::TimeoutError, SocketError, Errno::ECONNREFUSED => exception
     end
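For reference, the solr_writer.* settings documented in the comments above are normally supplied from a traject configuration file; a minimal sketch (the Solr URL and values here are illustrative placeholders, not defaults):

    # traject configuration sketch; URL and values are placeholders
    settings do
      provide "solr.url",                    "http://localhost:8983/solr/my_core"
      provide "solr_writer.thread_pool",     1        # writer threads
      provide "solr_writer.max_skipped",     -1       # -1 = unlimited skips
      provide "solr_writer.commit_on_close", "true"   # commit when indexing finishes
    end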
data/lib/traject/thread_pool.rb
CHANGED
@@ -13,7 +13,7 @@ module Traject
  # be created, and work sent to the Traject::ThreadPool will just be executed
  # in the caller thread. We call this a nil threadpool. One situation it can be useful
  # is if you are running under MRI, where multi-core parallelism isn't available, so
- # an actual threadpool may not be useful. (Although in some cases a thread pool,
+ # an actual threadpool may not be useful. (Although in some cases a thread pool,
  # especially one with size 1, can be useful in MRI for I/O blocking operations)
  #
  # 3) Use the #maybe_in_threadpool method to send blocks to thread pool for
@@ -40,7 +40,7 @@ module Traject
  # to complete, then return. You can not give any more work to the pool
  # after you do this. By default it'll wait pretty much forever, which should
  # be fine. If you never call shutdown, then queued or in-progress work
- # may be abandoned when the program ends, which would be bad.
+ # may be abandoned when the program ends, which would be bad.
  #
  # 7) We will keep track of total times a block is run in thread pool, and
  # total elapsed (wall) time of running all blocks, so an average_execution_ms
@@ -51,24 +51,26 @@ module Traject
     attr_reader :pool_size, :queue_capacity

     # First arg is pool size, 0 or nil and we'll be a null/no-op pool which executes
-    # work in caller thread.
+    # work in caller thread.
     def initialize(pool_size)
+      @thread_pool = nil # assume we don't have one
+      @exceptions_caught_queue = [] # start off without exceptions
       unless pool_size.nil? || pool_size == 0
-        @pool_size
+        @pool_size = pool_size.to_i
         @queue_capacity = pool_size * 3

-        @thread_pool
-
-
-
-
+        @thread_pool = Concurrent::ThreadPoolExecutor.new(
+          :min_threads => @pool_size,
+          :max_threads => @pool_size,
+          :max_queue => @queue_capacity,
+          :fallback_policy => :caller_runs
         )

         # A thread-safe queue to collect exceptions cross-threads.
         # We really only need to save the first exception, but a queue
         # is a convenient way to store a value concurrency-safely, and
-        # might as well store all of them.
-        @exceptions_caught_queue
+        # might as well store all of them.
+        @exceptions_caught_queue = Queue.new
       end
     end

@@ -133,7 +135,7 @@ module Traject
     # as a non-functioning threadpool -- then this method is just
     # a no-op.
     def raise_collected_exception!
-
+      unless @exceptions_caught_queue.empty?
         e = @exceptions_caught_queue.pop
         raise e
       end
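The rebuilt initializer above creates a fixed-size Concurrent::ThreadPoolExecutor with a bounded queue and a :caller_runs fallback, so work runs on the submitting thread once the queue fills up. A standalone sketch of that configuration (pool size 4 is an arbitrary example, not a traject default):

    require 'concurrent'

    pool_size = 4
    pool = Concurrent::ThreadPoolExecutor.new(
      :min_threads     => pool_size,
      :max_threads     => pool_size,
      :max_queue       => pool_size * 3,   # same queue_capacity rule as above
      :fallback_policy => :caller_runs     # caller runs the block when the queue is full
    )

    pool.post { puts "indexing work happens on a pool thread (or the caller when saturated)" }
    pool.shutdown
    pool.wait_for_termination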
data/lib/traject/translation_map.rb
CHANGED
@@ -1,5 +1,3 @@
-require 'traject'
-
 require 'yaml'
 require 'dot-properties'

@@ -131,13 +129,13 @@ module Traject
         yaml_file = File.join( base, "translation_maps", "#{path}.yaml" )
         prop_file = File.join(base, "translation_maps", "#{path}.properties" )

-        if File.
+        if File.exist? rb_file
           found = eval( File.open(rb_file).read , binding, rb_file )
           break
-        elsif File.
+        elsif File.exist? yaml_file
           found = YAML.load_file(yaml_file)
           break
-        elsif File.
+        elsif File.exist? prop_file
           found = Traject::TranslationMap.read_properties(prop_file)
           break
         end
@@ -231,21 +229,21 @@ module Traject
       array.replace( self.translate_array(array))
     end

-    # Return a new TranslationMap that results from merging argument on top of self.
+    # Return a new TranslationMap that results from merging argument on top of self.
     # Can be useful for taking an existing translation map, but merging a few
-    # overrides on top.
+    # overrides on top.
     #
     #     merged_map = TranslationMap.new(something).merge TranslationMap.new(else)
     #     #...
     #     merged_map.translate_array(something) # etc
     #
-    # If a default is set in the second map, it will merge over the first too.
+    # If a default is set in the second map, it will merge over the first too.
     #
     # You can also pass in a plain hash as an arg, instead of an existing TranslationMap:
     #
     #     TranslationMap.new(something).merge("overridden_key" => "value", "a" => "")
     def merge(other_map)
-      default = other_map.default || self.default
+      default = other_map.default || self.default
       TranslationMap.new(self.to_hash.merge(other_map.to_hash), :default => default)
     end

@@ -258,9 +256,9 @@ module Traject
     protected

     # We use dot-properties gem for reading .properties files,
-    # return a hash.
-    def self.read_properties(file_name)
-      return DotProperties.load(file_name).to_h
+    # return a hash.
+    def self.read_properties(file_name)
+      return DotProperties.load(file_name).to_h
     end

 end
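A minimal sketch of the #merge behavior described in the comments above; the map contents are made up for illustration, and the constructor/lookup usage is assumed from the merge implementation shown:

    require 'traject'

    base      = Traject::TranslationMap.new("eng" => "English", "ger" => "German")
    overrides = Traject::TranslationMap.new({"ger" => "Deutsch"}, :default => "Unknown")

    merged = base.merge(overrides)
    merged["ger"]  # => "Deutsch"
    merged["eng"]  # => "English"
    merged["xxx"]  # => "Unknown"  (default from the second map wins, as documented above)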
data/lib/traject/util.rb
CHANGED
@@ -1,5 +1,3 @@
-require 'traject'
-
 module Traject
   # Just some internal utility methods
   module Util
@@ -27,17 +25,17 @@ module Traject
     end

     # Provide a config source file path, and an exception.
-    #
+    #
     # Returns the line number from the first line in the stack
-    # trace of the exception that matches your file path.
+    # trace of the exception that matches your file path.
     # of the first line in the backtrace matching that file_path.
-    #
-    # Returns `nil` if no suitable backtrace line can be found.
     #
-    #
+    # Returns `nil` if no suitable backtrace line can be found.
+    #
+    # Has special logic to try and grep the info out of a SyntaxError, bah.
     def self.backtrace_lineno_for_config(file_path, exception)
       # For a SyntaxError, we really need to grep it from the
-      # exception message, it really appears to be nowhere else. Ugh.
+      # exception message, it really appears to be nowhere else. Ugh.
       if exception.kind_of? SyntaxError
         if exception.message =~ /:(\d+):/
           return $1.to_i
@@ -45,13 +43,13 @@ module Traject
       end

       # Otherwise we try to fish it out of the backtrace, first
-      # line matching the config file path.
+      # line matching the config file path.

       # exception.backtrace_locations exists in MRI 2.1+, which makes
       # our task a lot easier. But not yet in JRuby 1.7.x, so we got to
-      # handle the old way of having to parse the strings in backtrace too.
-      if ( exception.respond_to?(:backtrace_locations) &&
-           exception.backtrace_locations &&
+      # handle the old way of having to parse the strings in backtrace too.
+      if ( exception.respond_to?(:backtrace_locations) &&
+           exception.backtrace_locations &&
            exception.backtrace_locations.length > 0 )
         location = exception.backtrace_locations.find do |bt|
           bt.path == file_path
@@ -71,19 +69,19 @@ module Traject

     # Extract just the part of the backtrace that is "below"
     # the config file mentioned. If we can't find the config file
-    # in the stack trace, we might return empty array.
+    # in the stack trace, we might return empty array.
     #
     # If the ruby supports Exception#backtrace_locations, the
-    # returned array will actually be of Thread::Backtrace::Location elements.
+    # returned array will actually be of Thread::Backtrace::Location elements.
     def self.backtrace_from_config(file_path, exception)
       filtered_trace = []
       found = false

       # MRI 2.1+ has exception.backtrace_locations which makes
-      # this a lot easier, but JRuby 1.7.x doesn't yet, so we
-      # need to do it both ways.
-      if ( exception.respond_to?(:backtrace_locations) &&
-           exception.backtrace_locations &&
+      # this a lot easier, but JRuby 1.7.x doesn't yet, so we
+      # need to do it both ways.
+      if ( exception.respond_to?(:backtrace_locations) &&
+           exception.backtrace_locations &&
            exception.backtrace_locations.length > 0 )

         exception.backtrace_locations.each do |location|
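The two helpers above exist to point error reporting back at a traject config file. A hedged sketch of calling them directly; the config path is hypothetical:

    require 'traject'

    config_path = "/path/to/traject_config.rb"  # hypothetical config file

    begin
      load config_path
    rescue ScriptError, StandardError => e
      lineno = Traject::Util.backtrace_lineno_for_config(config_path, e)  # Integer or nil
      trace  = Traject::Util.backtrace_from_config(config_path, e)        # possibly empty Array
      warn "error in #{config_path} at line #{lineno}" if lineno
      trace.each { |line| warn line.to_s }
    end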
data/lib/traject/version.rb
CHANGED
data/test/indexer/macros_marc21_test.rb
CHANGED
@@ -118,6 +118,8 @@ describe "Traject::Macros::Marc21"
     assert_equal "one two three", Marc21.trim_punctuation("one two three:")
     assert_equal "one two three .", Marc21.trim_punctuation("one two three .")
     assert_equal "one two three", Marc21.trim_punctuation("one two three.")
+    assert_equal "one two three...", Marc21.trim_punctuation("one two three...")
+    assert_equal "one two three", Marc21.trim_punctuation(" one two three.")

     assert_equal "one two [three]", Marc21.trim_punctuation("one two [three]")
     assert_equal "one two three", Marc21.trim_punctuation("one two three]")
data/test/marc_extractor_test.rb
CHANGED
@@ -35,7 +35,7 @@ describe "Traject::MarcExtractor" do
   end

   it "parses a mixed bag" do
-    parsed = Traject::MarcExtractor::Spec.hash_from_string("
+    parsed = Traject::MarcExtractor::Spec.hash_from_string("245abcdes:810:700|*4|bcd")
     spec245 = parsed['245'].first
     spec810 = parsed['810'].first
     spec700 = parsed['700'].first
@@ -46,7 +46,7 @@ describe "Traject::MarcExtractor" do
     assert spec245
     assert_nil spec245.indicator1
     assert_nil spec245.indicator2
-    assert_equal %w{a b c d e}, spec245.subfields
+    assert_equal %w{a b c d e s}, spec245.subfields

     #810
     assert spec810
data/traject.gemspec
CHANGED
@@ -4,13 +4,13 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 require 'traject/version'

 Gem::Specification.new do |spec|
-  spec.name
-  spec.version
-  spec.authors
-  spec.email
-  spec.summary
-  spec.homepage
-  spec.license
+  spec.name = "traject"
+  spec.version = Traject::VERSION
+  spec.authors = ["Jonathan Rochkind", "Bill Dueber"]
+  spec.email = ["none@nowhere.org"]
+  spec.summary = %q{Index MARC to Solr; or generally process source records to hash-like structures}
+  spec.homepage = "http://github.com/traject/traject"
+  spec.license = "MIT"

   spec.files = `git ls-files`.split($/)
   spec.executables = ["traject"]
@@ -23,10 +23,10 @@ Gem::Specification.new do |spec|
   spec.add_dependency "concurrent-ruby", ">= 0.8.0"
   spec.add_dependency "marc", "~> 1.0"

-  spec.add_dependency "hashie", "~> 3.1"
-  spec.add_dependency "slop", ">= 3.4.5", "< 4.0"
-  spec.add_dependency "yell"
-  spec.add_dependency "dot-properties", ">= 0.1.1"
+  spec.add_dependency "hashie", "~> 3.1" # used for Indexer#settings
+  spec.add_dependency "slop", ">= 3.4.5", "< 4.0" # command line parsing
+  spec.add_dependency "yell" # logging
+  spec.add_dependency "dot-properties", ">= 0.1.1" # reading java style .properties
   spec.add_dependency "httpclient", "~> 2.5"
   spec.add_dependency 'marc-fastxmlwriter', '~>1.0' # fast marc->xml

@@ -40,8 +40,8 @@ Gem::Specification.new do |spec|
     spec.platform = "ruby"
   end

+  spec.add_development_dependency "bundler", '~> 1.7'

-  spec.add_development_dependency "bundler", "~> 1.7"
   spec.add_development_dependency "rake"
   spec.add_development_dependency "minitest"
 end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: traject
 version: !ruby/object:Gem::Version
-  version: 2.3.0
+  version: 2.3.1
 platform: java
 authors:
 - Jonathan Rochkind
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-
+date: 2016-04-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement