traject 0.10.0 → 0.11.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +6 -5
- data/doc/extending.md +4 -4
- data/doc/settings.md +5 -0
- data/lib/traject/command_line.rb +40 -19
- data/lib/traject/macros/marc21.rb +14 -6
- data/lib/traject/marc4j_reader.rb +15 -1
- data/lib/traject/mock_reader.rb +27 -16
- data/lib/traject/mock_writer.rb +3 -15
- data/lib/traject/solrj_writer.rb +6 -0
- data/lib/traject/translation_map.rb +1 -1
- data/lib/traject/version.rb +1 -1
- data/test/marc4j_reader_test.rb +10 -0
- data/traject.gemspec +1 -1
- metadata +4 -4
data/README.md
CHANGED
@@ -321,9 +321,10 @@ You can supply more than one conf file with repeated `-c` arguments.
|
|
321
321
|
|
322
322
|
traject -c connection_conf.rb -c indexing_conf.rb marc_file.mrc
|
323
323
|
|
324
|
-
If you
|
324
|
+
If you supply a `--stdin` argument, traject will try to read from stdin.
|
325
|
+
You can only supply one marc file at a time, but we can take advantage of stdin to get around this:
|
325
326
|
|
326
|
-
cat some/dir/*.marc | traject -c conf_file.rb
|
327
|
+
cat some/dir/*.marc | traject -c conf_file.rb --stdin
|
327
328
|
|
328
329
|
You can set any setting on the command line with `-s key=value`.
|
329
330
|
This will over-ride any settings set with `provide` in conf files.
|
@@ -343,7 +344,7 @@ Use `-u` as a shortcut for `s solr.url=X`
|
|
343
344
|
|
344
345
|
traject -c conf_file.rb -u http://example.com/solr marc_file.mrc
|
345
346
|
|
346
|
-
Also see `-I load_path` and `-
|
347
|
+
Also see `-I load_path` and `-G Gemfile` options under Extending With Your Own Code.
|
347
348
|
|
348
349
|
See also [Hints for batch and cronjob use](./doc/batch_execution.md) of traject.
|
349
350
|
|
@@ -374,9 +375,9 @@ Own Code](./doc/extending.md)
|
|
374
375
|
* translation map files found on the load path or in a
|
375
376
|
"./translation_maps" subdir on the load path will be found
|
376
377
|
for Traject translation maps.
|
377
|
-
* Traject `-
|
378
|
+
* Traject `-G` command line can be used to tell traject to use
|
378
379
|
bundler with a `Gemfile` located at current working directory
|
379
|
-
(or give an argument to `-
|
380
|
+
(or give an argument to `-G ./some/myGemfile`)
|
380
381
|
|
381
382
|
## More
|
382
383
|
|
data/doc/extending.md
CHANGED
@@ -19,9 +19,9 @@ of a couple traject features meant to make it easier.
|
|
19
19
|
* translation map files found on the load path or in a
|
20
20
|
"./translation_maps" subdir on the load path will be found
|
21
21
|
for Traject translation maps.
|
22
|
-
* Traject `-
|
22
|
+
* Traject `-G` command line can be used to tell traject to use
|
23
23
|
bundler with a `Gemfile` located at current working directory
|
24
|
-
(or give an argument to `-
|
24
|
+
(or give an argument to `-G ./some/myGemfile`)
|
25
25
|
|
26
26
|
## Custom code local to your project
|
27
27
|
|
@@ -161,9 +161,9 @@ possibly with version restrictions, in the [Gemfile](http://bundler.io/v1.3/gemf
|
|
161
161
|
Run `bundle install` from the directory with the Gemfile, on any system
|
162
162
|
at any time, to make sure specified gems are installed.
|
163
163
|
|
164
|
-
**Run traject** with the `-
|
164
|
+
**Run traject** with the `-G` flag to tell it to use the Gemfile:
|
165
165
|
|
166
|
-
traject -
|
166
|
+
traject -G -c some_traject_config.rb ...
|
167
167
|
|
168
168
|
Traject will use bundler to setup with the Gemfile, making sure
|
169
169
|
the specified versions of all gems are used (and also making sure
|
data/doc/settings.md
CHANGED
@@ -54,6 +54,11 @@ for commonly used settings, see `traject -h`.
|
|
54
54
|
* `marc4j_reader.source_encoding`: Used by Marc4JReader only when marc.source_type is 'binary', encoding strings accepted
|
55
55
|
by marc4j MarcPermissiveStreamReader. Default "BESTGUESS", also "UTF-8", "MARC"
|
56
56
|
|
57
|
+
* `marcout.allow_oversized`: Used with `-x marcout` command to output marc when outputting
|
58
|
+
as ISO 2709 binary, set to true or string "true", and the MARC::Writer will have
|
59
|
+
allow_oversized=true set, allowing oversized records to be serialized with length
|
60
|
+
bytes zero'd out -- technically illegal, but can be read by MARC::Reader in permissive mode.
|
61
|
+
|
57
62
|
* `output_file`: Output file to write to for operations that write to files: For instance the `marcout` command,
|
58
63
|
or Writer classes that write to files, like Traject::JsonWriter. Has a shortcut
|
59
64
|
`-o` on command line.
|
data/lib/traject/command_line.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
|
+
# Require as little as possible at top, so we can bundle require later
|
2
|
+
# if needed, before requiring anything from the bundle. Can't avoid slop
|
3
|
+
# though, to get our bundle arg out, sorry.
|
1
4
|
require 'slop'
|
2
|
-
require 'traject'
|
3
|
-
require 'traject/indexer'
|
4
5
|
|
5
6
|
module Traject
|
6
7
|
# The class that executes for the Traject command line utility.
|
@@ -8,7 +9,7 @@ module Traject
|
|
8
9
|
# Warning, does do things like exit entire program on error at present.
|
9
10
|
# You probably don't want to use this class for anything but an actual
|
10
11
|
# shell command line, if you want to execute indexing directly, just
|
11
|
-
# use the Traject::Indexer directly.
|
12
|
+
# use the Traject::Indexer directly.
|
12
13
|
#
|
13
14
|
# A CommandLine object has a single persistent Indexer object it uses
|
14
15
|
class CommandLine
|
@@ -43,10 +44,16 @@ module Traject
|
|
43
44
|
|
44
45
|
# have to use Slop object to tell diff between
|
45
46
|
# no arg supplied and no option -G given at all
|
46
|
-
if slop.present? :
|
47
|
-
require_bundler_setup(options[:
|
47
|
+
if slop.present? :Gemfile
|
48
|
+
require_bundler_setup(options[:Gemfile])
|
48
49
|
end
|
49
50
|
|
51
|
+
# We require them here instead of top of file,
|
52
|
+
# so we have done bundler require before we require these.
|
53
|
+
require 'traject'
|
54
|
+
require 'traject/indexer'
|
55
|
+
|
56
|
+
|
50
57
|
(options[:load_path] || []).each do |path|
|
51
58
|
$LOAD_PATH << path unless $LOAD_PATH.include? path
|
52
59
|
end
|
@@ -114,7 +121,7 @@ module Traject
|
|
114
121
|
when "xml"
|
115
122
|
writer = MARC::XMLWriter.new(output_arg)
|
116
123
|
when "human"
|
117
|
-
writer = output_arg.kind_of?(String) ? File.open(output_arg, "w:binary") : output_arg
|
124
|
+
writer = output_arg.kind_of?(String) ? File.open(output_arg, "w:binary") : output_arg
|
118
125
|
else
|
119
126
|
raise ArgumentError.new("traject marcout unrecognized marcout.type: #{output_type}")
|
120
127
|
end
|
@@ -142,14 +149,17 @@ module Traject
|
|
142
149
|
# * It INSISTS on reading from ARGV, making it hard to test, or use when you want to give
|
143
150
|
# it a list of files on something other than ARGV.
|
144
151
|
#
|
145
|
-
# So for now we do just one file, or stdin if
|
146
|
-
|
147
|
-
|
148
|
-
exit 1
|
149
|
-
end
|
150
|
-
if argv.length == 0
|
152
|
+
# So for now we do just one file, or stdin if specified. Sorry!
|
153
|
+
|
154
|
+
if options[:stdin]
|
151
155
|
indexer.logger.info "Reading from STDIN..."
|
152
156
|
io = $stdin
|
157
|
+
elsif argv.length > 1
|
158
|
+
self.console.puts "Sorry, traject can only handle one input file at a time right now. `#{argv}` Exiting..."
|
159
|
+
exit 1
|
160
|
+
elsif argv.length == 0
|
161
|
+
indexer.logger.warn "Warning, no file input given..."
|
162
|
+
io = File.open(File::NULL, 'r')
|
153
163
|
else
|
154
164
|
indexer.logger.info "Reading from #{argv.first}"
|
155
165
|
io = File.open(argv.first, 'r')
|
@@ -160,15 +170,24 @@ module Traject
|
|
160
170
|
def load_configuration_files!(my_indexer, conf_files)
|
161
171
|
conf_files.each do |conf_path|
|
162
172
|
begin
|
163
|
-
|
173
|
+
file_io = File.open(conf_path)
|
164
174
|
rescue Errno::ENOENT => e
|
165
175
|
self.console.puts "Could not find configuration file '#{conf_path}', exiting..."
|
166
176
|
exit 2
|
177
|
+
end
|
178
|
+
|
179
|
+
begin
|
180
|
+
my_indexer.instance_eval(file_io.read, conf_path)
|
167
181
|
rescue Exception => e
|
168
|
-
|
169
|
-
|
182
|
+
if (conf_trace = e.backtrace.find {|l| l.start_with? conf_path}) &&
|
183
|
+
(conf_trace =~ /\A.*\:(\d+)\:in/)
|
184
|
+
line_number = $1
|
185
|
+
end
|
186
|
+
|
187
|
+
self.console.puts "Error processing configuration file '#{conf_path}' at line #{line_number}"
|
188
|
+
self.console.puts " #{e.class}: #{e.message}"
|
170
189
|
if e.backtrace.first =~ /\A(.*)\:in/
|
171
|
-
self.console.puts " #{$1}"
|
190
|
+
self.console.puts " from #{$1}"
|
172
191
|
end
|
173
192
|
exit 3
|
174
193
|
end
|
@@ -191,10 +210,10 @@ module Traject
|
|
191
210
|
def require_bundler_setup(gemfile=nil)
|
192
211
|
if gemfile
|
193
212
|
# tell bundler what gemfile to use
|
194
|
-
gem_path = File.expand_path(
|
213
|
+
gem_path = File.expand_path( gemfile )
|
195
214
|
# bundler not good at error reporting, we check ourselves
|
196
215
|
unless File.exists? gem_path
|
197
|
-
self.console.puts "Gemfile `#{
|
216
|
+
self.console.puts "Gemfile `#{gemfile}` does not exist, exiting..."
|
198
217
|
self.console.puts
|
199
218
|
self.console.puts slop.help
|
200
219
|
exit 2
|
@@ -263,9 +282,11 @@ module Traject
|
|
263
282
|
on :j, "output as pretty printed json, shortcut for -s writer_class_name=JsonWriter -s json_writer.pretty_print=true"
|
264
283
|
on :t, :marc_type, "xml, json or binary. shortcut for -s marc_source.type=", :argument => true
|
265
284
|
on :I, "load_path", "append paths to ruby $LOAD_PATH", :argument => true, :as => Array, :delimiter => ":"
|
266
|
-
on :
|
285
|
+
on :G, "Gemfile", "run with bundler and optionally specified Gemfile", :argument => :optional, :default => ""
|
267
286
|
|
268
287
|
on :x, "command", "alternate traject command: process (default); marcout", :argument => true, :default => "process"
|
288
|
+
|
289
|
+
on "stdin", "read input from stdin"
|
269
290
|
end
|
270
291
|
end
|
271
292
|
|
@@ -78,19 +78,27 @@ module Traject::Macros
|
|
78
78
|
# [json] marc-in-json (http://dilettantes.code4lib.org/blog/2010/09/a-proposal-to-serialize-marc-in-json/)
|
79
79
|
# [binary] Standard ISO 2709 binary marc. By default WILL be base64-encoded,
|
80
80
|
# assumed destination a solr 'binary' field.
|
81
|
-
# add option `:binary_escape => false` to do straight binary -- unclear
|
81
|
+
# * add option `:binary_escape => false` to do straight binary -- unclear
|
82
82
|
# what Solr's documented behavior is when you do this, and add a string
|
83
83
|
# with binary control chars to solr. May do different things in diff
|
84
84
|
# Solr versions, including raising exceptions.
|
85
|
+
# * add option `:allow_oversized => true` to pass that flag
|
86
|
+
# to the MARC::Writer. Oversized records will then still be
|
87
|
+
# serialized, with certain header bytes filled with ascii 0's
|
88
|
+
# -- technically illegal MARC, but can still be read by
|
89
|
+
# ruby MARC::Reader in permissive mode.
|
85
90
|
def serialized_marc(options)
|
86
|
-
|
87
|
-
|
91
|
+
format = options[:format].to_s
|
92
|
+
binary_escape = (options[:binary_escape] != false)
|
93
|
+
allow_oversized = (options[:allow_oversized] == true)
|
94
|
+
|
95
|
+
raise ArgumentError.new("Need :format => [binary|xml|json] arg") unless %w{binary xml json}.include?(format)
|
88
96
|
|
89
97
|
lambda do |record, accumulator, context|
|
90
|
-
case
|
98
|
+
case format
|
91
99
|
when "binary"
|
92
|
-
binary = record
|
93
|
-
binary = Base64.encode64(binary)
|
100
|
+
binary = MARC::Writer.encode(record, allow_oversized)
|
101
|
+
binary = Base64.encode64(binary) if binary_escape
|
94
102
|
accumulator << binary
|
95
103
|
when "xml"
|
96
104
|
# ruby-marc #to_xml returns a REXML object at time of this writing, bah!@
|
@@ -35,8 +35,12 @@ require 'marc'
|
|
35
35
|
# Default 'BESTGUESS', but marc records in the wild are so wrong here, recommend setting.
|
36
36
|
# (will ALWAYS be transcoded to UTF-8 on the way out. We insist.)
|
37
37
|
#
|
38
|
-
# * marc4j_reader.jar_dir:
|
38
|
+
# * marc4j_reader.jar_dir: Path to a directory containing Marc4J jar file to use. All .jar's in dir will
|
39
39
|
# be loaded. If unset, uses marc4j.jar bundled with traject.
|
40
|
+
#
|
41
|
+
# * marc4j_reader.keep_marc4j: Keeps the original marc4j record accessible from
|
42
|
+
# the eventual ruby-marc record via record#original_marc4j
|
43
|
+
|
40
44
|
class Traject::Marc4JReader
|
41
45
|
include Enumerable
|
42
46
|
|
@@ -47,6 +51,13 @@ class Traject::Marc4JReader
|
|
47
51
|
@input_stream = input_stream
|
48
52
|
|
49
53
|
ensure_marc4j_loaded!
|
54
|
+
|
55
|
+
if @settings['marc4j_reader.keep_marc4j'] &&
|
56
|
+
! (MARC::Record.instance_methods.include?(:original_marc4j) &&
|
57
|
+
MARC::Record.instance_methods.include?(:"original_marc4j="))
|
58
|
+
MARC::Record.class_eval('attr_accessor :original_marc4j')
|
59
|
+
end
|
60
|
+
|
50
61
|
end
|
51
62
|
|
52
63
|
# Loads solrj unless it appears to already be loaded.
|
@@ -91,6 +102,9 @@ class Traject::Marc4JReader
|
|
91
102
|
begin
|
92
103
|
marc4j = internal_reader.next
|
93
104
|
rubymarc = convert_marc4j_to_rubymarc(marc4j)
|
105
|
+
if @settings['marc4j_reader.keep_marc4j']
|
106
|
+
rubymarc.original_marc4j = marc4j
|
107
|
+
end
|
94
108
|
rescue Exception =>e
|
95
109
|
msg = "MARC4JReader: Error reading MARC, fatal, re-raising"
|
96
110
|
if marc4j
|
data/lib/traject/mock_reader.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
require 'marc'
|
3
3
|
require 'json'
|
4
4
|
module Traject
|
5
|
-
|
5
|
+
|
6
6
|
# A mock reader, designed to do almost no work during a run to provide better benchmarking
|
7
7
|
#
|
8
8
|
# It pulls in 20 records from the end of this file and then just returns
|
@@ -12,42 +12,53 @@ module Traject
|
|
12
12
|
#
|
13
13
|
# require 'traject/mock_writer'
|
14
14
|
# require 'traject/mock_reader'
|
15
|
-
#
|
15
|
+
#
|
16
16
|
# settings do
|
17
17
|
# store "reader_class_name", "Traject::MockReader"
|
18
18
|
# store "writer_class_name", "Traject::MockWriter"
|
19
19
|
# store "mock_reader.limit", 4_000 # default is 10_000
|
20
20
|
# end
|
21
|
-
|
21
|
+
|
22
22
|
class MockReader
|
23
|
-
|
23
|
+
|
24
24
|
attr_accessor :limit
|
25
|
-
|
25
|
+
|
26
26
|
# @param [Ignored] input_stream (ignored)
|
27
27
|
# @param [Hash] settings (looks only for an integer in 'mock_reader.limit')
|
28
28
|
def initialize(input_stream, settings = {})
|
29
|
-
@
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
29
|
+
@limit = (settings["mock_reader.limit"] || 10_000).to_i
|
30
|
+
|
31
|
+
@records = load_ndjson(File.open(__FILE__))
|
32
|
+
|
33
|
+
# freeze it immutable for thread safety and performance
|
34
|
+
@records.each {|r| r.fields.freeze}
|
35
|
+
end
|
36
|
+
|
37
|
+
# newline delimited json, assuming no internal unescaped
|
38
|
+
# newlines in json too!
|
39
|
+
def load_ndjson(file_io)
|
40
|
+
records = []
|
41
|
+
|
42
|
+
this_file_iter = file_io.each_line
|
43
|
+
|
34
44
|
while true
|
35
45
|
line = this_file_iter.next
|
36
46
|
break if line =~ /^\_\_END\_\_/
|
37
47
|
end
|
38
|
-
|
48
|
+
|
39
49
|
begin
|
40
50
|
while true
|
41
51
|
json = this_file_iter.next
|
42
52
|
next unless json =~ /\S/
|
43
|
-
|
53
|
+
records << MARC::Record.new_from_hash(JSON.parse(json))
|
44
54
|
end
|
45
55
|
rescue StopIteration
|
46
56
|
end
|
47
|
-
|
57
|
+
|
58
|
+
return records
|
48
59
|
end
|
49
60
|
|
50
|
-
|
61
|
+
|
51
62
|
def each
|
52
63
|
unless block_given?
|
53
64
|
enum_for(:each)
|
@@ -58,8 +69,8 @@ module Traject
|
|
58
69
|
end
|
59
70
|
end
|
60
71
|
end
|
61
|
-
|
62
|
-
|
72
|
+
|
73
|
+
|
63
74
|
end
|
64
75
|
end
|
65
76
|
|
data/lib/traject/mock_writer.rb
CHANGED
@@ -1,17 +1,5 @@
|
|
1
|
-
# A writer
|
2
|
-
#
|
3
|
-
#
|
4
|
-
# Should be thread-safe (ie, multiple worker threads can be calling #put
|
5
|
-
# concurrently), by wrapping write to actual output file in a mutex synchronize.
|
6
|
-
# This does not seem to effect performance much, as far as I could tell
|
7
|
-
# benchmarking.
|
8
|
-
#
|
9
|
-
# Output will be sent to settings["output_file"] string path, or else
|
10
|
-
# settings["output_stream"] (ruby IO object), or else stdout.
|
11
|
-
#
|
12
|
-
# This class can be sub-classed to write out different serialized
|
13
|
-
# reprentations -- subclasses will just override the #serialize
|
14
|
-
# method. For instance, see JsonWriter.
|
1
|
+
# A Null writer that does absolutely nothing with records given to it,
|
2
|
+
# just drops em on the floor.
|
15
3
|
class Traject::MockWriter
|
16
4
|
attr_reader :settings
|
17
5
|
|
@@ -21,7 +9,7 @@ class Traject::MockWriter
|
|
21
9
|
|
22
10
|
def serialize(context)
|
23
11
|
# null
|
24
|
-
end
|
12
|
+
end
|
25
13
|
|
26
14
|
def put(context)
|
27
15
|
# null
|
data/lib/traject/solrj_writer.rb
CHANGED
@@ -257,6 +257,12 @@ class Traject::SolrJWriter
|
|
257
257
|
return true
|
258
258
|
end
|
259
259
|
|
260
|
+
# Consider Solr server returning HTTP 500 Internal Server Error to be fatal.
|
261
|
+
# This can mean, for instance, that disk space is exhausted on solr server.
|
262
|
+
if e.kind_of?(Java::OrgApacheSolrCommon::SolrException) && e.code == 500
|
263
|
+
return true
|
264
|
+
end
|
265
|
+
|
260
266
|
return false
|
261
267
|
end
|
262
268
|
|
@@ -184,7 +184,7 @@ module Traject
|
|
184
184
|
|
185
185
|
class NotFound < Exception
|
186
186
|
def initialize(path)
|
187
|
-
super("No translation map definition file found at '#{path}[.rb|.yaml]' in load path")
|
187
|
+
super("No translation map definition file found at '#{path}[.rb|.yaml]' in load path: #{$LOAD_PATH}")
|
188
188
|
end
|
189
189
|
end
|
190
190
|
|
data/lib/traject/version.rb
CHANGED
data/test/marc4j_reader_test.rb
CHANGED
@@ -75,4 +75,14 @@ describe "Marc4JReader" do
|
|
75
75
|
assert first['245']['a'].encoding.name, "UTF-8"
|
76
76
|
assert_equal "Fikr-i Ayāz /", first['245']['a']
|
77
77
|
end
|
78
|
+
|
79
|
+
it "keeps marc4j object when asked" do
|
80
|
+
file = File.new(support_file_path "test_data.utf8.marc.xml")
|
81
|
+
settings = Traject::Indexer::Settings.new("marc_source.type" => "xml", 'marc4j_reader.keep_marc4j' => true)
|
82
|
+
record = Traject::Marc4JReader.new(file, settings).to_a.first
|
83
|
+
assert_kind_of MARC::Record, record
|
84
|
+
assert_kind_of Java::org.marc4j.marc.impl::RecordImpl, record.original_marc4j
|
85
|
+
end
|
86
|
+
|
87
|
+
|
78
88
|
end
|
data/traject.gemspec
CHANGED
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.require_paths = ["lib"]
|
19
19
|
|
20
20
|
|
21
|
-
spec.add_dependency "marc", ">= 0.7.
|
21
|
+
spec.add_dependency "marc", ">= 0.7.1"
|
22
22
|
spec.add_dependency "hashie", ">= 2.0.5", "< 2.1" # used for Indexer#settings
|
23
23
|
spec.add_dependency "slop", ">= 3.4.5", "< 4.0" # command line parsing
|
24
24
|
spec.add_dependency "yell" # logging
|
metadata
CHANGED
@@ -2,14 +2,14 @@
|
|
2
2
|
name: traject
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.
|
5
|
+
version: 0.11.0
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Jonathan Rochkind
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-09-
|
12
|
+
date: 2013-09-10 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: marc
|
@@ -17,13 +17,13 @@ dependencies:
|
|
17
17
|
requirements:
|
18
18
|
- - '>='
|
19
19
|
- !ruby/object:Gem::Version
|
20
|
-
version: 0.7.
|
20
|
+
version: 0.7.1
|
21
21
|
none: false
|
22
22
|
requirement: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - '>='
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 0.7.
|
26
|
+
version: 0.7.1
|
27
27
|
none: false
|
28
28
|
prerelease: false
|
29
29
|
type: :runtime
|