traject 0.12.0 → 0.13.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE.txt +0 -2
- data/lib/traject/indexer.rb +32 -1
- data/lib/traject/marc4j_reader.rb +13 -46
- data/lib/traject/solrj_writer.rb +1 -4
- data/lib/traject/util.rb +0 -44
- data/lib/traject/version.rb +1 -1
- data/test/indexer/map_record_test.rb +19 -0
- data/traject.gemspec +1 -0
- metadata +18 -4
- data/vendor/marc4j/README.md +0 -17
- data/vendor/marc4j/lib/marc4j-2.5.1-beta.jar +0 -0
data/LICENSE.txt
CHANGED
data/lib/traject/indexer.rb
CHANGED
@@ -205,6 +205,8 @@ class Traject::Indexer
|
|
205
205
|
# Returns the context passed in as second arg, as a convenience for chaining etc.
|
206
206
|
def map_to_context!(context)
|
207
207
|
@index_steps.each do |index_step|
|
208
|
+
# Don't bother if we're skipping this record
|
209
|
+
break if context.skip?
|
208
210
|
if index_step[:type] == :to_field
|
209
211
|
|
210
212
|
accumulator = []
|
@@ -327,7 +329,12 @@ class Traject::Indexer
|
|
327
329
|
thread_pool.maybe_in_thread_pool do
|
328
330
|
context = Context.new(:source_record => record, :settings => settings, :position => position)
|
329
331
|
map_to_context!(context)
|
330
|
-
|
332
|
+
if context.skip?
|
333
|
+
log_skip(context)
|
334
|
+
else
|
335
|
+
writer.put context
|
336
|
+
end
|
337
|
+
|
331
338
|
end
|
332
339
|
|
333
340
|
end
|
@@ -353,6 +360,12 @@ class Traject::Indexer
|
|
353
360
|
|
354
361
|
return true
|
355
362
|
end
|
363
|
+
|
364
|
+
# Log that the current record is being skipped, using
|
365
|
+
# data in context.position and context.skipmessage
|
366
|
+
def log_skip(context)
|
367
|
+
logger.debug "Skipped record #{context.position}: #{context.skipmessage}"
|
368
|
+
end
|
356
369
|
|
357
370
|
def reader_class
|
358
371
|
unless defined? @reader_class
|
@@ -471,11 +484,29 @@ class Traject::Indexer
|
|
471
484
|
hash_init.each_pair do |key, value|
|
472
485
|
self.send("#{key}=", value)
|
473
486
|
end
|
487
|
+
|
488
|
+
@skip = false
|
474
489
|
end
|
475
490
|
|
476
491
|
attr_accessor :clipboard, :output_hash
|
477
492
|
attr_accessor :field_name, :source_record, :settings
|
478
493
|
# 1-based position in stream of processed records.
|
479
494
|
attr_accessor :position
|
495
|
+
|
496
|
+
# Should we be skipping this record?
|
497
|
+
attr_accessor :skipmessage
|
498
|
+
|
499
|
+
# Set the fact that this record should be skipped, with an
|
500
|
+
# optional message
|
501
|
+
def skip!(msg = '(no message given)')
|
502
|
+
@skipmessage = msg
|
503
|
+
@skip = true
|
504
|
+
end
|
505
|
+
|
506
|
+
# Should we skip this record?
|
507
|
+
def skip?
|
508
|
+
@skip
|
509
|
+
end
|
510
|
+
|
480
511
|
end
|
481
512
|
end
|
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'traject'
|
2
2
|
require 'marc'
|
3
|
+
require 'marc/marc4j'
|
3
4
|
|
4
5
|
# Uses Marc4J to read the marc records, but then translates them to
|
5
6
|
# ruby-marc before delivering them still, Marc4J is just inside the black
|
@@ -50,28 +51,25 @@ class Traject::Marc4JReader
|
|
50
51
|
@settings = Traject::Indexer::Settings.new settings
|
51
52
|
@input_stream = input_stream
|
52
53
|
|
53
|
-
ensure_marc4j_loaded!
|
54
|
-
|
55
54
|
if @settings['marc4j_reader.keep_marc4j'] &&
|
56
55
|
! (MARC::Record.instance_methods.include?(:original_marc4j) &&
|
57
56
|
MARC::Record.instance_methods.include?(:"original_marc4j="))
|
58
57
|
MARC::Record.class_eval('attr_accessor :original_marc4j')
|
59
58
|
end
|
59
|
+
|
60
|
+
# Creating a converter will do the following:
|
61
|
+
# - nothing, if it detects that the marc4j jar is already loaded
|
62
|
+
# - load all the .jar files in settings['marc4j_reader.jar_dir'] if set
|
63
|
+
# - load the marc4j jar file bundled with MARC::MARC4J otherwise
|
64
|
+
|
65
|
+
@converter = MARC::MARC4J.new(:jardir => settings['marc4j_reader.jar_dir'], :logger => logger)
|
66
|
+
|
67
|
+
# Convenience
|
68
|
+
java_import org.marc4j.MarcPermissiveStreamReader
|
69
|
+
java_import org.marc4j.MarcXmlReader
|
60
70
|
|
61
71
|
end
|
62
72
|
|
63
|
-
# Loads solrj unless it appears to already be loaded.
|
64
|
-
#
|
65
|
-
# Will load from settings['marc4j_reader.jar_dir'] if given, otherwise
|
66
|
-
# bundled vendor location.
|
67
|
-
#
|
68
|
-
# Will java_import MarcPermissiveStreamReader and MarcXmlReader so you
|
69
|
-
# have those available as un-namespaced classes.
|
70
|
-
def ensure_marc4j_loaded!
|
71
|
-
unless defined?(MarcPermissiveStreamReader) && defined?(MarcXmlReader)
|
72
|
-
Traject::Util.require_marc4j_jars(settings)
|
73
|
-
end
|
74
|
-
end
|
75
73
|
|
76
74
|
def internal_reader
|
77
75
|
@internal_reader ||= create_marc_reader!
|
@@ -101,7 +99,7 @@ class Traject::Marc4JReader
|
|
101
99
|
while (internal_reader.hasNext)
|
102
100
|
begin
|
103
101
|
marc4j = internal_reader.next
|
104
|
-
rubymarc = convert_marc4j_to_rubymarc(marc4j)
|
102
|
+
rubymarc = @converter.marc4j_to_rubymarc(marc4j)
|
105
103
|
if @settings['marc4j_reader.keep_marc4j']
|
106
104
|
rubymarc.original_marc4j = marc4j
|
107
105
|
end
|
@@ -123,35 +121,4 @@ class Traject::Marc4JReader
|
|
123
121
|
@logger ||= (settings[:logger] || Yell.new(STDERR, :level => "gt.fatal")) # null logger)
|
124
122
|
end
|
125
123
|
|
126
|
-
def convert_marc4j_to_rubymarc(marc4j)
|
127
|
-
rmarc = MARC::Record.new
|
128
|
-
rmarc.leader = marc4j.getLeader.marshal
|
129
|
-
|
130
|
-
marc4j.getControlFields.each do |marc4j_control|
|
131
|
-
rmarc.append( MARC::ControlField.new(marc4j_control.getTag(), marc4j_control.getData ) )
|
132
|
-
end
|
133
|
-
|
134
|
-
marc4j.getDataFields.each do |marc4j_data|
|
135
|
-
rdata = MARC::DataField.new( marc4j_data.getTag, marc4j_data.getIndicator1.chr, marc4j_data.getIndicator2.chr )
|
136
|
-
|
137
|
-
marc4j_data.getSubfields.each do |subfield|
|
138
|
-
|
139
|
-
# We assume Marc21, skip corrupted data
|
140
|
-
# if subfield.getCode is more than 255, subsequent .chr
|
141
|
-
# would raise.
|
142
|
-
if subfield.getCode > 255
|
143
|
-
logger.warn("Marc4JReader: Corrupted MARC data, record id #{marc4j.getControlNumber}, field #{marc4j_data.tag}, corrupt subfield code byte #{subfield.getCode}. Skipping subfield, but continuing with record.")
|
144
|
-
next
|
145
|
-
end
|
146
|
-
|
147
|
-
rsubfield = MARC::Subfield.new(subfield.getCode.chr, subfield.getData)
|
148
|
-
rdata.append rsubfield
|
149
|
-
end
|
150
|
-
|
151
|
-
rmarc.append rdata
|
152
|
-
end
|
153
|
-
|
154
|
-
return rmarc
|
155
|
-
end
|
156
|
-
|
157
124
|
end
|
data/lib/traject/solrj_writer.rb
CHANGED
@@ -148,10 +148,7 @@ class Traject::SolrJWriter
|
|
148
148
|
|
149
149
|
if settings["solrj_writer.batch_size"].to_i > 1
|
150
150
|
ready_batch = []
|
151
|
-
|
152
|
-
# Synchronize access to our shared batched_queue state,
|
153
|
-
# but once we've pulled out what we want in local var
|
154
|
-
# `ready_batch`, don't need to synchronize anymore.
|
151
|
+
|
155
152
|
batched_queue.add(package)
|
156
153
|
if batched_queue.size >= settings["solrj_writer.batch_size"].to_i
|
157
154
|
batched_queue.drain_to(ready_batch)
|
data/lib/traject/util.rb
CHANGED
@@ -26,50 +26,6 @@ module Traject
|
|
26
26
|
str.split(':in `').first
|
27
27
|
end
|
28
28
|
|
29
|
-
# Requires marc4j jar(s) from settings['marc4j.jar_dir'] if given, otherwise
|
30
|
-
# uses jars bundled with traject gem in ./vendor
|
31
|
-
#
|
32
|
-
# Have to pass in a settings arg, so we can check it for specified jar dir.
|
33
|
-
#
|
34
|
-
# Tries not to do the dirglob and require if marc4j has already been loaded.
|
35
|
-
# Will define global constants with classes MarcPermissiveStreamReader and MarcXmlReader
|
36
|
-
# if not already defined.
|
37
|
-
#
|
38
|
-
# This is all a bit janky, maybe there's a better way to do this? We do want
|
39
|
-
# a 'require' method defined somewhere utility, so multiple classes can
|
40
|
-
# use it, including extra gems. This method IS used by extra gems, so should
|
41
|
-
# be considered part of the API -- after it's called, those top-level
|
42
|
-
# globals should be available, and marc4j should be loaded.
|
43
|
-
def self.require_marc4j_jars(settings)
|
44
|
-
jruby_ensure_init!
|
45
|
-
|
46
|
-
tries = 0
|
47
|
-
begin
|
48
|
-
tries += 1
|
49
|
-
|
50
|
-
org.marc4j
|
51
|
-
|
52
|
-
# java_import which we'd normally use weirdly doesn't work
|
53
|
-
# from a class method. https://github.com/jruby/jruby/issues/975
|
54
|
-
Object.const_set("MarcPermissiveStreamReader", org.marc4j.MarcPermissiveStreamReader) unless defined? ::MarcPermissiveStreamReader
|
55
|
-
Object.const_set("MarcXmlReader", org.marc4j.MarcXmlReader) unless defined? ::MarcXmlReader
|
56
|
-
rescue NameError => e
|
57
|
-
# /Users/jrochkind/code/solrj-gem/lib"
|
58
|
-
|
59
|
-
include_jar_dir = File.expand_path("../../vendor/marc4j/lib", File.dirname(__FILE__))
|
60
|
-
|
61
|
-
jardir = settings["marc4j.jar_dir"] || include_jar_dir
|
62
|
-
Dir.glob("#{jardir}/*.jar") do |x|
|
63
|
-
require x
|
64
|
-
end
|
65
|
-
|
66
|
-
if tries > 1
|
67
|
-
raise LoadError.new("Can not find Marc4J java classes")
|
68
|
-
else
|
69
|
-
retry
|
70
|
-
end
|
71
|
-
end
|
72
|
-
end
|
73
29
|
|
74
30
|
# Requires solrj jar(s) from settings['solrj.jar_dir'] if given, otherwise
|
75
31
|
# uses jars bundled with traject gem in ./vendor
|
data/lib/traject/version.rb
CHANGED
@@ -184,6 +184,25 @@ describe "Traject::Indexer#map_record" do
|
|
184
184
|
|
185
185
|
assert called, "Called mapping routine"
|
186
186
|
end
|
187
|
+
|
188
|
+
it "skips records" do
|
189
|
+
|
190
|
+
@indexer.to_field("beforeSkip") do |rec, acc|
|
191
|
+
acc << "Before"
|
192
|
+
end
|
193
|
+
|
194
|
+
@indexer.to_field('radical') do |rec, acc, context|
|
195
|
+
context.skip!("Chomsky!") if rec['245'].to_s =~ /Chomsky/
|
196
|
+
end
|
197
|
+
|
198
|
+
@indexer.to_field('afterSkip') do |rec, acc|
|
199
|
+
acc << "After. Should never happen"
|
200
|
+
end
|
201
|
+
|
202
|
+
output = @indexer.map_record(@record)
|
203
|
+
assert_equal ['Before'], output['beforeSkip']
|
204
|
+
assert_nil output['afterSkip']
|
205
|
+
end
|
187
206
|
|
188
207
|
end
|
189
208
|
|
data/traject.gemspec
CHANGED
@@ -19,6 +19,7 @@ Gem::Specification.new do |spec|
|
|
19
19
|
|
20
20
|
|
21
21
|
spec.add_dependency "marc", ">= 0.7.1"
|
22
|
+
spec.add_dependency "marc-marc4j", ">=0.1.1"
|
22
23
|
spec.add_dependency "hashie", ">= 2.0.5", "< 2.1" # used for Indexer#settings
|
23
24
|
spec.add_dependency "slop", ">= 3.4.5", "< 4.0" # command line parsing
|
24
25
|
spec.add_dependency "yell" # logging
|
metadata
CHANGED
@@ -2,14 +2,14 @@
|
|
2
2
|
name: traject
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.12.0
|
5
|
+
version: 0.13.0
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Jonathan Rochkind
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-09-
|
12
|
+
date: 2013-09-12 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: marc
|
@@ -27,6 +27,22 @@ dependencies:
|
|
27
27
|
none: false
|
28
28
|
prerelease: false
|
29
29
|
type: :runtime
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: marc-marc4j
|
32
|
+
version_requirements: !ruby/object:Gem::Requirement
|
33
|
+
requirements:
|
34
|
+
- - '>='
|
35
|
+
- !ruby/object:Gem::Version
|
36
|
+
version: 0.1.1
|
37
|
+
none: false
|
38
|
+
requirement: !ruby/object:Gem::Requirement
|
39
|
+
requirements:
|
40
|
+
- - '>='
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: 0.1.1
|
43
|
+
none: false
|
44
|
+
prerelease: false
|
45
|
+
type: :runtime
|
30
46
|
- !ruby/object:Gem::Dependency
|
31
47
|
name: hashie
|
32
48
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -236,8 +252,6 @@ files:
|
|
236
252
|
- test/translation_maps/translate_array_test.yaml
|
237
253
|
- test/translation_maps/yaml_map.yaml
|
238
254
|
- traject.gemspec
|
239
|
-
- vendor/marc4j/README.md
|
240
|
-
- vendor/marc4j/lib/marc4j-2.5.1-beta.jar
|
241
255
|
- vendor/solrj/README
|
242
256
|
- vendor/solrj/build.xml
|
243
257
|
- vendor/solrj/ivy.xml
|
data/vendor/marc4j/README.md
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
The marc4j.jar file that is in `./lib` was created by:
|
2
|
-
|
3
|
-
Checking out the marc4j source from github.com/marc4j/marc4j, and building
|
4
|
-
with `ant` on 23 July 2013.
|
5
|
-
|
6
|
-
That produced the jar you see, with the name it has. (I am not sure the version
|
7
|
-
in the jar name is accurate, it's not actually the beta1 release, but the RC1
|
8
|
-
release, or maybe even a subsequent release?)
|
9
|
-
|
10
|
-
It can be regenerated by doing the same.
|
11
|
-
|
12
|
-
Bundling the marc4j jar with our traject gem is not neccesarily the
|
13
|
-
best way to go, but it's what we're doing now. See top-level README
|
14
|
-
TODO.
|
15
|
-
|
16
|
-
(You can use your own custom marc4j.jar by using a runtime setting,
|
17
|
-
see Marc4JReader class docs. )
|
Binary file
|