traject 0.12.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.txt +0 -2
- data/lib/traject/indexer.rb +32 -1
- data/lib/traject/marc4j_reader.rb +13 -46
- data/lib/traject/solrj_writer.rb +1 -4
- data/lib/traject/util.rb +0 -44
- data/lib/traject/version.rb +1 -1
- data/test/indexer/map_record_test.rb +19 -0
- data/traject.gemspec +1 -0
- metadata +18 -4
- data/vendor/marc4j/README.md +0 -17
- data/vendor/marc4j/lib/marc4j-2.5.1-beta.jar +0 -0
data/LICENSE.txt
CHANGED
data/lib/traject/indexer.rb
CHANGED
@@ -205,6 +205,8 @@ class Traject::Indexer
|
|
205
205
|
# Returns the context passed in as second arg, as a convenience for chaining etc.
|
206
206
|
def map_to_context!(context)
|
207
207
|
@index_steps.each do |index_step|
|
208
|
+
# Don't bother if we're skipping this record
|
209
|
+
break if context.skip?
|
208
210
|
if index_step[:type] == :to_field
|
209
211
|
|
210
212
|
accumulator = []
|
@@ -327,7 +329,12 @@ class Traject::Indexer
|
|
327
329
|
thread_pool.maybe_in_thread_pool do
|
328
330
|
context = Context.new(:source_record => record, :settings => settings, :position => position)
|
329
331
|
map_to_context!(context)
|
330
|
-
|
332
|
+
if context.skip?
|
333
|
+
log_skip(context)
|
334
|
+
else
|
335
|
+
writer.put context
|
336
|
+
end
|
337
|
+
|
331
338
|
end
|
332
339
|
|
333
340
|
end
|
@@ -353,6 +360,12 @@ class Traject::Indexer
|
|
353
360
|
|
354
361
|
return true
|
355
362
|
end
|
363
|
+
|
364
|
+
# Log that the current record is being skipped, using
|
365
|
+
# data in context.position and context.skipmessage
|
366
|
+
def log_skip(context)
|
367
|
+
logger.debug "Skipped record #{context.position}: #{context.skipmessage}"
|
368
|
+
end
|
356
369
|
|
357
370
|
def reader_class
|
358
371
|
unless defined? @reader_class
|
@@ -471,11 +484,29 @@ class Traject::Indexer
|
|
471
484
|
hash_init.each_pair do |key, value|
|
472
485
|
self.send("#{key}=", value)
|
473
486
|
end
|
487
|
+
|
488
|
+
@skip = false
|
474
489
|
end
|
475
490
|
|
476
491
|
attr_accessor :clipboard, :output_hash
|
477
492
|
attr_accessor :field_name, :source_record, :settings
|
478
493
|
# 1-based position in stream of processed records.
|
479
494
|
attr_accessor :position
|
495
|
+
|
496
|
+
# Message explaining why this record is being skipped (set by skip!)
|
497
|
+
attr_accessor :skipmessage
|
498
|
+
|
499
|
+
# Set the fact that this record should be skipped, with an
|
500
|
+
# optional message
|
501
|
+
def skip!(msg = '(no message given)')
|
502
|
+
@skipmessage = msg
|
503
|
+
@skip = true
|
504
|
+
end
|
505
|
+
|
506
|
+
# Should we skip this record?
|
507
|
+
def skip?
|
508
|
+
@skip
|
509
|
+
end
|
510
|
+
|
480
511
|
end
|
481
512
|
end
|
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'traject'
|
2
2
|
require 'marc'
|
3
|
+
require 'marc/marc4j'
|
3
4
|
|
4
5
|
# Uses Marc4J to read the marc records, but then translates them to
|
5
6
|
# ruby-marc before delivering them still, Marc4J is just inside the black
|
@@ -50,28 +51,25 @@ class Traject::Marc4JReader
|
|
50
51
|
@settings = Traject::Indexer::Settings.new settings
|
51
52
|
@input_stream = input_stream
|
52
53
|
|
53
|
-
ensure_marc4j_loaded!
|
54
|
-
|
55
54
|
if @settings['marc4j_reader.keep_marc4j'] &&
|
56
55
|
! (MARC::Record.instance_methods.include?(:original_marc4j) &&
|
57
56
|
MARC::Record.instance_methods.include?(:"original_marc4j="))
|
58
57
|
MARC::Record.class_eval('attr_accessor :original_marc4j')
|
59
58
|
end
|
59
|
+
|
60
|
+
# Creating a converter will do the following:
|
61
|
+
# - nothing, if it detects that the marc4j jar is already loaded
|
62
|
+
# - load all the .jar files in settings['marc4j_reader.jar_dir'] if set
|
63
|
+
# - load the marc4j jar file bundled with MARC::MARC4J otherwise
|
64
|
+
|
65
|
+
@converter = MARC::MARC4J.new(:jardir => settings['marc4j_reader.jar_dir'], :logger => logger)
|
66
|
+
|
67
|
+
# Convenience
|
68
|
+
java_import org.marc4j.MarcPermissiveStreamReader
|
69
|
+
java_import org.marc4j.MarcXmlReader
|
60
70
|
|
61
71
|
end
|
62
72
|
|
63
|
-
# Loads marc4j unless it appears to already be loaded.
|
64
|
-
#
|
65
|
-
# Will load from settings['marc4j_reader.jar_dir'] if given, otherwise
|
66
|
-
# bundled vendor location.
|
67
|
-
#
|
68
|
-
# Will java_import MarcPermissiveStreamReader and MarcXmlReader so you
|
69
|
-
# have those available as un-namespaced classes.
|
70
|
-
def ensure_marc4j_loaded!
|
71
|
-
unless defined?(MarcPermissiveStreamReader) && defined?(MarcXmlReader)
|
72
|
-
Traject::Util.require_marc4j_jars(settings)
|
73
|
-
end
|
74
|
-
end
|
75
73
|
|
76
74
|
def internal_reader
|
77
75
|
@internal_reader ||= create_marc_reader!
|
@@ -101,7 +99,7 @@ class Traject::Marc4JReader
|
|
101
99
|
while (internal_reader.hasNext)
|
102
100
|
begin
|
103
101
|
marc4j = internal_reader.next
|
104
|
-
rubymarc =
|
102
|
+
rubymarc = @converter.marc4j_to_rubymarc(marc4j)
|
105
103
|
if @settings['marc4j_reader.keep_marc4j']
|
106
104
|
rubymarc.original_marc4j = marc4j
|
107
105
|
end
|
@@ -123,35 +121,4 @@ class Traject::Marc4JReader
|
|
123
121
|
@logger ||= (settings[:logger] || Yell.new(STDERR, :level => "gt.fatal")) # null logger)
|
124
122
|
end
|
125
123
|
|
126
|
-
def convert_marc4j_to_rubymarc(marc4j)
|
127
|
-
rmarc = MARC::Record.new
|
128
|
-
rmarc.leader = marc4j.getLeader.marshal
|
129
|
-
|
130
|
-
marc4j.getControlFields.each do |marc4j_control|
|
131
|
-
rmarc.append( MARC::ControlField.new(marc4j_control.getTag(), marc4j_control.getData ) )
|
132
|
-
end
|
133
|
-
|
134
|
-
marc4j.getDataFields.each do |marc4j_data|
|
135
|
-
rdata = MARC::DataField.new( marc4j_data.getTag, marc4j_data.getIndicator1.chr, marc4j_data.getIndicator2.chr )
|
136
|
-
|
137
|
-
marc4j_data.getSubfields.each do |subfield|
|
138
|
-
|
139
|
-
# We assume Marc21, skip corrupted data
|
140
|
-
# if subfield.getCode is more than 255, subsequent .chr
|
141
|
-
# would raise.
|
142
|
-
if subfield.getCode > 255
|
143
|
-
logger.warn("Marc4JReader: Corrupted MARC data, record id #{marc4j.getControlNumber}, field #{marc4j_data.tag}, corrupt subfield code byte #{subfield.getCode}. Skipping subfield, but continuing with record.")
|
144
|
-
next
|
145
|
-
end
|
146
|
-
|
147
|
-
rsubfield = MARC::Subfield.new(subfield.getCode.chr, subfield.getData)
|
148
|
-
rdata.append rsubfield
|
149
|
-
end
|
150
|
-
|
151
|
-
rmarc.append rdata
|
152
|
-
end
|
153
|
-
|
154
|
-
return rmarc
|
155
|
-
end
|
156
|
-
|
157
124
|
end
|
data/lib/traject/solrj_writer.rb
CHANGED
@@ -148,10 +148,7 @@ class Traject::SolrJWriter
|
|
148
148
|
|
149
149
|
if settings["solrj_writer.batch_size"].to_i > 1
|
150
150
|
ready_batch = []
|
151
|
-
|
152
|
-
# Synchronize access to our shared batched_queue state,
|
153
|
-
# but once we've pulled out what we want in local var
|
154
|
-
# `ready_batch`, don't need to synchronize anymore.
|
151
|
+
|
155
152
|
batched_queue.add(package)
|
156
153
|
if batched_queue.size >= settings["solrj_writer.batch_size"].to_i
|
157
154
|
batched_queue.drain_to(ready_batch)
|
data/lib/traject/util.rb
CHANGED
@@ -26,50 +26,6 @@ module Traject
|
|
26
26
|
str.split(':in `').first
|
27
27
|
end
|
28
28
|
|
29
|
-
# Requires marc4j jar(s) from settings['marc4j.jar_dir'] if given, otherwise
|
30
|
-
# uses jars bundled with traject gem in ./vendor
|
31
|
-
#
|
32
|
-
# Have to pass in a settings arg, so we can check it for specified jar dir.
|
33
|
-
#
|
34
|
-
# Tries not to do the dirglob and require if marc4j has already been loaded.
|
35
|
-
# Will define global constants with classes MarcPermissiveStreamReader and MarcXmlReader
|
36
|
-
# if not already defined.
|
37
|
-
#
|
38
|
-
# This is all a bit janky, maybe there's a better way to do this? We do want
|
39
|
-
# a 'require' method defined somewhere as a utility, so multiple classes can
|
40
|
-
# use it, including extra gems. This method IS used by extra gems, so should
|
41
|
-
# be considered part of the API -- after it's called, those top-level
|
42
|
-
# globals should be available, and marc4j should be loaded.
|
43
|
-
def self.require_marc4j_jars(settings)
|
44
|
-
jruby_ensure_init!
|
45
|
-
|
46
|
-
tries = 0
|
47
|
-
begin
|
48
|
-
tries += 1
|
49
|
-
|
50
|
-
org.marc4j
|
51
|
-
|
52
|
-
# java_import which we'd normally use weirdly doesn't work
|
53
|
-
# from a class method. https://github.com/jruby/jruby/issues/975
|
54
|
-
Object.const_set("MarcPermissiveStreamReader", org.marc4j.MarcPermissiveStreamReader) unless defined? ::MarcPermissiveStreamReader
|
55
|
-
Object.const_set("MarcXmlReader", org.marc4j.MarcXmlReader) unless defined? ::MarcXmlReader
|
56
|
-
rescue NameError => e
|
57
|
-
# /Users/jrochkind/code/solrj-gem/lib"
|
58
|
-
|
59
|
-
include_jar_dir = File.expand_path("../../vendor/marc4j/lib", File.dirname(__FILE__))
|
60
|
-
|
61
|
-
jardir = settings["marc4j.jar_dir"] || include_jar_dir
|
62
|
-
Dir.glob("#{jardir}/*.jar") do |x|
|
63
|
-
require x
|
64
|
-
end
|
65
|
-
|
66
|
-
if tries > 1
|
67
|
-
raise LoadError.new("Can not find Marc4J java classes")
|
68
|
-
else
|
69
|
-
retry
|
70
|
-
end
|
71
|
-
end
|
72
|
-
end
|
73
29
|
|
74
30
|
# Requires solrj jar(s) from settings['solrj.jar_dir'] if given, otherwise
|
75
31
|
# uses jars bundled with traject gem in ./vendor
|
data/lib/traject/version.rb
CHANGED
@@ -184,6 +184,25 @@ describe "Traject::Indexer#map_record" do
|
|
184
184
|
|
185
185
|
assert called, "Called mapping routine"
|
186
186
|
end
|
187
|
+
|
188
|
+
it "skips records" do
|
189
|
+
|
190
|
+
@indexer.to_field("beforeSkip") do |rec, acc|
|
191
|
+
acc << "Before"
|
192
|
+
end
|
193
|
+
|
194
|
+
@indexer.to_field('radical') do |rec, acc, context|
|
195
|
+
context.skip!("Chomsky!") if rec['245'].to_s =~ /Chomsky/
|
196
|
+
end
|
197
|
+
|
198
|
+
@indexer.to_field('afterSkip') do |rec, acc|
|
199
|
+
acc << "After. Should never happen"
|
200
|
+
end
|
201
|
+
|
202
|
+
output = @indexer.map_record(@record)
|
203
|
+
assert_equal ['Before'], output['beforeSkip']
|
204
|
+
assert_nil output['afterSkip']
|
205
|
+
end
|
187
206
|
|
188
207
|
end
|
189
208
|
|
data/traject.gemspec
CHANGED
@@ -19,6 +19,7 @@ Gem::Specification.new do |spec|
|
|
19
19
|
|
20
20
|
|
21
21
|
spec.add_dependency "marc", ">= 0.7.1"
|
22
|
+
spec.add_dependency "marc-marc4j", ">=0.1.1"
|
22
23
|
spec.add_dependency "hashie", ">= 2.0.5", "< 2.1" # used for Indexer#settings
|
23
24
|
spec.add_dependency "slop", ">= 3.4.5", "< 4.0" # command line parsing
|
24
25
|
spec.add_dependency "yell" # logging
|
metadata
CHANGED
@@ -2,14 +2,14 @@
|
|
2
2
|
name: traject
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.
|
5
|
+
version: 0.13.0
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Jonathan Rochkind
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-09-
|
12
|
+
date: 2013-09-12 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: marc
|
@@ -27,6 +27,22 @@ dependencies:
|
|
27
27
|
none: false
|
28
28
|
prerelease: false
|
29
29
|
type: :runtime
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: marc-marc4j
|
32
|
+
version_requirements: !ruby/object:Gem::Requirement
|
33
|
+
requirements:
|
34
|
+
- - '>='
|
35
|
+
- !ruby/object:Gem::Version
|
36
|
+
version: 0.1.1
|
37
|
+
none: false
|
38
|
+
requirement: !ruby/object:Gem::Requirement
|
39
|
+
requirements:
|
40
|
+
- - '>='
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: 0.1.1
|
43
|
+
none: false
|
44
|
+
prerelease: false
|
45
|
+
type: :runtime
|
30
46
|
- !ruby/object:Gem::Dependency
|
31
47
|
name: hashie
|
32
48
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -236,8 +252,6 @@ files:
|
|
236
252
|
- test/translation_maps/translate_array_test.yaml
|
237
253
|
- test/translation_maps/yaml_map.yaml
|
238
254
|
- traject.gemspec
|
239
|
-
- vendor/marc4j/README.md
|
240
|
-
- vendor/marc4j/lib/marc4j-2.5.1-beta.jar
|
241
255
|
- vendor/solrj/README
|
242
256
|
- vendor/solrj/build.xml
|
243
257
|
- vendor/solrj/ivy.xml
|
data/vendor/marc4j/README.md
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
The marc4j.jar file that is in `./lib` was created by:
|
2
|
-
|
3
|
-
Checking out the marc4j source from github.com/marc4j/marc4j, and building
|
4
|
-
with `ant` on 23 July 2013.
|
5
|
-
|
6
|
-
That produced the jar you see, with the name it has. (I am not sure the version
|
7
|
-
in the jar name is accurate, it's not actually the beta1 release, but the RC1
|
8
|
-
release, or maybe even a subsequent release?)
|
9
|
-
|
10
|
-
It can be regenerated by doing the same.
|
11
|
-
|
12
|
-
Bundling the marc4j jar with our traject gem is not necessarily the
|
13
|
-
best way to go, but it's what we're doing now. See top-level README
|
14
|
-
TODO.
|
15
|
-
|
16
|
-
(You can use your own custom marc4j.jar by using a runtime setting,
|
17
|
-
see Marc4JReader class docs. )
|
Binary file
|