traject 0.12.0 → 0.13.0

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE.txt CHANGED
@@ -1,5 +1,3 @@
1
- Copyright (c) 2013 TODO: Write your name
2
-
3
1
  MIT License
4
2
 
5
3
  Permission is hereby granted, free of charge, to any person obtaining
@@ -205,6 +205,8 @@ class Traject::Indexer
205
205
  # Returns the context passed in as second arg, as a convenience for chaining etc.
206
206
  def map_to_context!(context)
207
207
  @index_steps.each do |index_step|
208
+ # Don't bother if we're skipping this record
209
+ break if context.skip?
208
210
  if index_step[:type] == :to_field
209
211
 
210
212
  accumulator = []
@@ -327,7 +329,12 @@ class Traject::Indexer
327
329
  thread_pool.maybe_in_thread_pool do
328
330
  context = Context.new(:source_record => record, :settings => settings, :position => position)
329
331
  map_to_context!(context)
330
- writer.put context
332
+ if context.skip?
333
+ log_skip(context)
334
+ else
335
+ writer.put context
336
+ end
337
+
331
338
  end
332
339
 
333
340
  end
@@ -353,6 +360,12 @@ class Traject::Indexer
353
360
 
354
361
  return true
355
362
  end
363
+
364
+ # Log that the current record is being skipped, using
365
+ # data in context.position and context.skipmessage
366
+ def log_skip(context)
367
+ logger.debug "Skipped record #{context.position}: #{context.skipmessage}"
368
+ end
356
369
 
357
370
  def reader_class
358
371
  unless defined? @reader_class
@@ -471,11 +484,29 @@ class Traject::Indexer
471
484
  hash_init.each_pair do |key, value|
472
485
  self.send("#{key}=", value)
473
486
  end
487
+
488
+ @skip = false
474
489
  end
475
490
 
476
491
  attr_accessor :clipboard, :output_hash
477
492
  attr_accessor :field_name, :source_record, :settings
478
493
  # 1-based position in stream of processed records.
479
494
  attr_accessor :position
495
+
496
+ # Message explaining why this record is being skipped (set by skip!)
497
+ attr_accessor :skipmessage
498
+
499
+ # Set the fact that this record should be skipped, with an
500
+ # optional message
501
+ def skip!(msg = '(no message given)')
502
+ @skipmessage = msg
503
+ @skip = true
504
+ end
505
+
506
+ # Should we skip this record?
507
+ def skip?
508
+ @skip
509
+ end
510
+
480
511
  end
481
512
  end
@@ -1,5 +1,6 @@
1
1
  require 'traject'
2
2
  require 'marc'
3
+ require 'marc/marc4j'
3
4
 
4
5
  # Uses Marc4J to read the marc records, but then translates them to
5
6
  # ruby-marc before delivering them still, Marc4J is just inside the black
@@ -50,28 +51,25 @@ class Traject::Marc4JReader
50
51
  @settings = Traject::Indexer::Settings.new settings
51
52
  @input_stream = input_stream
52
53
 
53
- ensure_marc4j_loaded!
54
-
55
54
  if @settings['marc4j_reader.keep_marc4j'] &&
56
55
  ! (MARC::Record.instance_methods.include?(:original_marc4j) &&
57
56
  MARC::Record.instance_methods.include?(:"original_marc4j="))
58
57
  MARC::Record.class_eval('attr_accessor :original_marc4j')
59
58
  end
59
+
60
+ # Creating a converter will do the following:
61
+ # - nothing, if it detects that the marc4j jar is already loaded
62
+ # - load all the .jar files in settings['marc4j_reader.jar_dir'] if set
63
+ # - load the marc4j jar file bundled with MARC::MARC4J otherwise
64
+
65
+ @converter = MARC::MARC4J.new(:jardir => settings['marc4j_reader.jar_dir'], :logger => logger)
66
+
67
+ # Convenience
68
+ java_import org.marc4j.MarcPermissiveStreamReader
69
+ java_import org.marc4j.MarcXmlReader
60
70
 
61
71
  end
62
72
 
63
- # Loads marc4j unless it appears to already be loaded.
64
- #
65
- # Will load from settings['marc4j_reader.jar_dir'] if given, otherwise
66
- # bundled vendor location.
67
- #
68
- # Will java_import MarcPermissiveStreamReader and MarcXmlReader so you
69
- # have those available as un-namespaced classes.
70
- def ensure_marc4j_loaded!
71
- unless defined?(MarcPermissiveStreamReader) && defined?(MarcXmlReader)
72
- Traject::Util.require_marc4j_jars(settings)
73
- end
74
- end
75
73
 
76
74
  def internal_reader
77
75
  @internal_reader ||= create_marc_reader!
@@ -101,7 +99,7 @@ class Traject::Marc4JReader
101
99
  while (internal_reader.hasNext)
102
100
  begin
103
101
  marc4j = internal_reader.next
104
- rubymarc = convert_marc4j_to_rubymarc(marc4j)
102
+ rubymarc = @converter.marc4j_to_rubymarc(marc4j)
105
103
  if @settings['marc4j_reader.keep_marc4j']
106
104
  rubymarc.original_marc4j = marc4j
107
105
  end
@@ -123,35 +121,4 @@ class Traject::Marc4JReader
123
121
  @logger ||= (settings[:logger] || Yell.new(STDERR, :level => "gt.fatal")) # null logger)
124
122
  end
125
123
 
126
- def convert_marc4j_to_rubymarc(marc4j)
127
- rmarc = MARC::Record.new
128
- rmarc.leader = marc4j.getLeader.marshal
129
-
130
- marc4j.getControlFields.each do |marc4j_control|
131
- rmarc.append( MARC::ControlField.new(marc4j_control.getTag(), marc4j_control.getData ) )
132
- end
133
-
134
- marc4j.getDataFields.each do |marc4j_data|
135
- rdata = MARC::DataField.new( marc4j_data.getTag, marc4j_data.getIndicator1.chr, marc4j_data.getIndicator2.chr )
136
-
137
- marc4j_data.getSubfields.each do |subfield|
138
-
139
- # We assume Marc21, skip corrupted data
140
- # if subfield.getCode is more than 255, subsequent .chr
141
- # would raise.
142
- if subfield.getCode > 255
143
- logger.warn("Marc4JReader: Corrupted MARC data, record id #{marc4j.getControlNumber}, field #{marc4j_data.tag}, corrupt subfield code byte #{subfield.getCode}. Skipping subfield, but continuing with record.")
144
- next
145
- end
146
-
147
- rsubfield = MARC::Subfield.new(subfield.getCode.chr, subfield.getData)
148
- rdata.append rsubfield
149
- end
150
-
151
- rmarc.append rdata
152
- end
153
-
154
- return rmarc
155
- end
156
-
157
124
  end
@@ -148,10 +148,7 @@ class Traject::SolrJWriter
148
148
 
149
149
  if settings["solrj_writer.batch_size"].to_i > 1
150
150
  ready_batch = []
151
-
152
- # Synchronize access to our shared batched_queue state,
153
- # but once we've pulled out what we want in local var
154
- # `ready_batch`, don't need to synchronize anymore.
151
+
155
152
  batched_queue.add(package)
156
153
  if batched_queue.size >= settings["solrj_writer.batch_size"].to_i
157
154
  batched_queue.drain_to(ready_batch)
data/lib/traject/util.rb CHANGED
@@ -26,50 +26,6 @@ module Traject
26
26
  str.split(':in `').first
27
27
  end
28
28
 
29
- # Requires marc4j jar(s) from settings['marc4j.jar_dir'] if given, otherwise
30
- # uses jars bundled with traject gem in ./vendor
31
- #
32
- # Have to pass in a settings arg, so we can check it for specified jar dir.
33
- #
34
- # Tries not to do the dirglob and require if marc4j has already been loaded.
35
- # Will define global constants with classes MarcPermissiveStreamReader and MarcXmlReader
36
- # if not already defined.
37
- #
38
- # This is all a bit janky, maybe there's a better way to do this? We do want
39
- # a 'require' method defined somewhere utility, so multiple classes can
40
- # use it, including extra gems. This method IS used by extra gems, so should
41
- # be considered part of the API -- after it's called, those top-level
42
- # globals should be available, and marc4j should be loaded.
43
- def self.require_marc4j_jars(settings)
44
- jruby_ensure_init!
45
-
46
- tries = 0
47
- begin
48
- tries += 1
49
-
50
- org.marc4j
51
-
52
- # java_import which we'd normally use weirdly doesn't work
53
- # from a class method. https://github.com/jruby/jruby/issues/975
54
- Object.const_set("MarcPermissiveStreamReader", org.marc4j.MarcPermissiveStreamReader) unless defined? ::MarcPermissiveStreamReader
55
- Object.const_set("MarcXmlReader", org.marc4j.MarcXmlReader) unless defined? ::MarcXmlReader
56
- rescue NameError => e
57
- # /Users/jrochkind/code/solrj-gem/lib"
58
-
59
- include_jar_dir = File.expand_path("../../vendor/marc4j/lib", File.dirname(__FILE__))
60
-
61
- jardir = settings["marc4j.jar_dir"] || include_jar_dir
62
- Dir.glob("#{jardir}/*.jar") do |x|
63
- require x
64
- end
65
-
66
- if tries > 1
67
- raise LoadError.new("Can not find Marc4J java classes")
68
- else
69
- retry
70
- end
71
- end
72
- end
73
29
 
74
30
  # Requires solrj jar(s) from settings['solrj.jar_dir'] if given, otherwise
75
31
  # uses jars bundled with traject gem in ./vendor
@@ -1,3 +1,3 @@
1
1
  module Traject
2
- VERSION = "0.12.0"
2
+ VERSION = "0.13.0"
3
3
  end
@@ -184,6 +184,25 @@ describe "Traject::Indexer#map_record" do
184
184
 
185
185
  assert called, "Called mapping routine"
186
186
  end
187
+
188
+ it "skips records" do
189
+
190
+ @indexer.to_field("beforeSkip") do |rec, acc|
191
+ acc << "Before"
192
+ end
193
+
194
+ @indexer.to_field('radical') do |rec, acc, context|
195
+ context.skip!("Chomsky!") if rec['245'].to_s =~ /Chomsky/
196
+ end
197
+
198
+ @indexer.to_field('afterSkip') do |rec, acc|
199
+ acc << "After. Should never happen"
200
+ end
201
+
202
+ output = @indexer.map_record(@record)
203
+ assert_equal ['Before'], output['beforeSkip']
204
+ assert_nil output['afterSkip']
205
+ end
187
206
 
188
207
  end
189
208
 
data/traject.gemspec CHANGED
@@ -19,6 +19,7 @@ Gem::Specification.new do |spec|
19
19
 
20
20
 
21
21
  spec.add_dependency "marc", ">= 0.7.1"
22
+ spec.add_dependency "marc-marc4j", ">=0.1.1"
22
23
  spec.add_dependency "hashie", ">= 2.0.5", "< 2.1" # used for Indexer#settings
23
24
  spec.add_dependency "slop", ">= 3.4.5", "< 4.0" # command line parsing
24
25
  spec.add_dependency "yell" # logging
metadata CHANGED
@@ -2,14 +2,14 @@
2
2
  name: traject
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.12.0
5
+ version: 0.13.0
6
6
  platform: ruby
7
7
  authors:
8
8
  - Jonathan Rochkind
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-09-10 00:00:00.000000000 Z
12
+ date: 2013-09-12 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: marc
@@ -27,6 +27,22 @@ dependencies:
27
27
  none: false
28
28
  prerelease: false
29
29
  type: :runtime
30
+ - !ruby/object:Gem::Dependency
31
+ name: marc-marc4j
32
+ version_requirements: !ruby/object:Gem::Requirement
33
+ requirements:
34
+ - - '>='
35
+ - !ruby/object:Gem::Version
36
+ version: 0.1.1
37
+ none: false
38
+ requirement: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - '>='
41
+ - !ruby/object:Gem::Version
42
+ version: 0.1.1
43
+ none: false
44
+ prerelease: false
45
+ type: :runtime
30
46
  - !ruby/object:Gem::Dependency
31
47
  name: hashie
32
48
  version_requirements: !ruby/object:Gem::Requirement
@@ -236,8 +252,6 @@ files:
236
252
  - test/translation_maps/translate_array_test.yaml
237
253
  - test/translation_maps/yaml_map.yaml
238
254
  - traject.gemspec
239
- - vendor/marc4j/README.md
240
- - vendor/marc4j/lib/marc4j-2.5.1-beta.jar
241
255
  - vendor/solrj/README
242
256
  - vendor/solrj/build.xml
243
257
  - vendor/solrj/ivy.xml
@@ -1,17 +0,0 @@
1
- The marc4j.jar file that is in `./lib` was created by:
2
-
3
- Checking out the marc4j source from github.com/marc4j/marc4j, and building
4
- with `ant` on 23 July 2013.
5
-
6
- That produced the jar you see, with the name it has. (I am not sure the version
7
- in the jar name is accurate, it's not actually the beta1 release, but the RC1
8
- release, or maybe even a subsequent release?)
9
-
10
- It can be regenerated by doing the same.
11
-
12
- Bundling the marc4j jar with our traject gem is not necessarily the
13
- best way to go, but it's what we're doing now. See top-level README
14
- TODO.
15
-
16
- (You can use your own custom marc4j.jar by using a runtime setting,
17
- see Marc4JReader class docs. )