traject 0.12.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE.txt CHANGED
@@ -1,5 +1,3 @@
1
- Copyright (c) 2013 TODO: Write your name
2
-
3
1
  MIT License
4
2
 
5
3
  Permission is hereby granted, free of charge, to any person obtaining
@@ -205,6 +205,8 @@ class Traject::Indexer
205
205
  # Returns the context passed in as second arg, as a convenience for chaining etc.
206
206
  def map_to_context!(context)
207
207
  @index_steps.each do |index_step|
208
+ # Don't bother if we're skipping this record
209
+ break if context.skip?
208
210
  if index_step[:type] == :to_field
209
211
 
210
212
  accumulator = []
@@ -327,7 +329,12 @@ class Traject::Indexer
327
329
  thread_pool.maybe_in_thread_pool do
328
330
  context = Context.new(:source_record => record, :settings => settings, :position => position)
329
331
  map_to_context!(context)
330
- writer.put context
332
+ if context.skip?
333
+ log_skip(context)
334
+ else
335
+ writer.put context
336
+ end
337
+
331
338
  end
332
339
 
333
340
  end
@@ -353,6 +360,12 @@ class Traject::Indexer
353
360
 
354
361
  return true
355
362
  end
363
+
364
+ # Log that the current record is being skipped, using
365
+ # data in context.position and context.skipmessage
366
+ def log_skip(context)
367
+ logger.debug "Skipped record #{context.position}: #{context.skipmessage}"
368
+ end
356
369
 
357
370
  def reader_class
358
371
  unless defined? @reader_class
@@ -471,11 +484,29 @@ class Traject::Indexer
471
484
  hash_init.each_pair do |key, value|
472
485
  self.send("#{key}=", value)
473
486
  end
487
+
488
+ @skip = false
474
489
  end
475
490
 
476
491
  attr_accessor :clipboard, :output_hash
477
492
  attr_accessor :field_name, :source_record, :settings
478
493
  # 1-based position in stream of processed records.
479
494
  attr_accessor :position
495
+
496
+ # Message explaining why this record is being skipped; set by skip!
497
+ attr_accessor :skipmessage
498
+
499
+ # Set the fact that this record should be skipped, with an
500
+ # optional message
501
+ def skip!(msg = '(no message given)')
502
+ @skipmessage = msg
503
+ @skip = true
504
+ end
505
+
506
+ # Should we skip this record?
507
+ def skip?
508
+ @skip
509
+ end
510
+
480
511
  end
481
512
  end
@@ -1,5 +1,6 @@
1
1
  require 'traject'
2
2
  require 'marc'
3
+ require 'marc/marc4j'
3
4
 
4
5
  # Uses Marc4J to read the marc records, but then translates them to
5
6
  # ruby-marc before delivering them still, Marc4J is just inside the black
@@ -50,28 +51,25 @@ class Traject::Marc4JReader
50
51
  @settings = Traject::Indexer::Settings.new settings
51
52
  @input_stream = input_stream
52
53
 
53
- ensure_marc4j_loaded!
54
-
55
54
  if @settings['marc4j_reader.keep_marc4j'] &&
56
55
  ! (MARC::Record.instance_methods.include?(:original_marc4j) &&
57
56
  MARC::Record.instance_methods.include?(:"original_marc4j="))
58
57
  MARC::Record.class_eval('attr_accessor :original_marc4j')
59
58
  end
59
+
60
+ # Creating a converter will do the following:
61
+ # - nothing, if it detects that the marc4j jar is already loaded
62
+ # - load all the .jar files in settings['marc4j_reader.jar_dir'] if set
63
+ # - load the marc4j jar file bundled with MARC::MARC4J otherwise
64
+
65
+ @converter = MARC::MARC4J.new(:jardir => settings['marc4j_reader.jar_dir'], :logger => logger)
66
+
67
+ # Convenience: make the Marc4J reader classes available as un-namespaced constants
68
+ java_import org.marc4j.MarcPermissiveStreamReader
69
+ java_import org.marc4j.MarcXmlReader
60
70
 
61
71
  end
62
72
 
63
- # Loads solrj unless it appears to already be loaded.
64
- #
65
- # Will load from settings['marc4j_reader.jar_dir'] if given, otherwise
66
- # bundled vendor location.
67
- #
68
- # Will java_import MarcPermissiveStreamReader and MarcXmlReader so you
69
- # have those available as un-namespaced classes.
70
- def ensure_marc4j_loaded!
71
- unless defined?(MarcPermissiveStreamReader) && defined?(MarcXmlReader)
72
- Traject::Util.require_marc4j_jars(settings)
73
- end
74
- end
75
73
 
76
74
  def internal_reader
77
75
  @internal_reader ||= create_marc_reader!
@@ -101,7 +99,7 @@ class Traject::Marc4JReader
101
99
  while (internal_reader.hasNext)
102
100
  begin
103
101
  marc4j = internal_reader.next
104
- rubymarc = convert_marc4j_to_rubymarc(marc4j)
102
+ rubymarc = @converter.marc4j_to_rubymarc(marc4j)
105
103
  if @settings['marc4j_reader.keep_marc4j']
106
104
  rubymarc.original_marc4j = marc4j
107
105
  end
@@ -123,35 +121,4 @@ class Traject::Marc4JReader
123
121
  @logger ||= (settings[:logger] || Yell.new(STDERR, :level => "gt.fatal")) # null logger)
124
122
  end
125
123
 
126
- def convert_marc4j_to_rubymarc(marc4j)
127
- rmarc = MARC::Record.new
128
- rmarc.leader = marc4j.getLeader.marshal
129
-
130
- marc4j.getControlFields.each do |marc4j_control|
131
- rmarc.append( MARC::ControlField.new(marc4j_control.getTag(), marc4j_control.getData ) )
132
- end
133
-
134
- marc4j.getDataFields.each do |marc4j_data|
135
- rdata = MARC::DataField.new( marc4j_data.getTag, marc4j_data.getIndicator1.chr, marc4j_data.getIndicator2.chr )
136
-
137
- marc4j_data.getSubfields.each do |subfield|
138
-
139
- # We assume Marc21, skip corrupted data
140
- # if subfield.getCode is more than 255, subsequent .chr
141
- # would raise.
142
- if subfield.getCode > 255
143
- logger.warn("Marc4JReader: Corrupted MARC data, record id #{marc4j.getControlNumber}, field #{marc4j_data.tag}, corrupt subfield code byte #{subfield.getCode}. Skipping subfield, but continuing with record.")
144
- next
145
- end
146
-
147
- rsubfield = MARC::Subfield.new(subfield.getCode.chr, subfield.getData)
148
- rdata.append rsubfield
149
- end
150
-
151
- rmarc.append rdata
152
- end
153
-
154
- return rmarc
155
- end
156
-
157
124
  end
@@ -148,10 +148,7 @@ class Traject::SolrJWriter
148
148
 
149
149
  if settings["solrj_writer.batch_size"].to_i > 1
150
150
  ready_batch = []
151
-
152
- # Synchronize access to our shared batched_queue state,
153
- # but once we've pulled out what we want in local var
154
- # `ready_batch`, don't need to synchronize anymore.
151
+
155
152
  batched_queue.add(package)
156
153
  if batched_queue.size >= settings["solrj_writer.batch_size"].to_i
157
154
  batched_queue.drain_to(ready_batch)
data/lib/traject/util.rb CHANGED
@@ -26,50 +26,6 @@ module Traject
26
26
  str.split(':in `').first
27
27
  end
28
28
 
29
- # Requires marc4j jar(s) from settings['marc4j.jar_dir'] if given, otherwise
30
- # uses jars bundled with traject gem in ./vendor
31
- #
32
- # Have to pass in a settings arg, so we can check it for specified jar dir.
33
- #
34
- # Tries not to do the dirglob and require if marc4j has already been loaded.
35
- # Will define global constants with classes MarcPermissiveStreamReader and MarcXmlReader
36
- # if not already defined.
37
- #
38
- # This is all a bit janky, maybe there's a better way to do this? We do want
39
- # a 'require' method defined somewhere utility, so multiple classes can
40
- # use it, including extra gems. This method IS used by extra gems, so should
41
- # be considered part of the API -- after it's called, those top-level
42
- # globals should be available, and marc4j should be loaded.
43
- def self.require_marc4j_jars(settings)
44
- jruby_ensure_init!
45
-
46
- tries = 0
47
- begin
48
- tries += 1
49
-
50
- org.marc4j
51
-
52
- # java_import which we'd normally use weirdly doesn't work
53
- # from a class method. https://github.com/jruby/jruby/issues/975
54
- Object.const_set("MarcPermissiveStreamReader", org.marc4j.MarcPermissiveStreamReader) unless defined? ::MarcPermissiveStreamReader
55
- Object.const_set("MarcXmlReader", org.marc4j.MarcXmlReader) unless defined? ::MarcXmlReader
56
- rescue NameError => e
57
- # /Users/jrochkind/code/solrj-gem/lib"
58
-
59
- include_jar_dir = File.expand_path("../../vendor/marc4j/lib", File.dirname(__FILE__))
60
-
61
- jardir = settings["marc4j.jar_dir"] || include_jar_dir
62
- Dir.glob("#{jardir}/*.jar") do |x|
63
- require x
64
- end
65
-
66
- if tries > 1
67
- raise LoadError.new("Can not find Marc4J java classes")
68
- else
69
- retry
70
- end
71
- end
72
- end
73
29
 
74
30
  # Requires solrj jar(s) from settings['solrj.jar_dir'] if given, otherwise
75
31
  # uses jars bundled with traject gem in ./vendor
@@ -1,3 +1,3 @@
1
1
  module Traject
2
- VERSION = "0.12.0"
2
+ VERSION = "0.13.0"
3
3
  end
@@ -184,6 +184,25 @@ describe "Traject::Indexer#map_record" do
184
184
 
185
185
  assert called, "Called mapping routine"
186
186
  end
187
+
188
+ it "skips records" do
189
+
190
+ @indexer.to_field("beforeSkip") do |rec, acc|
191
+ acc << "Before"
192
+ end
193
+
194
+ @indexer.to_field('radical') do |rec, acc, context|
195
+ context.skip!("Chomsky!") if rec['245'].to_s =~ /Chomsky/
196
+ end
197
+
198
+ @indexer.to_field('afterSkip') do |rec, acc|
199
+ acc << "After. Should never happen"
200
+ end
201
+
202
+ output = @indexer.map_record(@record)
203
+ assert_equal ['Before'], output['beforeSkip']
204
+ assert_nil output['afterSkip']
205
+ end
187
206
 
188
207
  end
189
208
 
data/traject.gemspec CHANGED
@@ -19,6 +19,7 @@ Gem::Specification.new do |spec|
19
19
 
20
20
 
21
21
  spec.add_dependency "marc", ">= 0.7.1"
22
+ spec.add_dependency "marc-marc4j", ">=0.1.1"
22
23
  spec.add_dependency "hashie", ">= 2.0.5", "< 2.1" # used for Indexer#settings
23
24
  spec.add_dependency "slop", ">= 3.4.5", "< 4.0" # command line parsing
24
25
  spec.add_dependency "yell" # logging
metadata CHANGED
@@ -2,14 +2,14 @@
2
2
  name: traject
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.12.0
5
+ version: 0.13.0
6
6
  platform: ruby
7
7
  authors:
8
8
  - Jonathan Rochkind
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-09-10 00:00:00.000000000 Z
12
+ date: 2013-09-12 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: marc
@@ -27,6 +27,22 @@ dependencies:
27
27
  none: false
28
28
  prerelease: false
29
29
  type: :runtime
30
+ - !ruby/object:Gem::Dependency
31
+ name: marc-marc4j
32
+ version_requirements: !ruby/object:Gem::Requirement
33
+ requirements:
34
+ - - '>='
35
+ - !ruby/object:Gem::Version
36
+ version: 0.1.1
37
+ none: false
38
+ requirement: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - '>='
41
+ - !ruby/object:Gem::Version
42
+ version: 0.1.1
43
+ none: false
44
+ prerelease: false
45
+ type: :runtime
30
46
  - !ruby/object:Gem::Dependency
31
47
  name: hashie
32
48
  version_requirements: !ruby/object:Gem::Requirement
@@ -236,8 +252,6 @@ files:
236
252
  - test/translation_maps/translate_array_test.yaml
237
253
  - test/translation_maps/yaml_map.yaml
238
254
  - traject.gemspec
239
- - vendor/marc4j/README.md
240
- - vendor/marc4j/lib/marc4j-2.5.1-beta.jar
241
255
  - vendor/solrj/README
242
256
  - vendor/solrj/build.xml
243
257
  - vendor/solrj/ivy.xml
@@ -1,17 +0,0 @@
1
- The marc4j.jar file that is in `./lib` was created by:
2
-
3
- Checking out the marc4j source from github.com/marc4j/marc4j, and building
4
- with `ant` on 23 July 2013.
5
-
6
- That produced the jar you see, with the name it has. (I am not sure the version
7
- in the jar name is accurate, it's not actually the beta1 release, but the RC1
8
- release, or maybe even a subsequent release?)
9
-
10
- It can be regenerated by doing the same.
11
-
12
- Bundling the marc4j jar with our traject gem is not necessarily the
13
- best way to go, but it's what we're doing now. See top-level README
14
- TODO.
15
-
16
- (You can use your own custom marc4j.jar by using a runtime setting,
17
- see Marc4JReader class docs. )