traject 2.1.0-java → 2.2.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,229 @@
1
+ # Represents a single specification for extracting data
2
+ # from a marc field, like "600abc" or "600|1*|x".
3
+ #
4
+ # Includes the tag for reference, although this is redundant and not actually used
5
+ # in logic, since the tag is also implicit in the overall spec_hash
6
+ # with tag => [spec1, spec2]
7
+
8
+
9
+ module Traject
10
+ class MarcExtractor
11
+
# A set of specs, held as a Hash of MARC tag => Array of specs for that tag.
#
# Build one with SpecSet.new(seed), where seed is one of:
# * a String, which is parsed with Spec.hash_from_string
# * a Hash of tag => spec(s); every value is wrapped with Array()
# * another SpecSet, which is handed back as-is (not copied)
class SpecSet

  # The underlying tag => [specs] Hash.
  #
  # NOTE(review): this accessor replaces Object#hash, so a SpecSet probably
  # cannot safely be used as a Hash key. Left in place because callers may
  # depend on the name.
  attr_accessor :hash

  # Builds a SpecSet from the given seed (see class comment).
  # Uses allocate rather than an initializer, so no #initialize is run.
  #
  # Raises ArgumentError when the seed is not a String, Hash or SpecSet.
  def self.new(seedset = {})
    case seedset
    when String
      set = allocate
      set.hash = Spec.hash_from_string(seedset)
      set
    when Hash
      set = allocate
      # Copy into a fresh Hash so the caller's Hash is never mutated, and
      # normalize every value to an Array of specs.
      set.hash = seedset.each_with_object({}) do |(tag, specs), by_tag|
        by_tag[tag] = Array(specs)
      end
      set
    when SpecSet
      seedset
    else
      raise ArgumentError, "SpecSet can only be constructed from a string, a hash, or another SpecSet"
    end
  end

  # Appends a spec under its own tag, creating the list for that tag when
  # this is the first spec seen for it. Returns the list for that tag.
  def add(spec)
    (@hash[spec.tag] ||= []) << spec
  end

  # All tags that have an entry in this set.
  def tags
    @hash.keys
  end

  # The specs registered for a tag, or an empty Array when there are none.
  def specs_for_tag(tag)
    @hash[tag] || []
  end

  # The specs whose tag and indicators match the given field.
  #
  # With use_alternate_script true, the tag used for lookup comes from
  # #effective_tag instead of field.tag.
  def specs_matching_field(field, use_alternate_script = false)
    tag = use_alternate_script ? effective_tag(field) : field.tag
    specs_for_tag(tag).select { |spec| spec.matches_indicators?(field) }
  end

  # The tag to look specs up under. For a field tagged ALTERNATE_SCRIPT_TAG
  # that has a subfield 6, this is the first three bytes of that subfield;
  # otherwise it is simply field.tag.
  def effective_tag(field)
    # Only ask for subfield '6' once the tag matches, as the original
    # short-circuit did; other field types may not respond to #[].
    return field.tag unless field.tag == ALTERNATE_SCRIPT_TAG

    linkage = field['6']
    return field.tag unless linkage

    linkage.encode(linkage.encoding).byteslice(0, 3)
  end

end
70
+
# A single extraction specification for one MARC tag, such as "600abc",
# "600|1*|x" or "008[7-10]".
#
# A spec carries the tag, optional indicator constraints (nil means
# "matches anything"), an optional list of subfield codes (nil means "all
# subfields"), and, for byte-offset specs, a byte position or range.
class Spec
  attr_accessor :tag, :subfields
  attr_reader :indicator1, :indicator2, :byte1, :byte2, :bytes

  # tag, optional |ind1 ind2| pair, optional subfield codes
  DATAFIELD_PATTERN    = /\A([a-zA-Z0-9]{3})(\|([a-z0-9\ \*])([a-z0-9\ \*])\|)?([a-z0-9]*)?\Z/
  # tag followed by [n] or [n-m]
  CONTROLFIELD_PATTERN = /\A([a-zA-Z0-9]{3})(\[(\d+)(-(\d+))?\])\Z/

  # Allow use of a hash to initialize: each key is assigned through the
  # matching public "key=" writer. Should ditch this and use optional
  # keyword args once folks move to 2.x syntax.
  def initialize(hash = nil)
    return unless hash

    hash.each_pair do |key, value|
      # public_send so a stray key can never reach a private method
      public_send("#{key}=", value)
    end
  end

  # Should the extracted subfields be joined, if we have a separator?
  # * '630' no subfields specified => join all subfields
  # * '630abc' multiple subfields specified => join all subfields
  # * '633a' one subfield => do not join, return one value for each $a in the field
  # * '633aa' one subfield, doubled => do join after all, will return a single
  #   string joining all the values of all the $a's.
  #
  # Last case is handled implicitly at the moment when subfields == ['a', 'a']
  def joinable?
    subfields.nil? || subfields.size != 1
  end

  # '*' is the wildcard and is stored as nil ("don't care").
  def indicator1=(ind1)
    @indicator1 = (ind1 == '*' ? nil : ind1)
  end

  # '*' is the wildcard and is stored as nil ("don't care").
  def indicator2=(ind2)
    @indicator2 = (ind2 == '*' ? nil : ind2)
  end

  # Stores the start byte (as an Integer) and recomputes #bytes.
  # A nil argument leaves any previously stored value in place.
  def byte1=(byte1)
    @byte1 = byte1.to_i if byte1
    set_bytes(@byte1, @byte2)
  end

  # Stores the end byte (as an Integer) and recomputes #bytes.
  # A nil argument leaves any previously stored value in place.
  def byte2=(byte2)
    @byte2 = byte2.to_i if byte2
    set_bytes(@byte1, @byte2)
  end

  # Sets #bytes to a Range when both ends are given, or to a single Integer
  # when only byte1 is given. Does nothing when byte1 is nil. Note that this
  # does not touch the values returned by #byte1 / #byte2.
  def set_bytes(byte1, byte2)
    if byte1 && byte2
      @bytes = (byte1.to_i)..(byte2.to_i)
    elsif byte1
      @bytes = byte1.to_i
    end
  end

  # Pass in a MARC field: do its indicators match the indicators in this
  # spec? nil indicators in the spec mean we don't care, everything matches.
  def matches_indicators?(field)
    (indicator1.nil? || indicator1 == field.indicator1) &&
      (indicator2.nil? || indicator2 == field.indicator2)
  end

  # Pass in a string subfield code like 'a'; does this spec include it?
  def includes_subfield_code?(code)
    # subfields nil means include them all
    subfields.nil? || subfields.include?(code)
  end

  # Simple equality definition: same tag, subfields, indicators and bytes.
  def ==(spec)
    return false unless spec.kind_of?(Spec)

    tag == spec.tag &&
      subfields == spec.subfields &&
      indicator1 == spec.indicator1 &&
      indicator2 == spec.indicator2 &&
      bytes == spec.bytes
  end

  # Converts a string marc spec like "008[35]:245abc:700a" (or an Array of
  # such strings) into the hash used internally to represent the
  # specification.
  #
  # Individual specs are separated by colons; whitespace around a colon is
  # ignored.
  #
  # ## Return value
  #
  # The hash returned is keyed by tag, and has as values an array of one or
  # more MarcExtractor::Spec objects representing the specified extraction
  # operations for that tag. The hash has no default value: a tag that was
  # not mentioned is simply absent.
  #
  # It's an array of possibly more than one, because you can specify
  # multiple extractions on the same tag: for instance "245a:245abc"
  #
  # Raises ArgumentError for a part that is neither a datafield spec nor a
  # byte-offset spec.
  def self.hash_from_string(spec_string)
    # Accept a single string or an array of strings; split each on colon.
    parts = Array(spec_string).flat_map { |s| s.split(/\s*:\s*/) }

    parts.each_with_object({}) do |part, specs_by_tag|
      spec =
        if (m = DATAFIELD_PATTERN.match(part))
          create_datafield_spec(m[1], m[3], m[4], m[5])
        elsif (m = CONTROLFIELD_PATTERN.match(part))
          create_controlfield_spec(m[1], m[3], m[5])
        else
          raise ArgumentError, "Unrecognized marc extract specification: #{part}"
        end

      (specs_by_tag[spec.tag] ||= []) << spec
    end
  end

  # Create a new datafield spec. The '*' wildcard handling lives in the
  # indicator writers; an empty subfield string means "all subfields" and
  # leaves #subfields nil.
  def self.create_datafield_spec(tag, ind1, ind2, subfields)
    spec = Spec.new(:tag => tag)
    spec.indicator1 = ind1
    spec.indicator2 = ind2
    spec.subfields  = subfields.chars if subfields && !subfields.empty?
    spec
  end

  # Create a new controlfield spec with the given byte position or range.
  def self.create_controlfield_spec(tag, byte1, byte2)
    spec = Spec.new(:tag => tag)
    spec.set_bytes(byte1, byte2)
    spec
  end

end
226
+ end
227
+
228
+ end
229
+
@@ -1,3 +1,3 @@
1
1
  module Traject
2
- VERSION = "2.1.0"
2
+ VERSION = "2.2.0"
3
3
  end
@@ -32,6 +32,47 @@ describe 'Simple output' do
32
32
 
33
33
  end
34
34
 
# A record whose output hash lacks the id field should still be written
# (without an id line) and should produce a logged warning.
it "deals ok with a missing ID" do
  context      = Traject::Indexer::Context.new(:output_hash => @indexer.map_record(@record))
  logger_strio = StringIO.new
  idfield      = 'id'

  # Capture log output in memory so the warning can be asserted on below.
  context.logger   = Logger.new(logger_strio)
  context.position = 1

  # Drop the id so the writer sees a record without one.
  context.output_hash.delete(idfield)
  @writer.put context

  expected = [
    "record_num_1 title #{@title}",
  ]
  # Whitespace is stripped from both sides, so only the non-blank content is compared.
  assert_equal expected.join("\n").gsub(/\s/, ''), @io.string.gsub(/\s/, '')
  # Parenthesized: a bare regexp literal as first argument is ambiguous to the parser.
  assert_match(/At least one record \(\#1\) doesn't define field 'id'/, logger_strio.string)
  @writer.close
end
53
+
# A writer configured with "debug_writer.idfield" should look for that field
# name: here the record has no 'iden' field, so the warning names 'iden'.
it "sets the idfield correctly" do
  bad_rec_id_field = 'iden'
  writer = Traject::DebugWriter.new("output_stream" => @io, "debug_writer.idfield" => bad_rec_id_field)

  context = Traject::Indexer::Context.new(:output_hash => @indexer.map_record(@record))

  # Capture log output in memory so the warning can be asserted on below.
  logger_strio = StringIO.new

  context.logger   = Logger.new(logger_strio)
  context.position = 1

  writer.put context

  expected = [
    "record_num_1 id #{@id}",
    "record_num_1 title #{@title}",
  ]
  # Whitespace is stripped from both sides, so only the non-blank content is compared.
  assert_equal expected.join("\n").gsub(/\s/, ''), @io.string.gsub(/\s/, '')
  # Parenthesized: a bare regexp literal as first argument is ambiguous to the parser.
  assert_match(/At least one record \(\#1\) doesn't define field 'iden'/, logger_strio.string)
  writer.close
end
75
+
35
76
  end
36
77
 
37
78
 
@@ -9,14 +9,14 @@ describe "Traject::MarcExtractor" do
9
9
  it "is frozen read-only" do
10
10
  extractor = Traject::MarcExtractor.new("100abcde", :seperator => ";")
11
11
  assert extractor.frozen?
12
- assert extractor.spec_hash.frozen?
12
+ assert extractor.spec_set.frozen?
13
13
  assert extractor.options.frozen?
14
14
  end
15
15
 
16
16
 
17
17
  describe "#parse_marc_spec" do
18
18
  it "parses single spec with all elements" do
19
- parsed = Traject::MarcExtractor.parse_string_spec("245|1*|abcg")
19
+ parsed = Traject::MarcExtractor::Spec.hash_from_string("245|1*|abcg")
20
20
 
21
21
  assert_kind_of Hash, parsed
22
22
  assert_equal 1, parsed.keys.length
@@ -30,7 +30,7 @@ describe "Traject::MarcExtractor" do
30
30
  end
31
31
 
32
32
  it "parses a mixed bag" do
33
- parsed = Traject::MarcExtractor.parse_string_spec("245abcde:810:700|*4|bcd")
33
+ parsed = Traject::MarcExtractor::Spec.hash_from_string("245abcde:810:700|*4|bcd")
34
34
  spec245 = parsed['245'].first
35
35
  spec810 = parsed['810'].first
36
36
  spec700 = parsed['700'].first
@@ -57,14 +57,14 @@ describe "Traject::MarcExtractor" do
57
57
  end
58
58
 
59
59
  it "parses fixed field byte offsets" do
60
- parsed = Traject::MarcExtractor.parse_string_spec("005[5]:008[7-10]")
60
+ parsed = Traject::MarcExtractor::Spec.hash_from_string("005[5]:008[7-10]")
61
61
 
62
62
  assert_equal 5, parsed["005"].first.bytes
63
63
  assert_equal 7..10, parsed["008"].first.bytes
64
64
  end
65
65
 
66
66
  it "allows arrays of specs" do
67
- parsed = Traject::MarcExtractor.parse_string_spec %w(
67
+ parsed = Traject::MarcExtractor::Spec.hash_from_string %w(
68
68
  245abcde
69
69
  810
70
70
  700|*4|bcd
@@ -73,7 +73,7 @@ describe "Traject::MarcExtractor" do
73
73
  end
74
74
 
75
75
  it "allows mixture of array and colon-delimited specs" do
76
- parsed = Traject::MarcExtractor.parse_string_spec %w(
76
+ parsed = Traject::MarcExtractor::Spec.hash_from_string %w(
77
77
  245abcde
78
78
  100:110:111
79
79
  810
@@ -127,13 +127,13 @@ describe "Traject::MarcExtractor" do
127
127
 
128
128
  describe "#extract_by_spec" do
129
129
  before do
130
- @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
130
+ @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").first
131
131
  end
132
132
 
133
133
  describe "extracts a basic case" do
134
134
  before do
135
- parsed_spec = Traject::MarcExtractor.parse_string_spec("700abcdef:856|*2|:505|1*|:245ba")
136
- @values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
135
+ @parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("700abcdef:856|*2|:505|1*|:245ba")
136
+ @values = Traject::MarcExtractor.new(@parsed_spec).extract(@record)
137
137
  end
138
138
 
139
139
  it "returns an array" do
@@ -150,7 +150,7 @@ describe "Traject::MarcExtractor" do
150
150
  end
151
151
 
152
152
  it "does not have 505, due to non-matching indicators" do
153
- assert ! @values.find {|s| s.include? "propaganda model"}
153
+ assert !@values.find { |s| s.include? "propaganda model" }, @values
154
154
  end
155
155
 
156
156
 
@@ -166,20 +166,20 @@ describe "Traject::MarcExtractor" do
166
166
 
167
167
  describe "extracts fixed fields" do
168
168
  it ", complete" do
169
- parsed_spec = Traject::MarcExtractor.parse_string_spec("001")
170
- values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
169
+ parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("001")
170
+ values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
171
171
 
172
172
  assert_equal ["2710183"], values
173
173
  end
174
174
  it ", single byte offset" do
175
- parsed_spec = Traject::MarcExtractor.parse_string_spec("008[5]")
176
- values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
175
+ parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("008[5]")
176
+ values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
177
177
 
178
178
  assert_equal ["1"], values
179
179
  end
180
180
  it ", byte range" do
181
- parsed_spec = Traject::MarcExtractor.parse_string_spec("008[7-10]")
182
- values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
181
+ parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("008[7-10]")
182
+ values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
183
183
 
184
184
  assert_equal ["2002"], values
185
185
  end
@@ -187,15 +187,15 @@ describe "Traject::MarcExtractor" do
187
187
 
188
188
  describe "separator argument" do
189
189
  it "causes non-join when nil" do
190
- parsed_spec = Traject::MarcExtractor.parse_string_spec("245")
191
- values = Traject::MarcExtractor.new(parsed_spec, :separator => nil).extract(@record)
190
+ parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("245")
191
+ values = Traject::MarcExtractor.new(parsed_spec, :separator => nil).extract(@record)
192
192
 
193
193
  assert_length 3, values
194
194
  end
195
195
 
196
196
  it "can be non-default" do
197
- parsed_spec = Traject::MarcExtractor.parse_string_spec("245")
198
- values = Traject::MarcExtractor.new(parsed_spec, :separator => "!! ").extract(@record)
197
+ parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("245")
198
+ values = Traject::MarcExtractor.new(parsed_spec, :separator => "!! ").extract(@record)
199
199
 
200
200
  assert_length 1, values
201
201
  assert_equal "Manufacturing consent :!! the political economy of the mass media /!! Edward S. Herman and Noam Chomsky ; with a new introduction by the authors.", values.first
@@ -204,8 +204,8 @@ describe "Traject::MarcExtractor" do
204
204
 
205
205
  describe "extracts alternate script" do
206
206
  before do
207
- @record = MARC::Reader.new(support_file_path "hebrew880s.marc").to_a.first
208
- @parsed_spec = Traject::MarcExtractor.parse_string_spec("245b")
207
+ @record = MARC::Reader.new(support_file_path "hebrew880s.marc").to_a.first
208
+ @parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("245b")
209
209
  end
210
210
  it "from default :include" do
211
211
 
@@ -301,10 +301,10 @@ describe "Traject::MarcExtractor" do
301
301
  describe "MarcExtractor.cached" do
302
302
  it "creates" do
303
303
  extractor = Traject::MarcExtractor.cached("245abc", :separator => nil)
304
- spec_hash = extractor.spec_hash
304
+ spec_set = extractor.spec_set
305
305
 
306
306
  assert extractor.options[:separator].nil?, "extractor options[:separator] is nil"
307
- assert_equal({"245"=>[Traject::MarcExtractor::Spec.new(:tag => "245", :subfields=>["a", "b", "c"])]}, spec_hash)
307
+ assert_equal([Traject::MarcExtractor::Spec.new(:tag => "245", :subfields => ["a", "b", "c"])], spec_set.specs_for_tag('245'))
308
308
  end
309
309
  it "caches" do
310
310
  ext1 = Traject::MarcExtractor.cached("245abc", :separator => nil)
@@ -1,4 +1,4 @@
1
- # A sample traject configration, save as say `traject_config.rb`, then
1
+ # A sample traject configuration, save as say `traject_config.rb`, then
2
2
  # run `traject -c traject_config.rb marc_file.marc` to index to
3
3
  # solr specified in config file, according to rules specified in
4
4
  # config file
data/traject.gemspec CHANGED
@@ -29,19 +29,19 @@ Gem::Specification.new do |spec|
29
29
  spec.add_dependency "dot-properties", ">= 0.1.1" # reading java style .properties
30
30
  spec.add_dependency "httpclient", "~> 2.5"
31
31
  spec.add_dependency 'marc-fastxmlwriter', '~>1.0' # fast marc->xml
32
-
33
- # If we're building the package under JRuby, add in the
32
+
33
+ # If we're building the package under JRuby, add in the
34
34
  # jruby-only gems and specify the platform.
35
-
35
+
36
36
  if defined? JRUBY_VERSION
37
37
  spec.platform = 'java'
38
38
  spec.add_dependency "traject-marc4j_reader", "~> 1.0"
39
39
  else
40
40
  spec.platform = "ruby"
41
41
  end
42
-
43
42
 
44
- spec.add_development_dependency "bundler", "~> 1.3"
43
+
44
+ spec.add_development_dependency "bundler", "~> 1.7"
45
45
  spec.add_development_dependency "rake"
46
46
  spec.add_development_dependency "minitest"
47
47
  end