traject 2.1.0 → 2.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,229 @@
1
+ # Represents a single specification for extracting data
2
+ # from a marc field, like "600abc" or "600|1*|x".
3
+ #
4
+ # Includes the tag for reference, although this is redundant and not actually used
5
+ # in logic, since the tag is also implicit in the overall spec_hash
6
+ # with tag => [spec1, spec2]
7
+
8
+
9
+ module Traject
10
+ class MarcExtractor
11
+
12
# A set of extraction specs, grouped by MARC tag.
#
# Internally this wraps a Hash of the form `tag => [spec, spec, ...]`,
# exposed through the `hash` accessor.
#
# NOTE(review): the `hash` accessor shadows Object#hash, so a SpecSet is
# not safe to use as a Hash key. It is left as-is here because callers
# may depend on the accessor -- confirm before renaming.
class SpecSet
  attr_accessor :hash

  # Builds a SpecSet from a seed value.
  #
  # * String  -- parsed with Spec.hash_from_string
  # * Hash    -- copied, with every value coerced to an Array via Array()
  # * SpecSet -- returned unchanged (the very same object, not a copy)
  #
  # Anything else raises ArgumentError. Because `new` itself is overridden
  # and instances are created with `allocate`, no `initialize` is run.
  def self.new(seedset = {})
    case seedset
    when SpecSet
      seedset
    when String
      allocate.tap { |set| set.hash = Spec.hash_from_string(seedset) }
    when Hash
      by_tag = {}
      seedset.each_pair { |tag, specs| by_tag[tag] = Array(specs) }
      allocate.tap { |set| set.hash = by_tag }
    else
      raise ArgumentError, "SpecSet can only be constructed from a string, a hash, or another SpecSet"
    end
  end

  # Adds a spec under its own tag. The per-tag list is created on demand,
  # so adding a spec for a tag the set has not seen before works.
  def add(spec)
    (@hash[spec.tag] ||= []) << spec
  end

  # All tags that currently have an entry in the set.
  def tags
    @hash.keys
  end

  # The specs registered for +tag+, or an empty array when there are none.
  def specs_for_tag(tag)
    @hash[tag] || []
  end

  # The specs that apply to +field+: those registered for the field's tag
  # whose indicators match the field. When +use_alternate_script+ is true
  # the tag is resolved through #effective_tag instead of field.tag.
  def specs_matching_field(field, use_alternate_script = false)
    tag = use_alternate_script ? effective_tag(field) : field.tag
    specs_for_tag(tag).select { |spec| spec.matches_indicators?(field) }
  end

  # For a field tagged ALTERNATE_SCRIPT_TAG that has a $6 subfield, returns
  # the first three bytes of that $6 value; otherwise returns the field's
  # own tag.
  def effective_tag(field)
    if field.tag == ALTERNATE_SCRIPT_TAG && field['6']
      # The encode call targets the string's current encoding; it is kept
      # exactly as the original code had it.
      field["6"].encode(field["6"].encoding).byteslice(0, 3)
    else
      field.tag
    end
  end
end
70
+
71
# One extraction specification for a single MARC tag: either a data field
# spec (optional indicators and subfield codes, e.g. "600abc" or
# "600|1*|x") or a control field spec (optional byte offset or byte
# range, e.g. "008[35]" or "008[7-10]").
class Spec
  attr_accessor :tag, :subfields
  attr_reader :indicator1, :indicator2, :byte1, :byte2, :bytes

  # Allow use of a hash to initialize; every key is applied through the
  # matching `key=` setter. Should ditch this and use optional keyword
  # args once folks move to 2.x syntax.
  def initialize(hash = nil)
    return unless hash

    hash.each_pair { |key, value| public_send("#{key}=", value) }
  end

  # Should the extracted subfields be joined, if we have a separator?
  # * '630'    no subfields specified => join all subfields
  # * '630abc' multiple subfields specified => join all subfields
  # * '633a'   one subfield => do not join, return one value for each $a in the field
  # * '633aa'  one subfield, doubled => do join after all, will return a
  #            single string joining all the values of all the $a's.
  #
  # The last case is handled implicitly, because subfields == ['a', 'a']
  # has size 2.
  def joinable?
    subfields.nil? || subfields.size != 1
  end

  # Sets the first indicator; the wildcard '*' is stored as nil ("don't care").
  def indicator1=(ind1)
    @indicator1 = (ind1 == '*' ? nil : ind1)
  end

  # Sets the second indicator; the wildcard '*' is stored as nil ("don't care").
  def indicator2=(ind2)
    @indicator2 = (ind2 == '*' ? nil : ind2)
  end

  # Sets the first byte offset (coerced with to_i) and recomputes #bytes.
  # A nil argument leaves the stored value untouched.
  def byte1=(byte1)
    @byte1 = byte1.to_i if byte1
    set_bytes(@byte1, @byte2)
  end

  # Sets the last byte offset (coerced with to_i) and recomputes #bytes.
  # A nil argument leaves the stored value untouched.
  def byte2=(byte2)
    @byte2 = byte2.to_i if byte2
    set_bytes(@byte1, @byte2)
  end

  # Computes #bytes from the given offsets: an inclusive Range when both
  # are given, a single Integer when only byte1 is given. When byte1 is
  # nil, #bytes is left as it was.
  def set_bytes(byte1, byte2)
    if byte1 && byte2
      @bytes = (byte1.to_i..byte2.to_i)
    elsif byte1
      @bytes = byte1.to_i
    end
  end

  # Pass in a MARC field: do its indicators match the indicators in this
  # spec? A nil indicator in the spec means we don't care, everything
  # matches.
  def matches_indicators?(field)
    (indicator1.nil? || indicator1 == field.indicator1) &&
      (indicator2.nil? || indicator2 == field.indicator2)
  end

  # Pass in a string subfield code like 'a'; does this spec include it?
  def includes_subfield_code?(code)
    # subfields nil means include them all
    subfields.nil? || subfields.include?(code)
  end

  # Simple equality definition: same tag, subfields, indicators and bytes.
  def ==(spec)
    return false unless spec.kind_of?(Spec)

    tag == spec.tag &&
      subfields == spec.subfields &&
      indicator1 == spec.indicator1 &&
      indicator2 == spec.indicator2 &&
      bytes == spec.bytes
  end

  DATAFIELD_PATTERN = /\A([a-zA-Z0-9]{3})(\|([a-z0-9\ \*])([a-z0-9\ \*])\|)?([a-z0-9]*)?\Z/
  CONTROLFIELD_PATTERN = /\A([a-zA-Z0-9]{3})(\[(\d+)(-(\d+))?\])\Z/

  # Individual specs are separated by a colon, with optional whitespace on
  # either side. Shared by the String and Array input paths so they cannot
  # drift apart.
  SPEC_DELIMITER = /\s*:\s*/

  # Converts from a string marc spec like "008[35]:245abc:700a" to a hash
  # used internally to represent the specification. See the comments at
  # the head of the MarcExtractor class for documentation of the string
  # specification format.
  #
  # +spec_string+ may also be an Array of such strings; each element is
  # itself split on colons.
  #
  # ## Return value
  #
  # The hash returned is keyed by tag, and has as values an array of one
  # or more MarcExtractor::Spec objects representing the specified
  # extraction operations for that tag.
  #
  # It's an array of possibly more than one, because you can specify
  # multiple extractions on the same tag: for instance "245a:245abc"
  #
  # Raises ArgumentError when a part matches neither the data field nor
  # the control field pattern.
  #
  # See tests for more examples.
  def self.hash_from_string(spec_string)
    parts =
      if spec_string.is_a?(Array)
        spec_string.flat_map { |s| s.split(SPEC_DELIMITER) }
      else
        spec_string.split(SPEC_DELIMITER)
      end

    hash = {}
    parts.each do |part|
      spec =
        if (m = DATAFIELD_PATTERN.match(part))
          # captures: tag, indicator 1, indicator 2, subfield codes
          create_datafield_spec(m[1], m[3], m[4], m[5])
        elsif (m = CONTROLFIELD_PATTERN.match(part))
          # captures: tag, first byte, optional last byte
          create_controlfield_spec(m[1], m[3], m[5])
        else
          raise ArgumentError, "Unrecognized marc extract specification: #{part}"
        end

      (hash[spec.tag] ||= []) << spec
    end

    hash
  end

  # Create a new datafield spec. Most of the logic about how to deal
  # with special characters is built into the Spec class.
  def self.create_datafield_spec(tag, ind1, ind2, subfields)
    spec = Spec.new(:tag => tag)
    spec.indicator1 = ind1
    spec.indicator2 = ind2
    # An empty subfield string leaves subfields nil, i.e. "all subfields".
    spec.subfields = subfields.split('') if subfields && !subfields.empty?
    spec
  end

  # Create a new controlfield spec
  def self.create_controlfield_spec(tag, byte1, byte2)
    spec = Spec.new(:tag => tag)
    spec.set_bytes(byte1, byte2)
    spec
  end
end
226
+ end
227
+
228
+ end
229
+
@@ -1,3 +1,3 @@
1
1
  module Traject
2
- VERSION = "2.1.0"
2
+ VERSION = "2.2.0"
3
3
  end
@@ -32,6 +32,47 @@ describe 'Simple output' do
32
32
 
33
33
  end
34
34
 
35
# Writes a context whose output hash has had its 'id' field removed and
# checks two things: the written lines are prefixed with `record_num_1`,
# and a warning naming the missing 'id' field reaches the context's logger.
it "deals ok with a missing ID" do
  context = Traject::Indexer::Context.new(:output_hash => @indexer.map_record(@record))
  logger_strio = StringIO.new
  idfield = 'id'

  # Capture log output in memory so the warning text can be asserted on.
  context.logger = Logger.new(logger_strio)
  context.position = 1

  context.output_hash.delete(idfield)
  @writer.put context
  expected = [
    "record_num_1 title #{@title}",
  ]
  # Compare with all whitespace stripped, so column padding does not matter.
  assert_equal expected.join("\n").gsub(/\s/, ''), @io.string.gsub(/\s/, '')
  # Parenthesized: a bare regex literal as first argument is ambiguous to the parser.
  assert_match(/At least one record \(\#1\) doesn't define field 'id'/, logger_strio.string)
  @writer.close
end
53
+
54
# Configures a DebugWriter with "debug_writer.idfield" set to 'iden', a
# field the mapped record does not define. Checks that the written lines
# are prefixed with `record_num_1` and that the logged warning names the
# configured field ('iden').
it "sets the idfield correctly" do
  bad_rec_id_field = 'iden'
  writer = Traject::DebugWriter.new("output_stream" => @io, "debug_writer.idfield" => bad_rec_id_field)

  context = Traject::Indexer::Context.new(:output_hash => @indexer.map_record(@record))

  # Capture log output in memory so the warning text can be asserted on.
  logger_strio = StringIO.new

  context.logger = Logger.new(logger_strio)
  context.position = 1

  writer.put context
  expected = [
    "record_num_1 id #{@id}",
    "record_num_1 title #{@title}",
  ]
  # Compare with all whitespace stripped, so column padding does not matter.
  assert_equal expected.join("\n").gsub(/\s/, ''), @io.string.gsub(/\s/, '')
  # Parenthesized: a bare regex literal as first argument is ambiguous to the parser.
  assert_match(/At least one record \(\#1\) doesn't define field 'iden'/, logger_strio.string)
  writer.close
end
75
+
35
76
  end
36
77
 
37
78
 
@@ -9,14 +9,14 @@ describe "Traject::MarcExtractor" do
9
9
  it "is frozen read-only" do
10
10
  extractor = Traject::MarcExtractor.new("100abcde", :seperator => ";")
11
11
  assert extractor.frozen?
12
- assert extractor.spec_hash.frozen?
12
+ assert extractor.spec_set.frozen?
13
13
  assert extractor.options.frozen?
14
14
  end
15
15
 
16
16
 
17
17
  describe "#parse_marc_spec" do
18
18
  it "parses single spec with all elements" do
19
- parsed = Traject::MarcExtractor.parse_string_spec("245|1*|abcg")
19
+ parsed = Traject::MarcExtractor::Spec.hash_from_string("245|1*|abcg")
20
20
 
21
21
  assert_kind_of Hash, parsed
22
22
  assert_equal 1, parsed.keys.length
@@ -30,7 +30,7 @@ describe "Traject::MarcExtractor" do
30
30
  end
31
31
 
32
32
  it "parses a mixed bag" do
33
- parsed = Traject::MarcExtractor.parse_string_spec("245abcde:810:700|*4|bcd")
33
+ parsed = Traject::MarcExtractor::Spec.hash_from_string("245abcde:810:700|*4|bcd")
34
34
  spec245 = parsed['245'].first
35
35
  spec810 = parsed['810'].first
36
36
  spec700 = parsed['700'].first
@@ -57,14 +57,14 @@ describe "Traject::MarcExtractor" do
57
57
  end
58
58
 
59
59
  it "parses fixed field byte offsets" do
60
- parsed = Traject::MarcExtractor.parse_string_spec("005[5]:008[7-10]")
60
+ parsed = Traject::MarcExtractor::Spec.hash_from_string("005[5]:008[7-10]")
61
61
 
62
62
  assert_equal 5, parsed["005"].first.bytes
63
63
  assert_equal 7..10, parsed["008"].first.bytes
64
64
  end
65
65
 
66
66
  it "allows arrays of specs" do
67
- parsed = Traject::MarcExtractor.parse_string_spec %w(
67
+ parsed = Traject::MarcExtractor::Spec.hash_from_string %w(
68
68
  245abcde
69
69
  810
70
70
  700|*4|bcd
@@ -73,7 +73,7 @@ describe "Traject::MarcExtractor" do
73
73
  end
74
74
 
75
75
  it "allows mixture of array and colon-delimited specs" do
76
- parsed = Traject::MarcExtractor.parse_string_spec %w(
76
+ parsed = Traject::MarcExtractor::Spec.hash_from_string %w(
77
77
  245abcde
78
78
  100:110:111
79
79
  810
@@ -127,13 +127,13 @@ describe "Traject::MarcExtractor" do
127
127
 
128
128
  describe "#extract_by_spec" do
129
129
  before do
130
- @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
130
+ @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").first
131
131
  end
132
132
 
133
133
  describe "extracts a basic case" do
134
134
  before do
135
- parsed_spec = Traject::MarcExtractor.parse_string_spec("700abcdef:856|*2|:505|1*|:245ba")
136
- @values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
135
+ @parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("700abcdef:856|*2|:505|1*|:245ba")
136
+ @values = Traject::MarcExtractor.new(@parsed_spec).extract(@record)
137
137
  end
138
138
 
139
139
  it "returns an array" do
@@ -150,7 +150,7 @@ describe "Traject::MarcExtractor" do
150
150
  end
151
151
 
152
152
  it "does not have 505, due to non-matching indicators" do
153
- assert ! @values.find {|s| s.include? "propaganda model"}
153
+ assert !@values.find { |s| s.include? "propaganda model" }, @values
154
154
  end
155
155
 
156
156
 
@@ -166,20 +166,20 @@ describe "Traject::MarcExtractor" do
166
166
 
167
167
  describe "extracts fixed fields" do
168
168
  it ", complete" do
169
- parsed_spec = Traject::MarcExtractor.parse_string_spec("001")
170
- values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
169
+ parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("001")
170
+ values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
171
171
 
172
172
  assert_equal ["2710183"], values
173
173
  end
174
174
  it ", single byte offset" do
175
- parsed_spec = Traject::MarcExtractor.parse_string_spec("008[5]")
176
- values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
175
+ parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("008[5]")
176
+ values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
177
177
 
178
178
  assert_equal ["1"], values
179
179
  end
180
180
  it ", byte range" do
181
- parsed_spec = Traject::MarcExtractor.parse_string_spec("008[7-10]")
182
- values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
181
+ parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("008[7-10]")
182
+ values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
183
183
 
184
184
  assert_equal ["2002"], values
185
185
  end
@@ -187,15 +187,15 @@ describe "Traject::MarcExtractor" do
187
187
 
188
188
  describe "separator argument" do
189
189
  it "causes non-join when nil" do
190
- parsed_spec = Traject::MarcExtractor.parse_string_spec("245")
191
- values = Traject::MarcExtractor.new(parsed_spec, :separator => nil).extract(@record)
190
+ parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("245")
191
+ values = Traject::MarcExtractor.new(parsed_spec, :separator => nil).extract(@record)
192
192
 
193
193
  assert_length 3, values
194
194
  end
195
195
 
196
196
  it "can be non-default" do
197
- parsed_spec = Traject::MarcExtractor.parse_string_spec("245")
198
- values = Traject::MarcExtractor.new(parsed_spec, :separator => "!! ").extract(@record)
197
+ parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("245")
198
+ values = Traject::MarcExtractor.new(parsed_spec, :separator => "!! ").extract(@record)
199
199
 
200
200
  assert_length 1, values
201
201
  assert_equal "Manufacturing consent :!! the political economy of the mass media /!! Edward S. Herman and Noam Chomsky ; with a new introduction by the authors.", values.first
@@ -204,8 +204,8 @@ describe "Traject::MarcExtractor" do
204
204
 
205
205
  describe "extracts alternate script" do
206
206
  before do
207
- @record = MARC::Reader.new(support_file_path "hebrew880s.marc").to_a.first
208
- @parsed_spec = Traject::MarcExtractor.parse_string_spec("245b")
207
+ @record = MARC::Reader.new(support_file_path "hebrew880s.marc").to_a.first
208
+ @parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("245b")
209
209
  end
210
210
  it "from default :include" do
211
211
 
@@ -301,10 +301,10 @@ describe "Traject::MarcExtractor" do
301
301
  describe "MarcExtractor.cached" do
302
302
  it "creates" do
303
303
  extractor = Traject::MarcExtractor.cached("245abc", :separator => nil)
304
- spec_hash = extractor.spec_hash
304
+ spec_set = extractor.spec_set
305
305
 
306
306
  assert extractor.options[:separator].nil?, "extractor options[:separator] is nil"
307
- assert_equal({"245"=>[Traject::MarcExtractor::Spec.new(:tag => "245", :subfields=>["a", "b", "c"])]}, spec_hash)
307
+ assert_equal([Traject::MarcExtractor::Spec.new(:tag => "245", :subfields => ["a", "b", "c"])], spec_set.specs_for_tag('245'))
308
308
  end
309
309
  it "caches" do
310
310
  ext1 = Traject::MarcExtractor.cached("245abc", :separator => nil)
@@ -1,4 +1,4 @@
1
- # A sample traject configration, save as say `traject_config.rb`, then
1
+ # A sample traject configuration, save as say `traject_config.rb`, then
2
2
  # run `traject -c traject_config.rb marc_file.marc` to index to
3
3
  # solr specified in config file, according to rules specified in
4
4
  # config file
data/traject.gemspec CHANGED
@@ -29,19 +29,19 @@ Gem::Specification.new do |spec|
29
29
  spec.add_dependency "dot-properties", ">= 0.1.1" # reading java style .properties
30
30
  spec.add_dependency "httpclient", "~> 2.5"
31
31
  spec.add_dependency 'marc-fastxmlwriter', '~>1.0' # fast marc->xml
32
-
33
- # If we're building the package under JRuby, add in the
32
+
33
+ # If we're building the package under JRuby, add in the
34
34
  # jruby-only gems and specify the platform.
35
-
35
+
36
36
  if defined? JRUBY_VERSION
37
37
  spec.platform = 'java'
38
38
  spec.add_dependency "traject-marc4j_reader", "~> 1.0"
39
39
  else
40
40
  spec.platform = "ruby"
41
41
  end
42
-
43
42
 
44
- spec.add_development_dependency "bundler", "~> 1.3"
43
+
44
+ spec.add_development_dependency "bundler", "~> 1.7"
45
45
  spec.add_development_dependency "rake"
46
46
  spec.add_development_dependency "minitest"
47
47
  end