traject 2.1.0 → 2.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.travis.yml +8 -20
- data/CHANGES.md +14 -0
- data/README.md +35 -56
- data/doc/extending.md +20 -27
- data/doc/indexing_rules.md +46 -57
- data/doc/settings.md +17 -48
- data/lib/traject/debug_writer.rb +31 -5
- data/lib/traject/indexer.rb +6 -4
- data/lib/traject/marc_extractor.rb +37 -157
- data/lib/traject/marc_extractor_spec.rb +229 -0
- data/lib/traject/version.rb +1 -1
- data/test/debug_writer_test.rb +41 -0
- data/test/marc_extractor_test.rb +24 -24
- data/test/test_support/demo_config.rb +1 -1
- data/traject.gemspec +5 -5
- metadata +6 -5
@@ -0,0 +1,229 @@
|
|
1
|
+
# Represents a single specification for extracting data
|
2
|
+
# from a marc field, like "600abc" or "600|1*|x".
|
3
|
+
#
|
4
|
+
# Includes the tag for reference, although this is redundant and not actually used
|
5
|
+
# in logic, since the tag is also implicit in the overall spec_hash
|
6
|
+
# with tag => [spec1, spec2]
|
7
|
+
|
8
|
+
|
9
|
+
module Traject
|
10
|
+
class MarcExtractor
|
11
|
+
|
12
|
+
# A set of Spec objects grouped by MARC tag; a thin wrapper around a Hash
# of tag => [spec1, spec2, ...].
class SpecSet

  # The underlying tag => Array-of-specs Hash.
  #
  # NOTE(review): this accessor replaces Object#hash for SpecSet instances,
  # so a SpecSet presumably cannot itself be used as a Hash key. It is kept
  # because the accessor name is part of the existing interface.
  attr_accessor :hash

  # Build a SpecSet from a seed value.
  #
  # seedset may be:
  # * a String spec (e.g. "245abc:700a") or an Array of such strings, which
  #   is parsed with Spec.hash_from_string
  # * a Hash of tag => spec or tag => [specs]; every value is wrapped with
  #   Array() so each tag always maps to an Array
  # * an existing SpecSet, which is returned as-is (not copied)
  #
  # Anything else raises ArgumentError. With no argument an empty set is
  # returned.
  def self.new(seedset = {})
    case seedset
    when SpecSet
      seedset
    when String, Array
      allocate.tap { |set| set.hash = Spec.hash_from_string(seedset) }
    when Hash
      normalized = {}
      seedset.each_pair { |tag, specs| normalized[tag] = Array(specs) }
      allocate.tap { |set| set.hash = normalized }
    else
      raise ArgumentError, "SpecSet can only be constructed from a string, an array of strings, a hash, or another SpecSet"
    end
  end

  # Add a spec to the set under spec.tag, creating the list for that tag
  # if this is the first spec seen for it.
  def add(spec)
    (@hash[spec.tag] ||= []) << spec
  end

  # All tags that have an entry in this set.
  def tags
    @hash.keys
  end

  # The specs registered for the given tag; an empty Array if there are none.
  def specs_for_tag(tag)
    @hash[tag] || []
  end

  # The specs that apply to the given field: those registered under the
  # field's tag (or under its effective tag when use_alternate_script is
  # true) whose indicators match the field.
  def specs_matching_field(field, use_alternate_script = false)
    tag = use_alternate_script ? effective_tag(field) : field.tag
    specs_for_tag(tag).select { |spec| spec.matches_indicators?(field) }
  end

  # For a field whose tag is ALTERNATE_SCRIPT_TAG and which has a subfield
  # '6', returns the first three bytes of that subfield (presumably the tag
  # of the linked field -- confirm against the MARC linkage format);
  # otherwise returns the field's own tag.
  def effective_tag(field)
    if field.tag == ALTERNATE_SCRIPT_TAG && field['6']
      field["6"].encode(field["6"].encoding).byteslice(0, 3)
    else
      field.tag
    end
  end

end
|
70
|
+
|
71
|
+
# A single extraction specification for one MARC tag: either a data field
# spec (tag, optional indicators, optional subfield codes, e.g. "600abc" or
# "600|1*|x") or a control field spec (tag plus a byte offset or byte range,
# e.g. "008[35]" or "008[7-10]").
class Spec
  attr_accessor :tag, :subfields
  attr_reader :indicator1, :indicator2, :byte1, :byte2, :bytes

  # Allow use of a hash to initialize: each key is assigned through the
  # matching "key=" writer, so only attributes that have a writer may be
  # given. Should ditch this and use optional keyword args once folks move
  # to 2.x syntax.
  def initialize(hash = nil)
    return unless hash

    hash.each_pair { |key, value| send("#{key}=", value) }
  end

  # Should the extracted subfields be joined, if we have a separator?
  # * '630' no subfields specified => join all subfields
  # * '630abc' multiple subfields specified => join all subfields
  # * '633a' one subfield => do not join, return one value for each $a in the field
  # * '633aa' one subfield, doubled => do join after all, will return a single
  #   string joining all the values of all the $a's.
  #
  # Last case is handled implicitly at the moment when subfields == ['a', 'a']
  def joinable?
    subfields.nil? || subfields.size != 1
  end

  # Set indicator 1; the wildcard '*' is stored as nil ("match anything").
  def indicator1=(ind1)
    @indicator1 = (ind1 == '*' ? nil : ind1)
  end

  # Set indicator 2; the wildcard '*' is stored as nil ("match anything").
  def indicator2=(ind2)
    @indicator2 = (ind2 == '*' ? nil : ind2)
  end

  # Set the first byte offset (coerced with to_i; a nil/false argument
  # leaves the current value alone) and recompute #bytes.
  def byte1=(byte1)
    @byte1 = byte1.to_i if byte1
    set_bytes(@byte1, @byte2)
  end

  # Set the last byte offset (coerced with to_i; a nil/false argument
  # leaves the current value alone) and recompute #bytes.
  def byte2=(byte2)
    @byte2 = byte2.to_i if byte2
    set_bytes(@byte1, @byte2)
  end

  # Compute #bytes from the given offsets: a Range when both are given, a
  # single Integer when only byte1 is given. When byte1 is missing, #bytes
  # is left unchanged. Does not touch #byte1 / #byte2.
  def set_bytes(byte1, byte2)
    if byte1 && byte2
      @bytes = ((byte1.to_i)..(byte2.to_i))
    elsif byte1
      @bytes = byte1.to_i
    end
  end

  # Pass in a MARC field, do its indicators match the indicators
  # in this spec? nil indicators in the spec mean we don't care, everything
  # matches.
  def matches_indicators?(field)
    (indicator1.nil? || indicator1 == field.indicator1) &&
      (indicator2.nil? || indicator2 == field.indicator2)
  end

  # Pass in a string subfield code like 'a'; does this
  # spec include it?
  def includes_subfield_code?(code)
    # subfields nil means include them all
    subfields.nil? || subfields.include?(code)
  end

  # Simple equality definition: same tag, subfields, indicators and bytes.
  def ==(spec)
    return false unless spec.kind_of?(Spec)

    (tag == spec.tag) &&
      (subfields == spec.subfields) &&
      (indicator1 == spec.indicator1) &&
      (indicator2 == spec.indicator2) &&
      (bytes == spec.bytes)
  end

  # Tag, optional |ind1ind2| pair (where '*' is a wildcard), optional
  # subfield codes.
  DATAFIELD_PATTERN = /\A([a-zA-Z0-9]{3})(\|([a-z0-9\ \*])([a-z0-9\ \*])\|)?([a-z0-9]*)?\Z/
  # Tag followed by [byte] or [first-last].
  CONTROLFIELD_PATTERN = /\A([a-zA-Z0-9]{3})(\[(\d+)(-(\d+))?\])\Z/

  # Converts from a string marc spec like "008[35]:245abc:700a" to a hash used
  # internally to represent the specification. An Array of such strings is
  # also accepted; each element is split on colons in the same way.
  #
  # ## Return value
  #
  # The hash returned is keyed by tag, and has as values an array of one or
  # more MarcExtractor::Spec objects representing the specified extraction
  # operations for that tag. Tags that were not mentioned have no entry (the
  # hash has no default value).
  #
  # It's an array of possibly more than one, because you can specify
  # multiple extractions on the same tag: for instance "245a:245abc"
  #
  # Raises ArgumentError for a part that is neither a data field spec nor a
  # control field spec.
  #
  # See tests for more examples.
  def self.hash_from_string(spec_string)
    # Split the string(s) given on colon, dropping whitespace around it.
    spec_strings = Array(spec_string).flat_map { |s| s.split(/\s*:\s*/) }

    spec_strings.each_with_object({}) do |part, hash|
      spec =
        if (m = DATAFIELD_PATTERN.match(part))
          create_datafield_spec(m[1], m[3], m[4], m[5])
        elsif (m = CONTROLFIELD_PATTERN.match(part))
          create_controlfield_spec(m[1], m[3], m[5])
        else
          raise ArgumentError, "Unrecognized marc extract specification: #{part}"
        end

      (hash[spec.tag] ||= []) << spec
    end
  end

  # Create a new datafield spec. Most of the logic about how to deal
  # with special characters is built into the Spec class. An empty or nil
  # subfields string leaves #subfields nil (meaning "all subfields").
  def self.create_datafield_spec(tag, ind1, ind2, subfields)
    spec = Spec.new(:tag => tag)
    spec.indicator1 = ind1
    spec.indicator2 = ind2
    spec.subfields = subfields.split('') if subfields && !subfields.empty?
    spec
  end

  # Create a new controlfield spec for the given tag and byte offset(s).
  def self.create_controlfield_spec(tag, byte1, byte2)
    spec = Spec.new(:tag => tag)
    spec.set_bytes(byte1, byte2)
    spec
  end

end
|
226
|
+
end
|
227
|
+
|
228
|
+
end
|
229
|
+
|
data/lib/traject/version.rb
CHANGED
data/test/debug_writer_test.rb
CHANGED
@@ -32,6 +32,47 @@ describe 'Simple output' do
|
|
32
32
|
|
33
33
|
end
|
34
34
|
|
35
|
+
it "deals ok with a missing ID" do
|
36
|
+
context = Traject::Indexer::Context.new(:output_hash => @indexer.map_record(@record))
|
37
|
+
logger_strio = StringIO.new
|
38
|
+
idfield = 'id'
|
39
|
+
|
40
|
+
context.logger = Logger.new(logger_strio)
|
41
|
+
context.position = 1
|
42
|
+
|
43
|
+
context.output_hash.delete(idfield)
|
44
|
+
@writer.put context
|
45
|
+
expected = [
|
46
|
+
"record_num_1 title #{@title}",
|
47
|
+
]
|
48
|
+
assert_equal expected.join("\n").gsub(/\s/, ''), @io.string.gsub(/\s/, '')
|
49
|
+
assert_match /At least one record \(\#1\) doesn't define field 'id'/, logger_strio.string
|
50
|
+
@writer.close
|
51
|
+
|
52
|
+
end
|
53
|
+
|
54
|
+
it "sets the idfield correctly" do
|
55
|
+
bad_rec_id_field = 'iden'
|
56
|
+
writer = Traject::DebugWriter.new("output_stream" => @io, "debug_writer.idfield" => bad_rec_id_field)
|
57
|
+
|
58
|
+
context = Traject::Indexer::Context.new(:output_hash => @indexer.map_record(@record))
|
59
|
+
|
60
|
+
logger_strio = StringIO.new
|
61
|
+
|
62
|
+
context.logger = Logger.new(logger_strio)
|
63
|
+
context.position = 1
|
64
|
+
|
65
|
+
writer.put context
|
66
|
+
expected = [
|
67
|
+
"record_num_1 id #{@id }",
|
68
|
+
"record_num_1 title #{@title}",
|
69
|
+
]
|
70
|
+
assert_equal expected.join("\n").gsub(/\s/, ''), @io.string.gsub(/\s/, '')
|
71
|
+
assert_match /At least one record \(\#1\) doesn't define field 'iden'/, logger_strio.string
|
72
|
+
writer.close
|
73
|
+
|
74
|
+
end
|
75
|
+
|
35
76
|
end
|
36
77
|
|
37
78
|
|
data/test/marc_extractor_test.rb
CHANGED
@@ -9,14 +9,14 @@ describe "Traject::MarcExtractor" do
|
|
9
9
|
it "is frozen read-only" do
|
10
10
|
extractor = Traject::MarcExtractor.new("100abcde", :seperator => ";")
|
11
11
|
assert extractor.frozen?
|
12
|
-
assert extractor.
|
12
|
+
assert extractor.spec_set.frozen?
|
13
13
|
assert extractor.options.frozen?
|
14
14
|
end
|
15
15
|
|
16
16
|
|
17
17
|
describe "#parse_marc_spec" do
|
18
18
|
it "parses single spec with all elements" do
|
19
|
-
parsed = Traject::MarcExtractor.
|
19
|
+
parsed = Traject::MarcExtractor::Spec.hash_from_string("245|1*|abcg")
|
20
20
|
|
21
21
|
assert_kind_of Hash, parsed
|
22
22
|
assert_equal 1, parsed.keys.length
|
@@ -30,7 +30,7 @@ describe "Traject::MarcExtractor" do
|
|
30
30
|
end
|
31
31
|
|
32
32
|
it "parses a mixed bag" do
|
33
|
-
parsed
|
33
|
+
parsed = Traject::MarcExtractor::Spec.hash_from_string("245abcde:810:700|*4|bcd")
|
34
34
|
spec245 = parsed['245'].first
|
35
35
|
spec810 = parsed['810'].first
|
36
36
|
spec700 = parsed['700'].first
|
@@ -57,14 +57,14 @@ describe "Traject::MarcExtractor" do
|
|
57
57
|
end
|
58
58
|
|
59
59
|
it "parses fixed field byte offsets" do
|
60
|
-
parsed = Traject::MarcExtractor.
|
60
|
+
parsed = Traject::MarcExtractor::Spec.hash_from_string("005[5]:008[7-10]")
|
61
61
|
|
62
62
|
assert_equal 5, parsed["005"].first.bytes
|
63
63
|
assert_equal 7..10, parsed["008"].first.bytes
|
64
64
|
end
|
65
65
|
|
66
66
|
it "allows arrays of specs" do
|
67
|
-
parsed = Traject::MarcExtractor.
|
67
|
+
parsed = Traject::MarcExtractor::Spec.hash_from_string %w(
|
68
68
|
245abcde
|
69
69
|
810
|
70
70
|
700|*4|bcd
|
@@ -73,7 +73,7 @@ describe "Traject::MarcExtractor" do
|
|
73
73
|
end
|
74
74
|
|
75
75
|
it "allows mixture of array and colon-delimited specs" do
|
76
|
-
parsed = Traject::MarcExtractor.
|
76
|
+
parsed = Traject::MarcExtractor::Spec.hash_from_string %w(
|
77
77
|
245abcde
|
78
78
|
100:110:111
|
79
79
|
810
|
@@ -127,13 +127,13 @@ describe "Traject::MarcExtractor" do
|
|
127
127
|
|
128
128
|
describe "#extract_by_spec" do
|
129
129
|
before do
|
130
|
-
@record = MARC::Reader.new(support_file_path
|
130
|
+
@record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").first
|
131
131
|
end
|
132
132
|
|
133
133
|
describe "extracts a basic case" do
|
134
134
|
before do
|
135
|
-
parsed_spec = Traject::MarcExtractor.
|
136
|
-
@values
|
135
|
+
@parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("700abcdef:856|*2|:505|1*|:245ba")
|
136
|
+
@values = Traject::MarcExtractor.new(@parsed_spec).extract(@record)
|
137
137
|
end
|
138
138
|
|
139
139
|
it "returns an array" do
|
@@ -150,7 +150,7 @@ describe "Traject::MarcExtractor" do
|
|
150
150
|
end
|
151
151
|
|
152
152
|
it "does not have 505, due to non-matching indicators" do
|
153
|
-
assert
|
153
|
+
assert !@values.find { |s| s.include? "propaganda model" }, @values
|
154
154
|
end
|
155
155
|
|
156
156
|
|
@@ -166,20 +166,20 @@ describe "Traject::MarcExtractor" do
|
|
166
166
|
|
167
167
|
describe "extracts fixed fields" do
|
168
168
|
it ", complete" do
|
169
|
-
parsed_spec = Traject::MarcExtractor.
|
170
|
-
values
|
169
|
+
parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("001")
|
170
|
+
values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
|
171
171
|
|
172
172
|
assert_equal ["2710183"], values
|
173
173
|
end
|
174
174
|
it ", single byte offset" do
|
175
|
-
parsed_spec = Traject::MarcExtractor.
|
176
|
-
values
|
175
|
+
parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("008[5]")
|
176
|
+
values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
|
177
177
|
|
178
178
|
assert_equal ["1"], values
|
179
179
|
end
|
180
180
|
it ", byte range" do
|
181
|
-
parsed_spec = Traject::MarcExtractor.
|
182
|
-
values
|
181
|
+
parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("008[7-10]")
|
182
|
+
values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
|
183
183
|
|
184
184
|
assert_equal ["2002"], values
|
185
185
|
end
|
@@ -187,15 +187,15 @@ describe "Traject::MarcExtractor" do
|
|
187
187
|
|
188
188
|
describe "separator argument" do
|
189
189
|
it "causes non-join when nil" do
|
190
|
-
parsed_spec = Traject::MarcExtractor.
|
191
|
-
values
|
190
|
+
parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("245")
|
191
|
+
values = Traject::MarcExtractor.new(parsed_spec, :separator => nil).extract(@record)
|
192
192
|
|
193
193
|
assert_length 3, values
|
194
194
|
end
|
195
195
|
|
196
196
|
it "can be non-default" do
|
197
|
-
parsed_spec = Traject::MarcExtractor.
|
198
|
-
values
|
197
|
+
parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("245")
|
198
|
+
values = Traject::MarcExtractor.new(parsed_spec, :separator => "!! ").extract(@record)
|
199
199
|
|
200
200
|
assert_length 1, values
|
201
201
|
assert_equal "Manufacturing consent :!! the political economy of the mass media /!! Edward S. Herman and Noam Chomsky ; with a new introduction by the authors.", values.first
|
@@ -204,8 +204,8 @@ describe "Traject::MarcExtractor" do
|
|
204
204
|
|
205
205
|
describe "extracts alternate script" do
|
206
206
|
before do
|
207
|
-
@record
|
208
|
-
@parsed_spec = Traject::MarcExtractor.
|
207
|
+
@record = MARC::Reader.new(support_file_path "hebrew880s.marc").to_a.first
|
208
|
+
@parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("245b")
|
209
209
|
end
|
210
210
|
it "from default :include" do
|
211
211
|
|
@@ -301,10 +301,10 @@ describe "Traject::MarcExtractor" do
|
|
301
301
|
describe "MarcExtractor.cached" do
|
302
302
|
it "creates" do
|
303
303
|
extractor = Traject::MarcExtractor.cached("245abc", :separator => nil)
|
304
|
-
|
304
|
+
spec_set = extractor.spec_set
|
305
305
|
|
306
306
|
assert extractor.options[:separator].nil?, "extractor options[:separator] is nil"
|
307
|
-
assert_equal(
|
307
|
+
assert_equal([Traject::MarcExtractor::Spec.new(:tag => "245", :subfields => ["a", "b", "c"])], spec_set.specs_for_tag('245'))
|
308
308
|
end
|
309
309
|
it "caches" do
|
310
310
|
ext1 = Traject::MarcExtractor.cached("245abc", :separator => nil)
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# A sample traject
|
1
|
+
# A sample traject configuration, save as say `traject_config.rb`, then
|
2
2
|
# run `traject -c traject_config.rb marc_file.marc` to index to
|
3
3
|
# solr specified in config file, according to rules specified in
|
4
4
|
# config file
|
data/traject.gemspec
CHANGED
@@ -29,19 +29,19 @@ Gem::Specification.new do |spec|
|
|
29
29
|
spec.add_dependency "dot-properties", ">= 0.1.1" # reading java style .properties
|
30
30
|
spec.add_dependency "httpclient", "~> 2.5"
|
31
31
|
spec.add_dependency 'marc-fastxmlwriter', '~>1.0' # fast marc->xml
|
32
|
-
|
33
|
-
# If we're building the package under JRuby, add in the
|
32
|
+
|
33
|
+
# If we're building the package under JRuby, add in the
|
34
34
|
# jruby-only gems and specify the platform.
|
35
|
-
|
35
|
+
|
36
36
|
if defined? JRUBY_VERSION
|
37
37
|
spec.platform = 'java'
|
38
38
|
spec.add_dependency "traject-marc4j_reader", "~> 1.0"
|
39
39
|
else
|
40
40
|
spec.platform = "ruby"
|
41
41
|
end
|
42
|
-
|
43
42
|
|
44
|
-
|
43
|
+
|
44
|
+
spec.add_development_dependency "bundler", "~> 1.7"
|
45
45
|
spec.add_development_dependency "rake"
|
46
46
|
spec.add_development_dependency "minitest"
|
47
47
|
end
|