traject 2.1.0-java → 2.2.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.travis.yml +8 -20
- data/CHANGES.md +14 -0
- data/README.md +35 -56
- data/doc/extending.md +20 -27
- data/doc/indexing_rules.md +46 -57
- data/doc/settings.md +17 -48
- data/lib/traject/debug_writer.rb +31 -5
- data/lib/traject/indexer.rb +6 -4
- data/lib/traject/marc_extractor.rb +37 -157
- data/lib/traject/marc_extractor_spec.rb +229 -0
- data/lib/traject/version.rb +1 -1
- data/test/debug_writer_test.rb +41 -0
- data/test/marc_extractor_test.rb +24 -24
- data/test/test_support/demo_config.rb +1 -1
- data/traject.gemspec +5 -5
- metadata +74 -73
@@ -0,0 +1,229 @@
|
|
1
|
+
# Represents a single specification for extracting data
|
2
|
+
# from a marc field, like "600abc" or "600|1*|x".
|
3
|
+
#
|
4
|
+
# Includes the tag for reference, although this is redundant and not actually used
|
5
|
+
# in logic, since the tag is also implicit in the overall spec_hash
|
6
|
+
# with tag => [spec1, spec2]
|
7
|
+
|
8
|
+
|
9
|
+
module Traject
|
10
|
+
class MarcExtractor
|
11
|
+
|
12
|
+
# A set of specs
|
13
|
+
class SpecSet
|
14
|
+
|
15
|
+
attr_accessor :hash
|
16
|
+
|
17
|
+
def self.new(seedset = {})
|
18
|
+
|
19
|
+
case seedset
|
20
|
+
when String
|
21
|
+
s = allocate
|
22
|
+
s.hash = Spec.hash_from_string(seedset)
|
23
|
+
s
|
24
|
+
when Hash
|
25
|
+
s = allocate
|
26
|
+
hash = Hash.new
|
27
|
+
seedset.each_pair do |k, v|
|
28
|
+
hash[k] = Array(v)
|
29
|
+
end
|
30
|
+
s.hash = hash
|
31
|
+
s
|
32
|
+
when SpecSet
|
33
|
+
seedset
|
34
|
+
else
|
35
|
+
raise ArgumentError.new, "SpecSet can only be constructed from a string, a hash, or another SpecSet"
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def add(spec)
|
40
|
+
@hash[spec.tag] << spec
|
41
|
+
end
|
42
|
+
|
43
|
+
def tags
|
44
|
+
@hash.keys
|
45
|
+
end
|
46
|
+
|
47
|
+
def specs_for_tag(tag)
|
48
|
+
@hash[tag] || []
|
49
|
+
end
|
50
|
+
|
51
|
+
def specs_matching_field(field, use_alternate_script = false)
|
52
|
+
|
53
|
+
tag = if use_alternate_script
|
54
|
+
effective_tag(field)
|
55
|
+
else
|
56
|
+
field.tag
|
57
|
+
end
|
58
|
+
specs_for_tag(tag).select { |s| s.matches_indicators?(field) }
|
59
|
+
end
|
60
|
+
|
61
|
+
def effective_tag(field)
|
62
|
+
if field.tag == ALTERNATE_SCRIPT_TAG and field['6']
|
63
|
+
field["6"].encode(field["6"].encoding).byteslice(0, 3)
|
64
|
+
else
|
65
|
+
field.tag
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
end
|
70
|
+
|
71
|
+
# A single extraction specification for one MARC tag: either a data field
# spec (optional indicators plus optional subfield codes, e.g. "600|1*|abc")
# or a control field spec (optional byte offset or byte range, e.g. "008[7-10]").
class Spec
  attr_accessor :tag, :subfields
  attr_reader :indicator1, :indicator2, :byte1, :byte2, :bytes

  # Allow use of a hash to initialize. Should ditch this and use
  # optional keyword args once folks move to 2.x syntax.
  #
  # hash - optional Hash of attribute name => value; each pair is applied
  #        through the matching "name=" writer, so the writers' special
  #        handling (e.g. '*' indicators) is honoured.
  def initialize(hash = nil)
    return unless hash

    hash.each_pair do |key, value|
      self.send("#{key}=", value)
    end
  end

  # Should the extracted subfields be joined, if we have a separator?
  #  * '630' no subfields specified => join all subfields
  #  * '630abc' multiple subfields specified => join all subfields
  #  * '633a' one subfield => do not join, return one value for each $a in the field
  #  * '633aa' one subfield, doubled => do join after all, will return a single
  #    string joining all the values of all the $a's.
  #
  # The last case is handled implicitly at the moment, since then
  # subfields == ['a', 'a'] and its size is not 1.
  def joinable?
    subfields.nil? || subfields.size != 1
  end

  # Set the first indicator; the wildcard '*' is stored as nil ("don't care").
  def indicator1=(ind1)
    @indicator1 = (ind1 == '*' ? nil : ind1)
  end

  # Set the second indicator; the wildcard '*' is stored as nil ("don't care").
  def indicator2=(ind2)
    @indicator2 = (ind2 == '*' ? nil : ind2)
  end

  # Set the starting byte offset (converted with to_i) and recompute #bytes.
  # A nil argument leaves the previously stored offset untouched.
  def byte1=(byte1)
    @byte1 = byte1.to_i if byte1
    set_bytes(@byte1, @byte2)
  end

  # Set the ending byte offset (converted with to_i) and recompute #bytes.
  # A nil argument leaves the previously stored offset untouched.
  def byte2=(byte2)
    @byte2 = byte2.to_i if byte2
    set_bytes(@byte1, @byte2)
  end

  # Compute #bytes from the given offsets:
  #  * both given  => an inclusive Range byte1..byte2
  #  * only byte1  => the single Integer offset
  #  * byte1 nil   => #bytes is left unchanged
  #
  # Only @bytes is written here; #byte1 and #byte2 are not updated.
  def set_bytes(byte1, byte2)
    if byte1 && byte2
      @bytes = ((byte1.to_i)..(byte2.to_i))
    elsif byte1
      @bytes = byte1.to_i
    end
  end

  # Pass in a MARC field: do its indicators match the indicators
  # in this spec? nil indicators in the spec mean we don't care, everything
  # matches.
  def matches_indicators?(field)
    (indicator1.nil? || indicator1 == field.indicator1) &&
      (indicator2.nil? || indicator2 == field.indicator2)
  end

  # Pass in a string subfield code like 'a'; does this
  # spec include it?
  def includes_subfield_code?(code)
    # subfields nil means include them all
    subfields.nil? || subfields.include?(code)
  end

  # Simple equality definition: two specs are equal when tag, subfields,
  # both indicators and bytes are all equal. Anything that is not a Spec
  # is never equal.
  def ==(spec)
    return false unless spec.kind_of?(Spec)

    (self.tag == spec.tag) &&
      (self.subfields == spec.subfields) &&
      (self.indicator1 == spec.indicator1) &&
      (self.indicator2 == spec.indicator2) &&
      (self.bytes == spec.bytes)
  end

  # Data field spec: three-character tag, optional |ind1ind2| indicator
  # pair (each a lowercase letter, digit, space or '*'), optional subfield codes.
  DATAFIELD_PATTERN = /\A([a-zA-Z0-9]{3})(\|([a-z0-9\ \*])([a-z0-9\ \*])\|)?([a-z0-9]*)?\Z/
  # Control field spec: three-character tag followed by [byte] or [byte1-byte2].
  CONTROLFIELD_PATTERN = /\A([a-zA-Z0-9]{3})(\[(\d+)(-(\d+))?\])\Z/

  # Converts from a string marc spec like "008[35]:245abc:700a" to a hash used
  # internally to represent the specification.
  #
  # spec_string - a String holding one or more colon-separated specs, or an
  #               Array of such Strings. Whitespace around the colons is ignored.
  #
  # ## Return value
  #
  # The hash returned is keyed by tag, and has as values an array of one
  # or more MarcExtractor::Spec objects representing the specified extraction
  # operations for that tag. Tags not mentioned in the spec have no entry
  # (the hash has no default value).
  #
  # It's an array of possibly more than one, because you can specify
  # multiple extractions on the same tag: for instance "245a:245abc"
  #
  # Raises ArgumentError if any part matches neither the data field nor the
  # control field pattern.
  def self.hash_from_string(spec_string)
    hash = {}

    # Split the string(s) given on colon. The same delimiter pattern is
    # used for plain strings and for every element of an array.
    spec_strings = spec_string.is_a?(Array) ? spec_string : [spec_string]
    parts = spec_strings.flat_map { |s| s.split(/\s*:\s*/) }

    parts.each do |part|
      spec =
        if (m = DATAFIELD_PATTERN.match(part))
          # m[1] tag, m[3]/m[4] indicators, m[5] subfield codes
          create_datafield_spec(m[1], m[3], m[4], m[5])
        elsif (m = CONTROLFIELD_PATTERN.match(part))
          # m[1] tag, m[3] first byte, m[5] optional last byte
          create_controlfield_spec(m[1], m[3], m[5])
        else
          raise ArgumentError, "Unrecognized marc extract specification: #{part}"
        end

      (hash[spec.tag] ||= []) << spec
    end

    hash
  end

  # Create a new datafield spec. Most of the logic about how to deal
  # with special characters is built into the Spec class (see the
  # indicator writers).
  #
  # subfields - String of subfield codes such as "abc"; nil or "" leaves
  #             spec.subfields nil, meaning "all subfields".
  def self.create_datafield_spec(tag, ind1, ind2, subfields)
    spec = Spec.new(:tag => tag)
    spec.indicator1 = ind1
    spec.indicator2 = ind2
    spec.subfields = subfields.split('') if subfields && !subfields.empty?
    spec
  end

  # Create a new controlfield spec for tag, with #bytes derived from
  # byte1 and the optional byte2 (see #set_bytes).
  def self.create_controlfield_spec(tag, byte1, byte2)
    spec = Spec.new(:tag => tag)
    spec.set_bytes(byte1, byte2)
    spec
  end

end
|
226
|
+
end
|
227
|
+
|
228
|
+
end
|
229
|
+
|
data/lib/traject/version.rb
CHANGED
data/test/debug_writer_test.rb
CHANGED
@@ -32,6 +32,47 @@ describe 'Simple output' do
|
|
32
32
|
|
33
33
|
end
|
34
34
|
|
35
|
+
it "deals ok with a missing ID" do
  # Map the record, then drop its id so the writer has to cope without one.
  ctx = Traject::Indexer::Context.new(:output_hash => @indexer.map_record(@record))
  log_io = StringIO.new
  ctx.logger = Logger.new(log_io)
  ctx.position = 1
  ctx.output_hash.delete('id')

  @writer.put(ctx)

  # Output is compared with all whitespace removed, so only the tokens matter.
  wanted = ["record_num_1 title #{@title}"].join("\n")
  assert_equal wanted.gsub(/\s/, ''), @io.string.gsub(/\s/, '')

  # The missing id must be reported through the context's logger.
  assert_match(/At least one record \(\#1\) doesn't define field 'id'/, log_io.string)
  @writer.close
end
|
53
|
+
|
54
|
+
it "sets the idfield correctly" do
  # Configure the writer to look for an id field the record does not have.
  custom_writer = Traject::DebugWriter.new(
    "output_stream" => @io,
    "debug_writer.idfield" => 'iden'
  )

  ctx = Traject::Indexer::Context.new(:output_hash => @indexer.map_record(@record))
  log_io = StringIO.new
  ctx.logger = Logger.new(log_io)
  ctx.position = 1

  custom_writer.put(ctx)

  # Output is compared with all whitespace removed, so only the tokens matter.
  wanted = [
    "record_num_1 id #{@id}",
    "record_num_1 title #{@title}",
  ].join("\n")
  assert_equal wanted.gsub(/\s/, ''), @io.string.gsub(/\s/, '')

  # The warning must name the configured id field rather than the default one.
  assert_match(/At least one record \(\#1\) doesn't define field 'iden'/, log_io.string)
  custom_writer.close
end
|
75
|
+
|
35
76
|
end
|
36
77
|
|
37
78
|
|
data/test/marc_extractor_test.rb
CHANGED
@@ -9,14 +9,14 @@ describe "Traject::MarcExtractor" do
|
|
9
9
|
it "is frozen read-only" do
|
10
10
|
extractor = Traject::MarcExtractor.new("100abcde", :seperator => ";")
|
11
11
|
assert extractor.frozen?
|
12
|
-
assert extractor.
|
12
|
+
assert extractor.spec_set.frozen?
|
13
13
|
assert extractor.options.frozen?
|
14
14
|
end
|
15
15
|
|
16
16
|
|
17
17
|
describe "#parse_marc_spec" do
|
18
18
|
it "parses single spec with all elements" do
|
19
|
-
parsed = Traject::MarcExtractor.
|
19
|
+
parsed = Traject::MarcExtractor::Spec.hash_from_string("245|1*|abcg")
|
20
20
|
|
21
21
|
assert_kind_of Hash, parsed
|
22
22
|
assert_equal 1, parsed.keys.length
|
@@ -30,7 +30,7 @@ describe "Traject::MarcExtractor" do
|
|
30
30
|
end
|
31
31
|
|
32
32
|
it "parses a mixed bag" do
|
33
|
-
parsed
|
33
|
+
parsed = Traject::MarcExtractor::Spec.hash_from_string("245abcde:810:700|*4|bcd")
|
34
34
|
spec245 = parsed['245'].first
|
35
35
|
spec810 = parsed['810'].first
|
36
36
|
spec700 = parsed['700'].first
|
@@ -57,14 +57,14 @@ describe "Traject::MarcExtractor" do
|
|
57
57
|
end
|
58
58
|
|
59
59
|
it "parses fixed field byte offsets" do
|
60
|
-
parsed = Traject::MarcExtractor.
|
60
|
+
parsed = Traject::MarcExtractor::Spec.hash_from_string("005[5]:008[7-10]")
|
61
61
|
|
62
62
|
assert_equal 5, parsed["005"].first.bytes
|
63
63
|
assert_equal 7..10, parsed["008"].first.bytes
|
64
64
|
end
|
65
65
|
|
66
66
|
it "allows arrays of specs" do
|
67
|
-
parsed = Traject::MarcExtractor.
|
67
|
+
parsed = Traject::MarcExtractor::Spec.hash_from_string %w(
|
68
68
|
245abcde
|
69
69
|
810
|
70
70
|
700|*4|bcd
|
@@ -73,7 +73,7 @@ describe "Traject::MarcExtractor" do
|
|
73
73
|
end
|
74
74
|
|
75
75
|
it "allows mixture of array and colon-delimited specs" do
|
76
|
-
parsed = Traject::MarcExtractor.
|
76
|
+
parsed = Traject::MarcExtractor::Spec.hash_from_string %w(
|
77
77
|
245abcde
|
78
78
|
100:110:111
|
79
79
|
810
|
@@ -127,13 +127,13 @@ describe "Traject::MarcExtractor" do
|
|
127
127
|
|
128
128
|
describe "#extract_by_spec" do
|
129
129
|
before do
|
130
|
-
@record = MARC::Reader.new(support_file_path
|
130
|
+
@record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").first
|
131
131
|
end
|
132
132
|
|
133
133
|
describe "extracts a basic case" do
|
134
134
|
before do
|
135
|
-
parsed_spec = Traject::MarcExtractor.
|
136
|
-
@values
|
135
|
+
@parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("700abcdef:856|*2|:505|1*|:245ba")
|
136
|
+
@values = Traject::MarcExtractor.new(@parsed_spec).extract(@record)
|
137
137
|
end
|
138
138
|
|
139
139
|
it "returns an array" do
|
@@ -150,7 +150,7 @@ describe "Traject::MarcExtractor" do
|
|
150
150
|
end
|
151
151
|
|
152
152
|
it "does not have 505, due to non-matching indicators" do
|
153
|
-
assert
|
153
|
+
assert !@values.find { |s| s.include? "propaganda model" }, @values
|
154
154
|
end
|
155
155
|
|
156
156
|
|
@@ -166,20 +166,20 @@ describe "Traject::MarcExtractor" do
|
|
166
166
|
|
167
167
|
describe "extracts fixed fields" do
|
168
168
|
it ", complete" do
|
169
|
-
parsed_spec = Traject::MarcExtractor.
|
170
|
-
values
|
169
|
+
parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("001")
|
170
|
+
values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
|
171
171
|
|
172
172
|
assert_equal ["2710183"], values
|
173
173
|
end
|
174
174
|
it ", single byte offset" do
|
175
|
-
parsed_spec = Traject::MarcExtractor.
|
176
|
-
values
|
175
|
+
parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("008[5]")
|
176
|
+
values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
|
177
177
|
|
178
178
|
assert_equal ["1"], values
|
179
179
|
end
|
180
180
|
it ", byte range" do
|
181
|
-
parsed_spec = Traject::MarcExtractor.
|
182
|
-
values
|
181
|
+
parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("008[7-10]")
|
182
|
+
values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
|
183
183
|
|
184
184
|
assert_equal ["2002"], values
|
185
185
|
end
|
@@ -187,15 +187,15 @@ describe "Traject::MarcExtractor" do
|
|
187
187
|
|
188
188
|
describe "separator argument" do
|
189
189
|
it "causes non-join when nil" do
|
190
|
-
parsed_spec = Traject::MarcExtractor.
|
191
|
-
values
|
190
|
+
parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("245")
|
191
|
+
values = Traject::MarcExtractor.new(parsed_spec, :separator => nil).extract(@record)
|
192
192
|
|
193
193
|
assert_length 3, values
|
194
194
|
end
|
195
195
|
|
196
196
|
it "can be non-default" do
|
197
|
-
parsed_spec = Traject::MarcExtractor.
|
198
|
-
values
|
197
|
+
parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("245")
|
198
|
+
values = Traject::MarcExtractor.new(parsed_spec, :separator => "!! ").extract(@record)
|
199
199
|
|
200
200
|
assert_length 1, values
|
201
201
|
assert_equal "Manufacturing consent :!! the political economy of the mass media /!! Edward S. Herman and Noam Chomsky ; with a new introduction by the authors.", values.first
|
@@ -204,8 +204,8 @@ describe "Traject::MarcExtractor" do
|
|
204
204
|
|
205
205
|
describe "extracts alternate script" do
|
206
206
|
before do
|
207
|
-
@record
|
208
|
-
@parsed_spec = Traject::MarcExtractor.
|
207
|
+
@record = MARC::Reader.new(support_file_path "hebrew880s.marc").to_a.first
|
208
|
+
@parsed_spec = Traject::MarcExtractor::Spec.hash_from_string("245b")
|
209
209
|
end
|
210
210
|
it "from default :include" do
|
211
211
|
|
@@ -301,10 +301,10 @@ describe "Traject::MarcExtractor" do
|
|
301
301
|
describe "MarcExtractor.cached" do
|
302
302
|
it "creates" do
|
303
303
|
extractor = Traject::MarcExtractor.cached("245abc", :separator => nil)
|
304
|
-
|
304
|
+
spec_set = extractor.spec_set
|
305
305
|
|
306
306
|
assert extractor.options[:separator].nil?, "extractor options[:separator] is nil"
|
307
|
-
assert_equal(
|
307
|
+
assert_equal([Traject::MarcExtractor::Spec.new(:tag => "245", :subfields => ["a", "b", "c"])], spec_set.specs_for_tag('245'))
|
308
308
|
end
|
309
309
|
it "caches" do
|
310
310
|
ext1 = Traject::MarcExtractor.cached("245abc", :separator => nil)
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# A sample traject
|
1
|
+
# A sample traject configuration, save as say `traject_config.rb`, then
|
2
2
|
# run `traject -c traject_config.rb marc_file.marc` to index to
|
3
3
|
# solr specified in config file, according to rules specified in
|
4
4
|
# config file
|
data/traject.gemspec
CHANGED
@@ -29,19 +29,19 @@ Gem::Specification.new do |spec|
|
|
29
29
|
spec.add_dependency "dot-properties", ">= 0.1.1" # reading java style .properties
|
30
30
|
spec.add_dependency "httpclient", "~> 2.5"
|
31
31
|
spec.add_dependency 'marc-fastxmlwriter', '~>1.0' # fast marc->xml
|
32
|
-
|
33
|
-
# If we're building the package under JRuby, add in the
|
32
|
+
|
33
|
+
# If we're building the package under JRuby, add in the
|
34
34
|
# jruby-only gems and specify the platform.
|
35
|
-
|
35
|
+
|
36
36
|
if defined? JRUBY_VERSION
|
37
37
|
spec.platform = 'java'
|
38
38
|
spec.add_dependency "traject-marc4j_reader", "~> 1.0"
|
39
39
|
else
|
40
40
|
spec.platform = "ruby"
|
41
41
|
end
|
42
|
-
|
43
42
|
|
44
|
-
|
43
|
+
|
44
|
+
spec.add_development_dependency "bundler", "~> 1.7"
|
45
45
|
spec.add_development_dependency "rake"
|
46
46
|
spec.add_development_dependency "minitest"
|
47
47
|
end
|