traject 0.15.0 → 0.16.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +27 -24
- data/doc/settings.md +8 -2
- data/lib/traject/command_line.rb +1 -6
- data/lib/traject/macros/marc21_semantics.rb +12 -5
- data/lib/traject/marc_extractor.rb +178 -84
- data/lib/traject/translation_map.rb +7 -7
- data/lib/traject/version.rb +1 -1
- data/test/indexer/macros_marc21_semantics_test.rb +2 -1
- data/test/indexer/to_field_test.rb +23 -2
- data/test/marc_extractor_test.rb +79 -32
- metadata +2 -2
data/README.md
CHANGED
@@ -49,8 +49,9 @@ The traject command-line utility requires you to supply it with a configuration
|
|
49
49
|
|
50
50
|
Configuration files are actually just ruby -- so by convention they end in `.rb`.
|
51
51
|
|
52
|
-
|
53
|
-
of ruby
|
52
|
+
We hope you can write basic useful configuration files without being a ruby expert,
|
53
|
+
they give you a subset of ruby to work with. But the full power
|
54
|
+
of ruby is available to you if needed.
|
54
55
|
|
55
56
|
**rubyist tip**: Technically, config files are executed with `instance_eval` in a Traject::Indexer instance, so the special commands you see are just methods on Traject::Indexer (or mixed into it). But you can
|
56
57
|
call ordinary ruby `require` in config files, etc., too, to load
|
@@ -84,9 +85,6 @@ settings do
|
|
84
85
|
# you have to tell it.
|
85
86
|
provide "marc_source.type", "xml"
|
86
87
|
|
87
|
-
# settings can be set on command line instead of
|
88
|
-
# config file too.
|
89
|
-
|
90
88
|
# various others...
|
91
89
|
provide "solrj_writer.commit_on_close", "true"
|
92
90
|
|
@@ -163,39 +161,44 @@ Other examples of the specification string, which can include multiple tag menti
|
|
163
161
|
# "*" is a wildcard in indicator spec. So
|
164
162
|
# 856 with first indicator '0', subfield u.
|
165
163
|
to_field "email_addresses", extract_marc("856|0*|u")
|
166
|
-
|
167
|
-
# Instead of joining subfields from the same field
|
168
|
-
# into one string, joined by spaces, leave them
|
169
|
-
# each in separate strings:
|
170
|
-
to_field "isbn", extract_marc("020az", :separator => nil)
|
171
|
-
|
172
|
-
# Same thing, but more explicit
|
173
|
-
to_field "isbn", extract_marc("020a:020z")
|
174
164
|
|
175
|
-
|
176
|
-
#
|
177
|
-
|
178
|
-
to_field 'language008', extract_marc('008[35-37]', :deduplicate=>true)
|
165
|
+
# Can list tag twice with different field combinations
|
166
|
+
# to extract separately
|
167
|
+
to_field "isbn", extract_marc("245a:245abcde")
|
179
168
|
~~~
|
180
169
|
|
181
170
|
The `extract_marc` function *by default* includes any linked
|
182
171
|
MARC `880` fields with alternate-script versions. Another reason
|
183
172
|
to use the `:first` option if you really only want one.
|
184
173
|
|
174
|
+
By default, specifications with multiple subfields (like "240abc") will produce
|
175
|
+
one single string of output for each matching field. Specifications
|
176
|
+
with single subfields (like "020a") will split subfields and produce
|
177
|
+
an output string for each matching subfield.
|
178
|
+
|
185
179
|
For MARC control (aka 'fixed') fields, you can use square
|
186
180
|
brackets to take a slice by byte offset.
|
187
181
|
|
182
|
+
~~~ruby
|
188
183
|
to_field "language_code", extract_marc("008[35-37]")
|
184
|
+
~~~
|
185
|
+
|
186
|
+
For more information on extraction specifications, see
|
187
|
+
the [MarcExtractor class](./lib/traject/marc_extractor.rb) ([rdoc](http://rdoc.info/gems/traject/Traject/MarcExtractor)).
|
189
188
|
|
190
189
|
`extract_marc` also supports `translation maps` similar
|
191
|
-
to SolrMarc's. There
|
192
|
-
and you can
|
190
|
+
to SolrMarc's. There are some translation maps provided by traject,
|
191
|
+
and you can also define your own. Translation maps can be supplied
|
193
192
|
in yaml or ruby. Translation maps are especially useful
|
194
|
-
for mapping form MARC codes to user-displayable strings
|
193
|
+
for mapping from MARC codes to user-displayable strings:
|
195
194
|
|
195
|
+
~~~ruby
|
196
196
|
# "translation_map" will be passed to Traject::TranslationMap.new
|
197
197
|
# and the created map used to translate all values
|
198
198
|
to_field "language", extract_marc("008[35-37]:041a:041d", :translation_map => "marc_language_code")
|
199
|
+
~~~
|
200
|
+
|
201
|
+
See [Traject::TranslationMap](./lib/traject/translation_map.rb) ([rdoc](http://rdoc.info/gems/traject/Traject/TranslationMap)) for more info on translation mapping.
|
199
202
|
|
200
203
|
#### Direct indexing logic vs. Macros
|
201
204
|
|
@@ -348,11 +351,11 @@ This will over-ride any settings set with `provide` in conf files.
|
|
348
351
|
There are some built-in command-line option shortcuts for useful
|
349
352
|
settings:
|
350
353
|
|
351
|
-
Use
|
352
|
-
|
353
|
-
checking.
|
354
|
+
Use `--debug-mode` to output in a human-readable format, instead of sending to solr.
|
355
|
+
Also turns on debug logging and restricts processing to single-threaded. Useful for
|
356
|
+
debugging or sanity checking.
|
354
357
|
|
355
|
-
traject -
|
358
|
+
traject --debug-mode -c conf_file.rb marc_file
|
356
359
|
|
357
360
|
Use `-u` as a shortcut for `-s solr.url=X`
|
358
361
|
|
data/doc/settings.md
CHANGED
@@ -4,7 +4,8 @@ Traject settings are a flat list of key/value pairs -- a single
|
|
4
4
|
Hash, not nested. Keys are always strings, and dots (".") can be
|
5
5
|
used for grouping and namespacing.
|
6
6
|
|
7
|
-
Values are usually strings, but occasionally something else.
|
7
|
+
Values are usually strings, but occasionally something else. String values can be easily
|
8
|
+
set via the command line.
|
8
9
|
|
9
10
|
Settings can be set in configuration files, usually like:
|
10
11
|
|
@@ -17,6 +18,11 @@ end
|
|
17
18
|
or on the command line: `-s key=value`. There are also some command line shortcuts
|
18
19
|
for commonly used settings, see `traject -h`.
|
19
20
|
|
21
|
+
`provide` will only set the key if it was previously unset, so first time to set 'wins'. And command-line
|
22
|
+
settings are applied first of all. It's recommended you use `provide`.
|
23
|
+
|
24
|
+
`store` is also available, and forces setting of the new value overriding any previous value set.
|
25
|
+
|
20
26
|
## Known settings
|
21
27
|
|
22
28
|
* `debug_ascii_progress`: true/'true' to print ascii characters to STDERR indicating progress. Note,
|
@@ -101,4 +107,4 @@ for commonly used settings, see `traject -h`.
|
|
101
107
|
Note that processing_thread_pool threads can end up submitting
|
102
108
|
to solr too, if solrj_writer.thread_pool is full.
|
103
109
|
|
104
|
-
* `writer_class_name`: a Traject Writer class, used by indexer to send processed dictionaries off. Default Traject::SolrJWriter, also available Traject::JsonWriter. See Traject::Indexer for more info. Command line shortcut `-w`
|
110
|
+
* `writer_class_name`: a Traject Writer class, used by indexer to send processed dictionaries off. Default Traject::SolrJWriter, also available Traject::JsonWriter. See Traject::Indexer for more info. Command line shortcut `-w`
|
data/lib/traject/command_line.rb
CHANGED
@@ -268,10 +268,6 @@ module Traject
|
|
268
268
|
if options[:solr]
|
269
269
|
settings["solr.url"] = options[:solr]
|
270
270
|
end
|
271
|
-
if options[:j]
|
272
|
-
settings["writer_class_name"] = "JsonWriter"
|
273
|
-
settings["json_writer.pretty_print"] = "true"
|
274
|
-
end
|
275
271
|
if options[:marc_type]
|
276
272
|
settings["marc_source.type"] = options[:marc_type]
|
277
273
|
end
|
@@ -296,12 +292,11 @@ module Traject
|
|
296
292
|
on :o, "output_file", "output file for Writer classes that write to files", :argument => true
|
297
293
|
on :w, :writer, "Set writer class, shortcut for -s writer_class_name=", :argument => true
|
298
294
|
on :u, :solr, "Set solr url, shortcut for -s solr.url=", :argument => true
|
299
|
-
on :j, "output as pretty printed json, shortcut for -s writer_class_name=JsonWriter -s json_writer.pretty_print=true"
|
300
295
|
on :t, :marc_type, "xml, json or binary. shortcut for -s marc_source.type=", :argument => true
|
301
296
|
on :I, "load_path", "append paths to ruby $LOAD_PATH", :argument => true, :as => Array, :delimiter => ":"
|
302
297
|
on :G, "Gemfile", "run with bundler and optionally specified Gemfile", :argument => :optional, :default => nil
|
303
298
|
|
304
|
-
on :x, "command", "alternate traject command: process (default); marcout", :argument => true, :default => "process"
|
299
|
+
on :x, "command", "alternate traject command: process (default); marcout; commit", :argument => true, :default => "process"
|
305
300
|
|
306
301
|
on "stdin", "read input from stdin"
|
307
302
|
on "debug-mode", "debug logging, single threaded, output human readable hashes"
|
@@ -144,7 +144,7 @@ module Traject::Macros
|
|
144
144
|
# return the filing version (i.e., the string without the
|
145
145
|
# non-filing characters)
|
146
146
|
|
147
|
-
def self.filing_version(field, str,
|
147
|
+
def self.filing_version(field, str, spec)
|
148
148
|
# Control fields don't have non-filing characters
|
149
149
|
return str if field.kind_of? MARC::ControlField
|
150
150
|
|
@@ -155,7 +155,7 @@ module Traject::Macros
|
|
155
155
|
# The spechash must either (a) have no subfields specified, or
|
156
156
|
# (b) include the first subfield in the record
|
157
157
|
|
158
|
-
subs =
|
158
|
+
subs = spec.subfields
|
159
159
|
return str unless subs && subs.include?(field.subfields[0].code)
|
160
160
|
|
161
161
|
# OK. If we got this far we actually need to strip characters off the string
|
@@ -183,7 +183,7 @@ module Traject::Macros
|
|
183
183
|
lambda do |record, accumulator|
|
184
184
|
codes = extractor.collect_matching_lines(record) do |field, spec, extractor|
|
185
185
|
if extractor.control_field?(field)
|
186
|
-
(spec
|
186
|
+
(spec.bytes ? field.value.byteslice(spec.bytes) : field.value)
|
187
187
|
else
|
188
188
|
extractor.collect_subfields(field, spec).collect do |value|
|
189
189
|
# sometimes multiple language codes are jammed together in one subfield, and
|
@@ -212,9 +212,16 @@ module Traject::Macros
|
|
212
212
|
extractor = MarcExtractor.new(spec)
|
213
213
|
|
214
214
|
lambda do |record, accumulator|
|
215
|
-
|
215
|
+
values = extractor.collect_matching_lines(record) do |field, spec, extractor|
|
216
216
|
extractor.collect_subfields(field, spec) unless (field.tag == "490" && field.indicator1 == "1")
|
217
|
-
end.compact
|
217
|
+
end.compact
|
218
|
+
|
219
|
+
# trim punctuation
|
220
|
+
values.collect! do |s|
|
221
|
+
Marc21.trim_punctuation(s)
|
222
|
+
end
|
223
|
+
|
224
|
+
accumulator.concat( values )
|
218
225
|
end
|
219
226
|
end
|
220
227
|
|
@@ -6,9 +6,79 @@ module Traject
|
|
6
6
|
#
|
7
7
|
# Examples:
|
8
8
|
#
|
9
|
-
# array_of_stuff
|
10
|
-
# values
|
9
|
+
# array_of_stuff = MarcExtractor.new("001:245abc:700a").extract(marc_record)
|
10
|
+
# values = MarcExtractor.new("245a:245abc").extract(marc_record)
|
11
|
+
# separated_values = MarcExtractor.new("020a:020z").extract(marc_record)
|
12
|
+
# bytes = MarcExtractor.new("008[35-37]")
|
11
13
|
#
|
14
|
+
# == String extraction specifications
|
15
|
+
#
|
16
|
+
# Extraction directions are supplied in strings, usually as the first
|
17
|
+
# parameter to MarcExtractor.new or MarcExtractor.cached. These specifications
|
18
|
+
# are also the first parameter to the #extract_marc macro.
|
19
|
+
#
|
20
|
+
# A String specification is a string (or array of strings) which consists
|
21
|
+
# of one or more Data and Control Field Specifications separated by colons.
|
22
|
+
#
|
23
|
+
# A Data Field Specification is of the form:
|
24
|
+
# `{tag}{|indicators|}{subfields}`
|
25
|
+
# * {tag} is three chars (usually but not necessarily numeric)
|
26
|
+
# * {indicators} are optional two chars enclosed in pipe ('|') characters,
|
27
|
+
# * {subfields} are optional list of chars (alphanumeric)
|
28
|
+
#
|
29
|
+
# indicator spec must be two chars, but one can be * meaning "don't care".
|
30
|
+
# space to mean 'blank'
|
31
|
+
#
|
32
|
+
# "245|01|abc65:345abc:700|*5|:800"
|
33
|
+
#
|
34
|
+
# A Control Field Specification is used with tags for control (fixed) fields (ordinarily fields 001-010)
|
35
|
+
# and includes a tag and a byte slice specification.
|
36
|
+
#
|
37
|
+
# "008[35-37]:007[5]"
|
38
|
+
# => bytes 35-37 inclusive of any field 008, and byte 5 of any field 007 (TODO: Should we support
|
39
|
+
# "LDR" as a pseudo-tag to take byte slices of leader?)
|
40
|
+
#
|
41
|
+
# * subfields and indicators can only be provided for marc data/variable fields
|
42
|
+
# * byte slice can only be provided for marc control fields (generally tags less than 010)
|
43
|
+
#
|
44
|
+
# == Subfield concatenation
|
45
|
+
#
|
46
|
+
# Normally, for a spec including multiple subfield codes, multiple subfields
|
47
|
+
# from the same MARC field will be concatenated into one string separated by spaces:
|
48
|
+
#
|
49
|
+
# 600 a| Chomsky, Noam x| Philosophy.
|
50
|
+
# 600 a| Chomsky, Noam x| Political and social views.
|
51
|
+
# MarcExtractor.new("600ax").extract(record)
|
52
|
+
# # results in two values sent to Solr:
|
53
|
+
# "Chomsky, Noam Philosophy."
|
54
|
+
# "Chomsky, Noam Political and social views."
|
55
|
+
#
|
56
|
+
# You can turn off this concatenation and leave individual subfields in separate
|
57
|
+
# strings by setting the `separator` option to nil:
|
58
|
+
#
|
59
|
+
# MarcExtractor.new("600ax", :separator => nil).extract(record)
|
60
|
+
# # Results in four values being sent to Solr (or 3 if you de-dup):
|
61
|
+
# "Chomsky, Noam"
|
62
|
+
# "Philosophy."
|
63
|
+
# "Chomsky, Noam"
|
64
|
+
# "Political and social views."
|
65
|
+
#
|
66
|
+
# However, **the default is different for specifications with only a single
|
67
|
+
# subfield**, these are by default kept separated:
|
68
|
+
#
|
69
|
+
# 020 a| 285197145X a| 9782851971456
|
70
|
+
# MarcExtractor.new("020a:020z").extract(record)
|
71
|
+
# # two separate strings sent to Solr:
|
72
|
+
# "285197145X"
|
73
|
+
# "9782851971456"
|
74
|
+
#
|
75
|
+
# For single subfield specifications, you force concatenation by
|
76
|
+
# repeating the subfield specification:
|
77
|
+
#
|
78
|
+
# MarcExtractor.new("020aa:020zz").extract(record)
|
79
|
+
# # would result in a single string sent to solr for
|
80
|
+
# # the single field, by default space-separated:
|
81
|
+
# "285197145X 9782851971456"
|
12
82
|
#
|
13
83
|
# == Note on Performance and MarcExtractor creation and reuse
|
14
84
|
#
|
@@ -37,14 +107,15 @@ module Traject
|
|
37
107
|
class MarcExtractor
|
38
108
|
attr_accessor :options, :spec_hash
|
39
109
|
|
40
|
-
#
|
41
|
-
#
|
110
|
+
# First arg is a specification for extraction of data from a MARC record.
|
111
|
+
# Specification can be given in two forms:
|
42
112
|
#
|
43
|
-
#
|
44
|
-
#
|
45
|
-
#
|
113
|
+
# * a string specification like "008[35]:020a:245abc", see top of class
|
114
|
+
# for examples. A string specification is most typical argument.
|
115
|
+
# * The output of a previous call to MarcExtractor.parse_string_spec(string_spec),
|
116
|
+
# a 'pre-parsed' specification.
|
46
117
|
#
|
47
|
-
# options:
|
118
|
+
# Second arg is options:
|
48
119
|
#
|
49
120
|
# [:separator] default ' ' (space), what to use to separate
|
50
121
|
# subfield values when joining strings
|
@@ -108,57 +179,30 @@ module Traject
|
|
108
179
|
|
109
180
|
# Check to see if a tag is interesting (meaning it may be covered by a spec
|
110
181
|
# and the passed-in options about alternate scripts)
|
111
|
-
|
112
182
|
def interesting_tag?(tag)
|
113
183
|
return @interesting_tags_hash.include?(tag)
|
114
184
|
end
|
115
185
|
|
116
186
|
|
117
|
-
# Converts from a string marc spec like "245abc:700a" to a
|
118
|
-
# to represent the specification.
|
119
|
-
#
|
120
|
-
# a String specification is a string (or array of strings) of form:
|
121
|
-
# {tag}{|indicators|}{subfields} separated by colons
|
122
|
-
# tag is three chars (usually but not neccesarily numeric),
|
123
|
-
# indicators are optional two chars enclosed in pipe ('|') characters,
|
124
|
-
# subfields are optional list of chars (alphanumeric)
|
125
|
-
#
|
126
|
-
# indicator spec must be two chars, but one can be * meaning "don't care".
|
127
|
-
# space to mean 'blank'
|
128
|
-
#
|
129
|
-
# "245|01|abc65:345abc:700|*5|:800"
|
130
|
-
#
|
131
|
-
# Or, for control (fixed) fields (ordinarily fields 001-010), you can include a byte slice specification,
|
132
|
-
# but can NOT include subfield or indicator specifications. Plus can use special tag "LDR" for
|
133
|
-
# the marc leader. (TODO)
|
134
|
-
#
|
135
|
-
# "008[35-37]:LDR[5]"
|
136
|
-
# => bytes 35-37 inclusive of field 008, and byte 5 of the marc leader.
|
187
|
+
# Converts from a string marc spec like "008[35]:245abc:700a" to a hash used internally
|
188
|
+
# to represent the specification. See comments at head of class for
|
189
|
+
# documentation of string specification format.
|
137
190
|
#
|
138
|
-
# Returns a nested hash whose keys are tags and whose value is an array
|
139
|
-
# of hash structures indicating what indicators and subfields (or
|
140
|
-
# byte-offsets for control fields) are needed, e.g.
|
141
191
|
#
|
142
|
-
#
|
192
|
+
# == Return value
|
143
193
|
#
|
144
|
-
#
|
145
|
-
#
|
146
|
-
#
|
147
|
-
# {:subfields => ['a', 'b']}
|
148
|
-
# ]
|
149
|
-
# '110' => [{}] # all subfields, indicators don't matter
|
150
|
-
# '008' => [
|
151
|
-
# {:bytes => (15..17)}
|
152
|
-
# {:bytes => 17}
|
153
|
-
# ]
|
154
|
-
# }
|
194
|
+
# The hash returned is keyed by tag, and has as values an array of 0 or
|
195
|
+
# more MarcExtractor::Spec objects representing the specified extraction
|
196
|
+
# operations for that tag.
|
155
197
|
#
|
156
|
-
#
|
157
|
-
#
|
198
|
+
# It's an array of possibly more than one, because you can specify
|
199
|
+
# multiple extractions on the same tag: for instance "245a:245abc"
|
158
200
|
#
|
159
201
|
# See tests for more examples.
|
160
202
|
def self.parse_string_spec(spec_string)
|
161
|
-
hash
|
203
|
+
# hash defaults to []
|
204
|
+
hash = Hash.new {|hash,key| hash[key] = []}
|
205
|
+
|
162
206
|
spec_strings = spec_string.is_a?(Array) ? spec_string.map{|s| s.split(/\s*:\s*/)}.flatten : spec_string.split(/s*:\s*/)
|
163
207
|
|
164
208
|
spec_strings.each do |part|
|
@@ -166,31 +210,32 @@ module Traject
|
|
166
210
|
# variable field
|
167
211
|
tag, indicators, subfields = $1, $3, $4
|
168
212
|
|
169
|
-
|
170
|
-
spec = {}
|
213
|
+
spec = Spec.new(:tag => tag)
|
171
214
|
|
172
215
|
if subfields and !subfields.empty?
|
173
|
-
spec
|
216
|
+
spec.subfields = subfields.split('')
|
174
217
|
end
|
175
218
|
|
176
219
|
if indicators
|
177
|
-
|
220
|
+
# if specified as '*', leave nil
|
221
|
+
spec.indicator1 = indicators[0] if indicators[0] != "*"
|
222
|
+
spec.indicator2 = indicators[1] if indicators[1] != "*"
|
178
223
|
end
|
224
|
+
|
225
|
+
hash[spec.tag] << spec
|
179
226
|
|
180
|
-
|
181
|
-
|
182
|
-
elsif (part =~ /\A([a-zA-Z0-9]{3})(\[(\d+)(-(\d+))?\])\Z/) # "005[4-5]"
|
227
|
+
elsif (part =~ /\A([a-zA-Z0-9]{3})(\[(\d+)(-(\d+))?\])\Z/) # control field, "005[4-5]"
|
183
228
|
tag, byte1, byte2 = $1, $3, $5
|
184
|
-
|
185
|
-
spec =
|
229
|
+
|
230
|
+
spec = Spec.new(:tag => tag)
|
186
231
|
|
187
232
|
if byte1 && byte2
|
188
|
-
spec
|
233
|
+
spec.bytes = ((byte1.to_i)..(byte2.to_i))
|
189
234
|
elsif byte1
|
190
|
-
spec
|
235
|
+
spec.bytes = byte1.to_i
|
191
236
|
end
|
192
237
|
|
193
|
-
hash[tag] << spec
|
238
|
+
hash[spec.tag] << spec
|
194
239
|
else
|
195
240
|
raise ArgumentError.new("Unrecognized marc extract specification: #{part}")
|
196
241
|
end
|
@@ -206,7 +251,7 @@ module Traject
|
|
206
251
|
|
207
252
|
self.each_matching_line(marc_record) do |field, spec|
|
208
253
|
if control_field?(field)
|
209
|
-
results << (spec
|
254
|
+
results << (spec.bytes ? field.value.byteslice(spec.bytes) : field.value)
|
210
255
|
else
|
211
256
|
results.concat collect_subfields(field, spec)
|
212
257
|
end
|
@@ -217,7 +262,7 @@ module Traject
|
|
217
262
|
|
218
263
|
# Yields a block for every line in source record that matches
|
219
264
|
# spec. First arg to block is MARC::DataField or ControlField, second
|
220
|
-
# is the
|
265
|
+
# is the MarcExtractor::Spec that it matched on. May take account
|
221
266
|
# of options such as :alternate_script
|
222
267
|
#
|
223
268
|
# Third (optional) arg to block is self, the MarcExtractor object, useful for custom
|
@@ -225,19 +270,14 @@ module Traject
|
|
225
270
|
def each_matching_line(marc_record)
|
226
271
|
marc_record.fields(@interesting_tags_hash.keys).each do |field|
|
227
272
|
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
# Make sure it matches indicators too, spec_covering_field
|
234
|
-
# doens't check that.
|
235
|
-
|
236
|
-
specs.each do |spec|
|
237
|
-
if matches_indicators(field, spec)
|
273
|
+
# Make sure it matches indicators too, specs_covering_field
|
274
|
+
# doesn't check that.
|
275
|
+
specs_covering_field(field).each do |spec|
|
276
|
+
if spec.matches_indicators?(field)
|
238
277
|
yield(field, spec, self)
|
239
278
|
end
|
240
279
|
end
|
280
|
+
|
241
281
|
end
|
242
282
|
end
|
243
283
|
|
@@ -245,6 +285,8 @@ module Traject
|
|
245
285
|
# but collects results of block into an array -- flattens any subarrays for you!
|
246
286
|
#
|
247
287
|
# Useful for re-use of this class for custom processing
|
288
|
+
#
|
289
|
+
# yields the MARC Field, the MarcExtractor::Spec object, the MarcExtractor object.
|
248
290
|
def collect_matching_lines(marc_record)
|
249
291
|
results = []
|
250
292
|
self.each_matching_line(marc_record) do |field, spec, extractor|
|
@@ -254,31 +296,36 @@ module Traject
|
|
254
296
|
end
|
255
297
|
|
256
298
|
|
257
|
-
# Pass in a marc data field and a
|
258
|
-
# an ARRAY of one or more strings, subfields extracted
|
299
|
+
# Pass in a marc data field and a Spec object with extraction
|
300
|
+
# instructions, returns an ARRAY of one or more strings, subfields extracted
|
259
301
|
# and processed per spec. Takes account of options such
|
260
302
|
# as :separator
|
261
303
|
#
|
262
304
|
# Always returns array, sometimes empty array.
|
263
305
|
def collect_subfields(field, spec)
|
264
306
|
subfields = field.subfields.collect do |subfield|
|
265
|
-
subfield.value if spec
|
307
|
+
subfield.value if spec.includes_subfield_code?(subfield.code)
|
266
308
|
end.compact
|
267
309
|
|
268
310
|
return subfields if subfields.empty? # empty array, just return it.
|
269
311
|
|
270
|
-
|
312
|
+
if options[:separator] && spec.joinable?
|
313
|
+
subfields = [subfields.join(options[:separator])]
|
314
|
+
end
|
315
|
+
|
316
|
+
return subfields
|
271
317
|
end
|
272
318
|
|
273
319
|
|
274
|
-
|
320
|
+
|
321
|
+
# Find Spec objects, if any, covering extraction from this field.
|
322
|
+
# Returns an array of 0 or more MarcExtractor::Spec objects
|
275
323
|
#
|
276
324
|
# When given an 880, will return the spec (if any) for the linked tag iff
|
277
325
|
# we have a $6 and we want the alternate script.
|
278
326
|
#
|
279
|
-
# Returns
|
280
|
-
|
281
|
-
def spec_covering_field(field)
|
327
|
+
# Returns an empty array in case of no matching extraction specs.
|
328
|
+
def specs_covering_field(field)
|
282
329
|
tag = field.tag
|
283
330
|
|
284
331
|
# Short-circuit the uninteresting stuff
|
@@ -301,13 +348,60 @@ module Traject
|
|
301
348
|
# define #control_field? on both ControlField and DataField?
|
302
349
|
return field.kind_of? MARC::ControlField
|
303
350
|
end
|
351
|
+
|
304
352
|
|
305
|
-
# a
|
306
|
-
|
307
|
-
|
353
|
+
# Represents a single specification for extracting data
|
354
|
+
# from a marc field, like "600abc" or "600|1*|x".
|
355
|
+
#
|
356
|
+
# Includes the tag for reference, although this is redundant and not actually used
|
357
|
+
# in logic, since the tag is also implicit in the overall spec_hash
|
358
|
+
# with tag => [spec1, spec2]
|
359
|
+
class Spec
  # tag        - the MARC tag this spec applies to, e.g. "245"
  # subfields  - array of one-character subfield codes, or nil for "all subfields"
  # indicator1 - required first indicator, or nil for "don't care"
  # indicator2 - required second indicator, or nil for "don't care"
  # bytes      - Integer offset or Range of offsets for a control-field slice, or nil
  attr_accessor :tag, :subfields, :indicator1, :indicator2, :bytes

  # Build a Spec from a hash of attribute name => value, e.g.
  #
  #   Spec.new(:tag => "245", :subfields => ["a", "b"])
  #
  # Each key is assigned through the matching attribute writer, so an
  # unknown key raises NoMethodError.
  def initialize(hash = {})
    hash.each_pair do |key, value|
      self.send("#{key}=", value)
    end
  end


  # Should the extracted subfields be joined, if we have a separator?
  # * '630' no subfields specified => join all subfields
  # * '630abc' multiple subfields specified => join all subfields
  # * '633a' one subfield => do not join, return one value for each $a in the field
  # * '633aa' one subfield, doubled => do join after all, will return a single string joining all the values of all the $a's.
  #
  # Last case is handled implicitly at the moment when subfields == ['a', 'a']
  def joinable?
    (self.subfields.nil? || self.subfields.size != 1)
  end

  # Pass in a MARC field: do its indicators match the indicators
  # in this spec? nil indicators in the spec mean we don't care, everything
  # matches.
  def matches_indicators?(field)
    return (self.indicator1.nil? || self.indicator1 == field.indicator1) &&
      (self.indicator2.nil? || self.indicator2 == field.indicator2)
  end

  # Pass in a string subfield code like 'a'; does this
  # spec include it?
  def includes_subfield_code?(code)
    # subfields nil means include them all
    self.subfields.nil? || self.subfields.include?(code)
  end

  # Two Specs are equal when every attribute matches. Anything that is
  # not a Spec is never equal.
  def ==(spec)
    return false unless spec.kind_of?(Spec)

    return (self.tag == spec.tag) &&
      (self.subfields == spec.subfields) &&
      (self.indicator1 == spec.indicator1) &&
      # compare indicator2 against indicator2 (was mistakenly indicator1)
      (self.indicator2 == spec.indicator2) &&
      (self.bytes == spec.bytes)
  end
end
|
405
|
+
|
312
406
|
end
|
313
407
|
end
|
@@ -7,7 +7,7 @@ module Traject
|
|
7
7
|
# A TranslationMap is basically just something that has a hash-like #[]
|
8
8
|
# method to map from input strings to output strings:
|
9
9
|
#
|
10
|
-
#
|
10
|
+
# translation_map["some_input"] #=> some_output
|
11
11
|
#
|
12
12
|
# Input is assumed to always be string, output is either string
|
13
13
|
# or array of strings.
|
@@ -17,10 +17,10 @@ module Traject
|
|
17
17
|
# yaml, or java .properties. (Limited basic .properties, don't try any fancy escaping please,
|
18
18
|
# no = or : in key names, no split lines.)
|
19
19
|
#
|
20
|
-
#
|
20
|
+
# TranslationMap.new("dir/some_file")
|
21
21
|
#
|
22
|
-
# Will look
|
23
|
-
#
|
22
|
+
# Will look for a file named `some_file.rb` or `some_file.yaml` or `some_file.properties`,
|
23
|
+
# somewhere in the ruby $LOAD_PATH in a `/translation_maps` subdir.
|
24
24
|
# * Looks for "/translation_maps" subdir in load paths, so
|
25
25
|
# for instance you can have a gem that keeps translation maps
|
26
26
|
# in ./lib/translation_maps, and it Just Works.
|
@@ -47,12 +47,12 @@ module Traject
|
|
47
47
|
# Or, when calling TranslationMap.new(), you can pass in options over-riding special
|
48
48
|
# key too:
|
49
49
|
#
|
50
|
-
#
|
51
|
-
#
|
50
|
+
# TranslationMap.new("something", :default => "foo")
|
51
|
+
# TranslationMap.new("something", :default => :passthrough)
|
52
52
|
#
|
53
53
|
# == Output: String or array of strings
|
54
54
|
#
|
55
|
-
# The output can be a string or an array of strings, or nil. It should not be anything
|
55
|
+
# The output can be a string or an array of strings, or nil. It should not be anything else.
|
56
56
|
# When used with the #translate_array! method, one string can be replaced by multiple values
|
57
57
|
# (array of strings) or removed (nil)
|
58
58
|
#
|
data/lib/traject/version.rb
CHANGED
@@ -36,7 +36,8 @@ describe "Traject::Macros::Marc21Semantics" do
|
|
36
36
|
end
|
37
37
|
output = @indexer.map_record(@record)
|
38
38
|
|
39
|
-
|
39
|
+
# trims punctuation too
|
40
|
+
assert_equal ["Big bands"], output["series_facet"]
|
40
41
|
end
|
41
42
|
|
42
43
|
describe "marc_sortable_author" do
|
@@ -51,6 +51,27 @@ describe "Traject::Indexer.to_field" do
|
|
51
51
|
flunk("Should only fail with a NamingError")
|
52
52
|
end
|
53
53
|
end
|
54
|
-
|
55
54
|
|
56
|
-
|
55
|
+
# Just verifying this is how it works
|
56
|
+
it "doesn't allow you to just wholesale assignment to the accumulator" do
|
57
|
+
@indexer.to_field('foo') do |rec, acc|
|
58
|
+
acc = ['hello']
|
59
|
+
end
|
60
|
+
output = @indexer.map_record('never looked at')
|
61
|
+
assert_equal nil, output['foo']
|
62
|
+
end
|
63
|
+
|
64
|
+
it "allows use of accumulator.replace" do
|
65
|
+
@indexer.to_field('foo') do |rec, acc|
|
66
|
+
acc.replace ['hello']
|
67
|
+
end
|
68
|
+
output = @indexer.map_record('never looked at')
|
69
|
+
assert_equal ['hello'], output['foo']
|
70
|
+
end
|
71
|
+
|
72
|
+
|
73
|
+
end
|
74
|
+
|
75
|
+
|
76
|
+
|
77
|
+
|
data/test/marc_extractor_test.rb
CHANGED
@@ -13,15 +13,12 @@ describe "Traject::MarcExtractor" do
|
|
13
13
|
assert_kind_of Hash, parsed
|
14
14
|
assert_equal 1, parsed.keys.length
|
15
15
|
spec = parsed['245'].first
|
16
|
-
assert_kind_of
|
17
|
-
|
18
|
-
assert_kind_of Array, spec[:indicators]
|
19
|
-
assert_equal 2, spec[:indicators].length
|
20
|
-
assert_equal "1", spec[:indicators][0]
|
21
|
-
assert_nil spec[:indicators][1]
|
22
|
-
|
23
|
-
assert_kind_of Array, spec[:subfields]
|
16
|
+
assert_kind_of Traject::MarcExtractor::Spec, spec
|
24
17
|
|
18
|
+
assert_equal "1", spec.indicator1
|
19
|
+
assert_nil spec.indicator2
|
20
|
+
|
21
|
+
assert_kind_of Array, spec.subfields
|
25
22
|
end
|
26
23
|
|
27
24
|
it "parses a mixed bag" do
|
@@ -34,25 +31,28 @@ describe "Traject::MarcExtractor" do
|
|
34
31
|
|
35
32
|
#245abcde
|
36
33
|
assert spec245
|
37
|
-
assert_nil spec245
|
38
|
-
|
34
|
+
assert_nil spec245.indicator1
|
35
|
+
assert_nil spec245.indicator2
|
36
|
+
assert_equal %w{a b c d e}, spec245.subfields
|
39
37
|
|
40
38
|
#810
|
41
39
|
assert spec810
|
42
|
-
assert_nil spec810
|
43
|
-
assert_nil spec810
|
40
|
+
assert_nil spec810.indicator1
|
41
|
+
assert_nil spec810.indicator2
|
42
|
+
assert_nil spec810.subfields, "No subfields"
|
44
43
|
|
45
44
|
#700-*4bcd
|
46
45
|
assert spec700
|
47
|
-
|
48
|
-
assert_equal
|
46
|
+
assert_nil spec700.indicator1
|
47
|
+
assert_equal "4", spec700.indicator2
|
48
|
+
assert_equal %w{b c d}, spec700.subfields
|
49
49
|
end
|
50
50
|
|
51
51
|
it "parses fixed field byte offsets" do
|
52
52
|
parsed = Traject::MarcExtractor.parse_string_spec("005[5]:008[7-10]")
|
53
53
|
|
54
|
-
assert_equal 5, parsed["005"].first
|
55
|
-
assert_equal 7..10, parsed["008"].first
|
54
|
+
assert_equal 5, parsed["005"].first.bytes
|
55
|
+
assert_equal 7..10, parsed["008"].first.bytes
|
56
56
|
end
|
57
57
|
|
58
58
|
it "allows arrays of specs" do
|
@@ -79,7 +79,7 @@ describe "Traject::MarcExtractor" do
|
|
79
79
|
|
80
80
|
# Mostly an internal method, not necessarily API, but
|
81
81
|
# an important one, so we unit test some parts of it.
|
82
|
-
describe "#
|
82
|
+
describe "#specs_covering_field" do
|
83
83
|
describe "for alternate script tags" do
|
84
84
|
before do
|
85
85
|
@record = MARC::Reader.new(support_file_path "hebrew880s.marc").to_a.first
|
@@ -102,17 +102,17 @@ describe "Traject::MarcExtractor" do
|
|
102
102
|
assert ! @a880_100.nil?, "Found an 880-100 to test"
|
103
103
|
end
|
104
104
|
it "finds spec for relevant 880" do
|
105
|
-
assert_equal( [
|
106
|
-
|
105
|
+
assert_equal( [Traject::MarcExtractor::Spec.new(:tag => "245")], @extractor.specs_covering_field(@a880_245) )
|
106
|
+
assert_equal [], @extractor.specs_covering_field(@a880_100)
|
107
107
|
end
|
108
108
|
it "does not find spec for 880 if disabled" do
|
109
109
|
@extractor = Traject::MarcExtractor.new("245", :alternate_script => false)
|
110
|
-
assert_nil @extractor.
|
110
|
+
assert_nil @extractor.specs_covering_field(@a880_245)
|
111
111
|
end
|
112
112
|
it "finds only 880 if so configured" do
|
113
113
|
@extractor = Traject::MarcExtractor.new("245", :alternate_script => :only)
|
114
|
-
assert_nil @extractor.
|
115
|
-
assert_equal([
|
114
|
+
assert_nil @extractor.specs_covering_field(@a245)
|
115
|
+
assert_equal([Traject::MarcExtractor::Spec.new(:tag => "245")], @extractor.specs_covering_field(@a880_245))
|
116
116
|
end
|
117
117
|
end
|
118
118
|
end
|
@@ -260,7 +260,7 @@ describe "Traject::MarcExtractor" do
|
|
260
260
|
@extractor.each_matching_line(@record) do |field, spec|
|
261
261
|
called = true
|
262
262
|
assert_kind_of MARC::DataField, field
|
263
|
-
assert_kind_of
|
263
|
+
assert_kind_of Traject::MarcExtractor::Spec, spec
|
264
264
|
end
|
265
265
|
assert called, "calls block"
|
266
266
|
end
|
@@ -269,7 +269,7 @@ describe "Traject::MarcExtractor" do
|
|
269
269
|
@extractor.each_matching_line(@record) do |field, spec, extractor|
|
270
270
|
called = true
|
271
271
|
assert_kind_of MARC::DataField, field
|
272
|
-
assert_kind_of
|
272
|
+
assert_kind_of Traject::MarcExtractor::Spec, spec
|
273
273
|
assert_kind_of Traject::MarcExtractor, extractor
|
274
274
|
assert_same @extractor, extractor
|
275
275
|
end
|
@@ -292,9 +292,11 @@ describe "Traject::MarcExtractor" do
|
|
292
292
|
|
293
293
|
describe "MarcExtractor.cached" do
|
294
294
|
it "creates" do
|
295
|
-
|
296
|
-
|
297
|
-
|
295
|
+
extractor = Traject::MarcExtractor.cached("245abc", :separator => nil)
|
296
|
+
spec_hash = extractor.spec_hash
|
297
|
+
|
298
|
+
assert extractor.options[:separator].nil?, "extractor options[:separator] is nil"
|
299
|
+
assert_equal({"245"=>[Traject::MarcExtractor::Spec.new(:tag => "245", :subfields=>["a", "b", "c"])]}, spec_hash)
|
298
300
|
end
|
299
301
|
it "caches" do
|
300
302
|
ext1 = Traject::MarcExtractor.cached("245abc", :separator => nil)
|
@@ -326,11 +328,45 @@ describe "Traject::MarcExtractor" do
|
|
326
328
|
|
327
329
|
|
328
330
|
|
329
|
-
it "
|
330
|
-
|
331
|
-
|
332
|
-
|
331
|
+
it "provides multiple values for repeated subfields with single specified subfield" do
|
332
|
+
ex = Traject::MarcExtractor.new("245a")
|
333
|
+
f = @record.fields('245').first
|
334
|
+
title_a = f['a']
|
335
|
+
f.append(MARC::Subfield.new('a', title_a))
|
336
|
+
results = ex.extract(@record)
|
337
|
+
assert_equal [title_a, title_a], results
|
338
|
+
end
|
339
|
+
|
340
|
+
it "concats single subfield spec when given as eg 245aa" do
|
341
|
+
ex = Traject::MarcExtractor.new("245aa")
|
342
|
+
f = @record.fields('245').first
|
343
|
+
title_a = f['a']
|
344
|
+
f.append(MARC::Subfield.new('a', title_a))
|
345
|
+
results = ex.extract(@record)
|
346
|
+
assert_equal ["#{title_a} #{title_a}"], results
|
347
|
+
end
|
348
|
+
|
349
|
+
it "provides single value for repeated subfields with multiple specified subfields" do
|
350
|
+
ex = Traject::MarcExtractor.new("245ab")
|
351
|
+
f = @record.fields('245').first
|
352
|
+
title_a = f['a']
|
353
|
+
title_b = f['b']
|
354
|
+
f.append(MARC::Subfield.new('a', title_a))
|
355
|
+
results = ex.extract(@record)
|
356
|
+
assert_equal ["#{title_a} #{title_b} #{title_a}"], results
|
357
|
+
|
358
|
+
end
|
359
|
+
|
360
|
+
it "provides single value for repeated subfields with no specified subfield" do
|
361
|
+
ex = Traject::MarcExtractor.new("245")
|
362
|
+
f = @record.fields('245').first
|
363
|
+
title_a = f['a']
|
364
|
+
f.append(MARC::Subfield.new('a', title_a))
|
365
|
+
results = ex.extract(@record)
|
366
|
+
assert_equal 1, results.size
|
333
367
|
end
|
368
|
+
|
369
|
+
|
334
370
|
|
335
371
|
|
336
372
|
it "allows repeated tags for a control field" do
|
@@ -352,6 +388,17 @@ describe "Traject::MarcExtractor" do
|
|
352
388
|
end
|
353
389
|
|
354
390
|
end
|
391
|
+
|
392
|
+
describe "MarcExtractor::Spec" do
|
393
|
+
describe "==" do
|
394
|
+
it "equals when equal" do
|
395
|
+
assert_equal Traject::MarcExtractor::Spec.new(:subfields => %w{a b c}), Traject::MarcExtractor::Spec.new(:subfields => %w{a b c})
|
396
|
+
end
|
397
|
+
it "does not equal when not" do
|
398
|
+
refute_equal Traject::MarcExtractor::Spec.new(:subfields => %w{a b c}), Traject::MarcExtractor::Spec.new(:subfields => %w{a b c}, :indicator2 => '1')
|
399
|
+
end
|
400
|
+
end
|
401
|
+
end
|
355
402
|
|
356
403
|
|
357
|
-
end
|
404
|
+
end
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: traject
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.
|
5
|
+
version: 0.16.0
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Jonathan Rochkind
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2013-09-
|
13
|
+
date: 2013-09-30 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: marc
|