traject 0.15.0 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +27 -24
- data/doc/settings.md +8 -2
- data/lib/traject/command_line.rb +1 -6
- data/lib/traject/macros/marc21_semantics.rb +12 -5
- data/lib/traject/marc_extractor.rb +178 -84
- data/lib/traject/translation_map.rb +7 -7
- data/lib/traject/version.rb +1 -1
- data/test/indexer/macros_marc21_semantics_test.rb +2 -1
- data/test/indexer/to_field_test.rb +23 -2
- data/test/marc_extractor_test.rb +79 -32
- metadata +2 -2
data/README.md
CHANGED
@@ -49,8 +49,9 @@ The traject command-line utility requires you to supply it with a configuration
|
|
49
49
|
|
50
50
|
Configuration files are actually just ruby -- so by convention they end in `.rb`.
|
51
51
|
|
52
|
-
|
53
|
-
of ruby
|
52
|
+
We hope you can write basic useful configuration files without being a ruby expert,
|
53
|
+
they give you a subset of ruby to work with. But the full power
|
54
|
+
of ruby is available to you if needed.
|
54
55
|
|
55
56
|
**rubyist tip**: Technically, config files are executed with `instance_eval` in a Traject::Indexer instance, so the special commands you see are just methods on Traject::Indexer (or mixed into it). But you can
|
56
57
|
call ordinary ruby `require` in config files, etc., too, to load
|
@@ -84,9 +85,6 @@ settings do
|
|
84
85
|
# you have to tell it.
|
85
86
|
provide "marc_source.type", "xml"
|
86
87
|
|
87
|
-
# settings can be set on command line instead of
|
88
|
-
# config file too.
|
89
|
-
|
90
88
|
# various others...
|
91
89
|
provide "solrj_writer.commit_on_close", "true"
|
92
90
|
|
@@ -163,39 +161,44 @@ Other examples of the specification string, which can include multiple tag menti
|
|
163
161
|
# "*" is a wildcard in indicator spec. So
|
164
162
|
# 856 with first indicator '0', subfield u.
|
165
163
|
to_field "email_addresses", extract_marc("856|0*|u")
|
166
|
-
|
167
|
-
# Instead of joining subfields from the same field
|
168
|
-
# into one string, joined by spaces, leave them
|
169
|
-
# each in separate strings:
|
170
|
-
to_field "isbn", extract_marc("020az", :separator => nil)
|
171
|
-
|
172
|
-
# Same thing, but more explicit
|
173
|
-
to_field "isbn", extract_marc("020a:020z")
|
174
164
|
|
175
|
-
|
176
|
-
#
|
177
|
-
|
178
|
-
to_field 'language008', extract_marc('008[35-37]', :deduplicate=>true)
|
165
|
+
# Can list tag twice with different field combinations
|
166
|
+
# to extract separately
|
167
|
+
to_field "title", extract_marc("245a:245abcde")
|
179
168
|
~~~
|
180
169
|
|
181
170
|
The `extract_marc` function *by default* includes any linked
|
182
171
|
MARC `880` fields with alternate-script versions. Another reason
|
183
172
|
to use the `:first` option if you really only want one.
|
184
173
|
|
174
|
+
By default, specifications with multiple subfields (like "240abc") will produce
|
175
|
+
one single string of output for each matching field. Specifications
|
176
|
+
with single subfields (like "020a") will split subfields and produce
|
177
|
+
an output string for each matching subfield.
|
178
|
+
|
185
179
|
For MARC control (aka 'fixed') fields, you can use square
|
186
180
|
brackets to take a slice by byte offset.
|
187
181
|
|
182
|
+
~~~ruby
|
188
183
|
to_field "language_code", extract_marc("008[35-37]")
|
184
|
+
~~~
|
185
|
+
|
186
|
+
For more information on extraction specifications, see
|
187
|
+
the [MarcExtractor class](./lib/traject/marc_extractor.rb) ([rdoc](http://rdoc.info/gems/traject/Traject/MarcExtractor)).
|
189
188
|
|
190
189
|
`extract_marc` also supports `translation maps` similar
|
191
|
-
to SolrMarc's. There
|
192
|
-
and you can
|
190
|
+
to SolrMarc's. There are some translation maps provided by traject,
|
191
|
+
and you can also define your own. Translation maps can be supplied
|
193
192
|
in yaml or ruby. Translation maps are especially useful
|
194
|
-
for mapping form MARC codes to user-displayable strings
|
193
|
+
for mapping from MARC codes to user-displayable strings:
|
195
194
|
|
195
|
+
~~~ruby
|
196
196
|
# "translation_map" will be passed to Traject::TranslationMap.new
|
197
197
|
# and the created map used to translate all values
|
198
198
|
to_field "language", extract_marc("008[35-37]:041a:041d", :translation_map => "marc_language_code")
|
199
|
+
~~~
|
200
|
+
|
201
|
+
See [Traject::TranslationMap](./lib/traject/translation_map.rb) ([rdoc](http://rdoc.info/gems/traject/Traject/TranslationMap)) for more info on translation mapping.
|
199
202
|
|
200
203
|
#### Direct indexing logic vs. Macros
|
201
204
|
|
@@ -348,11 +351,11 @@ This will over-ride any settings set with `provide` in conf files.
|
|
348
351
|
There are some built-in command-line option shortcuts for useful
|
349
352
|
settings:
|
350
353
|
|
351
|
-
Use
|
352
|
-
|
353
|
-
checking.
|
354
|
+
Use `--debug-mode` to output in a human-readable format, instead of sending to solr.
|
355
|
+
Also turns on debug logging and restricts processing to single-threaded. Useful for
|
356
|
+
debugging or sanity checking.
|
354
357
|
|
355
|
-
traject -
|
358
|
+
traject --debug-mode -c conf_file.rb marc_file
|
356
359
|
|
357
360
|
Use `-u` as a shortcut for `-s solr.url=X`
|
358
361
|
|
data/doc/settings.md
CHANGED
@@ -4,7 +4,8 @@ Traject settings are a flat list of key/value pairs -- a single
|
|
4
4
|
Hash, not nested. Keys are always strings, and dots (".") can be
|
5
5
|
used for grouping and namespacing.
|
6
6
|
|
7
|
-
Values are usually strings, but occasionally something else.
|
7
|
+
Values are usually strings, but occasionally something else. String values can be easily
|
8
|
+
set via the command line.
|
8
9
|
|
9
10
|
Settings can be set in configuration files, usually like:
|
10
11
|
|
@@ -17,6 +18,11 @@ end
|
|
17
18
|
or on the command line: `-s key=value`. There are also some command line shortcuts
|
18
19
|
for commonly used settings, see `traject -h`.
|
19
20
|
|
21
|
+
`provide` will only set the key if it was previously unset, so first time to set 'wins'. And command-line
|
22
|
+
settings are applied first of all. It's recommended you use `provide`.
|
23
|
+
|
24
|
+
`store` is also available, and forces setting of the new value overriding any previous value set.
|
25
|
+
|
20
26
|
## Known settings
|
21
27
|
|
22
28
|
* `debug_ascii_progress`: true/'true' to print ascii characters to STDERR indicating progress. Note,
|
@@ -101,4 +107,4 @@ for commonly used settings, see `traject -h`.
|
|
101
107
|
Note that processing_thread_pool threads can end up submitting
|
102
108
|
to solr too, if solrj_writer.thread_pool is full.
|
103
109
|
|
104
|
-
* `writer_class_name`: a Traject Writer class, used by indexer to send processed dictionaries off. Default Traject::SolrJWriter, also available Traject::JsonWriter. See Traject::Indexer for more info. Command line shortcut `-w`
|
110
|
+
* `writer_class_name`: a Traject Writer class, used by indexer to send processed dictionaries off. Default Traject::SolrJWriter, also available Traject::JsonWriter. See Traject::Indexer for more info. Command line shortcut `-w`
|
data/lib/traject/command_line.rb
CHANGED
@@ -268,10 +268,6 @@ module Traject
|
|
268
268
|
if options[:solr]
|
269
269
|
settings["solr.url"] = options[:solr]
|
270
270
|
end
|
271
|
-
if options[:j]
|
272
|
-
settings["writer_class_name"] = "JsonWriter"
|
273
|
-
settings["json_writer.pretty_print"] = "true"
|
274
|
-
end
|
275
271
|
if options[:marc_type]
|
276
272
|
settings["marc_source.type"] = options[:marc_type]
|
277
273
|
end
|
@@ -296,12 +292,11 @@ module Traject
|
|
296
292
|
on :o, "output_file", "output file for Writer classes that write to files", :argument => true
|
297
293
|
on :w, :writer, "Set writer class, shortcut for -s writer_class_name=", :argument => true
|
298
294
|
on :u, :solr, "Set solr url, shortcut for -s solr.url=", :argument => true
|
299
|
-
on :j, "output as pretty printed json, shortcut for -s writer_class_name=JsonWriter -s json_writer.pretty_print=true"
|
300
295
|
on :t, :marc_type, "xml, json or binary. shortcut for -s marc_source.type=", :argument => true
|
301
296
|
on :I, "load_path", "append paths to ruby $LOAD_PATH", :argument => true, :as => Array, :delimiter => ":"
|
302
297
|
on :G, "Gemfile", "run with bundler and optionally specified Gemfile", :argument => :optional, :default => nil
|
303
298
|
|
304
|
-
on :x, "command", "alternate traject command: process (default); marcout", :argument => true, :default => "process"
|
299
|
+
on :x, "command", "alternate traject command: process (default); marcout; commit", :argument => true, :default => "process"
|
305
300
|
|
306
301
|
on "stdin", "read input from stdin"
|
307
302
|
on "debug-mode", "debug logging, single threaded, output human readable hashes"
|
@@ -144,7 +144,7 @@ module Traject::Macros
|
|
144
144
|
# return the filing version (i.e., the string without the
|
145
145
|
# non-filing characters)
|
146
146
|
|
147
|
-
def self.filing_version(field, str,
|
147
|
+
def self.filing_version(field, str, spec)
|
148
148
|
# Control fields don't have non-filing characters
|
149
149
|
return str if field.kind_of? MARC::ControlField
|
150
150
|
|
@@ -155,7 +155,7 @@ module Traject::Macros
|
|
155
155
|
# The spechash must either (a) have no subfields specified, or
|
156
156
|
# (b) include the first subfield in the record
|
157
157
|
|
158
|
-
subs =
|
158
|
+
subs = spec.subfields
|
159
159
|
return str unless subs && subs.include?(field.subfields[0].code)
|
160
160
|
|
161
161
|
# OK. If we got this far we actually need to strip characters off the string
|
@@ -183,7 +183,7 @@ module Traject::Macros
|
|
183
183
|
lambda do |record, accumulator|
|
184
184
|
codes = extractor.collect_matching_lines(record) do |field, spec, extractor|
|
185
185
|
if extractor.control_field?(field)
|
186
|
-
(spec
|
186
|
+
(spec.bytes ? field.value.byteslice(spec.bytes) : field.value)
|
187
187
|
else
|
188
188
|
extractor.collect_subfields(field, spec).collect do |value|
|
189
189
|
# sometimes multiple language codes are jammed together in one subfield, and
|
@@ -212,9 +212,16 @@ module Traject::Macros
|
|
212
212
|
extractor = MarcExtractor.new(spec)
|
213
213
|
|
214
214
|
lambda do |record, accumulator|
|
215
|
-
|
215
|
+
values = extractor.collect_matching_lines(record) do |field, spec, extractor|
|
216
216
|
extractor.collect_subfields(field, spec) unless (field.tag == "490" && field.indicator1 == "1")
|
217
|
-
end.compact
|
217
|
+
end.compact
|
218
|
+
|
219
|
+
# trim punctuation
|
220
|
+
values.collect! do |s|
|
221
|
+
Marc21.trim_punctuation(s)
|
222
|
+
end
|
223
|
+
|
224
|
+
accumulator.concat( values )
|
218
225
|
end
|
219
226
|
end
|
220
227
|
|
@@ -6,9 +6,79 @@ module Traject
|
|
6
6
|
#
|
7
7
|
# Examples:
|
8
8
|
#
|
9
|
-
# array_of_stuff
|
10
|
-
# values
|
9
|
+
# array_of_stuff = MarcExtractor.new("001:245abc:700a").extract(marc_record)
|
10
|
+
# values = MarcExtractor.new("245a:245abc").extract(marc_record)
|
11
|
+
# separated_values = MarcExtractor.new("020a:020z").extract(marc_record)
|
12
|
+
# bytes = MarcExtractor.new("008[35-37]").extract(marc_record)
|
11
13
|
#
|
14
|
+
# == String extraction specifications
|
15
|
+
#
|
16
|
+
# Extraction directions are supplied in strings, usually as the first
|
17
|
+
# parameter to MarcExtractor.new or MarcExtractor.cached. These specifications
|
18
|
+
# are also the first parameter to the #extract_marc macro.
|
19
|
+
#
|
20
|
+
# A String specification is a string (or array of strings) which consists
|
21
|
+
# of one or more Data and Control Field Specifications separated by colons.
|
22
|
+
#
|
23
|
+
# A Data Field Specification is of the form:
|
24
|
+
# `{tag}{|indicators|}{subfields}`
|
25
|
+
# * {tag} is three chars (usually but not necessarily numeric)
|
26
|
+
# * {indicators} are optional two chars enclosed in pipe ('|') characters,
|
27
|
+
# * {subfields} are optional list of chars (alphanumeric)
|
28
|
+
#
|
29
|
+
# indicator spec must be two chars, but one can be * meaning "don't care".
|
30
|
+
# space to mean 'blank'
|
31
|
+
#
|
32
|
+
# "245|01|abc65:345abc:700|*5|:800"
|
33
|
+
#
|
34
|
+
# A Control Field Specification is used with tags for control (fixed) fields (ordinarily fields 001-010)
|
35
|
+
# and includes a tag and a byte slice specification.
|
36
|
+
#
|
37
|
+
# "008[35-37]:007[5]"
|
38
|
+
# => bytes 35-37 inclusive of any field 008, and byte 5 of any field 007 (TODO: Should we support
|
39
|
+
# "LDR" as a pseudo-tag to take byte slices of leader?)
|
40
|
+
#
|
41
|
+
# * subfields and indicators can only be provided for marc data/variable fields
|
42
|
+
# * byte slice can only be provided for marc control fields (generally tags less than 010)
|
43
|
+
#
|
44
|
+
# == Subfield concatenation
|
45
|
+
#
|
46
|
+
# Normally, for a spec including multiple subfield codes, multiple subfields
|
47
|
+
# from the same MARC field will be concatenated into one string separated by spaces:
|
48
|
+
#
|
49
|
+
# 600 a| Chomsky, Noam x| Philosophy.
|
50
|
+
# 600 a| Chomsky, Noam x| Political and social views.
|
51
|
+
# MarcExtractor.new("600ax").extract(record)
|
52
|
+
# # results in two values sent to Solr:
|
53
|
+
# "Chomsky, Noam Philosophy."
|
54
|
+
# "Chomsky, Noam Political and social views."
|
55
|
+
#
|
56
|
+
# You can turn off this concatenation and leave individual subfields in separate
|
57
|
+
# strings by setting the `separator` option to nil:
|
58
|
+
#
|
59
|
+
# MarcExtractor.new("600ax", :separator => nil).extract(record)
|
60
|
+
# # Results in four values being sent to Solr (or 3 if you de-dup):
|
61
|
+
# "Chomsky, Noam"
|
62
|
+
# "Philosophy."
|
63
|
+
# "Chomsky, Noam"
|
64
|
+
# "Political and social views."
|
65
|
+
#
|
66
|
+
# However, **the default is different for specifications with only a single
|
67
|
+
# subfield**, these are by default kept separated:
|
68
|
+
#
|
69
|
+
# 020 a| 285197145X a| 9782851971456
|
70
|
+
# MarcExtractor.new("020a:020z").extract(record)
|
71
|
+
# # two separate strings sent to Solr:
|
72
|
+
# "285197145X"
|
73
|
+
# "9782851971456"
|
74
|
+
#
|
75
|
+
# For single subfield specifications, you force concatenation by
|
76
|
+
# repeating the subfield specification:
|
77
|
+
#
|
78
|
+
# MarcExtractor.new("020aa:020zz").extract(record)
|
79
|
+
# # would result in a single string sent to solr for
|
80
|
+
# # the single field, by default space-separated:
|
81
|
+
# "285197145X 9782851971456"
|
12
82
|
#
|
13
83
|
# == Note on Performance and MarcExtractor creation and reuse
|
14
84
|
#
|
@@ -37,14 +107,15 @@ module Traject
|
|
37
107
|
class MarcExtractor
|
38
108
|
attr_accessor :options, :spec_hash
|
39
109
|
|
40
|
-
#
|
41
|
-
#
|
110
|
+
# First arg is a specification for extraction of data from a MARC record.
|
111
|
+
# Specification can be given in two forms:
|
42
112
|
#
|
43
|
-
#
|
44
|
-
#
|
45
|
-
#
|
113
|
+
# * a string specification like "008[35]:020a:245abc", see top of class
|
114
|
+
# for examples. A string specification is the most typical argument.
|
115
|
+
# * The output of a previous call to MarcExtractor.parse_string_spec(string_spec),
|
116
|
+
# a 'pre-parsed' specification.
|
46
117
|
#
|
47
|
-
# options:
|
118
|
+
# Second arg is options:
|
48
119
|
#
|
49
120
|
# [:separator] default ' ' (space), what to use to separate
|
50
121
|
# subfield values when joining strings
|
@@ -108,57 +179,30 @@ module Traject
|
|
108
179
|
|
109
180
|
# Check to see if a tag is interesting (meaning it may be covered by a spec
|
110
181
|
# and the passed-in options about alternate scripts)
|
111
|
-
|
112
182
|
def interesting_tag?(tag)
|
113
183
|
return @interesting_tags_hash.include?(tag)
|
114
184
|
end
|
115
185
|
|
116
186
|
|
117
|
-
# Converts from a string marc spec like "245abc:700a" to a
|
118
|
-
# to represent the specification.
|
119
|
-
#
|
120
|
-
# a String specification is a string (or array of strings) of form:
|
121
|
-
# {tag}{|indicators|}{subfields} separated by colons
|
122
|
-
# tag is three chars (usually but not neccesarily numeric),
|
123
|
-
# indicators are optional two chars enclosed in pipe ('|') characters,
|
124
|
-
# subfields are optional list of chars (alphanumeric)
|
125
|
-
#
|
126
|
-
# indicator spec must be two chars, but one can be * meaning "don't care".
|
127
|
-
# space to mean 'blank'
|
128
|
-
#
|
129
|
-
# "245|01|abc65:345abc:700|*5|:800"
|
130
|
-
#
|
131
|
-
# Or, for control (fixed) fields (ordinarily fields 001-010), you can include a byte slice specification,
|
132
|
-
# but can NOT include subfield or indicator specifications. Plus can use special tag "LDR" for
|
133
|
-
# the marc leader. (TODO)
|
134
|
-
#
|
135
|
-
# "008[35-37]:LDR[5]"
|
136
|
-
# => bytes 35-37 inclusive of field 008, and byte 5 of the marc leader.
|
187
|
+
# Converts from a string marc spec like "008[35]:245abc:700a" to a hash used internally
|
188
|
+
# to represent the specification. See comments at head of class for
|
189
|
+
# documentation of string specification format.
|
137
190
|
#
|
138
|
-
# Returns a nested hash whose keys are tags and whose value is an array
|
139
|
-
# of hash structures indicating what indicators and subfields (or
|
140
|
-
# byte-offsets for control fields) are needed, e.g.
|
141
191
|
#
|
142
|
-
#
|
192
|
+
# == Return value
|
143
193
|
#
|
144
|
-
#
|
145
|
-
#
|
146
|
-
#
|
147
|
-
# {:subfields => ['a', 'b']}
|
148
|
-
# ]
|
149
|
-
# '110' => [{}] # all subfields, indicators don't matter
|
150
|
-
# '008' => [
|
151
|
-
# {:bytes => (15..17)}
|
152
|
-
# {:bytes => 17}
|
153
|
-
# ]
|
154
|
-
# }
|
194
|
+
# The hash returned is keyed by tag, and has as values an array of 0 or
|
195
|
+
# more MarcExtractor::Spec objects representing the specified extraction
|
196
|
+
# operations for that tag.
|
155
197
|
#
|
156
|
-
#
|
157
|
-
#
|
198
|
+
# It's an array of possibly more than one, because you can specify
|
199
|
+
# multiple extractions on the same tag: for instance "245a:245abc"
|
158
200
|
#
|
159
201
|
# See tests for more examples.
|
160
202
|
def self.parse_string_spec(spec_string)
|
161
|
-
hash
|
203
|
+
# hash defaults to []
|
204
|
+
hash = Hash.new {|hash,key| hash[key] = []}
|
205
|
+
|
162
206
|
spec_strings = spec_string.is_a?(Array) ? spec_string.map{|s| s.split(/\s*:\s*/)}.flatten : spec_string.split(/\s*:\s*/)
|
163
207
|
|
164
208
|
spec_strings.each do |part|
|
@@ -166,31 +210,32 @@ module Traject
|
|
166
210
|
# variable field
|
167
211
|
tag, indicators, subfields = $1, $3, $4
|
168
212
|
|
169
|
-
|
170
|
-
spec = {}
|
213
|
+
spec = Spec.new(:tag => tag)
|
171
214
|
|
172
215
|
if subfields and !subfields.empty?
|
173
|
-
spec
|
216
|
+
spec.subfields = subfields.split('')
|
174
217
|
end
|
175
218
|
|
176
219
|
if indicators
|
177
|
-
|
220
|
+
# if specified as '*', leave nil
|
221
|
+
spec.indicator1 = indicators[0] if indicators[0] != "*"
|
222
|
+
spec.indicator2 = indicators[1] if indicators[1] != "*"
|
178
223
|
end
|
224
|
+
|
225
|
+
hash[spec.tag] << spec
|
179
226
|
|
180
|
-
|
181
|
-
|
182
|
-
elsif (part =~ /\A([a-zA-Z0-9]{3})(\[(\d+)(-(\d+))?\])\Z/) # "005[4-5]"
|
227
|
+
elsif (part =~ /\A([a-zA-Z0-9]{3})(\[(\d+)(-(\d+))?\])\Z/) # control field, "005[4-5]"
|
183
228
|
tag, byte1, byte2 = $1, $3, $5
|
184
|
-
|
185
|
-
spec =
|
229
|
+
|
230
|
+
spec = Spec.new(:tag => tag)
|
186
231
|
|
187
232
|
if byte1 && byte2
|
188
|
-
spec
|
233
|
+
spec.bytes = ((byte1.to_i)..(byte2.to_i))
|
189
234
|
elsif byte1
|
190
|
-
spec
|
235
|
+
spec.bytes = byte1.to_i
|
191
236
|
end
|
192
237
|
|
193
|
-
hash[tag] << spec
|
238
|
+
hash[spec.tag] << spec
|
194
239
|
else
|
195
240
|
raise ArgumentError.new("Unrecognized marc extract specification: #{part}")
|
196
241
|
end
|
@@ -206,7 +251,7 @@ module Traject
|
|
206
251
|
|
207
252
|
self.each_matching_line(marc_record) do |field, spec|
|
208
253
|
if control_field?(field)
|
209
|
-
results << (spec
|
254
|
+
results << (spec.bytes ? field.value.byteslice(spec.bytes) : field.value)
|
210
255
|
else
|
211
256
|
results.concat collect_subfields(field, spec)
|
212
257
|
end
|
@@ -217,7 +262,7 @@ module Traject
|
|
217
262
|
|
218
263
|
# Yields a block for every line in source record that matches
|
219
264
|
# spec. First arg to block is MARC::DataField or ControlField, second
|
220
|
-
# is the
|
265
|
+
# is the MarcExtractor::Spec that it matched on. May take account
|
221
266
|
# of options such as :alternate_script
|
222
267
|
#
|
223
268
|
# Third (optional) arg to block is self, the MarcExtractor object, useful for custom
|
@@ -225,19 +270,14 @@ module Traject
|
|
225
270
|
def each_matching_line(marc_record)
|
226
271
|
marc_record.fields(@interesting_tags_hash.keys).each do |field|
|
227
272
|
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
# Make sure it matches indicators too, spec_covering_field
|
234
|
-
# doens't check that.
|
235
|
-
|
236
|
-
specs.each do |spec|
|
237
|
-
if matches_indicators(field, spec)
|
273
|
+
# Make sure it matches indicators too, specs_covering_field
|
274
|
+
# doesn't check that.
|
275
|
+
specs_covering_field(field).each do |spec|
|
276
|
+
if spec.matches_indicators?(field)
|
238
277
|
yield(field, spec, self)
|
239
278
|
end
|
240
279
|
end
|
280
|
+
|
241
281
|
end
|
242
282
|
end
|
243
283
|
|
@@ -245,6 +285,8 @@ module Traject
|
|
245
285
|
# but collects results of block into an array -- flattens any subarrays for you!
|
246
286
|
#
|
247
287
|
# Useful for re-use of this class for custom processing
|
288
|
+
#
|
289
|
+
# yields the MARC Field, the MarcExtractor::Spec object, the MarcExtractor object.
|
248
290
|
def collect_matching_lines(marc_record)
|
249
291
|
results = []
|
250
292
|
self.each_matching_line(marc_record) do |field, spec, extractor|
|
@@ -254,31 +296,36 @@ module Traject
|
|
254
296
|
end
|
255
297
|
|
256
298
|
|
257
|
-
# Pass in a marc data field and a
|
258
|
-
# an ARRAY of one or more strings, subfields extracted
|
299
|
+
# Pass in a marc data field and a Spec object with extraction
|
300
|
+
# instructions, returns an ARRAY of one or more strings, subfields extracted
|
259
301
|
# and processed per spec. Takes account of options such
|
260
302
|
# as :separator
|
261
303
|
#
|
262
304
|
# Always returns array, sometimes empty array.
|
263
305
|
def collect_subfields(field, spec)
|
264
306
|
subfields = field.subfields.collect do |subfield|
|
265
|
-
subfield.value if spec
|
307
|
+
subfield.value if spec.includes_subfield_code?(subfield.code)
|
266
308
|
end.compact
|
267
309
|
|
268
310
|
return subfields if subfields.empty? # empty array, just return it.
|
269
311
|
|
270
|
-
|
312
|
+
if options[:separator] && spec.joinable?
|
313
|
+
subfields = [subfields.join(options[:separator])]
|
314
|
+
end
|
315
|
+
|
316
|
+
return subfields
|
271
317
|
end
|
272
318
|
|
273
319
|
|
274
|
-
|
320
|
+
|
321
|
+
# Find Spec objects, if any, covering extraction from this field.
|
322
|
+
# Returns an array of 0 or more MarcExtractor::Spec objects
|
275
323
|
#
|
276
324
|
# When given an 880, will return the spec (if any) for the linked tag iff
|
277
325
|
# we have a $6 and we want the alternate script.
|
278
326
|
#
|
279
|
-
# Returns
|
280
|
-
|
281
|
-
def spec_covering_field(field)
|
327
|
+
# Returns an empty array in case of no matching extraction specs.
|
328
|
+
def specs_covering_field(field)
|
282
329
|
tag = field.tag
|
283
330
|
|
284
331
|
# Short-circuit the uninteresting stuff
|
@@ -301,13 +348,60 @@ module Traject
|
|
301
348
|
# define #control_field? on both ControlField and DataField?
|
302
349
|
return field.kind_of? MARC::ControlField
|
303
350
|
end
|
351
|
+
|
304
352
|
|
305
|
-
# a
|
306
|
-
|
307
|
-
|
353
|
+
# Represents a single specification for extracting data
|
354
|
+
# from a marc field, like "600abc" or "600|1*|x".
|
355
|
+
#
|
356
|
+
# Includes the tag for reference, although this is redundant and not actually used
|
357
|
+
# in logic, since the tag is also implicit in the overall spec_hash
|
358
|
+
# with tag => [spec1, spec2]
|
359
|
+
class Spec
|
360
|
+
attr_accessor :tag, :subfields, :indicator1, :indicator2, :bytes
|
361
|
+
|
362
|
+
def initialize(hash = {})
|
363
|
+
hash.each_pair do |key, value|
|
364
|
+
self.send("#{key}=", value)
|
365
|
+
end
|
366
|
+
end
|
367
|
+
|
368
|
+
|
369
|
+
# Should subfields extracted be joined, if we have a separator?
|
370
|
+
# * '630' no subfields specified => join all subfields
|
371
|
+
# * '630abc' multiple subfields specified => join all subfields
|
372
|
+
# * '633a' one subfield => do not join, return one value for each $a in the field
|
373
|
+
# * '633aa' one subfield, doubled => do join after all, will return a single string joining all the values of all the $a's.
|
374
|
+
#
|
375
|
+
# Last case is handled implicitly at the moment when subfields == ['a', 'a']
|
376
|
+
def joinable?
|
377
|
+
(self.subfields.nil? || self.subfields.size != 1)
|
378
|
+
end
|
379
|
+
|
380
|
+
# Pass in a MARC field, do its indicators match indicators
|
381
|
+
# in this spec? nil indicators in spec mean we don't care, everything
|
382
|
+
# matches.
|
383
|
+
def matches_indicators?(field)
|
384
|
+
return (self.indicator1.nil? || self.indicator1 == field.indicator1) &&
|
385
|
+
(self.indicator2.nil? || self.indicator2 == field.indicator2)
|
386
|
+
end
|
308
387
|
|
309
|
-
|
310
|
-
|
388
|
+
# Pass in a string subfield code like 'a'; does this
|
389
|
+
# spec include it?
|
390
|
+
def includes_subfield_code?(code)
|
391
|
+
# subfields nil means include them all
|
392
|
+
self.subfields.nil? || self.subfields.include?(code)
|
393
|
+
end
|
394
|
+
|
395
|
+
def ==(spec)
|
396
|
+
return false unless spec.kind_of?(Spec)
|
397
|
+
|
398
|
+
return (self.tag == spec.tag) &&
|
399
|
+
(self.subfields == spec.subfields) &&
|
400
|
+
(self.indicator1 == spec.indicator1) &&
|
401
|
+
(self.indicator2 == spec.indicator2) &&
|
402
|
+
(self.bytes == spec.bytes)
|
403
|
+
end
|
311
404
|
end
|
405
|
+
|
312
406
|
end
|
313
407
|
end
|
@@ -7,7 +7,7 @@ module Traject
|
|
7
7
|
# A TranslationMap is basically just something that has a hash-like #[]
|
8
8
|
# method to map from input strings to output strings:
|
9
9
|
#
|
10
|
-
#
|
10
|
+
# translation_map["some_input"] #=> some_output
|
11
11
|
#
|
12
12
|
# Input is assumed to always be string, output is either string
|
13
13
|
# or array of strings.
|
@@ -17,10 +17,10 @@ module Traject
|
|
17
17
|
# yaml, or java .properties. (Limited basic .properties, don't try any fancy escaping please,
|
18
18
|
# no = or : in key names, no split lines.)
|
19
19
|
#
|
20
|
-
#
|
20
|
+
# TranslationMap.new("dir/some_file")
|
21
21
|
#
|
22
|
-
# Will look
|
23
|
-
#
|
22
|
+
# Will look for a file named `some_file.rb` or `some_file.yaml` or `some_file.properties`,
|
23
|
+
# somewhere in the ruby $LOAD_PATH in a `/translation_maps` subdir.
|
24
24
|
# * Looks for "/translation_maps" subdir in load paths, so
|
25
25
|
# for instance you can have a gem that keeps translation maps
|
26
26
|
# in ./lib/translation_maps, and it Just Works.
|
@@ -47,12 +47,12 @@ module Traject
|
|
47
47
|
# Or, when calling TranslationMap.new(), you can pass in options over-riding special
|
48
48
|
# key too:
|
49
49
|
#
|
50
|
-
#
|
51
|
-
#
|
50
|
+
# TranslationMap.new("something", :default => "foo")
|
51
|
+
# TranslationMap.new("something", :default => :passthrough)
|
52
52
|
#
|
53
53
|
# == Output: String or array of strings
|
54
54
|
#
|
55
|
-
# The output can be a string or an array of strings, or nil. It should not be anything
|
55
|
+
# The output can be a string or an array of strings, or nil. It should not be anything else.
|
56
56
|
# When used with the #translate_array! method, one string can be replaced by multiple values
|
57
57
|
# (array of strings) or removed (nil)
|
58
58
|
#
|
data/lib/traject/version.rb
CHANGED
@@ -36,7 +36,8 @@ describe "Traject::Macros::Marc21Semantics" do
|
|
36
36
|
end
|
37
37
|
output = @indexer.map_record(@record)
|
38
38
|
|
39
|
-
|
39
|
+
# trims punctuation too
|
40
|
+
assert_equal ["Big bands"], output["series_facet"]
|
40
41
|
end
|
41
42
|
|
42
43
|
describe "marc_sortable_author" do
|
@@ -51,6 +51,27 @@ describe "Traject::Indexer.to_field" do
|
|
51
51
|
flunk("Should only fail with a NamingError")
|
52
52
|
end
|
53
53
|
end
|
54
|
-
|
55
54
|
|
56
|
-
|
55
|
+
# Just verifying this is how it works
|
56
|
+
it "doesn't allow you to just wholesale assignment to the accumulator" do
|
57
|
+
@indexer.to_field('foo') do |rec, acc|
|
58
|
+
acc = ['hello']
|
59
|
+
end
|
60
|
+
output = @indexer.map_record('never looked at')
|
61
|
+
assert_equal nil, output['foo']
|
62
|
+
end
|
63
|
+
|
64
|
+
it "allows use of accumulator.replace" do
|
65
|
+
@indexer.to_field('foo') do |rec, acc|
|
66
|
+
acc.replace ['hello']
|
67
|
+
end
|
68
|
+
output = @indexer.map_record('never looked at')
|
69
|
+
assert_equal ['hello'], output['foo']
|
70
|
+
end
|
71
|
+
|
72
|
+
|
73
|
+
end
|
74
|
+
|
75
|
+
|
76
|
+
|
77
|
+
|
data/test/marc_extractor_test.rb
CHANGED
@@ -13,15 +13,12 @@ describe "Traject::MarcExtractor" do
|
|
13
13
|
assert_kind_of Hash, parsed
|
14
14
|
assert_equal 1, parsed.keys.length
|
15
15
|
spec = parsed['245'].first
|
16
|
-
assert_kind_of
|
17
|
-
|
18
|
-
assert_kind_of Array, spec[:indicators]
|
19
|
-
assert_equal 2, spec[:indicators].length
|
20
|
-
assert_equal "1", spec[:indicators][0]
|
21
|
-
assert_nil spec[:indicators][1]
|
22
|
-
|
23
|
-
assert_kind_of Array, spec[:subfields]
|
16
|
+
assert_kind_of Traject::MarcExtractor::Spec, spec
|
24
17
|
|
18
|
+
assert_equal "1", spec.indicator1
|
19
|
+
assert_nil spec.indicator2
|
20
|
+
|
21
|
+
assert_kind_of Array, spec.subfields
|
25
22
|
end
|
26
23
|
|
27
24
|
it "parses a mixed bag" do
|
@@ -34,25 +31,28 @@ describe "Traject::MarcExtractor" do
|
|
34
31
|
|
35
32
|
#245abcde
|
36
33
|
assert spec245
|
37
|
-
assert_nil spec245
|
38
|
-
|
34
|
+
assert_nil spec245.indicator1
|
35
|
+
assert_nil spec245.indicator2
|
36
|
+
assert_equal %w{a b c d e}, spec245.subfields
|
39
37
|
|
40
38
|
#810
|
41
39
|
assert spec810
|
42
|
-
assert_nil spec810
|
43
|
-
assert_nil spec810
|
40
|
+
assert_nil spec810.indicator1
|
41
|
+
assert_nil spec810.indicator2
|
42
|
+
assert_nil spec810.subfields, "No subfields"
|
44
43
|
|
45
44
|
#700-*4bcd
|
46
45
|
assert spec700
|
47
|
-
|
48
|
-
assert_equal
|
46
|
+
assert_nil spec700.indicator1
|
47
|
+
assert_equal "4", spec700.indicator2
|
48
|
+
assert_equal %w{b c d}, spec700.subfields
|
49
49
|
end
|
50
50
|
|
51
51
|
it "parses fixed field byte offsets" do
|
52
52
|
parsed = Traject::MarcExtractor.parse_string_spec("005[5]:008[7-10]")
|
53
53
|
|
54
|
-
assert_equal 5, parsed["005"].first
|
55
|
-
assert_equal 7..10, parsed["008"].first
|
54
|
+
assert_equal 5, parsed["005"].first.bytes
|
55
|
+
assert_equal 7..10, parsed["008"].first.bytes
|
56
56
|
end
|
57
57
|
|
58
58
|
it "allows arrays of specs" do
|
@@ -79,7 +79,7 @@ describe "Traject::MarcExtractor" do
|
|
79
79
|
|
80
80
|
# Mostly an internal method, not necessarily API, but
|
81
81
|
# an important one, so we unit test some parts of it.
|
82
|
-
describe "#
|
82
|
+
describe "#specs_covering_field" do
|
83
83
|
describe "for alternate script tags" do
|
84
84
|
before do
|
85
85
|
@record = MARC::Reader.new(support_file_path "hebrew880s.marc").to_a.first
|
@@ -102,17 +102,17 @@ describe "Traject::MarcExtractor" do
|
|
102
102
|
assert ! @a880_100.nil?, "Found an 880-100 to test"
|
103
103
|
end
|
104
104
|
it "finds spec for relevant 880" do
|
105
|
-
assert_equal( [
|
106
|
-
|
105
|
+
assert_equal( [Traject::MarcExtractor::Spec.new(:tag => "245")], @extractor.specs_covering_field(@a880_245) )
|
106
|
+
assert_equal [], @extractor.specs_covering_field(@a880_100)
|
107
107
|
end
|
108
108
|
it "does not find spec for 880 if disabled" do
|
109
109
|
@extractor = Traject::MarcExtractor.new("245", :alternate_script => false)
|
110
|
-
assert_nil @extractor.
|
110
|
+
assert_nil @extractor.specs_covering_field(@a880_245)
|
111
111
|
end
|
112
112
|
it "finds only 880 if so configured" do
|
113
113
|
@extractor = Traject::MarcExtractor.new("245", :alternate_script => :only)
|
114
|
-
assert_nil @extractor.
|
115
|
-
assert_equal([
|
114
|
+
assert_nil @extractor.specs_covering_field(@a245)
|
115
|
+
assert_equal([Traject::MarcExtractor::Spec.new(:tag => "245")], @extractor.specs_covering_field(@a880_245))
|
116
116
|
end
|
117
117
|
end
|
118
118
|
end
|
@@ -260,7 +260,7 @@ describe "Traject::MarcExtractor" do
|
|
260
260
|
@extractor.each_matching_line(@record) do |field, spec|
|
261
261
|
called = true
|
262
262
|
assert_kind_of MARC::DataField, field
|
263
|
-
assert_kind_of
|
263
|
+
assert_kind_of Traject::MarcExtractor::Spec, spec
|
264
264
|
end
|
265
265
|
assert called, "calls block"
|
266
266
|
end
|
@@ -269,7 +269,7 @@ describe "Traject::MarcExtractor" do
|
|
269
269
|
@extractor.each_matching_line(@record) do |field, spec, extractor|
|
270
270
|
called = true
|
271
271
|
assert_kind_of MARC::DataField, field
|
272
|
-
assert_kind_of
|
272
|
+
assert_kind_of Traject::MarcExtractor::Spec, spec
|
273
273
|
assert_kind_of Traject::MarcExtractor, extractor
|
274
274
|
assert_same @extractor, extractor
|
275
275
|
end
|
@@ -292,9 +292,11 @@ describe "Traject::MarcExtractor" do
|
|
292
292
|
|
293
293
|
describe "MarcExtractor.cached" do
|
294
294
|
it "creates" do
|
295
|
-
|
296
|
-
|
297
|
-
|
295
|
+
extractor = Traject::MarcExtractor.cached("245abc", :separator => nil)
|
296
|
+
spec_hash = extractor.spec_hash
|
297
|
+
|
298
|
+
assert extractor.options[:separator].nil?, "extractor options[:separator] is nil"
|
299
|
+
assert_equal({"245"=>[Traject::MarcExtractor::Spec.new(:tag => "245", :subfields=>["a", "b", "c"])]}, spec_hash)
|
298
300
|
end
|
299
301
|
it "caches" do
|
300
302
|
ext1 = Traject::MarcExtractor.cached("245abc", :separator => nil)
|
@@ -326,11 +328,45 @@ describe "Traject::MarcExtractor" do
|
|
326
328
|
|
327
329
|
|
328
330
|
|
329
|
-
it "
|
330
|
-
|
331
|
-
|
332
|
-
|
331
|
+
it "provides multiple values for repeated subfields with single specified subfield" do
|
332
|
+
ex = Traject::MarcExtractor.new("245a")
|
333
|
+
f = @record.fields('245').first
|
334
|
+
title_a = f['a']
|
335
|
+
f.append(MARC::Subfield.new('a', title_a))
|
336
|
+
results = ex.extract(@record)
|
337
|
+
assert_equal [title_a, title_a], results
|
338
|
+
end
|
339
|
+
|
340
|
+
it "concats single subfield spec when given as eg 245aa" do
|
341
|
+
ex = Traject::MarcExtractor.new("245aa")
|
342
|
+
f = @record.fields('245').first
|
343
|
+
title_a = f['a']
|
344
|
+
f.append(MARC::Subfield.new('a', title_a))
|
345
|
+
results = ex.extract(@record)
|
346
|
+
assert_equal ["#{title_a} #{title_a}"], results
|
347
|
+
end
|
348
|
+
|
349
|
+
it "provides single value for repeated subfields with multiple specified subfields" do
|
350
|
+
ex = Traject::MarcExtractor.new("245ab")
|
351
|
+
f = @record.fields('245').first
|
352
|
+
title_a = f['a']
|
353
|
+
title_b = f['b']
|
354
|
+
f.append(MARC::Subfield.new('a', title_a))
|
355
|
+
results = ex.extract(@record)
|
356
|
+
assert_equal ["#{title_a} #{title_b} #{title_a}"], results
|
357
|
+
|
358
|
+
end
|
359
|
+
|
360
|
+
it "provides single value for repeated subfields with no specified subfield" do
|
361
|
+
ex = Traject::MarcExtractor.new("245")
|
362
|
+
f = @record.fields('245').first
|
363
|
+
title_a = f['a']
|
364
|
+
f.append(MARC::Subfield.new('a', title_a))
|
365
|
+
results = ex.extract(@record)
|
366
|
+
assert_equal 1, results.size
|
333
367
|
end
|
368
|
+
|
369
|
+
|
334
370
|
|
335
371
|
|
336
372
|
it "allows repeated tags for a control field" do
|
@@ -352,6 +388,17 @@ describe "Traject::MarcExtractor" do
|
|
352
388
|
end
|
353
389
|
|
354
390
|
end
|
391
|
+
|
392
|
+
describe "MarcExtractor::Spec" do
|
393
|
+
describe "==" do
|
394
|
+
it "equals when equal" do
|
395
|
+
assert_equal Traject::MarcExtractor::Spec.new(:subfields => %w{a b c}), Traject::MarcExtractor::Spec.new(:subfields => %w{a b c})
|
396
|
+
end
|
397
|
+
it "does not equal when not" do
|
398
|
+
refute_equal Traject::MarcExtractor::Spec.new(:subfields => %w{a b c}), Traject::MarcExtractor::Spec.new(:subfields => %w{a b c}, :indicator2 => '1')
|
399
|
+
end
|
400
|
+
end
|
401
|
+
end
|
355
402
|
|
356
403
|
|
357
|
-
end
|
404
|
+
end
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: traject
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.
|
5
|
+
version: 0.16.0
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Jonathan Rochkind
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2013-09-
|
13
|
+
date: 2013-09-30 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: marc
|