traject 0.16.0 → 0.17.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +1 -0
- data/README.md +183 -191
- data/bench/bench.rb +1 -1
- data/doc/batch_execution.md +14 -0
- data/doc/extending.md +14 -12
- data/doc/indexing_rules.md +265 -0
- data/lib/traject/command_line.rb +12 -41
- data/lib/traject/debug_writer.rb +32 -13
- data/lib/traject/indexer.rb +101 -24
- data/lib/traject/indexer/settings.rb +18 -17
- data/lib/traject/json_writer.rb +32 -11
- data/lib/traject/line_writer.rb +6 -6
- data/lib/traject/macros/basic.rb +1 -1
- data/lib/traject/macros/marc21.rb +17 -13
- data/lib/traject/macros/marc21_semantics.rb +27 -25
- data/lib/traject/macros/marc_format_classifier.rb +39 -25
- data/lib/traject/marc4j_reader.rb +36 -22
- data/lib/traject/marc_extractor.rb +79 -75
- data/lib/traject/marc_reader.rb +33 -25
- data/lib/traject/mock_reader.rb +9 -10
- data/lib/traject/ndj_reader.rb +7 -7
- data/lib/traject/null_writer.rb +1 -1
- data/lib/traject/qualified_const_get.rb +12 -2
- data/lib/traject/solrj_writer.rb +61 -52
- data/lib/traject/thread_pool.rb +45 -45
- data/lib/traject/translation_map.rb +59 -27
- data/lib/traject/util.rb +3 -3
- data/lib/traject/version.rb +1 -1
- data/lib/traject/yaml_writer.rb +1 -1
- data/test/debug_writer_test.rb +7 -7
- data/test/indexer/each_record_test.rb +4 -4
- data/test/indexer/macros_marc21_semantics_test.rb +12 -12
- data/test/indexer/macros_marc21_test.rb +10 -10
- data/test/indexer/macros_test.rb +1 -1
- data/test/indexer/map_record_test.rb +6 -6
- data/test/indexer/read_write_test.rb +43 -4
- data/test/indexer/settings_test.rb +2 -2
- data/test/indexer/to_field_test.rb +8 -8
- data/test/marc4j_reader_test.rb +4 -4
- data/test/marc_extractor_test.rb +33 -25
- data/test/marc_format_classifier_test.rb +3 -3
- data/test/marc_reader_test.rb +2 -2
- data/test/test_helper.rb +3 -3
- data/test/test_support/demo_config.rb +52 -48
- data/test/translation_map_test.rb +22 -4
- data/test/translation_maps/bad_ruby.rb +2 -2
- data/test/translation_maps/both_map.rb +1 -1
- data/test/translation_maps/default_literal.rb +1 -1
- data/test/translation_maps/default_passthrough.rb +1 -1
- data/test/translation_maps/ruby_map.rb +1 -1
- metadata +7 -31
- data/doc/macros.md +0 -103
@@ -62,10 +62,10 @@ module Traject::Macros
|
|
62
62
|
def self.get_sortable_author(record)
|
63
63
|
onexx = MarcExtractor.cached("100:110:111", :first => true, :trim_punctuation => true).extract(record).first
|
64
64
|
onexx = onexx.strip if onexx
|
65
|
-
|
65
|
+
|
66
66
|
titles = []
|
67
67
|
MarcExtractor.cached("240:245", :first => true).each_matching_line(record) do |field, spec|
|
68
|
-
non_filing = field.indicator2.to_i
|
68
|
+
non_filing = field.indicator2.to_i
|
69
69
|
|
70
70
|
str = field.subfields.collect {|sf| Marc21.trim_punctuation(sf.value.strip).strip}.join(" ")
|
71
71
|
str = str.slice(non_filing, str.length)
|
@@ -73,7 +73,7 @@ module Traject::Macros
|
|
73
73
|
end.first
|
74
74
|
title = titles.first
|
75
75
|
title = title.strip if title
|
76
|
-
|
76
|
+
|
77
77
|
return [onexx, title].compact.join(" ")
|
78
78
|
end
|
79
79
|
|
@@ -105,26 +105,26 @@ module Traject::Macros
|
|
105
105
|
str
|
106
106
|
end.first
|
107
107
|
end
|
108
|
-
|
109
|
-
|
110
|
-
|
108
|
+
|
109
|
+
|
110
|
+
|
111
111
|
# A generic way to strip a filing version (i.e., a string with the non-filing
|
112
112
|
# characters stripped off)
|
113
113
|
#
|
114
114
|
# Always returns an array. If :include_original=>true is passed in,
|
115
115
|
# that array will include the original string with the non-filing
|
116
116
|
# characters still in it.
|
117
|
-
|
117
|
+
|
118
118
|
def extract_marc_filing_version(spec='245abdefghknp', opts={})
|
119
119
|
include_original = opts.delete(:include_original)
|
120
120
|
if opts.size > 0
|
121
121
|
raise RuntimeError.new("extract_marc_filing_version can take only :include_original as an argument, not #{opts.keys.map{|x| "'#{x}'"}.join(' or ')}")
|
122
122
|
end
|
123
|
-
|
123
|
+
|
124
124
|
extractor = Traject::MarcExtractor.cached(spec, opts)
|
125
|
-
|
125
|
+
|
126
126
|
lambda do |record, accumulator, context|
|
127
|
-
extractor.collect_matching_lines(record) do |field, spec|
|
127
|
+
extractor.collect_matching_lines(record) do |field, spec|
|
128
128
|
str = extractor.collect_subfields(field, spec).first
|
129
129
|
next unless str and !str.empty?
|
130
130
|
vals = [Marc21Semantics.filing_version(field, str, spec)]
|
@@ -136,34 +136,34 @@ module Traject::Macros
|
|
136
136
|
end
|
137
137
|
end
|
138
138
|
end
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
139
|
+
|
140
|
+
|
141
|
+
|
142
|
+
|
143
143
|
# Take in a field, a string extracted from that field, and a spec and
|
144
|
-
# return the filing version (i.e., the string without the
|
144
|
+
# return the filing version (i.e., the string without the
|
145
145
|
# non-filing characters)
|
146
|
-
|
146
|
+
|
147
147
|
def self.filing_version(field, str, spec)
|
148
148
|
# Control fields don't have non-filing characters
|
149
149
|
return str if field.kind_of? MARC::ControlField
|
150
|
-
|
150
|
+
|
151
151
|
# 2nd indicator must be > 0
|
152
152
|
ind2 = field.indicator2.to_i
|
153
153
|
return str unless ind2 > 0
|
154
|
-
|
154
|
+
|
155
155
|
# The spechash must either (a) have no subfields specified, or
|
156
156
|
# (b) include the first subfield in the record
|
157
|
-
|
157
|
+
|
158
158
|
subs = spec.subfields
|
159
159
|
return str unless subs && subs.include?(field.subfields[0].code)
|
160
|
-
|
160
|
+
|
161
161
|
# OK. If we got this far we actually need to strip characters off the string
|
162
|
-
|
162
|
+
|
163
163
|
return str[ind2..-1]
|
164
164
|
end
|
165
|
-
|
166
|
-
|
165
|
+
|
166
|
+
|
167
167
|
|
168
168
|
|
169
169
|
# maps languages, by default out of 008[35-37] and 041a and 041d
|
@@ -367,6 +367,9 @@ module Traject::Macros
|
|
367
367
|
return found_date
|
368
368
|
end
|
369
369
|
|
370
|
+
# REGEX meant to rule out obvious non-LCC's, and only allow things
|
371
|
+
# plausibly LCC's.
|
372
|
+
LCC_REGEX = /\A *[A-Z]{1,3}[ .]*(?:(\d+)(?:\s*?\.\s*?(\d+))?).*/
|
370
373
|
# Looks up Library of Congress Classification (LCC) or NLM Medical Subject Headings (MeSH)
|
371
374
|
# from usual parts of the marc record. Maps them to high-level broad categories,
|
372
375
|
# basically just using the first part of the LCC. Note it's just looking in bib-level
|
@@ -379,7 +382,6 @@ module Traject::Macros
|
|
379
382
|
# or nil.
|
380
383
|
#
|
381
384
|
# The categories output aren't great, but they're something.
|
382
|
-
LCC_REGEX = /\A *[A-Z]{1,3}[ .]*(?:(\d+)(?:\s*?\.\s*?(\d+))?).*/
|
383
385
|
def marc_lcc_to_broad_category( options = {}, spec="050a:060a:090a:096a")
|
384
386
|
# Trying to match things that look like LCC, and not match things
|
385
387
|
# that don't. Is tricky.
|
@@ -503,4 +505,4 @@ module Traject::Macros
|
|
503
505
|
|
504
506
|
|
505
507
|
end
|
506
|
-
end
|
508
|
+
end
|
@@ -1,9 +1,19 @@
|
|
1
1
|
module Traject
|
2
2
|
module Macros
|
3
|
-
#
|
3
|
+
# To use the marc_format macro, in your configuration file:
|
4
|
+
#
|
5
|
+
# require 'traject/macros/marc_formats'
|
6
|
+
# extend Traject::Macros::MarcFormats
|
7
|
+
#
|
8
|
+
# to_field("format_s") marc_formats
|
9
|
+
#
|
10
|
+
# See also MarcClassifier which can be used directly for a bit more
|
11
|
+
# control.
|
4
12
|
module MarcFormats
|
5
13
|
# very opinionated macro that just adds a grab bag of format/genre/types
|
6
|
-
#
|
14
|
+
# from our own custom vocabulary, all into one field.
|
15
|
+
# You may want to build your own from MarcFormatClassifier functions instead.
|
16
|
+
#
|
7
17
|
def marc_formats
|
8
18
|
lambda do |record, accumulator|
|
9
19
|
accumulator.concat Traject::Macros::MarcFormatClassifier.new(record).formats
|
@@ -12,10 +22,11 @@ module Traject
|
|
12
22
|
end
|
13
23
|
|
14
24
|
|
15
|
-
#
|
16
|
-
#
|
25
|
+
# A tool for classifying MARC records according to format/form/genre/type,
|
26
|
+
# just using our own custom vocabulary for those things.
|
17
27
|
#
|
18
|
-
#
|
28
|
+
# used by the `marc_formats` macro, but you can also use it directly
|
29
|
+
# for a bit more control.
|
19
30
|
class MarcFormatClassifier
|
20
31
|
attr_reader :record
|
21
32
|
|
@@ -24,22 +35,25 @@ module Traject
|
|
24
35
|
end
|
25
36
|
|
26
37
|
# A very opinionated method that just kind of jams together
|
27
|
-
# all the possible format/genre/types into one array of 1 to N elements.
|
38
|
+
# all the possible format/genre/types into one array of 1 to N elements.
|
28
39
|
#
|
29
|
-
#
|
40
|
+
# If no other values are present, the default value "Other" will be used.
|
41
|
+
#
|
42
|
+
# See also individual methods which you can use to separate into
|
43
|
+
# different facets or do other custom things.
|
30
44
|
def formats(options = {})
|
31
45
|
options = {:default => "Other"}.merge(options)
|
32
46
|
|
33
47
|
formats = []
|
34
48
|
|
35
49
|
formats.concat genre
|
36
|
-
|
50
|
+
|
37
51
|
formats << "Manuscript/Archive" if manuscript_archive?
|
38
52
|
formats << "Microform" if microform?
|
39
53
|
formats << "Online" if online?
|
40
54
|
|
41
55
|
# In our own data, if it's an audio recording, it might show up
|
42
|
-
# as print, but it's probably not.
|
56
|
+
# as print, but it's probably not.
|
43
57
|
formats << "Print" if print? && ! (formats.include?("Non-musical Recording") || formats.include?("Musical Recording"))
|
44
58
|
|
45
59
|
# If it's a Dissertation, we decide it's NOT a book
|
@@ -64,11 +78,11 @@ module Traject
|
|
64
78
|
# Returns 1 or more values in an array from:
|
65
79
|
# Book; Journal/Newspaper; Musical Score; Map/Globe; Non-musical Recording; Musical Recording
|
66
80
|
# Image; Software/Data; Video/Film
|
67
|
-
#
|
68
|
-
# Uses leader byte 6, leader byte 7, and 007 byte 0.
|
81
|
+
#
|
82
|
+
# Uses leader byte 6, leader byte 7, and 007 byte 0.
|
69
83
|
#
|
70
84
|
# Gets actual labels from marc_genre_leader and marc_genre_007 translation maps,
|
71
|
-
# so you can customize labels if you want.
|
85
|
+
# so you can customize labels if you want.
|
72
86
|
def genre
|
73
87
|
marc_genre_leader = Traject::TranslationMap.new("marc_genre_leader")
|
74
88
|
marc_genre_007 = Traject::TranslationMap.new("marc_genre_007")
|
@@ -96,18 +110,18 @@ module Traject
|
|
96
110
|
end
|
97
111
|
end
|
98
112
|
|
99
|
-
# Algorithm with help from Chris Case.
|
100
|
-
# * If it has any RDA 338, then it's print if it has a value of
|
101
|
-
# volume, sheet, or card.
|
113
|
+
# Algorithm with help from Chris Case.
|
114
|
+
# * If it has any RDA 338, then it's print if it has a value of
|
115
|
+
# volume, sheet, or card.
|
102
116
|
# * If it does not have an RDA 338, it's print if and only if it has
|
103
|
-
# NO 245$h GMD.
|
117
|
+
# NO 245$h GMD.
|
104
118
|
#
|
105
|
-
# * Here at JH, for legacy reasons we also choose to not
|
119
|
+
# * Here at JH, for legacy reasons we also choose to not
|
106
120
|
# call it print if it's already been marked audio, but
|
107
|
-
# we do that in a different method.
|
121
|
+
# we do that in a different method.
|
108
122
|
#
|
109
123
|
# This algorithm is definitely going to get some things wrong in
|
110
|
-
# both directions, with real world data. But seems to be good enough.
|
124
|
+
# both directions, with real world data. But seems to be good enough.
|
111
125
|
def print?
|
112
126
|
|
113
127
|
|
@@ -116,7 +130,7 @@ module Traject
|
|
116
130
|
end
|
117
131
|
|
118
132
|
if rda338.length > 0
|
119
|
-
rda338.find do |field|
|
133
|
+
rda338.find do |field|
|
120
134
|
field.subfields.find do |sf|
|
121
135
|
(sf.code == "a" && %w{volume card sheet}.include?(sf.value)) ||
|
122
136
|
(sf.code == "b" && %w{nc no nb}.include?(sf.value))
|
@@ -128,7 +142,7 @@ module Traject
|
|
128
142
|
end
|
129
143
|
|
130
144
|
# We use marc 007 to determine if this represents an online
|
131
|
-
# resource. But sometimes resort to 245$h GMD too.
|
145
|
+
# resource. But sometimes resort to 245$h GMD too.
|
132
146
|
def online?
|
133
147
|
# field 007, byte 0 c="electronic" byte 1 r="remote" ==> sure Online
|
134
148
|
found_007 = record.find do |field|
|
@@ -140,8 +154,8 @@ module Traject
|
|
140
154
|
# Otherwise, if it has a GMD ["electronic resource"], we count it
|
141
155
|
# as online only if NO 007[0] == 'c' exists, cause if it does we already
|
142
156
|
# know it's electronic but not remote, otherwise first try would
|
143
|
-
# have found it.
|
144
|
-
return (normalized_gmd.start_with? "[electronic resource]") && ! record.find {|f| f.tag == '007' && f.value.slice(0) == "c"}
|
157
|
+
# have found it.
|
158
|
+
return (normalized_gmd.start_with? "[electronic resource]") && ! record.find {|f| f.tag == '007' && f.value.slice(0) == "c"}
|
145
159
|
end
|
146
160
|
|
147
161
|
# if field 007 byte 0 is 'h', that's microform. But many of our microform
|
@@ -153,7 +167,7 @@ module Traject
|
|
153
167
|
record.find {|f| (f.tag == "007") && (f.value[0] == "h")}
|
154
168
|
end
|
155
169
|
|
156
|
-
# Marked as manuscript OR archive.
|
170
|
+
# Marked as manuscript OR archive.
|
157
171
|
def manuscript_archive?
|
158
172
|
leader06 = record.leader.slice(6)
|
159
173
|
leader08 = record.leader.slice(8)
|
@@ -177,4 +191,4 @@ module Traject
|
|
177
191
|
|
178
192
|
end
|
179
193
|
end
|
180
|
-
end
|
194
|
+
end
|
@@ -2,24 +2,21 @@ require 'traject'
|
|
2
2
|
require 'marc'
|
3
3
|
require 'marc/marc4j'
|
4
4
|
|
5
|
-
#
|
6
|
-
# ruby-marc
|
7
|
-
#
|
5
|
+
# `Traject::Marc4JReader` uses the marc4j java package to parse the MARC records
|
6
|
+
# into standard ruby-marc MARC::Record objects. This reader is often faster than
|
7
|
+
# Traject::MarcReader, especially for XML, and offers support for reading Marc8
|
8
|
+
# encoded records and transcoding to UTF8.
|
8
9
|
#
|
9
|
-
#
|
10
|
-
#
|
10
|
+
# Marc4JReader can read MARC ISO 2709 ("binary") or MARCXML. We use the Marc4J MarcPermissiveStreamReader
|
11
|
+
# for reading binary, but sometimes in non-permissive mode, according to settings. We use the Marc4j MarcXmlReader
|
12
|
+
# for reading xml. The actual code for dealing with Marc4J is in the separate
|
13
|
+
# [marc-marc4j gem](https://github.com/billdueber/ruby-marc-marc4j).
|
11
14
|
#
|
12
|
-
#
|
15
|
+
# See also the pure ruby Traject::MarcReader as an alternative, if you need to read
|
16
|
+
# marc-in-json, or if you don't need binary Marc8 support, it may in some cases
|
17
|
+
# be faster.
|
13
18
|
#
|
14
|
-
#
|
15
|
-
# in non-permissive mode, according to settings. Uses the Marc4j MarcXmlReader
|
16
|
-
# for xml.
|
17
|
-
#
|
18
|
-
# NOTE: If you aren't reading in binary records encoded in MARC8, you may
|
19
|
-
# find the pure-ruby Traject::MarcReader faster; the extra step to read
|
20
|
-
# Marc4J but translate to ruby MARC::Record adds some overhead.
|
21
|
-
#
|
22
|
-
# Settings:
|
19
|
+
# ## Settings
|
23
20
|
#
|
24
21
|
# * marc_source.type: serialization type. default 'binary', also 'xml' (TODO: json/marc-in-json)
|
25
22
|
#
|
@@ -39,9 +36,26 @@ require 'marc/marc4j'
|
|
39
36
|
# * marc4j_reader.jar_dir: Path to a directory containing Marc4J jar file to use. All .jar's in dir will
|
40
37
|
# be loaded. If unset, uses marc4j.jar bundled with traject.
|
41
38
|
#
|
42
|
-
# * marc4j_reader.keep_marc4j: Keeps the original marc4j record accessible from
|
43
|
-
#
|
44
|
-
|
39
|
+
# * marc4j_reader.keep_marc4j: Keeps the original marc4j record accessible from
|
40
|
+
# the eventual ruby-marc record via record#original_marc4j. Intended for
|
41
|
+
# those that have legacy java code for which a marc4j object is needed.
|
42
|
+
#
|
43
|
+
#
|
44
|
+
# ## Example
|
45
|
+
#
|
46
|
+
# In a configuration file:
|
47
|
+
#
|
48
|
+
# require 'traject/marc4j_reader'
|
49
|
+
# settings do
|
50
|
+
# provide "reader_class_name", "Traject::Marc4JReader"
|
51
|
+
#
|
52
|
+
# #for MarcXML:
|
53
|
+
# # provide "marc_source.type", "xml"
|
54
|
+
#
|
55
|
+
# # Or instead for binary:
|
56
|
+
# provide "marc4j_reader.permissive", true
|
57
|
+
# provide "marc4j_reader.source_encoding", "MARC8"
|
58
|
+
# end
|
45
59
|
class Traject::Marc4JReader
|
46
60
|
include Enumerable
|
47
61
|
|
@@ -56,14 +70,14 @@ class Traject::Marc4JReader
|
|
56
70
|
MARC::Record.instance_methods.include?(:"original_marc4j="))
|
57
71
|
MARC::Record.class_eval('attr_accessor :original_marc4j')
|
58
72
|
end
|
59
|
-
|
73
|
+
|
60
74
|
# Creating a converter will do the following:
|
61
75
|
# - nothing, if it detects that the marc4j jar is already loaded
|
62
76
|
# - load all the .jar files in settings['marc4j_reader.jar_dir'] if set
|
63
77
|
# - load the marc4j jar file bundled with MARC::MARC4J otherwise
|
64
|
-
|
78
|
+
|
65
79
|
@converter = MARC::MARC4J.new(:jardir => settings['marc4j_reader.jar_dir'], :logger => logger)
|
66
|
-
|
80
|
+
|
67
81
|
# Convenience
|
68
82
|
java_import org.marc4j.MarcPermissiveStreamReader
|
69
83
|
java_import org.marc4j.MarcXmlReader
|
@@ -121,4 +135,4 @@ class Traject::Marc4JReader
|
|
121
135
|
@logger ||= (settings[:logger] || Yell.new(STDERR, :level => "gt.fatal")) # null logger)
|
122
136
|
end
|
123
137
|
|
124
|
-
end
|
138
|
+
end
|
@@ -6,22 +6,23 @@ module Traject
|
|
6
6
|
#
|
7
7
|
# Examples:
|
8
8
|
#
|
9
|
-
#
|
10
|
-
#
|
11
|
-
#
|
12
|
-
#
|
9
|
+
# array_of_stuff = MarcExtractor.new("001:245abc:700a").extract(marc_record)
|
10
|
+
# values = MarcExtractor.new("245a:245abc").extract(marc_record)
|
11
|
+
# seperated_values = MarcExtractor.new("020a:020z").extract(marc_record)
|
12
|
+
# bytes = MarcExtractor.new("008[35-37]")
|
13
13
|
#
|
14
|
-
#
|
14
|
+
# ## String extraction specifications
|
15
15
|
#
|
16
16
|
# Extraction directions are supplied in strings, usually as the first
|
17
17
|
# parameter to MarcExtractor.new or MarcExtractor.cached. These specifications
|
18
|
-
# are also the first parameter to the #marc_extract macro.
|
18
|
+
# are also the first parameter to the #marc_extract macro.
|
19
19
|
#
|
20
20
|
# A String specification is a string (or array of strings) which consists
|
21
|
-
# of one or more Data and Control Field Specifications seperated by colons.
|
21
|
+
# of one or more Data and Control Field Specifications separated by colons.
|
22
22
|
#
|
23
23
|
# A Data Field Specification is of the form:
|
24
|
-
#
|
24
|
+
#
|
25
|
+
# * `{tag}{|indicators|}{subfields}`
|
25
26
|
# * {tag} is three chars (usually but not necessarily numeric)
|
26
27
|
# * {indicators} are optional two chars enclosed in pipe ('|') characters,
|
27
28
|
# * {subfields} are optional list of chars (alphanumeric)
|
@@ -29,58 +30,58 @@ module Traject
|
|
29
30
|
# indicator spec must be two chars, but one can be * meaning "don't care".
|
30
31
|
# space to mean 'blank'
|
31
32
|
#
|
32
|
-
#
|
33
|
+
# "245|01|abc65:345abc:700|*5|:800"
|
33
34
|
#
|
34
35
|
# A Control Field Specification is used with tags for control (fixed) fields (ordinarily fields 001-010)
|
35
|
-
# and includes a tag and a a byte slice specification.
|
36
|
+
# and includes a tag and a byte slice specification.
|
36
37
|
#
|
37
|
-
#
|
38
|
-
#
|
39
|
-
#
|
38
|
+
# "008[35-37]:007[5]"
|
39
|
+
# => bytes 35-37 inclusive of any field 008, and byte 5 of any field 007 (TODO: Should we support
|
40
|
+
# "LDR" as a pseudo-tag to take byte slices of leader?)
|
40
41
|
#
|
41
42
|
# * subfields and indicators can only be provided for marc data/variable fields
|
42
43
|
# * byte slice can only be provided for marc control fields (generally tags less than 010)
|
43
44
|
#
|
44
|
-
#
|
45
|
+
# ## Subfield concatenation
|
45
46
|
#
|
46
47
|
# Normally, for a spec including multiple subfield codes, multiple subfields
|
47
48
|
# from the same MARC field will be concatenated into one string separated by spaces:
|
48
49
|
#
|
49
|
-
#
|
50
|
-
#
|
51
|
-
#
|
52
|
-
#
|
53
|
-
#
|
54
|
-
#
|
50
|
+
# 600 a| Chomsky, Noam x| Philosophy.
|
51
|
+
# 600 a| Chomsky, Noam x| Political and social views.
|
52
|
+
# MarcExtractor.new("600ax").extract(record)
|
53
|
+
# # results in two values sent to Solr:
|
54
|
+
# "Chomsky, Noam Philosophy."
|
55
|
+
# "Chomsky, Noam Political and social views."
|
55
56
|
#
|
56
57
|
# You can turn off this concatenation and leave individual subfields in separate
|
57
58
|
# strings by setting the `separator` option to nil:
|
58
59
|
#
|
59
|
-
#
|
60
|
-
#
|
61
|
-
#
|
62
|
-
#
|
63
|
-
#
|
64
|
-
#
|
60
|
+
# MarcExtractor.new("600ax", :separator => nil).extract(record)
|
61
|
+
# # Results in four values being sent to Solr (or 3 if you de-dup):
|
62
|
+
# "Chomsky, Noam"
|
63
|
+
# "Philosophy."
|
64
|
+
# "Chomsky, Noam"
|
65
|
+
# "Political and social views."
|
65
66
|
#
|
66
67
|
# However, **the default is different for specifications with only a single
|
67
68
|
# subfield**, these are by default kept seperated:
|
68
69
|
#
|
69
|
-
#
|
70
|
-
#
|
71
|
-
#
|
72
|
-
#
|
73
|
-
#
|
70
|
+
# 020 a| 285197145X a| 9782851971456
|
71
|
+
# MarcExtractor.new("020a:020z").extract(record)
|
72
|
+
# # two separate strings sent to Solr:
|
73
|
+
# "285197145X"
|
74
|
+
# "9782851971456"
|
74
75
|
#
|
75
76
|
# For single subfield specifications, you force concatenation by
|
76
77
|
# repeating the subfield specification:
|
77
78
|
#
|
78
|
-
#
|
79
|
-
#
|
80
|
-
#
|
81
|
-
#
|
79
|
+
# MarcExtractor.new("020aa:020zz").extract(record)
|
80
|
+
# # would result in a single string sent to solr for
|
81
|
+
# # the single field, by default space-separated:
|
82
|
+
# "285197145X 9782851971456"
|
82
83
|
#
|
83
|
-
#
|
84
|
+
# ## Note on Performance and MarcExtractor creation and reuse
|
84
85
|
#
|
85
86
|
# A MarcExtractor is somewhat expensive to create, and has been shown in profiling/
|
86
87
|
# benchmarking to be a bottleneck if you end up creating one for each marc record
|
@@ -90,15 +91,15 @@ module Traject
|
|
90
91
|
# If you are creating a traject 'macro' method, here's one way to do that,
|
91
92
|
# capturing the MarcExtractor under closure:
|
92
93
|
#
|
93
|
-
#
|
94
|
-
#
|
95
|
-
#
|
96
|
-
#
|
97
|
-
#
|
98
|
-
#
|
99
|
-
#
|
100
|
-
#
|
101
|
-
#
|
94
|
+
# def some_macro(spec, other_args, whatever)
|
95
|
+
# extractor = MarcExtractor.new( spec )
|
96
|
+
# # ...
|
97
|
+
# return lambda do |record, accumulator, context|
|
98
|
+
# #...
|
99
|
+
# accumulator.concat extractor.extract(record)
|
100
|
+
# #...
|
101
|
+
# end
|
102
|
+
# end
|
102
103
|
#
|
103
104
|
# In other cases, you may find it convenient to improve performance by
|
104
105
|
# using the MarcExtractor#cached method, instead of MarcExtractor#new, to
|
@@ -107,13 +108,13 @@ module Traject
|
|
107
108
|
class MarcExtractor
|
108
109
|
attr_accessor :options, :spec_hash
|
109
110
|
|
110
|
-
# First arg is a specification for extraction of data from a MARC record.
|
111
|
+
# First arg is a specification for extraction of data from a MARC record.
|
111
112
|
# Specification can be given in two forms:
|
112
113
|
#
|
113
114
|
# * a string specification like "008[35]:020a:245abc", see top of class
|
114
|
-
# for examples. A string specification is most typical argument.
|
115
|
+
# for examples. A string specification is most typical argument.
|
115
116
|
# * The output of a previous call to MarcExtractor.parse_string_spec(string_spec),
|
116
|
-
# a 'pre-parsed' specification.
|
117
|
+
# a 'pre-parsed' specification.
|
117
118
|
#
|
118
119
|
# Second arg is options:
|
119
120
|
#
|
@@ -146,6 +147,8 @@ module Traject
|
|
146
147
|
if options[:alternate_script] != false
|
147
148
|
@interesting_tags_hash['880'] = true
|
148
149
|
end
|
150
|
+
|
151
|
+
self.freeze
|
149
152
|
end
|
150
153
|
|
151
154
|
# Takes the same arguments as MarcExtractor.new, but will re-use an existing
|
@@ -164,17 +167,10 @@ module Traject
|
|
164
167
|
# although if you try hard enough you can surely find a way to do something
|
165
168
|
# you shouldn't.
|
166
169
|
#
|
167
|
-
#
|
170
|
+
# extractor = MarcExtractor.cached("245abc:700a", :separator => nil)
|
168
171
|
def self.cached(*args)
|
169
172
|
cache = (Thread.current[:marc_extractor_cached] ||= Hash.new)
|
170
|
-
|
171
|
-
ex = Traject::MarcExtractor.new(*args).freeze
|
172
|
-
ex.options.freeze
|
173
|
-
ex.spec_hash.freeze
|
174
|
-
ex
|
175
|
-
end)
|
176
|
-
|
177
|
-
return extractor
|
173
|
+
return ( cache[args] ||= Traject::MarcExtractor.new(*args).freeze )
|
178
174
|
end
|
179
175
|
|
180
176
|
# Check to see if a tag is interesting (meaning it may be covered by a spec
|
@@ -186,14 +182,14 @@ module Traject
|
|
186
182
|
|
187
183
|
# Converts from a string marc spec like "008[35]:245abc:700a" to a hash used internally
|
188
184
|
# to represent the specification. See comments at head of class for
|
189
|
-
# documentation of string specification format.
|
185
|
+
# documentation of string specification format.
|
190
186
|
#
|
191
187
|
#
|
192
|
-
#
|
188
|
+
# ## Return value
|
193
189
|
#
|
194
190
|
# The hash returned is keyed by tag, and has as values an array of 0 or
|
195
191
|
# more MarcExtractor::Spec objects representing the specified extraction
|
196
|
-
# operations for that tag.
|
192
|
+
# operations for that tag.
|
197
193
|
#
|
198
194
|
# It's an array of possibly more than one, because you can specify
|
199
195
|
# multiple extractions on the same tag: for instance "245a:245abc"
|
@@ -201,7 +197,7 @@ module Traject
|
|
201
197
|
# See tests for more examples.
|
202
198
|
def self.parse_string_spec(spec_string)
|
203
199
|
# hash defaults to []
|
204
|
-
hash = Hash.new
|
200
|
+
hash = Hash.new
|
205
201
|
|
206
202
|
spec_strings = spec_string.is_a?(Array) ? spec_string.map{|s| s.split(/\s*:\s*/)}.flatten : spec_string.split(/s*:\s*/)
|
207
203
|
|
@@ -222,8 +218,9 @@ module Traject
|
|
222
218
|
spec.indicator2 = indicators[1] if indicators[1] != "*"
|
223
219
|
end
|
224
220
|
|
221
|
+
hash[spec.tag] ||= []
|
225
222
|
hash[spec.tag] << spec
|
226
|
-
|
223
|
+
|
227
224
|
elsif (part =~ /\A([a-zA-Z0-9]{3})(\[(\d+)(-(\d+))?\])\Z/) # control field, "005[4-5]"
|
228
225
|
tag, byte1, byte2 = $1, $3, $5
|
229
226
|
|
@@ -234,7 +231,8 @@ module Traject
|
|
234
231
|
elsif byte1
|
235
232
|
spec.bytes = byte1.to_i
|
236
233
|
end
|
237
|
-
|
234
|
+
|
235
|
+
hash[spec.tag] ||= []
|
238
236
|
hash[spec.tag] << spec
|
239
237
|
else
|
240
238
|
raise ArgumentError.new("Unrecognized marc extract specification: #{part}")
|
@@ -286,7 +284,7 @@ module Traject
|
|
286
284
|
#
|
287
285
|
# Useful for re-use of this class for custom processing
|
288
286
|
#
|
289
|
-
# yields the MARC Field, the MarcExtractor::Spec object, the MarcExtractor object.
|
287
|
+
# yields the MARC Field, the MarcExtractor::Spec object, the MarcExtractor object.
|
290
288
|
def collect_matching_lines(marc_record)
|
291
289
|
results = []
|
292
290
|
self.each_matching_line(marc_record) do |field, spec, extractor|
|
@@ -312,7 +310,7 @@ module Traject
|
|
312
310
|
if options[:separator] && spec.joinable?
|
313
311
|
subfields = [subfields.join(options[:separator])]
|
314
312
|
end
|
315
|
-
|
313
|
+
|
316
314
|
return subfields
|
317
315
|
end
|
318
316
|
|
@@ -324,12 +322,12 @@ module Traject
|
|
324
322
|
# When given an 880, will return the spec (if any) for the linked tag iff
|
325
323
|
# we have a $6 and we want the alternate script.
|
326
324
|
#
|
327
|
-
# Returns an empty array in case of no matching extraction specs.
|
325
|
+
# Returns an empty array in case of no matching extraction specs.
|
328
326
|
def specs_covering_field(field)
|
329
327
|
tag = field.tag
|
330
328
|
|
331
329
|
# Short-circuit the uninteresting stuff
|
332
|
-
return
|
330
|
+
return [] unless interesting_tag?(tag)
|
333
331
|
|
334
332
|
# Due to bug in jruby https://github.com/jruby/jruby/issues/886 , we need
|
335
333
|
# to do this weird encode gymnastics, which fixes it for mysterious reasons.
|
@@ -339,7 +337,7 @@ module Traject
|
|
339
337
|
end
|
340
338
|
|
341
339
|
# Take the resulting tag and get the spec from it (or the default nil if there isn't a spec for this tag)
|
342
|
-
spec = self.spec_hash[tag]
|
340
|
+
spec = self.spec_hash[tag] || []
|
343
341
|
end
|
344
342
|
|
345
343
|
|
@@ -348,13 +346,19 @@ module Traject
|
|
348
346
|
# define #control_field? on both ControlField and DataField?
|
349
347
|
return field.kind_of? MARC::ControlField
|
350
348
|
end
|
351
|
-
|
349
|
+
|
350
|
+
def freeze
|
351
|
+
self.options.freeze
|
352
|
+
self.spec_hash.freeze
|
353
|
+
super
|
354
|
+
end
|
355
|
+
|
352
356
|
|
353
357
|
# Represents a single specification for extracting data
|
354
|
-
# from a marc field, like "600abc" or "600|1*|x".
|
358
|
+
# from a marc field, like "600abc" or "600|1*|x".
|
355
359
|
#
|
356
360
|
# Includes the tag for reference, although this is redundant and not actually used
|
357
|
-
# in logic, since the tag is also implicit in the overall spec_hash
|
361
|
+
# in logic, since the tag is also implicit in the overall spec_hash
|
358
362
|
# with tag => [spec1, spec2]
|
359
363
|
class Spec
|
360
364
|
attr_accessor :tag, :subfields, :indicator1, :indicator2, :bytes
|
@@ -365,7 +369,7 @@ module Traject
|
|
365
369
|
end
|
366
370
|
end
|
367
371
|
|
368
|
-
|
372
|
+
|
369
373
|
# Should subfields extracted be joined, if we have a separator?
|
370
374
|
# * '630' no subfields specified => join all subfields
|
371
375
|
# * '630abc' multiple subfields specified = join all subfields
|
@@ -379,8 +383,8 @@ module Traject
|
|
379
383
|
|
380
384
|
# Pass in a MARC field, do its indicators match indicators
|
381
385
|
# in this spec? nil indicators in spec mean we don't care, everything
|
382
|
-
# matches.
|
383
|
-
def matches_indicators?(field)
|
386
|
+
# matches.
|
387
|
+
def matches_indicators?(field)
|
384
388
|
return (self.indicator1.nil? || self.indicator1 == field.indicator1) &&
|
385
389
|
(self.indicator2.nil? || self.indicator2 == field.indicator2)
|
386
390
|
end
|
@@ -396,7 +400,7 @@ module Traject
|
|
396
400
|
return false unless spec.kind_of?(Spec)
|
397
401
|
|
398
402
|
return (self.tag == spec.tag) &&
|
399
|
-
(self.subfields == spec.subfields) &&
|
403
|
+
(self.subfields == spec.subfields) &&
|
400
404
|
(self.indicator1 == spec.indicator1) &&
|
401
405
|
(self.indicator1 == spec.indicator2) &&
|
402
406
|
(self.bytes == spec.bytes)
|