traject 0.16.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +1 -0
- data/README.md +183 -191
- data/bench/bench.rb +1 -1
- data/doc/batch_execution.md +14 -0
- data/doc/extending.md +14 -12
- data/doc/indexing_rules.md +265 -0
- data/lib/traject/command_line.rb +12 -41
- data/lib/traject/debug_writer.rb +32 -13
- data/lib/traject/indexer.rb +101 -24
- data/lib/traject/indexer/settings.rb +18 -17
- data/lib/traject/json_writer.rb +32 -11
- data/lib/traject/line_writer.rb +6 -6
- data/lib/traject/macros/basic.rb +1 -1
- data/lib/traject/macros/marc21.rb +17 -13
- data/lib/traject/macros/marc21_semantics.rb +27 -25
- data/lib/traject/macros/marc_format_classifier.rb +39 -25
- data/lib/traject/marc4j_reader.rb +36 -22
- data/lib/traject/marc_extractor.rb +79 -75
- data/lib/traject/marc_reader.rb +33 -25
- data/lib/traject/mock_reader.rb +9 -10
- data/lib/traject/ndj_reader.rb +7 -7
- data/lib/traject/null_writer.rb +1 -1
- data/lib/traject/qualified_const_get.rb +12 -2
- data/lib/traject/solrj_writer.rb +61 -52
- data/lib/traject/thread_pool.rb +45 -45
- data/lib/traject/translation_map.rb +59 -27
- data/lib/traject/util.rb +3 -3
- data/lib/traject/version.rb +1 -1
- data/lib/traject/yaml_writer.rb +1 -1
- data/test/debug_writer_test.rb +7 -7
- data/test/indexer/each_record_test.rb +4 -4
- data/test/indexer/macros_marc21_semantics_test.rb +12 -12
- data/test/indexer/macros_marc21_test.rb +10 -10
- data/test/indexer/macros_test.rb +1 -1
- data/test/indexer/map_record_test.rb +6 -6
- data/test/indexer/read_write_test.rb +43 -4
- data/test/indexer/settings_test.rb +2 -2
- data/test/indexer/to_field_test.rb +8 -8
- data/test/marc4j_reader_test.rb +4 -4
- data/test/marc_extractor_test.rb +33 -25
- data/test/marc_format_classifier_test.rb +3 -3
- data/test/marc_reader_test.rb +2 -2
- data/test/test_helper.rb +3 -3
- data/test/test_support/demo_config.rb +52 -48
- data/test/translation_map_test.rb +22 -4
- data/test/translation_maps/bad_ruby.rb +2 -2
- data/test/translation_maps/both_map.rb +1 -1
- data/test/translation_maps/default_literal.rb +1 -1
- data/test/translation_maps/default_passthrough.rb +1 -1
- data/test/translation_maps/ruby_map.rb +1 -1
- metadata +7 -31
- data/doc/macros.md +0 -103
@@ -62,10 +62,10 @@ module Traject::Macros
|
|
62
62
|
def self.get_sortable_author(record)
|
63
63
|
onexx = MarcExtractor.cached("100:110:111", :first => true, :trim_punctuation => true).extract(record).first
|
64
64
|
onexx = onexx.strip if onexx
|
65
|
-
|
65
|
+
|
66
66
|
titles = []
|
67
67
|
MarcExtractor.cached("240:245", :first => true).each_matching_line(record) do |field, spec|
|
68
|
-
non_filing = field.indicator2.to_i
|
68
|
+
non_filing = field.indicator2.to_i
|
69
69
|
|
70
70
|
str = field.subfields.collect {|sf| Marc21.trim_punctuation(sf.value.strip).strip}.join(" ")
|
71
71
|
str = str.slice(non_filing, str.length)
|
@@ -73,7 +73,7 @@ module Traject::Macros
|
|
73
73
|
end.first
|
74
74
|
title = titles.first
|
75
75
|
title = title.strip if title
|
76
|
-
|
76
|
+
|
77
77
|
return [onexx, title].compact.join(" ")
|
78
78
|
end
|
79
79
|
|
@@ -105,26 +105,26 @@ module Traject::Macros
|
|
105
105
|
str
|
106
106
|
end.first
|
107
107
|
end
|
108
|
-
|
109
|
-
|
110
|
-
|
108
|
+
|
109
|
+
|
110
|
+
|
111
111
|
# A generic way to strip a filing version (i.e., a string with the non-filing
|
112
112
|
# characters stripped off)
|
113
113
|
#
|
114
114
|
# Always returns an array. If :include_original=>true is passed in,
|
115
115
|
# that array will include the original string with the non-filing
|
116
116
|
# characters still in it.
|
117
|
-
|
117
|
+
|
118
118
|
def extract_marc_filing_version(spec='245abdefghknp', opts={})
|
119
119
|
include_original = opts.delete(:include_original)
|
120
120
|
if opts.size > 0
|
121
121
|
raise RuntimeError.new("extract_marc_filing_version can take only :include_original as an argument, not #{opts.keys.map{|x| "'#{x}'"}.join(' or ')}")
|
122
122
|
end
|
123
|
-
|
123
|
+
|
124
124
|
extractor = Traject::MarcExtractor.cached(spec, opts)
|
125
|
-
|
125
|
+
|
126
126
|
lambda do |record, accumulator, context|
|
127
|
-
extractor.collect_matching_lines(record) do |field, spec|
|
127
|
+
extractor.collect_matching_lines(record) do |field, spec|
|
128
128
|
str = extractor.collect_subfields(field, spec).first
|
129
129
|
next unless str and !str.empty?
|
130
130
|
vals = [Marc21Semantics.filing_version(field, str, spec)]
|
@@ -136,34 +136,34 @@ module Traject::Macros
|
|
136
136
|
end
|
137
137
|
end
|
138
138
|
end
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
139
|
+
|
140
|
+
|
141
|
+
|
142
|
+
|
143
143
|
# Take in a field, a string extracted from that field, and a spec and
|
144
|
-
# return the filing version (i.e., the string without the
|
144
|
+
# return the filing version (i.e., the string without the
|
145
145
|
# non-filing characters)
|
146
|
-
|
146
|
+
|
147
147
|
def self.filing_version(field, str, spec)
|
148
148
|
# Control fields don't have non-filing characters
|
149
149
|
return str if field.kind_of? MARC::ControlField
|
150
|
-
|
150
|
+
|
151
151
|
# 2nd indicator must be > 0
|
152
152
|
ind2 = field.indicator2.to_i
|
153
153
|
return str unless ind2 > 0
|
154
|
-
|
154
|
+
|
155
155
|
# The spechash must either (a) have no subfields specified, or
|
156
156
|
# (b) include the first subfield in the record
|
157
|
-
|
157
|
+
|
158
158
|
subs = spec.subfields
|
159
159
|
return str unless subs && subs.include?(field.subfields[0].code)
|
160
|
-
|
160
|
+
|
161
161
|
# OK. If we got this far we actually need to strip characters off the string
|
162
|
-
|
162
|
+
|
163
163
|
return str[ind2..-1]
|
164
164
|
end
|
165
|
-
|
166
|
-
|
165
|
+
|
166
|
+
|
167
167
|
|
168
168
|
|
169
169
|
# maps languages, by default out of 008[35-37] and 041a and 041d
|
@@ -367,6 +367,9 @@ module Traject::Macros
|
|
367
367
|
return found_date
|
368
368
|
end
|
369
369
|
|
370
|
+
# REGEX meant to rule out obvious non-LCC's, and only allow things
|
371
|
+
# plausibly LCC's.
|
372
|
+
LCC_REGEX = /\A *[A-Z]{1,3}[ .]*(?:(\d+)(?:\s*?\.\s*?(\d+))?).*/
|
370
373
|
# Looks up Library of Congress Classification (LCC) or NLM Medical Subject Headings (MeSH)
|
371
374
|
# from usual parts of the marc record. Maps them to high-level broad categories,
|
372
375
|
# basically just using the first part of the LCC. Note it's just looking in bib-level
|
@@ -379,7 +382,6 @@ module Traject::Macros
|
|
379
382
|
# or nil.
|
380
383
|
#
|
381
384
|
# The categories output aren't great, but they're something.
|
382
|
-
LCC_REGEX = /\A *[A-Z]{1,3}[ .]*(?:(\d+)(?:\s*?\.\s*?(\d+))?).*/
|
383
385
|
def marc_lcc_to_broad_category( options = {}, spec="050a:060a:090a:096a")
|
384
386
|
# Trying to match things that look like LCC, and not match things
|
385
387
|
# that don't. Is tricky.
|
@@ -503,4 +505,4 @@ module Traject::Macros
|
|
503
505
|
|
504
506
|
|
505
507
|
end
|
506
|
-
end
|
508
|
+
end
|
@@ -1,9 +1,19 @@
|
|
1
1
|
module Traject
|
2
2
|
module Macros
|
3
|
-
#
|
3
|
+
# To use the marc_format macro, in your configuration file:
|
4
|
+
#
|
5
|
+
# require 'traject/macros/marc_formats'
|
6
|
+
# extend Traject::Macros::MarcFormats
|
7
|
+
#
|
8
|
+
# to_field("format_s") marc_formats
|
9
|
+
#
|
10
|
+
# See also MarcFormatClassifier which can be used directly for a bit more
|
11
|
+
# control.
|
4
12
|
module MarcFormats
|
5
13
|
# very opinionated macro that just adds a grab bag of format/genre/types
|
6
|
-
#
|
14
|
+
# from our own custom vocabulary, all into one field.
|
15
|
+
# You may want to build your own from MarcFormatClassifier functions instead.
|
16
|
+
#
|
7
17
|
def marc_formats
|
8
18
|
lambda do |record, accumulator|
|
9
19
|
accumulator.concat Traject::Macros::MarcFormatClassifier.new(record).formats
|
@@ -12,10 +22,11 @@ module Traject
|
|
12
22
|
end
|
13
23
|
|
14
24
|
|
15
|
-
#
|
16
|
-
#
|
25
|
+
# A tool for classifying MARC records according to format/form/genre/type,
|
26
|
+
# just using our own custom vocabulary for those things.
|
17
27
|
#
|
18
|
-
#
|
28
|
+
# used by the `marc_formats` macro, but you can also use it directly
|
29
|
+
# for a bit more control.
|
19
30
|
class MarcFormatClassifier
|
20
31
|
attr_reader :record
|
21
32
|
|
@@ -24,22 +35,25 @@ module Traject
|
|
24
35
|
end
|
25
36
|
|
26
37
|
# A very opinionated method that just kind of jams together
|
27
|
-
# all the possible format/genre/types into one array of 1 to N elements.
|
38
|
+
# all the possible format/genre/types into one array of 1 to N elements.
|
28
39
|
#
|
29
|
-
#
|
40
|
+
# If no other values are present, the default value "Other" will be used.
|
41
|
+
#
|
42
|
+
# See also individual methods which you can use to separate into
|
43
|
+
# different facets or do other custom things.
|
30
44
|
def formats(options = {})
|
31
45
|
options = {:default => "Other"}.merge(options)
|
32
46
|
|
33
47
|
formats = []
|
34
48
|
|
35
49
|
formats.concat genre
|
36
|
-
|
50
|
+
|
37
51
|
formats << "Manuscript/Archive" if manuscript_archive?
|
38
52
|
formats << "Microform" if microform?
|
39
53
|
formats << "Online" if online?
|
40
54
|
|
41
55
|
# In our own data, if it's an audio recording, it might show up
|
42
|
-
# as print, but it's probably not.
|
56
|
+
# as print, but it's probably not.
|
43
57
|
formats << "Print" if print? && ! (formats.include?("Non-musical Recording") || formats.include?("Musical Recording"))
|
44
58
|
|
45
59
|
# If it's a Dissertation, we decide it's NOT a book
|
@@ -64,11 +78,11 @@ module Traject
|
|
64
78
|
# Returns 1 or more values in an array from:
|
65
79
|
# Book; Journal/Newspaper; Musical Score; Map/Globe; Non-musical Recording; Musical Recording
|
66
80
|
# Image; Software/Data; Video/Film
|
67
|
-
#
|
68
|
-
# Uses leader byte 6, leader byte 7, and 007 byte 0.
|
81
|
+
#
|
82
|
+
# Uses leader byte 6, leader byte 7, and 007 byte 0.
|
69
83
|
#
|
70
84
|
# Gets actual labels from marc_genre_leader and marc_genre_007 translation maps,
|
71
|
-
# so you can customize labels if you want.
|
85
|
+
# so you can customize labels if you want.
|
72
86
|
def genre
|
73
87
|
marc_genre_leader = Traject::TranslationMap.new("marc_genre_leader")
|
74
88
|
marc_genre_007 = Traject::TranslationMap.new("marc_genre_007")
|
@@ -96,18 +110,18 @@ module Traject
|
|
96
110
|
end
|
97
111
|
end
|
98
112
|
|
99
|
-
# Algorithm with help from Chris Case.
|
100
|
-
# * If it has any RDA 338, then it's print if it has a value of
|
101
|
-
# volume, sheet, or card.
|
113
|
+
# Algorithm with help from Chris Case.
|
114
|
+
# * If it has any RDA 338, then it's print if it has a value of
|
115
|
+
# volume, sheet, or card.
|
102
116
|
# * If it does not have an RDA 338, it's print if and only if it has
|
103
|
-
# NO 245$h GMD.
|
117
|
+
# NO 245$h GMD.
|
104
118
|
#
|
105
|
-
# * Here at JH, for legacy reasons we also choose to not
|
119
|
+
# * Here at JH, for legacy reasons we also choose to not
|
106
120
|
# call it print if it's already been marked audio, but
|
107
|
-
# we do that in a different method.
|
121
|
+
# we do that in a different method.
|
108
122
|
#
|
109
123
|
# This algorithm is definitely going to get some things wrong in
|
110
|
-
# both directions, with real world data. But seems to be good enough.
|
124
|
+
# both directions, with real world data. But seems to be good enough.
|
111
125
|
def print?
|
112
126
|
|
113
127
|
|
@@ -116,7 +130,7 @@ module Traject
|
|
116
130
|
end
|
117
131
|
|
118
132
|
if rda338.length > 0
|
119
|
-
rda338.find do |field|
|
133
|
+
rda338.find do |field|
|
120
134
|
field.subfields.find do |sf|
|
121
135
|
(sf.code == "a" && %w{volume card sheet}.include?(sf.value)) ||
|
122
136
|
(sf.code == "b" && %w{nc no nb}.include?(sf.value))
|
@@ -128,7 +142,7 @@ module Traject
|
|
128
142
|
end
|
129
143
|
|
130
144
|
# We use marc 007 to determine if this represents an online
|
131
|
-
# resource. But sometimes resort to 245$h GMD too.
|
145
|
+
# resource. But sometimes resort to 245$h GMD too.
|
132
146
|
def online?
|
133
147
|
# field 007, byte 0 c="electronic" byte 1 r="remote" ==> sure Online
|
134
148
|
found_007 = record.find do |field|
|
@@ -140,8 +154,8 @@ module Traject
|
|
140
154
|
# Otherwise, if it has a GMD ["electronic resource"], we count it
|
141
155
|
# as online only if NO 007[0] == 'c' exists, cause if it does we already
|
142
156
|
# know it's electronic but not remote, otherwise first try would
|
143
|
-
# have found it.
|
144
|
-
return (normalized_gmd.start_with? "[electronic resource]") && ! record.find {|f| f.tag == '007' && f.value.slice(0) == "c"}
|
157
|
+
# have found it.
|
158
|
+
return (normalized_gmd.start_with? "[electronic resource]") && ! record.find {|f| f.tag == '007' && f.value.slice(0) == "c"}
|
145
159
|
end
|
146
160
|
|
147
161
|
# if field 007 byte 0 is 'h', that's microform. But many of our microform
|
@@ -153,7 +167,7 @@ module Traject
|
|
153
167
|
record.find {|f| (f.tag == "007") && (f.value[0] == "h")}
|
154
168
|
end
|
155
169
|
|
156
|
-
# Marked as manuscript OR archive.
|
170
|
+
# Marked as manuscript OR archive.
|
157
171
|
def manuscript_archive?
|
158
172
|
leader06 = record.leader.slice(6)
|
159
173
|
leader08 = record.leader.slice(8)
|
@@ -177,4 +191,4 @@ module Traject
|
|
177
191
|
|
178
192
|
end
|
179
193
|
end
|
180
|
-
end
|
194
|
+
end
|
@@ -2,24 +2,21 @@ require 'traject'
|
|
2
2
|
require 'marc'
|
3
3
|
require 'marc/marc4j'
|
4
4
|
|
5
|
-
#
|
6
|
-
# ruby-marc
|
7
|
-
#
|
5
|
+
# `Traject::Marc4JReader` uses the marc4j java package to parse the MARC records
|
6
|
+
# into standard ruby-marc MARC::Record objects. This reader is often faster than
|
7
|
+
# Traject::MarcReader, especially for XML, and offers support for reading Marc8
|
8
|
+
# encoded records and transcoding to UTF8.
|
8
9
|
#
|
9
|
-
#
|
10
|
-
#
|
10
|
+
# Marc4JReader can read MARC ISO 2709 ("binary") or MARCXML. We use the Marc4J MarcPermissiveStreamReader
|
11
|
+
# for reading binary, but sometimes in non-permissive mode, according to settings. We use the Marc4j MarcXmlReader
|
12
|
+
# for reading xml. The actual code for dealing with Marc4J is in the separate
|
13
|
+
# [marc-marc4j gem](https://github.com/billdueber/ruby-marc-marc4j).
|
11
14
|
#
|
12
|
-
#
|
15
|
+
# See also the pure ruby Traject::MarcReader as an alternative, if you need to read
|
16
|
+
# marc-in-json, or if you don't need binary Marc8 support, it may in some cases
|
17
|
+
# be faster.
|
13
18
|
#
|
14
|
-
#
|
15
|
-
# in non-permissive mode, according to settings. Uses the Marc4j MarcXmlReader
|
16
|
-
# for xml.
|
17
|
-
#
|
18
|
-
# NOTE: If you aren't reading in binary records encoded in MARC8, you may
|
19
|
-
# find the pure-ruby Traject::MarcReader faster; the extra step to read
|
20
|
-
# Marc4J but translate to ruby MARC::Record adds some overhead.
|
21
|
-
#
|
22
|
-
# Settings:
|
19
|
+
# ## Settings
|
23
20
|
#
|
24
21
|
# * marc_source.type: serialization type. default 'binary', also 'xml' (TODO: json/marc-in-json)
|
25
22
|
#
|
@@ -39,9 +36,26 @@ require 'marc/marc4j'
|
|
39
36
|
# * marc4j_reader.jar_dir: Path to a directory containing Marc4J jar file to use. All .jar's in dir will
|
40
37
|
# be loaded. If unset, uses marc4j.jar bundled with traject.
|
41
38
|
#
|
42
|
-
# * marc4j_reader.keep_marc4j: Keeps the original marc4j record accessible from
|
43
|
-
#
|
44
|
-
|
39
|
+
# * marc4j_reader.keep_marc4j: Keeps the original marc4j record accessible from
|
40
|
+
# the eventual ruby-marc record via record#original_marc4j. Intended for
|
41
|
+
# those that have legacy java code for which a marc4j object is needed.
|
42
|
+
#
|
43
|
+
#
|
44
|
+
# ## Example
|
45
|
+
#
|
46
|
+
# In a configuration file:
|
47
|
+
#
|
48
|
+
# require 'traject/marc4j_reader'
|
49
|
+
# settings do
|
50
|
+
# provide "reader_class_name", "Traject::Marc4JReader"
|
51
|
+
#
|
52
|
+
# #for MarcXML:
|
53
|
+
# # provide "marc_source.type", "xml"
|
54
|
+
#
|
55
|
+
# # Or instead for binary:
|
56
|
+
# provide "marc4j_reader.permissive", true
|
57
|
+
# provide "marc4j_reader.source_encoding", "MARC8"
|
58
|
+
# end
|
45
59
|
class Traject::Marc4JReader
|
46
60
|
include Enumerable
|
47
61
|
|
@@ -56,14 +70,14 @@ class Traject::Marc4JReader
|
|
56
70
|
MARC::Record.instance_methods.include?(:"original_marc4j="))
|
57
71
|
MARC::Record.class_eval('attr_accessor :original_marc4j')
|
58
72
|
end
|
59
|
-
|
73
|
+
|
60
74
|
# Creating a converter will do the following:
|
61
75
|
# - nothing, if it detects that the marc4j jar is already loaded
|
62
76
|
# - load all the .jar files in settings['marc4j_reader.jar_dir'] if set
|
63
77
|
# - load the marc4j jar file bundled with MARC::MARC4J otherwise
|
64
|
-
|
78
|
+
|
65
79
|
@converter = MARC::MARC4J.new(:jardir => settings['marc4j_reader.jar_dir'], :logger => logger)
|
66
|
-
|
80
|
+
|
67
81
|
# Convenience
|
68
82
|
java_import org.marc4j.MarcPermissiveStreamReader
|
69
83
|
java_import org.marc4j.MarcXmlReader
|
@@ -121,4 +135,4 @@ class Traject::Marc4JReader
|
|
121
135
|
@logger ||= (settings[:logger] || Yell.new(STDERR, :level => "gt.fatal")) # null logger)
|
122
136
|
end
|
123
137
|
|
124
|
-
end
|
138
|
+
end
|
@@ -6,22 +6,23 @@ module Traject
|
|
6
6
|
#
|
7
7
|
# Examples:
|
8
8
|
#
|
9
|
-
#
|
10
|
-
#
|
11
|
-
#
|
12
|
-
#
|
9
|
+
# array_of_stuff = MarcExtractor.new("001:245abc:700a").extract(marc_record)
|
10
|
+
# values = MarcExtractor.new("245a:245abc").extract(marc_record)
|
11
|
+
# separated_values = MarcExtractor.new("020a:020z").extract(marc_record)
|
12
|
+
# bytes = MarcExtractor.new("008[35-37]")
|
13
13
|
#
|
14
|
-
#
|
14
|
+
# ## String extraction specifications
|
15
15
|
#
|
16
16
|
# Extraction directions are supplied in strings, usually as the first
|
17
17
|
# parameter to MarcExtractor.new or MarcExtractor.cached. These specifications
|
18
|
-
# are also the first parameter to the #marc_extract macro.
|
18
|
+
# are also the first parameter to the #marc_extract macro.
|
19
19
|
#
|
20
20
|
# A String specification is a string (or array of strings) which consists
|
21
|
-
# of one or more Data and Control Field Specifications seperated by colons.
|
21
|
+
# of one or more Data and Control Field Specifications separated by colons.
|
22
22
|
#
|
23
23
|
# A Data Field Specification is of the form:
|
24
|
-
#
|
24
|
+
#
|
25
|
+
# * `{tag}{|indicators|}{subfields}`
|
25
26
|
# * {tag} is three chars (usually but not necessarily numeric)
|
26
27
|
# * {indicators} are optional two chars enclosed in pipe ('|') characters,
|
27
28
|
# * {subfields} are optional list of chars (alphanumeric)
|
@@ -29,58 +30,58 @@ module Traject
|
|
29
30
|
# indicator spec must be two chars, but one can be * meaning "don't care".
|
30
31
|
# space to mean 'blank'
|
31
32
|
#
|
32
|
-
#
|
33
|
+
# "245|01|abc65:345abc:700|*5|:800"
|
33
34
|
#
|
34
35
|
# A Control Field Specification is used with tags for control (fixed) fields (ordinarily fields 001-010)
|
35
|
-
# and includes a tag and a a byte slice specification.
|
36
|
+
# and includes a tag and a byte slice specification.
|
36
37
|
#
|
37
|
-
#
|
38
|
-
#
|
39
|
-
#
|
38
|
+
# "008[35-37]:007[5]"
|
39
|
+
# => bytes 35-37 inclusive of any field 008, and byte 5 of any field 007 (TODO: Should we support
|
40
|
+
# "LDR" as a pseudo-tag to take byte slices of leader?)
|
40
41
|
#
|
41
42
|
# * subfields and indicators can only be provided for marc data/variable fields
|
42
43
|
# * byte slice can only be provided for marc control fields (generally tags less than 010)
|
43
44
|
#
|
44
|
-
#
|
45
|
+
# ## Subfield concatenation
|
45
46
|
#
|
46
47
|
# Normally, for a spec including multiple subfield codes, multiple subfields
|
47
48
|
# from the same MARC field will be concatenated into one string separated by spaces:
|
48
49
|
#
|
49
|
-
#
|
50
|
-
#
|
51
|
-
#
|
52
|
-
#
|
53
|
-
#
|
54
|
-
#
|
50
|
+
# 600 a| Chomsky, Noam x| Philosophy.
|
51
|
+
# 600 a| Chomsky, Noam x| Political and social views.
|
52
|
+
# MarcExtractor.new("600ax").extract(record)
|
53
|
+
# # results in two values sent to Solr:
|
54
|
+
# "Chomsky, Noam Philosophy."
|
55
|
+
# "Chomsky, Noam Political and social views."
|
55
56
|
#
|
56
57
|
# You can turn off this concatenation and leave individual subfields in separate
|
57
58
|
# strings by setting the `separator` option to nil:
|
58
59
|
#
|
59
|
-
#
|
60
|
-
#
|
61
|
-
#
|
62
|
-
#
|
63
|
-
#
|
64
|
-
#
|
60
|
+
# MarcExtractor.new("600ax", :separator => nil).extract(record)
|
61
|
+
# # Results in four values being sent to Solr (or 3 if you de-dup):
|
62
|
+
# "Chomsky, Noam"
|
63
|
+
# "Philosophy."
|
64
|
+
# "Chomsky, Noam"
|
65
|
+
# "Political and social views."
|
65
66
|
#
|
66
67
|
# However, **the default is different for specifications with only a single
|
67
68
|
# subfield**, these are by default kept separated:
|
68
69
|
#
|
69
|
-
#
|
70
|
-
#
|
71
|
-
#
|
72
|
-
#
|
73
|
-
#
|
70
|
+
# 020 a| 285197145X a| 9782851971456
|
71
|
+
# MarcExtractor.new("020a:020z").extract(record)
|
72
|
+
# # two separate strings sent to Solr:
|
73
|
+
# "285197145X"
|
74
|
+
# "9782851971456"
|
74
75
|
#
|
75
76
|
# For single subfield specifications, you force concatenation by
|
76
77
|
# repeating the subfield specification:
|
77
78
|
#
|
78
|
-
#
|
79
|
-
#
|
80
|
-
#
|
81
|
-
#
|
79
|
+
# MarcExtractor.new("020aa:020zz").extract(record)
|
80
|
+
# # would result in a single string sent to solr for
|
81
|
+
# # the single field, by default space-separated:
|
82
|
+
# "285197145X 9782851971456"
|
82
83
|
#
|
83
|
-
#
|
84
|
+
# ## Note on Performance and MarcExtractor creation and reuse
|
84
85
|
#
|
85
86
|
# A MarcExtractor is somewhat expensive to create, and has been shown in profiling/
|
86
87
|
# benchmarking to be a bottleneck if you end up creating one for each marc record
|
@@ -90,15 +91,15 @@ module Traject
|
|
90
91
|
# If you are creating a traject 'macro' method, here's one way to do that,
|
91
92
|
# capturing the MarcExtractor under closure:
|
92
93
|
#
|
93
|
-
#
|
94
|
-
#
|
95
|
-
#
|
96
|
-
#
|
97
|
-
#
|
98
|
-
#
|
99
|
-
#
|
100
|
-
#
|
101
|
-
#
|
94
|
+
# def some_macro(spec, other_args, whatever)
|
95
|
+
# extractor = MarcExtractor.new( spec )
|
96
|
+
# # ...
|
97
|
+
# return lambda do |record, accumulator, context|
|
98
|
+
# #...
|
99
|
+
# accumulator.concat extractor.extract(record)
|
100
|
+
# #...
|
101
|
+
# end
|
102
|
+
# end
|
102
103
|
#
|
103
104
|
# In other cases, you may find it convenient to improve performance by
|
104
105
|
# using the MarcExtractor#cached method, instead of MarcExtractor#new, to
|
@@ -107,13 +108,13 @@ module Traject
|
|
107
108
|
class MarcExtractor
|
108
109
|
attr_accessor :options, :spec_hash
|
109
110
|
|
110
|
-
# First arg is a specification for extraction of data from a MARC record.
|
111
|
+
# First arg is a specification for extraction of data from a MARC record.
|
111
112
|
# Specification can be given in two forms:
|
112
113
|
#
|
113
114
|
# * a string specification like "008[35]:020a:245abc", see top of class
|
114
|
-
# for examples. A string specification is most typical argument.
|
115
|
+
# for examples. A string specification is most typical argument.
|
115
116
|
# * The output of a previous call to MarcExtractor.parse_string_spec(string_spec),
|
116
|
-
# a 'pre-parsed' specification.
|
117
|
+
# a 'pre-parsed' specification.
|
117
118
|
#
|
118
119
|
# Second arg is options:
|
119
120
|
#
|
@@ -146,6 +147,8 @@ module Traject
|
|
146
147
|
if options[:alternate_script] != false
|
147
148
|
@interesting_tags_hash['880'] = true
|
148
149
|
end
|
150
|
+
|
151
|
+
self.freeze
|
149
152
|
end
|
150
153
|
|
151
154
|
# Takes the same arguments as MarcExtractor.new, but will re-use an existing
|
@@ -164,17 +167,10 @@ module Traject
|
|
164
167
|
# although if you try hard enough you can surely find a way to do something
|
165
168
|
# you shouldn't.
|
166
169
|
#
|
167
|
-
#
|
170
|
+
# extractor = MarcExtractor.cached("245abc:700a", :separator => nil)
|
168
171
|
def self.cached(*args)
|
169
172
|
cache = (Thread.current[:marc_extractor_cached] ||= Hash.new)
|
170
|
-
|
171
|
-
ex = Traject::MarcExtractor.new(*args).freeze
|
172
|
-
ex.options.freeze
|
173
|
-
ex.spec_hash.freeze
|
174
|
-
ex
|
175
|
-
end)
|
176
|
-
|
177
|
-
return extractor
|
173
|
+
return ( cache[args] ||= Traject::MarcExtractor.new(*args).freeze )
|
178
174
|
end
|
179
175
|
|
180
176
|
# Check to see if a tag is interesting (meaning it may be covered by a spec
|
@@ -186,14 +182,14 @@ module Traject
|
|
186
182
|
|
187
183
|
# Converts from a string marc spec like "008[35]:245abc:700a" to a hash used internally
|
188
184
|
# to represent the specification. See comments at head of class for
|
189
|
-
# documentation of string specification format.
|
185
|
+
# documentation of string specification format.
|
190
186
|
#
|
191
187
|
#
|
192
|
-
#
|
188
|
+
# ## Return value
|
193
189
|
#
|
194
190
|
# The hash returned is keyed by tag, and has as values an array of 0 or
|
195
191
|
# more MarcExtractor::Spec objects representing the specified extraction
|
196
|
-
# operations for that tag.
|
192
|
+
# operations for that tag.
|
197
193
|
#
|
198
194
|
# It's an array of possibly more than one, because you can specify
|
199
195
|
# multiple extractions on the same tag: for instance "245a:245abc"
|
@@ -201,7 +197,7 @@ module Traject
|
|
201
197
|
# See tests for more examples.
|
202
198
|
def self.parse_string_spec(spec_string)
|
203
199
|
# plain hash with no default; per-tag arrays are created on demand below
|
204
|
-
hash = Hash.new
|
200
|
+
hash = Hash.new
|
205
201
|
|
206
202
|
spec_strings = spec_string.is_a?(Array) ? spec_string.map{|s| s.split(/\s*:\s*/)}.flatten : spec_string.split(/s*:\s*/)
|
207
203
|
|
@@ -222,8 +218,9 @@ module Traject
|
|
222
218
|
spec.indicator2 = indicators[1] if indicators[1] != "*"
|
223
219
|
end
|
224
220
|
|
221
|
+
hash[spec.tag] ||= []
|
225
222
|
hash[spec.tag] << spec
|
226
|
-
|
223
|
+
|
227
224
|
elsif (part =~ /\A([a-zA-Z0-9]{3})(\[(\d+)(-(\d+))?\])\Z/) # control field, "005[4-5]"
|
228
225
|
tag, byte1, byte2 = $1, $3, $5
|
229
226
|
|
@@ -234,7 +231,8 @@ module Traject
|
|
234
231
|
elsif byte1
|
235
232
|
spec.bytes = byte1.to_i
|
236
233
|
end
|
237
|
-
|
234
|
+
|
235
|
+
hash[spec.tag] ||= []
|
238
236
|
hash[spec.tag] << spec
|
239
237
|
else
|
240
238
|
raise ArgumentError.new("Unrecognized marc extract specification: #{part}")
|
@@ -286,7 +284,7 @@ module Traject
|
|
286
284
|
#
|
287
285
|
# Useful for re-use of this class for custom processing
|
288
286
|
#
|
289
|
-
# yields the MARC Field, the MarcExtractor::Spec object, the MarcExtractor object.
|
287
|
+
# yields the MARC Field, the MarcExtractor::Spec object, the MarcExtractor object.
|
290
288
|
def collect_matching_lines(marc_record)
|
291
289
|
results = []
|
292
290
|
self.each_matching_line(marc_record) do |field, spec, extractor|
|
@@ -312,7 +310,7 @@ module Traject
|
|
312
310
|
if options[:separator] && spec.joinable?
|
313
311
|
subfields = [subfields.join(options[:separator])]
|
314
312
|
end
|
315
|
-
|
313
|
+
|
316
314
|
return subfields
|
317
315
|
end
|
318
316
|
|
@@ -324,12 +322,12 @@ module Traject
|
|
324
322
|
# When given an 880, will return the spec (if any) for the linked tag iff
|
325
323
|
# we have a $6 and we want the alternate script.
|
326
324
|
#
|
327
|
-
# Returns an empty array in case of no matching extraction specs.
|
325
|
+
# Returns an empty array in case of no matching extraction specs.
|
328
326
|
def specs_covering_field(field)
|
329
327
|
tag = field.tag
|
330
328
|
|
331
329
|
# Short-circuit the uninteresting stuff
|
332
|
-
return
|
330
|
+
return [] unless interesting_tag?(tag)
|
333
331
|
|
334
332
|
# Due to bug in jruby https://github.com/jruby/jruby/issues/886 , we need
|
335
333
|
# to do this weird encode gymnastics, which fixes it for mysterious reasons.
|
@@ -339,7 +337,7 @@ module Traject
|
|
339
337
|
end
|
340
338
|
|
341
339
|
# Take the resulting tag and get the specs for it (or an empty array if there isn't a spec for this tag)
|
342
|
-
spec = self.spec_hash[tag]
|
340
|
+
spec = self.spec_hash[tag] || []
|
343
341
|
end
|
344
342
|
|
345
343
|
|
@@ -348,13 +346,19 @@ module Traject
|
|
348
346
|
# define #control_field? on both ControlField and DataField?
|
349
347
|
return field.kind_of? MARC::ControlField
|
350
348
|
end
|
351
|
-
|
349
|
+
|
350
|
+
def freeze
|
351
|
+
self.options.freeze
|
352
|
+
self.spec_hash.freeze
|
353
|
+
super
|
354
|
+
end
|
355
|
+
|
352
356
|
|
353
357
|
# Represents a single specification for extracting data
|
354
|
-
# from a marc field, like "600abc" or "600|1*|x".
|
358
|
+
# from a marc field, like "600abc" or "600|1*|x".
|
355
359
|
#
|
356
360
|
# Includes the tag for reference, although this is redundant and not actually used
|
357
|
-
# in logic, since the tag is also implicit in the overall spec_hash
|
361
|
+
# in logic, since the tag is also implicit in the overall spec_hash
|
358
362
|
# with tag => [spec1, spec2]
|
359
363
|
class Spec
|
360
364
|
attr_accessor :tag, :subfields, :indicator1, :indicator2, :bytes
|
@@ -365,7 +369,7 @@ module Traject
|
|
365
369
|
end
|
366
370
|
end
|
367
371
|
|
368
|
-
|
372
|
+
|
369
373
|
# Should extracted subfields be joined, if we have a separator?
|
370
374
|
# * '630' no subfields specified => join all subfields
|
371
375
|
# * '630abc' multiple subfields specified = join all subfields
|
@@ -379,8 +383,8 @@ module Traject
|
|
379
383
|
|
380
384
|
# Pass in a MARC field, do its indicators match indicators
|
381
385
|
# in this spec? nil indicators in spec mean we don't care, everything
|
382
|
-
# matches.
|
383
|
-
def matches_indicators?(field)
|
386
|
+
# matches.
|
387
|
+
def matches_indicators?(field)
|
384
388
|
return (self.indicator1.nil? || self.indicator1 == field.indicator1) &&
|
385
389
|
(self.indicator2.nil? || self.indicator2 == field.indicator2)
|
386
390
|
end
|
@@ -396,7 +400,7 @@ module Traject
|
|
396
400
|
return false unless spec.kind_of?(Spec)
|
397
401
|
|
398
402
|
return (self.tag == spec.tag) &&
|
399
|
-
(self.subfields == spec.subfields) &&
|
403
|
+
(self.subfields == spec.subfields) &&
|
400
404
|
(self.indicator1 == spec.indicator1) &&
|
401
405
|
(self.indicator1 == spec.indicator2) &&
|
402
406
|
(self.bytes == spec.bytes)
|