traject 2.0.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.travis.yml +27 -0
- data/.yardopts +3 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +20 -0
- data/README.md +461 -0
- data/Rakefile +21 -0
- data/bench/bench.rb +30 -0
- data/bin/traject +16 -0
- data/doc/batch_execution.md +243 -0
- data/doc/extending.md +190 -0
- data/doc/indexing_rules.md +265 -0
- data/doc/other_commands.md +47 -0
- data/doc/settings.md +101 -0
- data/lib/tasks/load_maps.rake +48 -0
- data/lib/traject.rb +11 -0
- data/lib/traject/command_line.rb +301 -0
- data/lib/traject/csv_writer.rb +34 -0
- data/lib/traject/debug_writer.rb +47 -0
- data/lib/traject/delimited_writer.rb +110 -0
- data/lib/traject/indexer.rb +613 -0
- data/lib/traject/indexer/settings.rb +110 -0
- data/lib/traject/json_writer.rb +51 -0
- data/lib/traject/line_writer.rb +63 -0
- data/lib/traject/macros/basic.rb +9 -0
- data/lib/traject/macros/marc21.rb +223 -0
- data/lib/traject/macros/marc21_semantics.rb +584 -0
- data/lib/traject/macros/marc_format_classifier.rb +197 -0
- data/lib/traject/marc_extractor.rb +410 -0
- data/lib/traject/marc_reader.rb +89 -0
- data/lib/traject/mock_reader.rb +97 -0
- data/lib/traject/ndj_reader.rb +40 -0
- data/lib/traject/null_writer.rb +22 -0
- data/lib/traject/qualified_const_get.rb +40 -0
- data/lib/traject/solr_json_writer.rb +277 -0
- data/lib/traject/thread_pool.rb +161 -0
- data/lib/traject/translation_map.rb +267 -0
- data/lib/traject/util.rb +52 -0
- data/lib/traject/version.rb +3 -0
- data/lib/traject/yaml_writer.rb +9 -0
- data/lib/translation_maps/lcc_top_level.yaml +26 -0
- data/lib/translation_maps/marc_genre_007.yaml +9 -0
- data/lib/translation_maps/marc_genre_leader.yaml +22 -0
- data/lib/translation_maps/marc_geographic.yaml +589 -0
- data/lib/translation_maps/marc_instruments.yaml +102 -0
- data/lib/translation_maps/marc_languages.yaml +490 -0
- data/test/debug_writer_test.rb +38 -0
- data/test/delimited_writer_test.rb +104 -0
- data/test/indexer/each_record_test.rb +59 -0
- data/test/indexer/macros_marc21_semantics_test.rb +391 -0
- data/test/indexer/macros_marc21_test.rb +190 -0
- data/test/indexer/macros_test.rb +40 -0
- data/test/indexer/map_record_test.rb +209 -0
- data/test/indexer/read_write_test.rb +101 -0
- data/test/indexer/settings_test.rb +152 -0
- data/test/indexer/to_field_test.rb +77 -0
- data/test/marc_extractor_test.rb +412 -0
- data/test/marc_format_classifier_test.rb +98 -0
- data/test/marc_reader_test.rb +110 -0
- data/test/solr_json_writer_test.rb +248 -0
- data/test/test_helper.rb +90 -0
- data/test/test_support/245_no_ab.marc +1 -0
- data/test/test_support/880_with_no_6.utf8.marc +1 -0
- data/test/test_support/bad_subfield_code.marc +1 -0
- data/test/test_support/bad_utf_byte.utf8.marc +1 -0
- data/test/test_support/date_resort_to_260.marc +1 -0
- data/test/test_support/date_type_r_missing_date2.marc +1 -0
- data/test/test_support/date_with_u.marc +1 -0
- data/test/test_support/demo_config.rb +155 -0
- data/test/test_support/emptyish_record.marc +1 -0
- data/test/test_support/escaped_character_reference.marc8.marc +1 -0
- data/test/test_support/george_eliot.marc +1 -0
- data/test/test_support/hebrew880s.marc +1 -0
- data/test/test_support/louis_armstrong.marc +1 -0
- data/test/test_support/manufacturing_consent.marc +1 -0
- data/test/test_support/manuscript_online_thesis.marc +1 -0
- data/test/test_support/microform_online_conference.marc +1 -0
- data/test/test_support/multi_era.marc +1 -0
- data/test/test_support/multi_geo.marc +1 -0
- data/test/test_support/musical_cage.marc +1 -0
- data/test/test_support/nature.marc +1 -0
- data/test/test_support/one-marc8.mrc +1 -0
- data/test/test_support/online_only.marc +1 -0
- data/test/test_support/packed_041a_lang.marc +1 -0
- data/test/test_support/test_data.utf8.json +30 -0
- data/test/test_support/test_data.utf8.marc.xml +2609 -0
- data/test/test_support/test_data.utf8.mrc +1 -0
- data/test/test_support/test_data.utf8.mrc.gz +0 -0
- data/test/test_support/the_business_ren.marc +1 -0
- data/test/translation_map_test.rb +225 -0
- data/test/translation_maps/bad_ruby.rb +8 -0
- data/test/translation_maps/bad_yaml.yaml +1 -0
- data/test/translation_maps/both_map.rb +1 -0
- data/test/translation_maps/both_map.yaml +1 -0
- data/test/translation_maps/default_literal.rb +10 -0
- data/test/translation_maps/default_passthrough.rb +10 -0
- data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
- data/test/translation_maps/properties_map.properties +5 -0
- data/test/translation_maps/ruby_map.rb +10 -0
- data/test/translation_maps/translate_array_test.yaml +8 -0
- data/test/translation_maps/yaml_map.yaml +7 -0
- data/traject.gemspec +47 -0
- metadata +382 -0
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
module Traject
|
|
2
|
+
module Macros
|
|
3
|
+
# To use the marc_format macro, in your configuration file:
|
|
4
|
+
#
|
|
5
|
+
# require 'traject/macros/marc_format_classifier'
|
|
6
|
+
# extend Traject::Macros::MarcFormats
|
|
7
|
+
#
|
|
8
|
+
# to_field("format_s") marc_formats
|
|
9
|
+
#
|
|
10
|
+
# See also MarcFormatClassifier which can be used directly for a bit more
|
|
11
|
+
# control.
|
|
12
|
+
module MarcFormats
|
|
13
|
+
# very opinionated macro that just adds a grab bag of format/genre/types
|
|
14
|
+
# from our own custom vocabulary, all into one field.
|
|
15
|
+
# You may want to build your own from MarcFormatClassifier functions instead.
|
|
16
|
+
#
|
|
17
|
+
def marc_formats
  # Returns a two-argument traject step: it classifies the record and appends
  # every format label the classifier reports to the field's accumulator.
  ->(record, accumulator) do
    classifier = Traject::Macros::MarcFormatClassifier.new(record)
    accumulator.concat(classifier.formats)
  end
end
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# A tool for classifying MARC records according to format/form/genre/type,
|
|
26
|
+
# just using our own custom vocabulary for those things.
|
|
27
|
+
#
|
|
28
|
+
# used by the `marc_formats` macro, but you can also use it directly
|
|
29
|
+
# for a bit more control.
|
|
30
|
+
class MarcFormatClassifier
|
|
31
|
+
attr_reader :record
|
|
32
|
+
|
|
33
|
+
# Wraps a single MARC record for classification.
#
# marc_record - the record to classify. It is stored unchanged in @record,
#               which the classification methods read through #record.
def initialize(marc_record)
  @record = marc_record
end
|
|
36
|
+
|
|
37
|
+
# A very opinionated method that just kind of jams together
|
|
38
|
+
# all the possible format/genre/types into one array of 1 to N elements.
|
|
39
|
+
#
|
|
40
|
+
# If no other values are present, the default value "Other" will be used.
|
|
41
|
+
#
|
|
42
|
+
# See also individual methods which you can use to separate into
|
|
43
|
+
# different facets or do other custom things.
|
|
44
|
+
def formats(options = {})
  # Label used when no other classification applies; caller may override it.
  fallback = {:default => "Other"}.merge(options)[:default]

  labels = []
  labels.concat(genre)

  labels.push("Manuscript/Archive") if manuscript_archive?
  labels.push("Microform") if microform?
  labels.push("Online") if online?

  # An audio recording can look like print to the print? heuristic, but it
  # probably is not, so a recording genre suppresses the "Print" label.
  recordings = ["Non-musical Recording", "Musical Recording"]
  labels.push("Print") if print? && (labels & recordings).empty?

  # A dissertation is deliberately reported as a thesis and NOT as a book.
  if thesis?
    labels.delete("Book")
    labels.push("Dissertation/Thesis")
  end

  labels.push("Conference") if proceeding?

  labels.push(fallback) if labels.empty?
  labels
end
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
# Returns 1 or more values in an array from:
|
|
79
|
+
# Book; Journal/Newspaper; Musical Score; Map/Globe; Non-musical Recording; Musical Recording
|
|
80
|
+
# Image; Software/Data; Video/Film
|
|
81
|
+
#
|
|
82
|
+
# Uses leader byte 6, leader byte 7, and 007 byte 0.
|
|
83
|
+
#
|
|
84
|
+
# Gets actual labels from marc_genre_leader and marc_genre_007 translation maps,
|
|
85
|
+
# so you can customize labels if you want.
|
|
86
|
+
def genre
  marc_genre_leader = Traject::TranslationMap.new("marc_genre_leader")
  marc_genre_007    = Traject::TranslationMap.new("marc_genre_007")

  leader = record.leader

  # Most specific match wins: leader bytes 6-7 together, then byte 6 alone,
  # and only if neither is mapped, byte 0 of every 007 field.
  labels = marc_genre_leader[leader.slice(6, 2)] ||
           marc_genre_leader[leader.slice(6)] ||
           record.find_all { |f| f.tag == "007" }.collect { |f| marc_genre_007[f.value.slice(0)] }

  # A map entry may be a single label or a list, hence the flatten. An 007
  # whose first byte has no map entry looks up as nil; drop those so callers
  # only ever see real labels (a stray nil would also defeat the "Other"
  # default in #formats, which checks for an empty array).
  [labels].flatten.compact
end
|
|
96
|
+
|
|
97
|
+
# A record counts as a thesis exactly when it has a 502 field.
#
# The answer is memoized. `defined?` is used rather than `||=` so that a
# false answer is cached too; with `||=` every call on a non-thesis record
# would scan all fields again.
#
# Returns true or false.
def thesis?
  return @thesis_q if defined?(@thesis_q)

  @thesis_q = record.any? { |field| field.tag == "502" }
end
|
|
103
|
+
|
|
104
|
+
# A record counts as conference proceedings when any 6xx field has a $v
# subfield whose value is "Congresses" (either case of the initial letter,
# optional trailing period, surrounding whitespace ignored).
#
# The answer is memoized. `defined?` is used rather than `||=` so that a
# false answer is cached too instead of being recomputed on every call.
#
# Returns true or false.
def proceeding?
  return @proceeding_q if defined?(@proceeding_q)

  @proceeding_q = record.any? do |field|
    field.tag.slice(0) == '6' &&
      field.subfields.any? { |sf| sf.code == "v" && sf.value =~ /^\s*(C|c)ongresses\.?\s*$/ }
  end
end
|
|
112
|
+
|
|
113
|
+
# Algorithm with help from Chris Case.
|
|
114
|
+
# * If it has any RDA 338, then it's print if it has a value of
|
|
115
|
+
# volume, sheet, or card.
|
|
116
|
+
# * If it does not have an RDA 338, it's print if and only if it has
|
|
117
|
+
# no 245$h GMD.
|
|
118
|
+
#
|
|
119
|
+
# * Here at JH, for legacy reasons we also choose to not
|
|
120
|
+
# call it print if it's already been marked audio, but
|
|
121
|
+
# we do that in a different method.
|
|
122
|
+
#
|
|
123
|
+
# Note that any record that has neither a 245 nor a 338rda is going
|
|
124
|
+
# to be marked print
|
|
125
|
+
#
|
|
126
|
+
# This algorithm is definitely going to get some things wrong in
|
|
127
|
+
# both directions, with real world data. But seems to be good enough.
|
|
128
|
+
def print?
  carrier_fields = record.find_all do |field|
    field.tag == "338" && field['2'] == "rdacarrier"
  end

  # No RDA carrier field at all: print if and only if there is no 245$h GMD.
  return normalized_gmd.length == 0 if carrier_fields.length == 0

  # With RDA carrier fields present, the result is the first 338 that names a
  # print carrier, either by term in $a or by code in $b (nil when none does).
  print_terms = %w{volume card sheet}
  print_codes = %w{nc no nb}

  carrier_fields.find do |field|
    field.subfields.find do |sf|
      case sf.code
      when "a" then print_terms.include?(sf.value)
      when "b" then print_codes.include?(sf.value)
      end
    end
  end
end
|
|
146
|
+
|
|
147
|
+
# We use marc 007 to determine if this represents an online
|
|
148
|
+
# resource. But sometimes resort to 245$h GMD too.
|
|
149
|
+
def online?
  # An 007 with byte 0 'c' (electronic) and byte 1 'r' (remote) is conclusive.
  remote_electronic = record.fields('007').find do |field|
    code = field.value
    code.slice(0) == "c" && code.slice(1) == "r"
  end
  return true if remote_electronic

  # Otherwise only an "[electronic resource]" GMD can still make it online...
  return false unless normalized_gmd.start_with?("[electronic resource]")

  # ...and only when no 007 says "electronic": such an 007 exists but was not
  # marked remote (the check above would have caught it), so the resource is
  # electronic but not online.
  electronic_007 = record.find { |f| f.tag == '007' && f.value.slice(0) == "c" }
  !electronic_007
end
|
|
163
|
+
|
|
164
|
+
# Microform is detected from any one of three signals, checked in this order:
#
# * the 245$h GMD starts with "[microform]"
# * leader byte 6 is 'h' (an obsolete way of saying microform)
# * some 007 field has byte 0 == 'h' (many of our microform records lack it,
#   which is why the other two signals are consulted as well)
#
# Returns true for the first two signals, the matching 007 field for the
# third, and nil when nothing matches.
def microform?
  return true if normalized_gmd.start_with?("[microform]")
  return true if record.leader[6] == "h"

  record.find { |f| (f.tag == "007") && (f.value[0] == "h") }
end
|
|
172
|
+
|
|
173
|
+
# True when the leader marks the record as a manuscript OR as archival.
#
# * leader byte 6: t = manuscript language material, d = manuscript music,
#   f = manuscript cartographic; b is obsolete, but where it exists it means
#   archival control
# * leader byte 8: a = archival control
def manuscript_archive?
  leader = record.leader
  return true if %w{t d f b}.include?(leader.slice(6))

  leader.slice(8) == "a"
end
|
|
186
|
+
|
|
187
|
+
# The 245$h GMD, downcased; an empty string when the record has no 245 or
# the 245 has no $h. Memoized in @gmd.
def normalized_gmd
  @gmd ||= begin
    title_field = record['245']
    gmd = title_field && title_field['h']
    gmd ? gmd.downcase : ""
  end
end
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
end
|
|
196
|
+
end
|
|
197
|
+
end
|
|
@@ -0,0 +1,410 @@
|
|
|
1
|
+
module Traject
|
|
2
|
+
# MarcExtractor is a class for extracting lists of strings from a MARC::Record,
|
|
3
|
+
# according to specifications. See #parse_string_spec for description of string
|
|
4
|
+
# string arguments used to specify extraction. See #initialize for options
|
|
5
|
+
# that can be set controlling extraction.
|
|
6
|
+
#
|
|
7
|
+
# Examples:
|
|
8
|
+
#
|
|
9
|
+
# array_of_stuff = MarcExtractor.new("001:245abc:700a").extract(marc_record)
|
|
10
|
+
# values = MarcExtractor.new("245a:245abc").extract(marc_record)
|
|
11
|
+
# seperated_values = MarcExtractor.new("020a:020z").extract(marc_record)
|
|
12
|
+
# bytes = MarcExtractor.new("008[35-37]")
|
|
13
|
+
#
|
|
14
|
+
# ## String extraction specifications
|
|
15
|
+
#
|
|
16
|
+
# Extraction directions are supplied in strings, usually as the first
|
|
17
|
+
# parameter to MarcExtractor.new or MarcExtractor.cached. These specifications
|
|
18
|
+
# are also the first parameter to the #marc_extract macro.
|
|
19
|
+
#
|
|
20
|
+
# A String specification is a string (or array of strings) which consists
|
|
21
|
+
# of one or more Data and Control Field Specifications separated by colons.
|
|
22
|
+
#
|
|
23
|
+
# A Data Field Specification is of the form:
|
|
24
|
+
#
|
|
25
|
+
# * `{tag}{|indicators|}{subfields}`
|
|
26
|
+
# * {tag} is three chars (usually but not necessarily numeric)
|
|
27
|
+
# * {indicators} are optional two chars enclosed in pipe ('|') characters,
|
|
28
|
+
# * {subfields} are optional list of chars (alphanumeric)
|
|
29
|
+
#
|
|
30
|
+
# indicator spec must be two chars, but one can be * meaning "don't care".
|
|
31
|
+
# space to mean 'blank'
|
|
32
|
+
#
|
|
33
|
+
# "245|01|abc65:345abc:700|*5|:800"
|
|
34
|
+
#
|
|
35
|
+
# A Control Field Specification is used with tags for control (fixed) fields (ordinarily fields 001-010)
|
|
36
|
+
# and includes a tag and a byte slice specification.
|
|
37
|
+
#
|
|
38
|
+
# "008[35-37]:007[5]"
|
|
39
|
+
# => bytes 35-37 inclusive of any field 008, and byte 5 of any field 007
|
|
40
|
+
#
|
|
41
|
+
# * subfields and indicators can only be provided for marc data/variable fields
|
|
42
|
+
# * byte slice can only be provided for marc control fields (generally tags less than 010)
|
|
43
|
+
#
|
|
44
|
+
# ## Subfield concatenation
|
|
45
|
+
#
|
|
46
|
+
# Normally, for a spec including multiple subfield codes, multiple subfields
|
|
47
|
+
# from the same MARC field will be concatenated into one string separated by spaces:
|
|
48
|
+
#
|
|
49
|
+
# 600 a| Chomsky, Noam x| Philosophy.
|
|
50
|
+
# 600 a| Chomsky, Noam x| Political and social views.
|
|
51
|
+
# MarcExtractor.new("600ax").extract(record)
|
|
52
|
+
# # results in two values sent to Solr:
|
|
53
|
+
# "Chomsky, Noam Philosophy."
|
|
54
|
+
# "Chomsky, Noam Political and social views."
|
|
55
|
+
#
|
|
56
|
+
# You can turn off this concatenation and leave individual subfields in seperate
|
|
57
|
+
# strings by setting the `separator` option to nil:
|
|
58
|
+
#
|
|
59
|
+
# MarcExtractor.new("600ax", :separator => nil).extract(record)
|
|
60
|
+
# # Results in four values being sent to Solr (or 3 if you de-dup):
|
|
61
|
+
# "Chomsky, Noam"
|
|
62
|
+
# "Philosophy."
|
|
63
|
+
# "Chomsky, Noam"
|
|
64
|
+
# "Political and social views."
|
|
65
|
+
#
|
|
66
|
+
# However, **the default is different for specifications with only a single
|
|
67
|
+
# subfield**, these are by default kept seperated:
|
|
68
|
+
#
|
|
69
|
+
# 020 a| 285197145X a| 9782851971456
|
|
70
|
+
# MarcExtractor.new("020a:020z").extract(record)
|
|
71
|
+
# # two seperate strings sent to Solr:
|
|
72
|
+
# "285197145X"
|
|
73
|
+
# "9782851971456"
|
|
74
|
+
#
|
|
75
|
+
# For single subfield specifications, you force concatenation by
|
|
76
|
+
# repeating the subfield specification:
|
|
77
|
+
#
|
|
78
|
+
# MarcExtractor.new("020aa:020zz").extract(record)
|
|
79
|
+
# # would result in a single string sent to solr for
|
|
80
|
+
# # the single field, by default space-separated:
|
|
81
|
+
# "285197145X 9782851971456"
|
|
82
|
+
#
|
|
83
|
+
# ## Note on Performance and MarcExtractor creation and reuse
|
|
84
|
+
#
|
|
85
|
+
# A MarcExtractor is somewhat expensive to create, and has been shown in profiling/
|
|
86
|
+
# benchmarking to be a bottleneck if you end up creating one for each marc record
|
|
87
|
+
# processed. Instead, a single MarcExtractor should be created, and re-used
|
|
88
|
+
# per MARC record.
|
|
89
|
+
#
|
|
90
|
+
# If you are creating a traject 'macro' method, here's one way to do that,
|
|
91
|
+
# capturing the MarcExtractor under closure:
|
|
92
|
+
#
|
|
93
|
+
# def some_macro(spec, other_args, whatever)
|
|
94
|
+
# extractor = MarcExtractor.new( spec )
|
|
95
|
+
# # ...
|
|
96
|
+
# return lambda do |record, accumulator, context|
|
|
97
|
+
# #...
|
|
98
|
+
# accumulator.concat extractor.extract(record)
|
|
99
|
+
# #...
|
|
100
|
+
# end
|
|
101
|
+
# end
|
|
102
|
+
#
|
|
103
|
+
# In other cases, you may find it convenient to improve performance by
|
|
104
|
+
# using the MarcExtractor#cached method, instead of MarcExtractor#new, to
|
|
105
|
+
# lazily create and then re-use a MarcExtractor object with
|
|
106
|
+
# particular initialization arguments.
|
|
107
|
+
class MarcExtractor
|
|
108
|
+
attr_accessor :options, :spec_hash
|
|
109
|
+
|
|
110
|
+
# First arg is a specification for extraction of data from a MARC record.
|
|
111
|
+
# Specification can be given in two forms:
|
|
112
|
+
#
|
|
113
|
+
# * a string specification like "008[35]:020a:245abc", see top of class
|
|
114
|
+
# for examples. A string specification is most typical argument.
|
|
115
|
+
# * The output of a previous call to MarcExtractor.parse_string_spec(string_spec),
|
|
116
|
+
# a 'pre-parsed' specification.
|
|
117
|
+
#
|
|
118
|
+
# Second arg is options:
|
|
119
|
+
#
|
|
120
|
+
# [:separator] default ' ' (space), what to use to separate
|
|
121
|
+
# subfield values when joining strings
|
|
122
|
+
#
|
|
123
|
+
# [:alternate_script] default :include, include linked 880s for tags
|
|
124
|
+
# that match spec. Also:
|
|
125
|
+
# * false => do not include.
|
|
126
|
+
# * :only => only include linked 880s, not original
|
|
127
|
+
def initialize(spec, options = {})
  defaults = {:separator => ' ', :alternate_script => :include}
  self.options = defaults.merge(options)

  # A Hash is taken to be an already-parsed specification; anything else is
  # handed to parse_string_spec.
  self.spec_hash = spec.kind_of?(Hash) ? spec : self.class.parse_string_spec(spec)

  # Tags are "interesting" if some spec might cover them. The hash is used
  # as a set: only its keys matter.
  script_mode = self.options[:alternate_script]
  @interesting_tags_hash = {}

  # The tags named in the spec count, unless only alternate scripts are wanted.
  if script_mode != :only
    spec_hash.each_key { |tag| @interesting_tags_hash[tag] = true }
  end

  # Linked alternate-script data lives in 880 fields, so 880 is interesting
  # unless alternate scripts were switched off with false.
  @interesting_tags_hash['880'] = true if script_mode != false

  freeze
end
|
|
152
|
+
|
|
153
|
+
# Takes the same arguments as MarcExtractor.new, but will re-use an existing
|
|
154
|
+
# cached MarcExtractor already created with given initialization arguments,
|
|
155
|
+
# if available.
|
|
156
|
+
#
|
|
157
|
+
# This can be used to increase performance of indexing routines, as
|
|
158
|
+
# MarcExtractor creation has been shown via profiling/benchmarking
|
|
159
|
+
# to be expensive.
|
|
160
|
+
#
|
|
161
|
+
# Cache is thread-local, so should be thread-safe.
|
|
162
|
+
#
|
|
163
|
+
# You should _not_ modify the state of any MarcExtractor retrieved
|
|
164
|
+
# via cached, as the MarcExtractor will be re-used and shared (possibly
|
|
165
|
+
# between threads even!). We try to use ruby #freeze to keep you from doing so,
|
|
166
|
+
# although if you try hard enough you can surely find a way to do something
|
|
167
|
+
# you shouldn't.
|
|
168
|
+
#
|
|
169
|
+
# extractor = MarcExtractor.cached("245abc:700a", :separator => nil)
|
|
170
|
+
def self.cached(*args)
  # One cache per thread, keyed by the exact argument list given.
  store = (Thread.current[:marc_extractor_cached] ||= {})

  # Build, freeze and remember the extractor the first time these arguments
  # are seen; hand back the remembered one afterwards.
  store.fetch(args) { store[args] = Traject::MarcExtractor.new(*args).freeze }
end
|
|
174
|
+
|
|
175
|
+
# Is +tag+ one this extractor might extract from? That is the case when the
# tag was registered as interesting at construction time, based on the spec
# and the :alternate_script option.
def interesting_tag?(tag)
  @interesting_tags_hash.key?(tag)
end
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
# Converts from a string marc spec like "008[35]:245abc:700a" to a hash used internally
|
|
183
|
+
# to represent the specification. See comments at head of class for
|
|
184
|
+
# documentation of string specification format.
|
|
185
|
+
#
|
|
186
|
+
#
|
|
187
|
+
# ## Return value
|
|
188
|
+
#
|
|
189
|
+
# The hash returned is keyed by tag, and has as values an array of 0 or
|
|
190
|
+
# more MarcExtractor::Spec objects representing the specified extraction
|
|
191
|
+
# operations for that tag.
|
|
192
|
+
#
|
|
193
|
+
# It's an array of possibly more than one, because you can specify
|
|
194
|
+
# multiple extractions on the same tag: for instance "245a:245abc"
|
|
195
|
+
#
|
|
196
|
+
# See tests for more examples.
|
|
197
|
+
def self.parse_string_spec(spec_string)
  # tag => [Spec, ...]; the array for a tag is created on first use.
  hash = {}

  # Accept one colon-delimited string or an array of them; whitespace around
  # a colon is ignored. Both branches must split on /\s*:\s*/: without the
  # backslash the pattern consumes literal 's' characters before a colon, so
  # "245abs:700a" would silently lose subfield code 's'.
  spec_strings = if spec_string.is_a?(Array)
    spec_string.map { |s| s.split(/\s*:\s*/) }.flatten
  else
    spec_string.split(/\s*:\s*/)
  end

  spec_strings.each do |part|
    if part =~ /\A([a-zA-Z0-9]{3})(\|([a-z0-9\ \*]{2})\|)?([a-z0-9]*)?\Z/
      # Data (variable) field: tag, optional |indicators|, optional subfields.
      tag, indicators, subfields = $1, $3, $4

      spec = Spec.new(:tag => tag)
      spec.subfields = subfields.split('') if subfields && !subfields.empty?

      if indicators
        # '*' means "don't care", which is represented by leaving it nil.
        spec.indicator1 = indicators[0] if indicators[0] != "*"
        spec.indicator2 = indicators[1] if indicators[1] != "*"
      end
    elsif part =~ /\A([a-zA-Z0-9]{3})(\[(\d+)(-(\d+))?\])\Z/
      # Control field with a byte slice, e.g. "005[4-5]" or "007[5]".
      tag, byte1, byte2 = $1, $3, $5

      spec = Spec.new(:tag => tag)
      spec.bytes = byte2 ? ((byte1.to_i)..(byte2.to_i)) : byte1.to_i
    else
      raise ArgumentError.new("Unrecognized marc extract specification: #{part}")
    end

    (hash[spec.tag] ||= []) << spec
  end

  hash
end
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
# Extracts every value the specification selects from +marc_record+.
#
# For a matching control field the value is the whole field value, or just
# the byte slice the spec asks for. For a matching data field the values
# come from collect_subfields.
#
# Returns an array of strings, possibly empty.
def extract(marc_record)
  values = []

  each_matching_line(marc_record) do |field, spec|
    if control_field?(field)
      slice = spec.bytes
      values << (slice ? field.value.byteslice(slice) : field.value)
    else
      values.concat(collect_subfields(field, spec))
    end
  end

  values
end
|
|
259
|
+
|
|
260
|
+
# Yields a block for every line in source record that matches
|
|
261
|
+
# spec. First arg to block is MARC::DataField or ControlField, second
|
|
262
|
+
# is the MarcExtractor::Spec that it matched on. May take account
|
|
263
|
+
# of options such as :alternate_script
|
|
264
|
+
#
|
|
265
|
+
# Third (optional) arg to block is self, the MarcExtractor object, useful for custom
|
|
266
|
+
# implementations.
|
|
267
|
+
def each_matching_line(marc_record)
  # Only fetch fields whose tag some spec could cover.
  candidate_tags = @interesting_tags_hash.keys

  marc_record.fields(candidate_tags).each do |field|
    specs_covering_field(field).each do |spec|
      # specs_covering_field goes by tag alone; indicators are checked here.
      next unless spec.matches_indicators?(field)

      yield(field, spec, self)
    end
  end
end
|
|
280
|
+
|
|
281
|
+
# Like each_matching_line, takes a block to process each matching line, but
# gathers the block's return values into one array. A block result that is
# itself an array (however deeply nested) is flattened into the output.
#
# Useful for re-use of this class for custom processing.
#
# Yields the MARC field, the MarcExtractor::Spec object and the MarcExtractor
# object. Returns the array of collected values.
def collect_matching_lines(marc_record)
  collected = []

  each_matching_line(marc_record) do |field, spec, extractor|
    produced = yield(field, spec, extractor)
    collected.concat([produced].flatten)
  end

  collected
end
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
# Pass in a marc data field and a Spec object with extraction
|
|
297
|
+
# instructions, returns an ARRAY of one or more strings, subfields extracted
|
|
298
|
+
# and processed per spec. Takes account of options such
|
|
299
|
+
# as :separator
|
|
300
|
+
#
|
|
301
|
+
# Always returns array, sometimes empty array.
|
|
302
|
+
def collect_subfields(field, spec)
  # Values of the subfields the spec asks for, in field order, nils dropped.
  wanted = field.subfields.select { |sf| spec.includes_subfield_code?(sf.code) }
  values = wanted.map { |sf| sf.value }.compact

  # Nothing matched: return the empty array as it is.
  return values if values.empty?

  # Join into one string only when a separator is configured AND the spec is
  # of a joinable form; otherwise each subfield stays a separate value.
  separator = options[:separator]
  return values unless separator && spec.joinable?

  [values.join(separator)]
end
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
# Find Spec objects, if any, covering extraction from this field.
|
|
319
|
+
# Returns an array of 0 or more MarcExtractor::Spec objects
|
|
320
|
+
#
|
|
321
|
+
# When given an 880, will return the spec (if any) for the linked tag iff
|
|
322
|
+
# we have a $6 and we want the alternate script.
|
|
323
|
+
#
|
|
324
|
+
# Returns an empty array in case of no matching extraction specs.
|
|
325
|
+
def specs_covering_field(field)
  tag = field.tag

  # Cheap exit for tags no spec could possibly cover.
  return [] unless interesting_tag?(tag)

  # An 880 with a $6 is looked up under the tag it links to, which is the
  # first three bytes of the $6 value.
  linkage = (tag == "880") ? field['6'] : nil
  if linkage
    # The re-encode to its own encoding works around a jruby bug,
    # https://github.com/jruby/jruby/issues/886 ; do not simplify it away.
    tag = linkage.encode(linkage.encoding).byteslice(0, 3)
  end

  # Specs registered for the resulting tag, or an empty array if none.
  spec_hash[tag] || []
end
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
# Is +field+ a MARC control field (as opposed to a data field)?
#
# Decided purely by class. The MARC gem might offer a cheaper check if both
# ControlField and DataField defined a #control_field? method.
def control_field?(field)
  field.is_a?(MARC::ControlField)
end
|
|
348
|
+
|
|
349
|
+
# Freezes the extractor along with the two hashes it exposes, so that
# neither the options nor the parsed spec can be reassigned or given new
# entries afterwards.
def freeze
  options.freeze
  spec_hash.freeze
  super
end
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
# Represents a single specification for extracting data
# from a marc field, like "600abc" or "600|1*|x".
#
# Includes the tag for reference, although this is redundant and not actually
# used in extraction logic, since the tag is also implicit in the overall
# spec_hash with tag => [spec1, spec2]
class Spec
  # tag        - three-character tag string
  # subfields  - array of one-character subfield codes, or nil for "all"
  # indicator1 - required first indicator, or nil for "don't care"
  # indicator2 - required second indicator, or nil for "don't care"
  # bytes      - Integer or Range byte slice for control fields, or nil
  attr_accessor :tag, :subfields, :indicator1, :indicator2, :bytes

  # hash - attribute name => value; each pair is assigned through the
  #        matching writer, so an unknown key raises NoMethodError.
  def initialize(hash = {})
    hash.each_pair do |key, value|
      self.send("#{key}=", value)
    end
  end

  # Should the extracted subfields be joined, if we have a separator?
  #
  # * '630'    no subfields specified => join all subfields
  # * '630abc' multiple subfields specified => join all subfields
  # * '633a'   one subfield => do not join, return one value for each $a
  # * '633aa'  one subfield, doubled => do join after all, giving a single
  #            string joining the values of all the $a's
  #
  # The last case needs no special handling: subfields is then ['a', 'a'],
  # whose size is not 1.
  def joinable?
    subfields.nil? || subfields.size != 1
  end

  # Pass in a MARC field: do its indicators match the indicators in this
  # spec? A nil indicator in the spec means "don't care", so it matches
  # everything.
  def matches_indicators?(field)
    (indicator1.nil? || indicator1 == field.indicator1) &&
      (indicator2.nil? || indicator2 == field.indicator2)
  end

  # Pass in a string subfield code like 'a'; does this spec include it?
  # nil subfields means every code is included.
  def includes_subfield_code?(code)
    subfields.nil? || subfields.include?(code)
  end

  # Two specs are equal when every attribute is equal. Each attribute is
  # compared with its own counterpart: indicator2 against indicator2, not
  # indicator1 against the other spec's indicator2.
  def ==(spec)
    return false unless spec.kind_of?(Spec)

    tag == spec.tag &&
      subfields == spec.subfields &&
      indicator1 == spec.indicator1 &&
      indicator2 == spec.indicator2 &&
      bytes == spec.bytes
  end
end
|
|
408
|
+
|
|
409
|
+
end
|
|
410
|
+
end
|