traject 2.0.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.travis.yml +27 -0
- data/.yardopts +3 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +20 -0
- data/README.md +461 -0
- data/Rakefile +21 -0
- data/bench/bench.rb +30 -0
- data/bin/traject +16 -0
- data/doc/batch_execution.md +243 -0
- data/doc/extending.md +190 -0
- data/doc/indexing_rules.md +265 -0
- data/doc/other_commands.md +47 -0
- data/doc/settings.md +101 -0
- data/lib/tasks/load_maps.rake +48 -0
- data/lib/traject.rb +11 -0
- data/lib/traject/command_line.rb +301 -0
- data/lib/traject/csv_writer.rb +34 -0
- data/lib/traject/debug_writer.rb +47 -0
- data/lib/traject/delimited_writer.rb +110 -0
- data/lib/traject/indexer.rb +613 -0
- data/lib/traject/indexer/settings.rb +110 -0
- data/lib/traject/json_writer.rb +51 -0
- data/lib/traject/line_writer.rb +63 -0
- data/lib/traject/macros/basic.rb +9 -0
- data/lib/traject/macros/marc21.rb +223 -0
- data/lib/traject/macros/marc21_semantics.rb +584 -0
- data/lib/traject/macros/marc_format_classifier.rb +197 -0
- data/lib/traject/marc_extractor.rb +410 -0
- data/lib/traject/marc_reader.rb +89 -0
- data/lib/traject/mock_reader.rb +97 -0
- data/lib/traject/ndj_reader.rb +40 -0
- data/lib/traject/null_writer.rb +22 -0
- data/lib/traject/qualified_const_get.rb +40 -0
- data/lib/traject/solr_json_writer.rb +277 -0
- data/lib/traject/thread_pool.rb +161 -0
- data/lib/traject/translation_map.rb +267 -0
- data/lib/traject/util.rb +52 -0
- data/lib/traject/version.rb +3 -0
- data/lib/traject/yaml_writer.rb +9 -0
- data/lib/translation_maps/lcc_top_level.yaml +26 -0
- data/lib/translation_maps/marc_genre_007.yaml +9 -0
- data/lib/translation_maps/marc_genre_leader.yaml +22 -0
- data/lib/translation_maps/marc_geographic.yaml +589 -0
- data/lib/translation_maps/marc_instruments.yaml +102 -0
- data/lib/translation_maps/marc_languages.yaml +490 -0
- data/test/debug_writer_test.rb +38 -0
- data/test/delimited_writer_test.rb +104 -0
- data/test/indexer/each_record_test.rb +59 -0
- data/test/indexer/macros_marc21_semantics_test.rb +391 -0
- data/test/indexer/macros_marc21_test.rb +190 -0
- data/test/indexer/macros_test.rb +40 -0
- data/test/indexer/map_record_test.rb +209 -0
- data/test/indexer/read_write_test.rb +101 -0
- data/test/indexer/settings_test.rb +152 -0
- data/test/indexer/to_field_test.rb +77 -0
- data/test/marc_extractor_test.rb +412 -0
- data/test/marc_format_classifier_test.rb +98 -0
- data/test/marc_reader_test.rb +110 -0
- data/test/solr_json_writer_test.rb +248 -0
- data/test/test_helper.rb +90 -0
- data/test/test_support/245_no_ab.marc +1 -0
- data/test/test_support/880_with_no_6.utf8.marc +1 -0
- data/test/test_support/bad_subfield_code.marc +1 -0
- data/test/test_support/bad_utf_byte.utf8.marc +1 -0
- data/test/test_support/date_resort_to_260.marc +1 -0
- data/test/test_support/date_type_r_missing_date2.marc +1 -0
- data/test/test_support/date_with_u.marc +1 -0
- data/test/test_support/demo_config.rb +155 -0
- data/test/test_support/emptyish_record.marc +1 -0
- data/test/test_support/escaped_character_reference.marc8.marc +1 -0
- data/test/test_support/george_eliot.marc +1 -0
- data/test/test_support/hebrew880s.marc +1 -0
- data/test/test_support/louis_armstrong.marc +1 -0
- data/test/test_support/manufacturing_consent.marc +1 -0
- data/test/test_support/manuscript_online_thesis.marc +1 -0
- data/test/test_support/microform_online_conference.marc +1 -0
- data/test/test_support/multi_era.marc +1 -0
- data/test/test_support/multi_geo.marc +1 -0
- data/test/test_support/musical_cage.marc +1 -0
- data/test/test_support/nature.marc +1 -0
- data/test/test_support/one-marc8.mrc +1 -0
- data/test/test_support/online_only.marc +1 -0
- data/test/test_support/packed_041a_lang.marc +1 -0
- data/test/test_support/test_data.utf8.json +30 -0
- data/test/test_support/test_data.utf8.marc.xml +2609 -0
- data/test/test_support/test_data.utf8.mrc +1 -0
- data/test/test_support/test_data.utf8.mrc.gz +0 -0
- data/test/test_support/the_business_ren.marc +1 -0
- data/test/translation_map_test.rb +225 -0
- data/test/translation_maps/bad_ruby.rb +8 -0
- data/test/translation_maps/bad_yaml.yaml +1 -0
- data/test/translation_maps/both_map.rb +1 -0
- data/test/translation_maps/both_map.yaml +1 -0
- data/test/translation_maps/default_literal.rb +10 -0
- data/test/translation_maps/default_passthrough.rb +10 -0
- data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
- data/test/translation_maps/properties_map.properties +5 -0
- data/test/translation_maps/ruby_map.rb +10 -0
- data/test/translation_maps/translate_array_test.yaml +8 -0
- data/test/translation_maps/yaml_map.yaml +7 -0
- data/traject.gemspec +47 -0
- metadata +382 -0
@@ -0,0 +1,197 @@
|
|
1
|
+
module Traject
|
2
|
+
module Macros
|
3
|
+
# To use the marc_format macro, in your configuration file:
|
4
|
+
#
|
5
|
+
# require 'traject/macros/marc_format_classifier'
|
6
|
+
# extend Traject::Macros::MarcFormats
|
7
|
+
#
|
8
|
+
# to_field("format_s") marc_formats
|
9
|
+
#
|
10
|
+
# See also MarcFormatClassifier which can be used directly for a bit more
|
11
|
+
# control.
|
12
|
+
module MarcFormats
  # A very opinionated macro that just adds a grab bag of format/genre/types
  # from our own custom vocabulary, all into one field.
  # You may want to build your own from MarcFormatClassifier functions instead.
  #
  # Returns a lambda taking (record, accumulator). When called, it appends
  # every value of MarcFormatClassifier#formats for that record to the
  # accumulator.
  def marc_formats
    lambda do |record, accumulator|
      accumulator.concat(Traject::Macros::MarcFormatClassifier.new(record).formats)
    end
  end
end
|
23
|
+
|
24
|
+
|
25
|
+
# A tool for classifying MARC records according to format/form/genre/type,
|
26
|
+
# just using our own custom vocabulary for those things.
|
27
|
+
#
|
28
|
+
# used by the `marc_formats` macro, but you can also use it directly
|
29
|
+
# for a bit more control.
|
30
|
+
class MarcFormatClassifier
  # The MARC record being classified, exactly as passed to the constructor.
  attr_reader :record

  # marc_record is the record to classify. The methods below call #leader,
  # #find, #find_all, #fields and #[] on it, and #tag, #value, #subfields
  # and #[] on its fields.
  def initialize(marc_record)
    @record = marc_record
  end

  # A very opinionated method that just kind of jams together
  # all the possible format/genre/types into one array of 1 to N elements.
  #
  # If no other values are present, the default value "Other" will be used;
  # pass :default => "something" in options to use a different one.
  #
  # See also individual methods which you can use to separate into
  # different facets or do other custom things.
  def formats(options = {})
    options = {:default => "Other"}.merge(options)

    found = []
    found.concat genre

    found << "Manuscript/Archive" if manuscript_archive?
    found << "Microform" if microform?
    found << "Online" if online?

    # In our own data, if it's an audio recording, it might show up
    # as print, but it's probably not.
    already_audio = found.include?("Non-musical Recording") || found.include?("Musical Recording")
    found << "Print" if print? && !already_audio

    # If it's a Dissertation, we decide it's NOT a book
    if thesis?
      found.delete("Book")
      found << "Dissertation/Thesis"
    end

    found << "Conference" if proceeding?

    found << options[:default] if found.empty?

    found
  end

  # Returns 0 or more values in an array from:
  # Book; Journal/Newspaper; Musical Score; Map/Globe; Non-musical Recording; Musical Recording
  # Image; Software/Data; Video/Film
  #
  # Uses leader byte 6, leader byte 7, and 007 byte 0.
  #
  # Gets actual labels from marc_genre_leader and marc_genre_007 translation maps,
  # so you can customize labels if you want.
  def genre
    marc_genre_leader = Traject::TranslationMap.new("marc_genre_leader")
    marc_genre_007    = Traject::TranslationMap.new("marc_genre_007")

    # Try leader bytes 6-7 together, then byte 6 alone, and only then fall
    # back to byte 0 of every 007 field.
    results = marc_genre_leader[record.leader.slice(6, 2)] ||
              marc_genre_leader[record.leader.slice(6)] ||
              record.find_all { |f| f.tag == "007" }.collect { |f| marc_genre_007[f.value.slice(0)] }

    # A lookup that finds no label yields nil; drop those so callers never
    # see nil entries and #formats can still fall back to its default.
    [results].flatten.compact
  end

  # Just checks if it has a 502, if it does it's considered a thesis
  def thesis?
    # defined? (rather than ||=) so a false answer is memoized too.
    return @thesis_q if defined?(@thesis_q)

    @thesis_q = !record.find { |a| a.tag == "502" }.nil?
  end

  # Just checks all 6xx fields for a $v "Congresses"
  def proceeding?
    # defined? (rather than ||=) so a false answer is memoized too.
    return @proceeding_q if defined?(@proceeding_q)

    congress_field = record.find do |field|
      field.tag.slice(0) == '6' &&
        field.subfields.find { |sf| sf.code == "v" && sf.value =~ /^\s*(C|c)ongresses\.?\s*$/ }
    end

    @proceeding_q = !congress_field.nil?
  end

  # Algorithm with help from Chris Case.
  # * If it has any RDA 338, then it's print if it has a value of
  #   volume, sheet, or card.
  # * If it does not have an RDA 338, it's print if and only if it has
  #   no 245$h GMD.
  #
  # * Here at JH, for legacy reasons we also choose to not
  #   call it print if it's already been marked audio, but
  #   we do that in a different method.
  #
  # Note that any record that has neither a 245 nor a 338rda is going
  # to be marked print
  #
  # This algorithm is definitely going to get some things wrong in
  # both directions, with real world data. But seems to be good enough.
  #
  # Returns a truthy value (the matching 338 field, or true) when the record
  # is considered print, and nil/false otherwise.
  def print?
    rda338 = record.find_all do |field|
      field.tag == "338" && field['2'] == "rdacarrier"
    end

    return normalized_gmd.length == 0 if rda338.empty?

    rda338.find do |field|
      field.subfields.find do |sf|
        (sf.code == "a" && %w{volume card sheet}.include?(sf.value)) ||
          (sf.code == "b" && %w{nc no nb}.include?(sf.value))
      end
    end
  end

  # We use marc 007 to determine if this represents an online
  # resource. But sometimes resort to 245$h GMD too.
  def online?
    # field 007, byte 0 c="electronic" byte 1 r="remote" ==> sure Online
    found_007 = record.fields('007').find do |field|
      field.value.slice(0) == "c" && field.value.slice(1) == "r"
    end

    return true if found_007

    # Otherwise, if it has a GMD ["electronic resource"], we count it
    # as online only if NO 007[0] == 'c' exists, cause if it does we already
    # know it's electronic but not remote, otherwise first try would
    # have found it.
    normalized_gmd.start_with?("[electronic resource]") &&
      !record.find { |f| f.tag == '007' && f.value.slice(0) == "c" }
  end

  # if field 007 byte 0 is 'h', that's microform. But many of our microform
  # don't have that. If leader byte 6 is 'h', that's an obsolete way of saying
  # microform. And finally, if the 245$h GMD starts with "[microform]", we
  # count it as microform too.
  #
  # Returns a truthy value (true or the matching 007 field) or nil/false.
  def microform?
    normalized_gmd.start_with?("[microform]") ||
      record.leader[6] == "h" ||
      record.find { |f| (f.tag == "007") && (f.value[0] == "h") }
  end

  # Marked as manuscript OR archive.
  def manuscript_archive?
    leader06 = record.leader.slice(6)
    leader08 = record.leader.slice(8)

    # leader 6 t=Manuscript Language Material, d=Manuscript Music,
    # f=Manuscript Cartographic
    #
    # leader 06 = 'b' is obsolete, but if it exists it means archival control
    #
    # leader 08 'a'='archival control'
    %w{t d f b}.include?(leader06) || leader08 == "a"
  end

  # downcased version of the 245$h gmd, or else empty string
  def normalized_gmd
    @gmd ||= begin
      ((a245 = record['245']) && a245['h'] && a245['h'].downcase) || ""
    end
  end
end
|
196
|
+
end
|
197
|
+
end
|
@@ -0,0 +1,410 @@
|
|
1
|
+
module Traject
|
2
|
+
# MarcExtractor is a class for extracting lists of strings from a MARC::Record,
|
3
|
+
# according to specifications. See #parse_string_spec for description of string
|
4
|
+
# arguments used to specify extraction. See #initialize for options
|
5
|
+
# that can be set controlling extraction.
|
6
|
+
#
|
7
|
+
# Examples:
|
8
|
+
#
|
9
|
+
# array_of_stuff = MarcExtractor.new("001:245abc:700a").extract(marc_record)
|
10
|
+
# values = MarcExtractor.new("245a:245abc").extract(marc_record)
|
11
|
+
# seperated_values = MarcExtractor.new("020a:020z").extract(marc_record)
|
12
|
+
# bytes = MarcExtractor.new("008[35-37]")
|
13
|
+
#
|
14
|
+
# ## String extraction specifications
|
15
|
+
#
|
16
|
+
# Extraction directions are supplied in strings, usually as the first
|
17
|
+
# parameter to MarcExtractor.new or MarcExtractor.cached. These specifications
|
18
|
+
# are also the first parameter to the #marc_extract macro.
|
19
|
+
#
|
20
|
+
# A String specification is a string (or array of strings) which consists
|
21
|
+
# of one or more Data and Control Field Specifications separated by colons.
|
22
|
+
#
|
23
|
+
# A Data Field Specification is of the form:
|
24
|
+
#
|
25
|
+
# * `{tag}{|indicators|}{subfields}`
|
26
|
+
# * {tag} is three chars (usually but not necessarily numeric)
|
27
|
+
# * {indicators} are optional two chars enclosed in pipe ('|') characters,
|
28
|
+
# * {subfields} are optional list of chars (alphanumeric)
|
29
|
+
#
|
30
|
+
# indicator spec must be two chars, but one can be * meaning "don't care".
|
31
|
+
# space to mean 'blank'
|
32
|
+
#
|
33
|
+
# "245|01|abc65:345abc:700|*5|:800"
|
34
|
+
#
|
35
|
+
# A Control Field Specification is used with tags for control (fixed) fields (ordinarily fields 001-010)
|
36
|
+
# and includes a tag and a byte slice specification.
|
37
|
+
#
|
38
|
+
# "008[35-37]:007[5]"
|
39
|
+
# => bytes 35-37 inclusive of any field 008, and byte 5 of any field 007
|
40
|
+
#
|
41
|
+
# * subfields and indicators can only be provided for marc data/variable fields
|
42
|
+
# * byte slice can only be provided for marc control fields (generally tags less than 010)
|
43
|
+
#
|
44
|
+
# ## Subfield concatenation
|
45
|
+
#
|
46
|
+
# Normally, for a spec including multiple subfield codes, multiple subfields
|
47
|
+
# from the same MARC field will be concatenated into one string separated by spaces:
|
48
|
+
#
|
49
|
+
# 600 a| Chomsky, Noam x| Philosophy.
|
50
|
+
# 600 a| Chomsky, Noam x| Political and social views.
|
51
|
+
# MarcExtractor.new("600ax").extract(record)
|
52
|
+
# # results in two values sent to Solr:
|
53
|
+
# "Chomsky, Noam Philosophy."
|
54
|
+
# "Chomsky, Noam Political and social views."
|
55
|
+
#
|
56
|
+
# You can turn off this concatenation and leave individual subfields in separate
|
57
|
+
# strings by setting the `separator` option to nil:
|
58
|
+
#
|
59
|
+
# MarcExtractor.new("600ax", :separator => nil).extract(record)
|
60
|
+
# # Results in four values being sent to Solr (or 3 if you de-dup):
|
61
|
+
# "Chomsky, Noam"
|
62
|
+
# "Philosophy."
|
63
|
+
# "Chomsky, Noam"
|
64
|
+
# "Political and social views."
|
65
|
+
#
|
66
|
+
# However, **the default is different for specifications with only a single
|
67
|
+
# subfield**, these are by default kept separated:
|
68
|
+
#
|
69
|
+
# 020 a| 285197145X a| 9782851971456
|
70
|
+
# MarcExtractor.new("020a:020z").extract(record)
|
71
|
+
# # two seperate strings sent to Solr:
|
72
|
+
# "285197145X"
|
73
|
+
# "9782851971456"
|
74
|
+
#
|
75
|
+
# For single subfield specifications, you force concatenation by
|
76
|
+
# repeating the subfield specification:
|
77
|
+
#
|
78
|
+
# MarcExtractor.new("020aa:020zz").extract(record)
|
79
|
+
# # would result in a single string sent to solr for
|
80
|
+
# # the single field, by default space-separated:
|
81
|
+
# "285197145X 9782851971456"
|
82
|
+
#
|
83
|
+
# ## Note on Performance and MarcExtractor creation and reuse
|
84
|
+
#
|
85
|
+
# A MarcExtractor is somewhat expensive to create, and has been shown in profiling/
|
86
|
+
# benchmarking to be a bottleneck if you end up creating one for each marc record
|
87
|
+
# processed. Instead, a single MarcExtractor should be created, and re-used
|
88
|
+
# per MARC record.
|
89
|
+
#
|
90
|
+
# If you are creating a traject 'macro' method, here's one way to do that,
|
91
|
+
# capturing the MarcExtractor under closure:
|
92
|
+
#
|
93
|
+
# def some_macro(spec, other_args, whatever)
|
94
|
+
# extractor = MarcExtractor.new( spec )
|
95
|
+
# # ...
|
96
|
+
# return lambda do |record, accumulator, context|
|
97
|
+
# #...
|
98
|
+
# accumulator.concat extractor.extract(record)
|
99
|
+
# #...
|
100
|
+
# end
|
101
|
+
# end
|
102
|
+
#
|
103
|
+
# In other cases, you may find it convenient to improve performance by
|
104
|
+
# using the MarcExtractor#cached method, instead of MarcExtractor#new, to
|
105
|
+
# lazily create and then re-use a MarcExtractor object with
|
106
|
+
# particular initialization arguments.
|
107
|
+
class MarcExtractor
  attr_accessor :options, :spec_hash

  # First arg is a specification for extraction of data from a MARC record.
  # Specification can be given in two forms:
  #
  # * a string specification like "008[35]:020a:245abc", see top of class
  #   for examples. A string specification is most typical argument.
  # * The output of a previous call to MarcExtractor.parse_string_spec(string_spec),
  #   a 'pre-parsed' specification (a Hash).
  #
  # Second arg is options:
  #
  # [:separator]  default ' ' (space), what to use to separate
  #               subfield values when joining strings
  #
  # [:alternate_script] default :include, include linked 880s for tags
  #                     that match spec. Also:
  #                     * false => do not include.
  #                     * :only => only include linked 880s, not original
  #
  # The new object is frozen before it is returned.
  def initialize(spec, options = {})
    self.options = {
      :separator        => ' ',
      :alternate_script => :include
    }.merge(options)

    self.spec_hash = spec.kind_of?(Hash) ? spec : self.class.parse_string_spec(spec)

    # Tags are "interesting" if we have a spec that might cover it
    @interesting_tags_hash = {}

    # Decide from the merged options (defaults applied), not the raw argument.
    alternate_script = self.options[:alternate_script]

    # By default, interesting tags are those represented by keys in spec_hash.
    # Add them unless we only care about alternate scripts.
    unless alternate_script == :only
      spec_hash.keys.each { |tag| @interesting_tags_hash[tag] = true }
    end

    # If we *are* interested in alternate scripts, add the 880
    @interesting_tags_hash['880'] = true if alternate_script != false

    self.freeze
  end

  # Takes the same arguments as MarcExtractor.new, but will re-use an existing
  # cached MarcExtractor already created with given initialization arguments,
  # if available.
  #
  # This can be used to increase performance of indexing routines, as
  # MarcExtractor creation has been shown via profiling/benchmarking
  # to be expensive.
  #
  # The cache is kept in Thread.current, keyed by the argument list.
  #
  # You should _not_ modify the state of any MarcExtractor retrieved
  # via cached, as the MarcExtractor will be re-used. We use ruby #freeze
  # to keep you from doing so, although if you try hard enough you can
  # surely find a way to do something you shouldn't.
  #
  #     extractor = MarcExtractor.cached("245abc:700a", :separator => nil)
  def self.cached(*args)
    cache = (Thread.current[:marc_extractor_cached] ||= Hash.new)
    cache[args] ||= Traject::MarcExtractor.new(*args).freeze
  end

  # Check to see if a tag is interesting (meaning it may be covered by a spec
  # and the passed-in options about alternate scripts)
  def interesting_tag?(tag)
    @interesting_tags_hash.include?(tag)
  end

  # Converts from a string marc spec like "008[35]:245abc:700a" to a hash used internally
  # to represent the specification. See comments at head of class for
  # documentation of string specification format. Also accepts an array of
  # such strings.
  #
  # ## Return value
  #
  # The hash returned is keyed by tag, and has as values an array of one
  # or more MarcExtractor::Spec objects representing the specified extraction
  # operations for that tag.
  #
  # It's an array of possibly more than one, because you can specify
  # multiple extractions on the same tag: for instance "245a:245abc"
  #
  # Raises ArgumentError for any part that is neither a data field nor a
  # control field specification.
  #
  # See tests for more examples.
  def self.parse_string_spec(spec_string)
    hash = {}

    # Split every string on colons, tolerating whitespace around them.
    spec_strings = Array(spec_string).flat_map { |s| s.split(/\s*:\s*/) }

    spec_strings.each do |part|
      if part =~ /\A([a-zA-Z0-9]{3})(\|([a-z0-9\ \*]{2})\|)?([a-z0-9]*)?\Z/
        # variable field, e.g. "245|1*|abc"
        tag, indicators, subfields = $1, $3, $4

        spec = Spec.new(:tag => tag)

        spec.subfields = subfields.split('') if subfields && !subfields.empty?

        if indicators
          # if specified as '*', leave nil
          spec.indicator1 = indicators[0] if indicators[0] != "*"
          spec.indicator2 = indicators[1] if indicators[1] != "*"
        end
      elsif part =~ /\A([a-zA-Z0-9]{3})(\[(\d+)(-(\d+))?\])\Z/
        # control field, e.g. "005[4-5]"
        tag, byte1, byte2 = $1, $3, $5

        spec = Spec.new(:tag => tag)

        if byte1 && byte2
          spec.bytes = ((byte1.to_i)..(byte2.to_i))
        elsif byte1
          spec.bytes = byte1.to_i
        end
      else
        raise ArgumentError, "Unrecognized marc extract specification: #{part}"
      end

      (hash[spec.tag] ||= []) << spec
    end

    hash
  end

  # Returns array of strings, extracted values. Maybe empty array.
  def extract(marc_record)
    results = []

    each_matching_line(marc_record) do |field, spec|
      if control_field?(field)
        results << (spec.bytes ? field.value.byteslice(spec.bytes) : field.value)
      else
        results.concat collect_subfields(field, spec)
      end
    end

    results
  end

  # Yields a block for every line in source record that matches
  # spec. First arg to block is MARC::DataField or ControlField, second
  # is the MarcExtractor::Spec that it matched on. May take account
  # of options such as :alternate_script
  #
  # Third (optional) arg to block is self, the MarcExtractor object, useful for custom
  # implementations.
  def each_matching_line(marc_record)
    marc_record.fields(@interesting_tags_hash.keys).each do |field|
      # Make sure it matches indicators too, specs_covering_field
      # doesn't check that.
      specs_covering_field(field).each do |spec|
        yield(field, spec, self) if spec.matches_indicators?(field)
      end
    end
  end

  # Like each_matching_line, takes a block to process each matching line,
  # but collects results of block into an array -- flattens any subarrays for you!
  #
  # Useful for re-use of this class for custom processing
  #
  # yields the MARC Field, the MarcExtractor::Spec object, the MarcExtractor object.
  def collect_matching_lines(marc_record)
    results = []
    each_matching_line(marc_record) do |field, spec, extractor|
      results.concat [yield(field, spec, extractor)].flatten
    end
    results
  end

  # Pass in a marc data field and a Spec object with extraction
  # instructions, returns an ARRAY of one or more strings, subfields extracted
  # and processed per spec. Takes account of options such
  # as :separator
  #
  # Always returns array, sometimes empty array.
  def collect_subfields(field, spec)
    values = field.subfields.collect do |subfield|
      subfield.value if spec.includes_subfield_code?(subfield.code)
    end.compact

    return values if values.empty? # empty array, just return it.

    if options[:separator] && spec.joinable?
      values = [values.join(options[:separator])]
    end

    values
  end

  # Find Spec objects, if any, covering extraction from this field.
  # Returns an array of 0 or more MarcExtractor::Spec objects
  #
  # When given an 880, will return the spec (if any) for the linked tag iff
  # we have a $6 and we want the alternate script.
  #
  # Returns an empty array in case of no matching extraction specs.
  def specs_covering_field(field)
    tag = field.tag

    # Short-circuit the uninteresting stuff
    return [] unless interesting_tag?(tag)

    # Due to bug in jruby https://github.com/jruby/jruby/issues/886 , we need
    # to do this weird encode gymnastics, which fixes it for mysterious reasons.
    if tag == "880" && field['6']
      tag = field["6"].encode(field["6"].encoding).byteslice(0, 3)
    end

    # Specs registered for the resulting tag, or an empty array if none.
    spec_hash[tag] || []
  end

  def control_field?(field)
    # should the MARC gem have a more efficient way to do this,
    # define #control_field? on both ControlField and DataField?
    field.kind_of? MARC::ControlField
  end

  # Freezes the options and spec hashes along with the extractor itself.
  def freeze
    options.freeze
    spec_hash.freeze
    super
  end

  # Represents a single specification for extracting data
  # from a marc field, like "600abc" or "600|1*|x".
  #
  # Includes the tag for reference, although this is redundant and not actually used
  # in logic, since the tag is also implicit in the overall spec_hash
  # with tag => [spec1, spec2]
  class Spec
    attr_accessor :tag, :subfields, :indicator1, :indicator2, :bytes

    # hash maps attribute names (:tag, :subfields, :indicator1, :indicator2,
    # :bytes) to their initial values; each is assigned through its writer.
    def initialize(hash = {})
      hash.each_pair do |key, value|
        self.send("#{key}=", value)
      end
    end

    # Should subfields extracted be joined, if we have a separator?
    # * '630' no subfields specified => join all subfields
    # * '630abc' multiple subfields specified = join all subfields
    # * '633a' one subfield => do not join, return one value for each $a in the field
    # * '633aa' one subfield, doubled => do join after all, will return a single string joining all the values of all the $a's.
    #
    # Last case is handled implicitly at the moment when subfields == ['a', 'a']
    def joinable?
      subfields.nil? || subfields.size != 1
    end

    # Pass in a MARC field, do its indicators match indicators
    # in this spec? nil indicators in spec mean we don't care, everything
    # matches.
    def matches_indicators?(field)
      (indicator1.nil? || indicator1 == field.indicator1) &&
        (indicator2.nil? || indicator2 == field.indicator2)
    end

    # Pass in a string subfield code like 'a'; does this
    # spec include it?
    def includes_subfield_code?(code)
      # subfields nil means include them all
      subfields.nil? || subfields.include?(code)
    end

    # Two specs are equal when tag, subfields, both indicators and bytes
    # all match. Anything that is not a Spec is never equal.
    def ==(spec)
      return false unless spec.kind_of?(Spec)

      (tag == spec.tag) &&
        (subfields == spec.subfields) &&
        (indicator1 == spec.indicator1) &&
        (indicator2 == spec.indicator2) &&
        (bytes == spec.bytes)
    end
  end
end
|
410
|
+
end
|