traject 2.0.0-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (104) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.travis.yml +27 -0
  4. data/.yardopts +3 -0
  5. data/Gemfile +12 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +461 -0
  8. data/Rakefile +21 -0
  9. data/bench/bench.rb +30 -0
  10. data/bin/traject +16 -0
  11. data/doc/batch_execution.md +243 -0
  12. data/doc/extending.md +190 -0
  13. data/doc/indexing_rules.md +265 -0
  14. data/doc/other_commands.md +47 -0
  15. data/doc/settings.md +101 -0
  16. data/lib/tasks/load_maps.rake +48 -0
  17. data/lib/traject.rb +11 -0
  18. data/lib/traject/command_line.rb +301 -0
  19. data/lib/traject/csv_writer.rb +34 -0
  20. data/lib/traject/debug_writer.rb +47 -0
  21. data/lib/traject/delimited_writer.rb +110 -0
  22. data/lib/traject/indexer.rb +613 -0
  23. data/lib/traject/indexer/settings.rb +110 -0
  24. data/lib/traject/json_writer.rb +51 -0
  25. data/lib/traject/line_writer.rb +63 -0
  26. data/lib/traject/macros/basic.rb +9 -0
  27. data/lib/traject/macros/marc21.rb +223 -0
  28. data/lib/traject/macros/marc21_semantics.rb +584 -0
  29. data/lib/traject/macros/marc_format_classifier.rb +197 -0
  30. data/lib/traject/marc_extractor.rb +410 -0
  31. data/lib/traject/marc_reader.rb +89 -0
  32. data/lib/traject/mock_reader.rb +97 -0
  33. data/lib/traject/ndj_reader.rb +40 -0
  34. data/lib/traject/null_writer.rb +22 -0
  35. data/lib/traject/qualified_const_get.rb +40 -0
  36. data/lib/traject/solr_json_writer.rb +277 -0
  37. data/lib/traject/thread_pool.rb +161 -0
  38. data/lib/traject/translation_map.rb +267 -0
  39. data/lib/traject/util.rb +52 -0
  40. data/lib/traject/version.rb +3 -0
  41. data/lib/traject/yaml_writer.rb +9 -0
  42. data/lib/translation_maps/lcc_top_level.yaml +26 -0
  43. data/lib/translation_maps/marc_genre_007.yaml +9 -0
  44. data/lib/translation_maps/marc_genre_leader.yaml +22 -0
  45. data/lib/translation_maps/marc_geographic.yaml +589 -0
  46. data/lib/translation_maps/marc_instruments.yaml +102 -0
  47. data/lib/translation_maps/marc_languages.yaml +490 -0
  48. data/test/debug_writer_test.rb +38 -0
  49. data/test/delimited_writer_test.rb +104 -0
  50. data/test/indexer/each_record_test.rb +59 -0
  51. data/test/indexer/macros_marc21_semantics_test.rb +391 -0
  52. data/test/indexer/macros_marc21_test.rb +190 -0
  53. data/test/indexer/macros_test.rb +40 -0
  54. data/test/indexer/map_record_test.rb +209 -0
  55. data/test/indexer/read_write_test.rb +101 -0
  56. data/test/indexer/settings_test.rb +152 -0
  57. data/test/indexer/to_field_test.rb +77 -0
  58. data/test/marc_extractor_test.rb +412 -0
  59. data/test/marc_format_classifier_test.rb +98 -0
  60. data/test/marc_reader_test.rb +110 -0
  61. data/test/solr_json_writer_test.rb +248 -0
  62. data/test/test_helper.rb +90 -0
  63. data/test/test_support/245_no_ab.marc +1 -0
  64. data/test/test_support/880_with_no_6.utf8.marc +1 -0
  65. data/test/test_support/bad_subfield_code.marc +1 -0
  66. data/test/test_support/bad_utf_byte.utf8.marc +1 -0
  67. data/test/test_support/date_resort_to_260.marc +1 -0
  68. data/test/test_support/date_type_r_missing_date2.marc +1 -0
  69. data/test/test_support/date_with_u.marc +1 -0
  70. data/test/test_support/demo_config.rb +155 -0
  71. data/test/test_support/emptyish_record.marc +1 -0
  72. data/test/test_support/escaped_character_reference.marc8.marc +1 -0
  73. data/test/test_support/george_eliot.marc +1 -0
  74. data/test/test_support/hebrew880s.marc +1 -0
  75. data/test/test_support/louis_armstrong.marc +1 -0
  76. data/test/test_support/manufacturing_consent.marc +1 -0
  77. data/test/test_support/manuscript_online_thesis.marc +1 -0
  78. data/test/test_support/microform_online_conference.marc +1 -0
  79. data/test/test_support/multi_era.marc +1 -0
  80. data/test/test_support/multi_geo.marc +1 -0
  81. data/test/test_support/musical_cage.marc +1 -0
  82. data/test/test_support/nature.marc +1 -0
  83. data/test/test_support/one-marc8.mrc +1 -0
  84. data/test/test_support/online_only.marc +1 -0
  85. data/test/test_support/packed_041a_lang.marc +1 -0
  86. data/test/test_support/test_data.utf8.json +30 -0
  87. data/test/test_support/test_data.utf8.marc.xml +2609 -0
  88. data/test/test_support/test_data.utf8.mrc +1 -0
  89. data/test/test_support/test_data.utf8.mrc.gz +0 -0
  90. data/test/test_support/the_business_ren.marc +1 -0
  91. data/test/translation_map_test.rb +225 -0
  92. data/test/translation_maps/bad_ruby.rb +8 -0
  93. data/test/translation_maps/bad_yaml.yaml +1 -0
  94. data/test/translation_maps/both_map.rb +1 -0
  95. data/test/translation_maps/both_map.yaml +1 -0
  96. data/test/translation_maps/default_literal.rb +10 -0
  97. data/test/translation_maps/default_passthrough.rb +10 -0
  98. data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
  99. data/test/translation_maps/properties_map.properties +5 -0
  100. data/test/translation_maps/ruby_map.rb +10 -0
  101. data/test/translation_maps/translate_array_test.yaml +8 -0
  102. data/test/translation_maps/yaml_map.yaml +7 -0
  103. data/traject.gemspec +47 -0
  104. metadata +382 -0
@@ -0,0 +1,197 @@
1
+ module Traject
2
+ module Macros
3
+ # To use the marc_format macro, in your configuration file:
4
+ #
5
+ # require 'traject/macros/marc_format_classifier'
6
+ # extend Traject::Macros::MarcFormats
7
+ #
8
+ # to_field "format_s", marc_formats
9
+ #
10
+ # See also MarcFormatClassifier which can be used directly for a bit more
11
+ # control.
12
module MarcFormats
  # Very opinionated macro that adds a grab bag of format/genre/type
  # labels, drawn from our own custom vocabulary, all into one field.
  # You may want to build your own from MarcFormatClassifier methods instead.
  #
  # Returns a two-argument lambda (record, accumulator) that appends
  # every value of MarcFormatClassifier#formats for the record onto the
  # accumulator.
  def marc_formats
    ->(record, accumulator) do
      classifier = Traject::Macros::MarcFormatClassifier.new(record)
      accumulator.concat(classifier.formats)
    end
  end
end
23
+
24
+
25
# A tool for classifying MARC records according to format/form/genre/type,
# just using our own custom vocabulary for those things.
#
# Used by the `marc_formats` macro, but you can also use it directly
# for a bit more control.
class MarcFormatClassifier
  attr_reader :record

  # marc_record - the record to classify. The methods below read its
  # `leader` string and look fields up with `find`, `find_all`,
  # `fields` and `[]`.
  def initialize(marc_record)
    @record = marc_record
  end

  # A very opinionated method that just kind of jams together
  # all the possible format/genre/types into one array of 1 to N elements.
  #
  # options - Hash; the only key read is :default (default "Other"), the
  #           value used when no other format applies.
  #
  # See also the individual methods, which you can use to separate
  # values into different facets or do other custom things.
  #
  # Returns an Array of labels.
  def formats(options = {})
    options = {:default => "Other"}.merge(options)

    formats = []

    formats.concat genre

    formats << "Manuscript/Archive" if manuscript_archive?
    formats << "Microform" if microform?
    formats << "Online" if online?

    # In our own data, if it's an audio recording, it might show up
    # as print, but it's probably not.
    audio = formats.include?("Non-musical Recording") || formats.include?("Musical Recording")
    formats << "Print" if print? && !audio

    # If it's a Dissertation, we decide it's NOT a book
    if thesis?
      formats.delete("Book")
      formats << "Dissertation/Thesis"
    end

    formats << "Conference" if proceeding?

    formats << options[:default] if formats.empty?

    formats
  end

  # Returns 0 or more values in an array from:
  # Book; Journal/Newspaper; Musical Score; Map/Globe; Non-musical Recording; Musical Recording
  # Image; Software/Data; Video/Film
  #
  # Uses leader byte 6, leader byte 7, and 007 byte 0.
  #
  # Gets actual labels from marc_genre_leader and marc_genre_007 translation maps,
  # so you can customize labels if you want.
  def genre
    marc_genre_leader = Traject::TranslationMap.new("marc_genre_leader")
    marc_genre_007    = Traject::TranslationMap.new("marc_genre_007")

    # Try leader bytes 6-7 together, then byte 6 alone, and only then
    # fall back to byte 0 of every 007 field.
    results = marc_genre_leader[record.leader.slice(6, 2)] ||
              marc_genre_leader[record.leader.slice(6)] ||
              record.find_all { |f| f.tag == "007" }.collect { |f| marc_genre_007[f.value.slice(0)] }

    # A 007 whose byte 0 has no entry in the map contributes nil
    # (presumably the map returns nil for unmapped keys -- confirm against
    # TranslationMap). Drop those so #formats never emits nil and its
    # default value can still kick in.
    [results].flatten.compact
  end

  # Just checks if it has a 502, if it does it's considered a thesis.
  #
  # Returns true or false.
  def thesis?
    # `||=` would recompute every time the answer is false, so memoize
    # with an explicit defined? check instead.
    return @thesis_q if defined?(@thesis_q)

    @thesis_q = !record.find { |a| a.tag == "502" }.nil?
  end

  # Just checks all 6xx fields for a $v "Congresses".
  #
  # Returns true or false.
  def proceeding?
    return @proceeding_q if defined?(@proceeding_q)

    congress_field = record.find do |field|
      field.tag.slice(0) == '6' &&
        field.subfields.find { |sf| sf.code == "v" && sf.value =~ /^\s*(C|c)ongresses\.?\s*$/ }
    end

    @proceeding_q = !congress_field.nil?
  end

  # Algorithm with help from Chris Case.
  # * If it has any RDA 338, then it's print if it has a value of
  #   volume, sheet, or card.
  # * If it does not have an RDA 338, it's print if and only if it has
  #   no 245$h GMD.
  #
  # * Here at JH, for legacy reasons we also choose to not
  #   call it print if it's already been marked audio, but
  #   we do that in a different method.
  #
  # Note that any record that has neither a 245 nor a 338rda is going
  # to be marked print.
  #
  # This algorithm is definitely going to get some things wrong in
  # both directions, with real world data. But seems to be good enough.
  #
  # Returns true or false.
  def print?
    rda338 = record.find_all do |field|
      field.tag == "338" && field['2'] == "rdacarrier"
    end

    if rda338.length > 0
      rda338.any? do |field|
        field.subfields.any? do |sf|
          (sf.code == "a" && %w{volume card sheet}.include?(sf.value)) ||
            (sf.code == "b" && %w{nc no nb}.include?(sf.value))
        end
      end
    else
      normalized_gmd.length == 0
    end
  end

  # We use marc 007 to determine if this represents an online
  # resource. But sometimes resort to 245$h GMD too.
  #
  # Returns true or false.
  def online?
    # field 007, byte 0 c="electronic" byte 1 r="remote" ==> sure Online
    found_007 = record.fields('007').find do |field|
      field.value.slice(0) == "c" && field.value.slice(1) == "r"
    end

    return true if found_007

    # Otherwise, if it has a GMD ["electronic resource"], we count it
    # as online only if NO 007[0] == 'c' exists, cause if it does we already
    # know it's electronic but not remote, otherwise first try would
    # have found it.
    normalized_gmd.start_with?("[electronic resource]") &&
      !record.find { |f| f.tag == '007' && f.value.slice(0) == "c" }
  end

  # If field 007 byte 0 is 'h', that's microform. But many of our microform
  # don't have that. If leader byte 6 is 'h', that's an obsolete way of saying
  # microform. And finally, a GMD starting with "[microform]" counts too.
  #
  # Returns true or false.
  def microform?
    normalized_gmd.start_with?("[microform]") ||
      record.leader[6] == "h" ||
      !record.find { |f| (f.tag == "007") && (f.value[0] == "h") }.nil?
  end

  # Marked as manuscript OR archive.
  #
  # Returns true or false.
  def manuscript_archive?
    leader06 = record.leader.slice(6)
    leader08 = record.leader.slice(8)

    # leader 6 t=Manuscript Language Material, d=Manuscript Music,
    # f=Manuscript Cartographic
    #
    # leader 06 = 'b' is obsolete, but if it exists it means archival control
    #
    # leader 08 'a'='archival control'
    %w{t d f b}.include?(leader06) || leader08 == "a"
  end

  # Downcased version of the GMD (245$h), or else empty string. Memoized.
  def normalized_gmd
    @gmd ||= begin
      a245 = record['245']
      (a245 && a245['h'] && a245['h'].downcase) || ""
    end
  end
end
196
+ end
197
+ end
@@ -0,0 +1,410 @@
1
+ module Traject
2
+ # MarcExtractor is a class for extracting lists of strings from a MARC::Record,
3
+ # according to specifications. See #parse_string_spec for description of string
4
+ # arguments used to specify extraction. See #initialize for options
5
+ # that can be set controlling extraction.
6
+ #
7
+ # Examples:
8
+ #
9
+ # array_of_stuff = MarcExtractor.new("001:245abc:700a").extract(marc_record)
10
+ # values = MarcExtractor.new("245a:245abc").extract(marc_record)
11
+ # seperated_values = MarcExtractor.new("020a:020z").extract(marc_record)
12
+ # bytes = MarcExtractor.new("008[35-37]")
13
+ #
14
+ # ## String extraction specifications
15
+ #
16
+ # Extraction directions are supplied in strings, usually as the first
17
+ # parameter to MarcExtractor.new or MarcExtractor.cached. These specifications
18
+ # are also the first parameter to the #marc_extract macro.
19
+ #
20
+ # A String specification is a string (or array of strings) which consists
21
+ # of one or more Data and Control Field Specifications separated by colons.
22
+ #
23
+ # A Data Field Specification is of the form:
24
+ #
25
+ # * `{tag}{|indicators|}{subfields}`
26
+ # * {tag} is three chars (usually but not necessarily numeric)
27
+ # * {indicators} are optional two chars enclosed in pipe ('|') characters,
28
+ # * {subfields} are optional list of chars (alphanumeric)
29
+ #
30
+ # indicator spec must be two chars, but one can be * meaning "don't care".
31
+ # space to mean 'blank'
32
+ #
33
+ # "245|01|abc65:345abc:700|*5|:800"
34
+ #
35
+ # A Control Field Specification is used with tags for control (fixed) fields (ordinarily fields 001-010)
36
+ # and includes a tag and a byte slice specification.
37
+ #
38
+ # "008[35-37]:007[5]"
39
+ # => bytes 35-37 inclusive of any field 008, and byte 5 of any field 007
40
+ #
41
+ # * subfields and indicators can only be provided for marc data/variable fields
42
+ # * byte slice can only be provided for marc control fields (generally tags less than 010)
43
+ #
44
+ # ## Subfield concatenation
45
+ #
46
+ # Normally, for a spec including multiple subfield codes, multiple subfields
47
+ # from the same MARC field will be concatenated into one string separated by spaces:
48
+ #
49
+ # 600 a| Chomsky, Noam x| Philosophy.
50
+ # 600 a| Chomsky, Noam x| Political and social views.
51
+ # MarcExtractor.new("600ax").extract(record)
52
+ # # results in two values sent to Solr:
53
+ # "Chomsky, Noam Philosophy."
54
+ # "Chomsky, Noam Political and social views."
55
+ #
56
+ # You can turn off this concatenation and leave individual subfields in separate
57
+ # strings by setting the `separator` option to nil:
58
+ #
59
+ # MarcExtractor.new("600ax", :separator => nil).extract(record)
60
+ # # Results in four values being sent to Solr (or 3 if you de-dup):
61
+ # "Chomsky, Noam"
62
+ # "Philosophy."
63
+ # "Chomsky, Noam"
64
+ # "Political and social views."
65
+ #
66
+ # However, **the default is different for specifications with only a single
67
+ # subfield**, these are by default kept separated:
68
+ #
69
+ # 020 a| 285197145X a| 9782851971456
70
+ # MarcExtractor.new("020a:020z").extract(record)
71
+ # # two seperate strings sent to Solr:
72
+ # "285197145X"
73
+ # "9782851971456"
74
+ #
75
+ # For single subfield specifications, you force concatenation by
76
+ # repeating the subfield specification:
77
+ #
78
+ # MarcExtractor.new("020aa:020zz").extract(record)
79
+ # # would result in a single string sent to solr for
80
+ # # the single field, by default space-separated:
81
+ # "285197145X 9782851971456"
82
+ #
83
+ # ## Note on Performance and MarcExtractor creation and reuse
84
+ #
85
+ # A MarcExtractor is somewhat expensive to create, and has been shown in profiling/
86
+ # benchmarking to be a bottleneck if you end up creating one for each marc record
87
+ # processed. Instead, a single MarcExtractor should be created, and re-used
88
+ # per MARC record.
89
+ #
90
+ # If you are creating a traject 'macro' method, here's one way to do that,
91
+ # capturing the MarcExtractor under closure:
92
+ #
93
+ # def some_macro(spec, other_args, whatever)
94
+ # extractor = MarcExtractor.new( spec )
95
+ # # ...
96
+ # return lambda do |record, accumulator, context|
97
+ # #...
98
+ # accumulator.concat extractor.extract(record)
99
+ # #...
100
+ # end
101
+ # end
102
+ #
103
+ # In other cases, you may find it convenient to improve performance by
104
+ # using the MarcExtractor.cached method, instead of MarcExtractor.new, to
105
+ # lazily create and then re-use a MarcExtractor object with
106
+ # particular initialization arguments.
107
class MarcExtractor
  attr_accessor :options, :spec_hash

  # First arg is a specification for extraction of data from a MARC record.
  # Specification can be given in two forms:
  #
  # * a string specification like "008[35]:020a:245abc". A string
  #   specification is the most typical argument.
  # * The output of a previous call to MarcExtractor.parse_string_spec(string_spec),
  #   a 'pre-parsed' specification (a Hash).
  #
  # Second arg is options:
  #
  # [:separator]  default ' ' (space), what to use to separate
  #               subfield values when joining strings
  #
  # [:alternate_script] default :include, include linked 880s for tags
  #                     that match spec. Also:
  #                     * false => do not include.
  #                     * :only => only include linked 880s, not original
  #
  # The new extractor is frozen before it is returned.
  def initialize(spec, options = {})
    self.options = {
      :separator        => ' ',
      :alternate_script => :include
    }.merge(options)

    self.spec_hash = spec.kind_of?(Hash) ? spec : self.class.parse_string_spec(spec)

    # Tags are "interesting" if we have a spec that might cover it
    @interesting_tags_hash = {}

    # Consult the merged options (self.options) rather than the raw
    # argument, so the documented defaults are what actually gets checked.
    alternate_script = self.options[:alternate_script]

    # By default, interesting tags are those represented by keys in spec_hash.
    # Add them unless we only care about alternate scripts.
    unless alternate_script == :only
      spec_hash.each_key { |tag| @interesting_tags_hash[tag] = true }
    end

    # If we *are* interested in alternate scripts, add the 880
    @interesting_tags_hash['880'] = true if alternate_script != false

    freeze
  end

  # Takes the same arguments as MarcExtractor.new, but will re-use an existing
  # cached MarcExtractor already created with given initialization arguments,
  # if available.
  #
  # This can be used to increase performance of indexing routines, as
  # MarcExtractor creation has been shown via profiling/benchmarking
  # to be expensive.
  #
  # The cache is a Hash kept in a thread-local (Thread.current), keyed by
  # the argument list.
  #
  # You should _not_ modify the state of any MarcExtractor retrieved
  # via cached, as the MarcExtractor will be re-used and shared. We use
  # ruby #freeze to try to keep you from doing so.
  #
  #     extractor = MarcExtractor.cached("245abc:700a", :separator => nil)
  def self.cached(*args)
    cache = (Thread.current[:marc_extractor_cached] ||= Hash.new)
    cache[args] ||= Traject::MarcExtractor.new(*args).freeze
  end

  # Check to see if a tag is interesting (meaning it may be covered by a spec
  # and the passed-in options about alternate scripts)
  def interesting_tag?(tag)
    @interesting_tags_hash.include?(tag)
  end

  # Converts from a string marc spec like "008[35]:245abc:700a" to a hash used
  # internally to represent the specification. Accepts a single string or an
  # array of such strings; individual specs are separated by colons, with
  # optional whitespace on either side of the colon.
  #
  # ## Return value
  #
  # The hash returned is keyed by tag, and has as values an array of one
  # or more MarcExtractor::Spec objects representing the specified extraction
  # operations for that tag.
  #
  # It's an array of possibly more than one, because you can specify
  # multiple extractions on the same tag: for instance "245a:245abc"
  #
  # Raises ArgumentError for a part that is neither a data field nor a
  # control field specification.
  def self.parse_string_spec(spec_string)
    # Plain hash: a missing tag yields nil, entries are created on demand below.
    hash = {}

    # The separator is a colon with optional surrounding whitespace. (An
    # earlier `/s*:\s*/` for the plain-string case lacked the backslash and
    # swallowed a literal "s" subfield code sitting just before a colon.)
    separator = /\s*:\s*/
    spec_strings = if spec_string.is_a?(Array)
                     spec_string.map { |s| s.split(separator) }.flatten
                   else
                     spec_string.split(separator)
                   end

    spec_strings.each do |part|
      if (match = /\A([a-zA-Z0-9]{3})(\|([a-z0-9\ \*]{2})\|)?([a-z0-9]*)?\Z/.match(part))
        # variable field
        tag, indicators, subfields = match[1], match[3], match[4]

        spec = Spec.new(:tag => tag)

        if subfields and !subfields.empty?
          spec.subfields = subfields.split('')
        end

        if indicators
          # if specified as '*', leave nil
          spec.indicator1 = indicators[0] if indicators[0] != "*"
          spec.indicator2 = indicators[1] if indicators[1] != "*"
        end
      elsif (match = /\A([a-zA-Z0-9]{3})(\[(\d+)(-(\d+))?\])\Z/.match(part)) # control field, "005[4-5]"
        tag, byte1, byte2 = match[1], match[3], match[5]

        spec = Spec.new(:tag => tag)

        if byte1 && byte2
          spec.bytes = ((byte1.to_i)..(byte2.to_i))
        elsif byte1
          spec.bytes = byte1.to_i
        end
      else
        raise ArgumentError.new("Unrecognized marc extract specification: #{part}")
      end

      (hash[spec.tag] ||= []) << spec
    end

    hash
  end

  # Returns array of strings, extracted values. Maybe empty array.
  def extract(marc_record)
    results = []

    each_matching_line(marc_record) do |field, spec|
      if control_field?(field)
        results << (spec.bytes ? field.value.byteslice(spec.bytes) : field.value)
      else
        results.concat collect_subfields(field, spec)
      end
    end

    results
  end

  # Yields a block for every line in source record that matches
  # spec. First arg to block is MARC::DataField or ControlField, second
  # is the MarcExtractor::Spec that it matched on. May take account
  # of options such as :alternate_script
  #
  # Third (optional) arg to block is self, the MarcExtractor object, useful for custom
  # implementations.
  def each_matching_line(marc_record)
    marc_record.fields(@interesting_tags_hash.keys).each do |field|
      # Make sure it matches indicators too, specs_covering_field
      # doesn't check that.
      specs_covering_field(field).each do |spec|
        yield(field, spec, self) if spec.matches_indicators?(field)
      end
    end
  end

  # Like each_matching_line, takes a block to process each matching line,
  # but collects results of block into an array -- flattens any subarrays for you!
  #
  # Useful for re-use of this class for custom processing
  #
  # yields the MARC Field, the MarcExtractor::Spec object, the MarcExtractor object.
  def collect_matching_lines(marc_record)
    results = []
    each_matching_line(marc_record) do |field, spec, extractor|
      results.concat [yield(field, spec, extractor)].flatten
    end
    results
  end

  # Pass in a marc data field and a Spec object with extraction
  # instructions, returns an ARRAY of one or more strings, subfields extracted
  # and processed per spec. Takes account of options such
  # as :separator
  #
  # Always returns array, sometimes empty array.
  def collect_subfields(field, spec)
    subfields = field.subfields.collect do |subfield|
      subfield.value if spec.includes_subfield_code?(subfield.code)
    end.compact

    return subfields if subfields.empty? # empty array, just return it.

    if options[:separator] && spec.joinable?
      subfields = [subfields.join(options[:separator])]
    end

    subfields
  end

  # Find Spec objects, if any, covering extraction from this field.
  # Returns an array of 0 or more MarcExtractor::Spec objects
  #
  # When given an 880 with a $6, returns the specs (if any) for the linked
  # tag, taken from the first three bytes of the $6. Whether 880s are
  # considered at all is decided by the interesting-tag check.
  #
  # Returns an empty array in case of no matching extraction specs.
  def specs_covering_field(field)
    tag = field.tag

    # Short-circuit the uninteresting stuff
    return [] unless interesting_tag?(tag)

    # Due to bug in jruby https://github.com/jruby/jruby/issues/886 , we need
    # to do this weird encode gymnastics, which fixes it for mysterious reasons.
    if tag == "880" && field['6']
      tag = field["6"].encode(field["6"].encoding).byteslice(0,3)
    end

    # Specs registered for the resulting tag, or an empty array if none.
    spec_hash[tag] || []
  end

  # Is the field a MARC::ControlField?
  def control_field?(field)
    # should the MARC gem have a more efficient way to do this,
    # define #control_field? on both ControlField and DataField?
    field.kind_of? MARC::ControlField
  end

  # Freezes the options and spec hashes along with the extractor itself.
  def freeze
    options.freeze
    spec_hash.freeze
    super
  end

  # Represents a single specification for extracting data
  # from a marc field, like "600abc" or "600|1*|x".
  #
  # Includes the tag for reference, although this is redundant and not actually used
  # in logic, since the tag is also implicit in the overall spec_hash
  # with tag => [spec1, spec2]
  class Spec
    attr_accessor :tag, :subfields, :indicator1, :indicator2, :bytes

    # hash - attribute name => value pairs; each is assigned through the
    #        matching writer method.
    def initialize(hash = {})
      hash.each_pair do |key, value|
        self.send("#{key}=", value)
      end
    end

    # Should subfields extracted be joined, if we have a separator?
    # * '630' no subfields specified => join all subfields
    # * '630abc' multiple subfields specified => join all subfields
    # * '633a' one subfield => do not join, return one value for each $a in the field
    # * '633aa' one subfield, doubled => do join after all, will return a single string joining all the values of all the $a's.
    #
    # Last case is handled implicitly at the moment when subfields == ['a', 'a']
    def joinable?
      subfields.nil? || subfields.size != 1
    end

    # Pass in a MARC field, do its indicators match indicators
    # in this spec? nil indicators in spec mean we don't care, everything
    # matches.
    def matches_indicators?(field)
      (indicator1.nil? || indicator1 == field.indicator1) &&
        (indicator2.nil? || indicator2 == field.indicator2)
    end

    # Pass in a string subfield code like 'a'; does this
    # spec include it?
    def includes_subfield_code?(code)
      # subfields nil means include them all
      subfields.nil? || subfields.include?(code)
    end

    # Two specs are equal when tag, subfields, both indicators and bytes
    # all match.
    def ==(spec)
      return false unless spec.kind_of?(Spec)

      (tag == spec.tag) &&
        (subfields == spec.subfields) &&
        (indicator1 == spec.indicator1) &&
        (indicator2 == spec.indicator2) &&
        (bytes == spec.bytes)
    end
  end
end
410
+ end