traject 2.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.travis.yml +27 -0
  4. data/.yardopts +3 -0
  5. data/Gemfile +12 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +461 -0
  8. data/Rakefile +21 -0
  9. data/bench/bench.rb +30 -0
  10. data/bin/traject +16 -0
  11. data/doc/batch_execution.md +243 -0
  12. data/doc/extending.md +190 -0
  13. data/doc/indexing_rules.md +265 -0
  14. data/doc/other_commands.md +47 -0
  15. data/doc/settings.md +101 -0
  16. data/lib/tasks/load_maps.rake +48 -0
  17. data/lib/traject.rb +11 -0
  18. data/lib/traject/command_line.rb +301 -0
  19. data/lib/traject/csv_writer.rb +34 -0
  20. data/lib/traject/debug_writer.rb +47 -0
  21. data/lib/traject/delimited_writer.rb +110 -0
  22. data/lib/traject/indexer.rb +613 -0
  23. data/lib/traject/indexer/settings.rb +110 -0
  24. data/lib/traject/json_writer.rb +51 -0
  25. data/lib/traject/line_writer.rb +63 -0
  26. data/lib/traject/macros/basic.rb +9 -0
  27. data/lib/traject/macros/marc21.rb +223 -0
  28. data/lib/traject/macros/marc21_semantics.rb +584 -0
  29. data/lib/traject/macros/marc_format_classifier.rb +197 -0
  30. data/lib/traject/marc_extractor.rb +410 -0
  31. data/lib/traject/marc_reader.rb +89 -0
  32. data/lib/traject/mock_reader.rb +97 -0
  33. data/lib/traject/ndj_reader.rb +40 -0
  34. data/lib/traject/null_writer.rb +22 -0
  35. data/lib/traject/qualified_const_get.rb +40 -0
  36. data/lib/traject/solr_json_writer.rb +277 -0
  37. data/lib/traject/thread_pool.rb +161 -0
  38. data/lib/traject/translation_map.rb +267 -0
  39. data/lib/traject/util.rb +52 -0
  40. data/lib/traject/version.rb +3 -0
  41. data/lib/traject/yaml_writer.rb +9 -0
  42. data/lib/translation_maps/lcc_top_level.yaml +26 -0
  43. data/lib/translation_maps/marc_genre_007.yaml +9 -0
  44. data/lib/translation_maps/marc_genre_leader.yaml +22 -0
  45. data/lib/translation_maps/marc_geographic.yaml +589 -0
  46. data/lib/translation_maps/marc_instruments.yaml +102 -0
  47. data/lib/translation_maps/marc_languages.yaml +490 -0
  48. data/test/debug_writer_test.rb +38 -0
  49. data/test/delimited_writer_test.rb +104 -0
  50. data/test/indexer/each_record_test.rb +59 -0
  51. data/test/indexer/macros_marc21_semantics_test.rb +391 -0
  52. data/test/indexer/macros_marc21_test.rb +190 -0
  53. data/test/indexer/macros_test.rb +40 -0
  54. data/test/indexer/map_record_test.rb +209 -0
  55. data/test/indexer/read_write_test.rb +101 -0
  56. data/test/indexer/settings_test.rb +152 -0
  57. data/test/indexer/to_field_test.rb +77 -0
  58. data/test/marc_extractor_test.rb +412 -0
  59. data/test/marc_format_classifier_test.rb +98 -0
  60. data/test/marc_reader_test.rb +110 -0
  61. data/test/solr_json_writer_test.rb +248 -0
  62. data/test/test_helper.rb +90 -0
  63. data/test/test_support/245_no_ab.marc +1 -0
  64. data/test/test_support/880_with_no_6.utf8.marc +1 -0
  65. data/test/test_support/bad_subfield_code.marc +1 -0
  66. data/test/test_support/bad_utf_byte.utf8.marc +1 -0
  67. data/test/test_support/date_resort_to_260.marc +1 -0
  68. data/test/test_support/date_type_r_missing_date2.marc +1 -0
  69. data/test/test_support/date_with_u.marc +1 -0
  70. data/test/test_support/demo_config.rb +155 -0
  71. data/test/test_support/emptyish_record.marc +1 -0
  72. data/test/test_support/escaped_character_reference.marc8.marc +1 -0
  73. data/test/test_support/george_eliot.marc +1 -0
  74. data/test/test_support/hebrew880s.marc +1 -0
  75. data/test/test_support/louis_armstrong.marc +1 -0
  76. data/test/test_support/manufacturing_consent.marc +1 -0
  77. data/test/test_support/manuscript_online_thesis.marc +1 -0
  78. data/test/test_support/microform_online_conference.marc +1 -0
  79. data/test/test_support/multi_era.marc +1 -0
  80. data/test/test_support/multi_geo.marc +1 -0
  81. data/test/test_support/musical_cage.marc +1 -0
  82. data/test/test_support/nature.marc +1 -0
  83. data/test/test_support/one-marc8.mrc +1 -0
  84. data/test/test_support/online_only.marc +1 -0
  85. data/test/test_support/packed_041a_lang.marc +1 -0
  86. data/test/test_support/test_data.utf8.json +30 -0
  87. data/test/test_support/test_data.utf8.marc.xml +2609 -0
  88. data/test/test_support/test_data.utf8.mrc +1 -0
  89. data/test/test_support/test_data.utf8.mrc.gz +0 -0
  90. data/test/test_support/the_business_ren.marc +1 -0
  91. data/test/translation_map_test.rb +225 -0
  92. data/test/translation_maps/bad_ruby.rb +8 -0
  93. data/test/translation_maps/bad_yaml.yaml +1 -0
  94. data/test/translation_maps/both_map.rb +1 -0
  95. data/test/translation_maps/both_map.yaml +1 -0
  96. data/test/translation_maps/default_literal.rb +10 -0
  97. data/test/translation_maps/default_passthrough.rb +10 -0
  98. data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
  99. data/test/translation_maps/properties_map.properties +5 -0
  100. data/test/translation_maps/ruby_map.rb +10 -0
  101. data/test/translation_maps/translate_array_test.yaml +8 -0
  102. data/test/translation_maps/yaml_map.yaml +7 -0
  103. data/traject.gemspec +47 -0
  104. metadata +382 -0
@@ -0,0 +1,197 @@
1
+ module Traject
2
+ module Macros
3
+ # To use the marc_format macro, in your configuration file:
4
+ #
5
+ # require 'traject/macros/marc_format_classifier'
6
+ # extend Traject::Macros::MarcFormats
7
+ #
8
+ # to_field("format_s") marc_formats
9
+ #
10
+ # See also MarcFormatClassifier which can be used directly for a bit more
11
+ # control.
12
module MarcFormats
  # A deliberately opinionated macro: it dumps the whole grab bag of
  # format/genre/type labels (from our own custom vocabulary) into a
  # single field. If you want finer control, build your own macro from
  # the individual MarcFormatClassifier methods instead.
  #
  # Returns a two-argument lambda (record, accumulator) that appends
  # every value of MarcFormatClassifier#formats for the record onto the
  # accumulator.
  def marc_formats
    ->(record, accumulator) do
      classifier = Traject::Macros::MarcFormatClassifier.new(record)
      accumulator.concat(classifier.formats)
    end
  end
end
23
+
24
+
25
+ # A tool for classifying MARC records according to format/form/genre/type,
26
+ # just using our own custom vocabulary for those things.
27
+ #
28
+ # used by the `marc_formats` macro, but you can also use it directly
29
+ # for a bit more control.
30
class MarcFormatClassifier
  # The MARC record being classified.
  attr_reader :record

  # marc_record - the record to classify. The methods below call
  #               #leader, #fields(tag), #[](tag) and Enumerable-style
  #               iteration over fields on it.
  def initialize(marc_record)
    @record = marc_record
  end

  # A very opinionated method that just kind of jams together all the
  # possible format/genre/types into one array of 1 to N elements.
  #
  # options - Hash; :default is the single value used when nothing else
  #           applies (defaults to "Other").
  #
  # See also the individual methods, which you can use to separate the
  # values into different facets or do other custom things.
  #
  # Returns an Array of labels, never empty.
  def formats(options = {})
    options = {:default => "Other"}.merge(options)

    formats = []

    formats.concat genre

    formats << "Manuscript/Archive" if manuscript_archive?
    formats << "Microform" if microform?
    formats << "Online" if online?

    # In our own data, if it's an audio recording, it might show up
    # as print, but it's probably not.
    audio = formats.include?("Non-musical Recording") || formats.include?("Musical Recording")
    formats << "Print" if print? && !audio

    # If it's a Dissertation, we decide it's NOT a book
    if thesis?
      formats.delete("Book")
      formats << "Dissertation/Thesis"
    end

    formats << "Conference" if proceeding?

    formats << options[:default] if formats.empty?

    formats
  end

  # Returns 0 or more values in an array from:
  # Book; Journal/Newspaper; Musical Score; Map/Globe; Non-musical Recording; Musical Recording
  # Image; Software/Data; Video/Film
  #
  # Uses leader byte 6, leader byte 7, and 007 byte 0. The leader is
  # tried first (bytes 6-7 together, then byte 6 alone); only when
  # neither lookup yields a value are the 007 fields consulted.
  #
  # Gets actual labels from marc_genre_leader and marc_genre_007 translation maps,
  # so you can customize labels if you want.
  def genre
    marc_genre_leader = Traject::TranslationMap.new("marc_genre_leader")
    marc_genre_007 = Traject::TranslationMap.new("marc_genre_007")

    results = marc_genre_leader[record.leader.slice(6, 2)] ||
              marc_genre_leader[record.leader.slice(6)] ||
              record.find_all { |f| f.tag == "007" }.collect { |f| marc_genre_007[f.value.slice(0)] }

    # A 007 whose first byte has no entry in the map yields nil; drop those
    # so callers never see nil labels and the :default in #formats can apply.
    [results].flatten.compact
  end

  # Just checks if it has a 502, if it does it's considered a thesis.
  # Memoized with defined? so that a false answer is cached too.
  def thesis?
    return @thesis_q if defined?(@thesis_q)

    @thesis_q = record.any? { |field| field.tag == "502" }
  end

  # Just checks all 6xx fields for a $v "Congresses".
  # Memoized with defined? so that a false answer is cached too.
  def proceeding?
    return @proceeding_q if defined?(@proceeding_q)

    @proceeding_q = record.any? do |field|
      field.tag.slice(0) == '6' &&
        field.subfields.any? { |sf| sf.code == "v" && sf.value =~ /^\s*(C|c)ongresses\.?\s*$/ }
    end
  end

  # Algorithm with help from Chris Case.
  # * If it has any RDA 338, then it's print if it has a value of
  #   volume, sheet, or card (338$a), or the codes nc, no, nb (338$b).
  # * If it does not have an RDA 338, it's print if and only if it has
  #   no 245$h GMD.
  #
  # * Here at JH, for legacy reasons we also choose to not
  #   call it print if it's already been marked audio, but
  #   we do that in a different method (see #formats).
  #
  # Note that any record that has neither a 245 nor a 338rda is going
  # to be marked print.
  #
  # This algorithm is definitely going to get some things wrong in
  # both directions, with real world data. But seems to be good enough.
  #
  # Returns true or false.
  def print?
    rda338 = record.find_all do |field|
      field.tag == "338" && field['2'] == "rdacarrier"
    end

    return normalized_gmd.empty? if rda338.empty?

    rda338.any? do |field|
      field.subfields.any? do |sf|
        (sf.code == "a" && %w{volume card sheet}.include?(sf.value)) ||
          (sf.code == "b" && %w{nc no nb}.include?(sf.value))
      end
    end
  end

  # We use marc 007 to determine if this represents an online
  # resource. But sometimes resort to 245$h GMD too.
  #
  # Returns true or false.
  def online?
    fields_007 = record.fields('007')

    # field 007, byte 0 c="electronic" byte 1 r="remote" ==> sure Online
    return true if fields_007.any? { |f| f.value.slice(0) == "c" && f.value.slice(1) == "r" }

    # Otherwise, if it has a GMD ["electronic resource"], we count it
    # as online only if NO 007[0] == 'c' exists, cause if it does we already
    # know it's electronic but not remote, otherwise first try would
    # have found it.
    normalized_gmd.start_with?("[electronic resource]") &&
      fields_007.none? { |f| f.value.slice(0) == "c" }
  end

  # If field 007 byte 0 is 'h', that's microform. But many of our microform
  # don't have that. If leader byte 6 is 'h', that's an obsolete way of saying
  # microform. And finally, a GMD (245$h) starting with "[microform]" counts too.
  #
  # Returns true or false.
  def microform?
    normalized_gmd.start_with?("[microform]") ||
      record.leader[6] == "h" ||
      record.any? { |f| (f.tag == "007") && (f.value[0] == "h") }
  end

  # Marked as manuscript OR archive.
  def manuscript_archive?
    leader06 = record.leader.slice(6)
    leader08 = record.leader.slice(8)

    # leader 6 t=Manuscript Language Material, d=Manuscript Music,
    # f=Manuscript Cartographic
    #
    # leader 06 = 'b' is obsolete, but if it exists it means archival control
    #
    # leader 08 'a'='archival control'
    %w{t d f b}.include?(leader06) || leader08 == "a"
  end

  # Downcased version of the GMD (245$h), or else empty string. Memoized.
  def normalized_gmd
    @gmd ||= begin
      title = record['245']
      gmd = title && title['h']
      gmd ? gmd.downcase : ""
    end
  end
end
196
+ end
197
+ end
@@ -0,0 +1,410 @@
1
+ module Traject
2
+ # MarcExtractor is a class for extracting lists of strings from a MARC::Record,
3
+ # according to specifications. See #parse_string_spec for description of string
4
+ # arguments used to specify extraction. See #initialize for options
5
+ # that can be set controlling extraction.
6
+ #
7
+ # Examples:
8
+ #
9
+ # array_of_stuff = MarcExtractor.new("001:245abc:700a").extract(marc_record)
10
+ # values = MarcExtractor.new("245a:245abc").extract(marc_record)
11
+ # seperated_values = MarcExtractor.new("020a:020z").extract(marc_record)
12
+ # bytes = MarcExtractor.new("008[35-37]")
13
+ #
14
+ # ## String extraction specifications
15
+ #
16
+ # Extraction directions are supplied in strings, usually as the first
17
+ # parameter to MarcExtractor.new or MarcExtractor.cached. These specifications
18
+ # are also the first parameter to the #marc_extract macro.
19
+ #
20
+ # A String specification is a string (or array of strings) which consists
21
+ # of one or more Data and Control Field Specifications seperated by colons.
22
+ #
23
+ # A Data Field Specification is of the form:
24
+ #
25
+ # * `{tag}{|indicators|}{subfields}`
26
+ # * {tag} is three chars (usually but not neccesarily numeric)
27
+ # * {indicators} are optional two chars enclosed in pipe ('|') characters,
28
+ # * {subfields} are optional list of chars (alphanumeric)
29
+ #
30
+ # indicator spec must be two chars, but one can be * meaning "don't care".
31
+ # space to mean 'blank'
32
+ #
33
+ # "245|01|abc65:345abc:700|*5|:800"
34
+ #
35
+ # A Control Field Specification is used with tags for control (fixed) fields (ordinarily fields 001-010)
36
+ # and includes a tag and a byte slice specification.
37
+ #
38
+ # "008[35-37]:007[5]"
39
+ # => bytes 35-37 inclusive of any field 008, and byte 5 of any field 007
40
+ #
41
+ # * subfields and indicators can only be provided for marc data/variable fields
42
+ # * byte slice can only be provided for marc control fields (generally tags less than 010)
43
+ #
44
+ # ## Subfield concatenation
45
+ #
46
+ # Normally, for a spec including multiple subfield codes, multiple subfields
47
+ # from the same MARC field will be concatenated into one string separated by spaces:
48
+ #
49
+ # 600 a| Chomsky, Noam x| Philosophy.
50
+ # 600 a| Chomsky, Noam x| Political and social views.
51
+ # MarcExtractor.new("600ax").extract(record)
52
+ # # results in two values sent to Solr:
53
+ # "Chomsky, Noam Philosophy."
54
+ # "Chomsky, Noam Political and social views."
55
+ #
56
+ # You can turn off this concatenation and leave individual subfields in seperate
57
+ # strings by setting the `separator` option to nil:
58
+ #
59
+ # MarcExtractor.new("600ax", :separator => nil).extract(record)
60
+ # # Results in four values being sent to Solr (or 3 if you de-dup):
61
+ # "Chomsky, Noam"
62
+ # "Philosophy."
63
+ # "Chomsky, Noam"
64
+ # "Political and social views."
65
+ #
66
+ # However, **the default is different for specifications with only a single
67
+ # subfield**, these are by default kept seperated:
68
+ #
69
+ # 020 a| 285197145X a| 9782851971456
70
+ # MarcExtractor.new("020a:020z").extract(record)
71
+ # # two seperate strings sent to Solr:
72
+ # "285197145X"
73
+ # "9782851971456"
74
+ #
75
+ # For single subfield specifications, you force concatenation by
76
+ # repeating the subfield specification:
77
+ #
78
+ # MarcExtractor.new("020aa:020zz").extract(record)
79
+ # # would result in a single string sent to solr for
80
+ # # the single field, by default space-separated:
81
+ # "285197145X 9782851971456"
82
+ #
83
+ # ## Note on Performance and MarcExtractor creation and reuse
84
+ #
85
+ # A MarcExtractor is somewhat expensive to create, and has been shown in profiling/
86
+ # benchmarking to be a bottleneck if you end up creating one for each marc record
87
+ # processed. Instead, a single MarcExtractor should be created, and re-used
88
+ # per MARC record.
89
+ #
90
+ # If you are creating a traject 'macro' method, here's one way to do that,
91
+ # capturing the MarcExtractor under closure:
92
+ #
93
+ # def some_macro(spec, other_args, whatever)
94
+ # extractor = MarcExtractor.new( spec )
95
+ # # ...
96
+ # return lambda do |record, accumulator, context|
97
+ # #...
98
+ # accumulator.concat extractor.extract(record)
99
+ # #...
100
+ # end
101
+ # end
102
+ #
103
+ # In other cases, you may find it convenient to improve performance by
104
+ # using the MarcExtractor#cached method, instead of MarcExtractor#new, to
105
+ # lazily create and then re-use a MarcExtractor object with
106
+ # particular initialization arguments.
107
class MarcExtractor
  # Data (variable) field spec: tag, optional |indicators|, optional subfield codes.
  DATA_FIELD_SPEC_RE = /\A([a-zA-Z0-9]{3})(\|([a-z0-9\ \*]{2})\|)?([a-z0-9]*)?\Z/
  # Control field spec: tag plus a byte or inclusive byte range, e.g. "005[4-5]".
  CONTROL_FIELD_SPEC_RE = /\A([a-zA-Z0-9]{3})(\[(\d+)(-(\d+))?\])\Z/

  attr_accessor :options, :spec_hash

  # First arg is a specification for extraction of data from a MARC record.
  # Specification can be given in two forms:
  #
  # * a string specification like "008[35]:020a:245abc" (or an array of
  #   such strings). A string specification is the most typical argument.
  # * The output of a previous call to MarcExtractor.parse_string_spec(string_spec),
  #   a 'pre-parsed' specification (a Hash).
  #
  # Second arg is options:
  #
  # [:separator]  default ' ' (space), what to use to separate
  #               subfield values when joining strings
  #
  # [:alternate_script] default :include, include linked 880s for tags
  #                     that match spec. Also:
  #                     * false => do not include.
  #                     * :only => only include linked 880s, not original
  #
  # The new object (and its options and spec hash) is frozen before
  # initialize returns.
  def initialize(spec, options = {})
    self.options = {
      :separator => ' ',
      :alternate_script => :include
    }.merge(options)

    self.spec_hash = spec.kind_of?(Hash) ? spec : self.class.parse_string_spec(spec)

    # Read the setting from the merged options so the documented default applies.
    alternate_script = self.options[:alternate_script]

    # Tags are "interesting" if we have a spec that might cover it
    @interesting_tags_hash = {}

    # By default, interesting tags are those represented by keys in spec_hash.
    # Add them unless we only care about alternate scripts.
    unless alternate_script == :only
      spec_hash.each_key { |tag| @interesting_tags_hash[tag] = true }
    end

    # If we *are* interested in alternate scripts, add the 880
    @interesting_tags_hash['880'] = true if alternate_script != false

    freeze
  end

  # Takes the same arguments as MarcExtractor.new, but will re-use an existing
  # cached MarcExtractor already created with given initialization arguments,
  # if available.
  #
  # This can be used to increase performance of indexing routines, as
  # MarcExtractor creation has been shown via profiling/benchmarking
  # to be expensive.
  #
  # The cache is stored in Thread.current, keyed by the argument list.
  #
  # You should _not_ modify the state of any MarcExtractor retrieved
  # via cached, as the MarcExtractor will be re-used. We use ruby #freeze
  # to keep you from doing so, although if you try hard enough you can
  # surely find a way to do something you shouldn't.
  #
  #     extractor = MarcExtractor.cached("245abc:700a", :separator => nil)
  def self.cached(*args)
    cache = (Thread.current[:marc_extractor_cached] ||= Hash.new)
    cache[args] ||= Traject::MarcExtractor.new(*args).freeze
  end

  # Check to see if a tag is interesting (meaning it may be covered by a spec
  # and the passed-in options about alternate scripts)
  def interesting_tag?(tag)
    @interesting_tags_hash.include?(tag)
  end

  # Converts from a string marc spec like "008[35]:245abc:700a" (or an array
  # of such strings) to a hash used internally to represent the specification.
  # Individual specs are separated by colons, optionally surrounded by whitespace.
  #
  # ## Return value
  #
  # The hash returned is keyed by tag, and has as values an array of one
  # or more MarcExtractor::Spec objects representing the specified extraction
  # operations for that tag. Tags that were not mentioned have no key.
  #
  # It's an array of possibly more than one, because you can specify
  # multiple extractions on the same tag: for instance "245a:245abc"
  #
  # Raises ArgumentError if any part is not a recognized specification.
  def self.parse_string_spec(spec_string)
    hash = {}

    sources = spec_string.is_a?(Array) ? spec_string : [spec_string]
    # Split on colons with optional surrounding whitespace. (The pattern must
    # be \s*, not a literal s*, or a trailing "s" subfield code is swallowed.)
    spec_strings = sources.flat_map { |s| s.split(/\s*:\s*/) }

    spec_strings.each do |part|
      if (match = DATA_FIELD_SPEC_RE.match(part))
        # variable field
        indicators = match[3]
        subfields = match[4]

        spec = Spec.new(:tag => match[1])

        spec.subfields = subfields.split('') if subfields && !subfields.empty?

        if indicators
          # if specified as '*', leave nil
          spec.indicator1 = indicators[0] if indicators[0] != "*"
          spec.indicator2 = indicators[1] if indicators[1] != "*"
        end
      elsif (match = CONTROL_FIELD_SPEC_RE.match(part))
        # control field, "005[4-5]"
        byte1 = match[3]
        byte2 = match[5]

        spec = Spec.new(:tag => match[1])
        # A range when both ends are given, otherwise a single byte offset.
        spec.bytes = byte2 ? ((byte1.to_i)..(byte2.to_i)) : byte1.to_i
      else
        raise ArgumentError.new("Unrecognized marc extract specification: #{part}")
      end

      (hash[spec.tag] ||= []) << spec
    end

    hash
  end

  # Returns array of strings, extracted values. Maybe empty array.
  #
  # Control fields contribute their value (or the byte slice named by the
  # spec); other fields contribute the result of #collect_subfields.
  def extract(marc_record)
    results = []

    each_matching_line(marc_record) do |field, spec|
      if control_field?(field)
        results << (spec.bytes ? field.value.byteslice(spec.bytes) : field.value)
      else
        results.concat collect_subfields(field, spec)
      end
    end

    results
  end

  # Yields a block for every line in source record that matches
  # spec. First arg to block is MARC::DataField or ControlField, second
  # is the MarcExtractor::Spec that it matched on. May take account
  # of options such as :alternate_script
  #
  # Third (optional) arg to block is self, the MarcExtractor object, useful for custom
  # implementations.
  def each_matching_line(marc_record)
    marc_record.fields(@interesting_tags_hash.keys).each do |field|
      # Make sure it matches indicators too, specs_covering_field
      # doesn't check that.
      specs_covering_field(field).each do |spec|
        yield(field, spec, self) if spec.matches_indicators?(field)
      end
    end
  end

  # Like each_matching_line, takes a block to process each matching line,
  # but collects results of block into an array -- flattens any subarrays for you!
  #
  # Useful for re-use of this class for custom processing
  #
  # yields the MARC Field, the MarcExtractor::Spec object, the MarcExtractor object.
  def collect_matching_lines(marc_record)
    results = []
    each_matching_line(marc_record) do |field, spec, extractor|
      results.concat [yield(field, spec, extractor)].flatten
    end
    results
  end

  # Pass in a marc data field and a Spec object with extraction
  # instructions, returns an ARRAY of one or more strings, subfields extracted
  # and processed per spec. Takes account of options such
  # as :separator
  #
  # Always returns array, sometimes empty array.
  def collect_subfields(field, spec)
    subfields = field.subfields.collect do |subfield|
      subfield.value if spec.includes_subfield_code?(subfield.code)
    end.compact

    return subfields if subfields.empty? # empty array, just return it.

    if options[:separator] && spec.joinable?
      subfields = [subfields.join(options[:separator])]
    end

    subfields
  end

  # Find Spec objects, if any, covering extraction from this field.
  # Returns an array of 0 or more MarcExtractor::Spec objects
  #
  # When given an 880 that has a $6, the first three bytes of the $6 are
  # used as the tag to look up instead of "880".
  #
  # Returns an empty array in case of no matching extraction specs.
  def specs_covering_field(field)
    tag = field.tag

    # Short-circuit the uninteresting stuff
    return [] unless interesting_tag?(tag)

    # Due to bug in jruby https://github.com/jruby/jruby/issues/886 , we need
    # to do this weird encode gymnastics, which fixes it for mysterious reasons.
    if tag == "880" && field['6']
      tag = field["6"].encode(field["6"].encoding).byteslice(0, 3)
    end

    # Specs registered for the resulting tag, or an empty array if none.
    spec_hash[tag] || []
  end

  # True if field is a MARC::ControlField.
  def control_field?(field)
    # should the MARC gem have a more efficient way to do this,
    # define #control_field? on both ControlField and DataField?
    field.kind_of? MARC::ControlField
  end

  # Freezes the options, the spec hash and the internal tag lookup along
  # with the extractor itself.
  def freeze
    options.freeze
    spec_hash.freeze
    @interesting_tags_hash.freeze
    super
  end

  # Represents a single specification for extracting data
  # from a marc field, like "600abc" or "600|1*|x".
  #
  # Includes the tag for reference, although this is redundant and not actually used
  # in logic, since the tag is also implicit in the overall spec_hash
  # with tag => [spec1, spec2]
  class Spec
    attr_accessor :tag, :subfields, :indicator1, :indicator2, :bytes

    # hash - attribute name => value pairs; each is assigned through the
    #        matching setter.
    def initialize(hash = {})
      hash.each_pair { |key, value| send("#{key}=", value) }
    end

    # Should subfields extracted be joined, if we have a separator?
    # * '630' no subfields specified => join all subfields
    # * '630abc' multiple subfields specified = join all subfields
    # * '633a' one subfield => do not join, return one value for each $a in the field
    # * '633aa' one subfield, doubled => do join after all, will return a single string joining all the values of all the $a's.
    #
    # Last case is handled implicitly at the moment when subfields == ['a', 'a']
    def joinable?
      subfields.nil? || subfields.size != 1
    end

    # Pass in a MARC field, do its indicators match indicators
    # in this spec? nil indicators in spec mean we don't care, everything
    # matches.
    def matches_indicators?(field)
      (indicator1.nil? || indicator1 == field.indicator1) &&
        (indicator2.nil? || indicator2 == field.indicator2)
    end

    # Pass in a string subfield code like 'a'; does this
    # spec include it?
    def includes_subfield_code?(code)
      # subfields nil means include them all
      subfields.nil? || subfields.include?(code)
    end

    # Two specs are equal when tag, subfields, both indicators and bytes
    # are all equal. Anything that is not a Spec is never equal.
    def ==(spec)
      return false unless spec.kind_of?(Spec)

      (tag == spec.tag) &&
        (subfields == spec.subfields) &&
        (indicator1 == spec.indicator1) &&
        (indicator2 == spec.indicator2) &&
        (bytes == spec.bytes)
    end
  end
end
410
+ end