traject 0.0.2 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. data/Gemfile +4 -0
  2. data/README.md +85 -61
  3. data/Rakefile +5 -0
  4. data/bin/traject +31 -3
  5. data/doc/settings.md +74 -13
  6. data/lib/tasks/load_maps.rake +48 -0
  7. data/lib/traject/indexer/settings.rb +75 -0
  8. data/lib/traject/indexer.rb +255 -45
  9. data/lib/traject/json_writer.rb +4 -2
  10. data/lib/traject/macros/marc21.rb +18 -6
  11. data/lib/traject/macros/marc21_semantics.rb +405 -0
  12. data/lib/traject/macros/marc_format_classifier.rb +180 -0
  13. data/lib/traject/marc4j_reader.rb +160 -0
  14. data/lib/traject/marc_extractor.rb +33 -17
  15. data/lib/traject/marc_reader.rb +14 -11
  16. data/lib/traject/solrj_writer.rb +247 -9
  17. data/lib/traject/thread_pool.rb +154 -0
  18. data/lib/traject/translation_map.rb +46 -4
  19. data/lib/traject/util.rb +30 -0
  20. data/lib/traject/version.rb +1 -1
  21. data/lib/translation_maps/lcc_top_level.yaml +26 -0
  22. data/lib/translation_maps/marc_genre_007.yaml +9 -0
  23. data/lib/translation_maps/marc_genre_leader.yaml +22 -0
  24. data/lib/translation_maps/marc_geographic.yaml +589 -0
  25. data/lib/translation_maps/marc_instruments.yaml +102 -0
  26. data/lib/translation_maps/marc_languages.yaml +490 -0
  27. data/test/indexer/each_record_test.rb +34 -0
  28. data/test/indexer/macros_marc21_semantics_test.rb +206 -0
  29. data/test/indexer/macros_marc21_test.rb +10 -1
  30. data/test/indexer/map_record_test.rb +78 -8
  31. data/test/indexer/read_write_test.rb +43 -10
  32. data/test/indexer/settings_test.rb +60 -4
  33. data/test/indexer/to_field_test.rb +39 -0
  34. data/test/marc4j_reader_test.rb +75 -0
  35. data/test/marc_extractor_test.rb +62 -0
  36. data/test/marc_format_classifier_test.rb +91 -0
  37. data/test/marc_reader_test.rb +12 -0
  38. data/test/solrj_writer_test.rb +146 -43
  39. data/test/test_helper.rb +50 -0
  40. data/test/test_support/245_no_ab.marc +1 -0
  41. data/test/test_support/880_with_no_6.utf8.marc +1 -0
  42. data/test/test_support/bad_subfield_code.marc +1 -0
  43. data/test/test_support/date_resort_to_260.marc +1 -0
  44. data/test/test_support/date_type_r_missing_date2.marc +1 -0
  45. data/test/test_support/date_with_u.marc +1 -0
  46. data/test/test_support/demo_config.rb +153 -0
  47. data/test/test_support/emptyish_record.marc +1 -0
  48. data/test/test_support/louis_armstrong.marc +1 -0
  49. data/test/test_support/manuscript_online_thesis.marc +1 -0
  50. data/test/test_support/microform_online_conference.marc +1 -0
  51. data/test/test_support/multi_era.marc +1 -0
  52. data/test/test_support/multi_geo.marc +1 -0
  53. data/test/test_support/musical_cage.marc +1 -0
  54. data/test/test_support/one-marc8.mrc +1 -0
  55. data/test/test_support/online_only.marc +1 -0
  56. data/test/test_support/packed_041a_lang.marc +1 -0
  57. data/test/test_support/the_business_ren.marc +1 -0
  58. data/test/translation_map_test.rb +8 -0
  59. data/test/translation_maps/properties_map.properties +5 -0
  60. data/traject.gemspec +1 -1
  61. data/vendor/marc4j/README.md +17 -0
  62. data/vendor/marc4j/lib/marc4j-2.5.1-beta.jar +0 -0
  63. metadata +81 -2
@@ -0,0 +1,405 @@
1
+ require 'traject/marc_extractor'
2
+
3
+ module Traject::Macros
4
+ # extracting various semantic parts out of a Marc21 record. Few of these
5
+ # come directly from Marc21 spec or other specs with no judgement, they
6
+ # are all to some extent opinionated, based on actual practice and actual
7
+ # data, some more than others. If it doesn't do what you want, don't use it.
8
+ # But if it does, you can use it, and continue to get updates with future
9
+ # versions of Traject.
10
+ module Marc21Semantics
11
+ # shortcut
12
+ MarcExtractor = Traject::MarcExtractor
13
+
14
+ # Extract OCLC numbers from, by default 035a's, then strip known prefixes to get
15
+ # just the num, and de-dup.
16
+ def oclcnum(extract_fields = "035a")
17
+ lambda do |record, accumulator|
18
+ list = MarcExtractor.extract_by_spec(record, extract_fields, :seperator => nil).collect! do |o|
19
+ Marc21Semantics.oclcnum_trim(o)
20
+ end
21
+
22
+ accumulator.concat list.uniq if list
23
+ end
24
+ end
25
+ def self.oclcnum_trim(num)
26
+ num.gsub(/\A(ocm)|(ocn)|(on)|(\(OCoLC\))/, '')
27
+ end
28
+
29
+
30
+ # A sortable author value, created by concatenating:
31
+ # * the main entry author, if there is one (fields 100, 110 or 111)
32
+ # * the main entry uniform title (240), if there is one - not including non-filing chars as noted in 2nd indicator of the 240
33
+ # * If no 240, the 245 title, not including non-filing chars as noted in ind 2 of the 245
34
+ #
35
+ # Always returns a SINGLE string, based on concatenation.
36
+ #
37
+ # Thanks SolrMarc for basic logic.
38
+ #
39
+ # Note: You'll want to pay attention to the Solr schema field definition
40
+ # you're using, and have it do case-insensitivity or any other normalization
41
+ # you might want.
42
+ #
43
+ # these probably should be taking only certain subfields, but we're copying
44
+ # from SolrMarc that didn't do so either and nobody noticed, so not bothering for now.
45
+ def marc_sortable_author
46
+ lambda do |record, accumulator|
47
+ accumulator << Marc21Semantics.get_sortable_author(record)
48
+ end
49
+ end
50
+ def self.get_sortable_author(record)
51
+ onexx = MarcExtractor.extract_by_spec(record, "100:110:111", :first => true).first
52
+ onexx = onexx.strip if onexx
53
+
54
+ titles = []
55
+ MarcExtractor.new(record, "240:245", :first => true).each_matching_line do |field, spec|
56
+ non_filing = field.indicator2.to_i
57
+
58
+ str = field.subfields.collect {|sf| sf.value}.join(" ")
59
+ str = str.slice(non_filing, str.length)
60
+ titles << str
61
+ end.first
62
+ title = titles.first
63
+ title = title.strip if title
64
+
65
+ return "#{onexx}#{title}"
66
+ end
67
+
68
+
69
+ # 245 a and b, with non-filing characters stripped off
70
+ def marc_sortable_title
71
+ lambda do |record, accumulator|
72
+ accumulator << Marc21Semantics.get_sortable_title(record)
73
+ end
74
+ end
75
+ def self.get_sortable_title(record)
76
+ MarcExtractor.new(record, "245ab").collect_matching_lines do |field, spec, extractor|
77
+ str = extractor.collect_subfields(field, spec).first
78
+
79
+ if str.nil?
80
+ # maybe an APPM archival record with only a 'k'
81
+ str = field['k']
82
+ end
83
+ if str.nil?
84
+ # still? All we can do is bail, I guess
85
+ return nil
86
+ end
87
+
88
+ non_filing = field.indicator2.to_i
89
+ str = str.slice(non_filing, str.length)
90
+ str = Marc21.trim_punctuation(str)
91
+
92
+ str
93
+ end.first
94
+ end
95
+
96
+ # maps languages, by default out of 008[35-37] and 041a and 041d
97
+ #
98
+ # Can specify other spec if you want, say, 041b (lang of abstract)
99
+ # or 041e (lang of librettos), or 041h (lang of original) instead or in addition.
100
+ #
101
+ # de-dups values so you don't get the same one twice.
102
+ #
103
+ # Exact spec of #marc_languages may change with new user data on what
104
+ # works best.
105
+ def marc_languages(spec = "008[35-37]:041a:041d")
106
+ translation_map = Traject::TranslationMap.new("marc_languages")
107
+
108
+ lambda do |record, accumulator|
109
+ codes = MarcExtractor.new(record, spec, :seperator => "nil").collect_matching_lines do |field, spec, extractor|
110
+ if extractor.control_field?(field)
111
+ (spec[:bytes] ? field.value.byteslice(spec[:bytes]) : field.value)
112
+ else
113
+ extractor.collect_subfields(field, spec).collect do |value|
114
+ # sometimes multiple language codes are jammed together in one subfield, and
115
+ # we need to separate them ourselves. sigh.
116
+ unless value.length == 3
117
+ value = value.scan(/.{1,3}/) # split into an array of 3-length substrs
118
+ end
119
+ value
120
+ end.flatten
121
+ end
122
+ end
123
+ codes = codes.uniq
124
+
125
+ translation_map.translate_array!(codes)
126
+
127
+ accumulator.concat codes
128
+ end
129
+ end
130
+
131
+ # Adds in marc fields in spec (default is recommended series spec, but you can specify your own)
132
+ # -- only trick is that 490's are skipped of first indicator is 1 -- if 490 first
133
+ # indicator is "1", "series traced", that means the series title mentioned here is
134
+ # already covered by another field we're including, so we don't want to double count it, possibly
135
+ # with slight variation.
136
+ def marc_series_facet(spec = "440a:490a:800abcdt:810abcdt:811acdeft:830adfgklmnoprst")
137
+ lambda do |record, accumulator|
138
+ MarcExtractor.new(record, spec).collect_matching_lines do |field, spec, extractor|
139
+ extractor.collect_subfields(field, spec) unless (field.tag == "490" && field.indicator1 == "1")
140
+ end
141
+ end
142
+ end
143
+
144
+
145
+ # Takes marc 048ab instrument code, and translates it to human-displayable
146
+ # string. Takes first two chars of 048a or b, to translate (ignores numeric code)
147
+ #
148
+ # Pass in custom spec if you want just a or b, to separate soloists or whatever.
149
+ def marc_instrumentation_humanized(spec = "048ab", options = {})
150
+ translation_map = Traject::TranslationMap.new(options[:translation_map] || "marc_instruments")
151
+
152
+ lambda do |record, accumulator|
153
+ values = Traject::MarcExtractor.extract_by_spec(record, spec, :seperator => nil)
154
+ human = values.collect do |value|
155
+ translation_map[ value.slice(0, 2) ]
156
+ end.uniq
157
+ accumulator.concat human if human && human.length > 0
158
+ end
159
+ end
160
+
161
+ # This weird one actually returns marc instrumentation codes, not
162
+ # humanized. But it normalizes them by breaking them down into a numeric and non-numeric
163
+ # version. For instance "ba01" will be indexed as both "ba01" and "ba".
164
+ # ALSO, if the code is in a subfield b (soloist), it'll be indexed
165
+ # _additionally_ as "ba01.s" and "ba.s".
166
+ #
167
+ # This has proven useful for expert music librarian searching by hand; it could
168
+ # also be the basis of a GUI that executes searches behind the scenes for these
169
+ # codes.
170
+ def marc_instrument_codes_normalized(spec = "048")
171
+ soloist_suffix = ".s"
172
+ return lambda do |record, accumulator|
173
+ accumulator.concat(
174
+ MarcExtractor.new(record, "048", :seperator => nil).collect_matching_lines do |field, spec, extractor|
175
+ values = []
176
+
177
+ field.subfields.each do |sf|
178
+ v = sf.value
179
+ # Unless there's at least two chars, it's malformed, we can
180
+ # do nothing
181
+ next unless v.length >= 2
182
+
183
+ # Index both with and without number -- both with soloist suffix
184
+ # if in a $b
185
+ values << v
186
+ values << "#{v}#{soloist_suffix}" if sf.code == 'b'
187
+ if v.length >= 4
188
+ bare = v.slice(0,2) # just the prefix
189
+ values << bare
190
+ values << "#{bare}#{soloist_suffix}" if sf.code == 'b'
191
+ end
192
+ end
193
+ values
194
+ end.uniq
195
+ )
196
+ end
197
+ end
198
+
199
+ # An opinionated algorithm for getting a SINGLE publication date out of marc
200
+ #
201
+ # * Prefers using 008, but will resort to 260c
202
+ # * If 008 represents a date range, will take the midpoint of the range,
203
+ # only if range is smaller than estimate_tolerance, default 15 years.
204
+ # * Ignores dates below min_year (default 500) or above max_year (this year plus 6 years),
205
+ # because experience shows too many of these were in error.
206
+ #
207
+ # Yeah, this code ends up ridiculous.
208
+ def marc_publication_date(options = {})
209
+ estimate_tolerance = options[:estimate_tolerance] || 15
210
+ min_year = options[:min_year] || 500
211
+ max_year = options[:max_year] || (Time.new.year + 6)
212
+
213
+ lambda do |record, accumulator|
214
+ date = Marc21Semantics.publication_date(record, estimate_tolerance, min_year, max_year)
215
+ accumulator << date if date
216
+ end
217
+ end
218
+
219
+ # See #marc_publication_date. Yeah, this is a holy mess.
220
+ # Maybe it should actually be extracted to its own class!
221
+ def self.publication_date(record, estimate_tolerance = 15, min_year = 500, max_year = (Time.new.year + 6))
222
+ field008 = MarcExtractor.extract_by_spec(record, "008").first
223
+ found_date = nil
224
+
225
+ if field008 && field008.length >= 11
226
+ date_type = field008.slice(6)
227
+ date1_str = field008.slice(7,4)
228
+ date2_str = field008.slice(11, 4) if field008.length > 15
229
+
230
+ # for date_type q=questionable, we have a range.
231
+ if (date_type == 'q')
232
+ # make unknown digits at the beginning or end of range,
233
+ date1 = date1_str.sub("u", "0").to_i
234
+ date2 = date2_str.sub("u", "9").to_i
235
+ # do we have a range we can use?
236
+ if (date2 > date1) && ((date2 - date1) <= estimate_tolerance)
237
+ found_date = (date2 + date1)/2
238
+ end
239
+ end
240
+ # didn't find a date that way, and anything OTHER than date_type
241
+ # n=unknown, q=questionable, try single date -- for some date types,
242
+ # there's a date range between date1 and date2, yeah, we often take
243
+ # the FIRST date then, the earliest. That's just what we're doing.
244
+ if found_date.nil? && date_type != 'n' && date_type != 'q'
245
+ # in date_type 'r', second date is original publication date, use that I think?
246
+ date_str = (date_type == 'r' && date2_str.to_i != 0) ? date2_str : date1_str
247
+ # Deal with stupid 'u's, which end up meaning a range too,
248
+ # find midpoint and make sure our tolerance is okay.
249
+ ucount = 0
250
+ while (!date_str.nil?) && (i = date_str.index('u'))
251
+ ucount += 1
252
+ date_str[i] = "0"
253
+ end
254
+ date = date_str.to_i
255
+ if ucount > 0 && date != 0
256
+ delta = 10 ** ucount # 10^ucount, exponent
257
+ if delta <= estimate_tolerance
258
+ found_date = date + (delta/2)
259
+ end
260
+ elsif date != 0
261
+ found_date = date
262
+ end
263
+ end
264
+ end
265
+ # Okay, nothing from 008, try 260
266
+ if found_date.nil?
267
+ v260c = MarcExtractor.extract_by_spec(record, "260c", :seperator => nil).first
268
+ # just try to take the first four digits out of there, we're not going to try
269
+ # anything crazy.
270
+ if v260c =~ /(\d{4})/
271
+ found_date = $1.to_i
272
+ end
273
+ end
274
+
275
+ # is it within our acceptable range?
276
+ found_date = nil if found_date && (found_date < min_year || found_date > max_year)
277
+
278
+ return found_date
279
+ end
280
+
281
+ # Looks up Library of Congress Classification (LCC) or NLM Medical Subject Headings (MeSH)
282
+ # from usual parts of the marc record. Maps them to high-level broad categories,
283
+ # basically just using the first part of the LCC. Note it's just looking in bib-level
284
+ # locations for LCCs, you're on your own with holdings.
285
+ #
286
+ # Sanity checks to make sure the thing looks like an LCC with a regex, before
287
+ # mapping.
288
+ #
289
+ # Will call it 'Unknown' if it's got nothing else, or pass in :default => something else,
290
+ # or nil.
291
+ #
292
+ # The categories output aren't great, but they're something.
293
+ LCC_REGEX = / *[A-Z]{1,3}[ .]*(?:(\d+)(?:\s*?\.\s*?(\d+))?).*/
294
+ def marc_lcc_to_broad_category( options = {}, spec="050a:060a:090a:096a")
295
+ # Trying to match things that look like LCC, and not match things
296
+ # that don't. Is tricky.
297
+ lcc_regex = LCC_REGEX
298
+ default_value = options.has_key?(:default) ? options[:default] : "Unknown"
299
+ translation_map = Traject::TranslationMap.new("lcc_top_level")
300
+
301
+ lambda do |record, accumulator|
302
+ candidates = MarcExtractor.extract_by_spec(record, spec, :seperator => nil)
303
+
304
+ candidates.reject! do |candidate|
305
+ !(candidate =~ lcc_regex)
306
+ end
307
+
308
+ accumulator.concat translation_map.translate_array!(candidates.collect {|a| a.lstrip.slice(0, 1)}).uniq
309
+
310
+ if default_value && accumulator.empty?
311
+ accumulator << default_value
312
+ end
313
+ end
314
+ end
315
+
316
+ # An opinionated method of making a geographic facet out of BOTH 048 marc
317
+ # codes, AND geo subdivisions in 6xx LCSH subjects.
318
+ #
319
+ # The LCSH geo subdivisions are further normalized:
320
+ # * geo qualifiers in $z fields into parens, so "Germany -- Berlin" becomes "Berlin (Germany)"
321
+ # (to be consistent with how same areas are written in $a fields -- doesn't
322
+ # get everything, but gets lots of em)
323
+ # * qualified regions like that are additionally 'posted up', so "Germany -- Berlin" gets
324
+ # recorded additionally as "Germany"
325
+ def marc_geo_facet(options = {})
326
+ marc_geo_map = Traject::TranslationMap.new("marc_geographic")
327
+
328
+ a_fields_spec = options[:geo_a_fields] || "651a:691a"
329
+ z_fields_spec = options[:geo_z_fields] || "600:610:611:630:648:650:654:655:656:690:651:691"
330
+
331
+ lambda do |record, accumulator|
332
+
333
+ accumulator.concat(
334
+ MarcExtractor.extract_by_spec(record, "043a", :seperator => nil).collect do |code|
335
+ # remove any trailing hyphens, then map
336
+ marc_geo_map[code.gsub(/\-+\Z/, '')]
337
+ end.compact
338
+ )
339
+
340
+ #LCSH 651a and 691a go in more or less normally.
341
+ accumulator.concat(
342
+ MarcExtractor.extract_by_spec(record, a_fields_spec, :seperator => nil).collect do |s|
343
+ # remove trailing periods, which they sometimes have if they were
344
+ # at end of LCSH.
345
+ s.sub(/\. */, '')
346
+ end
347
+ )
348
+
349
+ # fields we take z's from have a bit more normalization
350
+ MarcExtractor.new(record, z_fields_spec).each_matching_line do |field, spec, extractor|
351
+ z_fields = field.subfields.find_all {|sf| sf.code == "z"}.collect {|sf| sf.value }
352
+ # depending on position in total field, may be a period on the end
353
+ # we want to remove.
354
+ z_fields.collect! {|s| s.gsub(/\. *\Z/, '')}
355
+
356
+ if z_fields.length == 2
357
+ # normalize subdivision as parenthetical
358
+ accumulator << "#{z_fields[1]} (#{z_fields[0]})"
359
+ # and 'post up'
360
+ accumulator << z_fields[0]
361
+ else
362
+ # just add all the z's if there's 1 or more than 2.
363
+ accumulator.concat z_fields
364
+ end
365
+ end
366
+ end
367
+ end
368
+
369
+ # Opinionated routine to create values for a chronology/era facet out of
370
+ # LCSH chron subdivisions. Does some normalization:
371
+ # for 651 with a chron facet fitting the form
372
+ # "aaaaa, yyyy-yyyy", it will add in the $a. For instance:
373
+ # 651 a| United States x| History y| Civil War, 1861-1865
374
+ # --> "United States: Civil War, 1861-1865"
375
+ def marc_era_facet
376
+ ordinary_fields_spec = "600y:610y:611y:630y:648ay:650y:654y:656y:690y"
377
+ special_fields_spec = "651:691"
378
+ seperator = ": "
379
+ lambda do |record, accumulator|
380
+ # straightforward ones
381
+
382
+
383
+ accumulator.concat( MarcExtractor.extract_by_spec(record, ordinary_fields_spec).collect do |v|
384
+ # May have a period we have to remove, if it was at end of tag
385
+ v.sub(/\. *\Z/, '')
386
+ end)
387
+
388
+ # weird ones
389
+ MarcExtractor.new(record, special_fields_spec).each_matching_line do |field, spec, extractor|
390
+ field.subfields.each do |sf|
391
+ next unless sf.code == 'y'
392
+ if sf.value =~ /\A\s*.+,\s+(ca.\s+)?\d\d\d\d?(-\d\d\d\d?)?( B\.C\.)?[.,; ]*\Z/
393
+ # it's our pattern, add the $a in please
394
+ accumulator << "#{field['a']}#{seperator}#{sf.value.sub(/\. *\Z/, '')}"
395
+ else
396
+ accumulator << sf.value.sub(/\. *\Z/, '')
397
+ end
398
+ end
399
+ end
400
+ end
401
+ end
402
+
403
+
404
+ end
405
+ end
@@ -0,0 +1,180 @@
1
+ module Traject
2
+ module Macros
3
+ # See MarcFormatClassifier class
4
+ module MarcFormats
5
+ # very opionated macro that just adds a grab bag of format/genre/types
6
+ # into one field. You may want ot build your own from MarcFormatClassifier functions instead.
7
+ def marc_formats
8
+ lambda do |record, accumulator|
9
+ accumulator.concat Traject::Macros::MarcFormatClassifier.new(record).formats
10
+ end
11
+ end
12
+ end
13
+
14
+
15
+ # Not actually a macro, but we're keeping it here for now,
16
+ # a class for classifying marc according to format/genre/type.
17
+ #
18
+ # VERY opinionated.
19
+ class MarcFormatClassifier
20
+ attr_reader :record
21
+
22
+ def initialize(marc_record)
23
+ @record = marc_record
24
+ end
25
+
26
+ # A very opinionated method that just kind of jams together
27
+ # all the possible format/genre/types into one array of 1 to N elements.
28
+ #
29
+ # Default "Other" will be used
30
+ def formats(options = {})
31
+ options = {:default => "Other"}.merge(options)
32
+
33
+ formats = []
34
+
35
+ formats.concat genre
36
+
37
+ formats << "Manuscript/Archive" if manuscript_archive?
38
+ formats << "Microform" if microform?
39
+ formats << "Online" if online?
40
+
41
+ # In our own data, if it's an audio recording, it might show up
42
+ # as print, but it's probably not.
43
+ formats << "Print" if print? && ! (formats.include?("Non-musical Recording") || formats.include?("Musical Recording"))
44
+
45
+ # If it's a Dissertation, we decide it's NOT a book
46
+ if thesis?
47
+ formats.delete("Book")
48
+ formats << "Dissertation/Thesis"
49
+ end
50
+
51
+ if proceeding?
52
+ formats << "Conference"
53
+ end
54
+
55
+ if formats.empty?
56
+ formats << options[:default]
57
+ end
58
+
59
+ return formats
60
+ end
61
+
62
+
63
+
64
+ # Returns 1 or more values in an array from:
65
+ # Book; Journal/Newspaper; Musical Score; Map/Globe; Non-musical Recording; Musical Recording
66
+ # Image; Software/Data; Video/Film
67
+ #
68
+ # Uses leader byte 6, leader byte 7, and 007 byte 0.
69
+ #
70
+ # Gets actual labels from marc_genre_leader and marc_genre_007 translation maps,
71
+ # so you can customize labels if you want.
72
+ def genre
73
+ marc_genre_leader = Traject::TranslationMap.new("marc_genre_leader")
74
+ marc_genre_007 = Traject::TranslationMap.new("marc_genre_007")
75
+
76
+ results = marc_genre_leader[ record.leader.slice(6,2) ] ||
77
+ marc_genre_leader[ record.leader.slice(6)] ||
78
+ record.find_all {|f| f.tag == "007"}.collect {|f| marc_genre_007[f.value.slice(0)]}
79
+
80
+ [results].flatten
81
+ end
82
+
83
+ # Just checks if it has a 502, if it does it's considered a thesis
84
+ def thesis?
85
+ @thesis_q ||= begin
86
+ ! record.find {|a| a.tag == "502"}.nil?
87
+ end
88
+ end
89
+
90
+ # Just checks all $6xx for a $v "Congresses"
91
+ def proceeding?
92
+ @proceeding_q ||= begin
93
+ ! record.find do |field|
94
+ field.tag.slice(0) == '6' && field.subfields.find {|sf| sf.code == "v" && sf.value =~ /^\s*(C|c)ongresses\.?\s*$/}
95
+ end.nil?
96
+ end
97
+ end
98
+
99
+ # Algorithm with help from Chris Case.
100
+ # * If it has any RDA 338, then it's print if it has a value of
101
+ # volume, sheet, or card.
102
+ # * If it does not have an RDA 338, it's print if and only if it has
103
+ # NO 245$h GMD.
104
+ #
105
+ # * Here at JH, for legacy reasons we also choose to not
106
+ # call it print if it's already been marked audio, but
107
+ # we do that in a different method.
108
+ #
109
+ # This algorithm is definitely going to get some things wrong in
110
+ # both directions, with real world data. But seems to be good enough.
111
+ def print?
112
+
113
+
114
+ rda338 = record.find_all do |field|
115
+ field.tag == "338" && field['2'] == "rdacarrier"
116
+ end
117
+
118
+ if rda338.length > 0
119
+ rda338.find do |field|
120
+ field.subfields.find do |sf|
121
+ (sf.code == "a" && %w{volume card sheet}.include?(sf.value)) ||
122
+ (sf.code == "b" && %w{nc no nb}.include?(sf.value))
123
+ end
124
+ end
125
+ else
126
+ normalized_gmd.length == 0
127
+ end
128
+ end
129
+
130
+ # We use marc 007 to determine if this represents an online
131
+ # resource. But sometimes resort to 245$h GMD too.
132
+ def online?
133
+ # field 007, byte 0 c="electronic" byte 1 r="remote" ==> sure Online
134
+ found_007 = record.find do |field|
135
+ field.tag == "007" && field.value.slice(0) == "c" && field.value.slice(1) == "r"
136
+ end
137
+
138
+ return true if found_007
139
+
140
+ # Otherwise, if it has a GMD ["electronic resource"], we count it
141
+ # as online only if NO 007[0] == 'c' exists, cause if it does we already
142
+ # know it's electronic but not remote, otherwise first try would
143
+ # have found it.
144
+ return (normalized_gmd.start_with? "[electronic resource]") && ! record.find {|f| f.tag == '007' && f.value.slice(0) == "c"}
145
+ end
146
+
147
+ # if field 007 byte 0 is 'h', that's microform. But many of our microform
148
+ # don't have that. If leader byte 6 is 'h', that's an obsolete way of saying
149
+ # microform. And finally, if GMD is
150
+ def microform?
151
+ normalized_gmd.start_with?("[microform]") ||
152
+ record.leader['6'] == "h" ||
153
+ record.find {|f| (f.tag == "007") && (f.value['0'] == "h")}
154
+ end
155
+
156
+ # Marked as manuscript OR archive.
157
+ def manuscript_archive?
158
+ leader06 = record.leader.slice(6)
159
+ leader08 = record.leader.slice(8)
160
+
161
+ # leader 6 t=Manuscript Language Material, d=Manuscript Music,
162
+ # f=Manuscript Cartograhpic
163
+ #
164
+ # leader 06 = 'b' is obsolete, but if it exists it means archival countrl
165
+ #
166
+ # leader 08 'a'='archival control'
167
+ %w{t d f b}.include?(leader06) || leader08 == "a"
168
+ end
169
+
170
+ # downcased version of the gmd, or else empty string
171
+ def normalized_gmd
172
+ @gmd ||= begin
173
+ ((a245 = record['245']) && a245['h'] && a245['h'].downcase) || ""
174
+ end
175
+ end
176
+
177
+
178
+ end
179
+ end
180
+ end