traject 2.0.0-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (104) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.travis.yml +27 -0
  4. data/.yardopts +3 -0
  5. data/Gemfile +12 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +461 -0
  8. data/Rakefile +21 -0
  9. data/bench/bench.rb +30 -0
  10. data/bin/traject +16 -0
  11. data/doc/batch_execution.md +243 -0
  12. data/doc/extending.md +190 -0
  13. data/doc/indexing_rules.md +265 -0
  14. data/doc/other_commands.md +47 -0
  15. data/doc/settings.md +101 -0
  16. data/lib/tasks/load_maps.rake +48 -0
  17. data/lib/traject.rb +11 -0
  18. data/lib/traject/command_line.rb +301 -0
  19. data/lib/traject/csv_writer.rb +34 -0
  20. data/lib/traject/debug_writer.rb +47 -0
  21. data/lib/traject/delimited_writer.rb +110 -0
  22. data/lib/traject/indexer.rb +613 -0
  23. data/lib/traject/indexer/settings.rb +110 -0
  24. data/lib/traject/json_writer.rb +51 -0
  25. data/lib/traject/line_writer.rb +63 -0
  26. data/lib/traject/macros/basic.rb +9 -0
  27. data/lib/traject/macros/marc21.rb +223 -0
  28. data/lib/traject/macros/marc21_semantics.rb +584 -0
  29. data/lib/traject/macros/marc_format_classifier.rb +197 -0
  30. data/lib/traject/marc_extractor.rb +410 -0
  31. data/lib/traject/marc_reader.rb +89 -0
  32. data/lib/traject/mock_reader.rb +97 -0
  33. data/lib/traject/ndj_reader.rb +40 -0
  34. data/lib/traject/null_writer.rb +22 -0
  35. data/lib/traject/qualified_const_get.rb +40 -0
  36. data/lib/traject/solr_json_writer.rb +277 -0
  37. data/lib/traject/thread_pool.rb +161 -0
  38. data/lib/traject/translation_map.rb +267 -0
  39. data/lib/traject/util.rb +52 -0
  40. data/lib/traject/version.rb +3 -0
  41. data/lib/traject/yaml_writer.rb +9 -0
  42. data/lib/translation_maps/lcc_top_level.yaml +26 -0
  43. data/lib/translation_maps/marc_genre_007.yaml +9 -0
  44. data/lib/translation_maps/marc_genre_leader.yaml +22 -0
  45. data/lib/translation_maps/marc_geographic.yaml +589 -0
  46. data/lib/translation_maps/marc_instruments.yaml +102 -0
  47. data/lib/translation_maps/marc_languages.yaml +490 -0
  48. data/test/debug_writer_test.rb +38 -0
  49. data/test/delimited_writer_test.rb +104 -0
  50. data/test/indexer/each_record_test.rb +59 -0
  51. data/test/indexer/macros_marc21_semantics_test.rb +391 -0
  52. data/test/indexer/macros_marc21_test.rb +190 -0
  53. data/test/indexer/macros_test.rb +40 -0
  54. data/test/indexer/map_record_test.rb +209 -0
  55. data/test/indexer/read_write_test.rb +101 -0
  56. data/test/indexer/settings_test.rb +152 -0
  57. data/test/indexer/to_field_test.rb +77 -0
  58. data/test/marc_extractor_test.rb +412 -0
  59. data/test/marc_format_classifier_test.rb +98 -0
  60. data/test/marc_reader_test.rb +110 -0
  61. data/test/solr_json_writer_test.rb +248 -0
  62. data/test/test_helper.rb +90 -0
  63. data/test/test_support/245_no_ab.marc +1 -0
  64. data/test/test_support/880_with_no_6.utf8.marc +1 -0
  65. data/test/test_support/bad_subfield_code.marc +1 -0
  66. data/test/test_support/bad_utf_byte.utf8.marc +1 -0
  67. data/test/test_support/date_resort_to_260.marc +1 -0
  68. data/test/test_support/date_type_r_missing_date2.marc +1 -0
  69. data/test/test_support/date_with_u.marc +1 -0
  70. data/test/test_support/demo_config.rb +155 -0
  71. data/test/test_support/emptyish_record.marc +1 -0
  72. data/test/test_support/escaped_character_reference.marc8.marc +1 -0
  73. data/test/test_support/george_eliot.marc +1 -0
  74. data/test/test_support/hebrew880s.marc +1 -0
  75. data/test/test_support/louis_armstrong.marc +1 -0
  76. data/test/test_support/manufacturing_consent.marc +1 -0
  77. data/test/test_support/manuscript_online_thesis.marc +1 -0
  78. data/test/test_support/microform_online_conference.marc +1 -0
  79. data/test/test_support/multi_era.marc +1 -0
  80. data/test/test_support/multi_geo.marc +1 -0
  81. data/test/test_support/musical_cage.marc +1 -0
  82. data/test/test_support/nature.marc +1 -0
  83. data/test/test_support/one-marc8.mrc +1 -0
  84. data/test/test_support/online_only.marc +1 -0
  85. data/test/test_support/packed_041a_lang.marc +1 -0
  86. data/test/test_support/test_data.utf8.json +30 -0
  87. data/test/test_support/test_data.utf8.marc.xml +2609 -0
  88. data/test/test_support/test_data.utf8.mrc +1 -0
  89. data/test/test_support/test_data.utf8.mrc.gz +0 -0
  90. data/test/test_support/the_business_ren.marc +1 -0
  91. data/test/translation_map_test.rb +225 -0
  92. data/test/translation_maps/bad_ruby.rb +8 -0
  93. data/test/translation_maps/bad_yaml.yaml +1 -0
  94. data/test/translation_maps/both_map.rb +1 -0
  95. data/test/translation_maps/both_map.yaml +1 -0
  96. data/test/translation_maps/default_literal.rb +10 -0
  97. data/test/translation_maps/default_passthrough.rb +10 -0
  98. data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
  99. data/test/translation_maps/properties_map.properties +5 -0
  100. data/test/translation_maps/ruby_map.rb +10 -0
  101. data/test/translation_maps/translate_array_test.yaml +8 -0
  102. data/test/translation_maps/yaml_map.yaml +7 -0
  103. data/traject.gemspec +47 -0
  104. metadata +382 -0
@@ -0,0 +1,584 @@
1
+ # Encoding: UTF-8
2
+
3
+ require 'traject/marc_extractor'
4
+
5
+ module Traject::Macros
6
+ # extracting various semantic parts out of a Marc21 record. Few of these
7
+ # come directly from Marc21 spec or other specs with no judgement, they
8
+ # are all to some extent opinionated, based on actual practice and actual
9
+ # data, some more than others. If it doesn't do what you want, don't use it.
10
+ # But if it does, you can use it, and continue to get updates with future
11
+ # versions of Traject.
12
+ module Marc21Semantics
13
+ # shortcut
14
+ MarcExtractor = Traject::MarcExtractor
15
+
16
# Extract OCLC numbers from, by default 035a's by known prefixes, then stripped
# just the num, and de-dup.
#
# Returns an indexing lambda taking (record, accumulator).
def oclcnum(extract_fields = "035a")
  extractor = MarcExtractor.new(extract_fields, :separator => nil)

  lambda do |record, accumulator|
    # Map every candidate through the prefix-stripper; non-OCLC values
    # come back nil and are discarded.
    numbers = extractor.extract(record).map { |candidate| Marc21Semantics.oclcnum_extract(candidate) }
    numbers.compact!
    accumulator.concat(numbers.uniq) if numbers
  end
end
29
+
30
# If a num begins with a known OCLC prefix, return it without the prefix.
# otherwise nil.
#
# Allow (OCoLC) and/or ocn/ocm/on

OCLCPAT = /
  \A\s*
  (?:(?:\(OCoLC\)) |
     (?:\(OCoLC\))?(?:(?:ocm)|(?:ocn)|(?:on))
  )(\d+)
/x

# @param num [String] a raw 035-style value
# @return [String, nil] the bare digits if a known prefix matched, else nil
def self.oclcnum_extract(num)
  match = OCLCPAT.match(num)
  match ? match[1] : nil
end
49
+
50
+
51
# A sortable author value, created by concatenating:
# * the main entry author, if there is one (fields 100, 110 or 111)
# * the main entry uniform title (240), if there is one - not including non-filing chars as noted in 2nd indicator of the 240
# * If no 240, the 245 title, not including non-filing chars as noted in ind 2 of the 245
#
# Always returns a SINGLE string, based on concatenation.
#
# Thanks SolrMarc for basic logic.
#
# Note: You'll want to pay attention to the Solr schema field definition
# you're using, and have it do case-insensitivity or any other normalization
# you might want.
#
# these probably should be taking only certain subfields, but we're copying
# from SolrMarc that didn't do so either and nobody noticed, so not bothering for now.
def marc_sortable_author
  ->(record, accumulator) { accumulator << Marc21Semantics.get_sortable_author(record) }
end
71
+
72
# Builds the single sortable-author string: first 1xx main-entry author
# (punctuation-trimmed), concatenated with the first 240/245 title with
# leading non-filing characters sliced off. Always returns a String
# (possibly empty when neither part is present).
def self.get_sortable_author(record)
  # First matching 100/110/111 only, with trailing punctuation trimmed.
  onexx = MarcExtractor.cached("100:110:111", :first => true, :trim_punctuation => true).extract(record).first
  onexx = onexx.strip if onexx

  titles = []
  MarcExtractor.cached("240:245", :first => true).each_matching_line(record) do |field, spec|
    # Indicator 2 holds the count of leading non-filing characters (e.g. "The ").
    non_filing = field.indicator2.to_i

    # Joins ALL subfields intentionally (mirrors SolrMarc behavior), each
    # value stripped and punctuation-trimmed before joining.
    str = field.subfields.collect {|sf| Marc21.trim_punctuation(sf.value.strip).strip}.join(" ")
    # Drop the leading non-filing characters.
    str = str.slice(non_filing, str.length)
    titles << str
  end.first
  title = titles.first
  title = title.strip if title

  return [onexx, title].compact.join(" ")
end
89
+
90
+
91
# 245 a and b, with non-filing characters stripped off
#
# Returns an indexing lambda; appends nothing when no usable title exists.
def marc_sortable_title
  lambda do |record, accumulator|
    sortable = Marc21Semantics.get_sortable_title(record)
    accumulator << sortable unless sortable.nil?
  end
end
98
+
99
# Computes the sortable form of the 245ab title: leading non-filing
# characters (per indicator 2) are sliced off and trailing punctuation
# trimmed. Falls back to subfield $k for APPM-style archival records.
# Returns a String, or nil if no usable title text is found.
def self.get_sortable_title(record)
  MarcExtractor.cached("245ab").collect_matching_lines(record) do |field, spec, extractor|
    str = extractor.collect_subfields(field, spec).first

    if str.nil?
      # maybe an APPM archival record with only a 'k'
      str = field['k']
    end
    if str.nil?
      # still? All we can do is bail, I guess
      # (note: this `return` exits get_sortable_title itself, not just the block)
      return nil
    end

    # Indicator 2 is the count of leading non-filing characters to drop.
    non_filing = field.indicator2.to_i
    str = str.slice(non_filing, str.length)
    str = Marc21.trim_punctuation(str)

    str
  end.first
end
119
+
120
+
121
+
122
# A generic way to strip a filing version (i.e., a string with the non-filing
# characters stripped off)
#
# Always returns an array. If :include_original=>true is passed in,
# that array will include the original string with the non-filing
# characters still in it.

def extract_marc_filing_version(spec='245abdefghknp', opts={})
  include_original = opts.delete(:include_original)
  # :include_original is the only supported option; anything left over is an error.
  unless opts.empty?
    raise RuntimeError.new("extract_marc_filing_version can take only :include_original as an argument, not #{opts.keys.map{|x| "'#{x}'"}.join(' or ')}")
  end

  extractor = Traject::MarcExtractor.cached(spec, opts)

  lambda do |record, accumulator, context|
    extractor.collect_matching_lines(record) do |field, subspec|
      str = extractor.collect_subfields(field, subspec).first
      next if str.nil? || str.empty?

      values = [Marc21Semantics.filing_version(field, str, subspec)]
      if include_original
        # Original goes first; uniq! avoids a duplicate when nothing was stripped.
        values.unshift(str)
        values.uniq!
      end
      accumulator.concat(values)
    end
  end
end
150
+
151
+
152
+
153
+
154
# Take in a field, a string extracted from that field, and a spec and
# return the filing version (i.e., the string without the
# non-filing characters)
#
# Returns the input string unchanged whenever stripping does not apply.
def self.filing_version(field, str, spec)
  # Control fields don't have non-filing characters
  return str if field.kind_of? MARC::ControlField

  # 2nd indicator must be > 0
  ind2 = field.indicator2.to_i
  return str unless ind2 > 0

  # BUG FIX: a malformed field with no subfields used to raise
  # NoMethodError on `field.subfields[0].code` below; treat it as
  # "nothing to strip" instead.
  return str if field.subfields.empty?

  # The spec must include the first subfield present in the field.
  # NOTE(review): the original comment also allowed "no subfields
  # specified", but the code returns str unchanged when spec.subfields
  # is nil -- behavior preserved as-is.
  subs = spec.subfields
  return str unless subs && subs.include?(field.subfields[0].code)

  # OK. If we got this far we actually need to strip characters off the string
  return str[ind2..-1]
end
176
+
177
+
178
+
179
+
180
# maps languages, by default out of 008[35-37] and 041a and 041d
#
# Can specify other spec if you want, say, 041b (lang of abstract)
# or 041e (lang of librettos), or 041h (lang of original) instead or in addition.
#
# de-dups values so you don't get the same one twice.
#
# Exact spec of #marc_languages may change with new user data on what
# works best.
#
# Returns an indexing lambda taking (record, accumulator); appends
# human-readable language names translated via the "marc_languages" map.
def marc_languages(spec = "008[35-37]:041a:041d")
  translation_map = Traject::TranslationMap.new("marc_languages")

  extractor = MarcExtractor.new(spec, :separator => nil)

  lambda do |record, accumulator|
    codes = extractor.collect_matching_lines(record) do |field, spec, extractor|
      if extractor.control_field?(field)
        # Fixed-position control field (e.g. 008): take the byte range
        # configured in the spec, or the whole value if no bytes given.
        (spec.bytes ? field.value.byteslice(spec.bytes) : field.value)
      else
        extractor.collect_subfields(field, spec).collect do |value|
          # sometimes multiple language codes are jammed together in one subfield, and
          # we need to separate ourselves. sigh.
          unless value.length == 3
            value = value.scan(/.{1,3}/) # split into an array of 3-length substrs
          end
          value
        end.flatten
      end
    end
    codes = codes.uniq

    # Translate three-letter MARC codes to display names, in place.
    translation_map.translate_array!(codes)

    accumulator.concat codes
  end
end
216
+
217
# Adds in marc fields in spec (default is recommended series spec, but you can specify your own)
# -- only trick is that 490's are skipped of first indicator is 1 -- if 490 first
# indicator is "1", "series traced", that means the series title mentioned here is
# already covered by another field we're including, so we don't want to double count it, possibly
# with slight variation.
def marc_series_facet(spec = "440a:490a:800abcdt:810abcdt:811acdeft:830adfgklmnoprst")
  extractor = MarcExtractor.new(spec)

  lambda do |record, accumulator|
    series = extractor.collect_matching_lines(record) do |field, subspec, ext|
      # skip traced series (490 ind1 == "1"); they're covered elsewhere
      next if field.tag == "490" && field.indicator1 == "1"
      ext.collect_subfields(field, subspec)
    end.compact

    # trim punctuation on every collected value
    accumulator.concat(series.map { |value| Marc21.trim_punctuation(value) })
  end
end
238
+
239
+
240
# Takes marc 048ab instrument code, and translates it to human-displayable
# string. Takes first two chars of 048a or b, to translate (ignores numeric code)
#
# Pass in custom spec if you want just a or b, to separate soloists or whatever.
def marc_instrumentation_humanized(spec = "048ab", options = {})
  map_name = options[:translation_map] || "marc_instruments"
  translation_map = Traject::TranslationMap.new(map_name)

  extractor = MarcExtractor.new(spec, :separator => nil)

  lambda do |record, accumulator|
    # Only the two-letter prefix is looked up; the numeric count is ignored.
    labels = extractor.extract(record).map { |code| translation_map[code.slice(0, 2)] }.uniq
    accumulator.concat(labels) unless labels.empty?
  end
end
257
+
258
# This weird one actually returns marc instrumentation codes, not
# humanized. But it normalizes them by breaking them down into a numeric and non-numeric
# version. For instance "ba01" will be indexed as both "ba01" and "ba".
# ALSO, if the code is in a subfield b (soloist), it'll be indexed
# _additionally_ as "ba01.s" and "ba.s".
#
# This has proven useful for expert music librarian searching by hand; it could
# also be the basis of a GUI that executes searches behind the scenes for these
# codes.
def marc_instrument_codes_normalized(spec = "048")
  soloist_suffix = ".s"

  # BUG FIX: the spec argument was previously ignored ("048" was
  # hard-coded here); honor the caller-supplied spec. The default is
  # unchanged, so existing callers behave identically.
  extractor = MarcExtractor.new(spec, :separator => nil)

  return lambda do |record, accumulator|
    accumulator.concat(
      extractor.collect_matching_lines(record) do |field, subspec, extractor|
        values = []

        field.subfields.each do |sf|
          v = sf.value
          # Unless there's at least two chars, it's malformed, we can
          # do nothing
          next unless v.length >= 2

          # Index both with and without number -- both with soloist suffix
          # if in a $b
          values << v
          values << "#{v}#{soloist_suffix}" if sf.code == 'b'
          if v.length >= 4
            bare = v.slice(0,2) # just the prefix
            values << bare
            values << "#{bare}#{soloist_suffix}" if sf.code == 'b'
          end
        end
        values
      end.uniq
    )
  end
end
298
+
299
# An opinionated algorithm for getting a SINGLE publication date out of marc
#
# * Prefers using 008, but will resort to 260c
# * If 008 represents a date range, will take the midpoint of the range,
#   only if range is smaller than estimate_tolerance, default 15 years.
# * Ignores dates below min_year (default 500) or above max_year (this year plus 6 years),
#   because experience shows too many of these were in error.
#
# Yeah, this code ends up ridiculous.
def marc_publication_date(options = {})
  estimate_tolerance = options[:estimate_tolerance] || 15
  min_year = options[:min_year] || 500
  max_year = options[:max_year] || (Time.new.year + 6)

  lambda do |record, accumulator|
    year = Marc21Semantics.publication_date(record, estimate_tolerance, min_year, max_year)
    accumulator << year unless year.nil?
  end
end
318
+
319
# See #marc_publication_date. Yeah, this is a holy mess.
# Maybe it should actually be extracted to its own class!
#
# Tries the 008 fixed field first (date type at byte 6, Date1 at bytes
# 7-10, Date2 at bytes 11-14), falling back to the first 4-digit run in
# 260c. Returns an Integer year within [min_year, max_year], or nil.
def self.publication_date(record, estimate_tolerance = 15, min_year = 500, max_year = (Time.new.year + 6))
  field008 = MarcExtractor.cached("008").extract(record).first
  found_date = nil

  if field008 && field008.length >= 11
    date_type = field008.slice(6)
    date1_str = field008.slice(7,4)
    date2_str = field008.slice(11, 4) if field008.length > 15

    # for date_type q=questionable, we have a range.
    # BUG FIX: also require date2_str -- a truncated 008 (length <= 15)
    # previously raised NoMethodError on `nil.sub` below.
    if (date_type == 'q') && date2_str
      # make unknown digits at the beginning or end of range,
      date1 = date1_str.sub("u", "0").to_i
      date2 = date2_str.sub("u", "9").to_i
      # do we have a range we can use?
      if (date2 > date1) && ((date2 - date1) <= estimate_tolerance)
        found_date = (date2 + date1)/2
      end
    end
    # didn't find a date that way, and anything OTHER than date_type
    # n=unknown, q=questionable, try single date -- for some date types,
    # there's a date range between date1 and date2, yeah, we often take
    # the FIRST date then, the earliest. That's just what we're doing.
    if found_date.nil? && date_type != 'n' && date_type != 'q'
      # in date_type 'r', second date is original publication date, use that I think?
      date_str = (date_type == 'r' && date2_str.to_i != 0) ? date2_str : date1_str
      # Deal with stupid 'u's, which end up meaning a range too,
      # find midpoint and make sure our tolerance is okay.
      ucount = 0
      while (!date_str.nil?) && (i = date_str.index('u'))
        ucount += 1
        date_str[i] = "0"
      end
      date = date_str.to_i
      if ucount > 0 && date != 0
        # each 'u' widens the implied range by a power of ten; take midpoint
        delta = 10 ** ucount
        if delta <= estimate_tolerance
          found_date = date + (delta/2)
        end
      elsif date != 0
        found_date = date
      end
    end
  end
  # Okay, nothing from 008, try 260
  if found_date.nil?
    v260c = MarcExtractor.cached("260c", :separator => nil).extract(record).first
    # just try to take the first four digits out of there, we're not going to try
    # anything crazy.
    if v260c =~ /(\d{4})/
      found_date = $1.to_i
    end
  end

  # is it within our acceptable range?
  found_date = nil if found_date && (found_date < min_year || found_date > max_year)

  return found_date
end
380
+
381
# REGEX meant to rule out obvious non-LCC's, and only allow things
# plausibly LCC's.
LCC_REGEX = /\A *[A-Z]{1,3}[ .]*(?:(\d+)(?:\s*?\.\s*?(\d+))?).*/

# Looks up Library of Congress Classification (LCC) or NLM Medical Subject Headings (MeSH)
# from usual parts of the marc record. Maps them to high-level broad categories,
# basically just using the first part of the LCC. Note it's just looking in bib-level
# locations for LCCs, you're on your own with holdings.
#
# Sanity checks to make sure the thing looks like an LCC with a regex, before
# mapping.
#
# Will call it 'Unknown' if it's got nothing else, or pass in :default => something else,
# or nil.
#
# The categories output aren't great, but they're something.
def marc_lcc_to_broad_category( options = {}, spec="050a:060a:090a:096a")
  # Trying to match things that look like LCC, and not match things
  # that don't. Is tricky.
  default_value = options.has_key?(:default) ? options[:default] : "Unknown"
  translation_map = Traject::TranslationMap.new("lcc_top_level")

  extractor = MarcExtractor.new(spec, :separator => nil)

  lambda do |record, accumulator|
    # keep only plausible LCC call numbers
    plausible = extractor.extract(record).select { |candidate| candidate =~ LCC_REGEX }

    # map the first letter of each to its broad category
    first_letters = plausible.map { |candidate| candidate.lstrip.slice(0, 1) }
    accumulator.concat translation_map.translate_array!(first_letters).uniq

    accumulator << default_value if default_value && accumulator.empty?
  end
end
419
+
420
# An opinionated method of making a geographic facet out of BOTH 043a marc
# geographic area codes, AND geo subdivisions in 6xx LCSH subjects.
# (Comment previously said "048"; the code extracts 043a.)
#
# The LCSH geo subdivisions are further normalized:
# * geo qualifiers in $z fields into parens, so "Germany -- Berlin" becomes "Berlin (Germany)"
#   (to be consistent with how same areas are written in $a fields -- doesn't
#   get everything, but gets lots of em)
# * qualified regions like that are additionally 'posted up', so "Germany -- Berlin" gets
#   recorded additionally as "Germany"
def marc_geo_facet(options = {})
  marc_geo_map = Traject::TranslationMap.new("marc_geographic")

  a_fields_spec = options[:geo_a_fields] || "651a:691a"
  z_fields_spec = options[:geo_z_fields] || "600:610:611:630:648:650:654:655:656:690:651:691"

  extractor_043a = MarcExtractor.new("043a", :separator => nil)
  extractor_a_fields = MarcExtractor.new(a_fields_spec, :separator => nil)
  extractor_z_fields = MarcExtractor.new(z_fields_spec)

  lambda do |record, accumulator|

    # 043a codes: strip trailing hyphen fill chars, then map to labels.
    accumulator.concat(
      extractor_043a.extract(record).collect do |code|
        marc_geo_map[code.gsub(/\-+\Z/, '')]
      end.compact
    )

    #LCSH 651a and 691a go in more or less normally.
    accumulator.concat(
      extractor_a_fields.extract(record).collect do |s|
        # remove trailing periods, which they sometimes have if they were
        # at end of LCSH.
        # BUG FIX: anchor to end-of-string. The unanchored /\. */ removed
        # the FIRST period anywhere (mangling e.g. "Washington (D.C.)"),
        # contradicting this comment and the $z normalization below.
        s.sub(/\. *\Z/, '')
      end
    )

    # fields we take z's from have a bit more normalization
    extractor_z_fields.each_matching_line(record) do |field, spec, extractor|
      z_fields = field.subfields.find_all {|sf| sf.code == "z"}.collect {|sf| sf.value }
      # depending on position in total field, may be a period on the end
      # we want to remove.
      z_fields.collect! {|s| s.gsub(/\. *\Z/, '')}

      if z_fields.length == 2
        # normalize subdivision as parenthetical
        accumulator << "#{z_fields[1]} (#{z_fields[0]})"
        # and 'post up'
        accumulator << z_fields[0]
      else
        # just add all the z's if there's 1 or more than 2.
        accumulator.concat z_fields
      end
    end
    accumulator.uniq!
  end
end
477
+
478
# Opinionated routine to create values for a chronology/era facet out of
# LCSH chron subdivisions. Does some normalization:
# for 651 with a chron facet fitting the form
# "aaaaa, yyyy-yyyy", it will add in the $a. For instance:
# 651 a| United States x| History y| Civil War, 1861-1865
# --> "United States: Civil War, 1861-1865"
def marc_era_facet
  ordinary_fields_spec = "600y:610y:611y:630y:648ay:650y:654y:656y:690y"
  special_fields_spec = "651:691"
  separator = ": "

  extractor_ordinary_fields = MarcExtractor.new(ordinary_fields_spec)
  extractor_special_fields = MarcExtractor.new(special_fields_spec)

  lambda do |record, accumulator|
    # straightforward ones

    accumulator.concat( extractor_ordinary_fields.extract(record).collect do |v|
      # May have a period we have to remove, if it was at end of tag
      v.sub(/\. *\Z/, '')
    end)

    # weird ones
    extractor_special_fields.each_matching_line(record) do |field, spec, extractor|
      field.subfields.each do |sf|
        next unless sf.code == 'y'
        # BUG FIX: escape the dot in "ca." -- the unescaped `.` matched any
        # character ("cax 1850" etc.); intent is the literal "ca." abbreviation.
        if sf.value =~ /\A\s*.+,\s+(ca\.\s+)?\d\d\d\d?(-\d\d\d\d?)?( B\.C\.)?[.,; ]*\Z/
          # it's our pattern, add the $a in please
          accumulator << "#{field['a']}#{separator}#{sf.value.sub(/\. *\Z/, '')}"
        else
          accumulator << sf.value.sub(/\. *\Z/, '')
        end
      end
    end
    accumulator.uniq!
  end
end
516
+
517
# Extracts LCSH-carrying fields, and formatting them
# as a pre-coordinated LCSH string, for instance suitable for including
# in a facet.
#
# You can supply your own list of fields as a spec, but for significant
# customization you probably just want to write your own method in
# terms of the Marc21Semantics.assemble_lcsh method.
#
# Options:
#   :spec                  -- alternate field spec
#   :subdivision_separator -- separator before $v/$x/$y/$z subdivisions
#   :other_separator       -- separator between other subfields
def marc_lcsh_formatted(options = {})
  spec = options[:spec] || "600:610:611:630:648:650:651:654:662"
  # BUG FIX: the option key was misspelled :subdivison_separator. Accept
  # the correct spelling, but keep honoring the old key so existing
  # callers are unaffected.
  subd_separator = options[:subdivision_separator] || options[:subdivison_separator] || " — "
  other_separator = options[:other_separator] || " "

  extractor = MarcExtractor.new(spec)

  return lambda do |record, accumulator|
    accumulator.concat( extractor.collect_matching_lines(record) do |field, spec|
      Marc21Semantics.assemble_lcsh(field, subd_separator, other_separator)
    end)
  end

end
538
+
539
# Takes a MARC::Field and formats it into a pre-coordinated LCSH string
# with subdivision seperators in the right place.
#
# For 600 fields especially, need to not just join with subdivision seperator
# to take acount of $a$d$t -- for other fields, might be able to just
# join subfields, not sure.
#
# WILL strip trailing period from generated string, contrary to some LCSH practice.
# Our data is inconsistent on whether it has period or not, this was
# the easiest way to standardize.
#
# Default subdivision seperator is em-dash with spaces, set to '--' if you want.
#
# Cite: "Dash (-) that precedes a subdivision in an extended 600 subject heading
# is not carried in the MARC record. It may be system generated as a display constant
# associated with the content of subfield $v, $x, $y, and $z."
# http://www.loc.gov/marc/bibliographic/bd600.html
def self.assemble_lcsh(marc_field, subd_separator = " — ", other_separator = " ")
  subdivision_codes = %w{v x y z}
  out = ""

  marc_field.subfields.each_with_index do |sf, idx|
    # ignore non-alphabetic, like numeric control subfields
    next unless sf.code =~ /\A[a-z]\Z/

    prefix =
      if subdivision_codes.include?(sf.code)
        subd_separator
      elsif idx == 0
        ""
      else
        other_separator
      end

    out << prefix << sf.value
  end

  # strip one trailing period, per the note above
  out.gsub!(/\.\Z/, '')

  out.empty? ? nil : out
end
581
+
582
+
583
+ end
584
+ end