pennmarc 1.0.0 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/legacy/indexer.rb DELETED
@@ -1,568 +0,0 @@
1
- # rubocop:disable all
2
- $:.unshift './config'
3
-
4
- require 'date'
5
-
6
- # This fixes a bug in older versions of glibc, where name resolution under high load sometimes fails.
7
- # We require this here, because indexing jobs don't load Rails initializers
8
- require 'resolv-replace'
9
-
10
- require 'traject'
11
-
12
- require 'penn_lib/marc'
13
- require 'penn_lib/code_mappings'
14
-
15
- # Indexer for Franklin-native records (i.e. from Alma).
16
- # This is also used as a parent class for Hathi and CRL
17
- # since the vast majority of the indexing rules are the same.
18
- # Overrideable field definitions should go into define_* methods
19
- # and be called in this constructor.
20
- class FranklinIndexer < BaseIndexer
21
-
22
- # this mixin defines lambda factory method get_format for legacy marc formats
23
- include Blacklight::Marc::Indexer::Formats
24
- include BlacklightSolrplugins::Indexer
25
-
26
- # This behaves like the wrapped MARC::Record object it contains
27
- # except that the #each method filters out fields with non-standard tags.
28
- class PlainMarcRecord
29
-
30
- def initialize(record)
31
- @record = record
32
- @valid_tag_regex ||= /^\d\d\d$/
33
- end
34
-
35
- def method_missing(*args)
36
- @record.send(*args)
37
- end
38
-
39
- def each
40
- for field in @record.fields
41
- yield field if field.tag =~ @valid_tag_regex
42
- end
43
- end
44
- end
45
-
46
- # Filter out enriched fields from ALMA because a lot of them can cause
47
- # the stored MARC XML in Solr to exceed max field size. Note that the
48
- # marc_view partial filters out non-standard MARC tags on display side too.
49
- # @return [Proc] proc object to be used by traject
50
- def get_plain_marc_xml
51
- lambda do |record, accumulator|
52
- accumulator << MARC::FastXMLWriter.encode(PlainMarcRecord.new(record))
53
- end
54
- end
55
-
56
- def initialize
57
- super
58
-
59
- # append extra params to the Solr update URL for solr-side cross reference handling
60
- # and duplicate ID deletion
61
- processors = [ 'xref-copyfield', 'fl-multiplex', 'shingles', 'id_hash' ]
62
- if ENV['SOLR_USE_UID_DISTRIB_PROCESSOR']
63
- # disable; handle deletion outside of solr, either permanently or pending bug fixes
64
- #processors << 'uid-distrib'
65
- end
66
-
67
- solr_update_url = [ ENV['SOLR_URL'].chomp('/'), 'update', 'json' ].join('/') + "?processor=#{processors.join(',')}"
68
-
69
- settings do
70
- # type may be 'binary', 'xml', or 'json'
71
- provide "marc_source.type", "xml"
72
- # set this to be non-negative if threshold should be enforced
73
- provide 'solr_writer.max_skipped', -1
74
-
75
- provide 'solr.update_url', solr_update_url
76
-
77
- store 'writer_class_name', 'PennLib::FranklinSolrJsonWriter'
78
-
79
- # uncomment these lines to write to a file
80
- # store "writer_class_name", "Traject::JsonWriter"
81
- # store 'output_file', "traject_output.json"
82
-
83
- if defined? JRUBY_VERSION
84
- # 'store' overrides existing settings, 'provide' does not
85
- store 'reader_class_name', "Traject::Marc4JReader"
86
- store 'solr_writer.thread_pool', 4
87
- store 'processing_thread_pool', 4
88
- end
89
-
90
- store 'solr_writer.commit_on_close', false
91
- store 'solr_writer.batch_size', 2000
92
-
93
- end
94
-
95
- define_all_fields
96
- end
97
-
98
- def define_all_fields
99
-
100
- define_id
101
-
102
- define_grouped_id
103
-
104
- define_record_source_id
105
-
106
- define_record_source_facet
107
-
108
- to_field 'nocirc_f_stored' do |rec, acc|
109
- acc << pennlibmarc.items_nocirc(rec)
110
- end
111
-
112
- define_mms_id
113
-
114
- define_oclc_id
115
-
116
- define_cluster_id
117
-
118
- define_full_text_link_text_a
119
-
120
- # do NOT use *_xml_stored_single because it uses a Str (max 32k) for storage
121
- to_field 'marcrecord_xml_stored_single_large', get_plain_marc_xml
122
-
123
- # Our keyword searches use pf/qf to search multiple fields, so
124
- # we don't need this field; leaving it commented out here just in case.
125
- #
126
- # to_field "text_search", extract_all_marc_values do |r, acc|
127
- # acc.unshift(r['001'].try(:value))
128
- # acc.replace [acc.join(' ')] # turn it into a single string
129
- # end
130
-
131
- define_access_facet
132
-
133
- to_field 'format_f_stored' do |rec, acc|
134
- acc.concat(pennlibmarc.get_format(rec))
135
- end
136
-
137
- author_creator_spec = %W{
138
- 100abcdjq
139
- 110abcdjq
140
- 700abcdjq
141
- 710abcdjq
142
- 800abcdjq
143
- 810abcdjq
144
- 111abcen
145
- 711abcen
146
- 811abcen
147
- }.join(':')
148
-
149
- # this is now automatically copied on the Solr side
150
- # to_field "author_creator_f", extract_marc(author_creator_spec, :trim_punctuation => true)
151
-
152
- # TODO: xfacet field, do not migrate
153
- to_field 'author_creator_xfacet2_input', extract_marc(author_creator_spec, :trim_punctuation => true) do |r, acc|
154
- acc.map! { |v| 'n' + v }
155
- end
156
-
157
- # this is now automatically copied on the Solr side
158
- # to_field 'subject_f_stored' do |rec, acc|
159
- # acc.concat(pennlibmarc.get_subject_facet_values(rec))
160
- # end
161
-
162
- to_field "db_type_f_stored" do |rec, acc|
163
- acc.concat(pennlibmarc.get_db_types(rec))
164
- end
165
-
166
- to_field "db_category_f_stored" do |rec, acc|
167
- acc.concat(pennlibmarc.get_db_categories(rec))
168
- end
169
-
170
- to_field "db_subcategory_f_stored" do |rec, acc|
171
- acc.concat(pennlibmarc.get_db_subcategories(rec))
172
- end
173
-
174
- to_field 'subject_search' do |rec, acc|
175
- acc.concat(pennlibmarc.get_subject_search_values(rec))
176
- end
177
-
178
- to_field 'toplevel_subject_f' do |rec, acc|
179
- acc.concat(pennlibmarc.get_subject_facet_values(rec, true))
180
- end
181
-
182
- # TODO: xfacet field, do not migrate
183
- to_field 'call_number_xfacet' do |rec, acc|
184
- acc.concat(pennlibmarc.get_call_number_xfacet_values(rec))
185
- end
186
-
187
- to_field "language_f_stored" do |rec, acc|
188
- acc.concat(pennlibmarc.get_language_values(rec))
189
- end
190
-
191
- to_field "language_search" do |rec, acc|
192
- acc.concat(pennlibmarc.get_language_values(rec))
193
- end
194
-
195
- to_field "library_f_stored" do |rec, acc|
196
- acc.concat(pennlibmarc.get_library_values(rec))
197
- end
198
-
199
- to_field "specific_location_f_stored" do |rec, acc|
200
- acc.concat(pennlibmarc.get_specific_location_values(rec))
201
- end
202
-
203
- to_field "classification_f_stored" do |rec, acc|
204
- acc.concat(pennlibmarc.get_classification_values(rec))
205
- end
206
-
207
- to_field "genre_f_stored" do |rec, acc|
208
- acc.concat(pennlibmarc.get_genre_values(rec))
209
- end
210
-
211
- to_field "genre_search" do |rec, acc|
212
- acc.concat(pennlibmarc.get_genre_search_values(rec))
213
- end
214
-
215
- # Title fields
216
-
217
- to_field 'title_1_search' do |rec, acc|
218
- acc.concat(pennlibmarc.get_title_1_search_values(rec))
219
- end
220
-
221
- to_field 'title_2_search' do |rec, acc|
222
- acc.concat(pennlibmarc.get_title_2_search_values(rec))
223
- end
224
-
225
- to_field 'journal_title_1_search' do |rec, acc|
226
- acc.concat(pennlibmarc.get_journal_title_1_search_values(rec))
227
- end
228
-
229
- to_field 'journal_title_2_search' do |rec, acc|
230
- acc.concat(pennlibmarc.get_journal_title_2_search_values(rec))
231
- end
232
-
233
- to_field 'author_creator_1_search' do |rec, acc|
234
- acc.concat(pennlibmarc.get_author_creator_1_search_values(rec))
235
- end
236
-
237
- to_field 'author_creator_2_search' do |rec, acc|
238
- acc.concat(pennlibmarc.get_author_creator_2_search_values(rec))
239
- end
240
-
241
- to_field 'author_creator_a' do |rec, acc|
242
- acc.concat(pennlibmarc.get_author_creator_values(rec))
243
- end
244
-
245
- to_field 'author_880_a' do |rec, acc|
246
- acc.concat(pennlibmarc.get_author_880_values(rec))
247
- end
248
-
249
- to_field 'title' do |rec, acc|
250
- acc.concat(pennlibmarc.get_title_values(rec))
251
- end
252
-
253
- to_field 'title_880_a' do |rec,acc|
254
- acc.concat(pennlibmarc.get_title_880_values(rec))
255
- end
256
-
257
- to_field 'standardized_title_a' do |rec, acc|
258
- acc.concat(pennlibmarc.get_standardized_title_values(rec))
259
- end
260
-
261
- # TODO: xfacet field, do not migrate
262
- to_field 'title_xfacet' do |rec, acc|
263
- acc.concat(pennlibmarc.get_title_xfacet_values(rec))
264
- end
265
-
266
- to_field 'title_nssort' do |rec, acc|
267
- acc.concat(pennlibmarc.get_title_sort_values(rec))
268
- end
269
-
270
- to_field 'title_sort_tl' do |rec, acc|
271
- acc.concat(pennlibmarc.get_title_sort_filing_parts(rec, false))
272
- pennlibmarc.append_title_variants(rec, acc)
273
- end
274
-
275
- # Author fields
276
-
277
- to_field 'author_creator_nssort' do |rec, acc|
278
- acc.concat(pennlibmarc.get_author_creator_sort_values(rec))
279
- end
280
-
281
- to_field 'edition' do |rec, acc|
282
- acc.concat(pennlibmarc.get_edition_values(rec))
283
- end
284
-
285
- to_field 'conference_a' do |rec, acc|
286
- acc.concat(pennlibmarc.get_conference_values(rec))
287
- end
288
-
289
- to_field 'series' do |rec, acc|
290
- acc.concat(pennlibmarc.get_series_values(rec))
291
- end
292
-
293
- to_field 'publication_a' do |rec, acc|
294
- acc.concat(pennlibmarc.get_publication_values(rec))
295
- end
296
-
297
- to_field 'contained_within_a' do |rec, acc|
298
- acc.concat(pennlibmarc.get_contained_within_values(rec))
299
- end
300
-
301
- to_field 'elvl_rank_isort' do |rec, acc|
302
- val = pennlibmarc.get_encoding_level_rank(rec)
303
- acc << val if val
304
- end
305
-
306
- to_field 'hld_count_isort' do |rec, acc|
307
- val = pennlibmarc.get_hld_count(rec)
308
- acc << val if val
309
- end
310
-
311
- to_field 'itm_count_isort' do |rec, acc|
312
- val = pennlibmarc.get_itm_count(rec)
313
- acc << val if val
314
- end
315
-
316
- to_field 'empty_hld_count_isort' do |rec, acc|
317
- val = pennlibmarc.get_empty_hld_count(rec)
318
- acc << val if val
319
- end
320
-
321
- to_field 'prt_count_isort' do |rec, acc|
322
- val = pennlibmarc.get_prt_count(rec)
323
- acc << val if val
324
- end
325
-
326
- each_record do |rec, ctx|
327
- ctx.clipboard.tap do |c|
328
- c[:timestamps] = pennlibmarc.prepare_timestamps(rec)
329
- c[:dates] = pennlibmarc.prepare_dates(rec)
330
- c[:subjects] = PennLib::SubjectConfig.prepare_subjects(rec)
331
- end
332
- end
333
-
334
- # All browseable/facetable subject types are multiplexed through this field; for corresponding display,
335
- # these values are then mapped Solr-side to the `*_subject_stored_a` fields below. The fields are still
336
- # directly configured below for storage of values that should be displayed, but not directly
337
- # browseable/facetable
338
- # TODO: while we should not migrate this field directly, we need to
339
- # ensure that the copyfield behavior is incorporated into our indexer
340
- to_field 'subject_xfacet2_input' do |rec, acc, ctx|
341
- val = ctx.clipboard.dig(:subjects, :xfacet)
342
- acc.concat(val) if val
343
- end
344
-
345
- # The fields below exist because there are some values that appear in _display_, but should not be
346
- # _directly_ browseable/facetable (except perhaps as xrefs).
347
- # Note, this is a step towards consolidation/consistency in management of subjects generally; there are
348
- # choices that are preserved here initially for functional backward compatibility, but some of the behavior
349
- # we're preserving is of questionable merit. Namely, the fields below allow the display of fields that will
350
- # be links, but which will in some cases not be present in the linked "browse" view. We'll take this one
351
- # step at a time, consolidating first with minimal behavioral changes; but note that some of the preserved
352
- # behavior may be ripe for reconsideration.
353
- # BEGIN STORED SUBJECTS
354
- to_field 'lcsh_subject_stored_a' do |rec, acc, ctx|
355
- val = ctx.clipboard.dig(:subjects, :stored_lcsh)
356
- acc.concat(val) if val
357
- end
358
-
359
- to_field 'childrens_subject_stored_a' do |rec, acc, ctx|
360
- val = ctx.clipboard.dig(:subjects, :stored_childrens)
361
- acc.concat(val) if val
362
- end
363
-
364
- to_field 'mesh_subject_stored_a' do |rec, acc, ctx|
365
- val = ctx.clipboard.dig(:subjects, :stored_mesh)
366
- acc.concat(val) if val
367
- end
368
-
369
- to_field 'local_subject_stored_a' do |rec, acc, ctx|
370
- val = ctx.clipboard.dig(:subjects, :stored_local)
371
- acc.concat(val) if val
372
- end
373
- # END STORED SUBJECTS
374
-
375
- to_field 'recently_added_isort' do |rec, acc, ctx|
376
- val = ctx.clipboard.dig(:timestamps, :most_recent_add)
377
- acc << val if val
378
- end
379
-
380
- to_field 'last_update_isort' do |rec, acc, ctx|
381
- val = ctx.clipboard.dig(:timestamps, :last_update)
382
- acc << val if val
383
- end
384
-
385
- to_field 'publication_date_ssort' do |rec, acc, ctx|
386
- val = ctx.clipboard.dig(:dates, :pub_date_sort)
387
- acc << val if val
388
- end
389
-
390
- to_field 'pub_min_dtsort' do |rec, acc, ctx|
391
- val = ctx.clipboard.dig(:dates, :pub_date_minsort)
392
- acc << val if val
393
- end
394
-
395
- to_field 'pub_max_dtsort' do |rec, acc, ctx|
396
- val = ctx.clipboard.dig(:dates, :pub_date_maxsort)
397
- acc << val if val
398
- end
399
-
400
- to_field 'content_min_dtsort' do |rec, acc, ctx|
401
- val = ctx.clipboard.dig(:dates, :content_date_minsort)
402
- acc << val if val
403
- end
404
-
405
- to_field 'content_max_dtsort' do |rec, acc, ctx|
406
- val = ctx.clipboard.dig(:dates, :content_date_maxsort)
407
- acc << val if val
408
- end
409
-
410
- to_field 'publication_date_f_stored' do |rec, acc, ctx|
411
- val = ctx.clipboard.dig(:dates, :pub_date_decade)
412
- acc << val if val
413
- end
414
-
415
- to_field 'publication_dr' do |rec, acc, ctx|
416
- val = ctx.clipboard.dig(:dates, :pub_date_range)
417
- acc << val if val
418
- end
419
-
420
- to_field 'content_dr' do |rec, acc, ctx|
421
- val = ctx.clipboard.dig(:dates, :content_date_range)
422
- acc << val if val
423
- end
424
-
425
- to_field "isbn_isxn_stored", extract_marc(%W{020az 022alz}, :separator=>nil) do |rec, acc|
426
- orig = acc.dup
427
- acc.map!{|x| StdNum::ISBN.allNormalizedValues(x)}
428
- acc << orig
429
- acc.flatten!
430
- acc.uniq!
431
- end
432
-
433
- to_field 'call_number_search' do |rec, acc|
434
- acc.concat(pennlibmarc.get_call_number_search_values(rec))
435
- end
436
-
437
- to_field 'physical_holdings_json' do |rec, acc|
438
- result = pennlibmarc.get_physical_holdings(rec)
439
- if result.present?
440
- acc << result.to_json
441
- end
442
- end
443
-
444
- to_field 'electronic_holdings_json' do |rec, acc|
445
- result = pennlibmarc.get_electronic_holdings(rec)
446
- if result.present?
447
- acc << result.to_json
448
- end
449
- end
450
-
451
- # store IDs of associated boundwith records, where the actual holdings are attached.
452
- # this is a multi-valued field because a bib may have multiple copies, each associated
453
- # with a different boundwith record (a few such cases do exist).
454
- # we use this to pass to the Availability API.
455
- to_field 'bound_with_ids_a' do |rec, acc|
456
- acc.concat(pennlibmarc.get_bound_with_id_values(rec))
457
- end
458
-
459
- to_field 'conference_search' do |rec, acc|
460
- acc.concat(pennlibmarc.get_conference_search_values(rec))
461
- end
462
-
463
- to_field 'contents_note_search' do |rec, acc|
464
- acc.concat(pennlibmarc.get_contents_note_search_values(rec))
465
- end
466
-
467
- to_field 'corporate_author_search' do |rec, acc|
468
- acc.concat(pennlibmarc.get_corporate_author_search_values(rec))
469
- end
470
-
471
- to_field 'place_of_publication_search', extract_marc('260a:264|*1|a')
472
-
473
- to_field 'publisher_search', extract_marc('260b:264|*1|b')
474
-
475
- to_field 'pubnum_search', extract_marc('024a:028a')
476
-
477
- to_field 'series_search' do |rec, acc|
478
- acc.concat(pennlibmarc.get_series_search_values(rec))
479
- end
480
-
481
- end
482
-
483
- def pennlibmarc
484
- @code_mappings ||= PennLib::CodeMappings.new(Rails.root.join('config').join('translation_maps'))
485
- @pennlibmarc ||= PennLib::Marc.new(@code_mappings)
486
- end
487
-
488
- def define_id
489
- to_field "id", trim(extract_marc("001"), :first => true) do |rec, acc, context|
490
- acc.map! { |id| "FRANKLIN_#{id}" }
491
-
492
- # we do this check in the first 'id' field so that it happens early
493
- if pennlibmarc.is_boundwith_record(rec)
494
- context.skip!("Skipping boundwith record #{acc.first}")
495
- end
496
- end
497
- end
498
-
499
- def define_mms_id
500
- to_field 'alma_mms_id', trim(extract_marc('001'), :first => true)
501
- end
502
-
503
- def define_access_facet
504
- to_field "access_f_stored" do |rec, acc|
505
- acc.concat(pennlibmarc.get_access_values(rec))
506
- end
507
- end
508
-
509
- def define_oclc_id
510
- to_field 'oclc_id' do |rec, acc|
511
- oclc_ids = pennlibmarc.get_oclc_id_values(rec)
512
- acc << oclc_ids.first unless oclc_ids.empty?
513
- end
514
- end
515
-
516
- def get_cluster_id(rec)
517
- pennlibmarc.get_oclc_id_values(rec).first || begin
518
- id = rec.fields('001').take(1).map(&:value).first
519
- digest = Digest::MD5.hexdigest(id)
520
- # first 16 hex digits = first 8 bytes. construct an int out of that hex str.
521
- digest[0,16].hex
522
- end
523
- end
524
-
525
- def define_cluster_id
526
- to_field 'cluster_id' do |rec, acc|
527
- acc << get_cluster_id(rec)
528
- end
529
- end
530
-
531
- def define_grouped_id
532
- to_field 'grouped_id', trim(extract_marc('001'), :first => true) do |rec, acc, context|
533
- oclc_ids = pennlibmarc.get_oclc_id_values(rec)
534
- acc.map! { |id|
535
- if oclc_ids.size > 1
536
- puts 'Warning: Multiple OCLC IDs found, using the first one'
537
- end
538
- oclc_id = oclc_ids.first
539
- prefix = oclc_id.present? ? "#{oclc_id}!" : ''
540
- "#{prefix}FRANKLIN_#{id}"
541
- }
542
- end
543
- end
544
-
545
- def define_record_source_id
546
- to_field 'record_source_id' do |rec, acc|
547
- acc << RecordSource::PENN
548
- end
549
- end
550
-
551
- def define_record_source_facet
552
- to_field 'record_source_f' do |rec, acc|
553
- acc << 'Penn'
554
- acc << 'HathiTrust' if pennlibmarc.is_etas(rec)
555
- end
556
- end
557
-
558
- def define_full_text_link_text_a
559
- to_field 'full_text_link_text_a' do |rec, acc|
560
- result = pennlibmarc.get_full_text_link_values(rec)
561
- if result.present?
562
- acc << result.to_json
563
- end
564
- end
565
- end
566
-
567
- end
568
- # rubocop:enable all