pennmarc 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/legacy/indexer.rb DELETED
@@ -1,568 +0,0 @@
1
- # rubocop:disable all
2
- $:.unshift './config'
3
-
4
- require 'date'
5
-
6
- # This fixes a bug in older versions of glibc, where name resolution under high load sometimes fails.
7
- # We require this here, because indexing jobs don't load Rails initializers
8
- require 'resolv-replace'
9
-
10
- require 'traject'
11
-
12
- require 'penn_lib/marc'
13
- require 'penn_lib/code_mappings'
14
-
15
- # Indexer for Franklin-native records (i.e. from Alma).
16
- # This is also used as a parent class for Hathi and CRL
17
- # since the vast majority of the indexing rules are the same.
18
- # Overridable field definitions should go into define_* methods
19
- # and be called in this constructor.
20
- class FranklinIndexer < BaseIndexer
21
-
22
- # this mixin defines the lambda factory method get_format for legacy marc formats
23
- include Blacklight::Marc::Indexer::Formats
24
- include BlacklightSolrplugins::Indexer
25
-
26
- # This behaves like the wrapped MARC::Record object it contains
27
- # except that the #each method filters out fields with non-standard tags.
28
# Thin proxy around a MARC record object.
#
# Every message is forwarded to the wrapped record, except #each, which
# yields only the fields whose tag is made of exactly three digits.
class PlainMarcRecord
  # A standard tag is exactly three digits. \A and \z anchor the whole
  # string, so a tag containing an embedded newline is rejected as well.
  VALID_TAG_REGEX = /\A\d\d\d\z/.freeze

  # @param record [#fields] the record to wrap; it must answer #fields with
  #   a collection of objects that respond to #tag
  def initialize(record)
    @record = record
  end

  # Forwards any message this class does not define to the wrapped record.
  # The caller's block is forwarded too, so block-taking methods of the
  # record keep working through the proxy.
  def method_missing(name, *args, &block)
    @record.send(name, *args, &block)
  end

  # Keeps #respond_to? consistent with what #method_missing can handle.
  def respond_to_missing?(name, include_private = false)
    @record.respond_to?(name, include_private) || super
  end

  # Yields each field of the wrapped record that has a standard tag.
  #
  # @yieldparam field [#tag] a field whose tag matches VALID_TAG_REGEX
  # @return [Enumerator] when called without a block
  def each
    return enum_for(:each) unless block_given?

    @record.fields.each do |field|
      yield field if VALID_TAG_REGEX.match?(field.tag)
    end
  end
end
45
-
46
- # Filter out enriched fields from ALMA because a lot of them can cause
47
- # the stored MARC XML in Solr to exceed max field size. Note that the
48
- # marc_view partial filters out non-standard MARC tags on display side too.
49
- # @return [Proc] proc object to be used by traject
50
# Builds the indexing step that serializes a record to MARC XML after
# wrapping it in PlainMarcRecord, so only standard-tagged fields are written.
#
# @return [Proc] lambda taking (record, accumulator) that appends the XML
def get_plain_marc_xml
  ->(record, accumulator) { accumulator << MARC::FastXMLWriter.encode(PlainMarcRecord.new(record)) }
end
55
-
56
# Sets up the indexer: builds the Solr update URL, applies the traject
# settings, then registers every field definition.
#
# @raise [KeyError] if the SOLR_URL environment variable is not set
def initialize
  super

  # Extra update processors appended to the Solr update URL for Solr-side
  # cross reference handling and duplicate ID deletion.
  #
  # 'uid-distrib' is deliberately left out, even when
  # SOLR_USE_UID_DISTRIB_PROCESSOR is set: deletion is handled outside of
  # Solr, either permanently or pending bug fixes.
  processors = %w[xref-copyfield fl-multiplex shingles id_hash]

  # ENV.fetch fails fast with a KeyError naming the missing variable, rather
  # than a NoMethodError on nil when SOLR_URL is not configured.
  solr_base = ENV.fetch('SOLR_URL').chomp('/')
  solr_update_url = "#{solr_base}/update/json?processor=#{processors.join(',')}"

  settings do
    # type may be 'binary', 'xml', or 'json'
    provide 'marc_source.type', 'xml'
    # set this to be non-negative if threshold should be enforced
    provide 'solr_writer.max_skipped', -1

    provide 'solr.update_url', solr_update_url

    store 'writer_class_name', 'PennLib::FranklinSolrJsonWriter'

    # uncomment these lines to write to a file
    # store "writer_class_name", "Traject::JsonWriter"
    # store 'output_file', "traject_output.json"

    if defined? JRUBY_VERSION
      # 'store' overrides existing settings, 'provide' does not
      store 'reader_class_name', 'Traject::Marc4JReader'
      store 'solr_writer.thread_pool', 4
      store 'processing_thread_pool', 4
    end

    store 'solr_writer.commit_on_close', false
    store 'solr_writer.batch_size', 2000
  end

  define_all_fields
end
97
-
98
# Registers every indexing step, in the order the steps must run.
#
# Overridable definitions are delegated to the define_* methods; everything
# else is declared inline. The each_record step that fills the clipboard is
# registered before the fields that read from it.
def define_all_fields
  define_id
  define_grouped_id
  define_record_source_id
  define_record_source_facet

  to_field('nocirc_f_stored') { |rec, acc| acc << pennlibmarc.items_nocirc(rec) }

  define_mms_id
  define_oclc_id
  define_cluster_id
  define_full_text_link_text_a

  # *_xml_stored_single must NOT be used here: it is backed by a Str (max 32k)
  to_field('marcrecord_xml_stored_single_large', get_plain_marc_xml)

  # Keyword searches go through pf/qf over multiple fields, so a catch-all
  # text field is not needed; the definition is kept here for reference.
  #
  # to_field "text_search", extract_all_marc_values do |r, acc|
  #   acc.unshift(r['001'].try(:value))
  #   acc.replace [acc.join(' ')] # turn it into a single string
  # end

  define_access_facet

  to_field('format_f_stored') { |rec, acc| acc.concat(pennlibmarc.get_format(rec)) }

  author_creator_spec = %w[
    100abcdjq
    110abcdjq
    700abcdjq
    710abcdjq
    800abcdjq
    810abcdjq
    111abcen
    711abcen
    811abcen
  ].join(':')

  # author_creator_f is copied automatically on the Solr side
  # to_field "author_creator_f", extract_marc(author_creator_spec, :trim_punctuation => true)

  # TODO: xfacet field, do not migrate
  to_field('author_creator_xfacet2_input', extract_marc(author_creator_spec, trim_punctuation: true)) do |_rec, acc|
    acc.map! { |name| 'n' + name }
  end

  # subject_f_stored is copied automatically on the Solr side
  # to_field 'subject_f_stored' do |rec, acc|
  #   acc.concat(pennlibmarc.get_subject_facet_values(rec))
  # end

  to_field('db_type_f_stored') { |rec, acc| acc.concat(pennlibmarc.get_db_types(rec)) }
  to_field('db_category_f_stored') { |rec, acc| acc.concat(pennlibmarc.get_db_categories(rec)) }
  to_field('db_subcategory_f_stored') { |rec, acc| acc.concat(pennlibmarc.get_db_subcategories(rec)) }
  to_field('subject_search') { |rec, acc| acc.concat(pennlibmarc.get_subject_search_values(rec)) }
  to_field('toplevel_subject_f') { |rec, acc| acc.concat(pennlibmarc.get_subject_facet_values(rec, true)) }

  # TODO: xfacet field, do not migrate
  to_field('call_number_xfacet') { |rec, acc| acc.concat(pennlibmarc.get_call_number_xfacet_values(rec)) }

  to_field('language_f_stored') { |rec, acc| acc.concat(pennlibmarc.get_language_values(rec)) }
  to_field('language_search') { |rec, acc| acc.concat(pennlibmarc.get_language_values(rec)) }
  to_field('library_f_stored') { |rec, acc| acc.concat(pennlibmarc.get_library_values(rec)) }
  to_field('specific_location_f_stored') { |rec, acc| acc.concat(pennlibmarc.get_specific_location_values(rec)) }
  to_field('classification_f_stored') { |rec, acc| acc.concat(pennlibmarc.get_classification_values(rec)) }
  to_field('genre_f_stored') { |rec, acc| acc.concat(pennlibmarc.get_genre_values(rec)) }
  to_field('genre_search') { |rec, acc| acc.concat(pennlibmarc.get_genre_search_values(rec)) }

  # --- Title fields ---

  to_field('title_1_search') { |rec, acc| acc.concat(pennlibmarc.get_title_1_search_values(rec)) }
  to_field('title_2_search') { |rec, acc| acc.concat(pennlibmarc.get_title_2_search_values(rec)) }
  to_field('journal_title_1_search') { |rec, acc| acc.concat(pennlibmarc.get_journal_title_1_search_values(rec)) }
  to_field('journal_title_2_search') { |rec, acc| acc.concat(pennlibmarc.get_journal_title_2_search_values(rec)) }
  to_field('author_creator_1_search') { |rec, acc| acc.concat(pennlibmarc.get_author_creator_1_search_values(rec)) }
  to_field('author_creator_2_search') { |rec, acc| acc.concat(pennlibmarc.get_author_creator_2_search_values(rec)) }
  to_field('author_creator_a') { |rec, acc| acc.concat(pennlibmarc.get_author_creator_values(rec)) }
  to_field('author_880_a') { |rec, acc| acc.concat(pennlibmarc.get_author_880_values(rec)) }
  to_field('title') { |rec, acc| acc.concat(pennlibmarc.get_title_values(rec)) }
  to_field('title_880_a') { |rec, acc| acc.concat(pennlibmarc.get_title_880_values(rec)) }
  to_field('standardized_title_a') { |rec, acc| acc.concat(pennlibmarc.get_standardized_title_values(rec)) }

  # TODO: xfacet field, do not migrate
  to_field('title_xfacet') { |rec, acc| acc.concat(pennlibmarc.get_title_xfacet_values(rec)) }

  to_field('title_nssort') { |rec, acc| acc.concat(pennlibmarc.get_title_sort_values(rec)) }

  to_field('title_sort_tl') do |rec, acc|
    filing_parts = pennlibmarc.get_title_sort_filing_parts(rec, false)
    acc.concat(filing_parts)
    pennlibmarc.append_title_variants(rec, acc)
  end

  # --- Author fields ---

  to_field('author_creator_nssort') { |rec, acc| acc.concat(pennlibmarc.get_author_creator_sort_values(rec)) }
  to_field('edition') { |rec, acc| acc.concat(pennlibmarc.get_edition_values(rec)) }
  to_field('conference_a') { |rec, acc| acc.concat(pennlibmarc.get_conference_values(rec)) }
  to_field('series') { |rec, acc| acc.concat(pennlibmarc.get_series_values(rec)) }
  to_field('publication_a') { |rec, acc| acc.concat(pennlibmarc.get_publication_values(rec)) }
  to_field('contained_within_a') { |rec, acc| acc.concat(pennlibmarc.get_contained_within_values(rec)) }

  # --- Sort/rank values; nothing is added when the helper returns nil or false ---

  to_field('elvl_rank_isort') do |rec, acc|
    rank = pennlibmarc.get_encoding_level_rank(rec)
    acc << rank if rank
  end

  to_field('hld_count_isort') do |rec, acc|
    holdings_count = pennlibmarc.get_hld_count(rec)
    acc << holdings_count if holdings_count
  end

  to_field('itm_count_isort') do |rec, acc|
    item_count = pennlibmarc.get_itm_count(rec)
    acc << item_count if item_count
  end

  to_field('empty_hld_count_isort') do |rec, acc|
    empty_holdings_count = pennlibmarc.get_empty_hld_count(rec)
    acc << empty_holdings_count if empty_holdings_count
  end

  to_field('prt_count_isort') do |rec, acc|
    portfolio_count = pennlibmarc.get_prt_count(rec)
    acc << portfolio_count if portfolio_count
  end

  # Compute timestamps, dates and subjects once per record and stash them on
  # the context clipboard; the fields registered below read from it.
  each_record do |rec, ctx|
    clipboard = ctx.clipboard
    clipboard[:timestamps] = pennlibmarc.prepare_timestamps(rec)
    clipboard[:dates] = pennlibmarc.prepare_dates(rec)
    clipboard[:subjects] = PennLib::SubjectConfig.prepare_subjects(rec)
  end

  # Every browseable/facetable subject type is multiplexed through this
  # field; Solr then maps the values to the `*_subject_stored_a` fields for
  # display. Those fields are still configured directly below to store values
  # that should be displayed but not be directly browseable/facetable.
  # TODO: while we should not migrate this field directly, we need to
  # ensure that the copyfield behavior is incorporated into our indexer
  to_field('subject_xfacet2_input') do |_rec, acc, ctx|
    xfacet_subjects = ctx.clipboard.dig(:subjects, :xfacet)
    acc.concat(xfacet_subjects) if xfacet_subjects
  end

  # BEGIN STORED SUBJECTS
  # Some values appear in _display_ but should not be _directly_
  # browseable/facetable (except perhaps as xrefs); these fields hold them.
  # This is a step towards consistent subject handling. The current choices
  # are kept for functional backward compatibility, although some of that
  # behavior is of questionable merit: displayed values become links that may
  # have no entry in the linked "browse" view. Consolidation comes first,
  # with minimal behavioral change; the preserved behavior may be ripe for
  # reconsideration.
  to_field('lcsh_subject_stored_a') do |_rec, acc, ctx|
    lcsh = ctx.clipboard.dig(:subjects, :stored_lcsh)
    acc.concat(lcsh) if lcsh
  end

  to_field('childrens_subject_stored_a') do |_rec, acc, ctx|
    childrens = ctx.clipboard.dig(:subjects, :stored_childrens)
    acc.concat(childrens) if childrens
  end

  to_field('mesh_subject_stored_a') do |_rec, acc, ctx|
    mesh = ctx.clipboard.dig(:subjects, :stored_mesh)
    acc.concat(mesh) if mesh
  end

  to_field('local_subject_stored_a') do |_rec, acc, ctx|
    local = ctx.clipboard.dig(:subjects, :stored_local)
    acc.concat(local) if local
  end
  # END STORED SUBJECTS

  # --- Timestamp and date fields, read from the clipboard ---

  to_field('recently_added_isort') do |_rec, acc, ctx|
    added = ctx.clipboard.dig(:timestamps, :most_recent_add)
    acc << added if added
  end

  to_field('last_update_isort') do |_rec, acc, ctx|
    updated = ctx.clipboard.dig(:timestamps, :last_update)
    acc << updated if updated
  end

  to_field('publication_date_ssort') do |_rec, acc, ctx|
    pub_sort = ctx.clipboard.dig(:dates, :pub_date_sort)
    acc << pub_sort if pub_sort
  end

  to_field('pub_min_dtsort') do |_rec, acc, ctx|
    pub_min = ctx.clipboard.dig(:dates, :pub_date_minsort)
    acc << pub_min if pub_min
  end

  to_field('pub_max_dtsort') do |_rec, acc, ctx|
    pub_max = ctx.clipboard.dig(:dates, :pub_date_maxsort)
    acc << pub_max if pub_max
  end

  to_field('content_min_dtsort') do |_rec, acc, ctx|
    content_min = ctx.clipboard.dig(:dates, :content_date_minsort)
    acc << content_min if content_min
  end

  to_field('content_max_dtsort') do |_rec, acc, ctx|
    content_max = ctx.clipboard.dig(:dates, :content_date_maxsort)
    acc << content_max if content_max
  end

  to_field('publication_date_f_stored') do |_rec, acc, ctx|
    decade = ctx.clipboard.dig(:dates, :pub_date_decade)
    acc << decade if decade
  end

  to_field('publication_dr') do |_rec, acc, ctx|
    pub_range = ctx.clipboard.dig(:dates, :pub_date_range)
    acc << pub_range if pub_range
  end

  to_field('content_dr') do |_rec, acc, ctx|
    content_range = ctx.clipboard.dig(:dates, :content_date_range)
    acc << content_range if content_range
  end

  # Store the normalized forms of each extracted identifier followed by the
  # values as extracted, de-duplicated.
  to_field('isbn_isxn_stored', extract_marc(%w[020az 022alz], separator: nil)) do |_rec, acc|
    raw_values = acc.dup
    acc.map! { |value| StdNum::ISBN.allNormalizedValues(value) }
    acc.push(raw_values)
    acc.flatten!
    acc.uniq!
  end

  to_field('call_number_search') { |rec, acc| acc.concat(pennlibmarc.get_call_number_search_values(rec)) }

  to_field('physical_holdings_json') do |rec, acc|
    physical = pennlibmarc.get_physical_holdings(rec)
    acc << physical.to_json if physical.present?
  end

  to_field('electronic_holdings_json') do |rec, acc|
    electronic = pennlibmarc.get_electronic_holdings(rec)
    acc << electronic.to_json if electronic.present?
  end

  # IDs of the associated boundwith records, which carry the actual holdings.
  # Multi-valued because a bib may have several copies, each tied to a
  # different boundwith record (a few such cases do exist). These IDs are
  # passed to the Availability API.
  to_field('bound_with_ids_a') { |rec, acc| acc.concat(pennlibmarc.get_bound_with_id_values(rec)) }

  to_field('conference_search') { |rec, acc| acc.concat(pennlibmarc.get_conference_search_values(rec)) }
  to_field('contents_note_search') { |rec, acc| acc.concat(pennlibmarc.get_contents_note_search_values(rec)) }
  to_field('corporate_author_search') { |rec, acc| acc.concat(pennlibmarc.get_corporate_author_search_values(rec)) }

  to_field('place_of_publication_search', extract_marc('260a:264|*1|a'))
  to_field('publisher_search', extract_marc('260b:264|*1|b'))
  to_field('pubnum_search', extract_marc('024a:028a'))

  to_field('series_search') { |rec, acc| acc.concat(pennlibmarc.get_series_search_values(rec)) }
end
482
-
483
# Memoized PennLib::Marc helper, built on code mappings loaded from the
# application's config/translation_maps directory.
#
# @return [PennLib::Marc]
def pennlibmarc
  translation_maps_dir = Rails.root.join('config', 'translation_maps')
  @code_mappings ||= PennLib::CodeMappings.new(translation_maps_dir)
  @pennlibmarc ||= PennLib::Marc.new(@code_mappings)
end
487
-
488
# Registers the 'id' field: the first trimmed 001 value, prefixed with
# "FRANKLIN_". Boundwith records are skipped in this step.
def define_id
  to_field('id', trim(extract_marc('001'), first: true)) do |rec, acc, context|
    acc.map! { |control_number| "FRANKLIN_#{control_number}" }

    # the boundwith check lives in the first 'id' field so it happens early
    context.skip!("Skipping boundwith record #{acc.first}") if pennlibmarc.is_boundwith_record(rec)
  end
end
497
- end
498
-
499
# Registers 'alma_mms_id' as the first trimmed value of the 001 field.
def define_mms_id
  first_control_number = trim(extract_marc('001'), first: true)
  to_field('alma_mms_id', first_control_number)
end
502
-
503
# Registers 'access_f_stored' with the access values computed by pennlibmarc.
def define_access_facet
  to_field('access_f_stored') { |rec, acc| acc.concat(pennlibmarc.get_access_values(rec)) }
end
508
-
509
# Registers 'oclc_id' with the first OCLC ID found for the record; nothing is
# added when the record has no OCLC IDs.
def define_oclc_id
  to_field('oclc_id') do |rec, acc|
    candidates = pennlibmarc.get_oclc_id_values(rec)
    next if candidates.empty?

    acc << candidates.first
  end
end
515
-
516
# Computes the cluster ID for a record.
#
# @param rec the record; must respond to #fields
# @return the record's first OCLC ID when it has one; otherwise an Integer
#   built from the first 16 hex digits of the MD5 of the first 001 value
def get_cluster_id(rec)
  oclc_id = pennlibmarc.get_oclc_id_values(rec).first
  return oclc_id if oclc_id

  control_number = rec.fields('001').first&.value
  # first 16 hex digits = first 8 bytes; turn that hex string into an int
  Digest::MD5.hexdigest(control_number)[0, 16].hex
end
524
-
525
# Registers 'cluster_id' with the value computed by #get_cluster_id.
def define_cluster_id
  to_field('cluster_id') { |rec, acc| acc << get_cluster_id(rec) }
end
530
-
531
# Registers 'grouped_id': "FRANKLIN_<001 value>", prefixed with
# "<first OCLC ID>!" when the record has a present OCLC ID.
def define_grouped_id
  to_field('grouped_id', trim(extract_marc('001'), first: true)) do |rec, acc, _context|
    oclc_ids = pennlibmarc.get_oclc_id_values(rec)

    acc.map! do |control_number|
      puts 'Warning: Multiple OCLC IDs found, using the first one' if oclc_ids.size > 1

      primary_oclc_id = oclc_ids.first
      franklin_id = "FRANKLIN_#{control_number}"
      primary_oclc_id.present? ? "#{primary_oclc_id}!#{franklin_id}" : franklin_id
    end
  end
end
544
-
545
# Registers 'record_source_id', always set to RecordSource::PENN.
def define_record_source_id
  to_field('record_source_id') { |_rec, acc| acc << RecordSource::PENN }
end
550
-
551
# Registers 'record_source_f': always 'Penn', plus 'HathiTrust' when
# pennlibmarc.is_etas is truthy for the record.
def define_record_source_facet
  to_field('record_source_f') do |rec, acc|
    acc.push('Penn')
    acc.push('HathiTrust') if pennlibmarc.is_etas(rec)
  end
end
557
-
558
# Registers 'full_text_link_text_a' with the record's full-text link values
# serialized as JSON; nothing is added when there are none.
def define_full_text_link_text_a
  to_field('full_text_link_text_a') do |rec, acc|
    links = pennlibmarc.get_full_text_link_values(rec)
    acc << links.to_json if links.present?
  end
end
566
-
567
- end
568
- # rubocop:enable all