pennmarc 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +6 -0
  3. data/.rspec +2 -0
  4. data/.ruby-version +1 -0
  5. data/Gemfile +23 -0
  6. data/Gemfile.lock +119 -0
  7. data/README.md +82 -0
  8. data/legacy/indexer.rb +568 -0
  9. data/legacy/marc.rb +2964 -0
  10. data/legacy/test_file_output.json +49 -0
  11. data/lib/pennmarc/encoding_level.rb +43 -0
  12. data/lib/pennmarc/enriched_marc.rb +36 -0
  13. data/lib/pennmarc/heading_control.rb +11 -0
  14. data/lib/pennmarc/helpers/citation.rb +31 -0
  15. data/lib/pennmarc/helpers/creator.rb +237 -0
  16. data/lib/pennmarc/helpers/database.rb +89 -0
  17. data/lib/pennmarc/helpers/date.rb +85 -0
  18. data/lib/pennmarc/helpers/edition.rb +90 -0
  19. data/lib/pennmarc/helpers/format.rb +312 -0
  20. data/lib/pennmarc/helpers/genre.rb +71 -0
  21. data/lib/pennmarc/helpers/helper.rb +11 -0
  22. data/lib/pennmarc/helpers/identifier.rb +134 -0
  23. data/lib/pennmarc/helpers/language.rb +37 -0
  24. data/lib/pennmarc/helpers/link.rb +12 -0
  25. data/lib/pennmarc/helpers/location.rb +97 -0
  26. data/lib/pennmarc/helpers/note.rb +132 -0
  27. data/lib/pennmarc/helpers/production.rb +131 -0
  28. data/lib/pennmarc/helpers/relation.rb +135 -0
  29. data/lib/pennmarc/helpers/series.rb +118 -0
  30. data/lib/pennmarc/helpers/subject.rb +304 -0
  31. data/lib/pennmarc/helpers/title.rb +197 -0
  32. data/lib/pennmarc/mappings/language.yml +516 -0
  33. data/lib/pennmarc/mappings/locations.yml +1801 -0
  34. data/lib/pennmarc/mappings/relator.yml +263 -0
  35. data/lib/pennmarc/parser.rb +177 -0
  36. data/lib/pennmarc/util.rb +240 -0
  37. data/lib/pennmarc.rb +6 -0
  38. data/pennmarc.gemspec +22 -0
  39. data/spec/fixtures/marcxml/test.xml +167 -0
  40. data/spec/lib/pennmarc/helpers/citation_spec.rb +27 -0
  41. data/spec/lib/pennmarc/helpers/creator_spec.rb +183 -0
  42. data/spec/lib/pennmarc/helpers/database_spec.rb +60 -0
  43. data/spec/lib/pennmarc/helpers/date_spec.rb +105 -0
  44. data/spec/lib/pennmarc/helpers/edition_spec.rb +38 -0
  45. data/spec/lib/pennmarc/helpers/format_spec.rb +200 -0
  46. data/spec/lib/pennmarc/helpers/genre_spec.rb +89 -0
  47. data/spec/lib/pennmarc/helpers/identifer_spec.rb +105 -0
  48. data/spec/lib/pennmarc/helpers/language_spec.rb +30 -0
  49. data/spec/lib/pennmarc/helpers/location_spec.rb +70 -0
  50. data/spec/lib/pennmarc/helpers/note_spec.rb +233 -0
  51. data/spec/lib/pennmarc/helpers/production_spec.rb +193 -0
  52. data/spec/lib/pennmarc/helpers/relation_spec.rb +120 -0
  53. data/spec/lib/pennmarc/helpers/subject_spec.rb +262 -0
  54. data/spec/lib/pennmarc/helpers/title_spec.rb +169 -0
  55. data/spec/lib/pennmarc/marc_util_spec.rb +206 -0
  56. data/spec/lib/pennmarc/parser_spec.rb +13 -0
  57. data/spec/spec_helper.rb +104 -0
  58. data/spec/support/marc_spec_helpers.rb +84 -0
  59. metadata +171 -0
data/legacy/indexer.rb ADDED
@@ -0,0 +1,568 @@
1
+ # rubocop:disable all
2
+ $:.unshift './config'
3
+
4
+ require 'date'
5
+
6
+ # This fixes a bug in older versions of glibc, where name resolution under high load sometimes fails.
7
+ # We require this here, because indexing jobs don't load Rails initializers
8
+ require 'resolv-replace'
9
+
10
+ require 'traject'
11
+
12
+ require 'penn_lib/marc'
13
+ require 'penn_lib/code_mappings'
14
+
15
+ # Indexer for Franklin-native records (i.e. from Alma).
16
+ # This is also used as a parent class for Hathi and CRL
17
+ # since the vast majority of the indexing rules are the same.
18
+ # Overrideable field definitions should go into define_* methods
19
+ # and be called in this constructor.
20
+ class FranklinIndexer < BaseIndexer
21
+
22
+ # this mixin defines lambda factory method get_format for legacy marc formats
23
+ include Blacklight::Marc::Indexer::Formats
24
+ include BlacklightSolrplugins::Indexer
25
+
26
+ # This behaves like the wrapped MARC::Record object it contains
27
+ # except that the #each method filters out fields with non-standard tags.
28
class PlainMarcRecord
  # Standard MARC tags are exactly three digits. The pattern is anchored on the
  # whole string (\A...\z) so a tag containing a newline cannot match line-wise.
  VALID_TAG_REGEX = /\A\d{3}\z/.freeze

  # @param record [Object] the record to wrap; it must respond to #fields, and
  #   every other message is forwarded to it
  def initialize(record)
    @record = record
  end

  # Forwards any message this wrapper does not define to the wrapped record.
  # The caller's block is forwarded as well, so block-taking methods of the
  # record keep working through the wrapper.
  def method_missing(name, *args, &block)
    @record.send(name, *args, &block)
  end

  # Keeps #respond_to? and #method consistent with what #method_missing forwards.
  def respond_to_missing?(name, include_private = false)
    @record.respond_to?(name, include_private) || super
  end

  # Yields only the fields whose tag is a standard three-digit tag.
  #
  # @yieldparam field [Object] a field of the wrapped record
  # @return [Enumerator] when called without a block
  def each
    return enum_for(:each) unless block_given?

    @record.fields.each do |field|
      yield field if VALID_TAG_REGEX.match?(field.tag)
    end
  end
end
45
+
46
+ # Filter out enriched fields from ALMA because a lot of them can cause
47
+ # the stored MARC XML in Solr to exceed max field size. Note that the
48
+ # marc_view partial filters out non-standard MARC tags on display side too.
49
+ # @return [Proc] proc object to be used by traject
50
def get_plain_marc_xml
  # The record is wrapped in PlainMarcRecord before being encoded, and the
  # resulting XML is appended to the accumulator.
  ->(marc_record, out) { out << MARC::FastXMLWriter.encode(PlainMarcRecord.new(marc_record)) }
end
55
+
56
# Configures the indexer settings for writing to Solr, then registers every
# field definition through #define_all_fields.
#
# The Solr JSON update URL is built from the SOLR_URL environment variable.
#
# @raise [KeyError] if the SOLR_URL environment variable is not set
def initialize
  super

  # Extra params appended to the Solr update URL for Solr-side cross reference
  # handling and duplicate ID deletion.
  #
  # NOTE: the 'uid-distrib' processor (formerly switched on through the
  # SOLR_USE_UID_DISTRIB_PROCESSOR environment variable) is disabled; deletion
  # is handled outside of Solr, either permanently or pending bug fixes.
  processors = %w[xref-copyfield fl-multiplex shingles id_hash]

  # ENV.fetch fails fast with a descriptive KeyError when SOLR_URL is missing,
  # rather than a NoMethodError raised by calling #chomp on nil.
  solr_base = ENV.fetch('SOLR_URL').chomp('/')
  solr_update_url = "#{solr_base}/update/json?processor=#{processors.join(',')}"

  settings do
    # type may be 'binary', 'xml', or 'json'
    provide 'marc_source.type', 'xml'
    # set this to be non-negative if threshold should be enforced
    provide 'solr_writer.max_skipped', -1

    provide 'solr.update_url', solr_update_url

    store 'writer_class_name', 'PennLib::FranklinSolrJsonWriter'

    # uncomment these lines to write to a file
    # store "writer_class_name", "Traject::JsonWriter"
    # store 'output_file', "traject_output.json"

    if defined? JRUBY_VERSION
      # 'store' overrides existing settings, 'provide' does not
      store 'reader_class_name', 'Traject::Marc4JReader'
      store 'solr_writer.thread_pool', 4
      store 'processing_thread_pool', 4
    end

    store 'solr_writer.commit_on_close', false
    store 'solr_writer.batch_size', 2000
  end

  define_all_fields
end
97
+
98
# Registers every field definition of this indexer, in the order written here.
# Overrideable definitions live in the define_* methods; everything else is
# declared inline with to_field.
#
# NOTE(review): the each_record step that fills ctx.clipboard is registered
# before the fields that read from the clipboard; presumably steps run in
# registration order, so keep that relative order when editing.
def define_all_fields
  define_id
  define_grouped_id
  define_record_source_id
  define_record_source_facet

  to_field('nocirc_f_stored') { |marc, out| out << pennlibmarc.items_nocirc(marc) }

  define_mms_id
  define_oclc_id
  define_cluster_id
  define_full_text_link_text_a

  # do NOT use *_xml_stored_single because it uses a Str (max 32k) for storage
  to_field('marcrecord_xml_stored_single_large', get_plain_marc_xml)

  # Our keyword searches use pf/qf to search multiple fields, so a combined
  # text field is not needed; the definition is kept here, disabled, just in case.
  #
  # to_field "text_search", extract_all_marc_values do |r, acc|
  #   acc.unshift(r['001'].try(:value))
  #   acc.replace [acc.join(' ')] # turn it into a single string
  # end

  define_access_facet

  to_field('format_f_stored') { |marc, out| out.concat(pennlibmarc.get_format(marc)) }

  author_creator_spec = %w[
    100abcdjq
    110abcdjq
    700abcdjq
    710abcdjq
    800abcdjq
    810abcdjq
    111abcen
    711abcen
    811abcen
  ].join(':')

  # "author_creator_f" is now automatically copied on the Solr side:
  # to_field "author_creator_f", extract_marc(author_creator_spec, :trim_punctuation => true)

  # TODO: xfacet field, do not migrate
  to_field('author_creator_xfacet2_input', extract_marc(author_creator_spec, trim_punctuation: true)) do |_marc, names|
    names.map! { |name| 'n' + name }
  end

  # "subject_f_stored" is now automatically copied on the Solr side:
  # to_field 'subject_f_stored' do |rec, acc|
  #   acc.concat(pennlibmarc.get_subject_facet_values(rec))
  # end

  to_field('db_type_f_stored') { |marc, out| out.concat(pennlibmarc.get_db_types(marc)) }
  to_field('db_category_f_stored') { |marc, out| out.concat(pennlibmarc.get_db_categories(marc)) }
  to_field('db_subcategory_f_stored') { |marc, out| out.concat(pennlibmarc.get_db_subcategories(marc)) }
  to_field('subject_search') { |marc, out| out.concat(pennlibmarc.get_subject_search_values(marc)) }
  to_field('toplevel_subject_f') { |marc, out| out.concat(pennlibmarc.get_subject_facet_values(marc, true)) }

  # TODO: xfacet field, do not migrate
  to_field('call_number_xfacet') { |marc, out| out.concat(pennlibmarc.get_call_number_xfacet_values(marc)) }

  to_field('language_f_stored') { |marc, out| out.concat(pennlibmarc.get_language_values(marc)) }
  to_field('language_search') { |marc, out| out.concat(pennlibmarc.get_language_values(marc)) }
  to_field('library_f_stored') { |marc, out| out.concat(pennlibmarc.get_library_values(marc)) }
  to_field('specific_location_f_stored') { |marc, out| out.concat(pennlibmarc.get_specific_location_values(marc)) }
  to_field('classification_f_stored') { |marc, out| out.concat(pennlibmarc.get_classification_values(marc)) }
  to_field('genre_f_stored') { |marc, out| out.concat(pennlibmarc.get_genre_values(marc)) }
  to_field('genre_search') { |marc, out| out.concat(pennlibmarc.get_genre_search_values(marc)) }

  # Title fields

  to_field('title_1_search') { |marc, out| out.concat(pennlibmarc.get_title_1_search_values(marc)) }
  to_field('title_2_search') { |marc, out| out.concat(pennlibmarc.get_title_2_search_values(marc)) }
  to_field('journal_title_1_search') { |marc, out| out.concat(pennlibmarc.get_journal_title_1_search_values(marc)) }
  to_field('journal_title_2_search') { |marc, out| out.concat(pennlibmarc.get_journal_title_2_search_values(marc)) }
  to_field('author_creator_1_search') { |marc, out| out.concat(pennlibmarc.get_author_creator_1_search_values(marc)) }
  to_field('author_creator_2_search') { |marc, out| out.concat(pennlibmarc.get_author_creator_2_search_values(marc)) }
  to_field('author_creator_a') { |marc, out| out.concat(pennlibmarc.get_author_creator_values(marc)) }
  to_field('author_880_a') { |marc, out| out.concat(pennlibmarc.get_author_880_values(marc)) }
  to_field('title') { |marc, out| out.concat(pennlibmarc.get_title_values(marc)) }
  to_field('title_880_a') { |marc, out| out.concat(pennlibmarc.get_title_880_values(marc)) }
  to_field('standardized_title_a') { |marc, out| out.concat(pennlibmarc.get_standardized_title_values(marc)) }

  # TODO: xfacet field, do not migrate
  to_field('title_xfacet') { |marc, out| out.concat(pennlibmarc.get_title_xfacet_values(marc)) }

  to_field('title_nssort') { |marc, out| out.concat(pennlibmarc.get_title_sort_values(marc)) }

  to_field('title_sort_tl') do |marc, out|
    out.concat(pennlibmarc.get_title_sort_filing_parts(marc, false))
    pennlibmarc.append_title_variants(marc, out)
  end

  # Author fields

  to_field('author_creator_nssort') { |marc, out| out.concat(pennlibmarc.get_author_creator_sort_values(marc)) }
  to_field('edition') { |marc, out| out.concat(pennlibmarc.get_edition_values(marc)) }
  to_field('conference_a') { |marc, out| out.concat(pennlibmarc.get_conference_values(marc)) }
  to_field('series') { |marc, out| out.concat(pennlibmarc.get_series_values(marc)) }
  to_field('publication_a') { |marc, out| out.concat(pennlibmarc.get_publication_values(marc)) }
  to_field('contained_within_a') { |marc, out| out.concat(pennlibmarc.get_contained_within_values(marc)) }

  # Single-valued sort fields: the value is only added when one was returned.
  to_field('elvl_rank_isort') do |marc, out|
    rank = pennlibmarc.get_encoding_level_rank(marc)
    out << rank if rank
  end

  to_field('hld_count_isort') do |marc, out|
    holdings_count = pennlibmarc.get_hld_count(marc)
    out << holdings_count if holdings_count
  end

  to_field('itm_count_isort') do |marc, out|
    items_count = pennlibmarc.get_itm_count(marc)
    out << items_count if items_count
  end

  to_field('empty_hld_count_isort') do |marc, out|
    empty_holdings_count = pennlibmarc.get_empty_hld_count(marc)
    out << empty_holdings_count if empty_holdings_count
  end

  to_field('prt_count_isort') do |marc, out|
    portfolio_count = pennlibmarc.get_prt_count(marc)
    out << portfolio_count if portfolio_count
  end

  # Compute timestamps, dates and subjects once per record and park them on the
  # clipboard; the fields below read them from there.
  each_record do |marc, ctx|
    clipboard = ctx.clipboard
    clipboard[:timestamps] = pennlibmarc.prepare_timestamps(marc)
    clipboard[:dates] = pennlibmarc.prepare_dates(marc)
    clipboard[:subjects] = PennLib::SubjectConfig.prepare_subjects(marc)
  end

  # All browseable/facetable subject types are multiplexed through this field; for corresponding
  # display, these values are then mapped Solr-side to the `*_subject_stored_a` fields below. Those
  # fields are still configured directly below, for storage of values that should be displayed but
  # not be directly browseable/facetable.
  # TODO: while we should not migrate this field directly, we need to
  # ensure that the copyfield behavior is incorporated into our indexer
  to_field('subject_xfacet2_input') do |_marc, out, ctx|
    headings = ctx.clipboard.dig(:subjects, :xfacet)
    out.concat(headings) if headings
  end

  # The fields below exist because some values appear in _display_, but should not be _directly_
  # browseable/facetable (except perhaps as xrefs).
  # Note, this is a step towards consolidation/consistency in management of subjects generally;
  # some choices are preserved here initially for functional backward compatibility, although part
  # of the preserved behavior is of questionable merit. Namely, the fields below allow the display
  # of fields that will be links, but which will in some cases not be present in the linked
  # "browse" view. We'll take this one step at a time, consolidating first with minimal behavioral
  # changes; some of the preserved behavior may be ripe for reconsideration.
  # BEGIN STORED SUBJECTS
  to_field('lcsh_subject_stored_a') do |_marc, out, ctx|
    headings = ctx.clipboard.dig(:subjects, :stored_lcsh)
    out.concat(headings) if headings
  end

  to_field('childrens_subject_stored_a') do |_marc, out, ctx|
    headings = ctx.clipboard.dig(:subjects, :stored_childrens)
    out.concat(headings) if headings
  end

  to_field('mesh_subject_stored_a') do |_marc, out, ctx|
    headings = ctx.clipboard.dig(:subjects, :stored_mesh)
    out.concat(headings) if headings
  end

  to_field('local_subject_stored_a') do |_marc, out, ctx|
    headings = ctx.clipboard.dig(:subjects, :stored_local)
    out.concat(headings) if headings
  end
  # END STORED SUBJECTS

  to_field('recently_added_isort') do |_marc, out, ctx|
    added_at = ctx.clipboard.dig(:timestamps, :most_recent_add)
    out << added_at if added_at
  end

  to_field('last_update_isort') do |_marc, out, ctx|
    updated_at = ctx.clipboard.dig(:timestamps, :last_update)
    out << updated_at if updated_at
  end

  to_field('publication_date_ssort') do |_marc, out, ctx|
    sort_date = ctx.clipboard.dig(:dates, :pub_date_sort)
    out << sort_date if sort_date
  end

  to_field('pub_min_dtsort') do |_marc, out, ctx|
    earliest = ctx.clipboard.dig(:dates, :pub_date_minsort)
    out << earliest if earliest
  end

  to_field('pub_max_dtsort') do |_marc, out, ctx|
    latest = ctx.clipboard.dig(:dates, :pub_date_maxsort)
    out << latest if latest
  end

  to_field('content_min_dtsort') do |_marc, out, ctx|
    earliest = ctx.clipboard.dig(:dates, :content_date_minsort)
    out << earliest if earliest
  end

  to_field('content_max_dtsort') do |_marc, out, ctx|
    latest = ctx.clipboard.dig(:dates, :content_date_maxsort)
    out << latest if latest
  end

  to_field('publication_date_f_stored') do |_marc, out, ctx|
    decade = ctx.clipboard.dig(:dates, :pub_date_decade)
    out << decade if decade
  end

  to_field('publication_dr') do |_marc, out, ctx|
    date_range = ctx.clipboard.dig(:dates, :pub_date_range)
    out << date_range if date_range
  end

  to_field('content_dr') do |_marc, out, ctx|
    date_range = ctx.clipboard.dig(:dates, :content_date_range)
    out << date_range if date_range
  end

  # Store the normalized forms together with the values as extracted, de-duplicated.
  to_field('isbn_isxn_stored', extract_marc(%w[020az 022alz], separator: nil)) do |_marc, out|
    extracted = out.dup
    out.map! { |number| StdNum::ISBN.allNormalizedValues(number) }
    out.push(extracted).flatten!
    out.uniq!
  end

  to_field('call_number_search') { |marc, out| out.concat(pennlibmarc.get_call_number_search_values(marc)) }

  to_field('physical_holdings_json') do |marc, out|
    holdings = pennlibmarc.get_physical_holdings(marc)
    out << holdings.to_json if holdings.present?
  end

  to_field('electronic_holdings_json') do |marc, out|
    holdings = pennlibmarc.get_electronic_holdings(marc)
    out << holdings.to_json if holdings.present?
  end

  # store IDs of associated boundwith records, where the actual holdings are attached.
  # this is a multi-valued field because a bib may have multiple copies, each associated
  # with a different boundwith record (a few such cases do exist).
  # we use this to pass to the Availability API.
  to_field('bound_with_ids_a') { |marc, out| out.concat(pennlibmarc.get_bound_with_id_values(marc)) }

  to_field('conference_search') { |marc, out| out.concat(pennlibmarc.get_conference_search_values(marc)) }
  to_field('contents_note_search') { |marc, out| out.concat(pennlibmarc.get_contents_note_search_values(marc)) }
  to_field('corporate_author_search') { |marc, out| out.concat(pennlibmarc.get_corporate_author_search_values(marc)) }

  to_field('place_of_publication_search', extract_marc('260a:264|*1|a'))
  to_field('publisher_search', extract_marc('260b:264|*1|b'))
  to_field('pubnum_search', extract_marc('024a:028a'))

  to_field('series_search') { |marc, out| out.concat(pennlibmarc.get_series_search_values(marc)) }
end
482
+
483
# Returns the memoized PennLib::Marc helper, building it (and the code mappings
# it is constructed with) on first use.
def pennlibmarc
  @code_mappings ||= begin
    translation_maps_dir = Rails.root.join('config', 'translation_maps')
    PennLib::CodeMappings.new(translation_maps_dir)
  end
  @pennlibmarc ||= PennLib::Marc.new(@code_mappings)
end
487
+
488
# Registers the "id" field: each value extracted from MARC 001 is prefixed with
# "FRANKLIN_". Boundwith records are skipped here.
def define_id
  to_field('id', trim(extract_marc('001'), first: true)) do |marc, ids, ctx|
    ids.map! { |control_number| "FRANKLIN_#{control_number}" }

    # we do this check in the first 'id' field so that it happens early
    ctx.skip!("Skipping boundwith record #{ids.first}") if pennlibmarc.is_boundwith_record(marc)
  end
end
498
+
499
# Registers the "alma_mms_id" field from MARC 001, passed through the trim
# helper with first: true.
def define_mms_id
  to_field('alma_mms_id', trim(extract_marc('001'), first: true))
end
502
+
503
# Registers the "access_f_stored" field with the values returned by
# pennlibmarc.get_access_values.
def define_access_facet
  to_field('access_f_stored') { |marc, out| out.concat(pennlibmarc.get_access_values(marc)) }
end
508
+
509
# Registers the "oclc_id" field with the first value returned by
# pennlibmarc.get_oclc_id_values; nothing is added when that list is empty.
def define_oclc_id
  to_field('oclc_id') do |marc, out|
    oclc_numbers = pennlibmarc.get_oclc_id_values(marc)
    next if oclc_numbers.empty?

    out << oclc_numbers.first
  end
end
515
+
516
# Returns the cluster ID for a record.
#
# The first value of pennlibmarc.get_oclc_id_values is used when there is one;
# otherwise an integer is derived from the MD5 digest of the record's 001 value.
#
# @param rec [Object] the record; only #fields('001') is called on it here
# @return [Object, Integer, nil] the first OCLC ID value, else the integer built
#   from the digest, or nil when the record has neither an OCLC ID nor a 001 field
def get_cluster_id(rec)
  oclc_id = pennlibmarc.get_oclc_id_values(rec).first
  return oclc_id if oclc_id

  control_number = rec.fields('001').first&.value
  # Without a 001 value there is nothing to hash, and Digest::MD5.hexdigest(nil)
  # would raise a TypeError.
  return nil unless control_number

  # first 16 hex digits = first 8 bytes. construct an int out of that hex str.
  Digest::MD5.hexdigest(control_number)[0, 16].hex
end
524
+
525
# Registers the "cluster_id" field with the value of #get_cluster_id.
def define_cluster_id
  to_field('cluster_id') { |marc, out| out << get_cluster_id(marc) }
end
530
+
531
# Registers the "grouped_id" field: "FRANKLIN_<001 value>", prefixed with
# "<first OCLC ID>!" when the record has an OCLC ID. A warning is printed when
# more than one OCLC ID is found.
def define_grouped_id
  to_field('grouped_id', trim(extract_marc('001'), first: true)) do |marc, ids, _ctx|
    oclc_numbers = pennlibmarc.get_oclc_id_values(marc)
    ids.map! do |control_number|
      puts 'Warning: Multiple OCLC IDs found, using the first one' if oclc_numbers.size > 1
      primary_oclc = oclc_numbers.first
      if primary_oclc.present?
        "#{primary_oclc}!FRANKLIN_#{control_number}"
      else
        "FRANKLIN_#{control_number}"
      end
    end
  end
end
544
+
545
# Registers the "record_source_id" field with the RecordSource::PENN constant.
def define_record_source_id
  to_field('record_source_id') { |_marc, out| out.push(RecordSource::PENN) }
end
550
+
551
# Registers the "record_source_f" field: always 'Penn', plus 'HathiTrust' when
# pennlibmarc.is_etas is truthy for the record.
def define_record_source_facet
  to_field('record_source_f') do |marc, out|
    out.push('Penn')
    out.push('HathiTrust') if pennlibmarc.is_etas(marc)
  end
end
557
+
558
# Registers the "full_text_link_text_a" field with the JSON form of
# pennlibmarc.get_full_text_link_values, when that result is present.
def define_full_text_link_text_a
  to_field('full_text_link_text_a') do |marc, out|
    links = pennlibmarc.get_full_text_link_values(marc)
    out << links.to_json if links.present?
  end
end
566
+
567
+ end
568
+ # rubocop:enable all