pennmarc 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +6 -0
- data/.rspec +2 -0
- data/.ruby-version +1 -0
- data/Gemfile +23 -0
- data/Gemfile.lock +119 -0
- data/README.md +82 -0
- data/legacy/indexer.rb +568 -0
- data/legacy/marc.rb +2964 -0
- data/legacy/test_file_output.json +49 -0
- data/lib/pennmarc/encoding_level.rb +43 -0
- data/lib/pennmarc/enriched_marc.rb +36 -0
- data/lib/pennmarc/heading_control.rb +11 -0
- data/lib/pennmarc/helpers/citation.rb +31 -0
- data/lib/pennmarc/helpers/creator.rb +237 -0
- data/lib/pennmarc/helpers/database.rb +89 -0
- data/lib/pennmarc/helpers/date.rb +85 -0
- data/lib/pennmarc/helpers/edition.rb +90 -0
- data/lib/pennmarc/helpers/format.rb +312 -0
- data/lib/pennmarc/helpers/genre.rb +71 -0
- data/lib/pennmarc/helpers/helper.rb +11 -0
- data/lib/pennmarc/helpers/identifier.rb +134 -0
- data/lib/pennmarc/helpers/language.rb +37 -0
- data/lib/pennmarc/helpers/link.rb +12 -0
- data/lib/pennmarc/helpers/location.rb +97 -0
- data/lib/pennmarc/helpers/note.rb +132 -0
- data/lib/pennmarc/helpers/production.rb +131 -0
- data/lib/pennmarc/helpers/relation.rb +135 -0
- data/lib/pennmarc/helpers/series.rb +118 -0
- data/lib/pennmarc/helpers/subject.rb +304 -0
- data/lib/pennmarc/helpers/title.rb +197 -0
- data/lib/pennmarc/mappings/language.yml +516 -0
- data/lib/pennmarc/mappings/locations.yml +1801 -0
- data/lib/pennmarc/mappings/relator.yml +263 -0
- data/lib/pennmarc/parser.rb +177 -0
- data/lib/pennmarc/util.rb +240 -0
- data/lib/pennmarc.rb +6 -0
- data/pennmarc.gemspec +22 -0
- data/spec/fixtures/marcxml/test.xml +167 -0
- data/spec/lib/pennmarc/helpers/citation_spec.rb +27 -0
- data/spec/lib/pennmarc/helpers/creator_spec.rb +183 -0
- data/spec/lib/pennmarc/helpers/database_spec.rb +60 -0
- data/spec/lib/pennmarc/helpers/date_spec.rb +105 -0
- data/spec/lib/pennmarc/helpers/edition_spec.rb +38 -0
- data/spec/lib/pennmarc/helpers/format_spec.rb +200 -0
- data/spec/lib/pennmarc/helpers/genre_spec.rb +89 -0
- data/spec/lib/pennmarc/helpers/identifer_spec.rb +105 -0
- data/spec/lib/pennmarc/helpers/language_spec.rb +30 -0
- data/spec/lib/pennmarc/helpers/location_spec.rb +70 -0
- data/spec/lib/pennmarc/helpers/note_spec.rb +233 -0
- data/spec/lib/pennmarc/helpers/production_spec.rb +193 -0
- data/spec/lib/pennmarc/helpers/relation_spec.rb +120 -0
- data/spec/lib/pennmarc/helpers/subject_spec.rb +262 -0
- data/spec/lib/pennmarc/helpers/title_spec.rb +169 -0
- data/spec/lib/pennmarc/marc_util_spec.rb +206 -0
- data/spec/lib/pennmarc/parser_spec.rb +13 -0
- data/spec/spec_helper.rb +104 -0
- data/spec/support/marc_spec_helpers.rb +84 -0
- metadata +171 -0
data/legacy/indexer.rb
ADDED
@@ -0,0 +1,568 @@
|
|
1
|
+
# rubocop:disable all
|
2
|
+
$:.unshift './config'
|
3
|
+
|
4
|
+
require 'date'
|
5
|
+
|
6
|
+
# This fixes a bug in older versions of glibc, where name resolution under high load sometimes fails.
|
7
|
+
# We require this here, because indexing jobs don't load Rails initializers
|
8
|
+
require 'resolv-replace'
|
9
|
+
|
10
|
+
require 'traject'
|
11
|
+
|
12
|
+
require 'penn_lib/marc'
|
13
|
+
require 'penn_lib/code_mappings'
|
14
|
+
|
15
|
+
# Indexer for Franklin-native records (i.e. from Alma).
|
16
|
+
# This is also used as a parent class for Hathi and CRL
|
17
|
+
# since the vast majority of the indexing rules are the same.
|
18
|
+
# Overrideable field definitions should go into define_* methods
|
19
|
+
# and called in this constructor.
|
20
|
+
class FranklinIndexer < BaseIndexer

  # This mixin defines the lambda factory method get_format for legacy marc formats
  include Blacklight::Marc::Indexer::Formats
  include BlacklightSolrplugins::Indexer

  # This behaves like the wrapped MARC::Record object it contains
  # except that the #each method filters out fields with non-standard tags.
  class PlainMarcRecord

    def initialize(record)
      @record = record
      # Standard MARC tags are exactly three digits; anything else is an
      # enriched/non-standard field that #each should skip.
      @valid_tag_regex = /^\d\d\d$/
    end

    # Forward every unknown message to the wrapped MARC::Record.
    def method_missing(*args)
      @record.send(*args)
    end

    # Keep #respond_to? truthful for the messages method_missing forwards.
    def respond_to_missing?(name, include_private = false)
      @record.respond_to?(name, include_private) || super
    end

    # Yield only fields carrying a standard three-digit tag.
    def each
      @record.fields.each do |field|
        yield field if field.tag =~ @valid_tag_regex
      end
    end
  end

  # Filter out enriched fields from ALMA because a lot of them can cause
  # the stored MARC XML in Solr to exceed max field size. Note that the
  # marc_view partial filters out non-standard MARC tags on display side too.
  # @return [Proc] proc object to be used by traject
  def get_plain_marc_xml
    lambda do |record, accumulator|
      accumulator << MARC::FastXMLWriter.encode(PlainMarcRecord.new(record))
    end
  end

  def initialize
    super

    # append extra params to the Solr update URL for solr-side cross reference handling
    # and duplicate ID deletion
    processors = [ 'xref-copyfield', 'fl-multiplex', 'shingles', 'id_hash' ]
    if ENV['SOLR_USE_UID_DISTRIB_PROCESSOR']
      # disable; handle deletion outside of solr, either permanently or pending bug fixes
      #processors << 'uid-distrib'
    end

    solr_update_url = [ ENV['SOLR_URL'].chomp('/'), 'update', 'json' ].join('/') + "?processor=#{processors.join(',')}"

    settings do
      # type may be 'binary', 'xml', or 'json'
      provide "marc_source.type", "xml"
      # set this to be non-negative if threshold should be enforced
      provide 'solr_writer.max_skipped', -1

      provide 'solr.update_url', solr_update_url

      store 'writer_class_name', 'PennLib::FranklinSolrJsonWriter'

      # uncomment these lines to write to a file
      # store "writer_class_name", "Traject::JsonWriter"
      # store 'output_file', "traject_output.json"

      if defined? JRUBY_VERSION
        # 'store' overrides existing settings, 'provide' does not
        store 'reader_class_name', "Traject::Marc4JReader"
        store 'solr_writer.thread_pool', 4
        store 'processing_thread_pool', 4
      end

      store 'solr_writer.commit_on_close', false
      store 'solr_writer.batch_size', 2000
    end

    define_all_fields
  end

  # Register every Solr field rule. Overrideable field definitions live in
  # define_* methods so subclasses (Hathi, CRL) can replace them selectively.
  def define_all_fields

    define_id

    define_grouped_id

    define_record_source_id

    define_record_source_facet

    to_field 'nocirc_f_stored' do |rec, acc|
      acc << pennlibmarc.items_nocirc(rec)
    end

    define_mms_id

    define_oclc_id

    define_cluster_id

    define_full_text_link_text_a

    # do NOT use *_xml_stored_single because it uses a Str (max 32k) for storage
    to_field 'marcrecord_xml_stored_single_large', get_plain_marc_xml

    # Our keyword searches use pf/qf to search multiple fields, so
    # we don't need this field; leaving it commented out here just in case.
    #
    # to_field "text_search", extract_all_marc_values do |r, acc|
    #   acc.unshift(r['001'].try(:value))
    #   acc.replace [acc.join(' ')] # turn it into a single string
    # end

    define_access_facet

    to_field 'format_f_stored' do |rec, acc|
      acc.concat(pennlibmarc.get_format(rec))
    end

    author_creator_spec = %W{
      100abcdjq
      110abcdjq
      700abcdjq
      710abcdjq
      800abcdjq
      810abcdjq
      111abcen
      711abcen
      811abcen
    }.join(':')

    # this is now automatically copied on the Solr side
    # to_field "author_creator_f", extract_marc(author_creator_spec, :trim_punctuation => true)

    # TODO: xfacet field, do not migrate
    to_field 'author_creator_xfacet2_input', extract_marc(author_creator_spec, :trim_punctuation => true) do |r, acc|
      acc.map! { |v| 'n' + v }
    end

    # this is now automatically copied on the Solr side
    # to_field 'subject_f_stored' do |rec, acc|
    #   acc.concat(pennlibmarc.get_subject_facet_values(rec))
    # end

    to_field "db_type_f_stored" do |rec, acc|
      acc.concat(pennlibmarc.get_db_types(rec))
    end

    to_field "db_category_f_stored" do |rec, acc|
      acc.concat(pennlibmarc.get_db_categories(rec))
    end

    to_field "db_subcategory_f_stored" do |rec, acc|
      acc.concat(pennlibmarc.get_db_subcategories(rec))
    end

    to_field 'subject_search' do |rec, acc|
      acc.concat(pennlibmarc.get_subject_search_values(rec))
    end

    to_field 'toplevel_subject_f' do |rec, acc|
      acc.concat(pennlibmarc.get_subject_facet_values(rec, true))
    end

    # TODO: xfacet field, do not migrate
    to_field 'call_number_xfacet' do |rec, acc|
      acc.concat(pennlibmarc.get_call_number_xfacet_values(rec))
    end

    to_field "language_f_stored" do |rec, acc|
      acc.concat(pennlibmarc.get_language_values(rec))
    end

    to_field "language_search" do |rec, acc|
      acc.concat(pennlibmarc.get_language_values(rec))
    end

    to_field "library_f_stored" do |rec, acc|
      acc.concat(pennlibmarc.get_library_values(rec))
    end

    to_field "specific_location_f_stored" do |rec, acc|
      acc.concat(pennlibmarc.get_specific_location_values(rec))
    end

    to_field "classification_f_stored" do |rec, acc|
      acc.concat(pennlibmarc.get_classification_values(rec))
    end

    to_field "genre_f_stored" do |rec, acc|
      acc.concat(pennlibmarc.get_genre_values(rec))
    end

    to_field "genre_search" do |rec, acc|
      acc.concat(pennlibmarc.get_genre_search_values(rec))
    end

    # Title fields

    to_field 'title_1_search' do |rec, acc|
      acc.concat(pennlibmarc.get_title_1_search_values(rec))
    end

    to_field 'title_2_search' do |rec, acc|
      acc.concat(pennlibmarc.get_title_2_search_values(rec))
    end

    to_field 'journal_title_1_search' do |rec, acc|
      acc.concat(pennlibmarc.get_journal_title_1_search_values(rec))
    end

    to_field 'journal_title_2_search' do |rec, acc|
      acc.concat(pennlibmarc.get_journal_title_2_search_values(rec))
    end

    to_field 'author_creator_1_search' do |rec, acc|
      acc.concat(pennlibmarc.get_author_creator_1_search_values(rec))
    end

    to_field 'author_creator_2_search' do |rec, acc|
      acc.concat(pennlibmarc.get_author_creator_2_search_values(rec))
    end

    to_field 'author_creator_a' do |rec, acc|
      acc.concat(pennlibmarc.get_author_creator_values(rec))
    end

    to_field 'author_880_a' do |rec, acc|
      acc.concat(pennlibmarc.get_author_880_values(rec))
    end

    to_field 'title' do |rec, acc|
      acc.concat(pennlibmarc.get_title_values(rec))
    end

    to_field 'title_880_a' do |rec, acc|
      acc.concat(pennlibmarc.get_title_880_values(rec))
    end

    to_field 'standardized_title_a' do |rec, acc|
      acc.concat(pennlibmarc.get_standardized_title_values(rec))
    end

    # TODO: xfacet field, do not migrate
    to_field 'title_xfacet' do |rec, acc|
      acc.concat(pennlibmarc.get_title_xfacet_values(rec))
    end

    to_field 'title_nssort' do |rec, acc|
      acc.concat(pennlibmarc.get_title_sort_values(rec))
    end

    to_field 'title_sort_tl' do |rec, acc|
      acc.concat(pennlibmarc.get_title_sort_filing_parts(rec, false))
      pennlibmarc.append_title_variants(rec, acc)
    end

    # Author fields

    to_field 'author_creator_nssort' do |rec, acc|
      acc.concat(pennlibmarc.get_author_creator_sort_values(rec))
    end

    to_field 'edition' do |rec, acc|
      acc.concat(pennlibmarc.get_edition_values(rec))
    end

    to_field 'conference_a' do |rec, acc|
      acc.concat(pennlibmarc.get_conference_values(rec))
    end

    to_field 'series' do |rec, acc|
      acc.concat(pennlibmarc.get_series_values(rec))
    end

    to_field 'publication_a' do |rec, acc|
      acc.concat(pennlibmarc.get_publication_values(rec))
    end

    to_field 'contained_within_a' do |rec, acc|
      acc.concat(pennlibmarc.get_contained_within_values(rec))
    end

    to_field 'elvl_rank_isort' do |rec, acc|
      val = pennlibmarc.get_encoding_level_rank(rec)
      acc << val if val
    end

    to_field 'hld_count_isort' do |rec, acc|
      val = pennlibmarc.get_hld_count(rec)
      acc << val if val
    end

    to_field 'itm_count_isort' do |rec, acc|
      val = pennlibmarc.get_itm_count(rec)
      acc << val if val
    end

    to_field 'empty_hld_count_isort' do |rec, acc|
      val = pennlibmarc.get_empty_hld_count(rec)
      acc << val if val
    end

    to_field 'prt_count_isort' do |rec, acc|
      val = pennlibmarc.get_prt_count(rec)
      acc << val if val
    end

    # Precompute per-record data once and stash it on the traject clipboard
    # so the to_field rules below can read it without re-parsing the record.
    each_record do |rec, ctx|
      ctx.clipboard.tap do |c|
        c[:timestamps] = pennlibmarc.prepare_timestamps(rec)
        c[:dates] = pennlibmarc.prepare_dates(rec)
        c[:subjects] = PennLib::SubjectConfig.prepare_subjects(rec)
      end
    end

    # All browseable/facetable subject types are multiplexed through this field; for corresponding display,
    # these values are then mapped Solr-side to the `*_subject_stored_a` fields below. The fields are still
    # directly configured below for storage of values that should be displayed, but not directly
    # browseable/facetable
    # TODO: while we should not migrate this field directly, we need to
    # ensure that the copyfield behavior is incorporated into our indexer
    to_field 'subject_xfacet2_input' do |rec, acc, ctx|
      val = ctx.clipboard.dig(:subjects, :xfacet)
      acc.concat(val) if val
    end

    # The fields below exist because there are some values that appear in _display_, but should not be
    # _directly_ browseable/facetable (except perhaps as xrefs).
    # Note, this is a step towards consolidation/consistency in management of subjects generally; there are
    # choices that are preserved here initially for functional backward compatibility, but some of the behavior
    # we're preserving is of questionable merit. Namely, the fields below allow the display of fields that will
    # be links, but which will in some cases not be present in the linked "browse" view. We'll take this one
    # step at a time, consolidating first with minimal behavioral changes; but note that some of the preserved
    # behavior may be ripe for reconsideration.
    # BEGIN STORED SUBJECTS
    to_field 'lcsh_subject_stored_a' do |rec, acc, ctx|
      val = ctx.clipboard.dig(:subjects, :stored_lcsh)
      acc.concat(val) if val
    end

    to_field 'childrens_subject_stored_a' do |rec, acc, ctx|
      val = ctx.clipboard.dig(:subjects, :stored_childrens)
      acc.concat(val) if val
    end

    to_field 'mesh_subject_stored_a' do |rec, acc, ctx|
      val = ctx.clipboard.dig(:subjects, :stored_mesh)
      acc.concat(val) if val
    end

    to_field 'local_subject_stored_a' do |rec, acc, ctx|
      val = ctx.clipboard.dig(:subjects, :stored_local)
      acc.concat(val) if val
    end
    # END STORED SUBJECTS

    to_field 'recently_added_isort' do |rec, acc, ctx|
      val = ctx.clipboard.dig(:timestamps, :most_recent_add)
      acc << val if val
    end

    to_field 'last_update_isort' do |rec, acc, ctx|
      val = ctx.clipboard.dig(:timestamps, :last_update)
      acc << val if val
    end

    to_field 'publication_date_ssort' do |rec, acc, ctx|
      val = ctx.clipboard.dig(:dates, :pub_date_sort)
      acc << val if val
    end

    to_field 'pub_min_dtsort' do |rec, acc, ctx|
      val = ctx.clipboard.dig(:dates, :pub_date_minsort)
      acc << val if val
    end

    to_field 'pub_max_dtsort' do |rec, acc, ctx|
      val = ctx.clipboard.dig(:dates, :pub_date_maxsort)
      acc << val if val
    end

    to_field 'content_min_dtsort' do |rec, acc, ctx|
      val = ctx.clipboard.dig(:dates, :content_date_minsort)
      acc << val if val
    end

    to_field 'content_max_dtsort' do |rec, acc, ctx|
      val = ctx.clipboard.dig(:dates, :content_date_maxsort)
      acc << val if val
    end

    to_field 'publication_date_f_stored' do |rec, acc, ctx|
      val = ctx.clipboard.dig(:dates, :pub_date_decade)
      acc << val if val
    end

    to_field 'publication_dr' do |rec, acc, ctx|
      val = ctx.clipboard.dig(:dates, :pub_date_range)
      acc << val if val
    end

    to_field 'content_dr' do |rec, acc, ctx|
      val = ctx.clipboard.dig(:dates, :content_date_range)
      acc << val if val
    end

    # Index both the raw 020/022 values and every normalized ISBN form
    # (10- and 13-digit) so either style of query matches.
    to_field "isbn_isxn_stored", extract_marc(%W{020az 022alz}, :separator=>nil) do |rec, acc|
      orig = acc.dup
      acc.map!{|x| StdNum::ISBN.allNormalizedValues(x)}
      acc << orig
      acc.flatten!
      acc.uniq!
    end

    to_field 'call_number_search' do |rec, acc|
      acc.concat(pennlibmarc.get_call_number_search_values(rec))
    end

    to_field 'physical_holdings_json' do |rec, acc|
      result = pennlibmarc.get_physical_holdings(rec)
      if result.present?
        acc << result.to_json
      end
    end

    to_field 'electronic_holdings_json' do |rec, acc|
      result = pennlibmarc.get_electronic_holdings(rec)
      if result.present?
        acc << result.to_json
      end
    end

    # store IDs of associated boundwith records, where the actual holdings are attached.
    # this is a multi-valued field because a bib may have multiple copies, each associated
    # with a different boundwith record (a few such cases do exist).
    # we use this to pass to the Availability API.
    to_field 'bound_with_ids_a' do |rec, acc|
      acc.concat(pennlibmarc.get_bound_with_id_values(rec))
    end

    to_field 'conference_search' do |rec, acc|
      acc.concat(pennlibmarc.get_conference_search_values(rec))
    end

    to_field 'contents_note_search' do |rec, acc|
      acc.concat(pennlibmarc.get_contents_note_search_values(rec))
    end

    to_field 'corporate_author_search' do |rec, acc|
      acc.concat(pennlibmarc.get_corporate_author_search_values(rec))
    end

    to_field 'place_of_publication_search', extract_marc('260a:264|*1|a')

    to_field 'publisher_search', extract_marc('260b:264|*1|b')

    to_field 'pubnum_search', extract_marc('024a:028a')

    to_field 'series_search' do |rec, acc|
      acc.concat(pennlibmarc.get_series_search_values(rec))
    end

  end

  # Lazily build (and memoize) the shared PennLib::Marc helper used by
  # nearly every field rule above.
  def pennlibmarc
    @code_mappings ||= PennLib::CodeMappings.new(Rails.root.join('config').join('translation_maps'))
    @pennlibmarc ||= PennLib::Marc.new(@code_mappings)
  end

  def define_id
    to_field "id", trim(extract_marc("001"), :first => true) do |rec, acc, context|
      acc.map! { |id| "FRANKLIN_#{id}" }

      # we do this check in the first 'id' field so that it happens early
      if pennlibmarc.is_boundwith_record(rec)
        context.skip!("Skipping boundwith record #{acc.first}")
      end
    end
  end

  def define_mms_id
    to_field 'alma_mms_id', trim(extract_marc('001'), :first => true)
  end

  def define_access_facet
    to_field "access_f_stored" do |rec, acc|
      acc.concat(pennlibmarc.get_access_values(rec))
    end
  end

  def define_oclc_id
    to_field 'oclc_id' do |rec, acc|
      oclc_ids = pennlibmarc.get_oclc_id_values(rec)
      acc << oclc_ids.first unless oclc_ids.empty?
    end
  end

  # Cluster key for record grouping: the first OCLC ID when one exists,
  # otherwise a stable 64-bit integer derived from the 001 control number.
  # NOTE(review): relies on Digest being loaded by the host app; add
  # `require 'digest'` at the top of the file if run standalone.
  def get_cluster_id(rec)
    pennlibmarc.get_oclc_id_values(rec).first || begin
      id = rec.fields('001').take(1).map(&:value).first
      digest = Digest::MD5.hexdigest(id)
      # first 16 hex digits = first 8 bytes. construct an int out of that hex str.
      digest[0,16].hex
    end
  end

  def define_cluster_id
    to_field 'cluster_id' do |rec, acc|
      acc << get_cluster_id(rec)
    end
  end

  def define_grouped_id
    to_field 'grouped_id', trim(extract_marc('001'), :first => true) do |rec, acc, context|
      oclc_ids = pennlibmarc.get_oclc_id_values(rec)
      acc.map! { |id|
        if oclc_ids.size > 1
          puts 'Warning: Multiple OCLC IDs found, using the first one'
        end
        oclc_id = oclc_ids.first
        prefix = oclc_id.present? ? "#{oclc_id}!" : ''
        "#{prefix}FRANKLIN_#{id}"
      }
    end
  end

  def define_record_source_id
    to_field 'record_source_id' do |rec, acc|
      acc << RecordSource::PENN
    end
  end

  def define_record_source_facet
    to_field 'record_source_f' do |rec, acc|
      acc << 'Penn'
      acc << 'HathiTrust' if pennlibmarc.is_etas(rec)
    end
  end

  def define_full_text_link_text_a
    to_field 'full_text_link_text_a' do |rec, acc|
      result = pennlibmarc.get_full_text_link_values(rec)
      if result.present?
        acc << result.to_json
      end
    end
  end

end
|
568
|
+
# rubocop:enable all
|