folio-data-import 0.5.0b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,35 @@
1
+ """Custom exceptions for the Folio Data Import module."""
2
+
3
+
4
class FolioDataImportError(Exception):
    """Base class for all exceptions in the Folio Data Import module."""


class FolioDataImportBatchError(FolioDataImportError):
    """Exception raised for errors in the Folio Data Import batch process.

    Attributes:
        batch_id -- ID of the batch that caused the error
        message -- explanation of the error
        exception -- the underlying exception passed by the caller, or None
    """

    def __init__(self, batch_id, message, exception=None) -> None:
        self.batch_id = batch_id
        self.message = message
        # The triggering exception used to be accepted and then discarded;
        # keep it so handlers can inspect the root cause.
        self.exception = exception
        super().__init__(f"Unhandled error posting batch {batch_id}: {message}")


class FolioDataImportJobError(FolioDataImportError):
    """Exception raised for errors in the Folio Data Import job process.

    Attributes:
        job_id -- ID of the job that caused the error
        message -- explanation of the error
        exception -- the underlying exception passed by the caller, or None
    """

    def __init__(self, job_id, message, exception=None) -> None:
        self.job_id = job_id
        self.message = message
        # Same as the batch error: retain the root cause instead of dropping it.
        self.exception = exception
        super().__init__(f"Unhandled error processing job {job_id}: {message}")
@@ -0,0 +1,29 @@
1
+ """MARC preprocessors for data import into FOLIO."""
2
+
3
+ from ._preprocessors import (
4
+ MARCPreprocessor,
5
+ clean_999_fields,
6
+ clean_empty_fields,
7
+ clean_non_ff_999_fields,
8
+ fix_bib_leader,
9
+ move_authority_subfield_9_to_0_all_controllable_fields,
10
+ prepend_abes_prefix_001,
11
+ prepend_ppn_prefix_001,
12
+ prepend_prefix_001,
13
+ strip_999_ff_fields,
14
+ sudoc_supercede_prep,
15
+ )
16
+
17
+ __all__ = [
18
+ "MARCPreprocessor",
19
+ "clean_999_fields",
20
+ "clean_empty_fields",
21
+ "clean_non_ff_999_fields",
22
+ "fix_bib_leader",
23
+ "move_authority_subfield_9_to_0_all_controllable_fields",
24
+ "prepend_abes_prefix_001",
25
+ "prepend_ppn_prefix_001",
26
+ "prepend_prefix_001",
27
+ "strip_999_ff_fields",
28
+ "sudoc_supercede_prep",
29
+ ]
@@ -0,0 +1,517 @@
1
+ import importlib
2
+ import logging
3
+ import re
4
+ import sys
5
+ from typing import Callable, Dict, List, Tuple, Union
6
+
7
+ import pymarc
8
+ from pymarc.record import Record
9
+
10
+ logger = logging.getLogger("folio_data_import.MARCDataImport")
11
+
12
+
13
+ class MARCPreprocessor:
14
+ """
15
+ A class to preprocess MARC records for data import into FOLIO.
16
+ """
17
+
18
+ def __init__(self, preprocessors: Union[str, List[Callable]], **kwargs) -> None:
19
+ """
20
+ Initialize the MARCPreprocessor with a list of preprocessors.
21
+
22
+ Args:
23
+ preprocessors (Union[str, List[Callable]]): A string of comma-separated function names
24
+ or a list of callable preprocessor functions to apply.
25
+ """
26
+ self.preprocessor_args: Dict[str, Dict] = kwargs
27
+ self.preprocessors: List[Tuple[Callable, Dict]] = self._get_preprocessor_functions(
28
+ preprocessors
29
+ )
30
+ self.proc_kwargs = kwargs
31
+ self.record = None
32
+
33
+ def _get_preprocessor_args(self, func: Callable) -> Dict:
34
+ """
35
+ Get the arguments for the preprocessor function.
36
+
37
+ Args:
38
+ func (Callable): The preprocessor function.
39
+
40
+ Returns:
41
+ Dict: A dictionary of arguments for the preprocessor function.
42
+ """
43
+ func_path = f"{func.__module__}.{func.__name__}"
44
+ path_args: Dict = self.preprocessor_args.get("default", {})
45
+ path_args.update(self.preprocessor_args.get(func.__name__, {}))
46
+ path_args.update(self.preprocessor_args.get(func_path, {}))
47
+ return path_args
48
+
49
+ def _get_preprocessor_functions(
50
+ self, func_list: str | List[Callable]
51
+ ) -> List[Tuple[Callable, Dict]]:
52
+ """
53
+ Get the preprocessor functions based on the provided names.
54
+
55
+ Args:
56
+ func_list (Union[str, List[Callable]]): A string of comma-separated function names or a
57
+ list of callable preprocessor functions.
58
+
59
+ Returns:
60
+ List[callable]: A list of preprocessor functions.
61
+ """
62
+ preprocessors: List[Tuple[Callable, Dict]] = []
63
+ if isinstance(func_list, str):
64
+ func_paths = [f.strip() for f in func_list.split(",")]
65
+ else:
66
+ for f in func_list:
67
+ if not callable(f):
68
+ logger.warning(f"Preprocessing function {f} is not callable. Skipping.")
69
+ else:
70
+ preprocessors.append((f, self._get_preprocessor_args(f)))
71
+ return preprocessors
72
+ for f_path in func_paths:
73
+ f_import = f_path.rsplit(".", 1)
74
+ if len(f_import) == 1:
75
+ # If the function is not a full path, assume it's in the current module
76
+ if func := getattr(sys.modules[__name__], f_import[0], None):
77
+ if callable(func):
78
+ preprocessors.append((func, self._get_preprocessor_args(func)))
79
+ else:
80
+ logger.warning(
81
+ f"Preprocessing function {f_path} is not callable. Skipping."
82
+ )
83
+ else:
84
+ logger.warning(
85
+ f"Preprocessing function {f_path} not found in current module. Skipping."
86
+ )
87
+ elif len(f_import) == 2:
88
+ # If the function is a full path, import it
89
+ module_path, func_name = f_import
90
+ try:
91
+ module = importlib.import_module(module_path)
92
+ func = getattr(module, func_name)
93
+ preprocessors.append((func, self._get_preprocessor_args(func)))
94
+ except ImportError as e:
95
+ logger.warning(
96
+ f"Error importing preprocessing function {f_path}: {e}. Skipping."
97
+ )
98
+ return preprocessors
99
+
100
+ def do_work(self, record: Record) -> Record:
101
+ """
102
+ Preprocess the MARC record.
103
+ """
104
+ for proc, kwargs in self.preprocessors:
105
+ record = proc(record, **kwargs)
106
+ return record
107
+
108
+
109
def prepend_prefix_001(record: Record, prefix: str) -> Record:
    """
    Put a parenthesised prefix in front of the record's 001 value.

    If the 001 field holds no data, its data becomes just "(prefix)". If the
    record has no 001 field, a warning is logged and nothing is changed.

    Args:
        record (Record): The MARC record to preprocess.
        prefix (str): The text to wrap in parentheses and prepend.

    Returns:
        Record: The same record, modified in place.
    """
    if "001" not in record:
        logger.warning("Field '001' not found in record. Skipping prefix prepend.")
        return record

    control_number = record["001"]
    marker = f"({prefix})"
    if control_number.data:
        control_number.data = marker + control_number.data
    else:
        control_number.data = marker
    return record
127
+
128
+
129
def prepend_ppn_prefix_001(record: Record, **kwargs) -> Record:
    """
    Add "(PPN)" in front of the record's 001 value.

    Delegates to prepend_prefix_001. Intended for records coming from the
    ABES SUDOC catalog. Extra keyword arguments are accepted and ignored.

    Args:
        record (Record): The MARC record to preprocess.

    Returns:
        Record: The preprocessed MARC record.
    """
    ppn_prefix = "PPN"
    return prepend_prefix_001(record, prefix=ppn_prefix)
141
+
142
+
143
def prepend_abes_prefix_001(record: Record, **kwargs) -> Record:
    """
    Add "(ABES)" in front of the record's 001 value.

    Delegates to prepend_prefix_001. Intended for records coming from the
    ABES SUDOC catalog. Extra keyword arguments are accepted and ignored.

    Args:
        record (Record): The MARC record to preprocess.

    Returns:
        Record: The preprocessed MARC record.
    """
    abes_prefix = "ABES"
    return prepend_prefix_001(record, prefix=abes_prefix)
155
+
156
+
157
def strip_999_ff_fields(record: Record, **kwargs) -> Record:
    """
    Remove every 999 field whose two indicators are both "f".

    Useful when importing records exported from another FOLIO system.
    Extra keyword arguments are accepted and ignored.

    Args:
        record (Record): The MARC record to preprocess.

    Returns:
        Record: The same record with the 999 ff fields removed.
    """
    ff_indicators = pymarc.Indicators("f", "f")
    for candidate in record.get_fields("999"):
        if candidate.indicators != ff_indicators:
            continue
        record.remove_field(candidate)
    return record
172
+
173
+
174
def clean_999_fields(record: Record, **kwargs) -> Record:
    """
    Clear all 999 fields out of the record, preserving non-ff ones as 945s.

    999 fields, with or without ff indicators, can cause issues with data
    import mapping in FOLIO. The ff-indicator 999s are dropped via
    strip_999_ff_fields; every remaining 999 is re-created as a 945 with the
    same indicators and subfields, and the 999 itself is removed.

    Args:
        record (Record): The MARC record to preprocess.

    Returns:
        Record: The preprocessed MARC record.
    """
    record = strip_999_ff_fields(record)
    leftover_999s = record.get_fields("999")
    for old_field in leftover_999s:
        moved = pymarc.Field(
            tag="945",
            indicators=old_field.indicators,
            subfields=old_field.subfields,
        )
        record.add_ordered_field(moved)
        record.remove_field(old_field)
    return record
197
+
198
+
199
def clean_non_ff_999_fields(record: Record, **kwargs) -> Record:
    """
    Move every 999 field that does not have ff indicators to a 945 field.

    When loading migrated MARC records from folio_migration_tools, 999 fields
    other than those set by the migration process can cause the record to fail
    to load properly. Each such field is logged at custom log level 26,
    re-created as a 945 with indicators "99" and the same subfields, and then
    removed. 999 fields with ff indicators are left alone.

    Args:
        record (Record): The MARC record to preprocess.

    Returns:
        Record: The preprocessed MARC record.
    """
    ff_indicators = pymarc.Indicators("f", "f")
    for candidate in record.get_fields("999"):
        if candidate.indicators == ff_indicators:
            continue
        logger.log(
            26,
            "DATA ISSUE\t%s\t%s\t%s",
            record["001"].value(),
            "Record contains a 999 field with non-ff indicators: Moving field to a 945 with"
            ' indicators "99"',
            candidate,
        )
        replacement = pymarc.Field(
            tag="945",
            indicators=pymarc.Indicators("9", "9"),
            subfields=candidate.subfields,
        )
        record.add_ordered_field(replacement)
        record.remove_field(candidate)
    return record
224
+
225
+
226
def sudoc_supercede_prep(record: Record, **kwargs) -> Record:
    """
    Preprocesses a record from the ABES SUDOC catalog to copy 035 fields
    with a $9 subfield value of 'sudoc' to 935 fields with a $a subfield
    prefixed with "(ABES)". This is useful when importing newly-merged records
    from the SUDOC catalog when you want the new record to replace the old one
    in FOLIO. This also applies the prepend_abes_prefix_001 function to the
    record, so the 001 gets the same "(ABES)" prefix.

    Args:
        record (Record): The MARC record to preprocess.

    Returns:
        Record: The preprocessed MARC record.
    """
    record = prepend_abes_prefix_001(record)
    for field in record.get_fields("035"):
        # Only 035s that carry both an identifier ($a) and the 'sudoc' marker ($9).
        if "a" in field and "9" in field and field["9"] == "sudoc":
            _935 = pymarc.Field(
                tag="935",
                # Use the Indicators type, as the other preprocessors in this module do.
                indicators=pymarc.Indicators("f", "f"),
                subfields=[pymarc.field.Subfield("a", "(ABES)" + field["a"])],
            )
            record.add_ordered_field(_935)
    return record
250
+
251
+
252
def clean_empty_fields(record: Record, **kwargs) -> Record:
    """
    Remove empty fields and subfields from the record. These can cause
    data import mapping issues in FOLIO. Removals are logged at custom
    log level 26, which is used by folio_migration_tools to populate the
    data issues report.

    Only the tags listed in MAPPED_FIELDS are examined. For each such field:

    * a field with no subfields is removed;
    * a field whose only subfield is blank once '.', ',' and '-' are stripped
      is removed;
    * otherwise a whitespace-only first $a (when other subfields exist) and any
      listed subfield with an empty value are removed, and the field itself is
      removed if nothing is left.

    Args:
        record (Record): The MARC record to preprocess.

    Returns:
        Record: The preprocessed MARC record.
    """
    # 130/630/730/830 all list the same subfield codes.
    UNIFORM_TITLE_SUBFIELDS = [
        "a", "d", "f", "k", "l", "m", "n", "o", "p", "r", "s", "t", "x", "y", "z",
    ]
    MAPPED_FIELDS = {
        "010": ["a", "z"],
        "020": ["a", "y", "z"],
        "035": ["a", "z"],
        "040": ["a", "b", "c", "d", "e", "f", "g", "h", "k", "m", "n", "p", "r", "s"],
        "050": ["a", "b"],
        "082": ["a", "b"],
        "100": ["a", "b", "c", "d", "q"],
        "110": ["a", "b", "c"],
        "111": ["a", "c", "d"],
        "130": UNIFORM_TITLE_SUBFIELDS,
        "180": ["x", "y", "z"],
        "210": ["a", "c"],
        "240": ["a", "f", "k", "l", "m", "n", "o", "p", "r", "s", "t", "x", "y", "z"],
        "245": ["a", "b", "c", "f", "g", "h", "k", "n", "p", "s"],
        "246": ["a", "f", "g", "n", "p", "s"],
        "250": ["a", "b"],
        "260": ["a", "b", "c", "e", "f", "g"],
        "300": ["a", "b", "c", "e", "f", "g"],
        "440": ["a", "n", "p", "v", "x", "y", "z"],
        "490": ["a", "v", "x", "y", "z"],
        "500": ["a", "c", "d", "n", "p", "v", "x", "y", "z"],
        "505": ["a", "g", "r", "t", "u"],
        "520": ["a", "b", "c", "u"],
        "600": ["a", "b", "c", "d", "q", "t", "v", "x", "y", "z"],
        "610": ["a", "b", "c", "d", "t", "v", "x", "y", "z"],
        "611": ["a", "c", "d", "t", "v", "x", "y", "z"],
        "630": UNIFORM_TITLE_SUBFIELDS,
        "650": ["a", "d", "v", "x", "y", "z"],
        "651": ["a", "v", "x", "y", "z"],
        "655": ["a", "v", "x", "y", "z"],
        "700": ["a", "b", "c", "d", "q", "t", "v", "x", "y", "z"],
        "710": ["a", "b", "c", "d", "t", "v", "x", "y", "z"],
        "711": ["a", "c", "d", "t", "v", "x", "y", "z"],
        "730": UNIFORM_TITLE_SUBFIELDS,
        "740": ["a", "n", "p", "v", "x", "y", "z"],
        "800": ["a", "b", "c", "d", "q", "t", "v", "x", "y", "z"],
        "810": ["a", "b", "c", "d", "t", "v", "x", "y", "z"],
        "811": ["a", "c", "d", "t", "v", "x", "y", "z"],
        "830": UNIFORM_TITLE_SUBFIELDS,
        "856": ["u", "y", "z"],
    }

    def _report(field, message: str) -> None:
        # stacklevel=2 attributes the log record to clean_empty_fields itself.
        logger.log(
            26,
            "DATA ISSUE\t%s\t%s\t%s",
            record["001"].value(),
            message,
            field,
            stacklevel=2,
        )

    for field in record.get_fields(*MAPPED_FIELDS):
        len_subs = len(field.subfields)

        # Every mapped tag is a data field (010 and up), so no control-field
        # check is needed before looking at subfields.
        if len_subs == 0:
            _report(field, f"{field.tag} is empty, removing field")
            record.remove_field(field)
            continue

        # A lone subfield holding only punctuation/whitespace counts as empty.
        if len_subs == 1 and not re.sub(r"[.,-]", "", field.subfields[0].value).strip():
            _report(
                field,
                f"{field.tag}${field.subfields[0].code} is empty,"
                " no other subfields present, removing field",
            )
            record.remove_field(field)
            continue

        if len_subs > 1 and "a" in field and not field["a"].strip():
            _report(field, f"{field.tag}$a is empty, removing subfield")
            # field["a"] is the first $a, which is also the one delete_subfield drops.
            field.delete_subfield("a")

        mapped_codes = MAPPED_FIELDS.get(field.tag, [])
        # Iterate over a snapshot because subfields are removed along the way.
        for idx, subfield in enumerate(list(field.subfields), start=1):
            if subfield.code in mapped_codes and not subfield.value:
                _report(
                    field,
                    f"{field.tag}${subfield.code} ({ordinal(idx)} subfield) is empty, but "
                    "other subfields have values, removing subfield",
                )
                # Remove this empty subfield itself. delete_subfield(code) would
                # drop the FIRST subfield with that code, which may be a populated
                # one when the code repeats.
                field.subfields.remove(subfield)

        if len(field.subfields) == 0:
            _report(
                field,
                f"{field.tag} has no non-empty subfields after cleaning, removing field",
            )
            record.remove_field(field)
    return record
427
+
428
+
429
def fix_bib_leader(record: Record, **kwargs) -> Record:
    """
    Repair invalid status and type codes in the record leader.

    Leader position 5 (record status) is set to 'c' (modified record) when it
    is not one of the accepted status codes, and leader position 6 (type of
    record) is set to 'a' (language material) when it is not one of the
    accepted type codes. Valid values are left as they are. Each replacement
    is logged at custom log level 26.

    Args:
        record (Record): The MARC record to preprocess.

    Returns:
        Record: The preprocessed MARC record.
    """
    allowed_statuses = ("a", "c", "d", "n", "p")
    allowed_types = ("a", "c", "d", "e", "f", "g", "i", "j", "k", "m", "o", "p", "r", "t")

    status = record.leader[5]
    if status not in allowed_statuses:
        logger.log(
            26,
            "DATA ISSUE\t%s\t%s\t%s",
            record["001"].value(),
            f"Invalid record status: {status}, setting to 'c'",
            record.leader,
        )
        head, tail = record.leader[:5], record.leader[6:]
        record.leader = pymarc.Leader(head + "c" + tail)

    # Read the type only after the status fix so the rebuilt leader is used.
    record_type = record.leader[6]
    if record_type not in allowed_types:
        logger.log(
            26,
            "DATA ISSUE\t%s\t%s\t%s",
            record["001"].value(),
            f"Invalid record type: {record_type}, setting to 'a'",
            record.leader,
        )
        head, tail = record.leader[:6], record.leader[7:]
        record.leader = pymarc.Leader(head + "a" + tail)

    return record
461
+
462
+
463
def move_authority_subfield_9_to_0_all_controllable_fields(record: Record, **kwargs) -> Record:
    """
    Re-code every $9 as $0 in the fields listed below.

    For each $9 value found in a listed field, a $0 with the same value is
    added, one $9 is deleted, and the change is logged at custom log level 26.
    This is useful when importing records from the ABES SUDOC catalog.

    Args:
        record (Record): The MARC record to preprocess.

    Returns:
        Record: The preprocessed MARC record.
    """
    controlled_tags = (
        "100", "110", "111", "130",
        "600", "610", "611", "630", "650", "651", "655",
        "700", "710", "711", "730",
        "800", "810", "811", "830",
        "880",
    )
    for target in record.get_fields(*controlled_tags):
        # get_subfields returns the values up front, so deleting inside the loop is safe.
        for authority_id in target.get_subfields("9"):
            target.add_subfield("0", authority_id)
            target.delete_subfield("9")
            logger.log(
                26,
                "DATA ISSUE\t%s\t%s\t%s",
                record["001"].value(),
                f"Subfield 9 moved to subfield 0 in {target.tag}",
                target,
            )
    return record
509
+
510
+
511
def ordinal(n: int) -> str:
    """
    Return *n* followed by its English ordinal suffix (1st, 2nd, 3rd, 4th, ...).

    Numbers whose last two digits are 11, 12 or 13 take "th".

    Args:
        n (int): The number to format.

    Returns:
        str: The number with its ordinal suffix appended.
    """
    last_two = n % 100
    if 11 <= last_two <= 13:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(last_two % 10, "th")
    return f"{n}{suffix}"