folio-data-import 0.2.8rc11__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- folio_data_import/MARCDataImport.py +240 -153
- folio_data_import/UserImport.py +11 -11
- folio_data_import/custom_exceptions.py +17 -0
- folio_data_import/marc_preprocessors/_preprocessors.py +218 -67
- {folio_data_import-0.2.8rc11.dist-info → folio_data_import-0.3.0.dist-info}/METADATA +2 -3
- folio_data_import-0.3.0.dist-info/RECORD +12 -0
- {folio_data_import-0.2.8rc11.dist-info → folio_data_import-0.3.0.dist-info}/WHEEL +1 -1
- folio_data_import-0.2.8rc11.dist-info/RECORD +0 -11
- {folio_data_import-0.2.8rc11.dist-info → folio_data_import-0.3.0.dist-info}/LICENSE +0 -0
- {folio_data_import-0.2.8rc11.dist-info → folio_data_import-0.3.0.dist-info}/entry_points.txt +0 -0
folio_data_import/MARCDataImport.py CHANGED

@@ -2,9 +2,10 @@ import argparse
 import asyncio
 import datetime
 import glob
-import importlib
 import io
+import json
 import logging
+import math
 import os
 import sys
 import uuid
@@ -14,7 +15,7 @@ from functools import cached_property
 from getpass import getpass
 from pathlib import Path
 from time import sleep
-from typing import List
+from typing import Any, BinaryIO, Callable, Dict, List, Union
 
 import folioclient
 import httpx
@@ -24,6 +25,9 @@ import tabulate
 from humps import decamelize
 from tqdm import tqdm
 
+from folio_data_import.custom_exceptions import FolioDataImportBatchError
+from folio_data_import.marc_preprocessors._preprocessors import MARCPreprocessor
+
 try:
     datetime_utc = datetime.UTC
 except AttributeError:
@@ -62,7 +66,6 @@ class MARCImportJob:
         import_profile_name (str): The name of the data import job profile to use.
         batch_size (int): The number of source records to include in a record batch (default=10).
         batch_delay (float): The number of seconds to wait between record batches (default=0).
-        consolidate (bool): Consolidate files into a single job. Default is one job for each file.
         no_progress (bool): Disable progress bars (eg. for running in a CI environment).
     """
 
@@ -74,10 +77,14 @@ class MARCImportJob:
     http_client: httpx.Client
     current_file: List[Path]
     record_batch: List[dict] = []
-    error_records: int = 0
     last_current: int = 0
     total_records_sent: int = 0
    finished: bool = False
+    job_id: str = ""
+    job_hrid: int = 0
+    current_file: Union[List[Path], List[io.BytesIO]] = []
+    _max_summary_retries: int = 2
+    _summary_retries: int = 0
 
    def __init__(
        self,
@@ -86,12 +93,17 @@ class MARCImportJob:
         import_profile_name: str,
         batch_size=10,
         batch_delay=0,
-        marc_record_preprocessor=
-
+        marc_record_preprocessor: Union[List[Callable], str] = [],
+        preprocessor_args: Dict[str, Dict] = {},
         no_progress=False,
         let_summary_fail=False,
+        split_files=False,
+        split_size=1000,
+        split_offset=0,
     ) -> None:
-        self.
+        self.split_files = split_files
+        self.split_size = split_size
+        self.split_offset = split_offset
         self.no_progress = no_progress
         self.let_summary_fail = let_summary_fail
         self.folio_client: folioclient.FolioClient = folio_client
@@ -100,20 +112,14 @@ class MARCImportJob:
         self.batch_size = batch_size
         self.batch_delay = batch_delay
         self.current_retry_timeout = None
-        self.marc_record_preprocessor = marc_record_preprocessor
-        self.pbar_sent: tqdm
-        self.pbar_imported: tqdm
-        self._max_summary_retries: int = 2
-        self._summary_retries: int = 0
+        self.marc_record_preprocessor: MARCPreprocessor = MARCPreprocessor(marc_record_preprocessor, **preprocessor_args)
 
     async def do_work(self) -> None:
         """
         Performs the necessary work for data import.
 
         This method initializes an HTTP client, files to store records that fail to send,
-        and calls
-        it imports all the files specified in `import_files` as a single batch. Otherwise,
-        it imports each file as a separate import job.
+        and calls the appropriate method to import MARC files based on the configuration.
 
         Returns:
             None
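
Note: the constructor now builds a MARCPreprocessor from `marc_record_preprocessor` and `preprocessor_args`, and adds `split_files`, `split_size`, and `split_offset`. A minimal usage sketch follows; the leading positional arguments (client, files, profile name) are not shown in this hunk, so their order here is an assumption, and the connection values and profile name are placeholders.

    import asyncio
    from pathlib import Path

    import folioclient

    from folio_data_import.MARCDataImport import MARCImportJob

    async def run() -> None:
        # Placeholder gateway/tenant/credentials.
        folio_client = folioclient.FolioClient(
            "https://folio-gateway.example.edu", "tenant", "import_user", "s3cret"
        )
        job = MARCImportJob(
            folio_client,                      # assumed leading parameters; the diff
            [Path("exports/records.mrc")],     # only shows the keyword arguments below
            "Example - Create MARC Bibs",      # placeholder job profile name
            batch_size=10,
            batch_delay=0,
            marc_record_preprocessor="prepend_prefix_001,clean_999_fields",
            preprocessor_args={"prepend_prefix_001": {"prefix": "LOCAL"}},  # example prefix
            no_progress=True,
            split_files=True,    # one import job per 1000-record chunk
            split_size=1000,
            split_offset=0,
        )
        await job.do_work()

    asyncio.run(run())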
@@ -138,15 +144,33 @@ class MARCImportJob:
             self.failed_batches_file = failed_batches
             logger.info(f"Writing failed batches to {self.failed_batches_file.name}")
             self.http_client = http_client
-            if self.
-
-                await self.import_marc_file()
+            if self.split_files:
+                await self.process_split_files()
             else:
                 for file in self.import_files:
                     self.current_file = [file]
                     await self.import_marc_file()
             await self.wrap_up()
 
+    async def process_split_files(self):
+        """
+        Process the import of files in smaller batches.
+        This method is called when `split_files` is set to True.
+        It splits each file into smaller chunks and processes them one by one.
+        """
+        for file in self.import_files:
+            with open(file, "rb") as f:
+                file_length = await self.read_total_records([f])
+            expected_batches = math.ceil(file_length / self.split_size)
+            logger.info(f"{file.name} contains {file_length} records. Splitting into {expected_batches} {self.split_size} record batches.")
+            zero_pad_parts = len(str(expected_batches)) if expected_batches > 1 else 2
+            for idx, batch in enumerate(self.split_marc_file(file, self.split_size), start=1):
+                if idx > self.split_offset:
+                    batch.name = f"{file.name} (Part {idx:0{zero_pad_parts}})"
+                    self.current_file = [batch]
+                    await self.import_marc_file()
+            self.move_file_to_complete(file)
+
     async def wrap_up(self) -> None:
         """
         Wraps up the data import process.
@@ -194,7 +218,7 @@ class MARCImportJob:
             )
             self.current_retry_timeout = None
         except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
-            if not hasattr(e, "response") or e.response.status_code in [502, 504]:
+            if not hasattr(e, "response") or e.response.status_code in [502, 504, 401]:
                 error_text = e.response.text if hasattr(e, "response") else str(e)
                 logger.warning(f"SERVER ERROR fetching job status: {error_text}. Retrying.")
                 sleep(0.25)
@@ -256,7 +280,7 @@ class MARCImportJob:
         """
         try:
             create_job = self.http_client.post(
-                self.folio_client.
+                self.folio_client.gateway_url + "/change-manager/jobExecutions",
                 headers=self.folio_client.okapi_headers,
                 json={"sourceType": "ONLINE", "userId": self.folio_client.current_user},
             )
@@ -275,7 +299,7 @@ class MARCImportJob:
             )
             raise e
         self.job_id = create_job.json()["parentJobExecutionId"]
-        logger.info("Created job:
+        logger.info(f"Created job: {self.job_id}")
 
     @cached_property
     def import_profile(self) -> dict:
@@ -305,7 +329,7 @@ class MARCImportJob:
             The response from the HTTP request to set the job profile.
         """
         set_job_profile = self.http_client.put(
-            self.folio_client.
+            self.folio_client.gateway_url
             + "/change-manager/jobExecutions/"
             + self.job_id
             + "/jobProfile",
@@ -318,6 +342,8 @@ class MARCImportJob:
         )
         try:
             set_job_profile.raise_for_status()
+            self.job_hrid = set_job_profile.json()['hrId']
+            logger.info(f"Job HRID: {self.job_hrid}")
         except httpx.HTTPError as e:
             logger.error(
                 "Error creating job: "
@@ -328,7 +354,7 @@ class MARCImportJob:
             raise e
 
     @staticmethod
-    async def read_total_records(files) -> int:
+    async def read_total_records(files: List[BinaryIO]) -> int:
        """
        Reads the total number of records from the given files.
 
@@ -357,17 +383,15 @@ class MARCImportJob:
         """
         try:
             post_batch = self.http_client.post(
-                self.folio_client.
+                self.folio_client.gateway_url
                 + f"/change-manager/jobExecutions/{self.job_id}/records",
                 headers=self.folio_client.okapi_headers,
                 json=batch_payload,
             )
-            # if batch_payload["recordsMetadata"]["last"]:
-            #     logger.log(
-            #         25,
-            #         f"Sending last batch of {batch_payload['recordsMetadata']['total']} records.",
-            #     )
         except (httpx.ConnectTimeout, httpx.ReadTimeout):
+            logger.warning(
+                f"CONNECTION ERROR posting batch {batch_payload['id']}. Retrying..."
+            )
             sleep(0.25)
             return await self.process_record_batch(batch_payload)
         try:
@@ -375,20 +399,21 @@ class MARCImportJob:
             self.total_records_sent += len(self.record_batch)
             self.record_batch = []
             self.pbar_sent.update(len(batch_payload["initialRecords"]))
-        except
+        except httpx.HTTPStatusError as e:
             if (
-
-            ):  # TODO:
+                e.response.status_code in [500, 400, 422]
+            ):  # TODO: Update once we no longer have to support < Sunflower to just be 400
                 self.total_records_sent += len(self.record_batch)
                 self.record_batch = []
                 self.pbar_sent.update(len(batch_payload["initialRecords"]))
             else:
-                logger.error("Error posting batch: " + str(e))
                 for record in self.record_batch:
                     self.failed_batches_file.write(record)
-
-
-
+                raise FolioDataImportBatchError(
+                    batch_payload['id'],
+                    f"{e}\n{e.response.text}",
+                    e
+                )
         await self.get_job_status()
         sleep(self.batch_delay)
@@ -417,16 +442,12 @@ class MARCImportJob:
                     await self.create_batch_payload(
                         counter,
                         total_records,
-
-                        == (total_records - self.error_records),
+                        counter == total_records,
                     ),
                 )
                 sleep(0.25)
             if record:
-
-                record = await self.apply_marc_record_preprocessing(
-                    record, self.marc_record_preprocessor
-                )
+                record = self.marc_record_preprocessor.do_work(record)
                 self.record_batch.append(record.as_marc())
                 counter += 1
             else:
@@ -437,75 +458,26 @@ class MARCImportJob:
                     "",
                 )
                 self.bad_records_file.write(reader.current_chunk)
-        if self.
-
-
-
-
-
-
-
-            )
-        import_complete_path = file_path.parent.joinpath("import_complete")
-        if not import_complete_path.exists():
-            logger.debug(f"Creating import_complete directory: {import_complete_path.absolute()}")
-            import_complete_path.mkdir(exist_ok=True)
-        logger.debug(f"Moving {file_path} to {import_complete_path.absolute()}")
-        file_path.rename(
-            file_path.parent.joinpath("import_complete", file_path.name)
-        )
-
-    @staticmethod
-    async def apply_marc_record_preprocessing(
-        record: pymarc.Record, func_or_path
-    ) -> pymarc.Record:
-        """
-        Apply preprocessing to the MARC record before sending it to FOLIO.
-
-        Args:
-            record (pymarc.Record): The MARC record to preprocess.
-            func_or_path (Union[Callable, str]): The preprocessing function or its import path.
-
-        Returns:
-            pymarc.Record: The preprocessed MARC record.
-        """
-        if isinstance(func_or_path, str):
-            func_paths = func_or_path.split(",")
-            for func_path in func_paths:
-                record = await MARCImportJob._apply_single_marc_record_preprocessing_by_path(
-                    record, func_path
-                )
-        elif callable(func_or_path):
-            record = func_or_path(record)
-        else:
-            logger.warning(
-                f"Invalid preprocessing function: {func_or_path}. Skipping preprocessing."
+        if not self.split_files:
+            self.move_file_to_complete(file_path)
+        if self.record_batch or not self.finished:
+            await self.process_record_batch(
+                await self.create_batch_payload(
+                    counter,
+                    total_records,
+                    counter == total_records,
+                ),
             )
-        return record
-
-
-
-
-
-
-
-
-
-            func_path (str): The path to the preprocessing function.
-
-        Returns:
-            pymarc.Record: The preprocessed MARC record.
-        """
-        try:
-            module_path, func_name = func_path.rsplit(".", 1)
-            module = importlib.import_module(module_path)
-            func = getattr(module, func_name)
-            record = func(record)
-        except Exception as e:
-            logger.warning(
-                f"Error applying preprocessing function {func_path}: {e}. Skipping."
+    def move_file_to_complete(self, file_path: Path):
+        import_complete_path = file_path.parent.joinpath("import_complete")
+        if not import_complete_path.exists():
+            logger.debug(f"Creating import_complete directory: {import_complete_path.absolute()}")
+            import_complete_path.mkdir(exist_ok=True)
+        logger.debug(f"Moving {file_path} to {import_complete_path.absolute()}")
+        file_path.rename(
+            file_path.parent.joinpath("import_complete", file_path.name)
         )
-        return record
 
     async def create_batch_payload(self, counter, total_records, is_last) -> dict:
         """
@@ -523,13 +495,53 @@ class MARCImportJob:
             "id": str(uuid.uuid4()),
             "recordsMetadata": {
                 "last": is_last,
-                "counter": counter
+                "counter": counter,
                 "contentType": "MARC_RAW",
-                "total": total_records
+                "total": total_records,
             },
             "initialRecords": [{"record": x.decode()} for x in self.record_batch],
         }
 
+    @staticmethod
+    def split_marc_file(file_path, batch_size):
+        """Generator to iterate over MARC records in batches, yielding BytesIO objects."""
+        with open(file_path, "rb") as f:
+            batch = io.BytesIO()
+            count = 0
+
+            while True:
+                leader = f.read(24)
+                if not leader:
+                    break  # End of file
+
+                try:
+                    record_length = int(leader[:5])  # Extract record length from leader
+                except ValueError:
+                    raise ValueError("Invalid MARC record length encountered.")
+
+                record_body = f.read(record_length - 24)
+                if len(record_body) != record_length - 24:
+                    raise ValueError("Unexpected end of file while reading MARC record.")
+
+                # Verify record terminator
+                if record_body[-1:] != b'\x1D':
+                    raise ValueError("MARC record does not end with the expected terminator (0x1D).")
+
+                # Write the full record to the batch buffer
+                batch.write(leader + record_body)
+                count += 1
+
+                if count >= batch_size:
+                    batch.seek(0)
+                    yield batch
+                    batch = io.BytesIO()  # Reset buffer
+                    count = 0
+
+            # Yield any remaining records
+            if count > 0:
+                batch.seek(0)
+                yield batch
+
     async def import_marc_file(self) -> None:
         """
         Imports MARC file into the system.
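
The `split_marc_file` generator leans on the MARC 21 binary layout: the first five bytes of each 24-byte leader hold the total record length as zero-padded ASCII digits, and every record ends with the 0x1D record terminator, so a file can be chunked without parsing any field data. A standalone sketch of the same walk (independent of the class, for illustration only):

    import io

    def iter_marc_record_lengths(raw: bytes):
        """Yield the declared length of each record in a raw MARC 21 byte stream."""
        buf = io.BytesIO(raw)
        while True:
            leader = buf.read(24)
            if not leader:
                break                                # end of stream
            length = int(leader[:5])                 # e.g. b"00714..." -> 714 bytes total
            body = buf.read(length - 24)             # remainder, including the terminator
            if not body.endswith(b"\x1d"):
                raise ValueError("record does not end with the MARC terminator (0x1D)")
            yield length

With `--split-size 1000`, each yielded BytesIO therefore holds 1000 whole records (plus a shorter final chunk), and each chunk becomes its own FOLIO import job named "file (Part NN)".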
@@ -551,13 +563,24 @@ class MARCImportJob:
         await self.create_folio_import_job()
         await self.set_job_profile()
         with ExitStack() as stack:
-
-
-
+            try:
+                if isinstance(self.current_file[0], Path):
+                    files = [
+                        stack.enter_context(open(file, "rb")) for file in self.current_file
+                    ]
+                elif isinstance(self.current_file[0], io.BytesIO):
+                    files = [
+                        stack.enter_context(file) for file in self.current_file
+                    ]
+                else:
+                    raise ValueError("Invalid file type. Must be Path or BytesIO.")
+            except IndexError as e:
+                logger.error(f"Error opening file: {e}")
+                raise e
             total_records = await self.read_total_records(files)
             with (
                 tqdm(
-                    desc="Imported: ",
+                    desc=f"Imported ({self.job_hrid}): ",
                     total=total_records,
                     position=1,
                     disable=self.no_progress,
@@ -569,48 +592,81 @@ class MARCImportJob:
                     disable=self.no_progress,
                 ) as pbar_sent,
             ):
-
-
-
-
-
+                try:
+                    self.pbar_sent = pbar_sent
+                    self.pbar_imported = pbar_imported
+                    await self.process_records(files, total_records)
+                    while not self.finished:
+                        await self.get_job_status()
+                        sleep(1)
+                except FolioDataImportBatchError as e:
+                    logger.error(
+                        f"Unhandled error posting batch {e.batch_id}: {e.message}"
+                    )
+                    await self.cancel_job()
+                    raise e
             if self.finished:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                await self.log_job_summary()
+                self.last_current = 0
+                self.finished = False
+
+    async def cancel_job(self) -> None:
+        """
+        Cancels the current job execution.
+
+        This method sends a request to cancel the job execution and logs the result.
+
+        Returns:
+            None
+        """
+        try:
+            cancel = self.http_client.delete(
+                self.folio_client.gateway_url
+                + f"/change-manager/jobExecutions/{self.job_id}/records",
+                headers=self.folio_client.okapi_headers,
+            )
+            cancel.raise_for_status()
+            self.finished = True
+            logger.info(f"Cancelled job: {self.job_id}")
+        except (httpx.ConnectTimeout, httpx.ReadTimeout):
+            logger.warning(f"CONNECTION ERROR cancelling job {self.job_id}. Retrying...")
+            sleep(0.25)
+            await self.cancel_job()
+
+    async def log_job_summary(self):
+        if job_summary := await self.get_job_summary():
+            job_id = job_summary.pop("jobExecutionId", None)
+            total_errors = job_summary.pop("totalErrors", 0)
+            columns = ["Summary"] + list(job_summary.keys())
+            rows = set()
+            for key in columns[1:]:
+                rows.update(job_summary[key].keys())
+
+            table_data = []
+            for row in rows:
+                metric_name = decamelize(row).split("_")[1]
+                table_row = [metric_name]
+                for col in columns[1:]:
+                    table_row.append(job_summary[col].get(row, "N/A"))
+                table_data.append(table_row)
+            table_data.sort(key=lambda x: REPORT_SUMMARY_ORDERING.get(x[0], 99))
+            columns = columns[:1] + [
                 " ".join(decamelize(x).split("_")[:-1]) for x in columns[1:]
             ]
-
+            logger.info(
                 f"Results for {'file' if len(self.current_file) == 1 else 'files'}: "
                 f"{', '.join([os.path.basename(x.name) for x in self.current_file])}"
             )
-
+            logger.info(
                 "\n"
                 + tabulate.tabulate(
                     table_data, headers=columns, tablefmt="fancy_grid"
                 ),
             )
-
-
-
-
-            self.last_current = 0
-            self.finished = False
+            if total_errors:
+                logger.info(f"Total errors: {total_errors}. Job ID: {job_id}.")
+        else:
+            logger.error(f"No job summary available for job #{self.job_hrid}({self.job_id}).")
 
     async def get_job_summary(self) -> dict:
         """
@@ -749,19 +805,31 @@ async def main() -> None:
         "--preprocessor",
         type=str,
         help=(
-            "
-            "to apply to each MARC record before sending to FOLIO."
+            "Comma-separated python import paths to Python function(s) "
+            "to apply to each MARC record before sending to FOLIO. Function should take "
+            "a pymarc.Record object as input and return a pymarc.Record object."
         ),
         default=None,
     )
+
     parser.add_argument(
-        "--
+        "--split-files",
         action="store_true",
-        help=
-        "Consolidate records into a single job. "
-        "Default is to create a new job for each MARC file."
-        ),
+        help="Split files into smaller parts before importing.",
     )
+    parser.add_argument(
+        "--split-size",
+        type=int,
+        help="The number of records to include in each split file.",
+        default=1000,
+    )
+    parser.add_argument(
+        "--split-offset",
+        type=int,
+        help="The number of record batches of <split-size> to skip before starting import.",
+        default=0,
+    )
+
     parser.add_argument(
         "--no-progress",
         action="store_true",
@@ -772,6 +840,16 @@ async def main() -> None:
         action="store_true",
         help="Do not retry fetching the final job summary if it fails",
     )
+    parser.add_argument(
+        "--preprocessor-config",
+        type=str,
+        help=(
+            "JSON file containing configuration for preprocessor functions. "
+            "This is passed to MARCPreprocessor class as a dict of dicts."
+        ),
+        default=None,
+    )
+
     args = parser.parse_args()
     if not args.password:
         args.password = getpass("Enter FOLIO password: ")
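
The new `--preprocessor-config` option points at a JSON file whose top-level keys are preprocessor names (bare function names, full `module.path.func` paths, or `"default"`) and whose values are keyword arguments for those functions; the parsed dict is handed to `MARCPreprocessor` as `preprocessor_args`. A plausible config written from Python (the `prefix` value is only an example):

    import json

    # Keys may be "default", a bare function name, or a full import path.
    preprocessor_config = {
        "default": {},
        "prepend_prefix_001": {"prefix": "LOCAL"},   # illustrative prefix value
    }

    with open("preprocessor_config.json", "w") as f:
        json.dump(preprocessor_config, f, indent=2)

The file is then passed alongside the preprocessor list, e.g. `--preprocessor prepend_prefix_001 --preprocessor-config preprocessor_config.json`.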
@@ -796,6 +874,12 @@ async def main() -> None:
     else:
         logger.info(marc_files)
 
+    if args.preprocessor_config:
+        with open(args.preprocessor_config, "r") as f:
+            preprocessor_args = json.load(f)
+    else:
+        preprocessor_args = {}
+
     if not args.import_profile_name:
         import_profiles = folio_client.folio_get(
             "/data-import-profiles/jobProfiles",
@@ -824,9 +908,12 @@ async def main() -> None:
             batch_size=args.batch_size,
             batch_delay=args.batch_delay,
             marc_record_preprocessor=args.preprocessor,
-
+            preprocessor_args=preprocessor_args,
             no_progress=bool(args.no_progress),
             let_summary_fail=bool(args.let_summary_fail),
+            split_files=bool(args.split_files),
+            split_size=args.split_size,
+            split_offset=args.split_offset,
         ).do_work()
     except Exception as e:
         logger.error("Error importing files: " + str(e))
folio_data_import/UserImport.py CHANGED

@@ -137,7 +137,7 @@ class UserImporter:  # noqa: R0902
         match_key = "id" if ("id" in user_obj) else self.match_key
         try:
             existing_user = await self.http_client.get(
-                self.folio_client.
+                self.folio_client.gateway_url + "/users",
                 headers=self.folio_client.okapi_headers,
                 params={"query": f"{match_key}=={user_obj[match_key]}"},
             )
@@ -161,7 +161,7 @@ class UserImporter:  # noqa: R0902
         """
         try:
             existing_rp = await self.http_client.get(
-                self.folio_client.
+                self.folio_client.gateway_url
                 + "/request-preference-storage/request-preference",
                 headers=self.folio_client.okapi_headers,
                 params={
@@ -188,7 +188,7 @@ class UserImporter:  # noqa: R0902
         """
         try:
             existing_pu = await self.http_client.get(
-                self.folio_client.
+                self.folio_client.gateway_url + "/perms/users",
                 headers=self.folio_client.okapi_headers,
                 params={
                     "query": f"userId=={existing_user.get('id', user_obj.get('id', ''))}"
@@ -369,7 +369,7 @@ class UserImporter:  # noqa: R0902
             else:
                 existing_user[key] = value
         create_update_user = await self.http_client.put(
-            self.folio_client.
+            self.folio_client.gateway_url + f"/users/{existing_user['id']}",
             headers=self.folio_client.okapi_headers,
             json=existing_user,
         )
@@ -389,7 +389,7 @@ class UserImporter:  # noqa: R0902
             HTTPError: If the HTTP request to create the user fails.
         """
         response = await self.http_client.post(
-            self.folio_client.
+            self.folio_client.gateway_url + "/users",
             headers=self.folio_client.okapi_headers,
             json=user_obj,
         )
@@ -589,7 +589,7 @@ class UserImporter:  # noqa: R0902
         rp_obj["userId"] = new_user_obj["id"]
         # print(rp_obj)
         response = await self.http_client.post(
-            self.folio_client.
+            self.folio_client.gateway_url
             + "/request-preference-storage/request-preference",
             headers=self.folio_client.okapi_headers,
             json=rp_obj,
@@ -613,7 +613,7 @@ class UserImporter:  # noqa: R0902
         existing_rp.update(rp_obj)
         # print(existing_rp)
         response = await self.http_client.put(
-            self.folio_client.
+            self.folio_client.gateway_url
             + f"/request-preference-storage/request-preference/{existing_rp['id']}",
             headers=self.folio_client.okapi_headers,
             json=existing_rp,
@@ -635,7 +635,7 @@ class UserImporter:  # noqa: R0902
         """
         perms_user_obj = {"userId": new_user_obj["id"], "permissions": []}
         response = await self.http_client.post(
-            self.folio_client.
+            self.folio_client.gateway_url + "/perms/users",
             headers=self.folio_client.okapi_headers,
             json=perms_user_obj,
         )
@@ -788,7 +788,7 @@ class UserImporter:  # noqa: R0902
         """
         try:
             existing_spu = await self.http_client.get(
-                self.folio_client.
+                self.folio_client.gateway_url + "/service-points-users",
                 headers=self.folio_client.okapi_headers,
                 params={"query": f"userId=={existing_user['id']}"},
             )
@@ -812,7 +812,7 @@ class UserImporter:  # noqa: R0902
         """
         spu_obj["userId"] = existing_user["id"]
         response = await self.http_client.post(
-            self.folio_client.
+            self.folio_client.gateway_url + "/service-points-users",
             headers=self.folio_client.okapi_headers,
             json=spu_obj,
         )
@@ -831,7 +831,7 @@ class UserImporter:  # noqa: R0902
         """
         existing_spu.update(spu_obj)
         response = await self.http_client.put(
-            self.folio_client.
+            self.folio_client.gateway_url + f"/service-points-users/{existing_spu['id']}",
            headers=self.folio_client.okapi_headers,
            json=existing_spu,
        )
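
Every UserImport.py change above is the same mechanical substitution: request URLs are now built from `FolioClient.gateway_url` (the removed lines are truncated in this rendering). A hedged sketch of the pattern, with placeholder connection values and query:

    import folioclient
    import httpx

    folio_client = folioclient.FolioClient(
        "https://folio-gateway.example.edu", "tenant", "user", "password"
    )

    with httpx.Client() as client:
        resp = client.get(
            folio_client.gateway_url + "/users",
            headers=folio_client.okapi_headers,
            params={"query": "username==jdoe"},   # example CQL query
        )
        resp.raise_for_status()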
folio_data_import/custom_exceptions.py ADDED

@@ -0,0 +1,17 @@
+"""Custom exceptions for the Folio Data Import module."""
+
+class FolioDataImportError(Exception):
+    """Base class for all exceptions in the Folio Data Import module."""
+    pass
+
+class FolioDataImportBatchError(FolioDataImportError):
+    """Exception raised for errors in the Folio Data Import batch process.
+
+    Attributes:
+        batch_id -- ID of the batch that caused the error
+        message -- explanation of the error
+    """
+    def __init__(self, batch_id, message, exception=None):
+        self.batch_id = batch_id
+        self.message = message
+        super().__init__(f"Unhandled error posting batch {batch_id}: {message}")
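
With the new exception type, `process_record_batch` raises `FolioDataImportBatchError` for unrecoverable batch failures and `import_marc_file` cancels the job before re-raising, so callers can catch it around `do_work()`. A brief sketch, assuming `job` is an already configured `MARCImportJob`:

    import logging

    from folio_data_import.custom_exceptions import FolioDataImportBatchError

    logger = logging.getLogger(__name__)

    async def run_import(job) -> None:
        # `job` is assumed to be a configured MARCImportJob instance.
        try:
            await job.do_work()
        except FolioDataImportBatchError as exc:
            # The failed records were already written to the job's failed-batches file.
            logger.error("Batch %s failed permanently: %s", exc.batch_id, exc.message)
            raise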
folio_data_import/marc_preprocessors/_preprocessors.py CHANGED

@@ -1,69 +1,168 @@
+import importlib
+import sys
+from typing import Callable, Dict, List, Tuple, Union
 import pymarc
 import logging
 
+from pymarc.record import Record
+
 logger = logging.getLogger("folio_data_import.MARCDataImport")
 
+class MARCPreprocessor:
+    """
+    A class to preprocess MARC records for data import into FOLIO.
+    """
+
+    def __init__(self, preprocessors: Union[str, List[Callable]], **kwargs):
+        """
+        Initialize the MARCPreprocessor with a list of preprocessors.
+
+        Args:
+            preprocessors (Union[str, List[Callable]]): A string of comma-separated function names or a list of callable preprocessor functions to apply.
+        """
+        self.preprocessor_args: Dict[str, Dict] = kwargs
+        self.preprocessors: List[Tuple[Callable, Dict]] = self._get_preprocessor_functions(
+            preprocessors
+        )
+        self.proc_kwargs = kwargs
+        self.record = None
+
+    def _get_preprocessor_args(self, func: Callable) -> Dict:
+        """
+        Get the arguments for the preprocessor function.
+
+        Args:
+            func (Callable): The preprocessor function.
+
+        Returns:
+            Dict: A dictionary of arguments for the preprocessor function.
+        """
+        func_path = f"{func.__module__}.{func.__name__}"
+        path_args: Dict = self.preprocessor_args.get("default", {})
+        path_args.update(self.preprocessor_args.get(func.__name__, {}))
+        path_args.update(self.preprocessor_args.get(func_path, {}))
+        return path_args
+
+    def _get_preprocessor_functions(self, func_list: Union[str, List[Callable]]) -> List[Callable]:
+        """
+        Get the preprocessor functions based on the provided names.
+
+        Returns:
+            List[callable]: A list of preprocessor functions.
+        """
+        preprocessors = []
+        if isinstance(func_list, str):
+            func_list = func_list.split(",")
+        else:
+            for f in func_list:
+                if not callable(f):
+                    logger.warning(
+                        f"Preprocessing function {f} is not callable. Skipping."
+                    )
+                else:
+                    preprocessors.append((f, self._get_preprocessor_args(f)))
+            return preprocessors
+        for f_path in func_list:
+            f_import = f_path.rsplit(".", 1)
+            if len(f_import) == 1:
+                # If the function is not a full path, assume it's in the current module
+                if func := getattr(sys.modules[__name__], f_import[0], None):
+                    if callable(func):
+                        preprocessors.append((func, self._get_preprocessor_args(func)))
+                    else:
+                        logger.warning(
+                            f"Preprocessing function {f_path} is not callable. Skipping."
+                        )
+                else:
+                    logger.warning(
+                        f"Preprocessing function {f_path} not found in current module. Skipping."
+                    )
+            elif len(f_import) == 2:
+                # If the function is a full path, import it
+                module_path, func_name = f_import
+                try:
+                    module = importlib.import_module(module_path)
+                    func = getattr(module, func_name)
+                    preprocessors.append((func, self._get_preprocessor_args(func)))
+                except ImportError as e:
+                    logger.warning(
+                        f"Error importing preprocessing function {f_path}: {e}. Skipping."
+                    )
+        return preprocessors
+
+    def do_work(self, record: Record) -> Record:
+        """
+        Preprocess the MARC record.
+        """
+        for proc, kwargs in self.preprocessors:
+            record = proc(record, **kwargs)
+        return record
 
-
+
+
+def prepend_prefix_001(record: Record, prefix: str) -> Record:
     """
     Prepend a prefix to the record's 001 field.
 
     Args:
-        record (
+        record (Record): The MARC record to preprocess.
         prefix (str): The prefix to prepend to the 001 field.
 
     Returns:
-
+        Record: The preprocessed MARC record.
     """
-
+    if "001" in record:
+        record["001"].data = f"({prefix})" + record["001"].data
+    else:
+        logger.warning("Field '001' not found in record. Skipping prefix prepend.")
     return record
 
 
-def prepend_ppn_prefix_001(record:
+def prepend_ppn_prefix_001(record: Record, **kwargs) -> Record:
     """
     Prepend the PPN prefix to the record's 001 field. Useful when
     importing records from the ABES SUDOC catalog
 
     Args:
-        record (
+        record (Record): The MARC record to preprocess.
 
     Returns:
-
+        Record: The preprocessed MARC record.
     """
     return prepend_prefix_001(record, "PPN")
 
 
-def prepend_abes_prefix_001(record:
+def prepend_abes_prefix_001(record: Record, **kwargs) -> Record:
     """
     Prepend the ABES prefix to the record's 001 field. Useful when
     importing records from the ABES SUDOC catalog
 
     Args:
-        record (
+        record (Record): The MARC record to preprocess.
 
     Returns:
-
+        Record: The preprocessed MARC record.
     """
     return prepend_prefix_001(record, "ABES")
 
 
-def strip_999_ff_fields(record:
+def strip_999_ff_fields(record: Record, **kwargs) -> Record:
     """
     Strip all 999 fields with ff indicators from the record.
     Useful when importing records exported from another FOLIO system
 
     Args:
-        record (
+        record (Record): The MARC record to preprocess.
 
     Returns:
-
+        Record: The preprocessed MARC record.
     """
     for field in record.get_fields("999"):
         if field.indicators == pymarc.Indicators(*["f", "f"]):
             record.remove_field(field)
     return record
 
-def clean_999_fields(record:
+def clean_999_fields(record: Record, **kwargs) -> Record:
     """
     The presence of 999 fields, with or without ff indicators, can cause
     issues with data import mapping in FOLIO. This function calls strip_999_ff_fields
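
A short sketch of driving the new `MARCPreprocessor` directly, mixing a bundled preprocessor with a custom callable (the custom function and the file name are illustrative):

    import pymarc

    from folio_data_import.marc_preprocessors._preprocessors import (
        MARCPreprocessor,
        strip_999_ff_fields,
    )

    def add_local_note(record: pymarc.Record, **kwargs) -> pymarc.Record:
        """Illustrative custom preprocessor: add a 907 note to every record."""
        record.add_ordered_field(
            pymarc.Field(
                tag="907",
                indicators=pymarc.Indicators(" ", " "),
                subfields=[pymarc.Subfield(code="a", value="loaded-by-folio-data-import")],
            )
        )
        return record

    # Callables are used as-is; per-function kwargs are looked up by function name
    # (see _get_preprocessor_args above). No kwargs are supplied here.
    pp = MARCPreprocessor([strip_999_ff_fields, add_local_note])

    with open("records.mrc", "rb") as fh:       # assumed local MARC file
        for record in pymarc.MARCReader(fh):
            record = pp.do_work(record)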
@@ -71,10 +170,10 @@ def clean_999_fields(record: pymarc.Record) -> pymarc.Record:
     to 945 fields.
 
     Args:
-        record (
+        record (Record): The MARC record to preprocess.
 
     Returns:
-
+        Record: The preprocessed MARC record.
     """
     record = strip_999_ff_fields(record)
     for field in record.get_fields("999"):
@@ -87,7 +186,31 @@ def clean_999_fields(record: pymarc.Record) -> pymarc.Record:
             record.remove_field(field)
     return record
 
-def
+def clean_non_ff_999_fields(record: Record, **kwargs) -> Record:
+    """
+    When loading migrated MARC records from folio_migration_tools, the presence of other 999 fields
+    than those set by the migration process can cause the record to fail to load properly. This preprocessor
+    function moves all 999 fields with non-ff indicators to 945 fields with 99 indicators.
+    """
+    for field in record.get_fields("999"):
+        if field.indicators != pymarc.Indicators(*["f", "f"]):
+            logger.log(
+                26,
+                "DATA ISSUE\t%s\t%s\t%s",
+                record["001"].value(),
+                "Record contains a 999 field with non-ff indicators: Moving field to a 945 with indicators \"99\"",
+                field,
+            )
+            _945 = pymarc.Field(
+                tag="945",
+                indicators=pymarc.Indicators("9", "9"),
+                subfields=field.subfields,
+            )
+            record.add_ordered_field(_945)
+            record.remove_field(field)
+    return record
+
+def sudoc_supercede_prep(record: Record, **kwargs) -> Record:
     """
     Preprocesses a record from the ABES SUDOC catalog to copy 035 fields
     with a $9 subfield value of 'sudoc' to 935 fields with a $a subfield
@@ -96,10 +219,10 @@ def sudoc_supercede_prep(record: pymarc.Record) -> pymarc.Record:
     in FOLIO. This also applyes the prepend_ppn_prefix_001 function to the record.
 
     Args:
-        record (
+        record (Record): The MARC record to preprocess.
 
     Returns:
-
+        Record: The preprocessed MARC record.
     """
     record = prepend_abes_prefix_001(record)
     for field in record.get_fields("035"):
@@ -113,7 +236,7 @@ def sudoc_supercede_prep(record: pymarc.Record) -> pymarc.Record:
     return record
 
 
-def clean_empty_fields(record:
+def clean_empty_fields(record: Record, **kwargs) -> Record:
     """
     Remove empty fields and subfields from the record. These can cause
     data import mapping issues in FOLIO. Removals are logged at custom
@@ -121,10 +244,10 @@ def clean_empty_fields(record: pymarc.Record) -> pymarc.Record:
     data issues report.
 
     Args:
-        record (
+        record (Record): The MARC record to preprocess.
 
     Returns:
-
+        Record: The preprocessed MARC record.
     """
     MAPPED_FIELDS = {
         "010": ["a", "z"],
@@ -233,73 +356,72 @@ def clean_empty_fields(record: pymarc.Record) -> pymarc.Record:
         "856": ["u", "y", "z"],
     }
 
-    for field in
+    for field in record.get_fields(*MAPPED_FIELDS.keys()):
         len_subs = len(field.subfields)
-        subfield_value = bool(field.subfields[0].value) if len_subs
-        if
-
+        subfield_value = bool(field.subfields[0].value) if len_subs else False
+        if int(field.tag) > 9 and len_subs == 0:
+            logger.log(
+                26,
+                "DATA ISSUE\t%s\t%s\t%s",
+                record["001"].value(),
+                f"{field.tag} is empty, removing field",
+                field,
+            )
+            record.remove_field(field)
+        elif len_subs == 1 and not subfield_value:
+            logger.log(
+                26,
+                "DATA ISSUE\t%s\t%s\t%s",
+                record["001"].value(),
+                f"{field.tag}${field.subfields[0].code} is empty, no other subfields present, removing field",
+                field,
+            )
+            record.remove_field(field)
+        else:
+            if len_subs > 1 and "a" in field and not field["a"].strip():
                 logger.log(
                     26,
                     "DATA ISSUE\t%s\t%s\t%s",
                     record["001"].value(),
-                    f"{field.tag} is empty, removing
+                    f"{field.tag}$a is empty, removing subfield",
                     field,
                 )
-
-
+                field.delete_subfield("a")
+            for idx, subfield in enumerate(list(field.subfields), start=1):
+                if (
+                    subfield.code in MAPPED_FIELDS.get(field.tag, [])
+                    and not subfield.value
+                ):
+                    logger.log(
+                        26,
+                        "DATA ISSUE\t%s\t%s\t%s",
+                        record["001"].value(),
+                        f"{field.tag}${subfield.code} ({ordinal(idx)} subfield) is empty, but other subfields have values, removing subfield",
+                        field,
+                    )
+                    field.delete_subfield(subfield.code)
+            if len(field.subfields) == 0:
                 logger.log(
                     26,
                     "DATA ISSUE\t%s\t%s\t%s",
                     record["001"].value(),
-                    f"{field.tag}
+                    f"{field.tag} has no non-empty subfields after cleaning, removing field",
                     field,
                 )
                 record.remove_field(field)
-        else:
-            if len_subs > 1 and "a" in field and not field["a"].strip():
-                logger.log(
-                    26,
-                    "DATA ISSUE\t%s\t%s\t%s",
-                    record["001"].value(),
-                    f"{field.tag}$a is empty, removing subfield",
-                    field,
-                )
-                field.delete_subfield("a")
-            for idx, subfield in enumerate(list(field.subfields), start=1):
-                if (
-                    subfield.code in MAPPED_FIELDS.get(field.tag, [])
-                    and not subfield.value
-                ):
-                    logger.log(
-                        26,
-                        "DATA ISSUE\t%s\t%s\t%s",
-                        record["001"].value(),
-                        f"{field.tag}${subfield.code} ({ordinal(idx)} subfield) is empty, but other subfields have values, removing subfield",
-                        field,
-                    )
-                    field.delete_subfield(subfield.code)
-            if len(field.subfields) == 0:
-                logger.log(
-                    26,
-                    "DATA ISSUE\t%s\t%s\t%s",
-                    record["001"].value(),
-                    f"{field.tag} has no non-empty subfields after cleaning, removing field",
-                    field,
-                )
-                record.remove_field(field)
     return record
 
 
-def fix_leader(record:
+def fix_leader(record: Record, **kwargs) -> Record:
     """
     Fixes the leader of the record by setting the record status to 'c' (modified
     record) and the type of record to 'a' (language material).
 
     Args:
-        record (
+        record (Record): The MARC record to preprocess.
 
     Returns:
-
+        Record: The preprocessed MARC record.
     """
     VALID_STATUSES = ["a", "c", "d", "n", "p"]
     VALID_TYPES = ["a", "c", "d", "e", "f", "g", "i", "j", "k", "m", "o", "p", "r", "t"]
@@ -309,7 +431,7 @@ def fix_leader(record: pymarc.Record) -> pymarc.Record:
             "DATA ISSUE\t%s\t%s\t%s",
             record["001"].value(),
             f"Invalid record status: {record.leader[5]}, setting to 'c'",
-            record,
+            record.leader,
         )
         record.leader = pymarc.Leader(record.leader[:5] + "c" + record.leader[6:])
     if record.leader[6] not in VALID_TYPES:
@@ -318,11 +440,40 @@ def fix_leader(record: pymarc.Record) -> pymarc.Record:
             "DATA ISSUE\t%s\t%s\t%s",
             record["001"].value(),
             f"Invalid record type: {record.leader[6]}, setting to 'a'",
-            record,
+            record.leader,
        )
         record.leader = pymarc.Leader(record.leader[:6] + "a" + record.leader[7:])
     return record
 
+def move_authority_subfield_9_to_0_all_controllable_fields(record: Record, **kwargs) -> Record:
+    """
+    Move subfield 9 from authority fields to subfield 0. This is useful when
+    importing records from the ABES SUDOC catalog.
+
+    Args:
+        record (Record): The MARC record to preprocess.
+
+    Returns:
+        Record: The preprocessed MARC record.
+    """
+    controlled_fields = [
+        "100", "110", "111", "130",
+        "600", "610", "611", "630", "650", "651", "655",
+        "700", "710", "711", "730",
+        "800", "810", "811", "830"
+    ]
+    for field in record.get_fields(*controlled_fields):
+        for subfield in list(field.get_subfields("9")):
+            field.add_subfield("0", subfield)
+            field.delete_subfield("9", subfield)
+            logger.log(
+                26,
+                "DATA ISSUE\t%s\t%s\t%s",
+                record["001"].value(),
+                f"Subfield 9 moved to subfield 0 in {field.tag}",
+                field,
+            )
+    return record
 
 def ordinal(n):
     s = ("th", "st", "nd", "rd") + ("th",) * 10
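
To see the 001-prefix and 999-cleanup preprocessors in isolation, a small hedged example (identifiers and field contents are invented):

    import pymarc

    from folio_data_import.marc_preprocessors._preprocessors import (
        prepend_ppn_prefix_001,
        strip_999_ff_fields,
    )

    record = pymarc.Record()
    record.add_ordered_field(pymarc.Field(tag="001", data="123456789"))
    record.add_ordered_field(
        pymarc.Field(
            tag="999",
            indicators=pymarc.Indicators("f", "f"),
            subfields=[pymarc.Subfield(code="i", value="11111111-2222-3333-4444-555555555555")],
        )
    )

    record = prepend_ppn_prefix_001(record)   # 001 becomes "(PPN)123456789"
    record = strip_999_ff_fields(record)      # the 999 ff field is removed
    print(record["001"].data, record.get_fields("999"))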
{folio_data_import-0.2.8rc11.dist-info → folio_data_import-0.3.0.dist-info}/METADATA RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: folio_data_import
-Version: 0.
+Version: 0.3.0
 Summary: A python module to interact with the data importing capabilities of the open-source FOLIO ILS
 License: MIT
 Author: Brooks Travis
@@ -19,8 +19,7 @@ Requires-Dist: flake8-black (>=0.3.6,<0.4.0)
 Requires-Dist: flake8-bugbear (>=24.8.19,<25.0.0)
 Requires-Dist: flake8-docstrings (>=1.7.0,<2.0.0)
 Requires-Dist: flake8-isort (>=6.1.1,<7.0.0)
-Requires-Dist: folioclient (>=0.
-Requires-Dist: httpx (>=0.27.2,<0.28.0)
+Requires-Dist: folioclient (>=0.70.1,<0.71.0)
 Requires-Dist: inquirer (>=3.4.0,<4.0.0)
 Requires-Dist: pyhumps (>=3.8.0,<4.0.0)
 Requires-Dist: pymarc (>=5.2.2,<6.0.0)
folio_data_import-0.3.0.dist-info/RECORD ADDED

@@ -0,0 +1,12 @@
+folio_data_import/MARCDataImport.py,sha256=je3TdCdaDR-gYA3Gh1k4AX9l3v83sCTt4Y9lOFxayu8,36220
+folio_data_import/UserImport.py,sha256=ZulGaGJhI_N5vmR69YF_qbzbGeVyzcthXklSjDpZCyA,40998
+folio_data_import/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+folio_data_import/__main__.py,sha256=kav_uUsnrIjGjVxQkk3exLKrc1mah9t2x3G6bGS-5I0,3710
+folio_data_import/custom_exceptions.py,sha256=xOeIbM86d2r5-z3ul4JFTJLT3vI3kwmEq62cWS-9dOc,646
+folio_data_import/marc_preprocessors/__init__.py,sha256=urExfNTQoZsDCtDPcUY9EEC5OFcUihxhYEQkQFVzbMY,30
+folio_data_import/marc_preprocessors/_preprocessors.py,sha256=4Zrp-9LdL7f5QqUTOjyMkK5IaHP2YOkmkqoY_4o585Q,16377
+folio_data_import-0.3.0.dist-info/LICENSE,sha256=qJX7wxMC7ky9Kq4v3zij8MjGEiC5wsB7pYeOhLj5TDk,1083
+folio_data_import-0.3.0.dist-info/METADATA,sha256=Aqf0PXhdwFyChMKvl9cOluKN60IyMAUPDKSpb8AOlXI,6069
+folio_data_import-0.3.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+folio_data_import-0.3.0.dist-info/entry_points.txt,sha256=498SxWVXeEMRNw3PUf-eoReZvKewmYwPBtZhIUPr_Jg,192
+folio_data_import-0.3.0.dist-info/RECORD,,

folio_data_import-0.2.8rc11.dist-info/RECORD DELETED

@@ -1,11 +0,0 @@
-folio_data_import/MARCDataImport.py,sha256=DjNIfnKSQ7d2IWP0x_R8NRDeDBHoAmalNMmsimeHf94,33164
-folio_data_import/UserImport.py,sha256=Y9ZjYoUP_vNJVftx_xUcbBqvC5CwWeuzlmCcSVQfzgo,40976
-folio_data_import/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-folio_data_import/__main__.py,sha256=kav_uUsnrIjGjVxQkk3exLKrc1mah9t2x3G6bGS-5I0,3710
-folio_data_import/marc_preprocessors/__init__.py,sha256=urExfNTQoZsDCtDPcUY9EEC5OFcUihxhYEQkQFVzbMY,30
-folio_data_import/marc_preprocessors/_preprocessors.py,sha256=4i1_lEnptzZDx3DojX9sfvJ_hmehwFJUC3aZsUADcwA,10851
-folio_data_import-0.2.8rc11.dist-info/LICENSE,sha256=qJX7wxMC7ky9Kq4v3zij8MjGEiC5wsB7pYeOhLj5TDk,1083
-folio_data_import-0.2.8rc11.dist-info/METADATA,sha256=xlq3E8A6c-dme1eF5GTNmskjrvqFBidPWL7Z7K1hsqs,6113
-folio_data_import-0.2.8rc11.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
-folio_data_import-0.2.8rc11.dist-info/entry_points.txt,sha256=498SxWVXeEMRNw3PUf-eoReZvKewmYwPBtZhIUPr_Jg,192
-folio_data_import-0.2.8rc11.dist-info/RECORD,,
{folio_data_import-0.2.8rc11.dist-info → folio_data_import-0.3.0.dist-info}/LICENSE RENAMED
File without changes

{folio_data_import-0.2.8rc11.dist-info → folio_data_import-0.3.0.dist-info}/entry_points.txt RENAMED
File without changes