folio-data-import 0.2.8rc12__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff shows the changes between publicly available versions of this package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Potentially problematic release.
This version of folio-data-import might be problematic. Click here for more details.
- folio_data_import/MARCDataImport.py +215 -177
- folio_data_import/UserImport.py +45 -24
- folio_data_import/custom_exceptions.py +29 -0
- folio_data_import/marc_preprocessors/_preprocessors.py +221 -67
- {folio_data_import-0.2.8rc12.dist-info → folio_data_import-0.3.1.dist-info}/METADATA +2 -3
- folio_data_import-0.3.1.dist-info/RECORD +12 -0
- {folio_data_import-0.2.8rc12.dist-info → folio_data_import-0.3.1.dist-info}/WHEEL +1 -1
- folio_data_import-0.2.8rc12.dist-info/RECORD +0 -11
- {folio_data_import-0.2.8rc12.dist-info → folio_data_import-0.3.1.dist-info}/LICENSE +0 -0
- {folio_data_import-0.2.8rc12.dist-info → folio_data_import-0.3.1.dist-info}/entry_points.txt +0 -0
|
@@ -2,8 +2,8 @@ import argparse
|
|
|
2
2
|
import asyncio
|
|
3
3
|
import datetime
|
|
4
4
|
import glob
|
|
5
|
-
import importlib
|
|
6
5
|
import io
|
|
6
|
+
import json
|
|
7
7
|
import logging
|
|
8
8
|
import math
|
|
9
9
|
import os
|
|
@@ -15,7 +15,7 @@ from functools import cached_property
|
|
|
15
15
|
from getpass import getpass
|
|
16
16
|
from pathlib import Path
|
|
17
17
|
from time import sleep
|
|
18
|
-
from typing import List, Union
|
|
18
|
+
from typing import BinaryIO, Callable, Dict, List, Union
|
|
19
19
|
|
|
20
20
|
import folioclient
|
|
21
21
|
import httpx
|
|
@@ -25,6 +25,9 @@ import tabulate
|
|
|
25
25
|
from humps import decamelize
|
|
26
26
|
from tqdm import tqdm
|
|
27
27
|
|
|
28
|
+
from folio_data_import.custom_exceptions import FolioDataImportBatchError, FolioDataImportJobError
|
|
29
|
+
from folio_data_import.marc_preprocessors._preprocessors import MARCPreprocessor
|
|
30
|
+
|
|
28
31
|
try:
|
|
29
32
|
datetime_utc = datetime.UTC
|
|
30
33
|
except AttributeError:
|
|
@@ -35,21 +38,25 @@ except AttributeError:
|
|
|
35
38
|
REPORT_SUMMARY_ORDERING = {"created": 0, "updated": 1, "discarded": 2, "error": 3}
|
|
36
39
|
|
|
37
40
|
# Set default timeout and backoff values for HTTP requests when retrying job status and final summary checks
|
|
38
|
-
RETRY_TIMEOUT_START =
|
|
39
|
-
RETRY_TIMEOUT_RETRY_FACTOR =
|
|
41
|
+
RETRY_TIMEOUT_START = 5
|
|
42
|
+
RETRY_TIMEOUT_RETRY_FACTOR = 1.5
|
|
43
|
+
RETRY_TIMEOUT_MAX = 25.32
|
|
40
44
|
|
|
41
45
|
# Custom log level for data issues, set to 26
|
|
42
46
|
DATA_ISSUE_LVL_NUM = 26
|
|
43
47
|
logging.addLevelName(DATA_ISSUE_LVL_NUM, "DATA_ISSUES")
|
|
44
48
|
|
|
49
|
+
|
|
45
50
|
def data_issues(self, msg, *args, **kws):
|
|
46
51
|
if self.isEnabledFor(DATA_ISSUE_LVL_NUM):
|
|
47
52
|
self._log(DATA_ISSUE_LVL_NUM, msg, args, **kws)
|
|
48
53
|
|
|
54
|
+
|
|
49
55
|
logging.Logger.data_issues = data_issues
|
|
50
56
|
|
|
51
57
|
logger = logging.getLogger(__name__)
|
|
52
58
|
|
|
59
|
+
|
|
53
60
|
class MARCImportJob:
|
|
54
61
|
"""
|
|
55
62
|
Class to manage importing MARC data (Bib, Authority) into FOLIO using the Change Manager
|
|
@@ -63,7 +70,6 @@ class MARCImportJob:
|
|
|
63
70
|
import_profile_name (str): The name of the data import job profile to use.
|
|
64
71
|
batch_size (int): The number of source records to include in a record batch (default=10).
|
|
65
72
|
batch_delay (float): The number of seconds to wait between record batches (default=0).
|
|
66
|
-
consolidate (bool): Consolidate files into a single job. Default is one job for each file.
|
|
67
73
|
no_progress (bool): Disable progress bars (eg. for running in a CI environment).
|
|
68
74
|
"""
|
|
69
75
|
|
|
@@ -75,14 +81,15 @@ class MARCImportJob:
|
|
|
75
81
|
http_client: httpx.Client
|
|
76
82
|
current_file: List[Path]
|
|
77
83
|
record_batch: List[dict] = []
|
|
78
|
-
error_records: int = 0
|
|
79
84
|
last_current: int = 0
|
|
80
85
|
total_records_sent: int = 0
|
|
81
86
|
finished: bool = False
|
|
82
87
|
job_id: str = ""
|
|
83
88
|
job_hrid: int = 0
|
|
84
|
-
current_file: Union[List[Path],List[io.BytesIO]] = []
|
|
89
|
+
current_file: Union[List[Path], List[io.BytesIO]] = []
|
|
85
90
|
_max_summary_retries: int = 2
|
|
91
|
+
_max_job_retries: int = 2
|
|
92
|
+
_job_retries: int = 0
|
|
86
93
|
_summary_retries: int = 0
|
|
87
94
|
|
|
88
95
|
def __init__(
|
|
@@ -92,18 +99,17 @@ class MARCImportJob:
|
|
|
92
99
|
import_profile_name: str,
|
|
93
100
|
batch_size=10,
|
|
94
101
|
batch_delay=0,
|
|
95
|
-
marc_record_preprocessor=
|
|
96
|
-
|
|
102
|
+
marc_record_preprocessor: Union[List[Callable], str] = [],
|
|
103
|
+
preprocessor_args: Dict[str, Dict] = {},
|
|
97
104
|
no_progress=False,
|
|
98
105
|
let_summary_fail=False,
|
|
99
106
|
split_files=False,
|
|
100
107
|
split_size=1000,
|
|
108
|
+
split_offset=0,
|
|
101
109
|
) -> None:
|
|
102
|
-
self.consolidate_files = consolidate
|
|
103
110
|
self.split_files = split_files
|
|
104
111
|
self.split_size = split_size
|
|
105
|
-
|
|
106
|
-
raise ValueError("Cannot consolidate and split files at the same time.")
|
|
112
|
+
self.split_offset = split_offset
|
|
107
113
|
self.no_progress = no_progress
|
|
108
114
|
self.let_summary_fail = let_summary_fail
|
|
109
115
|
self.folio_client: folioclient.FolioClient = folio_client
|
|
@@ -111,17 +117,17 @@ class MARCImportJob:
|
|
|
111
117
|
self.import_profile_name = import_profile_name
|
|
112
118
|
self.batch_size = batch_size
|
|
113
119
|
self.batch_delay = batch_delay
|
|
114
|
-
self.current_retry_timeout =
|
|
115
|
-
self.marc_record_preprocessor =
|
|
120
|
+
self.current_retry_timeout = 0
|
|
121
|
+
self.marc_record_preprocessor: MARCPreprocessor = MARCPreprocessor(
|
|
122
|
+
marc_record_preprocessor, **preprocessor_args
|
|
123
|
+
)
|
|
116
124
|
|
|
117
125
|
async def do_work(self) -> None:
|
|
118
126
|
"""
|
|
119
127
|
Performs the necessary work for data import.
|
|
120
128
|
|
|
121
129
|
This method initializes an HTTP client, files to store records that fail to send,
|
|
122
|
-
and calls
|
|
123
|
-
it imports all the files specified in `import_files` as a single batch. Otherwise,
|
|
124
|
-
it imports each file as a separate import job.
|
|
130
|
+
and calls the appropriate method to import MARC files based on the configuration.
|
|
125
131
|
|
|
126
132
|
Returns:
|
|
127
133
|
None
|
|
@@ -146,27 +152,37 @@ class MARCImportJob:
|
|
|
146
152
|
self.failed_batches_file = failed_batches
|
|
147
153
|
logger.info(f"Writing failed batches to {self.failed_batches_file.name}")
|
|
148
154
|
self.http_client = http_client
|
|
149
|
-
if self.
|
|
150
|
-
|
|
151
|
-
await self.import_marc_file()
|
|
152
|
-
elif self.split_files:
|
|
153
|
-
for file in self.import_files:
|
|
154
|
-
with open(file, "rb") as f:
|
|
155
|
-
file_length = await self.read_total_records([f])
|
|
156
|
-
expected_batches = math.ceil(file_length /self.split_size)
|
|
157
|
-
logger.info(f"{file.name} contains {file_length} records. Splitting into {expected_batches} {self.split_size} record batches.")
|
|
158
|
-
zero_pad_parts = len(str(expected_batches)) if expected_batches > 1 else 2
|
|
159
|
-
for idx, batch in enumerate(self.split_marc_file(file, self.split_size), start=1):
|
|
160
|
-
batch.name = f"{file.name}_part{idx:0{zero_pad_parts}}"
|
|
161
|
-
self.current_file = [batch]
|
|
162
|
-
await self.import_marc_file()
|
|
163
|
-
self.move_file_to_complete(file)
|
|
155
|
+
if self.split_files:
|
|
156
|
+
await self.process_split_files()
|
|
164
157
|
else:
|
|
165
158
|
for file in self.import_files:
|
|
166
159
|
self.current_file = [file]
|
|
167
160
|
await self.import_marc_file()
|
|
168
161
|
await self.wrap_up()
|
|
169
162
|
|
|
163
|
+
async def process_split_files(self):
|
|
164
|
+
"""
|
|
165
|
+
Process the import of files in smaller batches.
|
|
166
|
+
This method is called when `split_files` is set to True.
|
|
167
|
+
It splits each file into smaller chunks and processes them one by one.
|
|
168
|
+
"""
|
|
169
|
+
for file in self.import_files:
|
|
170
|
+
with open(file, "rb") as f:
|
|
171
|
+
file_length = await self.read_total_records([f])
|
|
172
|
+
expected_batches = math.ceil(file_length / self.split_size)
|
|
173
|
+
logger.info(
|
|
174
|
+
f"{file.name} contains {file_length} records. Splitting into {expected_batches} {self.split_size} record batches."
|
|
175
|
+
)
|
|
176
|
+
zero_pad_parts = len(str(expected_batches)) if expected_batches > 1 else 2
|
|
177
|
+
for idx, batch in enumerate(
|
|
178
|
+
self.split_marc_file(file, self.split_size), start=1
|
|
179
|
+
):
|
|
180
|
+
if idx > self.split_offset:
|
|
181
|
+
batch.name = f"{file.name} (Part {idx:0{zero_pad_parts}})"
|
|
182
|
+
self.current_file = [batch]
|
|
183
|
+
await self.import_marc_file()
|
|
184
|
+
self.move_file_to_complete(file)
|
|
185
|
+
|
|
170
186
|
async def wrap_up(self) -> None:
|
|
171
187
|
"""
|
|
172
188
|
Wraps up the data import process.
|
|
@@ -208,22 +224,29 @@ class MARCImportJob:
|
|
|
208
224
|
timeout=self.current_retry_timeout,
|
|
209
225
|
verify=self.folio_client.ssl_verify,
|
|
210
226
|
) as temp_client:
|
|
227
|
+
self.folio_client.httpx_client = temp_client
|
|
211
228
|
job_status = self.folio_client.folio_get(
|
|
212
229
|
"/metadata-provider/jobExecutions?statusNot=DISCARDED&uiStatusAny"
|
|
213
230
|
"=PREPARING_FOR_PREVIEW&uiStatusAny=READY_FOR_PREVIEW&uiStatusAny=RUNNING&limit=50"
|
|
214
231
|
)
|
|
215
232
|
self.current_retry_timeout = None
|
|
216
233
|
except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
|
|
217
|
-
if
|
|
218
|
-
|
|
219
|
-
|
|
234
|
+
error_text = e.response.text if hasattr(e, "response") else str(e)
|
|
235
|
+
if self.current_retry_timeout <= RETRY_TIMEOUT_MAX and (
|
|
236
|
+
not hasattr(e, "response") or e.response.status_code in [502, 504, 401]
|
|
237
|
+
):
|
|
238
|
+
logger.warning(
|
|
239
|
+
f"SERVER ERROR fetching job status: {error_text}. Retrying."
|
|
240
|
+
)
|
|
220
241
|
sleep(0.25)
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
242
|
+
return await self.get_job_status()
|
|
243
|
+
elif self.current_retry_timeout > RETRY_TIMEOUT_MAX and (
|
|
244
|
+
not hasattr(e, "response") or e.response.status_code in [502, 504, 401]
|
|
245
|
+
):
|
|
246
|
+
logger.critical(
|
|
247
|
+
f"SERVER ERROR fetching job status: {error_text}. Max retries exceeded."
|
|
248
|
+
)
|
|
249
|
+
raise FolioDataImportJobError(self.job_id, error_text, e)
|
|
227
250
|
else:
|
|
228
251
|
raise e
|
|
229
252
|
except Exception as e:
|
|
@@ -236,19 +259,29 @@ class MARCImportJob:
|
|
|
236
259
|
self.pbar_imported.update(status["progress"]["current"] - self.last_current)
|
|
237
260
|
self.last_current = status["progress"]["current"]
|
|
238
261
|
except (IndexError, ValueError, KeyError):
|
|
239
|
-
logger.debug(
|
|
262
|
+
logger.debug(
|
|
263
|
+
f"No active job found with ID {self.job_id}. Checking for finished job."
|
|
264
|
+
)
|
|
240
265
|
try:
|
|
241
266
|
job_status = self.folio_client.folio_get(
|
|
242
267
|
"/metadata-provider/jobExecutions?limit=100&sortBy=completed_date%2Cdesc&statusAny"
|
|
243
268
|
"=COMMITTED&statusAny=ERROR&statusAny=CANCELLED"
|
|
244
269
|
)
|
|
245
270
|
status = [
|
|
246
|
-
job
|
|
271
|
+
job
|
|
272
|
+
for job in job_status["jobExecutions"]
|
|
273
|
+
if job["id"] == self.job_id
|
|
247
274
|
][0]
|
|
248
|
-
self.pbar_imported.update(
|
|
275
|
+
self.pbar_imported.update(
|
|
276
|
+
status["progress"]["current"] - self.last_current
|
|
277
|
+
)
|
|
249
278
|
self.last_current = status["progress"]["current"]
|
|
250
279
|
self.finished = True
|
|
251
|
-
except (
|
|
280
|
+
except (
|
|
281
|
+
httpx.ConnectTimeout,
|
|
282
|
+
httpx.ReadTimeout,
|
|
283
|
+
httpx.HTTPStatusError,
|
|
284
|
+
) as e:
|
|
252
285
|
if not hasattr(e, "response") or e.response.status_code in [502, 504]:
|
|
253
286
|
error_text = e.response.text if hasattr(e, "response") else str(e)
|
|
254
287
|
logger.warning(
|
|
@@ -276,7 +309,7 @@ class MARCImportJob:
|
|
|
276
309
|
"""
|
|
277
310
|
try:
|
|
278
311
|
create_job = self.http_client.post(
|
|
279
|
-
self.folio_client.
|
|
312
|
+
self.folio_client.gateway_url + "/change-manager/jobExecutions",
|
|
280
313
|
headers=self.folio_client.okapi_headers,
|
|
281
314
|
json={"sourceType": "ONLINE", "userId": self.folio_client.current_user},
|
|
282
315
|
)
|
|
@@ -325,7 +358,7 @@ class MARCImportJob:
|
|
|
325
358
|
The response from the HTTP request to set the job profile.
|
|
326
359
|
"""
|
|
327
360
|
set_job_profile = self.http_client.put(
|
|
328
|
-
self.folio_client.
|
|
361
|
+
self.folio_client.gateway_url
|
|
329
362
|
+ "/change-manager/jobExecutions/"
|
|
330
363
|
+ self.job_id
|
|
331
364
|
+ "/jobProfile",
|
|
@@ -338,7 +371,7 @@ class MARCImportJob:
|
|
|
338
371
|
)
|
|
339
372
|
try:
|
|
340
373
|
set_job_profile.raise_for_status()
|
|
341
|
-
self.job_hrid = set_job_profile.json()[
|
|
374
|
+
self.job_hrid = set_job_profile.json()["hrId"]
|
|
342
375
|
logger.info(f"Job HRID: {self.job_hrid}")
|
|
343
376
|
except httpx.HTTPError as e:
|
|
344
377
|
logger.error(
|
|
@@ -350,7 +383,7 @@ class MARCImportJob:
|
|
|
350
383
|
raise e
|
|
351
384
|
|
|
352
385
|
@staticmethod
|
|
353
|
-
async def read_total_records(files) -> int:
|
|
386
|
+
async def read_total_records(files: List[BinaryIO]) -> int:
|
|
354
387
|
"""
|
|
355
388
|
Reads the total number of records from the given files.
|
|
356
389
|
|
|
@@ -379,17 +412,15 @@ class MARCImportJob:
|
|
|
379
412
|
"""
|
|
380
413
|
try:
|
|
381
414
|
post_batch = self.http_client.post(
|
|
382
|
-
self.folio_client.
|
|
415
|
+
self.folio_client.gateway_url
|
|
383
416
|
+ f"/change-manager/jobExecutions/{self.job_id}/records",
|
|
384
417
|
headers=self.folio_client.okapi_headers,
|
|
385
418
|
json=batch_payload,
|
|
386
419
|
)
|
|
387
|
-
# if batch_payload["recordsMetadata"]["last"]:
|
|
388
|
-
# logger.log(
|
|
389
|
-
# 25,
|
|
390
|
-
# f"Sending last batch of {batch_payload['recordsMetadata']['total']} records.",
|
|
391
|
-
# )
|
|
392
420
|
except (httpx.ConnectTimeout, httpx.ReadTimeout):
|
|
421
|
+
logger.warning(
|
|
422
|
+
f"CONNECTION ERROR posting batch {batch_payload['id']}. Retrying..."
|
|
423
|
+
)
|
|
393
424
|
sleep(0.25)
|
|
394
425
|
return await self.process_record_batch(batch_payload)
|
|
395
426
|
try:
|
|
@@ -397,20 +428,19 @@ class MARCImportJob:
|
|
|
397
428
|
self.total_records_sent += len(self.record_batch)
|
|
398
429
|
self.record_batch = []
|
|
399
430
|
self.pbar_sent.update(len(batch_payload["initialRecords"]))
|
|
400
|
-
except
|
|
431
|
+
except httpx.HTTPStatusError as e:
|
|
401
432
|
if (
|
|
402
|
-
|
|
403
|
-
): # TODO:
|
|
433
|
+
e.response.status_code in [500, 400, 422]
|
|
434
|
+
): # TODO: Update once we no longer have to support < Sunflower to just be 400
|
|
404
435
|
self.total_records_sent += len(self.record_batch)
|
|
405
436
|
self.record_batch = []
|
|
406
437
|
self.pbar_sent.update(len(batch_payload["initialRecords"]))
|
|
407
438
|
else:
|
|
408
|
-
logger.error("Error posting batch: " + str(e))
|
|
409
439
|
for record in self.record_batch:
|
|
410
440
|
self.failed_batches_file.write(record)
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
441
|
+
raise FolioDataImportBatchError(
|
|
442
|
+
batch_payload["id"], f"{e}\n{e.response.text}", e
|
|
443
|
+
)
|
|
414
444
|
await self.get_job_status()
|
|
415
445
|
sleep(self.batch_delay)
|
|
416
446
|
|
|
@@ -439,16 +469,12 @@ class MARCImportJob:
|
|
|
439
469
|
await self.create_batch_payload(
|
|
440
470
|
counter,
|
|
441
471
|
total_records,
|
|
442
|
-
|
|
443
|
-
== (total_records - self.error_records),
|
|
472
|
+
counter == total_records,
|
|
444
473
|
),
|
|
445
474
|
)
|
|
446
475
|
sleep(0.25)
|
|
447
476
|
if record:
|
|
448
|
-
|
|
449
|
-
record = await self.apply_marc_record_preprocessing(
|
|
450
|
-
record, self.marc_record_preprocessor
|
|
451
|
-
)
|
|
477
|
+
record = self.marc_record_preprocessor.do_work(record)
|
|
452
478
|
self.record_batch.append(record.as_marc())
|
|
453
479
|
counter += 1
|
|
454
480
|
else:
|
|
@@ -459,79 +485,26 @@ class MARCImportJob:
|
|
|
459
485
|
"",
|
|
460
486
|
)
|
|
461
487
|
self.bad_records_file.write(reader.current_chunk)
|
|
462
|
-
if self.record_batch:
|
|
463
|
-
await self.process_record_batch(
|
|
464
|
-
await self.create_batch_payload(
|
|
465
|
-
counter,
|
|
466
|
-
total_records,
|
|
467
|
-
(counter - self.error_records)
|
|
468
|
-
== (total_records - self.error_records),
|
|
469
|
-
),
|
|
470
|
-
)
|
|
471
488
|
if not self.split_files:
|
|
472
489
|
self.move_file_to_complete(file_path)
|
|
490
|
+
if self.record_batch or not self.finished:
|
|
491
|
+
await self.process_record_batch(
|
|
492
|
+
await self.create_batch_payload(
|
|
493
|
+
counter,
|
|
494
|
+
total_records,
|
|
495
|
+
counter == total_records,
|
|
496
|
+
),
|
|
497
|
+
)
|
|
473
498
|
|
|
474
|
-
def move_file_to_complete(self, file_path):
|
|
499
|
+
def move_file_to_complete(self, file_path: Path):
|
|
475
500
|
import_complete_path = file_path.parent.joinpath("import_complete")
|
|
476
501
|
if not import_complete_path.exists():
|
|
477
|
-
logger.debug(
|
|
502
|
+
logger.debug(
|
|
503
|
+
f"Creating import_complete directory: {import_complete_path.absolute()}"
|
|
504
|
+
)
|
|
478
505
|
import_complete_path.mkdir(exist_ok=True)
|
|
479
506
|
logger.debug(f"Moving {file_path} to {import_complete_path.absolute()}")
|
|
480
|
-
file_path.rename(
|
|
481
|
-
file_path.parent.joinpath("import_complete", file_path.name)
|
|
482
|
-
)
|
|
483
|
-
|
|
484
|
-
@staticmethod
|
|
485
|
-
async def apply_marc_record_preprocessing(
|
|
486
|
-
record: pymarc.Record, func_or_path
|
|
487
|
-
) -> pymarc.Record:
|
|
488
|
-
"""
|
|
489
|
-
Apply preprocessing to the MARC record before sending it to FOLIO.
|
|
490
|
-
|
|
491
|
-
Args:
|
|
492
|
-
record (pymarc.Record): The MARC record to preprocess.
|
|
493
|
-
func_or_path (Union[Callable, str]): The preprocessing function or its import path.
|
|
494
|
-
|
|
495
|
-
Returns:
|
|
496
|
-
pymarc.Record: The preprocessed MARC record.
|
|
497
|
-
"""
|
|
498
|
-
if isinstance(func_or_path, str):
|
|
499
|
-
func_paths = func_or_path.split(",")
|
|
500
|
-
for func_path in func_paths:
|
|
501
|
-
record = await MARCImportJob._apply_single_marc_record_preprocessing_by_path(
|
|
502
|
-
record, func_path
|
|
503
|
-
)
|
|
504
|
-
elif callable(func_or_path):
|
|
505
|
-
record = func_or_path(record)
|
|
506
|
-
else:
|
|
507
|
-
logger.warning(
|
|
508
|
-
f"Invalid preprocessing function: {func_or_path}. Skipping preprocessing."
|
|
509
|
-
)
|
|
510
|
-
return record
|
|
511
|
-
|
|
512
|
-
async def _apply_single_marc_record_preprocessing_by_path(
|
|
513
|
-
record: pymarc.Record, func_path: str
|
|
514
|
-
) -> pymarc.Record:
|
|
515
|
-
"""
|
|
516
|
-
Apply a single preprocessing function to the MARC record.
|
|
517
|
-
|
|
518
|
-
Args:
|
|
519
|
-
record (pymarc.Record): The MARC record to preprocess.
|
|
520
|
-
func_path (str): The path to the preprocessing function.
|
|
521
|
-
|
|
522
|
-
Returns:
|
|
523
|
-
pymarc.Record: The preprocessed MARC record.
|
|
524
|
-
"""
|
|
525
|
-
try:
|
|
526
|
-
module_path, func_name = func_path.rsplit(".", 1)
|
|
527
|
-
module = importlib.import_module(module_path)
|
|
528
|
-
func = getattr(module, func_name)
|
|
529
|
-
record = func(record)
|
|
530
|
-
except Exception as e:
|
|
531
|
-
logger.warning(
|
|
532
|
-
f"Error applying preprocessing function {func_path}: {e}. Skipping."
|
|
533
|
-
)
|
|
534
|
-
return record
|
|
507
|
+
file_path.rename(file_path.parent.joinpath("import_complete", file_path.name))
|
|
535
508
|
|
|
536
509
|
async def create_batch_payload(self, counter, total_records, is_last) -> dict:
|
|
537
510
|
"""
|
|
@@ -549,9 +522,9 @@ class MARCImportJob:
|
|
|
549
522
|
"id": str(uuid.uuid4()),
|
|
550
523
|
"recordsMetadata": {
|
|
551
524
|
"last": is_last,
|
|
552
|
-
"counter": counter
|
|
525
|
+
"counter": counter,
|
|
553
526
|
"contentType": "MARC_RAW",
|
|
554
|
-
"total": total_records
|
|
527
|
+
"total": total_records,
|
|
555
528
|
},
|
|
556
529
|
"initialRecords": [{"record": x.decode()} for x in self.record_batch],
|
|
557
530
|
}
|
|
@@ -575,11 +548,15 @@ class MARCImportJob:
|
|
|
575
548
|
|
|
576
549
|
record_body = f.read(record_length - 24)
|
|
577
550
|
if len(record_body) != record_length - 24:
|
|
578
|
-
raise ValueError(
|
|
551
|
+
raise ValueError(
|
|
552
|
+
"Unexpected end of file while reading MARC record."
|
|
553
|
+
)
|
|
579
554
|
|
|
580
555
|
# Verify record terminator
|
|
581
|
-
if record_body[-1:] != b
|
|
582
|
-
raise ValueError(
|
|
556
|
+
if record_body[-1:] != b"\x1d":
|
|
557
|
+
raise ValueError(
|
|
558
|
+
"MARC record does not end with the expected terminator (0x1D)."
|
|
559
|
+
)
|
|
583
560
|
|
|
584
561
|
# Write the full record to the batch buffer
|
|
585
562
|
batch.write(leader + record_body)
|
|
@@ -620,12 +597,11 @@ class MARCImportJob:
|
|
|
620
597
|
try:
|
|
621
598
|
if isinstance(self.current_file[0], Path):
|
|
622
599
|
files = [
|
|
623
|
-
stack.enter_context(open(file, "rb"))
|
|
600
|
+
stack.enter_context(open(file, "rb"))
|
|
601
|
+
for file in self.current_file
|
|
624
602
|
]
|
|
625
603
|
elif isinstance(self.current_file[0], io.BytesIO):
|
|
626
|
-
files = [
|
|
627
|
-
stack.enter_context(file) for file in self.current_file
|
|
628
|
-
]
|
|
604
|
+
files = [stack.enter_context(file) for file in self.current_file]
|
|
629
605
|
else:
|
|
630
606
|
raise ValueError("Invalid file type. Must be Path or BytesIO.")
|
|
631
607
|
except IndexError as e:
|
|
@@ -646,17 +622,62 @@ class MARCImportJob:
|
|
|
646
622
|
disable=self.no_progress,
|
|
647
623
|
) as pbar_sent,
|
|
648
624
|
):
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
625
|
+
try:
|
|
626
|
+
self.pbar_sent = pbar_sent
|
|
627
|
+
self.pbar_imported = pbar_imported
|
|
628
|
+
await self.process_records(files, total_records)
|
|
629
|
+
while not self.finished:
|
|
630
|
+
await self.get_job_status()
|
|
631
|
+
await asyncio.sleep(5)
|
|
632
|
+
except FolioDataImportBatchError as e:
|
|
633
|
+
logger.error(
|
|
634
|
+
f"Unhandled error posting batch {e.batch_id}: {e.message}"
|
|
635
|
+
)
|
|
636
|
+
await self.cancel_job()
|
|
637
|
+
raise e
|
|
638
|
+
except FolioDataImportJobError as e:
|
|
639
|
+
await self.cancel_job()
|
|
640
|
+
if self._job_retries < self._max_job_retries:
|
|
641
|
+
self._job_retries += 1
|
|
642
|
+
logger.error(
|
|
643
|
+
f"Unhandled error processing job {e.job_id}: {e.message}, cancelling and retrying."
|
|
644
|
+
)
|
|
645
|
+
await self.import_marc_file()
|
|
646
|
+
else:
|
|
647
|
+
logger.critical(
|
|
648
|
+
f"Unhandled error processing job {e.job_id}: {e.message}, cancelling and exiting (maximum retries reached)."
|
|
649
|
+
)
|
|
650
|
+
raise e
|
|
655
651
|
if self.finished:
|
|
656
652
|
await self.log_job_summary()
|
|
657
653
|
self.last_current = 0
|
|
658
654
|
self.finished = False
|
|
659
655
|
|
|
656
|
+
async def cancel_job(self) -> None:
|
|
657
|
+
"""
|
|
658
|
+
Cancels the current job execution.
|
|
659
|
+
|
|
660
|
+
This method sends a request to cancel the job execution and logs the result.
|
|
661
|
+
|
|
662
|
+
Returns:
|
|
663
|
+
None
|
|
664
|
+
"""
|
|
665
|
+
try:
|
|
666
|
+
cancel = self.http_client.delete(
|
|
667
|
+
self.folio_client.gateway_url
|
|
668
|
+
+ f"/change-manager/jobExecutions/{self.job_id}/records",
|
|
669
|
+
headers=self.folio_client.okapi_headers,
|
|
670
|
+
)
|
|
671
|
+
cancel.raise_for_status()
|
|
672
|
+
self.finished = True
|
|
673
|
+
logger.info(f"Cancelled job: {self.job_id}")
|
|
674
|
+
except (httpx.ConnectTimeout, httpx.ReadTimeout):
|
|
675
|
+
logger.warning(
|
|
676
|
+
f"CONNECTION ERROR cancelling job {self.job_id}. Retrying..."
|
|
677
|
+
)
|
|
678
|
+
sleep(0.25)
|
|
679
|
+
await self.cancel_job()
|
|
680
|
+
|
|
660
681
|
async def log_job_summary(self):
|
|
661
682
|
if job_summary := await self.get_job_summary():
|
|
662
683
|
job_id = job_summary.pop("jobExecutionId", None)
|
|
@@ -675,22 +696,22 @@ class MARCImportJob:
|
|
|
675
696
|
table_data.append(table_row)
|
|
676
697
|
table_data.sort(key=lambda x: REPORT_SUMMARY_ORDERING.get(x[0], 99))
|
|
677
698
|
columns = columns[:1] + [
|
|
678
|
-
|
|
679
|
-
|
|
699
|
+
" ".join(decamelize(x).split("_")[:-1]) for x in columns[1:]
|
|
700
|
+
]
|
|
680
701
|
logger.info(
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
702
|
+
f"Results for {'file' if len(self.current_file) == 1 else 'files'}: "
|
|
703
|
+
f"{', '.join([os.path.basename(x.name) for x in self.current_file])}"
|
|
704
|
+
)
|
|
684
705
|
logger.info(
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
),
|
|
689
|
-
)
|
|
706
|
+
"\n"
|
|
707
|
+
+ tabulate.tabulate(table_data, headers=columns, tablefmt="fancy_grid"),
|
|
708
|
+
)
|
|
690
709
|
if total_errors:
|
|
691
710
|
logger.info(f"Total errors: {total_errors}. Job ID: {job_id}.")
|
|
692
711
|
else:
|
|
693
|
-
logger.error(
|
|
712
|
+
logger.error(
|
|
713
|
+
f"No job summary available for job #{self.job_hrid}({self.job_id})."
|
|
714
|
+
)
|
|
694
715
|
|
|
695
716
|
async def get_job_summary(self) -> dict:
|
|
696
717
|
"""
|
|
@@ -715,8 +736,10 @@ class MARCImportJob:
|
|
|
715
736
|
self.current_retry_timeout = None
|
|
716
737
|
except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
|
|
717
738
|
error_text = e.response.text if hasattr(e, "response") else str(e)
|
|
718
|
-
if (self._max_summary_retries > self._summary_retries) and (
|
|
719
|
-
hasattr(e, "response")
|
|
739
|
+
if (self._max_summary_retries > self._summary_retries) and (
|
|
740
|
+
not hasattr(e, "response")
|
|
741
|
+
or (hasattr(e, "response") and e.response.status_code in [502, 504])
|
|
742
|
+
and not self.let_summary_fail
|
|
720
743
|
):
|
|
721
744
|
logger.warning(f"SERVER ERROR fetching job summary: {e}. Retrying.")
|
|
722
745
|
sleep(0.25)
|
|
@@ -727,8 +750,9 @@ class MARCImportJob:
|
|
|
727
750
|
self.folio_client.httpx_client = temp_client
|
|
728
751
|
self._summary_retries += 1
|
|
729
752
|
return await self.get_job_summary()
|
|
730
|
-
elif (self._summary_retries >= self._max_summary_retries) or (
|
|
731
|
-
e
|
|
753
|
+
elif (self._summary_retries >= self._max_summary_retries) or (
|
|
754
|
+
hasattr(e, "response")
|
|
755
|
+
and (e.response.status_code in [502, 504] and self.let_summary_fail)
|
|
732
756
|
):
|
|
733
757
|
logger.warning(
|
|
734
758
|
f"SERVER ERROR fetching job summary: {error_text}. Skipping final summary check."
|
|
@@ -833,19 +857,10 @@ async def main() -> None:
|
|
|
833
857
|
"to apply to each MARC record before sending to FOLIO. Function should take "
|
|
834
858
|
"a pymarc.Record object as input and return a pymarc.Record object."
|
|
835
859
|
),
|
|
836
|
-
default=
|
|
837
|
-
)
|
|
838
|
-
# Add mutually exclusive group for consolidate and split-files options
|
|
839
|
-
group = parser.add_mutually_exclusive_group()
|
|
840
|
-
group.add_argument(
|
|
841
|
-
"--consolidate",
|
|
842
|
-
action="store_true",
|
|
843
|
-
help=(
|
|
844
|
-
"Consolidate records into a single job. "
|
|
845
|
-
"Default is to create a new job for each MARC file."
|
|
846
|
-
),
|
|
860
|
+
default="",
|
|
847
861
|
)
|
|
848
|
-
|
|
862
|
+
|
|
863
|
+
parser.add_argument(
|
|
849
864
|
"--split-files",
|
|
850
865
|
action="store_true",
|
|
851
866
|
help="Split files into smaller parts before importing.",
|
|
@@ -856,6 +871,12 @@ async def main() -> None:
|
|
|
856
871
|
help="The number of records to include in each split file.",
|
|
857
872
|
default=1000,
|
|
858
873
|
)
|
|
874
|
+
parser.add_argument(
|
|
875
|
+
"--split-offset",
|
|
876
|
+
type=int,
|
|
877
|
+
help="The number of record batches of <split-size> to skip before starting import.",
|
|
878
|
+
default=0,
|
|
879
|
+
)
|
|
859
880
|
|
|
860
881
|
parser.add_argument(
|
|
861
882
|
"--no-progress",
|
|
@@ -867,6 +888,16 @@ async def main() -> None:
|
|
|
867
888
|
action="store_true",
|
|
868
889
|
help="Do not retry fetching the final job summary if it fails",
|
|
869
890
|
)
|
|
891
|
+
parser.add_argument(
|
|
892
|
+
"--preprocessor-config",
|
|
893
|
+
type=str,
|
|
894
|
+
help=(
|
|
895
|
+
"JSON file containing configuration for preprocessor functions. "
|
|
896
|
+
"This is passed to MARCPreprocessor class as a dict of dicts."
|
|
897
|
+
),
|
|
898
|
+
default=None,
|
|
899
|
+
)
|
|
900
|
+
|
|
870
901
|
args = parser.parse_args()
|
|
871
902
|
if not args.password:
|
|
872
903
|
args.password = getpass("Enter FOLIO password: ")
|
|
@@ -891,6 +922,12 @@ async def main() -> None:
|
|
|
891
922
|
else:
|
|
892
923
|
logger.info(marc_files)
|
|
893
924
|
|
|
925
|
+
if args.preprocessor_config:
|
|
926
|
+
with open(args.preprocessor_config, "r") as f:
|
|
927
|
+
preprocessor_args = json.load(f)
|
|
928
|
+
else:
|
|
929
|
+
preprocessor_args = {}
|
|
930
|
+
|
|
894
931
|
if not args.import_profile_name:
|
|
895
932
|
import_profiles = folio_client.folio_get(
|
|
896
933
|
"/data-import-profiles/jobProfiles",
|
|
@@ -919,11 +956,12 @@ async def main() -> None:
|
|
|
919
956
|
batch_size=args.batch_size,
|
|
920
957
|
batch_delay=args.batch_delay,
|
|
921
958
|
marc_record_preprocessor=args.preprocessor,
|
|
922
|
-
|
|
959
|
+
preprocessor_args=preprocessor_args,
|
|
923
960
|
no_progress=bool(args.no_progress),
|
|
924
961
|
let_summary_fail=bool(args.let_summary_fail),
|
|
925
962
|
split_files=bool(args.split_files),
|
|
926
963
|
split_size=args.split_size,
|
|
964
|
+
split_offset=args.split_offset,
|
|
927
965
|
).do_work()
|
|
928
966
|
except Exception as e:
|
|
929
967
|
logger.error("Error importing files: " + str(e))
|