folio-data-import 0.2.8rc8__py3-none-any.whl → 0.2.8rc10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of folio-data-import might be problematic. Click here for more details.
- folio_data_import/MARCDataImport.py +67 -40
- folio_data_import/marc_preprocessors/_preprocessors.py +64 -4
- {folio_data_import-0.2.8rc8.dist-info → folio_data_import-0.2.8rc10.dist-info}/METADATA +1 -1
- folio_data_import-0.2.8rc10.dist-info/RECORD +11 -0
- folio_data_import-0.2.8rc8.dist-info/RECORD +0 -11
- {folio_data_import-0.2.8rc8.dist-info → folio_data_import-0.2.8rc10.dist-info}/LICENSE +0 -0
- {folio_data_import-0.2.8rc8.dist-info → folio_data_import-0.2.8rc10.dist-info}/WHEEL +0 -0
- {folio_data_import-0.2.8rc8.dist-info → folio_data_import-0.2.8rc10.dist-info}/entry_points.txt +0 -0
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import argparse
|
|
2
2
|
import asyncio
|
|
3
3
|
import datetime
|
|
4
|
-
from email import message
|
|
5
4
|
import glob
|
|
6
5
|
import importlib
|
|
7
6
|
import io
|
|
@@ -104,6 +103,8 @@ class MARCImportJob:
|
|
|
104
103
|
self.marc_record_preprocessor = marc_record_preprocessor
|
|
105
104
|
self.pbar_sent: tqdm
|
|
106
105
|
self.pbar_imported: tqdm
|
|
106
|
+
self._max_summary_retries: int = 2
|
|
107
|
+
self._summary_retries: int = 0
|
|
107
108
|
|
|
108
109
|
async def do_work(self) -> None:
|
|
109
110
|
"""
|
|
@@ -183,11 +184,15 @@ class MARCImportJob:
|
|
|
183
184
|
if self.current_retry_timeout
|
|
184
185
|
else RETRY_TIMEOUT_START
|
|
185
186
|
)
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
)
|
|
190
|
-
|
|
187
|
+
with httpx.Client(
|
|
188
|
+
timeout=self.current_retry_timeout,
|
|
189
|
+
verify=self.folio_client.ssl_verify,
|
|
190
|
+
) as temp_client:
|
|
191
|
+
job_status = self.folio_client.folio_get(
|
|
192
|
+
"/metadata-provider/jobExecutions?statusNot=DISCARDED&uiStatusAny"
|
|
193
|
+
"=PREPARING_FOR_PREVIEW&uiStatusAny=READY_FOR_PREVIEW&uiStatusAny=RUNNING&limit=50"
|
|
194
|
+
)
|
|
195
|
+
self.current_retry_timeout = None
|
|
191
196
|
except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
|
|
192
197
|
if not hasattr(e, "response") or e.response.status_code in [502, 504]:
|
|
193
198
|
error_text = e.response.text if hasattr(e, "response") else str(e)
|
|
@@ -201,13 +206,17 @@ class MARCImportJob:
|
|
|
201
206
|
return await self.get_job_status()
|
|
202
207
|
else:
|
|
203
208
|
raise e
|
|
209
|
+
except Exception as e:
|
|
210
|
+
logger.error(f"Error fetching job status. {e}")
|
|
211
|
+
|
|
204
212
|
try:
|
|
205
213
|
status = [
|
|
206
214
|
job for job in job_status["jobExecutions"] if job["id"] == self.job_id
|
|
207
215
|
][0]
|
|
208
216
|
self.pbar_imported.update(status["progress"]["current"] - self.last_current)
|
|
209
217
|
self.last_current = status["progress"]["current"]
|
|
210
|
-
except IndexError:
|
|
218
|
+
except (IndexError, ValueError, KeyError):
|
|
219
|
+
logger.debug(f"No active job found with ID {self.job_id}. Checking for finished job.")
|
|
211
220
|
try:
|
|
212
221
|
job_status = self.folio_client.folio_get(
|
|
213
222
|
"/metadata-provider/jobExecutions?limit=100&sortBy=completed_date%2Cdesc&statusAny"
|
|
@@ -245,21 +254,26 @@ class MARCImportJob:
|
|
|
245
254
|
Raises:
|
|
246
255
|
HTTPError: If there is an error creating the job.
|
|
247
256
|
"""
|
|
248
|
-
create_job = self.http_client.post(
|
|
249
|
-
self.folio_client.okapi_url + "/change-manager/jobExecutions",
|
|
250
|
-
headers=self.folio_client.okapi_headers,
|
|
251
|
-
json={"sourceType": "ONLINE", "userId": self.folio_client.current_user},
|
|
252
|
-
)
|
|
253
257
|
try:
|
|
254
|
-
create_job.
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
"
|
|
258
|
-
+ str(e)
|
|
259
|
-
+ "\n"
|
|
260
|
-
+ getattr(getattr(e, "response", ""), "text", "")
|
|
258
|
+
create_job = self.http_client.post(
|
|
259
|
+
self.folio_client.okapi_url + "/change-manager/jobExecutions",
|
|
260
|
+
headers=self.folio_client.okapi_headers,
|
|
261
|
+
json={"sourceType": "ONLINE", "userId": self.folio_client.current_user},
|
|
261
262
|
)
|
|
262
|
-
|
|
263
|
+
create_job.raise_for_status()
|
|
264
|
+
except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
|
|
265
|
+
if not hasattr(e, "response") or e.response.status_code in [502, 504]:
|
|
266
|
+
logger.warning(f"SERVER ERROR creating job: {e}. Retrying.")
|
|
267
|
+
sleep(0.25)
|
|
268
|
+
return await self.create_folio_import_job()
|
|
269
|
+
else:
|
|
270
|
+
logger.error(
|
|
271
|
+
"Error creating job: "
|
|
272
|
+
+ str(e)
|
|
273
|
+
+ "\n"
|
|
274
|
+
+ getattr(getattr(e, "response", ""), "text", "")
|
|
275
|
+
)
|
|
276
|
+
raise e
|
|
263
277
|
self.job_id = create_job.json()["parentJobExecutionId"]
|
|
264
278
|
logger.info("Created job: " + self.job_id)
|
|
265
279
|
|
|
@@ -432,7 +446,7 @@ class MARCImportJob:
|
|
|
432
446
|
),
|
|
433
447
|
)
|
|
434
448
|
import_complete_path = file_path.parent.joinpath("import_complete")
|
|
435
|
-
if import_complete_path.exists():
|
|
449
|
+
if not import_complete_path.exists():
|
|
436
450
|
logger.debug(f"Creating import_complete directory: {import_complete_path.absolute()}")
|
|
437
451
|
import_complete_path.mkdir(exist_ok=True)
|
|
438
452
|
logger.debug(f"Moving {file_path} to {import_complete_path.absolute()}")
|
|
@@ -455,31 +469,42 @@ class MARCImportJob:
|
|
|
455
469
|
pymarc.Record: The preprocessed MARC record.
|
|
456
470
|
"""
|
|
457
471
|
if isinstance(func_or_path, str):
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
func = getattr(module, func_name)
|
|
463
|
-
except (ImportError, AttributeError) as e:
|
|
464
|
-
logger.error(
|
|
465
|
-
f"Error importing preprocessing function {func_or_path}: {e}. Skipping preprocessing."
|
|
472
|
+
func_paths = func_or_path.split(",")
|
|
473
|
+
for func_path in func_paths:
|
|
474
|
+
record = await MARCImportJob._apply_single_marc_record_preprocessing_by_path(
|
|
475
|
+
record, func_path
|
|
466
476
|
)
|
|
467
|
-
return record
|
|
468
477
|
elif callable(func_or_path):
|
|
469
|
-
|
|
478
|
+
record = func_or_path(record)
|
|
470
479
|
else:
|
|
471
480
|
logger.warning(
|
|
472
481
|
f"Invalid preprocessing function: {func_or_path}. Skipping preprocessing."
|
|
473
482
|
)
|
|
474
|
-
|
|
483
|
+
return record
|
|
484
|
+
|
|
485
|
+
async def _apply_single_marc_record_preprocessing_by_path(
|
|
486
|
+
record: pymarc.Record, func_path: str
|
|
487
|
+
) -> pymarc.Record:
|
|
488
|
+
"""
|
|
489
|
+
Apply a single preprocessing function to the MARC record.
|
|
490
|
+
|
|
491
|
+
Args:
|
|
492
|
+
record (pymarc.Record): The MARC record to preprocess.
|
|
493
|
+
func_path (str): The path to the preprocessing function.
|
|
475
494
|
|
|
495
|
+
Returns:
|
|
496
|
+
pymarc.Record: The preprocessed MARC record.
|
|
497
|
+
"""
|
|
476
498
|
try:
|
|
477
|
-
|
|
499
|
+
module_path, func_name = func_path.rsplit(".", 1)
|
|
500
|
+
module = importlib.import_module(module_path)
|
|
501
|
+
func = getattr(module, func_name)
|
|
502
|
+
record = func(record)
|
|
478
503
|
except Exception as e:
|
|
479
|
-
logger.
|
|
480
|
-
f"Error applying preprocessing function: {e}. Skipping
|
|
504
|
+
logger.warning(
|
|
505
|
+
f"Error applying preprocessing function {func_path}: {e}. Skipping."
|
|
481
506
|
)
|
|
482
|
-
|
|
507
|
+
return record
|
|
483
508
|
|
|
484
509
|
async def create_batch_payload(self, counter, total_records, is_last) -> dict:
|
|
485
510
|
"""
|
|
@@ -609,8 +634,8 @@ class MARCImportJob:
|
|
|
609
634
|
self.current_retry_timeout = None
|
|
610
635
|
except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
|
|
611
636
|
error_text = e.response.text if hasattr(e, "response") else str(e)
|
|
612
|
-
if not hasattr(e, "response") or (
|
|
613
|
-
e.response.status_code in [502, 504] and not self.let_summary_fail
|
|
637
|
+
if (self._max_summary_retries > self._summary_retries) and (not hasattr(e, "response") or (
|
|
638
|
+
hasattr(e, "response") and e.response.status_code in [502, 504]) and not self.let_summary_fail
|
|
614
639
|
):
|
|
615
640
|
logger.warning(f"SERVER ERROR fetching job summary: {e}. Retrying.")
|
|
616
641
|
sleep(0.25)
|
|
@@ -619,9 +644,10 @@ class MARCImportJob:
|
|
|
619
644
|
verify=self.folio_client.ssl_verify,
|
|
620
645
|
) as temp_client:
|
|
621
646
|
self.folio_client.httpx_client = temp_client
|
|
647
|
+
self._summary_retries += 1
|
|
622
648
|
return await self.get_job_summary()
|
|
623
|
-
elif hasattr(e, "response") and (
|
|
624
|
-
e.response.status_code in [502, 504] and self.let_summary_fail
|
|
649
|
+
elif (self._summary_retries >= self._max_summary_retries) or (hasattr(e, "response") and (
|
|
650
|
+
e.response.status_code in [502, 504] and self.let_summary_fail)
|
|
625
651
|
):
|
|
626
652
|
logger.warning(
|
|
627
653
|
f"SERVER ERROR fetching job summary: {error_text}. Skipping final summary check."
|
|
@@ -667,6 +693,7 @@ def set_up_cli_logging():
|
|
|
667
693
|
"marc_import_data_issues_{}.log".format(dt.now().strftime("%Y%m%d%H%M%S"))
|
|
668
694
|
)
|
|
669
695
|
data_issues_handler.setLevel(26)
|
|
696
|
+
data_issues_handler.addFilter(IncludeLevelFilter(DATA_ISSUE_LVL_NUM))
|
|
670
697
|
data_issues_formatter = logging.Formatter("%(message)s")
|
|
671
698
|
data_issues_handler.setFormatter(data_issues_formatter)
|
|
672
699
|
logger.addHandler(data_issues_handler)
|
|
@@ -63,6 +63,29 @@ def strip_999_ff_fields(record: pymarc.Record) -> pymarc.Record:
|
|
|
63
63
|
record.remove_field(field)
|
|
64
64
|
return record
|
|
65
65
|
|
|
66
|
+
def clean_999_fields(record: pymarc.Record) -> pymarc.Record:
|
|
67
|
+
"""
|
|
68
|
+
The presence of 999 fields, with or without ff indicators, can cause
|
|
69
|
+
issues with data import mapping in FOLIO. This function calls strip_999_ff_fields
|
|
70
|
+
to remove 999 fields with ff indicators and then copies the remaining 999 fields
|
|
71
|
+
to 945 fields.
|
|
72
|
+
|
|
73
|
+
Args:
|
|
74
|
+
record (pymarc.Record): The MARC record to preprocess.
|
|
75
|
+
|
|
76
|
+
Returns:
|
|
77
|
+
pymarc.Record: The preprocessed MARC record.
|
|
78
|
+
"""
|
|
79
|
+
record = strip_999_ff_fields(record)
|
|
80
|
+
for field in record.get_fields("999"):
|
|
81
|
+
_945 = pymarc.Field(
|
|
82
|
+
tag="945",
|
|
83
|
+
indicators=field.indicators,
|
|
84
|
+
subfields=field.subfields,
|
|
85
|
+
)
|
|
86
|
+
record.add_ordered_field(_945)
|
|
87
|
+
record.remove_field(field)
|
|
88
|
+
return record
|
|
66
89
|
|
|
67
90
|
def sudoc_supercede_prep(record: pymarc.Record) -> pymarc.Record:
|
|
68
91
|
"""
|
|
@@ -219,7 +242,7 @@ def clean_empty_fields(record: pymarc.Record) -> pymarc.Record:
|
|
|
219
242
|
26,
|
|
220
243
|
"DATA ISSUE\t%s\t%s\t%s",
|
|
221
244
|
record["001"].value(),
|
|
222
|
-
f"{field.tag} is empty",
|
|
245
|
+
f"{field.tag} is empty, removing field",
|
|
223
246
|
field,
|
|
224
247
|
)
|
|
225
248
|
record.remove_field(field)
|
|
@@ -228,7 +251,7 @@ def clean_empty_fields(record: pymarc.Record) -> pymarc.Record:
|
|
|
228
251
|
26,
|
|
229
252
|
"DATA ISSUE\t%s\t%s\t%s",
|
|
230
253
|
record["001"].value(),
|
|
231
|
-
f"{field.tag}${field.subfields[0].code} is empty, removing field",
|
|
254
|
+
f"{field.tag}${field.subfields[0].code} is empty, no other subfields present, removing field",
|
|
232
255
|
field,
|
|
233
256
|
)
|
|
234
257
|
record.remove_field(field)
|
|
@@ -238,12 +261,15 @@ def clean_empty_fields(record: pymarc.Record) -> pymarc.Record:
|
|
|
238
261
|
26,
|
|
239
262
|
"DATA ISSUE\t%s\t%s\t%s",
|
|
240
263
|
record["001"].value(),
|
|
241
|
-
f"{field.tag}$a is empty, removing
|
|
264
|
+
f"{field.tag}$a is empty, removing subfield",
|
|
242
265
|
field,
|
|
243
266
|
)
|
|
244
267
|
field.delete_subfield("a")
|
|
245
268
|
for idx, subfield in enumerate(list(field.subfields), start=1):
|
|
246
|
-
if
|
|
269
|
+
if (
|
|
270
|
+
subfield.code in MAPPED_FIELDS.get(field.tag, [])
|
|
271
|
+
and not subfield.value
|
|
272
|
+
):
|
|
247
273
|
logger.log(
|
|
248
274
|
26,
|
|
249
275
|
"DATA ISSUE\t%s\t%s\t%s",
|
|
@@ -264,6 +290,40 @@ def clean_empty_fields(record: pymarc.Record) -> pymarc.Record:
|
|
|
264
290
|
return record
|
|
265
291
|
|
|
266
292
|
|
|
293
|
+
def fix_leader(record: pymarc.Record) -> pymarc.Record:
|
|
294
|
+
"""
|
|
295
|
+
Fixes the leader of the record by setting the record status to 'c' (modified
|
|
296
|
+
record) and the type of record to 'a' (language material).
|
|
297
|
+
|
|
298
|
+
Args:
|
|
299
|
+
record (pymarc.Record): The MARC record to preprocess.
|
|
300
|
+
|
|
301
|
+
Returns:
|
|
302
|
+
pymarc.Record: The preprocessed MARC record.
|
|
303
|
+
"""
|
|
304
|
+
VALID_STATUSES = ["a", "c", "d", "n", "p"]
|
|
305
|
+
VALID_TYPES = ["a", "c", "d", "e", "f", "g", "i", "j", "k", "m", "o", "p", "r", "t"]
|
|
306
|
+
if record.leader[5] not in VALID_STATUSES:
|
|
307
|
+
logger.log(
|
|
308
|
+
26,
|
|
309
|
+
"DATA ISSUE\t%s\t%s\t%s",
|
|
310
|
+
record["001"].value(),
|
|
311
|
+
f"Invalid record status: {record.leader[5]}, setting to 'c'",
|
|
312
|
+
record,
|
|
313
|
+
)
|
|
314
|
+
record.leader = pymarc.Leader(record.leader[:5] + "c" + record.leader[6:])
|
|
315
|
+
if record.leader[6] not in VALID_TYPES:
|
|
316
|
+
logger.log(
|
|
317
|
+
26,
|
|
318
|
+
"DATA ISSUE\t%s\t%s\t%s",
|
|
319
|
+
record["001"].value(),
|
|
320
|
+
f"Invalid record type: {record.leader[6]}, setting to 'a'",
|
|
321
|
+
record,
|
|
322
|
+
)
|
|
323
|
+
record.leader = pymarc.Leader(record.leader[:6] + "a" + record.leader[7:])
|
|
324
|
+
return record
|
|
325
|
+
|
|
326
|
+
|
|
267
327
|
def ordinal(n):
|
|
268
328
|
s = ("th", "st", "nd", "rd") + ("th",) * 10
|
|
269
329
|
v = n % 100
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
folio_data_import/MARCDataImport.py,sha256=3MHwnusWMraUYbxaooVoVlKC3eG4D6-zBr4iguk52iA,33164
|
|
2
|
+
folio_data_import/UserImport.py,sha256=Y9ZjYoUP_vNJVftx_xUcbBqvC5CwWeuzlmCcSVQfzgo,40976
|
|
3
|
+
folio_data_import/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
folio_data_import/__main__.py,sha256=kav_uUsnrIjGjVxQkk3exLKrc1mah9t2x3G6bGS-5I0,3710
|
|
5
|
+
folio_data_import/marc_preprocessors/__init__.py,sha256=urExfNTQoZsDCtDPcUY9EEC5OFcUihxhYEQkQFVzbMY,30
|
|
6
|
+
folio_data_import/marc_preprocessors/_preprocessors.py,sha256=4i1_lEnptzZDx3DojX9sfvJ_hmehwFJUC3aZsUADcwA,10851
|
|
7
|
+
folio_data_import-0.2.8rc10.dist-info/LICENSE,sha256=qJX7wxMC7ky9Kq4v3zij8MjGEiC5wsB7pYeOhLj5TDk,1083
|
|
8
|
+
folio_data_import-0.2.8rc10.dist-info/METADATA,sha256=f7nKawpgkE1Ez6j2GeljBRMoWu4yhdSOcVlj5nWtYQ4,6113
|
|
9
|
+
folio_data_import-0.2.8rc10.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
|
|
10
|
+
folio_data_import-0.2.8rc10.dist-info/entry_points.txt,sha256=498SxWVXeEMRNw3PUf-eoReZvKewmYwPBtZhIUPr_Jg,192
|
|
11
|
+
folio_data_import-0.2.8rc10.dist-info/RECORD,,
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
folio_data_import/MARCDataImport.py,sha256=wLh8raMOIXCQiNZd_MLyUPwGlxZU2G-qOrnySKR9XU4,31723
|
|
2
|
-
folio_data_import/UserImport.py,sha256=Y9ZjYoUP_vNJVftx_xUcbBqvC5CwWeuzlmCcSVQfzgo,40976
|
|
3
|
-
folio_data_import/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
-
folio_data_import/__main__.py,sha256=kav_uUsnrIjGjVxQkk3exLKrc1mah9t2x3G6bGS-5I0,3710
|
|
5
|
-
folio_data_import/marc_preprocessors/__init__.py,sha256=urExfNTQoZsDCtDPcUY9EEC5OFcUihxhYEQkQFVzbMY,30
|
|
6
|
-
folio_data_import/marc_preprocessors/_preprocessors.py,sha256=LJdmwW_0oPbcaWb85F0PAcXJWbbBp9HHrFyZQZER5gs,8738
|
|
7
|
-
folio_data_import-0.2.8rc8.dist-info/LICENSE,sha256=qJX7wxMC7ky9Kq4v3zij8MjGEiC5wsB7pYeOhLj5TDk,1083
|
|
8
|
-
folio_data_import-0.2.8rc8.dist-info/METADATA,sha256=Nl8YmVukI1et0xX3ObggSQTbcgjNuYbY3E8GrvG_X8M,6112
|
|
9
|
-
folio_data_import-0.2.8rc8.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
|
|
10
|
-
folio_data_import-0.2.8rc8.dist-info/entry_points.txt,sha256=498SxWVXeEMRNw3PUf-eoReZvKewmYwPBtZhIUPr_Jg,192
|
|
11
|
-
folio_data_import-0.2.8rc8.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
{folio_data_import-0.2.8rc8.dist-info → folio_data_import-0.2.8rc10.dist-info}/entry_points.txt
RENAMED
|
File without changes
|