folio-data-import 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- folio_data_import/MARCDataImport.py +106 -58
- folio_data_import/UserImport.py +34 -13
- folio_data_import/custom_exceptions.py +12 -0
- folio_data_import/marc_preprocessors/_preprocessors.py +7 -3
- {folio_data_import-0.3.0.dist-info → folio_data_import-0.3.2.dist-info}/METADATA +1 -1
- folio_data_import-0.3.2.dist-info/RECORD +12 -0
- folio_data_import-0.3.0.dist-info/RECORD +0 -12
- {folio_data_import-0.3.0.dist-info → folio_data_import-0.3.2.dist-info}/LICENSE +0 -0
- {folio_data_import-0.3.0.dist-info → folio_data_import-0.3.2.dist-info}/WHEEL +0 -0
- {folio_data_import-0.3.0.dist-info → folio_data_import-0.3.2.dist-info}/entry_points.txt +0 -0
folio_data_import/MARCDataImport.py
CHANGED

@@ -15,7 +15,7 @@ from functools import cached_property
 from getpass import getpass
 from pathlib import Path
 from time import sleep
-from typing import
+from typing import BinaryIO, Callable, Dict, List, Union
 
 import folioclient
 import httpx

@@ -25,7 +25,7 @@ import tabulate
 from humps import decamelize
 from tqdm import tqdm
 
-from folio_data_import.custom_exceptions import FolioDataImportBatchError
+from folio_data_import.custom_exceptions import FolioDataImportBatchError, FolioDataImportJobError
 from folio_data_import.marc_preprocessors._preprocessors import MARCPreprocessor
 
 try:
@@ -38,21 +38,25 @@ except AttributeError:
 REPORT_SUMMARY_ORDERING = {"created": 0, "updated": 1, "discarded": 2, "error": 3}
 
 # Set default timeout and backoff values for HTTP requests when retrying job status and final summary checks
-RETRY_TIMEOUT_START =
-RETRY_TIMEOUT_RETRY_FACTOR =
+RETRY_TIMEOUT_START = 5
+RETRY_TIMEOUT_RETRY_FACTOR = 1.5
+RETRY_TIMEOUT_MAX = 25.32
 
 # Custom log level for data issues, set to 26
 DATA_ISSUE_LVL_NUM = 26
 logging.addLevelName(DATA_ISSUE_LVL_NUM, "DATA_ISSUES")
 
+
 def data_issues(self, msg, *args, **kws):
     if self.isEnabledFor(DATA_ISSUE_LVL_NUM):
         self._log(DATA_ISSUE_LVL_NUM, msg, args, **kws)
 
+
 logging.Logger.data_issues = data_issues
 
 logger = logging.getLogger(__name__)
 
+
 class MARCImportJob:
     """
     Class to manage importing MARC data (Bib, Authority) into FOLIO using the Change Manager
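The hunk above also registers the custom DATA_ISSUES level (26, between INFO and WARNING) and attaches a `data_issues` helper to every `Logger`. A minimal standalone sketch of the same pattern and how a caller would use it:

```python
import logging

DATA_ISSUE_LVL_NUM = 26  # between INFO (20) and WARNING (30)
logging.addLevelName(DATA_ISSUE_LVL_NUM, "DATA_ISSUES")


def data_issues(self, msg, *args, **kws):
    # Emit only when the custom level is enabled for this logger
    if self.isEnabledFor(DATA_ISSUE_LVL_NUM):
        self._log(DATA_ISSUE_LVL_NUM, msg, args, **kws)


logging.Logger.data_issues = data_issues

logging.basicConfig(level=DATA_ISSUE_LVL_NUM)
logger = logging.getLogger("marc_import")
logger.data_issues("DATA ISSUE\t%s\t%s\t%s", "record-001", "empty field", "245")
```

The tab-separated message format mirrors the `"DATA ISSUE\t%s\t%s\t%s"` messages emitted by the MARC preprocessors later in this diff.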
@@ -82,8 +86,10 @@ class MARCImportJob:
     finished: bool = False
     job_id: str = ""
     job_hrid: int = 0
-    current_file: Union[List[Path],List[io.BytesIO]] = []
+    current_file: Union[List[Path], List[io.BytesIO]] = []
     _max_summary_retries: int = 2
+    _max_job_retries: int = 2
+    _job_retries: int = 0
     _summary_retries: int = 0
 
     def __init__(

@@ -93,8 +99,8 @@ class MARCImportJob:
         import_profile_name: str,
         batch_size=10,
         batch_delay=0,
-        marc_record_preprocessor: Union[List[Callable], str]=[],
-        preprocessor_args: Dict[str,Dict]={},
+        marc_record_preprocessor: Union[List[Callable], str] = [],
+        preprocessor_args: Dict[str, Dict] = {},
         no_progress=False,
         let_summary_fail=False,
         split_files=False,

@@ -111,8 +117,10 @@ class MARCImportJob:
         self.import_profile_name = import_profile_name
         self.batch_size = batch_size
         self.batch_delay = batch_delay
-        self.current_retry_timeout =
-        self.marc_record_preprocessor: MARCPreprocessor = MARCPreprocessor(
+        self.current_retry_timeout = 0
+        self.marc_record_preprocessor: MARCPreprocessor = MARCPreprocessor(
+            marc_record_preprocessor, **preprocessor_args
+        )
 
     async def do_work(self) -> None:
         """

@@ -161,10 +169,14 @@ class MARCImportJob:
             for file in self.import_files:
                 with open(file, "rb") as f:
                     file_length = await self.read_total_records([f])
-                expected_batches = math.ceil(file_length /self.split_size)
-                logger.info(
+                expected_batches = math.ceil(file_length / self.split_size)
+                logger.info(
+                    f"{file.name} contains {file_length} records. Splitting into {expected_batches} {self.split_size} record batches."
+                )
                 zero_pad_parts = len(str(expected_batches)) if expected_batches > 1 else 2
-                for idx, batch in enumerate(
+                for idx, batch in enumerate(
+                    self.split_marc_file(file, self.split_size), start=1
+                ):
                     if idx > self.split_offset:
                         batch.name = f"{file.name} (Part {idx:0{zero_pad_parts}})"
                         self.current_file = [batch]
@@ -212,22 +224,29 @@ class MARCImportJob:
                 timeout=self.current_retry_timeout,
                 verify=self.folio_client.ssl_verify,
             ) as temp_client:
+                self.folio_client.httpx_client = temp_client
                 job_status = self.folio_client.folio_get(
                     "/metadata-provider/jobExecutions?statusNot=DISCARDED&uiStatusAny"
                     "=PREPARING_FOR_PREVIEW&uiStatusAny=READY_FOR_PREVIEW&uiStatusAny=RUNNING&limit=50"
                 )
             self.current_retry_timeout = None
         except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
-            if
-
-
+            error_text = e.response.text if hasattr(e, "response") else str(e)
+            if self.current_retry_timeout <= RETRY_TIMEOUT_MAX and (
+                not hasattr(e, "response") or e.response.status_code in [502, 504, 401]
+            ):
+                logger.warning(
+                    f"SERVER ERROR fetching job status: {error_text}. Retrying."
+                )
                 sleep(0.25)
-
-
-
-
-
-
+                return await self.get_job_status()
+            elif self.current_retry_timeout > RETRY_TIMEOUT_MAX and (
+                not hasattr(e, "response") or e.response.status_code in [502, 504, 401]
+            ):
+                logger.critical(
+                    f"SERVER ERROR fetching job status: {error_text}. Max retries exceeded."
+                )
+                raise FolioDataImportJobError(self.job_id, error_text, e)
             else:
                 raise e
         except Exception as e:
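The new branches above keep retrying `get_job_status()` while `current_retry_timeout` is at or below `RETRY_TIMEOUT_MAX`, and raise the new `FolioDataImportJobError` once it passes the cap. The exact timeout-growth step is not visible in this hunk, so the sketch below only illustrates how constants like these typically drive a capped, factor-based backoff; `next_timeout` is a hypothetical helper, not part of the package:

```python
from typing import Optional

RETRY_TIMEOUT_START = 5
RETRY_TIMEOUT_RETRY_FACTOR = 1.5
RETRY_TIMEOUT_MAX = 25.32


def next_timeout(current: Optional[float]) -> float:
    """Hypothetical helper: grow the per-request timeout until it reaches the cap."""
    if not current:
        return RETRY_TIMEOUT_START
    return min(current * RETRY_TIMEOUT_RETRY_FACTOR, RETRY_TIMEOUT_MAX)


timeout = None
for attempt in range(1, 7):
    timeout = next_timeout(timeout)
    print(f"attempt {attempt}: timeout={timeout:g}s")
# attempt 1: 5s, 2: 7.5s, 3: 11.25s, 4: 16.875s, 5: 25.3125s, 6: 25.32s (capped)
```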
@@ -240,19 +259,29 @@ class MARCImportJob:
             self.pbar_imported.update(status["progress"]["current"] - self.last_current)
             self.last_current = status["progress"]["current"]
         except (IndexError, ValueError, KeyError):
-            logger.debug(
+            logger.debug(
+                f"No active job found with ID {self.job_id}. Checking for finished job."
+            )
             try:
                 job_status = self.folio_client.folio_get(
                     "/metadata-provider/jobExecutions?limit=100&sortBy=completed_date%2Cdesc&statusAny"
                     "=COMMITTED&statusAny=ERROR&statusAny=CANCELLED"
                 )
                 status = [
-                    job
+                    job
+                    for job in job_status["jobExecutions"]
+                    if job["id"] == self.job_id
                 ][0]
-                self.pbar_imported.update(
+                self.pbar_imported.update(
+                    status["progress"]["current"] - self.last_current
+                )
                 self.last_current = status["progress"]["current"]
                 self.finished = True
-            except (
+            except (
+                httpx.ConnectTimeout,
+                httpx.ReadTimeout,
+                httpx.HTTPStatusError,
+            ) as e:
                 if not hasattr(e, "response") or e.response.status_code in [502, 504]:
                     error_text = e.response.text if hasattr(e, "response") else str(e)
                     logger.warning(

@@ -342,7 +371,7 @@ class MARCImportJob:
         )
         try:
             set_job_profile.raise_for_status()
-            self.job_hrid = set_job_profile.json()[
+            self.job_hrid = set_job_profile.json()["hrId"]
             logger.info(f"Job HRID: {self.job_hrid}")
         except httpx.HTTPError as e:
             logger.error(

@@ -410,9 +439,7 @@ class MARCImportJob:
                 for record in self.record_batch:
                     self.failed_batches_file.write(record)
                 raise FolioDataImportBatchError(
-                    batch_payload[
-                    f"{e}\n{e.response.text}",
-                    e
+                    batch_payload["id"], f"{e}\n{e.response.text}", e
                 )
             await self.get_job_status()
             sleep(self.batch_delay)

@@ -472,12 +499,12 @@ class MARCImportJob:
     def move_file_to_complete(self, file_path: Path):
         import_complete_path = file_path.parent.joinpath("import_complete")
         if not import_complete_path.exists():
-            logger.debug(
+            logger.debug(
+                f"Creating import_complete directory: {import_complete_path.absolute()}"
+            )
             import_complete_path.mkdir(exist_ok=True)
         logger.debug(f"Moving {file_path} to {import_complete_path.absolute()}")
-        file_path.rename(
-            file_path.parent.joinpath("import_complete", file_path.name)
-        )
+        file_path.rename(file_path.parent.joinpath("import_complete", file_path.name))
 
     async def create_batch_payload(self, counter, total_records, is_last) -> dict:
         """
@@ -521,11 +548,15 @@ class MARCImportJob:
 
                 record_body = f.read(record_length - 24)
                 if len(record_body) != record_length - 24:
-                    raise ValueError(
+                    raise ValueError(
+                        "Unexpected end of file while reading MARC record."
+                    )
 
                 # Verify record terminator
-                if record_body[-1:] != b
-                    raise ValueError(
+                if record_body[-1:] != b"\x1d":
+                    raise ValueError(
+                        "MARC record does not end with the expected terminator (0x1D)."
+                    )
 
                 # Write the full record to the batch buffer
                 batch.write(leader + record_body)
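For context on the checks above: in binary MARC21, the first five bytes of the 24-byte leader carry the total record length, and every record ends with the record terminator byte 0x1D. A self-contained sketch of iterating over records in a `.mrc` file using those two facts (illustrative only, not the package's own reader):

```python
import io
from typing import Iterator


def iter_marc_records(fh: io.BufferedIOBase) -> Iterator[bytes]:
    """Yield raw MARC21 records (leader + body) from a binary file handle."""
    while True:
        leader = fh.read(24)
        if not leader:
            break  # clean end of file
        if len(leader) < 24:
            raise ValueError("Truncated MARC leader.")
        record_length = int(leader[:5])  # leader positions 0-4 hold the record length
        record_body = fh.read(record_length - 24)
        if len(record_body) != record_length - 24:
            raise ValueError("Unexpected end of file while reading MARC record.")
        if record_body[-1:] != b"\x1d":
            raise ValueError("MARC record does not end with the expected terminator (0x1D).")
        yield leader + record_body


# with open("records.mrc", "rb") as fh:
#     for raw_record in iter_marc_records(fh):
#         print(len(raw_record))
```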
@@ -566,12 +597,11 @@ class MARCImportJob:
         try:
             if isinstance(self.current_file[0], Path):
                 files = [
-                    stack.enter_context(open(file, "rb"))
+                    stack.enter_context(open(file, "rb"))
+                    for file in self.current_file
                 ]
             elif isinstance(self.current_file[0], io.BytesIO):
-                files = [
-                    stack.enter_context(file) for file in self.current_file
-                ]
+                files = [stack.enter_context(file) for file in self.current_file]
             else:
                 raise ValueError("Invalid file type. Must be Path or BytesIO.")
         except IndexError as e:

@@ -598,13 +628,26 @@ class MARCImportJob:
                 await self.process_records(files, total_records)
                 while not self.finished:
                     await self.get_job_status()
-                    sleep(
+                    await asyncio.sleep(5)
         except FolioDataImportBatchError as e:
             logger.error(
                 f"Unhandled error posting batch {e.batch_id}: {e.message}"
             )
             await self.cancel_job()
             raise e
+        except FolioDataImportJobError as e:
+            await self.cancel_job()
+            if self._job_retries < self._max_job_retries:
+                self._job_retries += 1
+                logger.error(
+                    f"Unhandled error processing job {e.job_id}: {e.message}, cancelling and retrying."
+                )
+                await self.import_marc_file()
+            else:
+                logger.critical(
+                    f"Unhandled error processing job {e.job_id}: {e.message}, cancelling and exiting (maximum retries reached)."
+                )
+                raise e
         if self.finished:
             await self.log_job_summary()
             self.last_current = 0

@@ -629,7 +672,9 @@ class MARCImportJob:
             self.finished = True
             logger.info(f"Cancelled job: {self.job_id}")
         except (httpx.ConnectTimeout, httpx.ReadTimeout):
-            logger.warning(
+            logger.warning(
+                f"CONNECTION ERROR cancelling job {self.job_id}. Retrying..."
+            )
             sleep(0.25)
             await self.cancel_job()
 
@@ -651,22 +696,22 @@ class MARCImportJob:
                 table_data.append(table_row)
             table_data.sort(key=lambda x: REPORT_SUMMARY_ORDERING.get(x[0], 99))
             columns = columns[:1] + [
-
-
+                " ".join(decamelize(x).split("_")[:-1]) for x in columns[1:]
+            ]
             logger.info(
-
-
-
+                f"Results for {'file' if len(self.current_file) == 1 else 'files'}: "
+                f"{', '.join([os.path.basename(x.name) for x in self.current_file])}"
+            )
             logger.info(
-
-
-
-                ),
-            )
+                "\n"
+                + tabulate.tabulate(table_data, headers=columns, tablefmt="fancy_grid"),
+            )
             if total_errors:
                 logger.info(f"Total errors: {total_errors}. Job ID: {job_id}.")
         else:
-            logger.error(
+            logger.error(
+                f"No job summary available for job #{self.job_hrid}({self.job_id})."
+            )
 
     async def get_job_summary(self) -> dict:
         """
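The summary logging above converts camelCase counter names from the job execution summary into plain-English headers with `humps.decamelize` and renders the table with `tabulate`. A small illustration of that header transformation and output, using made-up counter names and numbers:

```python
import tabulate
from humps import decamelize

# Made-up counters in the camelCase style of a FOLIO job summary
columns = ["", "totalCreatedEntities", "totalUpdatedEntities", "totalDiscardedEntities"]
table_data = [
    ["created", 95, 0, 0],
    ["updated", 0, 3, 0],
    ["discarded", 0, 0, 2],
]

# "totalCreatedEntities" -> "total_created_entities" -> "total created"
headers = columns[:1] + [" ".join(decamelize(x).split("_")[:-1]) for x in columns[1:]]
print(tabulate.tabulate(table_data, headers=headers, tablefmt="fancy_grid"))
```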
@@ -691,8 +736,10 @@ class MARCImportJob:
             self.current_retry_timeout = None
         except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
             error_text = e.response.text if hasattr(e, "response") else str(e)
-            if (self._max_summary_retries > self._summary_retries) and (
-                hasattr(e, "response")
+            if (self._max_summary_retries > self._summary_retries) and (
+                not hasattr(e, "response")
+                or (hasattr(e, "response") and e.response.status_code in [502, 504])
+                and not self.let_summary_fail
             ):
                 logger.warning(f"SERVER ERROR fetching job summary: {e}. Retrying.")
                 sleep(0.25)

@@ -703,8 +750,9 @@ class MARCImportJob:
                     self.folio_client.httpx_client = temp_client
                     self._summary_retries += 1
                     return await self.get_job_summary()
-            elif (self._summary_retries >= self._max_summary_retries) or (
-                e
+            elif (self._summary_retries >= self._max_summary_retries) or (
+                hasattr(e, "response")
+                and (e.response.status_code in [502, 504] and self.let_summary_fail)
             ):
                 logger.warning(
                     f"SERVER ERROR fetching job summary: {error_text}. Skipping final summary check."

@@ -809,7 +857,7 @@ async def main() -> None:
             "to apply to each MARC record before sending to FOLIO. Function should take "
             "a pymarc.Record object as input and return a pymarc.Record object."
         ),
-        default=
+        default="",
     )
 
     parser.add_argument(
folio_data_import/UserImport.py
CHANGED
@@ -8,7 +8,7 @@ import time
 import uuid
 from datetime import datetime as dt
 from pathlib import Path
-from typing import Tuple
+from typing import Tuple, List
 
 import aiofiles
 import folioclient

@@ -51,6 +51,7 @@ class UserImporter: # noqa: R0902
         user_match_key: str = "externalSystemId",
         only_update_present_fields: bool = False,
         default_preferred_contact_type: str = "002",
+        fields_to_protect: List[str] =[],
     ) -> None:
         self.limit_simultaneous_requests = limit_simultaneous_requests
         self.batch_size = batch_size

@@ -77,6 +78,7 @@ class UserImporter: # noqa: R0902
         self.match_key = user_match_key
         self.lock: asyncio.Lock = asyncio.Lock()
         self.logs: dict = {"created": 0, "updated": 0, "failed": 0}
+        self.fields_to_protect = set(fields_to_protect)
 
     @staticmethod
     def build_ref_data_id_map(

@@ -334,6 +336,7 @@ class UserImporter: # noqa: R0902
             None
 
         """
+
         await self.set_preferred_contact_type(user_obj, existing_user)
         preferred_contact_type = {"preferredContactTypeId": existing_user.get("personal", {}).pop("preferredContactTypeId")}
         if self.only_update_present_fields:

@@ -502,7 +505,9 @@ class UserImporter: # noqa: R0902
 
     async def get_protected_fields(self, existing_user) -> dict:
         """
-        Retrieves the protected fields from the existing user object
+        Retrieves the protected fields from the existing user object,
+        combining both the customFields.protectedFields list *and*
+        any fields_to_protect passed on the CLI.
 
         Args:
             existing_user (dict): The existing user object.
@@ -512,18 +517,19 @@ class UserImporter: # noqa: R0902
         """
         protected_fields = {}
         protected_fields_list = existing_user.get("customFields", {}).get("protectedFields", "").split(",")
-
-
-
-
-
-
-
-
+        cli_fields = list(self.fields_to_protect)
+        # combine and dedupe:
+        all_fields = list(dict.fromkeys(protected_fields_list + cli_fields))
+        for field in all_fields:
+            if "." in field:
+                fld, subfld = field.split(".", 1)
+                val = existing_user.get(fld, {}).pop(subfld, None)
+                if val is not None:
+                    protected_fields.setdefault(fld, {})[subfld] = val
             else:
-
-            if
-            protected_fields
+                val = existing_user.pop(field, None)
+                if val is not None:
+                    protected_fields[field] = val
         return protected_fields
 
     async def process_existing_user(self, user_obj) -> Tuple[dict, dict, dict, dict]:
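The rewritten body pops each protected field out of the existing user record so later merging cannot overwrite it, collecting the values in a separate dict and supporting one level of dot-notation nesting. A standalone sketch of that pop-and-collect behaviour on a sample user dict (not the package's method itself):

```python
from typing import Dict, List


def collect_protected(existing_user: Dict, fields: List[str]) -> Dict:
    """Sketch of the pop-and-collect logic shown in the hunk above."""
    protected: Dict = {}
    for field in fields:
        if "." in field:
            fld, subfld = field.split(".", 1)
            val = existing_user.get(fld, {}).pop(subfld, None)
            if val is not None:
                protected.setdefault(fld, {})[subfld] = val
        else:
            val = existing_user.pop(field, None)
            if val is not None:
                protected[field] = val
    return protected


user = {
    "type": "patron",
    "expirationDate": "2026-06-30",
    "personal": {"email": "pat@example.edu", "lastName": "Smith"},
}
print(collect_protected(user, ["type", "personal.email"]))
# {'type': 'patron', 'personal': {'email': 'pat@example.edu'}}
print(user)
# {'expirationDate': '2026-06-30', 'personal': {'lastName': 'Smith'}}
```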
@@ -896,6 +902,7 @@ async def main() -> None:
         --update_only_present_fields (bool): Only update fields that are present in the new user object.
         --default_preferred_contact_type (str): The default preferred contact type to use if the provided \
             value is not valid or not present. Default "002".
+        --fields_to_protect (str): Comma-separated list of top-level or nested (dot-notation) fields to protect.
 
     Raises:
         Exception: If an unknown error occurs during the import process.

@@ -953,7 +960,20 @@ async def main() -> None:
         choices=list(PREFERRED_CONTACT_TYPES_MAP.keys()) + list(PREFERRED_CONTACT_TYPES_MAP.values()),
         default="002",
     )
+    parser.add_argument(
+        "--fields-to-protect",  # new flag name
+        dest="fields_to_protect",  # sets args.fields_to_protect
+        help=(
+            "Comma-separated list of top-level user fields to protect "
+            "(e.g. type,expirationDate)"
+        ),
+        default="",
+    )
     args = parser.parse_args()
+    protect_fields = [
+        f.strip() for f in args.fields_to_protect.split(",")
+        if f.strip()
+    ]
 
     library_name = args.library_name
 
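The new `--fields-to-protect` option takes one comma-separated string; after `parse_args()` it is split, stripped, and filtered so stray whitespace and trailing commas are ignored. For example:

```python
raw = " type, expirationDate, personal.email ,"
protect_fields = [f.strip() for f in raw.split(",") if f.strip()]
print(protect_fields)
# ['type', 'expirationDate', 'personal.email']
```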
@@ -1005,6 +1025,7 @@ async def main() -> None:
             args.user_match_key,
             args.update_only_present_fields,
             args.default_preferred_contact_type,
+            fields_to_protect=protect_fields,
         )
         await importer.do_import()
     except Exception as ee:
folio_data_import/custom_exceptions.py
CHANGED

@@ -15,3 +15,15 @@ class FolioDataImportBatchError(FolioDataImportError):
         self.batch_id = batch_id
         self.message = message
         super().__init__(f"Unhandled error posting batch {batch_id}: {message}")
+
+
+class FolioDataImportJobError(FolioDataImportError):
+    """Exception raised for errors in the Folio Data Import job process.
+
+    Attributes:
+        job_id -- ID of the job that caused the error
+        message -- explanation of the error
+    """
+    def __init__(self, job_id, message, exception=None):
+        self.job_id = job_id
+        self.message = message
+        super().__init__(f"Unhandled error processing job {job_id}: {message}")
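The new `FolioDataImportJobError` mirrors the existing batch error: it carries the job ID and message, is raised in `MARCDataImport` once job-status retries are exhausted, and is caught in the import loop to decide between retrying the whole job and giving up. A minimal sketch of that raise/catch shape, with `run_job` as a hypothetical stand-in for the real workflow:

```python
from folio_data_import.custom_exceptions import FolioDataImportJobError

MAX_JOB_RETRIES = 2


def run_job(job_id: str) -> None:
    """Hypothetical stand-in for posting batches and polling job status."""
    raise FolioDataImportJobError(job_id, "502 Bad Gateway while polling job status")


retries = 0
while True:
    try:
        run_job("example-job-id")
        break
    except FolioDataImportJobError as e:
        if retries < MAX_JOB_RETRIES:
            retries += 1
            print(f"Retrying job {e.job_id} ({retries}/{MAX_JOB_RETRIES}): {e.message}")
        else:
            print(f"Giving up on job {e.job_id}: {e.message}")
            break
```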
folio_data_import/marc_preprocessors/_preprocessors.py
CHANGED

@@ -3,6 +3,7 @@ import sys
 from typing import Callable, Dict, List, Tuple, Union
 import pymarc
 import logging
+import re
 
 from pymarc.record import Record
 

@@ -47,6 +48,9 @@ class MARCPreprocessor:
         """
         Get the preprocessor functions based on the provided names.
 
+        Args:
+            func_list (Union[str, List[Callable]]): A string of comma-separated function names or a list of callable preprocessor functions.
+
         Returns:
             List[callable]: A list of preprocessor functions.
         """

@@ -358,7 +362,7 @@ def clean_empty_fields(record: Record, **kwargs) -> Record:
 
     for field in record.get_fields(*MAPPED_FIELDS.keys()):
         len_subs = len(field.subfields)
-        subfield_value = bool(field.subfields[0].value) if len_subs else False
+        subfield_value = bool(re.sub(r"[.,-]", "", field.subfields[0].value).strip()) if len_subs else False
         if int(field.tag) > 9 and len_subs == 0:
             logger.log(
                 26,
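With the changed line, a subfield whose value is nothing but periods, commas, hyphens, or whitespace is now treated as empty, not just a literally empty string. The expression in isolation:

```python
import re


def has_real_value(value: str) -> bool:
    # Strip periods, commas, and hyphens, then whitespace; an empty result means "no real value"
    return bool(re.sub(r"[.,-]", "", value).strip())


print(has_real_value("Smith, John"))  # True
print(has_real_value(".-,"))          # False (punctuation only)
print(has_real_value("   "))          # False (whitespace only)
print(has_real_value(""))             # False
```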
@@ -460,12 +464,12 @@ def move_authority_subfield_9_to_0_all_controllable_fields(record: Record, **kwa
         "100", "110", "111", "130",
         "600", "610", "611", "630", "650", "651", "655",
         "700", "710", "711", "730",
-        "800", "810", "811", "830"
+        "800", "810", "811", "830", "880"
     ]
     for field in record.get_fields(*controlled_fields):
         for subfield in list(field.get_subfields("9")):
             field.add_subfield("0", subfield)
-            field.delete_subfield("9"
+            field.delete_subfield("9")
             logger.log(
                 26,
                 "DATA ISSUE\t%s\t%s\t%s",
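This preprocessor copies each authority-control `$9` subfield into `$0` and removes the `$9`, and now also covers linked `880` fields. A small pymarc demonstration of the same subfield move on one illustrative field (pymarc 5.x `Subfield` objects, matching the `.value` access used elsewhere in this module; the identifier value is made up):

```python
from pymarc import Field, Record, Subfield

record = Record()
record.add_field(
    Field(
        tag="650",
        indicators=[" ", "0"],
        subfields=[
            Subfield(code="a", value="Libraries"),
            Subfield(code="9", value="(example)sh85076502"),  # illustrative identifier
        ],
    )
)

for field in record.get_fields("650"):
    for value in list(field.get_subfields("9")):
        field.add_subfield("0", value)
        field.delete_subfield("9")

print(record["650"])
# =650  \0$aLibraries$0(example)sh85076502
```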
folio_data_import-0.3.2.dist-info/RECORD
ADDED

@@ -0,0 +1,12 @@
+folio_data_import/MARCDataImport.py,sha256=Qfpc3NtlQe4VUobXsFhmSnScBhiNRGEzBN7FZrdsBYc,37633
+folio_data_import/UserImport.py,sha256=4Bn_Z1xX5DvnodscW9NgGeBZgvFeShVoIbxDGrt6BMo,41748
+folio_data_import/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+folio_data_import/__main__.py,sha256=kav_uUsnrIjGjVxQkk3exLKrc1mah9t2x3G6bGS-5I0,3710
+folio_data_import/custom_exceptions.py,sha256=1xw1BI5fW7WDd37zUIOcw0DAvrFKtklnqmbRhZXSAiE,1093
+folio_data_import/marc_preprocessors/__init__.py,sha256=urExfNTQoZsDCtDPcUY9EEC5OFcUihxhYEQkQFVzbMY,30
+folio_data_import/marc_preprocessors/_preprocessors.py,sha256=HnW7hw7DFTxyiUxYtl-8v0liW4FDi0y5SvI2ZwhdWPU,16570
+folio_data_import-0.3.2.dist-info/LICENSE,sha256=qJX7wxMC7ky9Kq4v3zij8MjGEiC5wsB7pYeOhLj5TDk,1083
+folio_data_import-0.3.2.dist-info/METADATA,sha256=r8J3ZZm833GcuaCAAj-mBZ2bVPyq4Xa6rPgqSbgVBkE,6069
+folio_data_import-0.3.2.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+folio_data_import-0.3.2.dist-info/entry_points.txt,sha256=498SxWVXeEMRNw3PUf-eoReZvKewmYwPBtZhIUPr_Jg,192
+folio_data_import-0.3.2.dist-info/RECORD,,

folio_data_import-0.3.0.dist-info/RECORD
REMOVED

@@ -1,12 +0,0 @@
-folio_data_import/MARCDataImport.py,sha256=je3TdCdaDR-gYA3Gh1k4AX9l3v83sCTt4Y9lOFxayu8,36220
-folio_data_import/UserImport.py,sha256=ZulGaGJhI_N5vmR69YF_qbzbGeVyzcthXklSjDpZCyA,40998
-folio_data_import/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-folio_data_import/__main__.py,sha256=kav_uUsnrIjGjVxQkk3exLKrc1mah9t2x3G6bGS-5I0,3710
-folio_data_import/custom_exceptions.py,sha256=xOeIbM86d2r5-z3ul4JFTJLT3vI3kwmEq62cWS-9dOc,646
-folio_data_import/marc_preprocessors/__init__.py,sha256=urExfNTQoZsDCtDPcUY9EEC5OFcUihxhYEQkQFVzbMY,30
-folio_data_import/marc_preprocessors/_preprocessors.py,sha256=4Zrp-9LdL7f5QqUTOjyMkK5IaHP2YOkmkqoY_4o585Q,16377
-folio_data_import-0.3.0.dist-info/LICENSE,sha256=qJX7wxMC7ky9Kq4v3zij8MjGEiC5wsB7pYeOhLj5TDk,1083
-folio_data_import-0.3.0.dist-info/METADATA,sha256=Aqf0PXhdwFyChMKvl9cOluKN60IyMAUPDKSpb8AOlXI,6069
-folio_data_import-0.3.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-folio_data_import-0.3.0.dist-info/entry_points.txt,sha256=498SxWVXeEMRNw3PUf-eoReZvKewmYwPBtZhIUPr_Jg,192
-folio_data_import-0.3.0.dist-info/RECORD,,

{folio_data_import-0.3.0.dist-info → folio_data_import-0.3.2.dist-info}/LICENSE: File without changes
{folio_data_import-0.3.0.dist-info → folio_data_import-0.3.2.dist-info}/WHEEL: File without changes
{folio_data_import-0.3.0.dist-info → folio_data_import-0.3.2.dist-info}/entry_points.txt: File without changes