folio-data-import 0.2.8rc7__py3-none-any.whl → 0.2.8rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of folio-data-import might be problematic.
- folio_data_import/MARCDataImport.py +319 -121
- folio_data_import/marc_preprocessors/__init__.py +1 -1
- folio_data_import/marc_preprocessors/_preprocessors.py +261 -12
- {folio_data_import-0.2.8rc7.dist-info → folio_data_import-0.2.8rc9.dist-info}/METADATA +3 -3
- folio_data_import-0.2.8rc9.dist-info/RECORD +11 -0
- folio_data_import-0.2.8rc7.dist-info/RECORD +0 -11
- {folio_data_import-0.2.8rc7.dist-info → folio_data_import-0.2.8rc9.dist-info}/LICENSE +0 -0
- {folio_data_import-0.2.8rc7.dist-info → folio_data_import-0.2.8rc9.dist-info}/WHEEL +0 -0
- {folio_data_import-0.2.8rc7.dist-info → folio_data_import-0.2.8rc9.dist-info}/entry_points.txt +0 -0
folio_data_import/MARCDataImport.py
@@ -1,18 +1,20 @@
 import argparse
 import asyncio
+import datetime
 import glob
 import importlib
 import io
+import logging
 import os
 import sys
-from typing import List
 import uuid
 from contextlib import ExitStack
-import datetime
 from datetime import datetime as dt
+from functools import cached_property
 from getpass import getpass
 from pathlib import Path
 from time import sleep
+from typing import List
 
 import folioclient
 import httpx
@@ -21,8 +23,6 @@ import pymarc
 import tabulate
 from humps import decamelize
 from tqdm import tqdm
-from zmq import has
-
 
 try:
     datetime_utc = datetime.UTC
@@ -37,6 +37,18 @@ REPORT_SUMMARY_ORDERING = {"created": 0, "updated": 1, "discarded": 2, "error":
 RETRY_TIMEOUT_START = 1
 RETRY_TIMEOUT_RETRY_FACTOR = 2
 
+# Custom log level for data issues, set to 26
+DATA_ISSUE_LVL_NUM = 26
+logging.addLevelName(DATA_ISSUE_LVL_NUM, "DATA_ISSUES")
+
+def data_issues(self, msg, *args, **kws):
+    if self.isEnabledFor(DATA_ISSUE_LVL_NUM):
+        self._log(DATA_ISSUE_LVL_NUM, msg, args, **kws)
+
+logging.Logger.data_issues = data_issues
+
+logger = logging.getLogger(__name__)
+
 class MARCImportJob:
     """
     Class to manage importing MARC data (Bib, Authority) into FOLIO using the Change Manager
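The hunk above registers a custom DATA_ISSUES level (26) and patches a `data_issues` convenience method onto `logging.Logger`. A minimal sketch of how messages at that level can be split from normal output, assuming a hypothetical handler file name (the real handler wiring appears in `set_up_cli_logging`, later in this diff):

```python
import logging

DATA_ISSUE_LVL_NUM = 26  # same value the diff registers as "DATA_ISSUES"
logging.addLevelName(DATA_ISSUE_LVL_NUM, "DATA_ISSUES")

log = logging.getLogger("folio_data_import.MARCDataImport")
log.setLevel(logging.INFO)

# Route only DATA_ISSUES records to their own file; "data_issues.log" is a placeholder name.
issues_handler = logging.FileHandler("data_issues.log")
issues_handler.addFilter(lambda record: record.levelno == DATA_ISSUE_LVL_NUM)
log.addHandler(issues_handler)

# Equivalent to calling the patched log.data_issues(...) helper added above.
log.log(DATA_ISSUE_LVL_NUM, "RECORD FAILED\t%s\t%s\t%s", "file.mrc:12", "reason", "")
```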
@@ -57,7 +69,6 @@ class MARCImportJob:
     bad_records_file: io.TextIOWrapper
     failed_batches_file: io.TextIOWrapper
     job_id: str
-    job_import_profile: dict
     pbar_sent: tqdm
     pbar_imported: tqdm
     http_client: httpx.Client
@@ -78,9 +89,11 @@ class MARCImportJob:
         marc_record_preprocessor=None,
         consolidate=False,
         no_progress=False,
+        let_summary_fail=False,
     ) -> None:
         self.consolidate_files = consolidate
         self.no_progress = no_progress
+        self.let_summary_fail = let_summary_fail
         self.folio_client: folioclient.FolioClient = folio_client
         self.import_files = marc_files
         self.import_profile_name = import_profile_name
@@ -88,6 +101,10 @@ class MARCImportJob:
         self.batch_delay = batch_delay
         self.current_retry_timeout = None
         self.marc_record_preprocessor = marc_record_preprocessor
+        self.pbar_sent: tqdm
+        self.pbar_imported: tqdm
+        self._max_summary_retries: int = 2
+        self._summary_retries: int = 0
 
     async def do_work(self) -> None:
         """
@@ -101,21 +118,25 @@ class MARCImportJob:
         Returns:
             None
         """
-        with
-
-
-
-
-
-
-
-
-
-
+        with (
+            httpx.Client() as http_client,
+            open(
+                self.import_files[0].parent.joinpath(
+                    f"bad_marc_records_{dt.now(tz=datetime_utc).strftime('%Y%m%d%H%M%S')}.mrc"
+                ),
+                "wb+",
+            ) as bad_marc_file,
+            open(
+                self.import_files[0].parent.joinpath(
+                    f"failed_batches_{dt.now(tz=datetime_utc).strftime('%Y%m%d%H%M%S')}.mrc"
+                ),
+                "wb+",
+            ) as failed_batches,
+        ):
             self.bad_records_file = bad_marc_file
-
+            logger.info(f"Writing bad records to {self.bad_records_file.name}")
             self.failed_batches_file = failed_batches
-
+            logger.info(f"Writing failed batches to {self.failed_batches_file.name}")
             self.http_client = http_client
             if self.consolidate_files:
                 self.current_file = self.import_files
@@ -136,16 +157,16 @@ class MARCImportJob:
         Returns:
             None
         """
-        self.bad_records_file.
-
-
-
-        self.failed_batches_file.
-
-
-
-
-
+        with open(self.bad_records_file.name, "rb") as bad_records:
+            if not bad_records.read(1):
+                os.remove(bad_records.name)
+                logger.info("No bad records found. Removing bad records file.")
+        with open(self.failed_batches_file.name, "rb") as failed_batches:
+            if not failed_batches.read(1):
+                os.remove(failed_batches.name)
+                logger.info("No failed batches. Removing failed batches file.")
+        logger.info("Import complete.")
+        logger.info(f"Total records imported: {self.total_records_sent}")
 
     async def get_job_status(self) -> None:
         """
@@ -159,38 +180,69 @@ class MARCImportJob:
         """
         try:
             self.current_retry_timeout = (
-                self.current_retry_timeout * RETRY_TIMEOUT_RETRY_FACTOR
-
-
-                "/metadata-provider/jobExecutions?statusNot=DISCARDED&uiStatusAny"
-                "=PREPARING_FOR_PREVIEW&uiStatusAny=READY_FOR_PREVIEW&uiStatusAny=RUNNING&limit=50"
+                (self.current_retry_timeout * RETRY_TIMEOUT_RETRY_FACTOR)
+                if self.current_retry_timeout
+                else RETRY_TIMEOUT_START
             )
-            self.current_retry_timeout = None
-        except (httpx.ConnectTimeout, httpx.ReadTimeout):
-            sleep(.25)
             with httpx.Client(
                 timeout=self.current_retry_timeout,
-                verify=self.folio_client.ssl_verify
+                verify=self.folio_client.ssl_verify,
             ) as temp_client:
-                self.folio_client.
-
+                job_status = self.folio_client.folio_get(
+                    "/metadata-provider/jobExecutions?statusNot=DISCARDED&uiStatusAny"
+                    "=PREPARING_FOR_PREVIEW&uiStatusAny=READY_FOR_PREVIEW&uiStatusAny=RUNNING&limit=50"
+                )
+            self.current_retry_timeout = None
+        except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
+            if not hasattr(e, "response") or e.response.status_code in [502, 504]:
+                error_text = e.response.text if hasattr(e, "response") else str(e)
+                logger.warning(f"SERVER ERROR fetching job status: {error_text}. Retrying.")
+                sleep(0.25)
+                with httpx.Client(
+                    timeout=self.current_retry_timeout,
+                    verify=self.folio_client.ssl_verify,
+                ) as temp_client:
+                    self.folio_client.httpx_client = temp_client
+                    return await self.get_job_status()
+            else:
+                raise e
+        except Exception as e:
+            logger.error(f"Error fetching job status. {e}")
+
         try:
             status = [
                 job for job in job_status["jobExecutions"] if job["id"] == self.job_id
             ][0]
             self.pbar_imported.update(status["progress"]["current"] - self.last_current)
             self.last_current = status["progress"]["current"]
-        except IndexError:
-
-
-
-
-
-
-
-
-
-
+        except (IndexError, ValueError, KeyError):
+            logger.debug(f"No active job found with ID {self.job_id}. Checking for finished job.")
+            try:
+                job_status = self.folio_client.folio_get(
+                    "/metadata-provider/jobExecutions?limit=100&sortBy=completed_date%2Cdesc&statusAny"
+                    "=COMMITTED&statusAny=ERROR&statusAny=CANCELLED"
+                )
+                status = [
+                    job for job in job_status["jobExecutions"] if job["id"] == self.job_id
+                ][0]
+                self.pbar_imported.update(status["progress"]["current"] - self.last_current)
+                self.last_current = status["progress"]["current"]
+                self.finished = True
+            except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
+                if not hasattr(e, "response") or e.response.status_code in [502, 504]:
+                    error_text = e.response.text if hasattr(e, "response") else str(e)
+                    logger.warning(
+                        f"SERVER ERROR fetching job status: {error_text}. Retrying."
+                    )
+                    sleep(0.25)
+                    with httpx.Client(
+                        timeout=self.current_retry_timeout,
+                        verify=self.folio_client.ssl_verify,
+                    ) as temp_client:
+                        self.folio_client.httpx_client = temp_client
+                        return await self.get_job_status()
+                else:
+                    raise e
 
     async def create_folio_import_job(self) -> None:
         """
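`get_job_status` (and `get_job_summary`, later in this diff) now seed the request timeout from `RETRY_TIMEOUT_START` and multiply it by `RETRY_TIMEOUT_RETRY_FACTOR` on each pass, so repeated 502/504 responses get progressively more generous timeouts. A small sketch of that growth, outside of any HTTP calls:

```python
from typing import Optional

RETRY_TIMEOUT_START = 1
RETRY_TIMEOUT_RETRY_FACTOR = 2

def next_timeout(current: Optional[int]) -> int:
    # None means "first attempt"; afterwards the timeout doubles: 1, 2, 4, 8, ...
    return current * RETRY_TIMEOUT_RETRY_FACTOR if current else RETRY_TIMEOUT_START

timeout = None
for attempt in range(1, 5):
    timeout = next_timeout(timeout)
    print(f"attempt {attempt}: httpx timeout = {timeout}s")
```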
@@ -202,26 +254,36 @@ class MARCImportJob:
         Raises:
             HTTPError: If there is an error creating the job.
         """
-        create_job = self.http_client.post(
-            self.folio_client.okapi_url + "/change-manager/jobExecutions",
-            headers=self.folio_client.okapi_headers,
-            json={"sourceType": "ONLINE", "userId": self.folio_client.current_user},
-        )
         try:
-            create_job.
-
-
-                "
-                + str(e)
-                + "\n"
-                + getattr(getattr(e, "response", ""), "text", "")
+            create_job = self.http_client.post(
+                self.folio_client.okapi_url + "/change-manager/jobExecutions",
+                headers=self.folio_client.okapi_headers,
+                json={"sourceType": "ONLINE", "userId": self.folio_client.current_user},
             )
-
+            create_job.raise_for_status()
+        except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
+            if not hasattr(e, "response") or e.response.status_code in [502, 504]:
+                logger.warning(f"SERVER ERROR creating job: {e}. Retrying.")
+                sleep(0.25)
+                return await self.create_folio_import_job()
+            else:
+                logger.error(
+                    "Error creating job: "
+                    + str(e)
+                    + "\n"
+                    + getattr(getattr(e, "response", ""), "text", "")
+                )
+                raise e
         self.job_id = create_job.json()["parentJobExecutionId"]
+        logger.info("Created job: " + self.job_id)
 
-
+    @cached_property
+    def import_profile(self) -> dict:
         """
-
+        Returns the import profile for the current job execution.
+
+        Returns:
+            dict: The import profile for the current job execution.
         """
         import_profiles = self.folio_client.folio_get(
             "/data-import-profiles/jobProfiles",
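The old `job_import_profile` attribute is replaced by a `functools.cached_property`, so the profile lookup runs once on first access and is then memoized on the instance. A sketch of that behaviour with a stand-in class and hypothetical profile values:

```python
from functools import cached_property

class Job:
    @cached_property
    def import_profile(self) -> dict:
        print("fetching profile from FOLIO...")  # expensive lookup happens only once
        return {"id": "fake-uuid", "name": "Example job profile"}  # hypothetical values

job = Job()
job.import_profile  # prints the message and caches the result
job.import_profile  # served from the instance cache; no second fetch
```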
@@ -233,7 +295,7 @@ class MARCImportJob:
             for profile in import_profiles
             if profile["name"] == self.import_profile_name
         ][0]
-
+        return profile
 
     async def set_job_profile(self) -> None:
         """
@@ -249,15 +311,15 @@ class MARCImportJob:
             + "/jobProfile",
             headers=self.folio_client.okapi_headers,
             json={
-                "id": self.
-                "name": self.
+                "id": self.import_profile["id"],
+                "name": self.import_profile["name"],
                 "dataType": "MARC",
             },
         )
         try:
             set_job_profile.raise_for_status()
         except httpx.HTTPError as e:
-
+            logger.error(
                 "Error creating job: "
                 + str(e)
                 + "\n"
@@ -299,8 +361,13 @@ class MARCImportJob:
                 headers=self.folio_client.okapi_headers,
                 json=batch_payload,
             )
-
-
+            # if batch_payload["recordsMetadata"]["last"]:
+            #     logger.log(
+            #         25,
+            #         f"Sending last batch of {batch_payload['recordsMetadata']['total']} records.",
+            #     )
+        except (httpx.ConnectTimeout, httpx.ReadTimeout):
+            sleep(0.25)
             return await self.process_record_batch(batch_payload)
         try:
             post_batch.raise_for_status()
@@ -308,12 +375,14 @@ class MARCImportJob:
             self.record_batch = []
             self.pbar_sent.update(len(batch_payload["initialRecords"]))
         except Exception as e:
-            if
+            if (
+                hasattr(e, "response") and e.response.status_code in [500, 422]
+            ):  # TODO: #26 Check for specific error code once https://folio-org.atlassian.net/browse/MODSOURMAN-1281 is resolved
                 self.total_records_sent += len(self.record_batch)
                 self.record_batch = []
                 self.pbar_sent.update(len(batch_payload["initialRecords"]))
             else:
-
+                logger.error("Error posting batch: " + str(e))
                 for record in self.record_batch:
                     self.failed_batches_file.write(record)
                 self.error_records += len(self.record_batch)
@@ -335,14 +404,20 @@ class MARCImportJob:
         """
         counter = 0
         for import_file in files:
+            file_path = Path(import_file.name)
            self.pbar_sent.set_description(
                f"Sent ({os.path.basename(import_file.name)}): "
            )
            reader = pymarc.MARCReader(import_file, hide_utf8_warnings=True)
-            for record in reader:
+            for idx, record in enumerate(reader, start=1):
                if len(self.record_batch) == self.batch_size:
                    await self.process_record_batch(
-                        await self.create_batch_payload(
+                        await self.create_batch_payload(
+                            counter,
+                            total_records,
+                            (counter - self.error_records)
+                            == (total_records - self.error_records),
+                        ),
                    )
                    await self.get_job_status()
                    sleep(0.25)
@@ -354,14 +429,35 @@ class MARCImportJob:
                     self.record_batch.append(record.as_marc())
                     counter += 1
                 else:
+                    logger.data_issues(
+                        "RECORD FAILED\t%s\t%s\t%s",
+                        f"{file_path.name}:{idx}",
+                        f"Error reading {idx} record from {file_path}. Skipping. Writing current chunk to {self.bad_records_file.name}.",
+                        "",
+                    )
                     self.bad_records_file.write(reader.current_chunk)
             if self.record_batch:
                 await self.process_record_batch(
-                    await self.create_batch_payload(
+                    await self.create_batch_payload(
+                        counter,
+                        total_records,
+                        (counter - self.error_records)
+                        == (total_records - self.error_records),
+                    ),
                 )
+            import_complete_path = file_path.parent.joinpath("import_complete")
+            if not import_complete_path.exists():
+                logger.debug(f"Creating import_complete directory: {import_complete_path.absolute()}")
+                import_complete_path.mkdir(exist_ok=True)
+            logger.debug(f"Moving {file_path} to {import_complete_path.absolute()}")
+            file_path.rename(
+                file_path.parent.joinpath("import_complete", file_path.name)
+            )
 
     @staticmethod
-    async def apply_marc_record_preprocessing(
+    async def apply_marc_record_preprocessing(
+        record: pymarc.Record, func_or_path
+    ) -> pymarc.Record:
         """
         Apply preprocessing to the MARC record before sending it to FOLIO.
 
@@ -373,25 +469,42 @@ class MARCImportJob:
             pymarc.Record: The preprocessed MARC record.
         """
         if isinstance(func_or_path, str):
-
-
-
-
-
-            except (ImportError, AttributeError) as e:
-                print(f"Error importing preprocessing function {func_or_path}: {e}. Skipping preprocessing.")
-                return record
+            func_paths = func_or_path.split(",")
+            for func_path in func_paths:
+                record = await MARCImportJob._apply_single_marc_record_preprocessing_by_path(
+                    record, func_path
+                )
         elif callable(func_or_path):
-
+            record = func_or_path(record)
         else:
-
-
+            logger.warning(
+                f"Invalid preprocessing function: {func_or_path}. Skipping preprocessing."
+            )
+        return record
 
+    async def _apply_single_marc_record_preprocessing_by_path(
+        record: pymarc.Record, func_path: str
+    ) -> pymarc.Record:
+        """
+        Apply a single preprocessing function to the MARC record.
+
+        Args:
+            record (pymarc.Record): The MARC record to preprocess.
+            func_path (str): The path to the preprocessing function.
+
+        Returns:
+            pymarc.Record: The preprocessed MARC record.
+        """
         try:
-
+            module_path, func_name = func_path.rsplit(".", 1)
+            module = importlib.import_module(module_path)
+            func = getattr(module, func_name)
+            record = func(record)
         except Exception as e:
-
-
+            logger.warning(
+                f"Error applying preprocessing function {func_path}: {e}. Skipping."
+            )
+        return record
 
     async def create_batch_payload(self, counter, total_records, is_last) -> dict:
         """
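With this change the preprocessor argument (`args.preprocessor` in `main()`) may name several functions as a comma-separated list of dotted paths; each is resolved with `importlib` and applied in order. A sketch of the same resolution logic, using two preprocessors that ship in this release (the helper name `resolve_preprocessors` is illustrative, not part of the package):

```python
import importlib

import pymarc

def resolve_preprocessors(spec: str):
    """Turn 'pkg.mod.func1,pkg.mod.func2' into callables, preserving order."""
    funcs = []
    for func_path in spec.split(","):
        module_path, func_name = func_path.rsplit(".", 1)
        funcs.append(getattr(importlib.import_module(module_path), func_name))
    return funcs

spec = (
    "folio_data_import.marc_preprocessors.prepend_ppn_prefix_001,"
    "folio_data_import.marc_preprocessors.fix_leader"
)
record = pymarc.Record()
record.add_field(pymarc.Field(tag="001", data="123456789"))
for func in resolve_preprocessors(spec):
    record = func(record)
print(record["001"].data)  # "(PPN)123456789"
```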
@@ -435,24 +548,26 @@ class MARCImportJob:
             None
         """
         await self.create_folio_import_job()
-        await self.get_import_profile()
         await self.set_job_profile()
         with ExitStack() as stack:
             files = [
                 stack.enter_context(open(file, "rb")) for file in self.current_file
             ]
             total_records = await self.read_total_records(files)
-            with
-
-
-
-
-
-
-
-
-
-
+            with (
+                tqdm(
+                    desc="Imported: ",
+                    total=total_records,
+                    position=1,
+                    disable=self.no_progress,
+                ) as pbar_imported,
+                tqdm(
+                    desc="Sent: ()",
+                    total=total_records,
+                    position=0,
+                    disable=self.no_progress,
+                ) as pbar_sent,
+            ):
                 self.pbar_sent = pbar_sent
                 self.pbar_imported = pbar_imported
                 await self.process_records(files, total_records)
@@ -461,8 +576,8 @@ class MARCImportJob:
                 sleep(1)
             if self.finished:
                 if job_summary := await self.get_job_summary():
-                    job_summary.pop("jobExecutionId")
-                    job_summary.pop("totalErrors")
+                    job_id = job_summary.pop("jobExecutionId", None)
+                    total_errors = job_summary.pop("totalErrors", 0)
                     columns = ["Summary"] + list(job_summary.keys())
                     rows = set()
                     for key in columns[1:]:
@@ -479,17 +594,20 @@ class MARCImportJob:
                     columns = columns[:1] + [
                         " ".join(decamelize(x).split("_")[:-1]) for x in columns[1:]
                     ]
-
+                    logger.info(
                         f"Results for {'file' if len(self.current_file) == 1 else 'files'}: "
                         f"{', '.join([os.path.basename(x.name) for x in self.current_file])}"
                     )
-
-
+                    logger.info(
+                        "\n"
+                        + tabulate.tabulate(
                             table_data, headers=columns, tablefmt="fancy_grid"
                         ),
                     )
+                    if total_errors:
+                        logger.info(f"Total errors: {total_errors}. Job ID: {job_id}.")
                 else:
-
+                    logger.error(f"No job summary available for job {self.job_id}.")
                 self.last_current = 0
                 self.finished = False
 
@@ -502,11 +620,12 @@ class MARCImportJob:
         """
         try:
             self.current_retry_timeout = (
-                self.current_retry_timeout * RETRY_TIMEOUT_RETRY_FACTOR
-
+                (self.current_retry_timeout * RETRY_TIMEOUT_RETRY_FACTOR)
+                if self.current_retry_timeout
+                else RETRY_TIMEOUT_START
+            )
             with httpx.Client(
-                timeout=self.current_retry_timeout,
-                verify=self.folio_client.ssl_verify
+                timeout=self.current_retry_timeout, verify=self.folio_client.ssl_verify
             ) as temp_client:
                 self.folio_client.httpx_client = temp_client
                 job_summary = self.folio_client.folio_get(
@@ -514,21 +633,75 @@ class MARCImportJob:
                 )
             self.current_retry_timeout = None
         except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
-            if
-
+            error_text = e.response.text if hasattr(e, "response") else str(e)
+            if (self._max_summary_retries > self._summary_retries and not hasattr(e, "response")) or (
+                e.response.status_code in [502, 504] and not self.let_summary_fail
+            ):
+                logger.warning(f"SERVER ERROR fetching job summary: {e}. Retrying.")
+                sleep(0.25)
                 with httpx.Client(
                     timeout=self.current_retry_timeout,
-                    verify=self.folio_client.ssl_verify
+                    verify=self.folio_client.ssl_verify,
                 ) as temp_client:
                     self.folio_client.httpx_client = temp_client
-
-
+                    self._summary_retries += 1
+                    return await self.get_job_summary()
+            elif (self._summary_retries >= self._max_summary_retries) or (hasattr(e, "response") and (
+                e.response.status_code in [502, 504] and self.let_summary_fail)
+            ):
+                logger.warning(
+                    f"SERVER ERROR fetching job summary: {error_text}. Skipping final summary check."
+                )
                 job_summary = {}
             else:
                 raise e
         return job_summary
 
 
+def set_up_cli_logging():
+    """
+    This function sets up logging for the CLI.
+    """
+    logger.setLevel(logging.INFO)
+    logger.propagate = False
+
+    # Set up file and stream handlers
+    file_handler = logging.FileHandler(
+        "folio_data_import_{}.log".format(dt.now().strftime("%Y%m%d%H%M%S"))
+    )
+    file_handler.setLevel(logging.INFO)
+    file_handler.addFilter(ExcludeLevelFilter(DATA_ISSUE_LVL_NUM))
+    # file_handler.addFilter(IncludeLevelFilter(25))
+    file_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+    file_handler.setFormatter(file_formatter)
+    logger.addHandler(file_handler)
+
+    if not any(
+        isinstance(h, logging.StreamHandler) and h.stream == sys.stderr
+        for h in logger.handlers
+    ):
+        stream_handler = logging.StreamHandler(sys.stdout)
+        stream_handler.setLevel(logging.INFO)
+        stream_handler.addFilter(ExcludeLevelFilter(DATA_ISSUE_LVL_NUM))
+        # stream_handler.addFilter(ExcludeLevelFilter(25))
+        stream_formatter = logging.Formatter("%(message)s")
+        stream_handler.setFormatter(stream_formatter)
+        logger.addHandler(stream_handler)
+
+    # Set up data issues logging
+    data_issues_handler = logging.FileHandler(
+        "marc_import_data_issues_{}.log".format(dt.now().strftime("%Y%m%d%H%M%S"))
+    )
+    data_issues_handler.setLevel(26)
+    data_issues_handler.addFilter(IncludeLevelFilter(DATA_ISSUE_LVL_NUM))
+    data_issues_formatter = logging.Formatter("%(message)s")
+    data_issues_handler.setFormatter(data_issues_formatter)
+    logger.addHandler(data_issues_handler)
+
+    # Stop httpx from logging info messages to the console
+    logging.getLogger("httpx").setLevel(logging.WARNING)
+
+
 async def main() -> None:
     """
     Main function to run the MARC import job.
@@ -536,6 +709,7 @@ async def main() -> None:
     This function parses command line arguments, initializes the FolioClient,
     and runs the MARCImportJob.
     """
+    set_up_cli_logging()
     parser = argparse.ArgumentParser()
     parser.add_argument("--gateway_url", type=str, help="The FOLIO API Gateway URL")
     parser.add_argument("--tenant_id", type=str, help="The FOLIO tenant ID")
@@ -592,6 +766,11 @@ async def main() -> None:
         action="store_true",
         help="Disable progress bars (eg. for running in a CI environment)",
     )
+    parser.add_argument(
+        "--let-summary-fail",
+        action="store_true",
+        help="Do not retry fetching the final job summary if it fails",
+    )
     args = parser.parse_args()
     if not args.password:
         args.password = getpass("Enter FOLIO password: ")
@@ -611,10 +790,10 @@ async def main() -> None:
     marc_files.sort()
 
     if len(marc_files) == 0:
-
+        logger.critical(f"No files found matching {args.marc_file_path}. Exiting.")
         sys.exit(1)
     else:
-
+        logger.info(marc_files)
 
     if not args.import_profile_name:
         import_profiles = folio_client.folio_get(
@@ -646,12 +825,31 @@ async def main() -> None:
             marc_record_preprocessor=args.preprocessor,
             consolidate=bool(args.consolidate),
             no_progress=bool(args.no_progress),
+            let_summary_fail=bool(args.let_summary_fail),
         ).do_work()
     except Exception as e:
-
+        logger.error("Error importing files: " + str(e))
         raise
 
 
+class ExcludeLevelFilter(logging.Filter):
+    def __init__(self, level):
+        super().__init__()
+        self.level = level
+
+    def filter(self, record):
+        return record.levelno != self.level
+
+
+class IncludeLevelFilter(logging.Filter):
+    def __init__(self, level):
+        super().__init__()
+        self.level = level
+
+    def filter(self, record):
+        return record.levelno == self.level
+
+
 def sync_main() -> None:
     """
     Synchronous main function to run the MARC import job.
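For programmatic use, the new constructor arguments line up with the CLI flags added above. A hedged sketch: gateway URL, tenant, credentials, file path, and profile name are placeholders, the keyword names are taken from this diff, and `FolioClient`'s positional argument order is assumed rather than confirmed here:

```python
import asyncio
from pathlib import Path

import folioclient
from folio_data_import.MARCDataImport import MARCImportJob

folio_client = folioclient.FolioClient(
    "https://folio-gateway.example.edu",  # placeholder gateway URL
    "mytenant",                           # placeholder tenant id
    "my_user",
    "my_password",
)

job = MARCImportJob(
    folio_client=folio_client,
    marc_files=[Path("./records.mrc")],                                  # placeholder file
    import_profile_name="Default - Create instance and SRS MARC Bib",    # placeholder profile
    marc_record_preprocessor="folio_data_import.marc_preprocessors.clean_empty_fields",
    no_progress=True,        # disable progress bars
    let_summary_fail=True,   # new in this release; pairs with the --let-summary-fail flag
)
asyncio.run(job.do_work())
```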
folio_data_import/marc_preprocessors/__init__.py
@@ -1 +1 @@
-from ._preprocessors import
+from ._preprocessors import *
folio_data_import/marc_preprocessors/_preprocessors.py
@@ -1,4 +1,8 @@
 import pymarc
+import logging
+
+logger = logging.getLogger("folio_data_import.MARCDataImport")
+
 
 def prepend_prefix_001(record: pymarc.Record, prefix: str) -> pymarc.Record:
     """
@@ -11,9 +15,10 @@ def prepend_prefix_001(record: pymarc.Record, prefix: str) -> pymarc.Record:
     Returns:
         pymarc.Record: The preprocessed MARC record.
     """
-    record[
+    record["001"].data = f"({prefix})" + record["001"].data
     return record
 
+
 def prepend_ppn_prefix_001(record: pymarc.Record) -> pymarc.Record:
     """
     Prepend the PPN prefix to the record's 001 field. Useful when
@@ -25,7 +30,8 @@ def prepend_ppn_prefix_001(record: pymarc.Record) -> pymarc.Record:
     Returns:
         pymarc.Record: The preprocessed MARC record.
     """
-    return prepend_prefix_001(record,
+    return prepend_prefix_001(record, "PPN")
+
 
 def prepend_abes_prefix_001(record: pymarc.Record) -> pymarc.Record:
     """
@@ -38,7 +44,8 @@ def prepend_abes_prefix_001(record: pymarc.Record) -> pymarc.Record:
     Returns:
         pymarc.Record: The preprocessed MARC record.
     """
-    return prepend_prefix_001(record,
+    return prepend_prefix_001(record, "ABES")
+
 
 def strip_999_ff_fields(record: pymarc.Record) -> pymarc.Record:
     """
@@ -51,11 +58,35 @@ def strip_999_ff_fields(record: pymarc.Record) -> pymarc.Record:
     Returns:
         pymarc.Record: The preprocessed MARC record.
     """
-    for field in record.get_fields(
-        if field.indicators == pymarc.Indicators(*[
+    for field in record.get_fields("999"):
+        if field.indicators == pymarc.Indicators(*["f", "f"]):
             record.remove_field(field)
     return record
 
+def clean_999_fields(record: pymarc.Record) -> pymarc.Record:
+    """
+    The presence of 999 fields, with or without ff indicators, can cause
+    issues with data import mapping in FOLIO. This function calls strip_999_ff_fields
+    to remove 999 fields with ff indicators and then copies the remaining 999 fields
+    to 945 fields.
+
+    Args:
+        record (pymarc.Record): The MARC record to preprocess.
+
+    Returns:
+        pymarc.Record: The preprocessed MARC record.
+    """
+    record = strip_999_ff_fields(record)
+    for field in record.get_fields("999"):
+        _945 = pymarc.Field(
+            tag="945",
+            indicators=field.indicators,
+            subfields=field.subfields,
+        )
+        record.add_ordered_field(_945)
+        record.remove_field(field)
+    return record
+
 def sudoc_supercede_prep(record: pymarc.Record) -> pymarc.Record:
     """
     Preprocesses a record from the ABES SUDOC catalog to copy 035 fields
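The new `clean_999_fields` preprocessor drops 999 ff fields and moves any remaining 999s to 945 so their data survives import without tripping FOLIO's mapping. A small usage sketch with a made-up local 999 field, assuming the package's wildcard re-export in `marc_preprocessors/__init__.py` (changed earlier in this diff) exposes the helper:

```python
import pymarc
from folio_data_import.marc_preprocessors import clean_999_fields

record = pymarc.Record()
record.add_field(pymarc.Field(tag="001", data="in00000000001"))
record.add_field(
    pymarc.Field(
        tag="999",
        indicators=pymarc.Indicators(" ", " "),
        subfields=[pymarc.field.Subfield("a", "local call number")],
    )
)

record = clean_999_fields(record)
print(record.get_fields("999"))  # [] - nothing left to confuse data import mapping
print(record.get_fields("945"))  # the same subfields, now carried in a 945 field
```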
@@ -71,14 +102,232 @@ def sudoc_supercede_prep(record: pymarc.Record) -> pymarc.Record:
     pymarc.Record: The preprocessed MARC record.
     """
     record = prepend_abes_prefix_001(record)
-    for field in record.get_fields(
-        if "a" in field and "9" in field and field[
+    for field in record.get_fields("035"):
+        if "a" in field and "9" in field and field["9"] == "sudoc":
             _935 = pymarc.Field(
-                tag=
-                indicators=[
-                subfields=[
-                    pymarc.field.Subfield('a', "(ABES)" + field['a'])
-                ]
+                tag="935",
+                indicators=["f", "f"],
+                subfields=[pymarc.field.Subfield("a", "(ABES)" + field["a"])],
             )
             record.add_ordered_field(_935)
     return record
+
+
+def clean_empty_fields(record: pymarc.Record) -> pymarc.Record:
+    """
+    Remove empty fields and subfields from the record. These can cause
+    data import mapping issues in FOLIO. Removals are logged at custom
+    log level 26, which is used by folio_migration_tools to populate the
+    data issues report.
+
+    Args:
+        record (pymarc.Record): The MARC record to preprocess.
+
+    Returns:
+        pymarc.Record: The preprocessed MARC record.
+    """
+    MAPPED_FIELDS = {
+        "010": ["a", "z"],
+        "020": ["a", "y", "z"],
+        "035": ["a", "z"],
+        "040": ["a", "b", "c", "d", "e", "f", "g", "h", "k", "m", "n", "p", "r", "s"],
+        "050": ["a", "b"],
+        "082": ["a", "b"],
+        "100": ["a", "b", "c", "d", "q"],
+        "110": ["a", "b", "c"],
+        "111": ["a", "c", "d"],
+        "130": [
+            "a",
+            "d",
+            "f",
+            "k",
+            "l",
+            "m",
+            "n",
+            "o",
+            "p",
+            "r",
+            "s",
+            "t",
+            "x",
+            "y",
+            "z",
+        ],
+        "180": ["x", "y", "z"],
+        "210": ["a", "c"],
+        "240": ["a", "f", "k", "l", "m", "n", "o", "p", "r", "s", "t", "x", "y", "z"],
+        "245": ["a", "b", "c", "f", "g", "h", "k", "n", "p", "s"],
+        "246": ["a", "f", "g", "n", "p", "s"],
+        "250": ["a", "b"],
+        "260": ["a", "b", "c", "e", "f", "g"],
+        "300": ["a", "b", "c", "e", "f", "g"],
+        "440": ["a", "n", "p", "v", "x", "y", "z"],
+        "490": ["a", "v", "x", "y", "z"],
+        "500": ["a", "c", "d", "n", "p", "v", "x", "y", "z"],
+        "505": ["a", "g", "r", "t", "u"],
+        "520": ["a", "b", "c", "u"],
+        "600": ["a", "b", "c", "d", "q", "t", "v", "x", "y", "z"],
+        "610": ["a", "b", "c", "d", "t", "v", "x", "y", "z"],
+        "611": ["a", "c", "d", "t", "v", "x", "y", "z"],
+        "630": [
+            "a",
+            "d",
+            "f",
+            "k",
+            "l",
+            "m",
+            "n",
+            "o",
+            "p",
+            "r",
+            "s",
+            "t",
+            "x",
+            "y",
+            "z",
+        ],
+        "650": ["a", "d", "v", "x", "y", "z"],
+        "651": ["a", "v", "x", "y", "z"],
+        "655": ["a", "v", "x", "y", "z"],
+        "700": ["a", "b", "c", "d", "q", "t", "v", "x", "y", "z"],
+        "710": ["a", "b", "c", "d", "t", "v", "x", "y", "z"],
+        "711": ["a", "c", "d", "t", "v", "x", "y", "z"],
+        "730": [
+            "a",
+            "d",
+            "f",
+            "k",
+            "l",
+            "m",
+            "n",
+            "o",
+            "p",
+            "r",
+            "s",
+            "t",
+            "x",
+            "y",
+            "z",
+        ],
+        "740": ["a", "n", "p", "v", "x", "y", "z"],
+        "800": ["a", "b", "c", "d", "q", "t", "v", "x", "y", "z"],
+        "810": ["a", "b", "c", "d", "t", "v", "x", "y", "z"],
+        "811": ["a", "c", "d", "t", "v", "x", "y", "z"],
+        "830": [
+            "a",
+            "d",
+            "f",
+            "k",
+            "l",
+            "m",
+            "n",
+            "o",
+            "p",
+            "r",
+            "s",
+            "t",
+            "x",
+            "y",
+            "z",
+        ],
+        "856": ["u", "y", "z"],
+    }
+
+    for field in list(record.get_fields()):
+        len_subs = len(field.subfields)
+        subfield_value = bool(field.subfields[0].value) if len_subs > 0 else False
+        if not int(field.tag) >= 900 and field.tag in MAPPED_FIELDS:
+            if int(field.tag) > 9 and len_subs == 0:
+                logger.log(
+                    26,
+                    "DATA ISSUE\t%s\t%s\t%s",
+                    record["001"].value(),
+                    f"{field.tag} is empty, removing field",
+                    field,
+                )
+                record.remove_field(field)
+            elif len_subs == 1 and not subfield_value:
+                logger.log(
+                    26,
+                    "DATA ISSUE\t%s\t%s\t%s",
+                    record["001"].value(),
+                    f"{field.tag}${field.subfields[0].code} is empty, no other subfields present, removing field",
+                    field,
+                )
+                record.remove_field(field)
+            else:
+                if len_subs > 1 and "a" in field and not field["a"].strip():
+                    logger.log(
+                        26,
+                        "DATA ISSUE\t%s\t%s\t%s",
+                        record["001"].value(),
+                        f"{field.tag}$a is empty, removing subfield",
+                        field,
+                    )
+                    field.delete_subfield("a")
+                for idx, subfield in enumerate(list(field.subfields), start=1):
+                    if (
+                        subfield.code in MAPPED_FIELDS.get(field.tag, [])
+                        and not subfield.value
+                    ):
+                        logger.log(
+                            26,
+                            "DATA ISSUE\t%s\t%s\t%s",
+                            record["001"].value(),
+                            f"{field.tag}${subfield.code} ({ordinal(idx)} subfield) is empty, but other subfields have values, removing subfield",
+                            field,
+                        )
+                        field.delete_subfield(subfield.code)
+                if len(field.subfields) == 0:
+                    logger.log(
+                        26,
+                        "DATA ISSUE\t%s\t%s\t%s",
+                        record["001"].value(),
+                        f"{field.tag} has no non-empty subfields after cleaning, removing field",
+                        field,
+                    )
+                    record.remove_field(field)
+    return record
+
+
+def fix_leader(record: pymarc.Record) -> pymarc.Record:
+    """
+    Fixes the leader of the record by setting the record status to 'c' (modified
+    record) and the type of record to 'a' (language material).
+
+    Args:
+        record (pymarc.Record): The MARC record to preprocess.
+
+    Returns:
+        pymarc.Record: The preprocessed MARC record.
+    """
+    VALID_STATUSES = ["a", "c", "d", "n", "p"]
+    VALID_TYPES = ["a", "c", "d", "e", "f", "g", "i", "j", "k", "m", "o", "p", "r", "t"]
+    if record.leader[5] not in VALID_STATUSES:
+        logger.log(
+            26,
+            "DATA ISSUE\t%s\t%s\t%s",
+            record["001"].value(),
+            f"Invalid record status: {record.leader[5]}, setting to 'c'",
+            record,
+        )
+        record.leader = pymarc.Leader(record.leader[:5] + "c" + record.leader[6:])
+    if record.leader[6] not in VALID_TYPES:
+        logger.log(
+            26,
+            "DATA ISSUE\t%s\t%s\t%s",
+            record["001"].value(),
+            f"Invalid record type: {record.leader[6]}, setting to 'a'",
+            record,
+        )
+        record.leader = pymarc.Leader(record.leader[:6] + "a" + record.leader[7:])
+    return record
+
+
+def ordinal(n):
+    s = ("th", "st", "nd", "rd") + ("th",) * 10
+    v = n % 100
+    if v > 13:
+        return f"{n}{s[v % 10]}"
+    else:
+        return f"{n}{s[v]}"
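`fix_leader` only rewrites leader positions 5 (record status) and 6 (type of record) when they fall outside the valid MARC 21 value lists, logging each change at the DATA_ISSUES level, and `ordinal` feeds the data-issues messages for empty subfields. A sketch on a freshly created record, whose blank leader bytes are invalid:

```python
import pymarc
from folio_data_import.marc_preprocessors import fix_leader, ordinal

record = pymarc.Record()  # a fresh pymarc record has blank status/type leader bytes
record.add_field(pymarc.Field(tag="001", data="in00000000002"))

record = fix_leader(record)
print(record.leader[5], record.leader[6])  # "c" (modified record) and "a" (language material)

# The ordinal() helper used in the empty-subfield messages:
print(ordinal(1), ordinal(2), ordinal(11), ordinal(23))  # 1st 2nd 11th 23rd
```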
{folio_data_import-0.2.8rc7.dist-info → folio_data_import-0.2.8rc9.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: folio_data_import
-Version: 0.2.8rc7
+Version: 0.2.8rc9
 Summary: A python module to interact with the data importing capabilities of the open-source FOLIO ILS
 License: MIT
 Author: Brooks Travis
@@ -108,11 +108,11 @@ Unlike mod-user-import, this importer does not require `externalSystemId` as the
 
 #### Preferred Contact Type Mapping
 
-Another point of departure from the behavior of `mod-user-import` is the handling of `preferredContactTypeId`. This importer will accept either the `"001", "002", "003"...` values stored by
+Another point of departure from the behavior of `mod-user-import` is the handling of `preferredContactTypeId`. This importer will accept either the `"001", "002", "003"...` values stored by FOLIO, or the human-friendly strings used by `mod-user-import` (`"mail", "email", "text", "phone", "mobile"`). It will also __*set a customizable default for all users that do not otherwise have a valid value specified*__ (using `--default_preferred_contact_type`), unless a (valid) value is already present in the user record being updated.
 
 #### Field Protection (*experimental*)
 
-This script offers a rudimentary field protection implementation using custom fields. To enable this functionality, create a text custom field that has the field name `protectedFields`. In this field, you
+This script offers a rudimentary field protection implementation using custom fields. To enable this functionality, create a text custom field that has the field name `protectedFields`. In this field, you can specify a comma-separated list of User schema field names, using dot-notation for nested fields. This protection should support all standard fields except addresses within `personal.addresses`. If you include `personal.addresses` in a user record, any existing addresses will be replaced by the new values.
 
 ##### Example
 
folio_data_import-0.2.8rc9.dist-info/RECORD
@@ -0,0 +1,11 @@
+folio_data_import/MARCDataImport.py,sha256=ImbuGw1ADt4nCmq0lLaqugP2wv5kBrgMGAr0jbKSgFc,33135
+folio_data_import/UserImport.py,sha256=Y9ZjYoUP_vNJVftx_xUcbBqvC5CwWeuzlmCcSVQfzgo,40976
+folio_data_import/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+folio_data_import/__main__.py,sha256=kav_uUsnrIjGjVxQkk3exLKrc1mah9t2x3G6bGS-5I0,3710
+folio_data_import/marc_preprocessors/__init__.py,sha256=urExfNTQoZsDCtDPcUY9EEC5OFcUihxhYEQkQFVzbMY,30
+folio_data_import/marc_preprocessors/_preprocessors.py,sha256=4i1_lEnptzZDx3DojX9sfvJ_hmehwFJUC3aZsUADcwA,10851
+folio_data_import-0.2.8rc9.dist-info/LICENSE,sha256=qJX7wxMC7ky9Kq4v3zij8MjGEiC5wsB7pYeOhLj5TDk,1083
+folio_data_import-0.2.8rc9.dist-info/METADATA,sha256=Q80K34yk3xcZPfCf50FBtAYY7Hrxb3ukbAAGAv4uCEs,6112
+folio_data_import-0.2.8rc9.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
+folio_data_import-0.2.8rc9.dist-info/entry_points.txt,sha256=498SxWVXeEMRNw3PUf-eoReZvKewmYwPBtZhIUPr_Jg,192
+folio_data_import-0.2.8rc9.dist-info/RECORD,,
folio_data_import-0.2.8rc7.dist-info/RECORD
@@ -1,11 +0,0 @@
-folio_data_import/MARCDataImport.py,sha256=b4Qdja0S1_mrrxjVkVsEBatm0Kr2ZfI3b5ZRkzD0kRA,24845
-folio_data_import/UserImport.py,sha256=Y9ZjYoUP_vNJVftx_xUcbBqvC5CwWeuzlmCcSVQfzgo,40976
-folio_data_import/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-folio_data_import/__main__.py,sha256=kav_uUsnrIjGjVxQkk3exLKrc1mah9t2x3G6bGS-5I0,3710
-folio_data_import/marc_preprocessors/__init__.py,sha256=Wt-TKkMhUyZWFS-WhAmbShKQLPjXmHKPb2vL6kvkqVA,72
-folio_data_import/marc_preprocessors/_preprocessors.py,sha256=CMG4Xq3vR84xmNFUJfRmnU0A5lchAfK33xDzeiC2AWk,2787
-folio_data_import-0.2.8rc7.dist-info/LICENSE,sha256=qJX7wxMC7ky9Kq4v3zij8MjGEiC5wsB7pYeOhLj5TDk,1083
-folio_data_import-0.2.8rc7.dist-info/METADATA,sha256=NtI0ue7pc177DJ7KtCgYekGYbeeDs-PC_-HI0AtZmCI,6115
-folio_data_import-0.2.8rc7.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
-folio_data_import-0.2.8rc7.dist-info/entry_points.txt,sha256=498SxWVXeEMRNw3PUf-eoReZvKewmYwPBtZhIUPr_Jg,192
-folio_data_import-0.2.8rc7.dist-info/RECORD,,
{folio_data_import-0.2.8rc7.dist-info → folio_data_import-0.2.8rc9.dist-info}/LICENSE
RENAMED
File without changes
{folio_data_import-0.2.8rc7.dist-info → folio_data_import-0.2.8rc9.dist-info}/WHEEL
RENAMED
File without changes
{folio_data_import-0.2.8rc7.dist-info → folio_data_import-0.2.8rc9.dist-info}/entry_points.txt
RENAMED
File without changes