folio-data-import 0.2.8rc7__tar.gz → 0.2.8rc8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {folio_data_import-0.2.8rc7 → folio_data_import-0.2.8rc8}/PKG-INFO +3 -3
- {folio_data_import-0.2.8rc7 → folio_data_import-0.2.8rc8}/README.md +2 -2
- {folio_data_import-0.2.8rc7 → folio_data_import-0.2.8rc8}/pyproject.toml +1 -1
- {folio_data_import-0.2.8rc7 → folio_data_import-0.2.8rc8}/src/folio_data_import/MARCDataImport.py +268 -97
- folio_data_import-0.2.8rc8/src/folio_data_import/marc_preprocessors/__init__.py +1 -0
- folio_data_import-0.2.8rc8/src/folio_data_import/marc_preprocessors/_preprocessors.py +273 -0
- folio_data_import-0.2.8rc7/src/folio_data_import/marc_preprocessors/__init__.py +0 -1
- folio_data_import-0.2.8rc7/src/folio_data_import/marc_preprocessors/_preprocessors.py +0 -84
- {folio_data_import-0.2.8rc7 → folio_data_import-0.2.8rc8}/LICENSE +0 -0
- {folio_data_import-0.2.8rc7 → folio_data_import-0.2.8rc8}/src/folio_data_import/UserImport.py +0 -0
- {folio_data_import-0.2.8rc7 → folio_data_import-0.2.8rc8}/src/folio_data_import/__init__.py +0 -0
- {folio_data_import-0.2.8rc7 → folio_data_import-0.2.8rc8}/src/folio_data_import/__main__.py +0 -0
{folio_data_import-0.2.8rc7 → folio_data_import-0.2.8rc8}/PKG-INFO RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: folio_data_import
-Version: 0.2.8rc7
+Version: 0.2.8rc8
 Summary: A python module to interact with the data importing capabilities of the open-source FOLIO ILS
 License: MIT
 Author: Brooks Travis
@@ -108,11 +108,11 @@ Unlike mod-user-import, this importer does not require `externalSystemId` as the
 
 #### Preferred Contact Type Mapping
 
-Another point of departure from the behavior of `mod-user-import` is the handling of `preferredContactTypeId`. This importer will accept either the `"001", "002", "003"...` values stored by
+Another point of departure from the behavior of `mod-user-import` is the handling of `preferredContactTypeId`. This importer will accept either the `"001", "002", "003"...` values stored by FOLIO, or the human-friendly strings used by `mod-user-import` (`"mail", "email", "text", "phone", "mobile"`). It will also __*set a customizable default for all users that do not otherwise have a valid value specified*__ (using `--default_preferred_contact_type`), unless a (valid) value is already present in the user record being updated.
 
 #### Field Protection (*experimental*)
 
-This script offers a rudimentary field protection implementation using custom fields. To enable this functionality, create a text custom field that has the field name `protectedFields`. In this field, you
+This script offers a rudimentary field protection implementation using custom fields. To enable this functionality, create a text custom field that has the field name `protectedFields`. In this field, you can specify a comma-separated list of User schema field names, using dot-notation for nested fields. This protection should support all standard fields except addresses within `personal.addresses`. If you include `personal.addresses` in a user record, any existing addresses will be replaced by the new values.
 
 ##### Example
 
{folio_data_import-0.2.8rc7 → folio_data_import-0.2.8rc8}/README.md RENAMED
@@ -78,11 +78,11 @@ Unlike mod-user-import, this importer does not require `externalSystemId` as the
 
 #### Preferred Contact Type Mapping
 
-Another point of departure from the behavior of `mod-user-import` is the handling of `preferredContactTypeId`. This importer will accept either the `"001", "002", "003"...` values stored by
+Another point of departure from the behavior of `mod-user-import` is the handling of `preferredContactTypeId`. This importer will accept either the `"001", "002", "003"...` values stored by FOLIO, or the human-friendly strings used by `mod-user-import` (`"mail", "email", "text", "phone", "mobile"`). It will also __*set a customizable default for all users that do not otherwise have a valid value specified*__ (using `--default_preferred_contact_type`), unless a (valid) value is already present in the user record being updated.
 
 #### Field Protection (*experimental*)
 
-This script offers a rudimentary field protection implementation using custom fields. To enable this functionality, create a text custom field that has the field name `protectedFields`. In this field, you
+This script offers a rudimentary field protection implementation using custom fields. To enable this functionality, create a text custom field that has the field name `protectedFields`. In this field, you can specify a comma-separated list of User schema field names, using dot-notation for nested fields. This protection should support all standard fields except addresses within `personal.addresses`. If you include `personal.addresses` in a user record, any existing addresses will be replaced by the new values.
 
 ##### Example
 
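To make the two behaviors described in the README text above concrete, here is a hypothetical user record fragment (not taken from the package or its documentation) showing a human-friendly `preferredContactTypeId` value and a `protectedFields` custom field that uses dot-notation; all other field values are illustrative assumptions:

```python
# Hypothetical illustration only; field values are not package fixtures.
user_record = {
    "username": "jdoe",
    "externalSystemId": "ext-0001",
    "personal": {
        "lastName": "Doe",
        "email": "jdoe@example.edu",
        # Either the stored code ("002") or a friendly string such as "email" is accepted.
        "preferredContactTypeId": "email",
    },
    "customFields": {
        # Comma-separated User schema field names, dot-notation for nested fields.
        # These fields are preserved on update; personal.addresses cannot be protected.
        "protectedFields": "personal.email,barcode"
    },
}
```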
{folio_data_import-0.2.8rc7 → folio_data_import-0.2.8rc8}/pyproject.toml RENAMED
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "folio_data_import"
-version = "0.2.8rc7"
+version = "0.2.8rc8"
 description = "A python module to interact with the data importing capabilities of the open-source FOLIO ILS"
 authors = ["Brooks Travis <brooks.travis@gmail.com>"]
 license = "MIT"
{folio_data_import-0.2.8rc7 → folio_data_import-0.2.8rc8}/src/folio_data_import/MARCDataImport.py RENAMED
@@ -1,18 +1,21 @@
 import argparse
 import asyncio
+import datetime
+from email import message
 import glob
 import importlib
 import io
+import logging
 import os
 import sys
-from typing import List
 import uuid
 from contextlib import ExitStack
-import datetime
 from datetime import datetime as dt
+from functools import cached_property
 from getpass import getpass
 from pathlib import Path
 from time import sleep
+from typing import List
 
 import folioclient
 import httpx
@@ -21,8 +24,6 @@ import pymarc
 import tabulate
 from humps import decamelize
 from tqdm import tqdm
-from zmq import has
-
 
 try:
     datetime_utc = datetime.UTC
@@ -37,6 +38,18 @@ REPORT_SUMMARY_ORDERING = {"created": 0, "updated": 1, "discarded": 2, "error":
 RETRY_TIMEOUT_START = 1
 RETRY_TIMEOUT_RETRY_FACTOR = 2
 
+# Custom log level for data issues, set to 26
+DATA_ISSUE_LVL_NUM = 26
+logging.addLevelName(DATA_ISSUE_LVL_NUM, "DATA_ISSUES")
+
+def data_issues(self, msg, *args, **kws):
+    if self.isEnabledFor(DATA_ISSUE_LVL_NUM):
+        self._log(DATA_ISSUE_LVL_NUM, msg, args, **kws)
+
+logging.Logger.data_issues = data_issues
+
+logger = logging.getLogger(__name__)
+
 class MARCImportJob:
     """
     Class to manage importing MARC data (Bib, Authority) into FOLIO using the Change Manager
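The hunk above registers a custom DATA_ISSUES log level (26) and patches a `data_issues()` convenience method onto `logging.Logger`, which the record-reading loop uses later in this diff. A minimal, self-contained sketch of the same pattern (the logger name and message values here are illustrative assumptions, not taken from the package):

```python
import logging

DATA_ISSUE_LVL_NUM = 26
logging.addLevelName(DATA_ISSUE_LVL_NUM, "DATA_ISSUES")

def data_issues(self, msg, *args, **kws):
    # Emit only when level 26 is enabled for this logger, mirroring the patched method.
    if self.isEnabledFor(DATA_ISSUE_LVL_NUM):
        self._log(DATA_ISSUE_LVL_NUM, msg, args, **kws)

logging.Logger.data_issues = data_issues

log = logging.getLogger("demo")
log.setLevel(DATA_ISSUE_LVL_NUM)
log.addHandler(logging.StreamHandler())
# Tab-separated source location, message, and offending data, as the importer logs them.
log.data_issues("RECORD FAILED\t%s\t%s\t%s", "records.mrc:12", "could not parse record", "")
```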
@@ -57,7 +70,6 @@ class MARCImportJob:
     bad_records_file: io.TextIOWrapper
     failed_batches_file: io.TextIOWrapper
     job_id: str
-    job_import_profile: dict
     pbar_sent: tqdm
     pbar_imported: tqdm
     http_client: httpx.Client
@@ -78,9 +90,11 @@ class MARCImportJob:
         marc_record_preprocessor=None,
         consolidate=False,
         no_progress=False,
+        let_summary_fail=False,
     ) -> None:
         self.consolidate_files = consolidate
         self.no_progress = no_progress
+        self.let_summary_fail = let_summary_fail
         self.folio_client: folioclient.FolioClient = folio_client
         self.import_files = marc_files
         self.import_profile_name = import_profile_name
@@ -88,6 +102,8 @@ class MARCImportJob:
         self.batch_delay = batch_delay
         self.current_retry_timeout = None
         self.marc_record_preprocessor = marc_record_preprocessor
+        self.pbar_sent: tqdm
+        self.pbar_imported: tqdm
 
     async def do_work(self) -> None:
         """
@@ -101,21 +117,25 @@ class MARCImportJob:
         Returns:
             None
         """
-        with
-
-
-
-
-
-
-
-
-
-
+        with (
+            httpx.Client() as http_client,
+            open(
+                self.import_files[0].parent.joinpath(
+                    f"bad_marc_records_{dt.now(tz=datetime_utc).strftime('%Y%m%d%H%M%S')}.mrc"
+                ),
+                "wb+",
+            ) as bad_marc_file,
+            open(
+                self.import_files[0].parent.joinpath(
+                    f"failed_batches_{dt.now(tz=datetime_utc).strftime('%Y%m%d%H%M%S')}.mrc"
+                ),
+                "wb+",
+            ) as failed_batches,
+        ):
             self.bad_records_file = bad_marc_file
-
+            logger.info(f"Writing bad records to {self.bad_records_file.name}")
             self.failed_batches_file = failed_batches
-
+            logger.info(f"Writing failed batches to {self.failed_batches_file.name}")
             self.http_client = http_client
             if self.consolidate_files:
                 self.current_file = self.import_files
@@ -136,16 +156,16 @@ class MARCImportJob:
         Returns:
             None
         """
-        self.bad_records_file.
-
-
-
-        self.failed_batches_file.
-
-
-
-
-
+        with open(self.bad_records_file.name, "rb") as bad_records:
+            if not bad_records.read(1):
+                os.remove(bad_records.name)
+                logger.info("No bad records found. Removing bad records file.")
+        with open(self.failed_batches_file.name, "rb") as failed_batches:
+            if not failed_batches.read(1):
+                os.remove(failed_batches.name)
+                logger.info("No failed batches. Removing failed batches file.")
+        logger.info("Import complete.")
+        logger.info(f"Total records imported: {self.total_records_sent}")
 
     async def get_job_status(self) -> None:
         """
@@ -159,21 +179,28 @@ class MARCImportJob:
         """
         try:
             self.current_retry_timeout = (
-                self.current_retry_timeout * RETRY_TIMEOUT_RETRY_FACTOR
-
+                (self.current_retry_timeout * RETRY_TIMEOUT_RETRY_FACTOR)
+                if self.current_retry_timeout
+                else RETRY_TIMEOUT_START
+            )
             job_status = self.folio_client.folio_get(
                 "/metadata-provider/jobExecutions?statusNot=DISCARDED&uiStatusAny"
                 "=PREPARING_FOR_PREVIEW&uiStatusAny=READY_FOR_PREVIEW&uiStatusAny=RUNNING&limit=50"
             )
             self.current_retry_timeout = None
-        except (httpx.ConnectTimeout, httpx.ReadTimeout):
-
-
-
-
-
-
-
+        except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
+            if not hasattr(e, "response") or e.response.status_code in [502, 504]:
+                error_text = e.response.text if hasattr(e, "response") else str(e)
+                logger.warning(f"SERVER ERROR fetching job status: {error_text}. Retrying.")
+                sleep(0.25)
+                with httpx.Client(
+                    timeout=self.current_retry_timeout,
+                    verify=self.folio_client.ssl_verify,
+                ) as temp_client:
+                    self.folio_client.httpx_client = temp_client
+                    return await self.get_job_status()
+            else:
+                raise e
         try:
             status = [
                 job for job in job_status["jobExecutions"] if job["id"] == self.job_id
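In the retry path above, the request timeout now starts at `RETRY_TIMEOUT_START` and is multiplied by `RETRY_TIMEOUT_RETRY_FACTOR` on each consecutive failure, then cleared on success. A small sketch of how that expression grows the timeout across repeated failures (the loop is purely illustrative):

```python
RETRY_TIMEOUT_START = 1
RETRY_TIMEOUT_RETRY_FACTOR = 2

current_retry_timeout = None
timeouts = []
for _ in range(4):  # pretend four status checks in a row fail
    current_retry_timeout = (
        (current_retry_timeout * RETRY_TIMEOUT_RETRY_FACTOR)
        if current_retry_timeout
        else RETRY_TIMEOUT_START
    )
    timeouts.append(current_retry_timeout)

print(timeouts)  # [1, 2, 4, 8] -> used as the httpx.Client timeout for each retry
```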
@@ -181,16 +208,32 @@ class MARCImportJob:
             self.pbar_imported.update(status["progress"]["current"] - self.last_current)
             self.last_current = status["progress"]["current"]
         except IndexError:
-
-
-
-
-
-
-
-
-
-
+            try:
+                job_status = self.folio_client.folio_get(
+                    "/metadata-provider/jobExecutions?limit=100&sortBy=completed_date%2Cdesc&statusAny"
+                    "=COMMITTED&statusAny=ERROR&statusAny=CANCELLED"
+                )
+                status = [
+                    job for job in job_status["jobExecutions"] if job["id"] == self.job_id
+                ][0]
+                self.pbar_imported.update(status["progress"]["current"] - self.last_current)
+                self.last_current = status["progress"]["current"]
+                self.finished = True
+            except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
+                if not hasattr(e, "response") or e.response.status_code in [502, 504]:
+                    error_text = e.response.text if hasattr(e, "response") else str(e)
+                    logger.warning(
+                        f"SERVER ERROR fetching job status: {error_text}. Retrying."
+                    )
+                    sleep(0.25)
+                    with httpx.Client(
+                        timeout=self.current_retry_timeout,
+                        verify=self.folio_client.ssl_verify,
+                    ) as temp_client:
+                        self.folio_client.httpx_client = temp_client
+                        return await self.get_job_status()
+                else:
+                    raise e
 
     async def create_folio_import_job(self) -> None:
         """
@@ -210,7 +253,7 @@ class MARCImportJob:
         try:
             create_job.raise_for_status()
         except httpx.HTTPError as e:
-
+            logger.error(
                 "Error creating job: "
                 + str(e)
                 + "\n"
@@ -218,10 +261,15 @@ class MARCImportJob:
             )
             raise e
         self.job_id = create_job.json()["parentJobExecutionId"]
+        logger.info("Created job: " + self.job_id)
 
-
+    @cached_property
+    def import_profile(self) -> dict:
         """
-
+        Returns the import profile for the current job execution.
+
+        Returns:
+            dict: The import profile for the current job execution.
         """
         import_profiles = self.folio_client.folio_get(
             "/data-import-profiles/jobProfiles",
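The removed `job_import_profile` attribute and separate profile-fetching step are replaced by a `cached_property`, so the `/data-import-profiles/jobProfiles` lookup runs once on first access and is reused afterwards. A sketch of that caching behavior with the FOLIO call replaced by a stub (the stub and its return values are assumptions for illustration):

```python
from functools import cached_property


class JobSketch:
    lookups = 0

    @cached_property
    def import_profile(self) -> dict:
        # Stand-in for folio_get("/data-import-profiles/jobProfiles") plus name matching.
        type(self).lookups += 1
        return {"id": "00000000-0000-0000-0000-000000000000", "name": "Default job profile"}


job = JobSketch()
job.import_profile  # first access performs the lookup
job.import_profile  # subsequent accesses reuse the cached dict
assert JobSketch.lookups == 1
```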
@@ -233,7 +281,7 @@ class MARCImportJob:
             for profile in import_profiles
             if profile["name"] == self.import_profile_name
         ][0]
-
+        return profile
 
     async def set_job_profile(self) -> None:
         """
@@ -249,15 +297,15 @@ class MARCImportJob:
             + "/jobProfile",
             headers=self.folio_client.okapi_headers,
             json={
-                "id": self.
-                "name": self.
+                "id": self.import_profile["id"],
+                "name": self.import_profile["name"],
                 "dataType": "MARC",
             },
         )
         try:
             set_job_profile.raise_for_status()
         except httpx.HTTPError as e:
-
+            logger.error(
                 "Error creating job: "
                 + str(e)
                 + "\n"
@@ -299,8 +347,13 @@ class MARCImportJob:
                 headers=self.folio_client.okapi_headers,
                 json=batch_payload,
             )
-
-
+            # if batch_payload["recordsMetadata"]["last"]:
+            #     logger.log(
+            #         25,
+            #         f"Sending last batch of {batch_payload['recordsMetadata']['total']} records.",
+            #     )
+        except (httpx.ConnectTimeout, httpx.ReadTimeout):
+            sleep(0.25)
             return await self.process_record_batch(batch_payload)
         try:
             post_batch.raise_for_status()
@@ -308,12 +361,14 @@ class MARCImportJob:
             self.record_batch = []
             self.pbar_sent.update(len(batch_payload["initialRecords"]))
         except Exception as e:
-            if
+            if (
+                hasattr(e, "response") and e.response.status_code in [500, 422]
+            ):  # TODO: #26 Check for specific error code once https://folio-org.atlassian.net/browse/MODSOURMAN-1281 is resolved
                 self.total_records_sent += len(self.record_batch)
                 self.record_batch = []
                 self.pbar_sent.update(len(batch_payload["initialRecords"]))
             else:
-
+                logger.error("Error posting batch: " + str(e))
                 for record in self.record_batch:
                     self.failed_batches_file.write(record)
                 self.error_records += len(self.record_batch)
@@ -335,14 +390,20 @@ class MARCImportJob:
         """
         counter = 0
         for import_file in files:
+            file_path = Path(import_file.name)
             self.pbar_sent.set_description(
                 f"Sent ({os.path.basename(import_file.name)}): "
             )
             reader = pymarc.MARCReader(import_file, hide_utf8_warnings=True)
-            for record in reader:
+            for idx, record in enumerate(reader, start=1):
                 if len(self.record_batch) == self.batch_size:
                     await self.process_record_batch(
-                        await self.create_batch_payload(
+                        await self.create_batch_payload(
+                            counter,
+                            total_records,
+                            (counter - self.error_records)
+                            == (total_records - self.error_records),
+                        ),
                     )
                     await self.get_job_status()
                     sleep(0.25)
@@ -354,14 +415,35 @@ class MARCImportJob:
                 self.record_batch.append(record.as_marc())
                 counter += 1
             else:
+                logger.data_issues(
+                    "RECORD FAILED\t%s\t%s\t%s",
+                    f"{file_path.name}:{idx}",
+                    f"Error reading {idx} record from {file_path}. Skipping. Writing current chunk to {self.bad_records_file.name}.",
+                    "",
+                )
                 self.bad_records_file.write(reader.current_chunk)
         if self.record_batch:
             await self.process_record_batch(
-                await self.create_batch_payload(
+                await self.create_batch_payload(
+                    counter,
+                    total_records,
+                    (counter - self.error_records)
+                    == (total_records - self.error_records),
+                ),
             )
+        import_complete_path = file_path.parent.joinpath("import_complete")
+        if import_complete_path.exists():
+            logger.debug(f"Creating import_complete directory: {import_complete_path.absolute()}")
+            import_complete_path.mkdir(exist_ok=True)
+        logger.debug(f"Moving {file_path} to {import_complete_path.absolute()}")
+        file_path.rename(
+            file_path.parent.joinpath("import_complete", file_path.name)
+        )
 
     @staticmethod
-    async def apply_marc_record_preprocessing(
+    async def apply_marc_record_preprocessing(
+        record: pymarc.Record, func_or_path
+    ) -> pymarc.Record:
         """
         Apply preprocessing to the MARC record before sending it to FOLIO.
 
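The `else:` branch above handles records that `pymarc.MARCReader` could not parse: the reader yields `None` for the bad record but keeps the raw bytes on `reader.current_chunk`, which the importer now logs via `logger.data_issues` before writing them to the bad-records file. A minimal sketch of that reader pattern (the file names here are assumptions):

```python
import pymarc

parsed = []
with open("batch.mrc", "rb") as fh, open("bad_records.mrc", "wb") as bad:
    reader = pymarc.MARCReader(fh, hide_utf8_warnings=True)
    for idx, record in enumerate(reader, start=1):
        if record:
            parsed.append(record)
        else:
            # pymarc yields None for an unreadable record; the raw bytes stay on
            # reader.current_chunk so they can be set aside for later repair.
            print(f"record {idx} could not be parsed; writing raw chunk to reject file")
            bad.write(reader.current_chunk)
```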
@@ -374,23 +456,29 @@ class MARCImportJob:
         """
         if isinstance(func_or_path, str):
             try:
-                path_parts = func_or_path.rsplit(
+                path_parts = func_or_path.rsplit(".")
                 module_path, func_name = ".".join(path_parts[:-1]), path_parts[-1]
                 module = importlib.import_module(module_path)
                 func = getattr(module, func_name)
             except (ImportError, AttributeError) as e:
-
+                logger.error(
+                    f"Error importing preprocessing function {func_or_path}: {e}. Skipping preprocessing."
+                )
                 return record
         elif callable(func_or_path):
             func = func_or_path
         else:
-
+            logger.warning(
+                f"Invalid preprocessing function: {func_or_path}. Skipping preprocessing."
+            )
             return record
 
         try:
             return func(record)
         except Exception as e:
-
+            logger.error(
+                f"Error applying preprocessing function: {e}. Skipping preprocessing."
+            )
             return record
 
     async def create_batch_payload(self, counter, total_records, is_last) -> dict:
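`apply_marc_record_preprocessing` accepts either a callable or a dotted-path string; the string case is split on `"."` and resolved through `importlib`, falling back to the unmodified record if anything goes wrong. A sketch of that resolution on its own, pointing at one of the preprocessors added later in this release (this assumes the package is installed and importable):

```python
import importlib

func_or_path = "folio_data_import.marc_preprocessors.strip_999_ff_fields"

path_parts = func_or_path.rsplit(".")
module_path, func_name = ".".join(path_parts[:-1]), path_parts[-1]
module = importlib.import_module(module_path)
func = getattr(module, func_name)  # the preprocessor callable to apply to each record
```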
@@ -435,24 +523,26 @@ class MARCImportJob:
             None
         """
         await self.create_folio_import_job()
-        await self.get_import_profile()
         await self.set_job_profile()
         with ExitStack() as stack:
             files = [
                 stack.enter_context(open(file, "rb")) for file in self.current_file
             ]
             total_records = await self.read_total_records(files)
-            with
-
-
-
-
-
-
-
-
-
-
+            with (
+                tqdm(
+                    desc="Imported: ",
+                    total=total_records,
+                    position=1,
+                    disable=self.no_progress,
+                ) as pbar_imported,
+                tqdm(
+                    desc="Sent: ()",
+                    total=total_records,
+                    position=0,
+                    disable=self.no_progress,
+                ) as pbar_sent,
+            ):
                 self.pbar_sent = pbar_sent
                 self.pbar_imported = pbar_imported
                 await self.process_records(files, total_records)
@@ -461,8 +551,8 @@ class MARCImportJob:
                 sleep(1)
             if self.finished:
                 if job_summary := await self.get_job_summary():
-                    job_summary.pop("jobExecutionId")
-                    job_summary.pop("totalErrors")
+                    job_id = job_summary.pop("jobExecutionId", None)
+                    total_errors = job_summary.pop("totalErrors", 0)
                     columns = ["Summary"] + list(job_summary.keys())
                     rows = set()
                     for key in columns[1:]:
@@ -479,17 +569,20 @@ class MARCImportJob:
                     columns = columns[:1] + [
                         " ".join(decamelize(x).split("_")[:-1]) for x in columns[1:]
                     ]
-
+                    logger.info(
                         f"Results for {'file' if len(self.current_file) == 1 else 'files'}: "
                         f"{', '.join([os.path.basename(x.name) for x in self.current_file])}"
                     )
-
-
+                    logger.info(
+                        "\n"
+                        + tabulate.tabulate(
                             table_data, headers=columns, tablefmt="fancy_grid"
                         ),
                     )
+                    if total_errors:
+                        logger.info(f"Total errors: {total_errors}. Job ID: {job_id}.")
                 else:
-
+                    logger.error(f"No job summary available for job {self.job_id}.")
                 self.last_current = 0
                 self.finished = False
 
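The summary handling above now routes the tabulated job report through `logger.info` and reports `totalErrors` and the job ID separately instead of discarding them. For readers unfamiliar with `tabulate`, a hedged illustration of the kind of grid `tablefmt="fancy_grid"` produces (the column names and counts below are invented, not real import results):

```python
import tabulate

columns = ["Summary", "source records", "instances"]
table_data = [
    ["created", 95, 95],
    ["updated", 3, 0],
    ["discarded", 2, 0],
    ["error", 2, 0],
]
print(tabulate.tabulate(table_data, headers=columns, tablefmt="fancy_grid"))
```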
@@ -502,11 +595,12 @@ class MARCImportJob:
         """
         try:
             self.current_retry_timeout = (
-                self.current_retry_timeout * RETRY_TIMEOUT_RETRY_FACTOR
-
+                (self.current_retry_timeout * RETRY_TIMEOUT_RETRY_FACTOR)
+                if self.current_retry_timeout
+                else RETRY_TIMEOUT_START
+            )
             with httpx.Client(
-                timeout=self.current_retry_timeout,
-                verify=self.folio_client.ssl_verify
+                timeout=self.current_retry_timeout, verify=self.folio_client.ssl_verify
             ) as temp_client:
                 self.folio_client.httpx_client = temp_client
                 job_summary = self.folio_client.folio_get(
@@ -514,21 +608,73 @@ class MARCImportJob:
             )
             self.current_retry_timeout = None
         except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
-            if
-
+            error_text = e.response.text if hasattr(e, "response") else str(e)
+            if not hasattr(e, "response") or (
+                e.response.status_code in [502, 504] and not self.let_summary_fail
+            ):
+                logger.warning(f"SERVER ERROR fetching job summary: {e}. Retrying.")
+                sleep(0.25)
                 with httpx.Client(
                     timeout=self.current_retry_timeout,
-                    verify=self.folio_client.ssl_verify
+                    verify=self.folio_client.ssl_verify,
                 ) as temp_client:
                     self.folio_client.httpx_client = temp_client
-                    return await self.
-            elif hasattr(e, "response") and
+                    return await self.get_job_summary()
+            elif hasattr(e, "response") and (
+                e.response.status_code in [502, 504] and self.let_summary_fail
+            ):
+                logger.warning(
+                    f"SERVER ERROR fetching job summary: {error_text}. Skipping final summary check."
+                )
                 job_summary = {}
             else:
                 raise e
         return job_summary
 
 
+def set_up_cli_logging():
+    """
+    This function sets up logging for the CLI.
+    """
+    logger.setLevel(logging.INFO)
+    logger.propagate = False
+
+    # Set up file and stream handlers
+    file_handler = logging.FileHandler(
+        "folio_data_import_{}.log".format(dt.now().strftime("%Y%m%d%H%M%S"))
+    )
+    file_handler.setLevel(logging.INFO)
+    file_handler.addFilter(ExcludeLevelFilter(DATA_ISSUE_LVL_NUM))
+    # file_handler.addFilter(IncludeLevelFilter(25))
+    file_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+    file_handler.setFormatter(file_formatter)
+    logger.addHandler(file_handler)
+
+    if not any(
+        isinstance(h, logging.StreamHandler) and h.stream == sys.stderr
+        for h in logger.handlers
+    ):
+        stream_handler = logging.StreamHandler(sys.stdout)
+        stream_handler.setLevel(logging.INFO)
+        stream_handler.addFilter(ExcludeLevelFilter(DATA_ISSUE_LVL_NUM))
+        # stream_handler.addFilter(ExcludeLevelFilter(25))
+        stream_formatter = logging.Formatter("%(message)s")
+        stream_handler.setFormatter(stream_formatter)
+        logger.addHandler(stream_handler)
+
+    # Set up data issues logging
+    data_issues_handler = logging.FileHandler(
+        "marc_import_data_issues_{}.log".format(dt.now().strftime("%Y%m%d%H%M%S"))
+    )
+    data_issues_handler.setLevel(26)
+    data_issues_formatter = logging.Formatter("%(message)s")
+    data_issues_handler.setFormatter(data_issues_formatter)
+    logger.addHandler(data_issues_handler)
+
+    # Stop httpx from logging info messages to the console
+    logging.getLogger("httpx").setLevel(logging.WARNING)
+
+
 async def main() -> None:
     """
     Main function to run the MARC import job.
@@ -536,6 +682,7 @@ async def main() -> None:
     This function parses command line arguments, initializes the FolioClient,
     and runs the MARCImportJob.
     """
+    set_up_cli_logging()
     parser = argparse.ArgumentParser()
     parser.add_argument("--gateway_url", type=str, help="The FOLIO API Gateway URL")
    parser.add_argument("--tenant_id", type=str, help="The FOLIO tenant ID")
@@ -592,6 +739,11 @@ async def main() -> None:
         action="store_true",
         help="Disable progress bars (eg. for running in a CI environment)",
     )
+    parser.add_argument(
+        "--let-summary-fail",
+        action="store_true",
+        help="Do not retry fetching the final job summary if it fails",
+    )
     args = parser.parse_args()
     if not args.password:
         args.password = getpass("Enter FOLIO password: ")
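Note that the new option is declared as `--let-summary-fail` with dashes, so argparse exposes it on the namespace as `args.let_summary_fail`, which is how the `MARCImportJob` constructor call later in this diff reads it. A minimal sketch of that dash-to-underscore behavior:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--let-summary-fail",
    action="store_true",
    help="Do not retry fetching the final job summary if it fails",
)

args = parser.parse_args(["--let-summary-fail"])
assert args.let_summary_fail is True  # dashes in the flag become underscores on the namespace
```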
@@ -611,10 +763,10 @@ async def main() -> None:
     marc_files.sort()
 
     if len(marc_files) == 0:
-
+        logger.critical(f"No files found matching {args.marc_file_path}. Exiting.")
         sys.exit(1)
     else:
-
+        logger.info(marc_files)
 
     if not args.import_profile_name:
         import_profiles = folio_client.folio_get(
@@ -646,12 +798,31 @@ async def main() -> None:
             marc_record_preprocessor=args.preprocessor,
             consolidate=bool(args.consolidate),
             no_progress=bool(args.no_progress),
+            let_summary_fail=bool(args.let_summary_fail),
         ).do_work()
     except Exception as e:
-
+        logger.error("Error importing files: " + str(e))
         raise
 
 
+class ExcludeLevelFilter(logging.Filter):
+    def __init__(self, level):
+        super().__init__()
+        self.level = level
+
+    def filter(self, record):
+        return record.levelno != self.level
+
+
+class IncludeLevelFilter(logging.Filter):
+    def __init__(self, level):
+        super().__init__()
+        self.level = level
+
+    def filter(self, record):
+        return record.levelno == self.level
+
+
 def sync_main() -> None:
     """
     Synchronous main function to run the MARC import job.
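`set_up_cli_logging` (earlier in this diff) depends on the two filter classes added above: `ExcludeLevelFilter` keeps DATA_ISSUES records out of the main log file and console handlers, while the data-issues file handler only accepts records at level 26 and above; `IncludeLevelFilter` is currently referenced only in commented-out lines. A compact sketch of that routing (handler targets and messages are assumptions):

```python
import logging

DATA_ISSUE_LVL_NUM = 26


class ExcludeLevelFilter(logging.Filter):
    def __init__(self, level):
        super().__init__()
        self.level = level

    def filter(self, record):
        return record.levelno != self.level


log = logging.getLogger("routing_demo")
log.setLevel(logging.INFO)

console = logging.StreamHandler()
console.addFilter(ExcludeLevelFilter(DATA_ISSUE_LVL_NUM))  # hide level-26 records here
data_issues_file = logging.FileHandler("data_issues_demo.log")
data_issues_file.setLevel(DATA_ISSUE_LVL_NUM)  # accept only records at level 26 and above

log.addHandler(console)
log.addHandler(data_issues_file)

log.info("reaches the console only (below the file handler's level)")
log.log(DATA_ISSUE_LVL_NUM, "reaches the data-issues file only (filtered off the console)")
```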
folio_data_import-0.2.8rc8/src/folio_data_import/marc_preprocessors/__init__.py
@@ -0,0 +1 @@
+from ._preprocessors import *
folio_data_import-0.2.8rc8/src/folio_data_import/marc_preprocessors/_preprocessors.py
@@ -0,0 +1,273 @@
+import pymarc
+import logging
+
+logger = logging.getLogger("folio_data_import.MARCDataImport")
+
+
+def prepend_prefix_001(record: pymarc.Record, prefix: str) -> pymarc.Record:
+    """
+    Prepend a prefix to the record's 001 field.
+
+    Args:
+        record (pymarc.Record): The MARC record to preprocess.
+        prefix (str): The prefix to prepend to the 001 field.
+
+    Returns:
+        pymarc.Record: The preprocessed MARC record.
+    """
+    record["001"].data = f"({prefix})" + record["001"].data
+    return record
+
+
+def prepend_ppn_prefix_001(record: pymarc.Record) -> pymarc.Record:
+    """
+    Prepend the PPN prefix to the record's 001 field. Useful when
+    importing records from the ABES SUDOC catalog
+
+    Args:
+        record (pymarc.Record): The MARC record to preprocess.
+
+    Returns:
+        pymarc.Record: The preprocessed MARC record.
+    """
+    return prepend_prefix_001(record, "PPN")
+
+
+def prepend_abes_prefix_001(record: pymarc.Record) -> pymarc.Record:
+    """
+    Prepend the ABES prefix to the record's 001 field. Useful when
+    importing records from the ABES SUDOC catalog
+
+    Args:
+        record (pymarc.Record): The MARC record to preprocess.
+
+    Returns:
+        pymarc.Record: The preprocessed MARC record.
+    """
+    return prepend_prefix_001(record, "ABES")
+
+
+def strip_999_ff_fields(record: pymarc.Record) -> pymarc.Record:
+    """
+    Strip all 999 fields with ff indicators from the record.
+    Useful when importing records exported from another FOLIO system
+
+    Args:
+        record (pymarc.Record): The MARC record to preprocess.
+
+    Returns:
+        pymarc.Record: The preprocessed MARC record.
+    """
+    for field in record.get_fields("999"):
+        if field.indicators == pymarc.Indicators(*["f", "f"]):
+            record.remove_field(field)
+    return record
+
+
+def sudoc_supercede_prep(record: pymarc.Record) -> pymarc.Record:
+    """
+    Preprocesses a record from the ABES SUDOC catalog to copy 035 fields
+    with a $9 subfield value of 'sudoc' to 935 fields with a $a subfield
+    prefixed with "(ABES)". This is useful when importing newly-merged records
+    from the SUDOC catalog when you want the new record to replace the old one
+    in FOLIO. This also applyes the prepend_ppn_prefix_001 function to the record.
+
+    Args:
+        record (pymarc.Record): The MARC record to preprocess.
+
+    Returns:
+        pymarc.Record: The preprocessed MARC record.
+    """
+    record = prepend_abes_prefix_001(record)
+    for field in record.get_fields("035"):
+        if "a" in field and "9" in field and field["9"] == "sudoc":
+            _935 = pymarc.Field(
+                tag="935",
+                indicators=["f", "f"],
+                subfields=[pymarc.field.Subfield("a", "(ABES)" + field["a"])],
+            )
+            record.add_ordered_field(_935)
+    return record
+
+
+def clean_empty_fields(record: pymarc.Record) -> pymarc.Record:
+    """
+    Remove empty fields and subfields from the record. These can cause
+    data import mapping issues in FOLIO. Removals are logged at custom
+    log level 26, which is used by folio_migration_tools to populate the
+    data issues report.
+
+    Args:
+        record (pymarc.Record): The MARC record to preprocess.
+
+    Returns:
+        pymarc.Record: The preprocessed MARC record.
+    """
+    MAPPED_FIELDS = {
+        "010": ["a", "z"],
+        "020": ["a", "y", "z"],
+        "035": ["a", "z"],
+        "040": ["a", "b", "c", "d", "e", "f", "g", "h", "k", "m", "n", "p", "r", "s"],
+        "050": ["a", "b"],
+        "082": ["a", "b"],
+        "100": ["a", "b", "c", "d", "q"],
+        "110": ["a", "b", "c"],
+        "111": ["a", "c", "d"],
+        "130": [
+            "a",
+            "d",
+            "f",
+            "k",
+            "l",
+            "m",
+            "n",
+            "o",
+            "p",
+            "r",
+            "s",
+            "t",
+            "x",
+            "y",
+            "z",
+        ],
+        "180": ["x", "y", "z"],
+        "210": ["a", "c"],
+        "240": ["a", "f", "k", "l", "m", "n", "o", "p", "r", "s", "t", "x", "y", "z"],
+        "245": ["a", "b", "c", "f", "g", "h", "k", "n", "p", "s"],
+        "246": ["a", "f", "g", "n", "p", "s"],
+        "250": ["a", "b"],
+        "260": ["a", "b", "c", "e", "f", "g"],
+        "300": ["a", "b", "c", "e", "f", "g"],
+        "440": ["a", "n", "p", "v", "x", "y", "z"],
+        "490": ["a", "v", "x", "y", "z"],
+        "500": ["a", "c", "d", "n", "p", "v", "x", "y", "z"],
+        "505": ["a", "g", "r", "t", "u"],
+        "520": ["a", "b", "c", "u"],
+        "600": ["a", "b", "c", "d", "q", "t", "v", "x", "y", "z"],
+        "610": ["a", "b", "c", "d", "t", "v", "x", "y", "z"],
+        "611": ["a", "c", "d", "t", "v", "x", "y", "z"],
+        "630": [
+            "a",
+            "d",
+            "f",
+            "k",
+            "l",
+            "m",
+            "n",
+            "o",
+            "p",
+            "r",
+            "s",
+            "t",
+            "x",
+            "y",
+            "z",
+        ],
+        "650": ["a", "d", "v", "x", "y", "z"],
+        "651": ["a", "v", "x", "y", "z"],
+        "655": ["a", "v", "x", "y", "z"],
+        "700": ["a", "b", "c", "d", "q", "t", "v", "x", "y", "z"],
+        "710": ["a", "b", "c", "d", "t", "v", "x", "y", "z"],
+        "711": ["a", "c", "d", "t", "v", "x", "y", "z"],
+        "730": [
+            "a",
+            "d",
+            "f",
+            "k",
+            "l",
+            "m",
+            "n",
+            "o",
+            "p",
+            "r",
+            "s",
+            "t",
+            "x",
+            "y",
+            "z",
+        ],
+        "740": ["a", "n", "p", "v", "x", "y", "z"],
+        "800": ["a", "b", "c", "d", "q", "t", "v", "x", "y", "z"],
+        "810": ["a", "b", "c", "d", "t", "v", "x", "y", "z"],
+        "811": ["a", "c", "d", "t", "v", "x", "y", "z"],
+        "830": [
+            "a",
+            "d",
+            "f",
+            "k",
+            "l",
+            "m",
+            "n",
+            "o",
+            "p",
+            "r",
+            "s",
+            "t",
+            "x",
+            "y",
+            "z",
+        ],
+        "856": ["u", "y", "z"],
+    }
+
+    for field in list(record.get_fields()):
+        len_subs = len(field.subfields)
+        subfield_value = bool(field.subfields[0].value) if len_subs > 0 else False
+        if not int(field.tag) >= 900 and field.tag in MAPPED_FIELDS:
+            if int(field.tag) > 9 and len_subs == 0:
+                logger.log(
+                    26,
+                    "DATA ISSUE\t%s\t%s\t%s",
+                    record["001"].value(),
+                    f"{field.tag} is empty",
+                    field,
+                )
+                record.remove_field(field)
+            elif len_subs == 1 and not subfield_value:
+                logger.log(
+                    26,
+                    "DATA ISSUE\t%s\t%s\t%s",
+                    record["001"].value(),
+                    f"{field.tag}${field.subfields[0].code} is empty, removing field",
+                    field,
+                )
+                record.remove_field(field)
+            else:
+                if len_subs > 1 and "a" in field and not field["a"].strip():
+                    logger.log(
+                        26,
+                        "DATA ISSUE\t%s\t%s\t%s",
+                        record["001"].value(),
+                        f"{field.tag}$a is empty, removing field",
+                        field,
+                    )
+                    field.delete_subfield("a")
+                for idx, subfield in enumerate(list(field.subfields), start=1):
+                    if subfield.code in MAPPED_FIELDS.get(field.tag, []) and not subfield.value:
+                        logger.log(
+                            26,
+                            "DATA ISSUE\t%s\t%s\t%s",
+                            record["001"].value(),
+                            f"{field.tag}${subfield.code} ({ordinal(idx)} subfield) is empty, but other subfields have values, removing subfield",
+                            field,
+                        )
+                        field.delete_subfield(subfield.code)
+                if len(field.subfields) == 0:
+                    logger.log(
+                        26,
+                        "DATA ISSUE\t%s\t%s\t%s",
+                        record["001"].value(),
+                        f"{field.tag} has no non-empty subfields after cleaning, removing field",
+                        field,
+                    )
+                    record.remove_field(field)
+    return record
+
+
+def ordinal(n):
+    s = ("th", "st", "nd", "rd") + ("th",) * 10
+    v = n % 100
+    if v > 13:
+        return f"{n}{s[v % 10]}"
+    else:
+        return f"{n}{s[v]}"
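Every function in the new module takes a `pymarc.Record` and returns it, so preprocessors can be chained directly or handed to the importer as a dotted path (see the `--preprocessor` resolution earlier in this diff). A hedged sketch that builds a small record in memory and runs two of them; it assumes the package is installed, and the record content is invented:

```python
import pymarc
from folio_data_import.marc_preprocessors import clean_empty_fields, prepend_ppn_prefix_001

record = pymarc.Record()
record.add_field(pymarc.Field(tag="001", data="123456789"))
record.add_field(
    pymarc.Field(
        tag="245",
        indicators=["0", "0"],
        subfields=[
            pymarc.field.Subfield("a", "An example title"),
            pymarc.field.Subfield("b", ""),  # deliberately empty subfield
        ],
    )
)

record = prepend_ppn_prefix_001(record)  # 001 becomes "(PPN)123456789"
record = clean_empty_fields(record)      # removes the empty 245$b and logs it at level 26
print(record["001"].data)
```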
folio_data_import-0.2.8rc7/src/folio_data_import/marc_preprocessors/__init__.py
@@ -1 +0,0 @@
-from ._preprocessors import prepend_ppn_prefix_001, strip_999_ff_fields
folio_data_import-0.2.8rc7/src/folio_data_import/marc_preprocessors/_preprocessors.py
@@ -1,84 +0,0 @@
-import pymarc
-
-def prepend_prefix_001(record: pymarc.Record, prefix: str) -> pymarc.Record:
-    """
-    Prepend a prefix to the record's 001 field.
-
-    Args:
-        record (pymarc.Record): The MARC record to preprocess.
-        prefix (str): The prefix to prepend to the 001 field.
-
-    Returns:
-        pymarc.Record: The preprocessed MARC record.
-    """
-    record['001'].data = f'({prefix})' + record['001'].data
-    return record
-
-def prepend_ppn_prefix_001(record: pymarc.Record) -> pymarc.Record:
-    """
-    Prepend the PPN prefix to the record's 001 field. Useful when
-    importing records from the ABES SUDOC catalog
-
-    Args:
-        record (pymarc.Record): The MARC record to preprocess.
-
-    Returns:
-        pymarc.Record: The preprocessed MARC record.
-    """
-    return prepend_prefix_001(record, 'PPN')
-
-def prepend_abes_prefix_001(record: pymarc.Record) -> pymarc.Record:
-    """
-    Prepend the ABES prefix to the record's 001 field. Useful when
-    importing records from the ABES SUDOC catalog
-
-    Args:
-        record (pymarc.Record): The MARC record to preprocess.
-
-    Returns:
-        pymarc.Record: The preprocessed MARC record.
-    """
-    return prepend_prefix_001(record, 'ABES')
-
-def strip_999_ff_fields(record: pymarc.Record) -> pymarc.Record:
-    """
-    Strip all 999 fields with ff indicators from the record.
-    Useful when importing records exported from another FOLIO system
-
-    Args:
-        record (pymarc.Record): The MARC record to preprocess.
-
-    Returns:
-        pymarc.Record: The preprocessed MARC record.
-    """
-    for field in record.get_fields('999'):
-        if field.indicators == pymarc.Indicators(*['f', 'f']):
-            record.remove_field(field)
-    return record
-
-def sudoc_supercede_prep(record: pymarc.Record) -> pymarc.Record:
-    """
-    Preprocesses a record from the ABES SUDOC catalog to copy 035 fields
-    with a $9 subfield value of 'sudoc' to 935 fields with a $a subfield
-    prefixed with "(ABES)". This is useful when importing newly-merged records
-    from the SUDOC catalog when you want the new record to replace the old one
-    in FOLIO. This also applyes the prepend_ppn_prefix_001 function to the record.
-
-    Args:
-        record (pymarc.Record): The MARC record to preprocess.
-
-    Returns:
-        pymarc.Record: The preprocessed MARC record.
-    """
-    record = prepend_abes_prefix_001(record)
-    for field in record.get_fields('035'):
-        if "a" in field and "9" in field and field['9'] == 'sudoc':
-            _935 = pymarc.Field(
-                tag='935',
-                indicators=['f', 'f'],
-                subfields=[
-                    pymarc.field.Subfield('a', "(ABES)" + field['a'])
-                ]
-            )
-            record.add_ordered_field(_935)
-    return record
{folio_data_import-0.2.8rc7 → folio_data_import-0.2.8rc8}/LICENSE RENAMED: file without changes
{folio_data_import-0.2.8rc7 → folio_data_import-0.2.8rc8}/src/folio_data_import/UserImport.py RENAMED: file without changes
{folio_data_import-0.2.8rc7 → folio_data_import-0.2.8rc8}/src/folio_data_import/__init__.py RENAMED: file without changes
{folio_data_import-0.2.8rc7 → folio_data_import-0.2.8rc8}/src/folio_data_import/__main__.py RENAMED: file without changes