folio-data-import 0.2.8rc6__tar.gz → 0.2.8rc8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of folio-data-import might be problematic.
- {folio_data_import-0.2.8rc6 → folio_data_import-0.2.8rc8}/PKG-INFO +3 -3
- {folio_data_import-0.2.8rc6 → folio_data_import-0.2.8rc8}/README.md +2 -2
- {folio_data_import-0.2.8rc6 → folio_data_import-0.2.8rc8}/pyproject.toml +1 -1
- {folio_data_import-0.2.8rc6 → folio_data_import-0.2.8rc8}/src/folio_data_import/MARCDataImport.py +298 -117
- folio_data_import-0.2.8rc8/src/folio_data_import/marc_preprocessors/__init__.py +1 -0
- folio_data_import-0.2.8rc8/src/folio_data_import/marc_preprocessors/_preprocessors.py +273 -0
- folio_data_import-0.2.8rc6/src/folio_data_import/marc_preprocessors/__init__.py +0 -1
- folio_data_import-0.2.8rc6/src/folio_data_import/marc_preprocessors/_preprocessors.py +0 -84
- {folio_data_import-0.2.8rc6 → folio_data_import-0.2.8rc8}/LICENSE +0 -0
- {folio_data_import-0.2.8rc6 → folio_data_import-0.2.8rc8}/src/folio_data_import/UserImport.py +0 -0
- {folio_data_import-0.2.8rc6 → folio_data_import-0.2.8rc8}/src/folio_data_import/__init__.py +0 -0
- {folio_data_import-0.2.8rc6 → folio_data_import-0.2.8rc8}/src/folio_data_import/__main__.py +0 -0
{folio_data_import-0.2.8rc6 → folio_data_import-0.2.8rc8}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: folio_data_import
-Version: 0.2.8rc6
+Version: 0.2.8rc8
 Summary: A python module to interact with the data importing capabilities of the open-source FOLIO ILS
 License: MIT
 Author: Brooks Travis
@@ -108,11 +108,11 @@ Unlike mod-user-import, this importer does not require `externalSystemId` as the
 
 #### Preferred Contact Type Mapping
 
-Another point of departure from the behavior of `mod-user-import` is the handling of `preferredContactTypeId`. This importer will accept either the `"001", "002", "003"...` values stored by […]
+Another point of departure from the behavior of `mod-user-import` is the handling of `preferredContactTypeId`. This importer will accept either the `"001", "002", "003"...` values stored by FOLIO, or the human-friendly strings used by `mod-user-import` (`"mail", "email", "text", "phone", "mobile"`). It will also __*set a customizable default for all users that do not otherwise have a valid value specified*__ (using `--default_preferred_contact_type`), unless a (valid) value is already present in the user record being updated.
 
 #### Field Protection (*experimental*)
 
-This script offers a rudimentary field protection implementation using custom fields. To enable this functionality, create a text custom field that has the field name `protectedFields`. In this field, you […]
+This script offers a rudimentary field protection implementation using custom fields. To enable this functionality, create a text custom field that has the field name `protectedFields`. In this field, you can specify a comma-separated list of User schema field names, using dot-notation for nested fields. This protection should support all standard fields except addresses within `personal.addresses`. If you include `personal.addresses` in a user record, any existing addresses will be replaced by the new values.
 
 ##### Example
 
{folio_data_import-0.2.8rc6 → folio_data_import-0.2.8rc8}/README.md
RENAMED
@@ -78,11 +78,11 @@ Unlike mod-user-import, this importer does not require `externalSystemId` as the
 
 #### Preferred Contact Type Mapping
 
-Another point of departure from the behavior of `mod-user-import` is the handling of `preferredContactTypeId`. This importer will accept either the `"001", "002", "003"...` values stored by […]
+Another point of departure from the behavior of `mod-user-import` is the handling of `preferredContactTypeId`. This importer will accept either the `"001", "002", "003"...` values stored by FOLIO, or the human-friendly strings used by `mod-user-import` (`"mail", "email", "text", "phone", "mobile"`). It will also __*set a customizable default for all users that do not otherwise have a valid value specified*__ (using `--default_preferred_contact_type`), unless a (valid) value is already present in the user record being updated.
 
 #### Field Protection (*experimental*)
 
-This script offers a rudimentary field protection implementation using custom fields. To enable this functionality, create a text custom field that has the field name `protectedFields`. In this field, you […]
+This script offers a rudimentary field protection implementation using custom fields. To enable this functionality, create a text custom field that has the field name `protectedFields`. In this field, you can specify a comma-separated list of User schema field names, using dot-notation for nested fields. This protection should support all standard fields except addresses within `personal.addresses`. If you include `personal.addresses` in a user record, any existing addresses will be replaced by the new values.
 
 ##### Example
 
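The README text changed above describes configuration rather than code. A small, hedged sketch of what the two options might look like in practice (the user data and protected field names below are illustrative, not taken from this release; only `protectedFields`, `--default_preferred_contact_type`, and the accepted contact-type strings come from the README itself):

```python
# Hypothetical user record for the user importer. The text custom field named
# "protectedFields" holds a comma-separated, dot-notation list of User schema
# fields that updates should not overwrite (per the README, personal.addresses
# is not supported for protection).
user_record = {
    "username": "jdoe",
    "preferredContactTypeId": "email",  # human-friendly value accepted by this importer
    "customFields": {
        "protectedFields": "personal.email,personal.lastName,departments"
    },
}

# How such a value decomposes into individual protected field paths:
protected = [f.strip() for f in user_record["customFields"]["protectedFields"].split(",")]
print(protected)  # ['personal.email', 'personal.lastName', 'departments']

# On the command line, a fallback contact type can be supplied with
# --default_preferred_contact_type (for example "email"); it is only applied
# when the incoming record and the existing FOLIO record lack a valid value.
```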
{folio_data_import-0.2.8rc6 → folio_data_import-0.2.8rc8}/pyproject.toml
RENAMED
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "folio_data_import"
-version = "0.2.8rc6"
+version = "0.2.8rc8"
 description = "A python module to interact with the data importing capabilities of the open-source FOLIO ILS"
 authors = ["Brooks Travis <brooks.travis@gmail.com>"]
 license = "MIT"
{folio_data_import-0.2.8rc6 → folio_data_import-0.2.8rc8}/src/folio_data_import/MARCDataImport.py
RENAMED
@@ -1,18 +1,21 @@
 import argparse
 import asyncio
+import datetime
+from email import message
 import glob
 import importlib
 import io
+import logging
 import os
 import sys
-from typing import List
 import uuid
 from contextlib import ExitStack
-import datetime
 from datetime import datetime as dt
+from functools import cached_property
 from getpass import getpass
 from pathlib import Path
 from time import sleep
+from typing import List
 
 import folioclient
 import httpx
@@ -22,7 +25,6 @@ import tabulate
 from humps import decamelize
 from tqdm import tqdm
 
-
 try:
     datetime_utc = datetime.UTC
 except AttributeError:
@@ -36,6 +38,18 @@ REPORT_SUMMARY_ORDERING = {"created": 0, "updated": 1, "discarded": 2, "error":
 RETRY_TIMEOUT_START = 1
 RETRY_TIMEOUT_RETRY_FACTOR = 2
 
+# Custom log level for data issues, set to 26
+DATA_ISSUE_LVL_NUM = 26
+logging.addLevelName(DATA_ISSUE_LVL_NUM, "DATA_ISSUES")
+
+def data_issues(self, msg, *args, **kws):
+    if self.isEnabledFor(DATA_ISSUE_LVL_NUM):
+        self._log(DATA_ISSUE_LVL_NUM, msg, args, **kws)
+
+logging.Logger.data_issues = data_issues
+
+logger = logging.getLogger(__name__)
+
 class MARCImportJob:
     """
     Class to manage importing MARC data (Bib, Authority) into FOLIO using the Change Manager
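The added block registers a custom DATA_ISSUES log level (26) and monkey-patches a `data_issues` convenience method onto `logging.Logger`. A minimal, self-contained sketch of how that hook behaves once installed (handler setup simplified to `basicConfig`; the tab-separated message shape mirrors the `logger.data_issues(...)` call added later in this diff):

```python
import logging

# Same registration as in the diff above.
DATA_ISSUE_LVL_NUM = 26
logging.addLevelName(DATA_ISSUE_LVL_NUM, "DATA_ISSUES")

def data_issues(self, msg, *args, **kws):
    # Only emit if the logger/handlers accept level 26.
    if self.isEnabledFor(DATA_ISSUE_LVL_NUM):
        self._log(DATA_ISSUE_LVL_NUM, msg, args, **kws)

logging.Logger.data_issues = data_issues

logging.basicConfig(level=DATA_ISSUE_LVL_NUM)
logger = logging.getLogger("folio_data_import.MARCDataImport")

# Every Logger instance now exposes the new method at the custom level.
logger.data_issues("RECORD FAILED\t%s\t%s\t%s", "records.mrc:12", "Error reading record", "")
```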
@@ -56,7 +70,6 @@ class MARCImportJob:
     bad_records_file: io.TextIOWrapper
     failed_batches_file: io.TextIOWrapper
     job_id: str
-    job_import_profile: dict
     pbar_sent: tqdm
     pbar_imported: tqdm
     http_client: httpx.Client
@@ -77,9 +90,11 @@ class MARCImportJob:
         marc_record_preprocessor=None,
         consolidate=False,
         no_progress=False,
+        let_summary_fail=False,
     ) -> None:
         self.consolidate_files = consolidate
         self.no_progress = no_progress
+        self.let_summary_fail = let_summary_fail
         self.folio_client: folioclient.FolioClient = folio_client
         self.import_files = marc_files
         self.import_profile_name = import_profile_name
@@ -87,6 +102,8 @@ class MARCImportJob:
         self.batch_delay = batch_delay
         self.current_retry_timeout = None
         self.marc_record_preprocessor = marc_record_preprocessor
+        self.pbar_sent: tqdm
+        self.pbar_imported: tqdm
 
     async def do_work(self) -> None:
         """
@@ -100,21 +117,25 @@ class MARCImportJob:
         Returns:
             None
         """
-        with […]
-        […]
+        with (
+            httpx.Client() as http_client,
+            open(
+                self.import_files[0].parent.joinpath(
+                    f"bad_marc_records_{dt.now(tz=datetime_utc).strftime('%Y%m%d%H%M%S')}.mrc"
+                ),
+                "wb+",
+            ) as bad_marc_file,
+            open(
+                self.import_files[0].parent.joinpath(
+                    f"failed_batches_{dt.now(tz=datetime_utc).strftime('%Y%m%d%H%M%S')}.mrc"
+                ),
+                "wb+",
+            ) as failed_batches,
+        ):
             self.bad_records_file = bad_marc_file
-            […]
+            logger.info(f"Writing bad records to {self.bad_records_file.name}")
             self.failed_batches_file = failed_batches
-            […]
+            logger.info(f"Writing failed batches to {self.failed_batches_file.name}")
             self.http_client = http_client
             if self.consolidate_files:
                 self.current_file = self.import_files
@@ -135,16 +156,16 @@ class MARCImportJob:
         Returns:
             None
         """
-        self.bad_records_file.[…]
-        […]
-        self.failed_batches_file.[…]
-        […]
+        with open(self.bad_records_file.name, "rb") as bad_records:
+            if not bad_records.read(1):
+                os.remove(bad_records.name)
+                logger.info("No bad records found. Removing bad records file.")
+        with open(self.failed_batches_file.name, "rb") as failed_batches:
+            if not failed_batches.read(1):
+                os.remove(failed_batches.name)
+                logger.info("No failed batches. Removing failed batches file.")
+        logger.info("Import complete.")
+        logger.info(f"Total records imported: {self.total_records_sent}")
 
     async def get_job_status(self) -> None:
         """
@@ -158,21 +179,28 @@ class MARCImportJob:
         """
         try:
             self.current_retry_timeout = (
-                self.current_retry_timeout * RETRY_TIMEOUT_RETRY_FACTOR
-                […]
+                (self.current_retry_timeout * RETRY_TIMEOUT_RETRY_FACTOR)
+                if self.current_retry_timeout
+                else RETRY_TIMEOUT_START
+            )
             job_status = self.folio_client.folio_get(
                 "/metadata-provider/jobExecutions?statusNot=DISCARDED&uiStatusAny"
                 "=PREPARING_FOR_PREVIEW&uiStatusAny=READY_FOR_PREVIEW&uiStatusAny=RUNNING&limit=50"
             )
             self.current_retry_timeout = None
-        except (httpx.ConnectTimeout, httpx.ReadTimeout):
-            […]
+        except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
+            if not hasattr(e, "response") or e.response.status_code in [502, 504]:
+                error_text = e.response.text if hasattr(e, "response") else str(e)
+                logger.warning(f"SERVER ERROR fetching job status: {error_text}. Retrying.")
+                sleep(0.25)
+                with httpx.Client(
+                    timeout=self.current_retry_timeout,
+                    verify=self.folio_client.ssl_verify,
+                ) as temp_client:
+                    self.folio_client.httpx_client = temp_client
+                    return await self.get_job_status()
+            else:
+                raise e
         try:
             status = [
                 job for job in job_status["jobExecutions"] if job["id"] == self.job_id
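The `current_retry_timeout` handling above is a simple exponential backoff for the retried status request: the timeout starts at `RETRY_TIMEOUT_START`, is multiplied by `RETRY_TIMEOUT_RETRY_FACTOR` on each retry, and is reset to `None` after a successful call. A standalone sketch of that progression (constants copied from this diff; the loop and print are illustrative only):

```python
RETRY_TIMEOUT_START = 1
RETRY_TIMEOUT_RETRY_FACTOR = 2

timeout = None  # reset to None after each successful request
for attempt in range(1, 5):
    timeout = (timeout * RETRY_TIMEOUT_RETRY_FACTOR) if timeout else RETRY_TIMEOUT_START
    print(f"attempt {attempt}: httpx timeout = {timeout}s")  # 1, 2, 4, 8
```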
@@ -180,16 +208,32 @@ class MARCImportJob:
             self.pbar_imported.update(status["progress"]["current"] - self.last_current)
             self.last_current = status["progress"]["current"]
         except IndexError:
-            […]
+            try:
+                job_status = self.folio_client.folio_get(
+                    "/metadata-provider/jobExecutions?limit=100&sortBy=completed_date%2Cdesc&statusAny"
+                    "=COMMITTED&statusAny=ERROR&statusAny=CANCELLED"
+                )
+                status = [
+                    job for job in job_status["jobExecutions"] if job["id"] == self.job_id
+                ][0]
+                self.pbar_imported.update(status["progress"]["current"] - self.last_current)
+                self.last_current = status["progress"]["current"]
+                self.finished = True
+            except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
+                if not hasattr(e, "response") or e.response.status_code in [502, 504]:
+                    error_text = e.response.text if hasattr(e, "response") else str(e)
+                    logger.warning(
+                        f"SERVER ERROR fetching job status: {error_text}. Retrying."
+                    )
+                    sleep(0.25)
+                    with httpx.Client(
+                        timeout=self.current_retry_timeout,
+                        verify=self.folio_client.ssl_verify,
+                    ) as temp_client:
+                        self.folio_client.httpx_client = temp_client
+                        return await self.get_job_status()
+                else:
+                    raise e
 
     async def create_folio_import_job(self) -> None:
         """
@@ -209,7 +253,7 @@ class MARCImportJob:
         try:
             create_job.raise_for_status()
         except httpx.HTTPError as e:
-            […]
+            logger.error(
                 "Error creating job: "
                 + str(e)
                 + "\n"
@@ -217,10 +261,15 @@ class MARCImportJob:
             )
             raise e
         self.job_id = create_job.json()["parentJobExecutionId"]
+        logger.info("Created job: " + self.job_id)
 
-    […]
+    @cached_property
+    def import_profile(self) -> dict:
         """
-        […]
+        Returns the import profile for the current job execution.
+
+        Returns:
+            dict: The import profile for the current job execution.
         """
         import_profiles = self.folio_client.folio_get(
             "/data-import-profiles/jobProfiles",
@@ -232,7 +281,7 @@ class MARCImportJob:
             for profile in import_profiles
             if profile["name"] == self.import_profile_name
         ][0]
-        […]
+        return profile
 
     async def set_job_profile(self) -> None:
         """
@@ -248,15 +297,15 @@ class MARCImportJob:
             + "/jobProfile",
             headers=self.folio_client.okapi_headers,
             json={
-                "id": self.[…]
-                "name": self.[…]
+                "id": self.import_profile["id"],
+                "name": self.import_profile["name"],
                 "dataType": "MARC",
             },
         )
         try:
             set_job_profile.raise_for_status()
         except httpx.HTTPError as e:
-            […]
+            logger.error(
                 "Error creating job: "
                 + str(e)
                 + "\n"
@@ -298,8 +347,13 @@ class MARCImportJob:
                 headers=self.folio_client.okapi_headers,
                 json=batch_payload,
             )
-            […]
-            […]
+            # if batch_payload["recordsMetadata"]["last"]:
+            #     logger.log(
+            #         25,
+            #         f"Sending last batch of {batch_payload['recordsMetadata']['total']} records.",
+            #     )
+        except (httpx.ConnectTimeout, httpx.ReadTimeout):
+            sleep(0.25)
             return await self.process_record_batch(batch_payload)
         try:
             post_batch.raise_for_status()
@@ -307,12 +361,14 @@ class MARCImportJob:
             self.record_batch = []
             self.pbar_sent.update(len(batch_payload["initialRecords"]))
         except Exception as e:
-            if […]
+            if (
+                hasattr(e, "response") and e.response.status_code in [500, 422]
+            ):  # TODO: #26 Check for specific error code once https://folio-org.atlassian.net/browse/MODSOURMAN-1281 is resolved
                 self.total_records_sent += len(self.record_batch)
                 self.record_batch = []
                 self.pbar_sent.update(len(batch_payload["initialRecords"]))
             else:
-                […]
+                logger.error("Error posting batch: " + str(e))
                 for record in self.record_batch:
                     self.failed_batches_file.write(record)
                 self.error_records += len(self.record_batch)
@@ -334,14 +390,20 @@ class MARCImportJob:
         """
         counter = 0
         for import_file in files:
+            file_path = Path(import_file.name)
             self.pbar_sent.set_description(
                 f"Sent ({os.path.basename(import_file.name)}): "
             )
             reader = pymarc.MARCReader(import_file, hide_utf8_warnings=True)
-            for record in reader:
+            for idx, record in enumerate(reader, start=1):
                 if len(self.record_batch) == self.batch_size:
                     await self.process_record_batch(
-                        await self.create_batch_payload([…]
+                        await self.create_batch_payload(
+                            counter,
+                            total_records,
+                            (counter - self.error_records)
+                            == (total_records - self.error_records),
+                        ),
                     )
                     await self.get_job_status()
                     sleep(0.25)
@@ -353,14 +415,35 @@ class MARCImportJob:
                     self.record_batch.append(record.as_marc())
                     counter += 1
                 else:
+                    logger.data_issues(
+                        "RECORD FAILED\t%s\t%s\t%s",
+                        f"{file_path.name}:{idx}",
+                        f"Error reading {idx} record from {file_path}. Skipping. Writing current chunk to {self.bad_records_file.name}.",
+                        "",
+                    )
                     self.bad_records_file.write(reader.current_chunk)
             if self.record_batch:
                 await self.process_record_batch(
-                    await self.create_batch_payload([…]
+                    await self.create_batch_payload(
+                        counter,
+                        total_records,
+                        (counter - self.error_records)
+                        == (total_records - self.error_records),
+                    ),
                 )
+            import_complete_path = file_path.parent.joinpath("import_complete")
+            if import_complete_path.exists():
+                logger.debug(f"Creating import_complete directory: {import_complete_path.absolute()}")
+                import_complete_path.mkdir(exist_ok=True)
+            logger.debug(f"Moving {file_path} to {import_complete_path.absolute()}")
+            file_path.rename(
+                file_path.parent.joinpath("import_complete", file_path.name)
+            )
 
     @staticmethod
-    async def apply_marc_record_preprocessing([…]
+    async def apply_marc_record_preprocessing(
+        record: pymarc.Record, func_or_path
+    ) -> pymarc.Record:
         """
         Apply preprocessing to the MARC record before sending it to FOLIO.
 
@@ -373,23 +456,29 @@ class MARCImportJob:
         """
         if isinstance(func_or_path, str):
            try:
-                path_parts = func_or_path.rsplit([…]
+                path_parts = func_or_path.rsplit(".")
                 module_path, func_name = ".".join(path_parts[:-1]), path_parts[-1]
                 module = importlib.import_module(module_path)
                 func = getattr(module, func_name)
             except (ImportError, AttributeError) as e:
-                […]
+                logger.error(
+                    f"Error importing preprocessing function {func_or_path}: {e}. Skipping preprocessing."
+                )
                 return record
         elif callable(func_or_path):
             func = func_or_path
         else:
-            […]
+            logger.warning(
+                f"Invalid preprocessing function: {func_or_path}. Skipping preprocessing."
+            )
             return record
 
         try:
             return func(record)
         except Exception as e:
-            […]
+            logger.error(
+                f"Error applying preprocessing function: {e}. Skipping preprocessing."
+            )
             return record
 
     async def create_batch_payload(self, counter, total_records, is_last) -> dict:
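As the hunk above shows, `apply_marc_record_preprocessing` accepts either a callable or a dotted import path and falls back to returning the record unchanged if the function cannot be loaded or raises. A hedged usage sketch under the assumption that the bundled preprocessors are re-exported from `folio_data_import.marc_preprocessors` (as the new `__init__.py` in this diff suggests); the `add_local_note` function and the 990 field are hypothetical:

```python
import asyncio

import pymarc
from folio_data_import.MARCDataImport import MARCImportJob


def add_local_note(record: pymarc.Record) -> pymarc.Record:
    # Hypothetical custom preprocessor: tag every record with a local 990 field.
    record.add_ordered_field(
        pymarc.Field(
            tag="990",
            indicators=[" ", " "],
            subfields=[pymarc.field.Subfield("a", "loaded-by-folio-data-import")],
        )
    )
    return record


record = pymarc.Record()
record.add_field(pymarc.Field(tag="001", data="12345"))

# Pass a callable directly...
record = asyncio.run(MARCImportJob.apply_marc_record_preprocessing(record, add_local_note))

# ...or a dotted path, which is how the --preprocessor CLI option is expected to resolve
# one of the bundled functions from this diff.
record = asyncio.run(
    MARCImportJob.apply_marc_record_preprocessing(
        record, "folio_data_import.marc_preprocessors.prepend_ppn_prefix_001"
    )
)
print(record["001"].data)  # "(PPN)12345"
```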
@@ -434,24 +523,26 @@ class MARCImportJob:
             None
         """
         await self.create_folio_import_job()
-        await self.get_import_profile()
         await self.set_job_profile()
         with ExitStack() as stack:
             files = [
                 stack.enter_context(open(file, "rb")) for file in self.current_file
             ]
             total_records = await self.read_total_records(files)
-            with […]
-            […]
+            with (
+                tqdm(
+                    desc="Imported: ",
+                    total=total_records,
+                    position=1,
+                    disable=self.no_progress,
+                ) as pbar_imported,
+                tqdm(
+                    desc="Sent: ()",
+                    total=total_records,
+                    position=0,
+                    disable=self.no_progress,
+                ) as pbar_sent,
+            ):
                 self.pbar_sent = pbar_sent
                 self.pbar_imported = pbar_imported
                 await self.process_records(files, total_records)
@@ -459,34 +550,39 @@ class MARCImportJob:
                 await self.get_job_status()
                 sleep(1)
             if self.finished:
-                job_summary […]
-                […]
+                if job_summary := await self.get_job_summary():
+                    job_id = job_summary.pop("jobExecutionId", None)
+                    total_errors = job_summary.pop("totalErrors", 0)
+                    columns = ["Summary"] + list(job_summary.keys())
+                    rows = set()
+                    for key in columns[1:]:
+                        rows.update(job_summary[key].keys())
+
+                    table_data = []
+                    for row in rows:
+                        metric_name = decamelize(row).split("_")[1]
+                        table_row = [metric_name]
+                        for col in columns[1:]:
+                            table_row.append(job_summary[col].get(row, "N/A"))
+                        table_data.append(table_row)
+                    table_data.sort(key=lambda x: REPORT_SUMMARY_ORDERING.get(x[0], 99))
+                    columns = columns[:1] + [
+                        " ".join(decamelize(x).split("_")[:-1]) for x in columns[1:]
+                    ]
+                    logger.info(
+                        f"Results for {'file' if len(self.current_file) == 1 else 'files'}: "
+                        f"{', '.join([os.path.basename(x.name) for x in self.current_file])}"
+                    )
+                    logger.info(
+                        "\n"
+                        + tabulate.tabulate(
+                            table_data, headers=columns, tablefmt="fancy_grid"
+                        ),
+                    )
+                    if total_errors:
+                        logger.info(f"Total errors: {total_errors}. Job ID: {job_id}.")
+                else:
+                    logger.error(f"No job summary available for job {self.job_id}.")
                 self.last_current = 0
                 self.finished = False
@@ -499,26 +595,86 @@ class MARCImportJob:
         """
         try:
             self.current_retry_timeout = (
-                self.current_retry_timeout * RETRY_TIMEOUT_RETRY_FACTOR
-                […]
-                […]
-                f"/metadata-provider/jobSummary/{self.job_id}"
+                (self.current_retry_timeout * RETRY_TIMEOUT_RETRY_FACTOR)
+                if self.current_retry_timeout
+                else RETRY_TIMEOUT_START
             )
+            with httpx.Client(
+                timeout=self.current_retry_timeout, verify=self.folio_client.ssl_verify
+            ) as temp_client:
+                self.folio_client.httpx_client = temp_client
+                job_summary = self.folio_client.folio_get(
+                    f"/metadata-provider/jobSummary/{self.job_id}"
+                )
             self.current_retry_timeout = None
         except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
-            if […]
-            […]
+            error_text = e.response.text if hasattr(e, "response") else str(e)
+            if not hasattr(e, "response") or (
+                e.response.status_code in [502, 504] and not self.let_summary_fail
+            ):
+                logger.warning(f"SERVER ERROR fetching job summary: {e}. Retrying.")
+                sleep(0.25)
                 with httpx.Client(
                     timeout=self.current_retry_timeout,
-                    verify=self.folio_client.ssl_verify
+                    verify=self.folio_client.ssl_verify,
                 ) as temp_client:
                     self.folio_client.httpx_client = temp_client
-                    return await self.[…]
+                    return await self.get_job_summary()
+            elif hasattr(e, "response") and (
+                e.response.status_code in [502, 504] and self.let_summary_fail
+            ):
+                logger.warning(
+                    f"SERVER ERROR fetching job summary: {error_text}. Skipping final summary check."
+                )
+                job_summary = {}
             else:
                 raise e
         return job_summary
 
 
+def set_up_cli_logging():
+    """
+    This function sets up logging for the CLI.
+    """
+    logger.setLevel(logging.INFO)
+    logger.propagate = False
+
+    # Set up file and stream handlers
+    file_handler = logging.FileHandler(
+        "folio_data_import_{}.log".format(dt.now().strftime("%Y%m%d%H%M%S"))
+    )
+    file_handler.setLevel(logging.INFO)
+    file_handler.addFilter(ExcludeLevelFilter(DATA_ISSUE_LVL_NUM))
+    # file_handler.addFilter(IncludeLevelFilter(25))
+    file_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+    file_handler.setFormatter(file_formatter)
+    logger.addHandler(file_handler)
+
+    if not any(
+        isinstance(h, logging.StreamHandler) and h.stream == sys.stderr
+        for h in logger.handlers
+    ):
+        stream_handler = logging.StreamHandler(sys.stdout)
+        stream_handler.setLevel(logging.INFO)
+        stream_handler.addFilter(ExcludeLevelFilter(DATA_ISSUE_LVL_NUM))
+        # stream_handler.addFilter(ExcludeLevelFilter(25))
+        stream_formatter = logging.Formatter("%(message)s")
+        stream_handler.setFormatter(stream_formatter)
+        logger.addHandler(stream_handler)
+
+    # Set up data issues logging
+    data_issues_handler = logging.FileHandler(
+        "marc_import_data_issues_{}.log".format(dt.now().strftime("%Y%m%d%H%M%S"))
+    )
+    data_issues_handler.setLevel(26)
+    data_issues_formatter = logging.Formatter("%(message)s")
+    data_issues_handler.setFormatter(data_issues_formatter)
+    logger.addHandler(data_issues_handler)
+
+    # Stop httpx from logging info messages to the console
+    logging.getLogger("httpx").setLevel(logging.WARNING)
+
+
 async def main() -> None:
     """
     Main function to run the MARC import job.
@@ -526,6 +682,7 @@ async def main() -> None:
     This function parses command line arguments, initializes the FolioClient,
     and runs the MARCImportJob.
     """
+    set_up_cli_logging()
     parser = argparse.ArgumentParser()
     parser.add_argument("--gateway_url", type=str, help="The FOLIO API Gateway URL")
     parser.add_argument("--tenant_id", type=str, help="The FOLIO tenant ID")
@@ -582,6 +739,11 @@ async def main() -> None:
         action="store_true",
         help="Disable progress bars (eg. for running in a CI environment)",
     )
+    parser.add_argument(
+        "--let-summary-fail",
+        action="store_true",
+        help="Do not retry fetching the final job summary if it fails",
+    )
     args = parser.parse_args()
     if not args.password:
         args.password = getpass("Enter FOLIO password: ")
@@ -601,10 +763,10 @@ async def main() -> None:
     marc_files.sort()
 
     if len(marc_files) == 0:
-        […]
+        logger.critical(f"No files found matching {args.marc_file_path}. Exiting.")
         sys.exit(1)
     else:
-        […]
+        logger.info(marc_files)
 
     if not args.import_profile_name:
         import_profiles = folio_client.folio_get(
@@ -636,12 +798,31 @@ async def main() -> None:
             marc_record_preprocessor=args.preprocessor,
             consolidate=bool(args.consolidate),
             no_progress=bool(args.no_progress),
+            let_summary_fail=bool(args.let_summary_fail),
         ).do_work()
     except Exception as e:
-        […]
+        logger.error("Error importing files: " + str(e))
        raise
 
 
+class ExcludeLevelFilter(logging.Filter):
+    def __init__(self, level):
+        super().__init__()
+        self.level = level
+
+    def filter(self, record):
+        return record.levelno != self.level
+
+
+class IncludeLevelFilter(logging.Filter):
+    def __init__(self, level):
+        super().__init__()
+        self.level = level
+
+    def filter(self, record):
+        return record.levelno == self.level
+
+
 def sync_main() -> None:
     """
     Synchronous main function to run the MARC import job.
folio_data_import-0.2.8rc8/src/folio_data_import/marc_preprocessors/__init__.py
ADDED
@@ -0,0 +1 @@
+from ._preprocessors import *
folio_data_import-0.2.8rc8/src/folio_data_import/marc_preprocessors/_preprocessors.py
ADDED
@@ -0,0 +1,273 @@
+import pymarc
+import logging
+
+logger = logging.getLogger("folio_data_import.MARCDataImport")
+
+
+def prepend_prefix_001(record: pymarc.Record, prefix: str) -> pymarc.Record:
+    """
+    Prepend a prefix to the record's 001 field.
+
+    Args:
+        record (pymarc.Record): The MARC record to preprocess.
+        prefix (str): The prefix to prepend to the 001 field.
+
+    Returns:
+        pymarc.Record: The preprocessed MARC record.
+    """
+    record["001"].data = f"({prefix})" + record["001"].data
+    return record
+
+
+def prepend_ppn_prefix_001(record: pymarc.Record) -> pymarc.Record:
+    """
+    Prepend the PPN prefix to the record's 001 field. Useful when
+    importing records from the ABES SUDOC catalog
+
+    Args:
+        record (pymarc.Record): The MARC record to preprocess.
+
+    Returns:
+        pymarc.Record: The preprocessed MARC record.
+    """
+    return prepend_prefix_001(record, "PPN")
+
+
+def prepend_abes_prefix_001(record: pymarc.Record) -> pymarc.Record:
+    """
+    Prepend the ABES prefix to the record's 001 field. Useful when
+    importing records from the ABES SUDOC catalog
+
+    Args:
+        record (pymarc.Record): The MARC record to preprocess.
+
+    Returns:
+        pymarc.Record: The preprocessed MARC record.
+    """
+    return prepend_prefix_001(record, "ABES")
+
+
+def strip_999_ff_fields(record: pymarc.Record) -> pymarc.Record:
+    """
+    Strip all 999 fields with ff indicators from the record.
+    Useful when importing records exported from another FOLIO system
+
+    Args:
+        record (pymarc.Record): The MARC record to preprocess.
+
+    Returns:
+        pymarc.Record: The preprocessed MARC record.
+    """
+    for field in record.get_fields("999"):
+        if field.indicators == pymarc.Indicators(*["f", "f"]):
+            record.remove_field(field)
+    return record
+
+
+def sudoc_supercede_prep(record: pymarc.Record) -> pymarc.Record:
+    """
+    Preprocesses a record from the ABES SUDOC catalog to copy 035 fields
+    with a $9 subfield value of 'sudoc' to 935 fields with a $a subfield
+    prefixed with "(ABES)". This is useful when importing newly-merged records
+    from the SUDOC catalog when you want the new record to replace the old one
+    in FOLIO. This also applyes the prepend_ppn_prefix_001 function to the record.
+
+    Args:
+        record (pymarc.Record): The MARC record to preprocess.
+
+    Returns:
+        pymarc.Record: The preprocessed MARC record.
+    """
+    record = prepend_abes_prefix_001(record)
+    for field in record.get_fields("035"):
+        if "a" in field and "9" in field and field["9"] == "sudoc":
+            _935 = pymarc.Field(
+                tag="935",
+                indicators=["f", "f"],
+                subfields=[pymarc.field.Subfield("a", "(ABES)" + field["a"])],
+            )
+            record.add_ordered_field(_935)
+    return record
+
+
+def clean_empty_fields(record: pymarc.Record) -> pymarc.Record:
+    """
+    Remove empty fields and subfields from the record. These can cause
+    data import mapping issues in FOLIO. Removals are logged at custom
+    log level 26, which is used by folio_migration_tools to populate the
+    data issues report.
+
+    Args:
+        record (pymarc.Record): The MARC record to preprocess.
+
+    Returns:
+        pymarc.Record: The preprocessed MARC record.
+    """
+    MAPPED_FIELDS = {
+        "010": ["a", "z"],
+        "020": ["a", "y", "z"],
+        "035": ["a", "z"],
+        "040": ["a", "b", "c", "d", "e", "f", "g", "h", "k", "m", "n", "p", "r", "s"],
+        "050": ["a", "b"],
+        "082": ["a", "b"],
+        "100": ["a", "b", "c", "d", "q"],
+        "110": ["a", "b", "c"],
+        "111": ["a", "c", "d"],
+        "130": [
+            "a",
+            "d",
+            "f",
+            "k",
+            "l",
+            "m",
+            "n",
+            "o",
+            "p",
+            "r",
+            "s",
+            "t",
+            "x",
+            "y",
+            "z",
+        ],
+        "180": ["x", "y", "z"],
+        "210": ["a", "c"],
+        "240": ["a", "f", "k", "l", "m", "n", "o", "p", "r", "s", "t", "x", "y", "z"],
+        "245": ["a", "b", "c", "f", "g", "h", "k", "n", "p", "s"],
+        "246": ["a", "f", "g", "n", "p", "s"],
+        "250": ["a", "b"],
+        "260": ["a", "b", "c", "e", "f", "g"],
+        "300": ["a", "b", "c", "e", "f", "g"],
+        "440": ["a", "n", "p", "v", "x", "y", "z"],
+        "490": ["a", "v", "x", "y", "z"],
+        "500": ["a", "c", "d", "n", "p", "v", "x", "y", "z"],
+        "505": ["a", "g", "r", "t", "u"],
+        "520": ["a", "b", "c", "u"],
+        "600": ["a", "b", "c", "d", "q", "t", "v", "x", "y", "z"],
+        "610": ["a", "b", "c", "d", "t", "v", "x", "y", "z"],
+        "611": ["a", "c", "d", "t", "v", "x", "y", "z"],
+        "630": [
+            "a",
+            "d",
+            "f",
+            "k",
+            "l",
+            "m",
+            "n",
+            "o",
+            "p",
+            "r",
+            "s",
+            "t",
+            "x",
+            "y",
+            "z",
+        ],
+        "650": ["a", "d", "v", "x", "y", "z"],
+        "651": ["a", "v", "x", "y", "z"],
+        "655": ["a", "v", "x", "y", "z"],
+        "700": ["a", "b", "c", "d", "q", "t", "v", "x", "y", "z"],
+        "710": ["a", "b", "c", "d", "t", "v", "x", "y", "z"],
+        "711": ["a", "c", "d", "t", "v", "x", "y", "z"],
+        "730": [
+            "a",
+            "d",
+            "f",
+            "k",
+            "l",
+            "m",
+            "n",
+            "o",
+            "p",
+            "r",
+            "s",
+            "t",
+            "x",
+            "y",
+            "z",
+        ],
+        "740": ["a", "n", "p", "v", "x", "y", "z"],
+        "800": ["a", "b", "c", "d", "q", "t", "v", "x", "y", "z"],
+        "810": ["a", "b", "c", "d", "t", "v", "x", "y", "z"],
+        "811": ["a", "c", "d", "t", "v", "x", "y", "z"],
+        "830": [
+            "a",
+            "d",
+            "f",
+            "k",
+            "l",
+            "m",
+            "n",
+            "o",
+            "p",
+            "r",
+            "s",
+            "t",
+            "x",
+            "y",
+            "z",
+        ],
+        "856": ["u", "y", "z"],
+    }
+
+    for field in list(record.get_fields()):
+        len_subs = len(field.subfields)
+        subfield_value = bool(field.subfields[0].value) if len_subs > 0 else False
+        if not int(field.tag) >= 900 and field.tag in MAPPED_FIELDS:
+            if int(field.tag) > 9 and len_subs == 0:
+                logger.log(
+                    26,
+                    "DATA ISSUE\t%s\t%s\t%s",
+                    record["001"].value(),
+                    f"{field.tag} is empty",
+                    field,
+                )
+                record.remove_field(field)
+            elif len_subs == 1 and not subfield_value:
+                logger.log(
+                    26,
+                    "DATA ISSUE\t%s\t%s\t%s",
+                    record["001"].value(),
+                    f"{field.tag}${field.subfields[0].code} is empty, removing field",
+                    field,
+                )
+                record.remove_field(field)
+            else:
+                if len_subs > 1 and "a" in field and not field["a"].strip():
+                    logger.log(
+                        26,
+                        "DATA ISSUE\t%s\t%s\t%s",
+                        record["001"].value(),
+                        f"{field.tag}$a is empty, removing field",
+                        field,
+                    )
+                    field.delete_subfield("a")
+                for idx, subfield in enumerate(list(field.subfields), start=1):
+                    if subfield.code in MAPPED_FIELDS.get(field.tag, []) and not subfield.value:
+                        logger.log(
+                            26,
+                            "DATA ISSUE\t%s\t%s\t%s",
+                            record["001"].value(),
+                            f"{field.tag}${subfield.code} ({ordinal(idx)} subfield) is empty, but other subfields have values, removing subfield",
+                            field,
+                        )
+                        field.delete_subfield(subfield.code)
+                if len(field.subfields) == 0:
+                    logger.log(
+                        26,
+                        "DATA ISSUE\t%s\t%s\t%s",
+                        record["001"].value(),
+                        f"{field.tag} has no non-empty subfields after cleaning, removing field",
+                        field,
+                    )
+                    record.remove_field(field)
+    return record
+
+
+def ordinal(n):
+    s = ("th", "st", "nd", "rd") + ("th",) * 10
+    v = n % 100
+    if v > 13:
+        return f"{n}{s[v % 10]}"
+    else:
+        return f"{n}{s[v]}"
folio_data_import-0.2.8rc6/src/folio_data_import/marc_preprocessors/__init__.py
REMOVED
@@ -1 +0,0 @@
-from ._preprocessors import prepend_ppn_prefix_001, strip_999_ff_fields
folio_data_import-0.2.8rc6/src/folio_data_import/marc_preprocessors/_preprocessors.py
REMOVED
@@ -1,84 +0,0 @@
-import pymarc
-
-def prepend_prefix_001(record: pymarc.Record, prefix: str) -> pymarc.Record:
-    """
-    Prepend a prefix to the record's 001 field.
-
-    Args:
-        record (pymarc.Record): The MARC record to preprocess.
-        prefix (str): The prefix to prepend to the 001 field.
-
-    Returns:
-        pymarc.Record: The preprocessed MARC record.
-    """
-    record['001'].data = f'({prefix})' + record['001'].data
-    return record
-
-def prepend_ppn_prefix_001(record: pymarc.Record) -> pymarc.Record:
-    """
-    Prepend the PPN prefix to the record's 001 field. Useful when
-    importing records from the ABES SUDOC catalog
-
-    Args:
-        record (pymarc.Record): The MARC record to preprocess.
-
-    Returns:
-        pymarc.Record: The preprocessed MARC record.
-    """
-    return prepend_prefix_001(record, 'PPN')
-
-def prepend_abes_prefix_001(record: pymarc.Record) -> pymarc.Record:
-    """
-    Prepend the ABES prefix to the record's 001 field. Useful when
-    importing records from the ABES SUDOC catalog
-
-    Args:
-        record (pymarc.Record): The MARC record to preprocess.
-
-    Returns:
-        pymarc.Record: The preprocessed MARC record.
-    """
-    return prepend_prefix_001(record, 'ABES')
-
-def strip_999_ff_fields(record: pymarc.Record) -> pymarc.Record:
-    """
-    Strip all 999 fields with ff indicators from the record.
-    Useful when importing records exported from another FOLIO system
-
-    Args:
-        record (pymarc.Record): The MARC record to preprocess.
-
-    Returns:
-        pymarc.Record: The preprocessed MARC record.
-    """
-    for field in record.get_fields('999'):
-        if field.indicators == pymarc.Indicators(*['f', 'f']):
-            record.remove_field(field)
-    return record
-
-def sudoc_supercede_prep(record: pymarc.Record) -> pymarc.Record:
-    """
-    Preprocesses a record from the ABES SUDOC catalog to copy 035 fields
-    with a $9 subfield value of 'sudoc' to 935 fields with a $a subfield
-    prefixed with "(ABES)". This is useful when importing newly-merged records
-    from the SUDOC catalog when you want the new record to replace the old one
-    in FOLIO. This also applyes the prepend_ppn_prefix_001 function to the record.
-
-    Args:
-        record (pymarc.Record): The MARC record to preprocess.
-
-    Returns:
-        pymarc.Record: The preprocessed MARC record.
-    """
-    record = prepend_abes_prefix_001(record)
-    for field in record.get_fields('035'):
-        if "a" in field and "9" in field and field['9'] == 'sudoc':
-            _935 = pymarc.Field(
-                tag='935',
-                indicators=['f', 'f'],
-                subfields=[
-                    pymarc.field.Subfield('a', "(ABES)" + field['a'])
-                ]
-            )
-            record.add_ordered_field(_935)
-    return record
{folio_data_import-0.2.8rc6 → folio_data_import-0.2.8rc8}/LICENSE
RENAMED
File without changes
{folio_data_import-0.2.8rc6 → folio_data_import-0.2.8rc8}/src/folio_data_import/UserImport.py
RENAMED
File without changes
{folio_data_import-0.2.8rc6 → folio_data_import-0.2.8rc8}/src/folio_data_import/__init__.py
RENAMED
File without changes
{folio_data_import-0.2.8rc6 → folio_data_import-0.2.8rc8}/src/folio_data_import/__main__.py
RENAMED
File without changes