folio_data_import-0.5.0b3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- folio_data_import/BatchPoster.py +1265 -0
- folio_data_import/MARCDataImport.py +1252 -0
- folio_data_import/UserImport.py +1270 -0
- folio_data_import/__init__.py +31 -0
- folio_data_import/__main__.py +14 -0
- folio_data_import/_progress.py +737 -0
- folio_data_import/custom_exceptions.py +35 -0
- folio_data_import/marc_preprocessors/__init__.py +29 -0
- folio_data_import/marc_preprocessors/_preprocessors.py +517 -0
- folio_data_import-0.5.0b3.dist-info/METADATA +467 -0
- folio_data_import-0.5.0b3.dist-info/RECORD +13 -0
- folio_data_import-0.5.0b3.dist-info/WHEEL +4 -0
- folio_data_import-0.5.0b3.dist-info/entry_points.txt +6 -0

folio_data_import/MARCDataImport.py
@@ -0,0 +1,1252 @@
+import asyncio
+import datetime
+import glob
+import io
+import json
+import logging
+import math
+import os
+import sys
+import uuid
+from contextlib import ExitStack
+from datetime import datetime as dt
+from functools import cached_property
+from pathlib import Path
+from time import sleep
+from typing import Annotated, BinaryIO, Callable, Dict, Generator, List, cast
+
+import cyclopts
+import folioclient
+import httpx
+import pymarc
+import questionary
+import tabulate
+from humps import decamelize
+from pydantic import BaseModel, Field
+from rich.logging import RichHandler
+
+from folio_data_import import get_folio_connection_parameters, __version__ as app_version
+from folio_data_import._progress import (
+    RichProgressReporter,
+    ProgressReporter,
+    NoOpProgressReporter,
+)
+from folio_data_import.custom_exceptions import (
+    FolioDataImportBatchError,
+    FolioDataImportJobError,
+)
+from folio_data_import.marc_preprocessors._preprocessors import MARCPreprocessor
+
+try:
+    datetime_utc = datetime.UTC
+except AttributeError:
+    datetime_utc = datetime.timezone.utc
+
+
+# The order in which the report summary should be displayed
+REPORT_SUMMARY_ORDERING = {"created": 0, "updated": 1, "discarded": 2, "error": 3}
+
+# Set default timeout and backoff values for HTTP requests when retrying job status and final summary checks  # noqa: E501
+RETRY_TIMEOUT_START = 5
+RETRY_TIMEOUT_RETRY_FACTOR = 1.5
+RETRY_TIMEOUT_MAX = 25.32
+
+# Custom log level for data issues, set to 26
+DATA_ISSUE_LVL_NUM = 26
+logging.addLevelName(DATA_ISSUE_LVL_NUM, "DATA_ISSUES")
+
+
+class MARCImportStats(BaseModel):
+    """Statistics for MARC import operations."""
+
+    records_sent: int = 0
+    records_processed: int = 0
+    created: int = 0
+    updated: int = 0
+    discarded: int = 0
+    error: int = 0
+
+
+class CustomLogger(logging.Logger):
+    """Logger subclass with custom data_issues method."""
+
+    def data_issues(self, msg: str, *args, **kws) -> None:
+        """Log data issues at custom level (26)."""
+        if self.isEnabledFor(DATA_ISSUE_LVL_NUM):
+            self._log(DATA_ISSUE_LVL_NUM, msg, args, **kws)
+
+
+# Set the custom logger class as the default
+logging.setLoggerClass(CustomLogger)
+
+logger: CustomLogger = logging.getLogger(__name__)  # type: ignore[assignment]
+
+
+class MARCImportJob:
+    """
+    Class to manage importing MARC data (Bib, Authority) into FOLIO using the Change Manager
+    APIs (https://github.com/folio-org/mod-source-record-manager/tree/master?tab=readme-ov-file#data-import-workflow),
+    rather than file-based Data Import. When executed in an interactive environment, it can provide progress bars
+    for tracking the number of records both uploaded and processed.
+
+    Args:
+        folio_client (FolioClient): An instance of the FolioClient class.
+        marc_files (list): A list of Path objects representing the MARC files to import.
+        import_profile_name (str): The name of the data import job profile to use.
+        batch_size (int): The number of source records to include in a record batch (default=10).
+        batch_delay (float): The number of seconds to wait between record batches (default=0).
+        no_progress (bool): Disable progress bars (eg. for running in a CI environment).
+        marc_record_preprocessors (list or str): A list of callables, or a string representing
+            a comma-separated list of MARC record preprocessor names to apply to each record before import.
+        preprocessor_args (dict): A dictionary of arguments to pass to the MARC record preprocessor(s).
+        let_summary_fail (bool): If True, will not retry or fail the import if the final job summary
+            cannot be retrieved (default=False).
+        split_files (bool): If True, will split each file into smaller jobs of size `split_size`
+        split_size (int): The number of records to include in each split file (default=1000).
+        split_offset (int): The number of split files to skip before starting processing (default=0).
+        job_ids_file_path (str): The path to the file where job IDs will be saved (default="marc_import_job_ids.txt").
+        show_file_names_in_data_import_logs (bool): If True, will set the file name for each job in the data import logs.
+    """  # noqa: E501
+
+    class Config(BaseModel):
+        """Configuration for MARC import operations."""
+
+        marc_files: Annotated[
+            List[Path],
+            Field(
+                title="MARC files",
+                description="List of Path objects representing the MARC files to import",
+            ),
+        ]
+        import_profile_name: Annotated[
+            str,
+            Field(
+                title="Import profile name",
+                description="The name of the data import job profile to use",
+            ),
+        ]
+        batch_size: Annotated[
+            int,
+            Field(
+                title="Batch size",
+                description="Number of source records to include in a record batch",
+                ge=1,
+                le=1000,
+            ),
+        ] = 10
+        batch_delay: Annotated[
+            float,
+            Field(
+                title="Batch delay",
+                description="Number of seconds to wait between record batches",
+                ge=0.0,
+            ),
+        ] = 0.0
+        marc_record_preprocessors: Annotated[
+            List[Callable] | str | None,
+            Field(
+                title="MARC record preprocessor",
+                description=(
+                    "List of callables or string representing preprocessor(s) "
+                    "to apply to each record before import"
+                ),
+            ),
+        ] = None
+        preprocessors_args: Annotated[
+            Dict[str, Dict] | None,
+            Field(
+                title="Preprocessor arguments",
+                description="Dictionary of arguments to pass to the MARC record preprocessor(s)",
+            ),
+        ] = None
+        no_progress: Annotated[
+            bool,
+            Field(
+                title="No progress bars",
+                description="Disable progress bars (e.g., for CI environments)",
+            ),
+        ] = False
+        no_summary: Annotated[
+            bool,
+            Field(
+                title="No summary",
+                description="Skip the final job summary",
+            ),
+        ] = False
+        let_summary_fail: Annotated[
+            bool,
+            Field(
+                title="Let summary fail",
+                description="Do not retry or fail import if final job summary cannot be retrieved",
+            ),
+        ] = False
+        split_files: Annotated[
+            bool,
+            Field(
+                title="Split files",
+                description="Split each file into smaller jobs",
+            ),
+        ] = False
+        split_size: Annotated[
+            int,
+            Field(
+                title="Split size",
+                description="Number of records to include in each split file",
+                ge=1,
+            ),
+        ] = 1000
+        split_offset: Annotated[
+            int,
+            Field(
+                title="Split offset",
+                description="Number of split files to skip before starting processing",
+                ge=0,
+            ),
+        ] = 0
+        job_ids_file_path: Annotated[
+            Path | None,
+            Field(
+                title="Job IDs file path",
+                description="Path to file where job IDs will be saved",
+            ),
+        ] = None
+        show_file_names_in_data_import_logs: Annotated[
+            bool,
+            Field(
+                title="Show file names in DI logs",
+                description="Show file names in data import logs",
+            ),
+        ] = False
+
+    bad_records_file: BinaryIO
+    failed_batches_file: BinaryIO
+    job_id: str
+    reporter: ProgressReporter
+    task_sent: str
+    task_imported: str
+    http_client: httpx.Client
+    current_file: List[Path] | List[BinaryIO]
+    record_batch: List[bytes]
+    last_current: int = 0
+    total_records_sent: int = 0
+    finished: bool = False
+    job_id: str = ""
+    job_ids: List[str]
+    job_hrid: int = 0
+    _max_summary_retries: int = 2
+    _max_job_retries: int = 2
+    _job_retries: int = 0
+    _summary_retries: int = 0
+
+    def __init__(
+        self,
+        folio_client: folioclient.FolioClient,
+        config: "MARCImportJob.Config",
+        reporter: ProgressReporter | None = None,
+    ) -> None:
+        self.folio_client: folioclient.FolioClient = folio_client
+        self.config = config
+        self.reporter = reporter or NoOpProgressReporter()
+        self.current_retry_timeout: float | None = None
+        self.marc_record_preprocessor: MARCPreprocessor = MARCPreprocessor(
+            config.marc_record_preprocessors or "", **(config.preprocessors_args or {})
+        )
+        self.job_ids_file_path = config.job_ids_file_path or config.marc_files[0].parent.joinpath(
+            "marc_import_job_ids.txt"
+        )
+
+    async def do_work(self) -> None:
+        """
+        Performs the necessary work for data import.
+
+        This method initializes an HTTP client, files to store records that fail to send,
+        and calls the appropriate method to import MARC files based on the configuration.
+
+        Returns:
+            None
+        """
+        self.record_batch = []
+        self.job_ids = []
+        with (
+            self.folio_client.get_folio_http_client() as http_client,
+            open(
+                self.config.marc_files[0].parent.joinpath(
+                    f"bad_marc_records_{dt.now(tz=datetime_utc).strftime('%Y%m%d%H%M%S')}.mrc"
+                ),
+                "wb+",
+            ) as bad_marc_file,
+            open(
+                self.config.marc_files[0].parent.joinpath(
+                    f"failed_batches_{dt.now(tz=datetime_utc).strftime('%Y%m%d%H%M%S')}.mrc"
+                ),
+                "wb+",
+            ) as failed_batches,
+        ):
+            self.bad_records_file = bad_marc_file
+            logger.info(f"Writing bad records to {self.bad_records_file.name}")
+            self.failed_batches_file = failed_batches
+            logger.info(f"Writing failed batches to {self.failed_batches_file.name}")
+            self.http_client = http_client
+            if self.config.split_files:
+                await self.process_split_files()
+            else:
+                for file in self.config.marc_files:
+                    self.current_file = [file]
+                    await self.import_marc_file()
+
+    async def process_split_files(self):
+        """
+        Process the import of files in smaller batches.
+        This method is called when `split_files` is set to True.
+        It splits each file into smaller chunks and processes them one by one.
+        """
+        for file in self.config.marc_files:
+            with open(file, "rb") as f:
+                file_length = await self.read_total_records([f])
+            expected_batches = math.ceil(file_length / self.config.split_size)
+            logger.info(
+                f"{file.name} contains {file_length} records."
+                f" Splitting into {expected_batches} {self.config.split_size} record batches."
+            )
+            zero_pad_parts = len(str(expected_batches)) if expected_batches > 1 else 2
+            for idx, batch in enumerate(
+                self.split_marc_file(file, self.config.split_size), start=1
+            ):
+                if idx > self.config.split_offset:
+                    batch.name = f"{file.name} (Part {idx:0{zero_pad_parts}})"
+                    self.current_file = [batch]
+                    await self.import_marc_file()
+            self.move_file_to_complete(file)
+
+    async def wrap_up(self) -> None:
+        """
+        Wraps up the data import process.
+
+        This method is called after the import process is complete.
+        It checks for empty bad records and error files and removes them.
+
+        Returns:
+            None
+        """
+        with open(self.bad_records_file.name, "rb") as bad_records:
+            if not bad_records.read(1):
+                os.remove(bad_records.name)
+                logger.info("No bad records found. Removing bad records file.")
+        with open(self.failed_batches_file.name, "rb") as failed_batches:
+            if not failed_batches.read(1):
+                os.remove(failed_batches.name)
+                logger.info("No failed batches. Removing failed batches file.")
+        with open(self.job_ids_file_path, "a+") as job_ids_file:
+            logger.info(f"Writing job IDs to {self.job_ids_file_path}")
+            for job_id in self.job_ids:
+                job_ids_file.write(f"{job_id}\n")
+        logger.info("Import complete.")
+        logger.info(f"Total records imported: {self.total_records_sent}")
+
+    async def get_job_status(self) -> None:
+        """
+        Retrieves the status of a job execution.
+
+        Returns:
+            None
+
+        Raises:
+            IndexError: If the job execution with the specified ID is not found.
+        """
+        job_status: Dict | None = None
+        try:
+            self.current_retry_timeout = (
+                (self.current_retry_timeout * RETRY_TIMEOUT_RETRY_FACTOR)
+                if self.current_retry_timeout
+                else RETRY_TIMEOUT_START
+            )
+            with self.folio_client.get_folio_http_client() as temp_client:
+                temp_client.timeout = self.current_retry_timeout
+                self.folio_client.httpx_client = temp_client
+                job_status = self.folio_client.folio_get(
+                    "/metadata-provider/jobExecutions?statusNot=DISCARDED&uiStatusAny"
+                    "=PREPARING_FOR_PREVIEW&uiStatusAny=READY_FOR_PREVIEW&uiStatusAny=RUNNING&limit=50"
+                )
+            self.current_retry_timeout = None
+        except (folioclient.FolioConnectionError, folioclient.FolioHTTPError) as e:
+            error_text = e.response.text if hasattr(e, "response") else str(e)
+
+            # Raise non-retriable HTTP errors immediately
+            if hasattr(e, "response") and e.response.status_code not in [502, 504, 401]:
+                raise e
+
+            # For retriable errors or connection errors
+            if (
+                self.current_retry_timeout is not None
+                and self.current_retry_timeout <= RETRY_TIMEOUT_MAX
+            ):
+                logger.warning(f"SERVER ERROR fetching job status: {error_text}. Retrying.")
+                sleep(0.25)
+                return await self.get_job_status()
+            elif (
+                self.current_retry_timeout is not None
+                and self.current_retry_timeout > RETRY_TIMEOUT_MAX
+            ):
+                logger.critical(
+                    f"SERVER ERROR fetching job status: {error_text}. Max retries exceeded."
+                )
+                raise FolioDataImportJobError(self.job_id, error_text, e) from e
+            else:
+                raise e
+        except Exception as e:
+            logger.error(f"Error fetching job status. {e}")
+
+        if job_status is None:
+            return
+
+        try:
+            status = [job for job in job_status["jobExecutions"] if job["id"] == self.job_id][0]
+            self.reporter.update_task(
+                self.task_imported,
+                advance=status["progress"]["current"] - self.last_current,
+            )
+            self.last_current = status["progress"]["current"]
+        except (IndexError, ValueError, KeyError):
+            logger.debug(f"No active job found with ID {self.job_id}. Checking for finished job.")
+            try:
+                job_status = self.folio_client.folio_get(
+                    "/metadata-provider/jobExecutions?limit=100&sortBy=completed_date%2Cdesc&statusAny"
+                    "=COMMITTED&statusAny=ERROR&statusAny=CANCELLED"
+                )
+                status = [job for job in job_status["jobExecutions"] if job["id"] == self.job_id][
+                    0
+                ]
+                self.reporter.update_task(
+                    self.task_imported,
+                    advance=status["progress"]["current"] - self.last_current,
+                )
+                self.last_current = status["progress"]["current"]
+                self.finished = True
+            except (folioclient.FolioConnectionError, folioclient.FolioHTTPError) as e:
+                # Raise non-retriable HTTP errors immediately
+                if hasattr(e, "response") and e.response.status_code not in [502, 504]:
+                    raise e
+
+                # Retry retriable errors or connection errors
+                error_text = e.response.text if hasattr(e, "response") else str(e)
+                logger.warning(f"SERVER ERROR fetching job status: {error_text}. Retrying.")
+                sleep(0.25)
+                with self.folio_client.get_folio_http_client() as temp_client:
+                    temp_client.timeout = self.current_retry_timeout
+                    self.folio_client.httpx_client = temp_client
+                    return await self.get_job_status()
+
+    async def set_job_file_name(self) -> None:
+        """
+        Sets the file name for the current job execution.
+
+        Returns:
+            None
+        """
+        try:
+            job_object = self.http_client.get(
+                "/change-manager/jobExecutions/" + self.job_id,
+            )
+            job_object.raise_for_status()
+            job_object_json = job_object.json()
+            job_object_json.update({"fileName": self.current_file[0].name})
+            set_file_name = self.http_client.put(
+                "/change-manager/jobExecutions/" + self.job_id,
+                json=job_object_json,
+            )
+            set_file_name.raise_for_status()
+        except httpx.HTTPError as e:
+            logger.error(
+                "Error setting job file name: "
+                + str(e)
+                + "\n"
+                + getattr(getattr(e, "response", ""), "text", "")
+            )
+            raise e
+
+    async def create_folio_import_job(self) -> None:
+        """
+        Creates a job execution for importing data into FOLIO.
+
+        Returns:
+            None
+
+        Raises:
+            FolioHTTPError: If there is an error creating the job.
+        """
+        try:
+            job_response = self.folio_client.folio_post(
+                "/change-manager/jobExecutions",
+                {"sourceType": "ONLINE", "userId": self.folio_client.current_user},
+            )
+        except (folioclient.FolioConnectionError, folioclient.FolioHTTPError) as e:
+            # Raise non-retriable HTTP errors immediately
+            if hasattr(e, "response") and e.response.status_code not in [502, 504]:
+                raise e
+
+            # Retry retriable errors or connection errors
+            error_text = e.response.text if hasattr(e, "response") else str(e)
+            logger.warning(f"SERVER ERROR creating job: {error_text}. Retrying.")
+            sleep(0.25)
+            return await self.create_folio_import_job()
+
+        try:
+            self.job_id = job_response["parentJobExecutionId"]
+        except (KeyError, TypeError) as e:
+            logger.error(
+                f"Invalid job response from FOLIO API. Expected 'parentJobExecutionId' key. "
+                f"Response: {job_response}"
+            )
+            raise ValueError(f"FOLIO API returned invalid job response: {job_response}") from e
+
+        if self.config.show_file_names_in_data_import_logs:
+            await self.set_job_file_name()
+        self.job_ids.append(self.job_id)
+        logger.info(f"Created job: {self.job_id}")
+
+    @cached_property
+    def import_profile(self) -> dict:
+        """
+        Returns the import profile for the current job execution.
+
+        Returns:
+            dict: The import profile for the current job execution.
+        """
+        import_profiles = self.folio_client.folio_get(
+            "/data-import-profiles/jobProfiles",
+            "jobProfiles",
+            query_params={"limit": "1000"},
+        )
+        profile = [
+            profile
+            for profile in import_profiles
+            if profile["name"] == self.config.import_profile_name
+        ][0]
+        return profile
+
+    async def set_job_profile(self) -> None:
+        """
+        Sets the job profile for the current job execution.
+
+        Returns:
+            The response from the HTTP request to set the job profile.
+        """
+        logger.info(
+            f"Setting job profile: {self.import_profile['name']} ({self.import_profile['id']})"
+            f" for job {self.job_id}"
+        )
+        set_job_profile = self.http_client.put(
+            "/change-manager/jobExecutions/" + self.job_id + "/jobProfile",
+            json={
+                "id": self.import_profile["id"],
+                "name": self.import_profile["name"],
+                "dataType": "MARC",
+            },
+        )
+        try:
+            set_job_profile.raise_for_status()
+            self.job_hrid = set_job_profile.json()["hrId"]
+            logger.info(f"Job HRID: {self.job_hrid}")
+        except httpx.HTTPError as e:
+            logger.error(
+                "Error creating job: "
+                + str(e)
+                + "\n"
+                + getattr(getattr(e, "response", ""), "text", "")
+            )
+            raise e
+
+    @staticmethod
+    async def _count_records(files: List[BinaryIO]) -> int:
+        """
+        Internal method to count total number of records from files.
+
+        Args:
+            files (list): List of files to read.
+
+        Returns:
+            int: The total number of records found in the files.
+        """
+        total_records = 0
+        for import_file in files:
+            while True:
+                chunk = import_file.read(104857600)
+                if not chunk:
+                    break
+                total_records += chunk.count(b"\x1d")
+            import_file.seek(0)
+        return total_records
+
+    @staticmethod
+    async def read_total_records(files: List[BinaryIO]) -> int:
+        """
+        Count records from files with per-file logging.
+
+        Args:
+            files (list): List of files to read.
+
+        Returns:
+            int: The total number of records found in the files.
+        """
+        total_records = 0
+        for import_file in files:
+            file_name = os.path.basename(import_file.name)
+            logger.info(f"Counting records in {file_name}...")
+            file_record_count = await MARCImportJob._count_records([import_file])
+            total_records += file_record_count
+            logger.info(f"Counted {file_record_count} records in {file_name}")
+        return total_records
+
+    async def process_record_batch(self, batch_payload) -> None:
+        """
+        Processes a record batch.
+
+        Args:
+            batch_payload (dict): A records payload containing the current batch of MARC records.
+        """
+        try:
+            post_batch = self.http_client.post(
+                "/change-manager/jobExecutions/" + self.job_id + "/records",
+                json=batch_payload,
+            )
+        except (httpx.ConnectTimeout, httpx.ReadTimeout):
+            logger.warning(f"CONNECTION ERROR posting batch {batch_payload['id']}. Retrying...")
+            sleep(0.25)
+            return await self.process_record_batch(batch_payload)
+        try:
+            post_batch.raise_for_status()
+            self.total_records_sent += len(self.record_batch)
+            self.record_batch = []
+            self.reporter.update_task(self.task_sent, advance=len(batch_payload["initialRecords"]))
+        except httpx.HTTPStatusError as e:
+            if e.response.status_code in [
+                500,
+                400,
+                422,
+            ]:  # TODO: Update once we no longer have to support < Sunflower to just be 400
+                self.total_records_sent += len(self.record_batch)
+                self.record_batch = []
+                self.reporter.update_task(
+                    self.task_sent, advance=len(batch_payload["initialRecords"])
+                )
+            else:
+                for record in self.record_batch:
+                    self.failed_batches_file.write(record)
+                raise FolioDataImportBatchError(
+                    batch_payload["id"], f"{e}\n{e.response.text}", e
+                ) from e
+        await self.get_job_status()
+        sleep(self.config.batch_delay)
+
+    async def process_records(self, files, total_records: int) -> None:
+        """
+        Process records from the given files.
+
+        Args:
+            files (list): List of files to process.
+            total_records (int): Total number of records to process.
+            pbar_sent: Progress bar for tracking the number of records sent.
+
+        Returns:
+            None
+        """
+        counter = 0
+        for import_file in files:
+            file_path = Path(import_file.name)
+            self.reporter.update_task(
+                self.task_sent,
+                description=f"Sent ({os.path.basename(import_file.name)})",
+            )
+            reader = pymarc.MARCReader(import_file, hide_utf8_warnings=True)
+            for idx, record in enumerate(reader, start=1):
+                if len(self.record_batch) == self.config.batch_size:
+                    await self.process_record_batch(
+                        await self.create_batch_payload(
+                            counter,
+                            total_records,
+                            counter == total_records,
+                        ),
+                    )
+                    sleep(0.25)
+                if record:
+                    record = self.marc_record_preprocessor.do_work(record)
+                    self.record_batch.append(record.as_marc())
+                    counter += 1
+                else:
+                    logger.data_issues(
+                        "RECORD FAILED\t%s\t%s\t%s",
+                        f"{file_path.name}:{idx}",
+                        f"Error reading {idx} record from {file_path}. Skipping."
+                        f" Writing current chunk to {self.bad_records_file.name}.",
+                        "",
+                    )
+                    if reader.current_chunk:
+                        self.bad_records_file.write(reader.current_chunk)
+            if not self.config.split_files:
+                self.move_file_to_complete(file_path)
+        if self.record_batch or not self.finished:
+            await self.process_record_batch(
+                await self.create_batch_payload(
+                    counter,
+                    total_records,
+                    counter == total_records,
+                ),
+            )
+
+    def move_file_to_complete(self, file_path: Path) -> None:
+        import_complete_path = file_path.parent.joinpath("import_complete")
+        if not import_complete_path.exists():
+            logger.debug(f"Creating import_complete directory: {import_complete_path.absolute()}")
+            import_complete_path.mkdir(exist_ok=True)
+        logger.debug(f"Moving {file_path} to {import_complete_path.absolute()}")
+        file_path.rename(file_path.parent.joinpath("import_complete", file_path.name))
+
+    async def create_batch_payload(self, counter: int, total_records, is_last: bool) -> dict:
+        """
+        Create a batch payload for data import.
+
+        Args:
+            counter (int): The current counter value.
+            total_records (int): The total number of records.
+            is_last (bool): Indicates if this is the last batch.
+
+        Returns:
+            dict: The batch payload containing the ID, records metadata, and initial records.
+        """
+        return {
+            "id": str(uuid.uuid4()),
+            "recordsMetadata": {
+                "last": is_last,
+                "counter": counter,
+                "contentType": "MARC_RAW",
+                "total": total_records,
+            },
+            "initialRecords": [{"record": x.decode()} for x in self.record_batch],
+        }
+
+    @staticmethod
+    def split_marc_file(file_path: Path, batch_size: int) -> Generator[io.BytesIO, None, None]:
+        """Generator to iterate over MARC records in batches, yielding BytesIO objects."""
+        with open(file_path, "rb") as f:
+            batch = io.BytesIO()
+            count = 0
+
+            while True:
+                leader = f.read(24)
+                if not leader:
+                    break  # End of file
+
+                try:
+                    record_length = int(leader[:5])  # Extract record length from leader
+                except ValueError as ve:
+                    raise ValueError("Invalid MARC record length encountered.") from ve
+
+                record_body = f.read(record_length - 24)
+                if len(record_body) != record_length - 24:
+                    raise ValueError("Unexpected end of file while reading MARC record.")
+
+                # Verify record terminator
+                if record_body[-1:] != b"\x1d":
+                    raise ValueError(
+                        "MARC record does not end with the expected terminator (0x1D)."
+                    )
+
+                # Write the full record to the batch buffer
+                batch.write(leader + record_body)
+                count += 1
+
+                if count >= batch_size:
+                    batch.seek(0)
+                    yield batch
+                    batch = io.BytesIO()  # Reset buffer
+                    count = 0
+
+            # Yield any remaining records
+            if count > 0:
+                batch.seek(0)
+                yield batch
+
+    async def import_marc_file(self) -> None:
+        """
+        Imports MARC file into the system.
+
+        This method performs the following steps:
+        1. Creates a FOLIO import job.
+        2. Retrieves the import profile.
+        3. Sets the job profile.
+        4. Opens the MARC file(s) and reads the total number of records.
+        5. Displays progress bars for imported and sent records.
+        6. Processes the records and updates the progress bars.
+        7. Checks the job status periodically until the import is finished.
+
+        Note: This method assumes that the necessary instance attributes are already set.
+
+        Returns:
+            None
+        """
+        await self.create_folio_import_job()
+        await self.set_job_profile()
+        with ExitStack() as stack:
+            files: List[BinaryIO]
+            try:
+                if isinstance(self.current_file[0], Path):
+                    path_list = cast(List[Path], self.current_file)
+                    files = [stack.enter_context(open(file, "rb")) for file in path_list]
+                elif isinstance(self.current_file[0], io.BytesIO):
+                    bytesio_list = cast(List[io.BytesIO], self.current_file)
+                    files = [stack.enter_context(file) for file in bytesio_list]
+                else:
+                    raise ValueError("Invalid file type. Must be Path or BytesIO.")
+            except IndexError as e:
+                logger.error(f"Error opening file: {e}")
+                raise e
+
+            total_records = await self._count_records(files)
+
+            with self.reporter:
+                try:
+                    self.task_sent = self.reporter.start_task(
+                        "sent", total=total_records, description="Sent"
+                    )
+                    self.task_imported = self.reporter.start_task(
+                        f"imported_{self.job_hrid}",
+                        total=total_records,
+                        description=f"Imported ({self.job_hrid})",
+                    )
+                    await self.process_records(files, total_records)
+                    while not self.finished:
+                        await self.get_job_status()
+                except FolioDataImportBatchError as e:
+                    logger.error(f"Unhandled error posting batch {e.batch_id}: {e.message}")
+                    await self.cancel_job()
+                    raise e
+                except FolioDataImportJobError as e:
+                    await self.cancel_job()
+                    if self._job_retries < self._max_job_retries:
+                        self._job_retries += 1
+                        logger.error(
+                            f"Unhandled error processing job {e.job_id}: {e.message},"
+                            f" cancelling and retrying."
+                        )
+                        await self.import_marc_file()
+                    else:
+                        logger.critical(
+                            f"Unhandled error processing job {e.job_id}: {e.message},"
+                            f" cancelling and exiting (maximum retries reached)."
+                        )
+                        raise e
+        if self.finished and not self.config.no_summary:
+            await asyncio.sleep(5)
+            await self.log_job_summary()
+        elif self.finished:
+            logger.info("Skipping final job summary.")
+        self.last_current = 0
+        self.finished = False
+
+    async def cancel_job(self) -> None:
+        """
+        Cancels the current job execution.
+
+        This method sends a request to cancel the job execution and logs the result.
+
+        Returns:
+            None
+        """
+        try:
+            cancel = self.http_client.delete(
+                f"/change-manager/jobExecutions/{self.job_id}/records",
+            )
+            cancel.raise_for_status()
+            self.finished = True
+            logger.info(f"Cancelled job: {self.job_id}")
+        except (httpx.ConnectTimeout, httpx.ReadTimeout):
+            logger.warning(f"CONNECTION ERROR cancelling job {self.job_id}. Retrying...")
+            sleep(0.25)
+            await self.cancel_job()
+
+    async def log_job_summary(self):
+        if job_summary := await self.get_job_summary():
+            job_id = job_summary.pop("jobExecutionId", None)
+            total_errors = job_summary.pop("totalErrors", 0)
+            columns = ["Summary"] + list(job_summary.keys())
+            rows = set()
+            for key in columns[1:]:
+                rows.update(job_summary[key].keys())
+
+            table_data = []
+            for row in rows:
+                metric_name = decamelize(row).split("_")[1]
+                table_row = [metric_name]
+                for col in columns[1:]:
+                    table_row.append(job_summary[col].get(row, "N/A"))
+                table_data.append(table_row)
+            table_data.sort(key=lambda x: REPORT_SUMMARY_ORDERING.get(x[0], 99))
+            columns = columns[:1] + [" ".join(decamelize(x).split("_")[:-1]) for x in columns[1:]]
+            logger.info(
+                f"Results for {'file' if len(self.current_file) == 1 else 'files'}: "
+                f"{', '.join([os.path.basename(x.name) for x in self.current_file])}"
+            )
+            logger.info(
+                "\n" + tabulate.tabulate(table_data, headers=columns, tablefmt="fancy_grid"),
+            )
+            if total_errors:
+                logger.info(f"Total errors: {total_errors}. Job ID: {job_id}.")
+        else:
+            logger.error(f"No job summary available for job #{self.job_hrid}({self.job_id}).")
+
+    async def get_job_summary(self) -> dict:
+        """
+        Retrieves the job summary for the current job execution.
+
+        Returns:
+            dict: The job summary for the current job execution.
+        """
+        try:
+            self.current_retry_timeout = (
+                (self.current_retry_timeout * RETRY_TIMEOUT_RETRY_FACTOR)
+                if self.current_retry_timeout
+                else RETRY_TIMEOUT_START
+            )
+            with self.folio_client.get_folio_http_client() as temp_client:
+                temp_client.timeout = self.current_retry_timeout
+                self.folio_client.httpx_client = temp_client
+                job_summary = self.folio_client.folio_get(
+                    f"/metadata-provider/jobSummary/{self.job_id}"
+                )
+            self.current_retry_timeout = None
+        except (folioclient.FolioConnectionError, folioclient.FolioHTTPError) as e:
+            error_text = e.response.text if hasattr(e, "response") else str(e)
+            if hasattr(e, "response") and e.response.status_code not in [502, 504, 404]:
+                raise e
+
+            if (
+                self._max_summary_retries > self._summary_retries
+            ) and not self.config.let_summary_fail:
+                logger.warning(f"SERVER ERROR fetching job summary: {e}. Retrying.")
+                sleep(0.25)
+                with self.folio_client.get_folio_http_client() as temp_client:
+                    temp_client.timeout = self.current_retry_timeout
+                    self.folio_client.httpx_client = temp_client
+                    self._summary_retries += 1
+                    return await self.get_job_summary()
+            else:
+                logger.warning(
+                    f"SERVER ERROR fetching job summary: {error_text}."
+                    " Skipping final summary check."
+                )
+                job_summary = {}
+
+        return job_summary
+
+
+def set_up_cli_logging() -> None:
+    """
+    This function sets up logging for the CLI.
+    """
+    logger.setLevel(logging.INFO)
+    logger.propagate = False
+
+    # Set up file and stream handlers
+    file_handler = logging.FileHandler(
+        "folio_data_import_{}.log".format(dt.now().strftime("%Y%m%d%H%M%S"))
+    )
+    file_handler.setLevel(logging.INFO)
+    file_handler.addFilter(ExcludeLevelFilter(DATA_ISSUE_LVL_NUM))
+    # file_handler.addFilter(IncludeLevelFilter(25))
+    file_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+    file_handler.setFormatter(file_formatter)
+    logger.addHandler(file_handler)
+
+    if not any(
+        isinstance(h, logging.StreamHandler) and h.stream == sys.stderr for h in logger.handlers
+    ):
+        stream_handler = RichHandler(
+            show_level=False,
+            show_time=False,
+            omit_repeated_times=False,
+            show_path=False,
+        )
+        stream_handler.setLevel(logging.INFO)
+        stream_handler.addFilter(ExcludeLevelFilter(DATA_ISSUE_LVL_NUM))
+        # stream_handler.addFilter(ExcludeLevelFilter(25))
+        stream_formatter = logging.Formatter("%(message)s")
+        stream_handler.setFormatter(stream_formatter)
+        logger.addHandler(stream_handler)
+
+    # Set up data issues logging
+    data_issues_handler = logging.FileHandler(
+        "marc_import_data_issues_{}.log".format(dt.now().strftime("%Y%m%d%H%M%S"))
+    )
+    data_issues_handler.setLevel(26)
+    data_issues_handler.addFilter(IncludeLevelFilter(DATA_ISSUE_LVL_NUM))
+    data_issues_formatter = logging.Formatter("%(message)s")
+    data_issues_handler.setFormatter(data_issues_formatter)
+    logger.addHandler(data_issues_handler)
+
+    # Stop httpx from logging info messages to the console
+    logging.getLogger("httpx").setLevel(logging.WARNING)
+
+
+app = cyclopts.App(version=app_version)
+
+
+@app.default
+def main(
+    config_file: Annotated[
+        Path | None, cyclopts.Parameter(group="Job Configuration Parameters")
+    ] = None,
+    *,
+    gateway_url: Annotated[
+        str | None,
+        cyclopts.Parameter(
+            env_var=["FOLIO_GATEWAY_URL"],
+            show_env_var=True,
+            group="FOLIO Connection Parameters",
+        ),
+    ] = None,
+    tenant_id: Annotated[
+        str | None,
+        cyclopts.Parameter(
+            env_var=["FOLIO_TENANT_ID"],
+            show_env_var=True,
+            group="FOLIO Connection Parameters",
+        ),
+    ] = None,
+    username: Annotated[
+        str | None,
+        cyclopts.Parameter(
+            env_var=["FOLIO_USERNAME"],
+            show_env_var=True,
+            group="FOLIO Connection Parameters",
+        ),
+    ] = None,
+    password: Annotated[
+        str | None,
+        cyclopts.Parameter(
+            env_var=["FOLIO_PASSWORD"],
+            show_env_var=True,
+            group="FOLIO Connection Parameters",
+        ),
+    ] = None,
+    marc_file_paths: Annotated[
+        List[Path] | None,
+        cyclopts.Parameter(
+            consume_multiple=True,
+            name=["--marc-file-paths", "--marc-file-path"],
+            help="Path(s) to MARC file(s). Accepts multiple values and glob patterns.",
+            group="Job Configuration Parameters",
+        ),
+    ] = None,
+    member_tenant_id: Annotated[
+        str | None,
+        cyclopts.Parameter(
+            env_var="FOLIO_MEMBER_TENANT_ID",
+            show_env_var=True,
+            group="FOLIO Connection Parameters",
+        ),
+    ] = None,
+    import_profile_name: Annotated[
+        str | None, cyclopts.Parameter(group="Job Configuration Parameters")
+    ] = None,
+    batch_size: Annotated[int, cyclopts.Parameter(group="Job Configuration Parameters")] = 10,
+    batch_delay: Annotated[float, cyclopts.Parameter(group="Job Configuration Parameters")] = 0.0,
+    preprocessors: Annotated[
+        str | None,
+        cyclopts.Parameter(
+            name=["--preprocessor", "--preprocessors"], group="Job Configuration Parameters"
+        ),
+    ] = None,
+    preprocessors_config: Annotated[
+        str | None,
+        cyclopts.Parameter(
+            name=["--preprocessor-config", "--preprocessors-config"],
+            group="Job Configuration Parameters",
+        ),
+    ] = None,
+    file_names_in_di_logs: Annotated[
+        bool, cyclopts.Parameter(group="Job Configuration Parameters")
+    ] = False,
+    split_files: Annotated[bool, cyclopts.Parameter(group="Job Configuration Parameters")] = False,
+    split_size: Annotated[int, cyclopts.Parameter(group="Job Configuration Parameters")] = 1000,
+    split_offset: Annotated[int, cyclopts.Parameter(group="Job Configuration Parameters")] = 0,
+    no_progress: Annotated[bool, cyclopts.Parameter(group="Job Configuration Parameters")] = False,
+    no_summary: Annotated[bool, cyclopts.Parameter(group="Job Configuration Parameters")] = False,
+    let_summary_fail: Annotated[
+        bool, cyclopts.Parameter(group="Job Configuration Parameters")
+    ] = False,
+    job_ids_file_path: Annotated[
+        str | None, cyclopts.Parameter(group="Job Configuration Parameters")
+    ] = None,
+) -> None:
+    """
+    Command-line interface to batch import MARC records into FOLIO using FOLIO Data Import
+
+    Parameters:
+        config_file (Path | None): Path to JSON config file for the import job, overrides other parameters if provided.
+        gateway_url (str): The FOLIO API Gateway URL.
+        tenant_id (str): The tenant id.
+        username (str): The FOLIO username.
+        password (str): The FOLIO password.
+        marc_file_paths (List[Path]): The MARC file(s) or glob pattern(s) to import.
+        member_tenant_id (str): The FOLIO ECS member tenant id (if applicable).
+        import_profile_name (str): The name of the import profile to use.
+        batch_size (int): The number of records to send in each batch.
+        batch_delay (float): The delay (in seconds) between sending each batch.
+        preprocessors (str): Comma-separated list of MARC record preprocessors to use.
+        preprocessors_config (str): Path to JSON config file for the preprocessors.
+        file_names_in_di_logs (bool): Show file names in data import logs.
+        split_files (bool): Split files into smaller batches.
+        split_size (int): The number of records per split batch.
+        split_offset (int): The number of split batches to skip before starting import.
+        no_progress (bool): Disable progress bars.
+        no_summary (bool): Skip the final job summary.
+        let_summary_fail (bool): Let the final summary check fail without exiting.
+        preprocessor_config (str): Path to JSON config file for the preprocessor.
+        job_ids_file_path (str): Path to file to write job IDs to.
+    """  # noqa: E501
+    set_up_cli_logging()
+    gateway_url, tenant_id, username, password = get_folio_connection_parameters(
+        gateway_url, tenant_id, username, password
+    )
+    folio_client = folioclient.FolioClient(gateway_url, tenant_id, username, password)
+
+    if member_tenant_id:
+        folio_client.tenant_id = member_tenant_id
+
+    # Handle file path expansion
+    marc_files = collect_marc_file_paths(marc_file_paths)
+
+    marc_files.sort()
+
+    if len(marc_files) == 0:
+        logger.critical(f"No files found matching {marc_file_paths}. Exiting.")
+        sys.exit(1)
+    else:
+        logger.info(marc_files)
+
+    if preprocessors_config:
+        with open(preprocessors_config, "r") as f:
+            preprocessor_args = json.load(f)
+    else:
+        preprocessor_args = {}
+
+    if not import_profile_name:
+        import_profile_name = select_import_profile(folio_client)
+
+    job = None
+    try:
+        if config_file:
+            with open(config_file, "r") as f:
+                config_data = json.load(f)
+            config = MARCImportJob.Config(**config_data)
+        else:
+            config = MARCImportJob.Config(
+                marc_files=marc_files,
+                import_profile_name=import_profile_name,
+                batch_size=batch_size,
+                batch_delay=batch_delay,
+                marc_record_preprocessors=preprocessors,
+                preprocessors_args=preprocessor_args,
+                no_progress=no_progress,
+                no_summary=no_summary,
+                let_summary_fail=let_summary_fail,
+                split_files=split_files,
+                split_size=split_size,
+                split_offset=split_offset,
+                job_ids_file_path=Path(job_ids_file_path) if job_ids_file_path else None,
+                show_file_names_in_data_import_logs=file_names_in_di_logs,
+            )
+
+        # Create progress reporter
+        reporter = (
+            NoOpProgressReporter()
+            if no_progress
+            else RichProgressReporter(show_speed=True, show_time=True)
+        )
+
+        job = MARCImportJob(folio_client, config, reporter)
+        asyncio.run(run_job(job))
+    except Exception as e:
+        logger.error("Could not initialize MARCImportJob: " + str(e))
+        sys.exit(1)
+
+
+def select_import_profile(folio_client):
+    try:
+        import_profiles = folio_client.folio_get(
+            "/data-import-profiles/jobProfiles",
+            "jobProfiles",
+            query_params={"limit": "1000"},
+        )
+        import_profile_names = [
+            profile["name"] for profile in import_profiles if "marc" in profile["dataType"].lower()
+        ]
+        import_profile_name = questionary.select(
+            "Select an import profile:",
+            choices=import_profile_names,
+        ).ask()
+    except httpx.HTTPStatusError as e:
+        logger.error(
+            f"HTTP Error fetching import profiles: {e}"
+            f"\n{getattr(getattr(e, 'response', ''), 'text', '')}\nExiting."
+        )
+        sys.exit(1)
+    except KeyboardInterrupt:
+        logger.info("Keyboard interrupt received. Exiting.")
+        sys.exit(0)
+    return import_profile_name
+
+
+def collect_marc_file_paths(marc_file_paths):
+    marc_files: List[Path] = []
+    if marc_file_paths:
+        for file_path in marc_file_paths:
+            # Check if the path contains glob patterns
+            file_path_str = str(file_path)
+            if any(char in file_path_str for char in ["*", "?", "["]):
+                # It's a glob pattern - expand it
+                expanded = glob.glob(file_path_str)
+                marc_files.extend([Path(x) for x in expanded])
+            else:
+                # It's a regular path
+                marc_files.append(file_path)
+    return marc_files
+
+
+async def run_job(job: MARCImportJob):
+    try:
+        await job.do_work()
+    except httpx.HTTPStatusError as e:
+        logger.error(
+            f"HTTP Error importing files: {e}"
+            f"\n{getattr(getattr(e, 'response', ''), 'text', '')}\nExiting."
+        )
+        sys.exit(1)
+    except Exception as e:
+        logger.error("Error importing files: " + str(e))
+        raise
+    finally:
+        if job:
+            await job.wrap_up()
+
+
+class ExcludeLevelFilter(logging.Filter):
+    def __init__(self, level) -> None:
+        super().__init__()
+        self.level = level
+
+    def filter(self, record):
+        return record.levelno != self.level
+
+
+class IncludeLevelFilter(logging.Filter):
+    def __init__(self, level) -> None:
+        super().__init__()
+        self.level = level
+
+    def filter(self, record):
+        return record.levelno == self.level
+
+
+if __name__ == "__main__":
+    app()
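
For orientation, below is a minimal sketch of how the MARCImportJob class added in this release might be driven programmatically. It uses only names defined in the module above (MARCImportJob, MARCImportJob.Config, run_job, NoOpProgressReporter, and the FolioClient constructor call shown in main()). The gateway URL, tenant, credentials, MARC file path, and job profile name are placeholder values, not part of the package; the supported console-script entry points are defined in entry_points.txt, which is not shown in this hunk.

import asyncio
from pathlib import Path

import folioclient

from folio_data_import.MARCDataImport import MARCImportJob, run_job
from folio_data_import._progress import NoOpProgressReporter

# Placeholder connection details -- replace with a real FOLIO gateway, tenant, and credentials.
folio_client = folioclient.FolioClient(
    "https://folio-gateway.example.org", "exampletenant", "example_user", "example_password"
)

# Config mirrors the pydantic model nested inside MARCImportJob above.
config = MARCImportJob.Config(
    marc_files=[Path("example_records.mrc")],  # placeholder MARC file
    import_profile_name="Example job profile",  # must match an existing data import job profile
    batch_size=10,
    no_progress=True,  # disable progress bars, e.g. for CI
)

job = MARCImportJob(folio_client, config, NoOpProgressReporter())
asyncio.run(run_job(job))  # run_job() wraps do_work() and wrap_up(), as main() does above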