folio-data-import 0.2.8rc12__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of folio-data-import might be problematic. The original notice linked to more details; the link target is not preserved in this text.

@@ -2,8 +2,8 @@ import argparse
2
2
  import asyncio
3
3
  import datetime
4
4
  import glob
5
- import importlib
6
5
  import io
6
+ import json
7
7
  import logging
8
8
  import math
9
9
  import os
@@ -15,7 +15,7 @@ from functools import cached_property
15
15
  from getpass import getpass
16
16
  from pathlib import Path
17
17
  from time import sleep
18
- from typing import List, Union
18
+ from typing import BinaryIO, Callable, Dict, List, Union
19
19
 
20
20
  import folioclient
21
21
  import httpx
@@ -25,6 +25,9 @@ import tabulate
25
25
  from humps import decamelize
26
26
  from tqdm import tqdm
27
27
 
28
+ from folio_data_import.custom_exceptions import FolioDataImportBatchError, FolioDataImportJobError
29
+ from folio_data_import.marc_preprocessors._preprocessors import MARCPreprocessor
30
+
28
31
  try:
29
32
  datetime_utc = datetime.UTC
30
33
  except AttributeError:
@@ -35,21 +38,25 @@ except AttributeError:
35
38
  REPORT_SUMMARY_ORDERING = {"created": 0, "updated": 1, "discarded": 2, "error": 3}
36
39
 
37
40
  # Set default timeout and backoff values for HTTP requests when retrying job status and final summary checks
38
- RETRY_TIMEOUT_START = 1
39
- RETRY_TIMEOUT_RETRY_FACTOR = 2
41
+ RETRY_TIMEOUT_START = 5
42
+ RETRY_TIMEOUT_RETRY_FACTOR = 1.5
43
+ RETRY_TIMEOUT_MAX = 25.32
40
44
 
41
45
  # Custom log level for data issues, set to 26
42
46
  DATA_ISSUE_LVL_NUM = 26
43
47
  logging.addLevelName(DATA_ISSUE_LVL_NUM, "DATA_ISSUES")
44
48
 
49
+
45
50
  def data_issues(self, msg, *args, **kws):
46
51
  if self.isEnabledFor(DATA_ISSUE_LVL_NUM):
47
52
  self._log(DATA_ISSUE_LVL_NUM, msg, args, **kws)
48
53
 
54
+
49
55
  logging.Logger.data_issues = data_issues
50
56
 
51
57
  logger = logging.getLogger(__name__)
52
58
 
59
+
53
60
  class MARCImportJob:
54
61
  """
55
62
  Class to manage importing MARC data (Bib, Authority) into FOLIO using the Change Manager
@@ -63,7 +70,6 @@ class MARCImportJob:
63
70
  import_profile_name (str): The name of the data import job profile to use.
64
71
  batch_size (int): The number of source records to include in a record batch (default=10).
65
72
  batch_delay (float): The number of seconds to wait between record batches (default=0).
66
- consolidate (bool): Consolidate files into a single job. Default is one job for each file.
67
73
  no_progress (bool): Disable progress bars (eg. for running in a CI environment).
68
74
  """
69
75
 
@@ -75,14 +81,15 @@ class MARCImportJob:
75
81
  http_client: httpx.Client
76
82
  current_file: List[Path]
77
83
  record_batch: List[dict] = []
78
- error_records: int = 0
79
84
  last_current: int = 0
80
85
  total_records_sent: int = 0
81
86
  finished: bool = False
82
87
  job_id: str = ""
83
88
  job_hrid: int = 0
84
- current_file: Union[List[Path],List[io.BytesIO]] = []
89
+ current_file: Union[List[Path], List[io.BytesIO]] = []
85
90
  _max_summary_retries: int = 2
91
+ _max_job_retries: int = 2
92
+ _job_retries: int = 0
86
93
  _summary_retries: int = 0
87
94
 
88
95
  def __init__(
@@ -92,18 +99,17 @@ class MARCImportJob:
92
99
  import_profile_name: str,
93
100
  batch_size=10,
94
101
  batch_delay=0,
95
- marc_record_preprocessor=None,
96
- consolidate=False,
102
+ marc_record_preprocessor: Union[List[Callable], str] = [],
103
+ preprocessor_args: Dict[str, Dict] = {},
97
104
  no_progress=False,
98
105
  let_summary_fail=False,
99
106
  split_files=False,
100
107
  split_size=1000,
108
+ split_offset=0,
101
109
  ) -> None:
102
- self.consolidate_files = consolidate
103
110
  self.split_files = split_files
104
111
  self.split_size = split_size
105
- if self.split_files and self.consolidate_files:
106
- raise ValueError("Cannot consolidate and split files at the same time.")
112
+ self.split_offset = split_offset
107
113
  self.no_progress = no_progress
108
114
  self.let_summary_fail = let_summary_fail
109
115
  self.folio_client: folioclient.FolioClient = folio_client
@@ -111,17 +117,17 @@ class MARCImportJob:
111
117
  self.import_profile_name = import_profile_name
112
118
  self.batch_size = batch_size
113
119
  self.batch_delay = batch_delay
114
- self.current_retry_timeout = None
115
- self.marc_record_preprocessor = marc_record_preprocessor
120
+ self.current_retry_timeout = 0
121
+ self.marc_record_preprocessor: MARCPreprocessor = MARCPreprocessor(
122
+ marc_record_preprocessor, **preprocessor_args
123
+ )
116
124
 
117
125
  async def do_work(self) -> None:
118
126
  """
119
127
  Performs the necessary work for data import.
120
128
 
121
129
  This method initializes an HTTP client, files to store records that fail to send,
122
- and calls `self.import_marc_records` to import MARC files. If `consolidate_files` is True,
123
- it imports all the files specified in `import_files` as a single batch. Otherwise,
124
- it imports each file as a separate import job.
130
+ and calls the appropriate method to import MARC files based on the configuration.
125
131
 
126
132
  Returns:
127
133
  None
@@ -146,27 +152,37 @@ class MARCImportJob:
146
152
  self.failed_batches_file = failed_batches
147
153
  logger.info(f"Writing failed batches to {self.failed_batches_file.name}")
148
154
  self.http_client = http_client
149
- if self.consolidate_files:
150
- self.current_file = self.import_files
151
- await self.import_marc_file()
152
- elif self.split_files:
153
- for file in self.import_files:
154
- with open(file, "rb") as f:
155
- file_length = await self.read_total_records([f])
156
- expected_batches = math.ceil(file_length /self.split_size)
157
- logger.info(f"{file.name} contains {file_length} records. Splitting into {expected_batches} {self.split_size} record batches.")
158
- zero_pad_parts = len(str(expected_batches)) if expected_batches > 1 else 2
159
- for idx, batch in enumerate(self.split_marc_file(file, self.split_size), start=1):
160
- batch.name = f"{file.name}_part{idx:0{zero_pad_parts}}"
161
- self.current_file = [batch]
162
- await self.import_marc_file()
163
- self.move_file_to_complete(file)
155
+ if self.split_files:
156
+ await self.process_split_files()
164
157
  else:
165
158
  for file in self.import_files:
166
159
  self.current_file = [file]
167
160
  await self.import_marc_file()
168
161
  await self.wrap_up()
169
162
 
163
async def process_split_files(self) -> None:
    """
    Import every file in `self.import_files` as a series of smaller jobs.

    This method is called by `do_work` when `split_files` is set to True.
    Each file is counted with `read_total_records`, cut into parts of
    `self.split_size` records by `split_marc_file`, and every part after the
    first `self.split_offset` parts is handed to `import_marc_file` on its
    own. Once all parts of a file have been handled, the file is passed to
    `move_file_to_complete`.

    Returns:
        None

    Raises:
        ValueError: If `self.split_size` is less than 1.
    """
    if self.split_size < 1:
        # Fail fast with a clear message instead of a ZeroDivisionError (or a
        # meaningless negative part count) in the arithmetic below.
        raise ValueError(
            f"split_size must be at least 1, got {self.split_size}."
        )

    for file in self.import_files:
        with open(file, "rb") as f:
            file_length = await self.read_total_records([f])
        expected_batches = math.ceil(file_length / self.split_size)
        logger.info(
            f"{file.name} contains {file_length} records. Splitting into {expected_batches} {self.split_size} record batches."
        )
        if 0 < expected_batches <= self.split_offset:
            # Every part would be skipped; say so explicitly, because the file
            # is still moved to import_complete afterwards.
            logger.warning(
                f"Split offset {self.split_offset} skips all {expected_batches} "
                f"parts of {file.name}; no records from this file will be imported."
            )
        # Width used to zero-pad the part number in each part's display name.
        zero_pad_parts = len(str(expected_batches)) if expected_batches > 1 else 2
        for idx, batch in enumerate(
            self.split_marc_file(file, self.split_size), start=1
        ):
            if idx <= self.split_offset:
                # Parts at or below the offset are skipped, not imported.
                continue
            batch.name = f"{file.name} (Part {idx:0{zero_pad_parts}})"
            self.current_file = [batch]
            await self.import_marc_file()
        self.move_file_to_complete(file)
185
+
170
186
  async def wrap_up(self) -> None:
171
187
  """
172
188
  Wraps up the data import process.
@@ -208,22 +224,29 @@ class MARCImportJob:
208
224
  timeout=self.current_retry_timeout,
209
225
  verify=self.folio_client.ssl_verify,
210
226
  ) as temp_client:
227
+ self.folio_client.httpx_client = temp_client
211
228
  job_status = self.folio_client.folio_get(
212
229
  "/metadata-provider/jobExecutions?statusNot=DISCARDED&uiStatusAny"
213
230
  "=PREPARING_FOR_PREVIEW&uiStatusAny=READY_FOR_PREVIEW&uiStatusAny=RUNNING&limit=50"
214
231
  )
215
232
  self.current_retry_timeout = None
216
233
  except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
217
- if not hasattr(e, "response") or e.response.status_code in [502, 504]:
218
- error_text = e.response.text if hasattr(e, "response") else str(e)
219
- logger.warning(f"SERVER ERROR fetching job status: {error_text}. Retrying.")
234
+ error_text = e.response.text if hasattr(e, "response") else str(e)
235
+ if self.current_retry_timeout <= RETRY_TIMEOUT_MAX and (
236
+ not hasattr(e, "response") or e.response.status_code in [502, 504, 401]
237
+ ):
238
+ logger.warning(
239
+ f"SERVER ERROR fetching job status: {error_text}. Retrying."
240
+ )
220
241
  sleep(0.25)
221
- with httpx.Client(
222
- timeout=self.current_retry_timeout,
223
- verify=self.folio_client.ssl_verify,
224
- ) as temp_client:
225
- self.folio_client.httpx_client = temp_client
226
- return await self.get_job_status()
242
+ return await self.get_job_status()
243
+ elif self.current_retry_timeout > RETRY_TIMEOUT_MAX and (
244
+ not hasattr(e, "response") or e.response.status_code in [502, 504, 401]
245
+ ):
246
+ logger.critical(
247
+ f"SERVER ERROR fetching job status: {error_text}. Max retries exceeded."
248
+ )
249
+ raise FolioDataImportJobError(self.job_id, error_text, e)
227
250
  else:
228
251
  raise e
229
252
  except Exception as e:
@@ -236,19 +259,29 @@ class MARCImportJob:
236
259
  self.pbar_imported.update(status["progress"]["current"] - self.last_current)
237
260
  self.last_current = status["progress"]["current"]
238
261
  except (IndexError, ValueError, KeyError):
239
- logger.debug(f"No active job found with ID {self.job_id}. Checking for finished job.")
262
+ logger.debug(
263
+ f"No active job found with ID {self.job_id}. Checking for finished job."
264
+ )
240
265
  try:
241
266
  job_status = self.folio_client.folio_get(
242
267
  "/metadata-provider/jobExecutions?limit=100&sortBy=completed_date%2Cdesc&statusAny"
243
268
  "=COMMITTED&statusAny=ERROR&statusAny=CANCELLED"
244
269
  )
245
270
  status = [
246
- job for job in job_status["jobExecutions"] if job["id"] == self.job_id
271
+ job
272
+ for job in job_status["jobExecutions"]
273
+ if job["id"] == self.job_id
247
274
  ][0]
248
- self.pbar_imported.update(status["progress"]["current"] - self.last_current)
275
+ self.pbar_imported.update(
276
+ status["progress"]["current"] - self.last_current
277
+ )
249
278
  self.last_current = status["progress"]["current"]
250
279
  self.finished = True
251
- except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
280
+ except (
281
+ httpx.ConnectTimeout,
282
+ httpx.ReadTimeout,
283
+ httpx.HTTPStatusError,
284
+ ) as e:
252
285
  if not hasattr(e, "response") or e.response.status_code in [502, 504]:
253
286
  error_text = e.response.text if hasattr(e, "response") else str(e)
254
287
  logger.warning(
@@ -276,7 +309,7 @@ class MARCImportJob:
276
309
  """
277
310
  try:
278
311
  create_job = self.http_client.post(
279
- self.folio_client.okapi_url + "/change-manager/jobExecutions",
312
+ self.folio_client.gateway_url + "/change-manager/jobExecutions",
280
313
  headers=self.folio_client.okapi_headers,
281
314
  json={"sourceType": "ONLINE", "userId": self.folio_client.current_user},
282
315
  )
@@ -325,7 +358,7 @@ class MARCImportJob:
325
358
  The response from the HTTP request to set the job profile.
326
359
  """
327
360
  set_job_profile = self.http_client.put(
328
- self.folio_client.okapi_url
361
+ self.folio_client.gateway_url
329
362
  + "/change-manager/jobExecutions/"
330
363
  + self.job_id
331
364
  + "/jobProfile",
@@ -338,7 +371,7 @@ class MARCImportJob:
338
371
  )
339
372
  try:
340
373
  set_job_profile.raise_for_status()
341
- self.job_hrid = set_job_profile.json()['hrId']
374
+ self.job_hrid = set_job_profile.json()["hrId"]
342
375
  logger.info(f"Job HRID: {self.job_hrid}")
343
376
  except httpx.HTTPError as e:
344
377
  logger.error(
@@ -350,7 +383,7 @@ class MARCImportJob:
350
383
  raise e
351
384
 
352
385
  @staticmethod
353
- async def read_total_records(files) -> int:
386
+ async def read_total_records(files: List[BinaryIO]) -> int:
354
387
  """
355
388
  Reads the total number of records from the given files.
356
389
 
@@ -379,17 +412,15 @@ class MARCImportJob:
379
412
  """
380
413
  try:
381
414
  post_batch = self.http_client.post(
382
- self.folio_client.okapi_url
415
+ self.folio_client.gateway_url
383
416
  + f"/change-manager/jobExecutions/{self.job_id}/records",
384
417
  headers=self.folio_client.okapi_headers,
385
418
  json=batch_payload,
386
419
  )
387
- # if batch_payload["recordsMetadata"]["last"]:
388
- # logger.log(
389
- # 25,
390
- # f"Sending last batch of {batch_payload['recordsMetadata']['total']} records.",
391
- # )
392
420
  except (httpx.ConnectTimeout, httpx.ReadTimeout):
421
+ logger.warning(
422
+ f"CONNECTION ERROR posting batch {batch_payload['id']}. Retrying..."
423
+ )
393
424
  sleep(0.25)
394
425
  return await self.process_record_batch(batch_payload)
395
426
  try:
@@ -397,20 +428,19 @@ class MARCImportJob:
397
428
  self.total_records_sent += len(self.record_batch)
398
429
  self.record_batch = []
399
430
  self.pbar_sent.update(len(batch_payload["initialRecords"]))
400
- except Exception as e:
431
+ except httpx.HTTPStatusError as e:
401
432
  if (
402
- hasattr(e, "response") and e.response.status_code in [500, 422]
403
- ): # TODO: #26 Check for specific error code once https://folio-org.atlassian.net/browse/MODSOURMAN-1281 is resolved
433
+ e.response.status_code in [500, 400, 422]
434
+ ): # TODO: Update once we no longer have to support < Sunflower to just be 400
404
435
  self.total_records_sent += len(self.record_batch)
405
436
  self.record_batch = []
406
437
  self.pbar_sent.update(len(batch_payload["initialRecords"]))
407
438
  else:
408
- logger.error("Error posting batch: " + str(e))
409
439
  for record in self.record_batch:
410
440
  self.failed_batches_file.write(record)
411
- self.error_records += len(self.record_batch)
412
- self.pbar_sent.total = self.pbar_sent.total - len(self.record_batch)
413
- self.record_batch = []
441
+ raise FolioDataImportBatchError(
442
+ batch_payload["id"], f"{e}\n{e.response.text}", e
443
+ )
414
444
  await self.get_job_status()
415
445
  sleep(self.batch_delay)
416
446
 
@@ -439,16 +469,12 @@ class MARCImportJob:
439
469
  await self.create_batch_payload(
440
470
  counter,
441
471
  total_records,
442
- (counter - self.error_records)
443
- == (total_records - self.error_records),
472
+ counter == total_records,
444
473
  ),
445
474
  )
446
475
  sleep(0.25)
447
476
  if record:
448
- if self.marc_record_preprocessor:
449
- record = await self.apply_marc_record_preprocessing(
450
- record, self.marc_record_preprocessor
451
- )
477
+ record = self.marc_record_preprocessor.do_work(record)
452
478
  self.record_batch.append(record.as_marc())
453
479
  counter += 1
454
480
  else:
@@ -459,79 +485,26 @@ class MARCImportJob:
459
485
  "",
460
486
  )
461
487
  self.bad_records_file.write(reader.current_chunk)
462
- if self.record_batch:
463
- await self.process_record_batch(
464
- await self.create_batch_payload(
465
- counter,
466
- total_records,
467
- (counter - self.error_records)
468
- == (total_records - self.error_records),
469
- ),
470
- )
471
488
  if not self.split_files:
472
489
  self.move_file_to_complete(file_path)
490
+ if self.record_batch or not self.finished:
491
+ await self.process_record_batch(
492
+ await self.create_batch_payload(
493
+ counter,
494
+ total_records,
495
+ counter == total_records,
496
+ ),
497
+ )
473
498
 
474
- def move_file_to_complete(self, file_path):
499
+ def move_file_to_complete(self, file_path: Path):
475
500
  import_complete_path = file_path.parent.joinpath("import_complete")
476
501
  if not import_complete_path.exists():
477
- logger.debug(f"Creating import_complete directory: {import_complete_path.absolute()}")
502
+ logger.debug(
503
+ f"Creating import_complete directory: {import_complete_path.absolute()}"
504
+ )
478
505
  import_complete_path.mkdir(exist_ok=True)
479
506
  logger.debug(f"Moving {file_path} to {import_complete_path.absolute()}")
480
- file_path.rename(
481
- file_path.parent.joinpath("import_complete", file_path.name)
482
- )
483
-
484
- @staticmethod
485
- async def apply_marc_record_preprocessing(
486
- record: pymarc.Record, func_or_path
487
- ) -> pymarc.Record:
488
- """
489
- Apply preprocessing to the MARC record before sending it to FOLIO.
490
-
491
- Args:
492
- record (pymarc.Record): The MARC record to preprocess.
493
- func_or_path (Union[Callable, str]): The preprocessing function or its import path.
494
-
495
- Returns:
496
- pymarc.Record: The preprocessed MARC record.
497
- """
498
- if isinstance(func_or_path, str):
499
- func_paths = func_or_path.split(",")
500
- for func_path in func_paths:
501
- record = await MARCImportJob._apply_single_marc_record_preprocessing_by_path(
502
- record, func_path
503
- )
504
- elif callable(func_or_path):
505
- record = func_or_path(record)
506
- else:
507
- logger.warning(
508
- f"Invalid preprocessing function: {func_or_path}. Skipping preprocessing."
509
- )
510
- return record
511
-
512
- async def _apply_single_marc_record_preprocessing_by_path(
513
- record: pymarc.Record, func_path: str
514
- ) -> pymarc.Record:
515
- """
516
- Apply a single preprocessing function to the MARC record.
517
-
518
- Args:
519
- record (pymarc.Record): The MARC record to preprocess.
520
- func_path (str): The path to the preprocessing function.
521
-
522
- Returns:
523
- pymarc.Record: The preprocessed MARC record.
524
- """
525
- try:
526
- module_path, func_name = func_path.rsplit(".", 1)
527
- module = importlib.import_module(module_path)
528
- func = getattr(module, func_name)
529
- record = func(record)
530
- except Exception as e:
531
- logger.warning(
532
- f"Error applying preprocessing function {func_path}: {e}. Skipping."
533
- )
534
- return record
507
+ file_path.rename(file_path.parent.joinpath("import_complete", file_path.name))
535
508
 
536
509
  async def create_batch_payload(self, counter, total_records, is_last) -> dict:
537
510
  """
@@ -549,9 +522,9 @@ class MARCImportJob:
549
522
  "id": str(uuid.uuid4()),
550
523
  "recordsMetadata": {
551
524
  "last": is_last,
552
- "counter": counter - self.error_records,
525
+ "counter": counter,
553
526
  "contentType": "MARC_RAW",
554
- "total": total_records - self.error_records,
527
+ "total": total_records,
555
528
  },
556
529
  "initialRecords": [{"record": x.decode()} for x in self.record_batch],
557
530
  }
@@ -575,11 +548,15 @@ class MARCImportJob:
575
548
 
576
549
  record_body = f.read(record_length - 24)
577
550
  if len(record_body) != record_length - 24:
578
- raise ValueError("Unexpected end of file while reading MARC record.")
551
+ raise ValueError(
552
+ "Unexpected end of file while reading MARC record."
553
+ )
579
554
 
580
555
  # Verify record terminator
581
- if record_body[-1:] != b'\x1D':
582
- raise ValueError("MARC record does not end with the expected terminator (0x1D).")
556
+ if record_body[-1:] != b"\x1d":
557
+ raise ValueError(
558
+ "MARC record does not end with the expected terminator (0x1D)."
559
+ )
583
560
 
584
561
  # Write the full record to the batch buffer
585
562
  batch.write(leader + record_body)
@@ -620,12 +597,11 @@ class MARCImportJob:
620
597
  try:
621
598
  if isinstance(self.current_file[0], Path):
622
599
  files = [
623
- stack.enter_context(open(file, "rb")) for file in self.current_file
600
+ stack.enter_context(open(file, "rb"))
601
+ for file in self.current_file
624
602
  ]
625
603
  elif isinstance(self.current_file[0], io.BytesIO):
626
- files = [
627
- stack.enter_context(file) for file in self.current_file
628
- ]
604
+ files = [stack.enter_context(file) for file in self.current_file]
629
605
  else:
630
606
  raise ValueError("Invalid file type. Must be Path or BytesIO.")
631
607
  except IndexError as e:
@@ -646,17 +622,62 @@ class MARCImportJob:
646
622
  disable=self.no_progress,
647
623
  ) as pbar_sent,
648
624
  ):
649
- self.pbar_sent = pbar_sent
650
- self.pbar_imported = pbar_imported
651
- await self.process_records(files, total_records)
652
- while not self.finished:
653
- await self.get_job_status()
654
- sleep(1)
625
+ try:
626
+ self.pbar_sent = pbar_sent
627
+ self.pbar_imported = pbar_imported
628
+ await self.process_records(files, total_records)
629
+ while not self.finished:
630
+ await self.get_job_status()
631
+ await asyncio.sleep(5)
632
+ except FolioDataImportBatchError as e:
633
+ logger.error(
634
+ f"Unhandled error posting batch {e.batch_id}: {e.message}"
635
+ )
636
+ await self.cancel_job()
637
+ raise e
638
+ except FolioDataImportJobError as e:
639
+ await self.cancel_job()
640
+ if self._job_retries < self._max_job_retries:
641
+ self._job_retries += 1
642
+ logger.error(
643
+ f"Unhandled error processing job {e.job_id}: {e.message}, cancelling and retrying."
644
+ )
645
+ await self.import_marc_file()
646
+ else:
647
+ logger.critical(
648
+ f"Unhandled error processing job {e.job_id}: {e.message}, cancelling and exiting (maximum retries reached)."
649
+ )
650
+ raise e
655
651
  if self.finished:
656
652
  await self.log_job_summary()
657
653
  self.last_current = 0
658
654
  self.finished = False
659
655
 
656
async def cancel_job(self) -> None:
    """
    Cancels the current job execution.

    Sends a DELETE request for the records of job `self.job_id` and logs the
    result. On success `self.finished` is set to True. Connect/read timeouts
    are retried a bounded number of times with a short non-blocking pause
    between attempts. If every attempt times out, the failure is logged and
    the method returns without raising. It is called from exception handlers,
    so raising here would mask the error that triggered the cancellation.

    Returns:
        None

    Raises:
        httpx.HTTPStatusError: If the server answers the cancel request with
            an error status (propagated from `raise_for_status`).
    """
    # Upper bound on attempts, so a dead connection cannot retry forever.
    max_attempts = 5
    for attempt in range(1, max_attempts + 1):
        try:
            cancel = self.http_client.delete(
                self.folio_client.gateway_url
                + f"/change-manager/jobExecutions/{self.job_id}/records",
                headers=self.folio_client.okapi_headers,
            )
            cancel.raise_for_status()
        except (httpx.ConnectTimeout, httpx.ReadTimeout):
            if attempt == max_attempts:
                logger.error(
                    f"CONNECTION ERROR cancelling job {self.job_id}. "
                    f"Giving up after {max_attempts} attempts."
                )
                return
            logger.warning(
                f"CONNECTION ERROR cancelling job {self.job_id}. Retrying..."
            )
            # Yield to the event loop instead of blocking it with time.sleep.
            await asyncio.sleep(0.25)
        else:
            self.finished = True
            logger.info(f"Cancelled job: {self.job_id}")
            return
680
+
660
681
  async def log_job_summary(self):
661
682
  if job_summary := await self.get_job_summary():
662
683
  job_id = job_summary.pop("jobExecutionId", None)
@@ -675,22 +696,22 @@ class MARCImportJob:
675
696
  table_data.append(table_row)
676
697
  table_data.sort(key=lambda x: REPORT_SUMMARY_ORDERING.get(x[0], 99))
677
698
  columns = columns[:1] + [
678
- " ".join(decamelize(x).split("_")[:-1]) for x in columns[1:]
679
- ]
699
+ " ".join(decamelize(x).split("_")[:-1]) for x in columns[1:]
700
+ ]
680
701
  logger.info(
681
- f"Results for {'file' if len(self.current_file) == 1 else 'files'}: "
682
- f"{', '.join([os.path.basename(x.name) for x in self.current_file])}"
683
- )
702
+ f"Results for {'file' if len(self.current_file) == 1 else 'files'}: "
703
+ f"{', '.join([os.path.basename(x.name) for x in self.current_file])}"
704
+ )
684
705
  logger.info(
685
- "\n"
686
- + tabulate.tabulate(
687
- table_data, headers=columns, tablefmt="fancy_grid"
688
- ),
689
- )
706
+ "\n"
707
+ + tabulate.tabulate(table_data, headers=columns, tablefmt="fancy_grid"),
708
+ )
690
709
  if total_errors:
691
710
  logger.info(f"Total errors: {total_errors}. Job ID: {job_id}.")
692
711
  else:
693
- logger.error(f"No job summary available for job #{self.job_hrid}({self.job_id}).")
712
+ logger.error(
713
+ f"No job summary available for job #{self.job_hrid}({self.job_id})."
714
+ )
694
715
 
695
716
  async def get_job_summary(self) -> dict:
696
717
  """
@@ -715,8 +736,10 @@ class MARCImportJob:
715
736
  self.current_retry_timeout = None
716
737
  except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
717
738
  error_text = e.response.text if hasattr(e, "response") else str(e)
718
- if (self._max_summary_retries > self._summary_retries) and (not hasattr(e, "response") or (
719
- hasattr(e, "response") and e.response.status_code in [502, 504]) and not self.let_summary_fail
739
+ if (self._max_summary_retries > self._summary_retries) and (
740
+ not hasattr(e, "response")
741
+ or (hasattr(e, "response") and e.response.status_code in [502, 504])
742
+ and not self.let_summary_fail
720
743
  ):
721
744
  logger.warning(f"SERVER ERROR fetching job summary: {e}. Retrying.")
722
745
  sleep(0.25)
@@ -727,8 +750,9 @@ class MARCImportJob:
727
750
  self.folio_client.httpx_client = temp_client
728
751
  self._summary_retries += 1
729
752
  return await self.get_job_summary()
730
- elif (self._summary_retries >= self._max_summary_retries) or (hasattr(e, "response") and (
731
- e.response.status_code in [502, 504] and self.let_summary_fail)
753
+ elif (self._summary_retries >= self._max_summary_retries) or (
754
+ hasattr(e, "response")
755
+ and (e.response.status_code in [502, 504] and self.let_summary_fail)
732
756
  ):
733
757
  logger.warning(
734
758
  f"SERVER ERROR fetching job summary: {error_text}. Skipping final summary check."
@@ -833,19 +857,10 @@ async def main() -> None:
833
857
  "to apply to each MARC record before sending to FOLIO. Function should take "
834
858
  "a pymarc.Record object as input and return a pymarc.Record object."
835
859
  ),
836
- default=None,
837
- )
838
- # Add mutually exclusive group for consolidate and split-files options
839
- group = parser.add_mutually_exclusive_group()
840
- group.add_argument(
841
- "--consolidate",
842
- action="store_true",
843
- help=(
844
- "Consolidate records into a single job. "
845
- "Default is to create a new job for each MARC file."
846
- ),
860
+ default="",
847
861
  )
848
- group.add_argument(
862
+
863
+ parser.add_argument(
849
864
  "--split-files",
850
865
  action="store_true",
851
866
  help="Split files into smaller parts before importing.",
@@ -856,6 +871,12 @@ async def main() -> None:
856
871
  help="The number of records to include in each split file.",
857
872
  default=1000,
858
873
  )
874
+ parser.add_argument(
875
+ "--split-offset",
876
+ type=int,
877
+ help="The number of record batches of <split-size> to skip before starting import.",
878
+ default=0,
879
+ )
859
880
 
860
881
  parser.add_argument(
861
882
  "--no-progress",
@@ -867,6 +888,16 @@ async def main() -> None:
867
888
  action="store_true",
868
889
  help="Do not retry fetching the final job summary if it fails",
869
890
  )
891
+ parser.add_argument(
892
+ "--preprocessor-config",
893
+ type=str,
894
+ help=(
895
+ "JSON file containing configuration for preprocessor functions. "
896
+ "This is passed to MARCPreprocessor class as a dict of dicts."
897
+ ),
898
+ default=None,
899
+ )
900
+
870
901
  args = parser.parse_args()
871
902
  if not args.password:
872
903
  args.password = getpass("Enter FOLIO password: ")
@@ -891,6 +922,12 @@ async def main() -> None:
891
922
  else:
892
923
  logger.info(marc_files)
893
924
 
925
+ if args.preprocessor_config:
926
+ with open(args.preprocessor_config, "r") as f:
927
+ preprocessor_args = json.load(f)
928
+ else:
929
+ preprocessor_args = {}
930
+
894
931
  if not args.import_profile_name:
895
932
  import_profiles = folio_client.folio_get(
896
933
  "/data-import-profiles/jobProfiles",
@@ -919,11 +956,12 @@ async def main() -> None:
919
956
  batch_size=args.batch_size,
920
957
  batch_delay=args.batch_delay,
921
958
  marc_record_preprocessor=args.preprocessor,
922
- consolidate=bool(args.consolidate),
959
+ preprocessor_args=preprocessor_args,
923
960
  no_progress=bool(args.no_progress),
924
961
  let_summary_fail=bool(args.let_summary_fail),
925
962
  split_files=bool(args.split_files),
926
963
  split_size=args.split_size,
964
+ split_offset=args.split_offset,
927
965
  ).do_work()
928
966
  except Exception as e:
929
967
  logger.error("Error importing files: " + str(e))