PyPI - folio-data-import - Versions diffs - 0.2.8rc12__py3-none-any.whl → 0.3.1__py3-none-any.whl - Mend - Supply Chain Defender

folio-data-import 0.2.8rc12__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of folio-data-import might be problematic. Click here for more details.

Files changed (10)

folio_data_import/MARCDataImport.py CHANGED Viewed

@@ -2,8 +2,8 @@ import argparse
 import asyncio
 import datetime
 import glob
-import importlib
 import io
+import json
 import logging
 import math
 import os
@@ -15,7 +15,7 @@ from functools import cached_property
 from getpass import getpass
 from pathlib import Path
 from time import sleep
-from typing import List, Union
+from typing import BinaryIO, Callable, Dict, List, Union
 import folioclient
 import httpx
@@ -25,6 +25,9 @@ import tabulate
 from humps import decamelize
 from tqdm import tqdm
+from folio_data_import.custom_exceptions import FolioDataImportBatchError, FolioDataImportJobError
+from folio_data_import.marc_preprocessors._preprocessors import MARCPreprocessor
 try:
     datetime_utc = datetime.UTC
 except AttributeError:
@@ -35,21 +38,25 @@ except AttributeError:
 REPORT_SUMMARY_ORDERING = {"created": 0, "updated": 1, "discarded": 2, "error": 3}
 # Set default timeout and backoff values for HTTP requests when retrying job status and final summary checks
-RETRY_TIMEOUT_START = 1
-RETRY_TIMEOUT_RETRY_FACTOR = 2
+RETRY_TIMEOUT_START = 5
+RETRY_TIMEOUT_RETRY_FACTOR = 1.5
+RETRY_TIMEOUT_MAX = 25.32
 # Custom log level for data issues, set to 26
 DATA_ISSUE_LVL_NUM = 26
 logging.addLevelName(DATA_ISSUE_LVL_NUM, "DATA_ISSUES")
 def data_issues(self, msg, *args, **kws):
     if self.isEnabledFor(DATA_ISSUE_LVL_NUM):
         self._log(DATA_ISSUE_LVL_NUM, msg, args, **kws)
 logging.Logger.data_issues = data_issues
 logger = logging.getLogger(__name__)
 class MARCImportJob:
     """
     Class to manage importing MARC data (Bib, Authority) into FOLIO using the Change Manager
@@ -63,7 +70,6 @@ class MARCImportJob:
         import_profile_name (str): The name of the data import job profile to use.
         batch_size (int): The number of source records to include in a record batch (default=10).
         batch_delay (float): The number of seconds to wait between record batches (default=0).
-        consolidate (bool): Consolidate files into a single job. Default is one job for each file.
         no_progress (bool): Disable progress bars (eg. for running in a CI environment).
     """
@@ -75,14 +81,15 @@ class MARCImportJob:
     http_client: httpx.Client
     current_file: List[Path]
     record_batch: List[dict] = []
-    error_records: int = 0
     last_current: int = 0
     total_records_sent: int = 0
     finished: bool = False
     job_id: str = ""
     job_hrid: int = 0
-    current_file: Union[List[Path],List[io.BytesIO]] = []
+    current_file: Union[List[Path], List[io.BytesIO]] = []
     _max_summary_retries: int = 2
+    _max_job_retries: int = 2
+    _job_retries: int = 0
     _summary_retries: int = 0
     def __init__(
@@ -92,18 +99,17 @@ class MARCImportJob:
         import_profile_name: str,
         batch_size=10,
         batch_delay=0,
-        marc_record_preprocessor=None,
-        consolidate=False,
+        marc_record_preprocessor: Union[List[Callable], str] = [],
+        preprocessor_args: Dict[str, Dict] = {},
         no_progress=False,
         let_summary_fail=False,
         split_files=False,
         split_size=1000,
+        split_offset=0,
     ) -> None:
-        self.consolidate_files = consolidate
         self.split_files = split_files
         self.split_size = split_size
-        if self.split_files and self.consolidate_files:
-            raise ValueError("Cannot consolidate and split files at the same time.")
+        self.split_offset = split_offset
         self.no_progress = no_progress
         self.let_summary_fail = let_summary_fail
         self.folio_client: folioclient.FolioClient = folio_client
@@ -111,17 +117,17 @@ class MARCImportJob:
         self.import_profile_name = import_profile_name
         self.batch_size = batch_size
         self.batch_delay = batch_delay
-        self.current_retry_timeout = None
-        self.marc_record_preprocessor = marc_record_preprocessor
+        self.current_retry_timeout = 0
+        self.marc_record_preprocessor: MARCPreprocessor = MARCPreprocessor(
+            marc_record_preprocessor, **preprocessor_args
+        )
     async def do_work(self) -> None:
         """
         Performs the necessary work for data import.
         This method initializes an HTTP client, files to store records that fail to send,
-        and calls `self.import_marc_records` to import MARC files. If `consolidate_files` is True,
-        it imports all the files specified in `import_files` as a single batch. Otherwise,
-        it imports each file as a separate import job.
+        and calls the appropriate method to import MARC files based on the configuration.
         Returns:
             None
@@ -146,27 +152,37 @@ class MARCImportJob:
             self.failed_batches_file = failed_batches
             logger.info(f"Writing failed batches to {self.failed_batches_file.name}")
             self.http_client = http_client
-            if self.consolidate_files:
-                self.current_file = self.import_files
-                await self.import_marc_file()
-            elif self.split_files:
-                for file in self.import_files:
-                    with open(file, "rb") as f:
-                        file_length = await self.read_total_records([f])
-                    expected_batches = math.ceil(file_length /self.split_size)
-                    logger.info(f"{file.name} contains {file_length} records. Splitting into {expected_batches} {self.split_size} record batches.")
-                    zero_pad_parts = len(str(expected_batches)) if expected_batches > 1 else 2
-                    for idx, batch in enumerate(self.split_marc_file(file, self.split_size), start=1):
-                        batch.name = f"{file.name}_part{idx:0{zero_pad_parts}}"
-                        self.current_file = [batch]
-                        await self.import_marc_file()
-                    self.move_file_to_complete(file)
+            if self.split_files:
+                await self.process_split_files()
             else:
                 for file in self.import_files:
                     self.current_file = [file]
                     await self.import_marc_file()
             await self.wrap_up()
+    async def process_split_files(self):
+        """
+        Process the import of files in smaller batches.
+        This method is called when `split_files` is set to True.
+        It splits each file into smaller chunks and processes them one by one.
+        """
+        for file in self.import_files:
+            with open(file, "rb") as f:
+                file_length = await self.read_total_records([f])
+            expected_batches = math.ceil(file_length / self.split_size)
+            logger.info(
+                f"{file.name} contains {file_length} records. Splitting into {expected_batches} {self.split_size} record batches."
+            )
+            zero_pad_parts = len(str(expected_batches)) if expected_batches > 1 else 2
+            for idx, batch in enumerate(
+                self.split_marc_file(file, self.split_size), start=1
+            ):
+                if idx > self.split_offset:
+                    batch.name = f"{file.name} (Part {idx:0{zero_pad_parts}})"
+                    self.current_file = [batch]
+                    await self.import_marc_file()
+            self.move_file_to_complete(file)
     async def wrap_up(self) -> None:
         """
         Wraps up the data import process.
@@ -208,22 +224,29 @@ class MARCImportJob:
                 timeout=self.current_retry_timeout,
                 verify=self.folio_client.ssl_verify,
             ) as temp_client:
+                self.folio_client.httpx_client = temp_client
                 job_status = self.folio_client.folio_get(
                     "/metadata-provider/jobExecutions?statusNot=DISCARDED&uiStatusAny"
                     "=PREPARING_FOR_PREVIEW&uiStatusAny=READY_FOR_PREVIEW&uiStatusAny=RUNNING&limit=50"
                 )
                 self.current_retry_timeout = None
         except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
-            if not hasattr(e, "response") or e.response.status_code in [502, 504]:
-                error_text = e.response.text if hasattr(e, "response") else str(e)
-                logger.warning(f"SERVER ERROR fetching job status: {error_text}. Retrying.")
+            error_text = e.response.text if hasattr(e, "response") else str(e)
+            if self.current_retry_timeout <= RETRY_TIMEOUT_MAX and (
+                not hasattr(e, "response") or e.response.status_code in [502, 504, 401]
+            ):
+                logger.warning(
+                    f"SERVER ERROR fetching job status: {error_text}. Retrying."
+                )
                 sleep(0.25)
-                with httpx.Client(
-                    timeout=self.current_retry_timeout,
-                    verify=self.folio_client.ssl_verify,
-                ) as temp_client:
-                    self.folio_client.httpx_client = temp_client
-                    return await self.get_job_status()
+                return await self.get_job_status()
+            elif self.current_retry_timeout > RETRY_TIMEOUT_MAX and (
+                not hasattr(e, "response") or e.response.status_code in [502, 504, 401]
+            ):
+                logger.critical(
+                    f"SERVER ERROR fetching job status: {error_text}. Max retries exceeded."
+                )
+                raise FolioDataImportJobError(self.job_id, error_text, e)
             else:
                 raise e
         except Exception as e:
@@ -236,19 +259,29 @@ class MARCImportJob:
             self.pbar_imported.update(status["progress"]["current"] - self.last_current)
             self.last_current = status["progress"]["current"]
         except (IndexError, ValueError, KeyError):
-            logger.debug(f"No active job found with ID {self.job_id}. Checking for finished job.")
+            logger.debug(
+                f"No active job found with ID {self.job_id}. Checking for finished job."
+            )
             try:
                 job_status = self.folio_client.folio_get(
                     "/metadata-provider/jobExecutions?limit=100&sortBy=completed_date%2Cdesc&statusAny"
                     "=COMMITTED&statusAny=ERROR&statusAny=CANCELLED"
                 )
                 status = [
-                    job for job in job_status["jobExecutions"] if job["id"] == self.job_id
+                    job
+                    for job in job_status["jobExecutions"]
+                    if job["id"] == self.job_id
                 ][0]
-                self.pbar_imported.update(status["progress"]["current"] - self.last_current)
+                self.pbar_imported.update(
+                    status["progress"]["current"] - self.last_current
+                )
                 self.last_current = status["progress"]["current"]
                 self.finished = True
-            except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
+            except (
+                httpx.ConnectTimeout,
+                httpx.ReadTimeout,
+                httpx.HTTPStatusError,
+            ) as e:
                 if not hasattr(e, "response") or e.response.status_code in [502, 504]:
                     error_text = e.response.text if hasattr(e, "response") else str(e)
                     logger.warning(
@@ -276,7 +309,7 @@ class MARCImportJob:
         """
         try:
             create_job = self.http_client.post(
-                self.folio_client.okapi_url + "/change-manager/jobExecutions",
+                self.folio_client.gateway_url + "/change-manager/jobExecutions",
                 headers=self.folio_client.okapi_headers,
                 json={"sourceType": "ONLINE", "userId": self.folio_client.current_user},
             )
@@ -325,7 +358,7 @@ class MARCImportJob:
             The response from the HTTP request to set the job profile.
         """
         set_job_profile = self.http_client.put(
-            self.folio_client.okapi_url
+            self.folio_client.gateway_url
             + "/change-manager/jobExecutions/"
             + self.job_id
             + "/jobProfile",
@@ -338,7 +371,7 @@ class MARCImportJob:
         )
         try:
             set_job_profile.raise_for_status()
-            self.job_hrid = set_job_profile.json()['hrId']
+            self.job_hrid = set_job_profile.json()["hrId"]
             logger.info(f"Job HRID: {self.job_hrid}")
         except httpx.HTTPError as e:
             logger.error(
@@ -350,7 +383,7 @@ class MARCImportJob:
             raise e
     @staticmethod
-    async def read_total_records(files) -> int:
+    async def read_total_records(files: List[BinaryIO]) -> int:
         """
         Reads the total number of records from the given files.
@@ -379,17 +412,15 @@ class MARCImportJob:
         """
         try:
             post_batch = self.http_client.post(
-                self.folio_client.okapi_url
+                self.folio_client.gateway_url
                 + f"/change-manager/jobExecutions/{self.job_id}/records",
                 headers=self.folio_client.okapi_headers,
                 json=batch_payload,
             )
-            # if batch_payload["recordsMetadata"]["last"]:
-            #     logger.log(
-            #         25,
-            #         f"Sending last batch of {batch_payload['recordsMetadata']['total']} records.",
-            #     )
         except (httpx.ConnectTimeout, httpx.ReadTimeout):
+            logger.warning(
+                f"CONNECTION ERROR posting batch {batch_payload['id']}. Retrying..."
+            )
             sleep(0.25)
             return await self.process_record_batch(batch_payload)
         try:
@@ -397,20 +428,19 @@ class MARCImportJob:
             self.total_records_sent += len(self.record_batch)
             self.record_batch = []
             self.pbar_sent.update(len(batch_payload["initialRecords"]))
-        except Exception as e:
+        except httpx.HTTPStatusError as e:
             if (
-                hasattr(e, "response") and e.response.status_code in [500, 422]
-            ):  # TODO: #26 Check for specific error code once https://folio-org.atlassian.net/browse/MODSOURMAN-1281 is resolved
+                e.response.status_code in [500, 400, 422]
+            ):  # TODO: Update once we no longer have to support < Sunflower to just be 400
                 self.total_records_sent += len(self.record_batch)
                 self.record_batch = []
                 self.pbar_sent.update(len(batch_payload["initialRecords"]))
             else:
-                logger.error("Error posting batch: " + str(e))
                 for record in self.record_batch:
                     self.failed_batches_file.write(record)
-                    self.error_records += len(self.record_batch)
-                    self.pbar_sent.total = self.pbar_sent.total - len(self.record_batch)
-                self.record_batch = []
+                raise FolioDataImportBatchError(
+                    batch_payload["id"], f"{e}\n{e.response.text}", e
+                )
         await self.get_job_status()
         sleep(self.batch_delay)
@@ -439,16 +469,12 @@ class MARCImportJob:
                         await self.create_batch_payload(
                             counter,
                             total_records,
-                            (counter - self.error_records)
-                            == (total_records - self.error_records),
+                            counter == total_records,
                         ),
                     )
                     sleep(0.25)
                 if record:
-                    if self.marc_record_preprocessor:
-                        record = await self.apply_marc_record_preprocessing(
-                            record, self.marc_record_preprocessor
-                        )
+                    record = self.marc_record_preprocessor.do_work(record)
                     self.record_batch.append(record.as_marc())
                     counter += 1
                 else:
@@ -459,79 +485,26 @@ class MARCImportJob:
                         "",
                     )
                     self.bad_records_file.write(reader.current_chunk)
-            if self.record_batch:
-                await self.process_record_batch(
-                    await self.create_batch_payload(
-                        counter,
-                        total_records,
-                        (counter - self.error_records)
-                        == (total_records - self.error_records),
-                    ),
-                )
             if not self.split_files:
                 self.move_file_to_complete(file_path)
+        if self.record_batch or not self.finished:
+            await self.process_record_batch(
+                await self.create_batch_payload(
+                    counter,
+                    total_records,
+                    counter == total_records,
+                ),
+            )
-    def move_file_to_complete(self, file_path):
+    def move_file_to_complete(self, file_path: Path):
         import_complete_path = file_path.parent.joinpath("import_complete")
         if not import_complete_path.exists():
-            logger.debug(f"Creating import_complete directory: {import_complete_path.absolute()}")
+            logger.debug(
+                f"Creating import_complete directory: {import_complete_path.absolute()}"
+            )
             import_complete_path.mkdir(exist_ok=True)
         logger.debug(f"Moving {file_path} to {import_complete_path.absolute()}")
-        file_path.rename(
-                file_path.parent.joinpath("import_complete", file_path.name)
-            )
-    @staticmethod
-    async def apply_marc_record_preprocessing(
-        record: pymarc.Record, func_or_path
-    ) -> pymarc.Record:
-        """
-        Apply preprocessing to the MARC record before sending it to FOLIO.
-        Args:
-            record (pymarc.Record): The MARC record to preprocess.
-            func_or_path (Union[Callable, str]): The preprocessing function or its import path.
-        Returns:
-            pymarc.Record: The preprocessed MARC record.
-        """
-        if isinstance(func_or_path, str):
-            func_paths = func_or_path.split(",")
-            for func_path in func_paths:
-                record = await MARCImportJob._apply_single_marc_record_preprocessing_by_path(
-                    record, func_path
-                )
-        elif callable(func_or_path):
-            record = func_or_path(record)
-        else:
-            logger.warning(
-                f"Invalid preprocessing function: {func_or_path}. Skipping preprocessing."
-            )
-        return record
-    async def _apply_single_marc_record_preprocessing_by_path(
-        record: pymarc.Record, func_path: str
-    ) -> pymarc.Record:
-        """
-        Apply a single preprocessing function to the MARC record.
-        Args:
-            record (pymarc.Record): The MARC record to preprocess.
-            func_path (str): The path to the preprocessing function.
-        Returns:
-            pymarc.Record: The preprocessed MARC record.
-        """
-        try:
-            module_path, func_name = func_path.rsplit(".", 1)
-            module = importlib.import_module(module_path)
-            func = getattr(module, func_name)
-            record = func(record)
-        except Exception as e:
-            logger.warning(
-                f"Error applying preprocessing function {func_path}: {e}. Skipping."
-            )
-        return record
+        file_path.rename(file_path.parent.joinpath("import_complete", file_path.name))
     async def create_batch_payload(self, counter, total_records, is_last) -> dict:
         """
@@ -549,9 +522,9 @@ class MARCImportJob:
             "id": str(uuid.uuid4()),
             "recordsMetadata": {
                 "last": is_last,
-                "counter": counter - self.error_records,
+                "counter": counter,
                 "contentType": "MARC_RAW",
-                "total": total_records - self.error_records,
+                "total": total_records,
             },
             "initialRecords": [{"record": x.decode()} for x in self.record_batch],
         }
@@ -575,11 +548,15 @@ class MARCImportJob:
                 record_body = f.read(record_length - 24)
                 if len(record_body) != record_length - 24:
-                    raise ValueError("Unexpected end of file while reading MARC record.")
+                    raise ValueError(
+                        "Unexpected end of file while reading MARC record."
+                    )
                 # Verify record terminator
-                if record_body[-1:] != b'\x1D':
-                    raise ValueError("MARC record does not end with the expected terminator (0x1D).")
+                if record_body[-1:] != b"\x1d":
+                    raise ValueError(
+                        "MARC record does not end with the expected terminator (0x1D)."
+                    )
                 # Write the full record to the batch buffer
                 batch.write(leader + record_body)
@@ -620,12 +597,11 @@ class MARCImportJob:
             try:
                 if isinstance(self.current_file[0], Path):
                     files = [
-                        stack.enter_context(open(file, "rb")) for file in self.current_file
+                        stack.enter_context(open(file, "rb"))
+                        for file in self.current_file
                     ]
                 elif isinstance(self.current_file[0], io.BytesIO):
-                    files = [
-                        stack.enter_context(file) for file in self.current_file
-                        ]
+                    files = [stack.enter_context(file) for file in self.current_file]
                 else:
                     raise ValueError("Invalid file type. Must be Path or BytesIO.")
             except IndexError as e:
@@ -646,17 +622,62 @@ class MARCImportJob:
                     disable=self.no_progress,
                 ) as pbar_sent,
             ):
-                self.pbar_sent = pbar_sent
-                self.pbar_imported = pbar_imported
-                await self.process_records(files, total_records)
-                while not self.finished:
-                    await self.get_job_status()
-                sleep(1)
+                try:
+                    self.pbar_sent = pbar_sent
+                    self.pbar_imported = pbar_imported
+                    await self.process_records(files, total_records)
+                    while not self.finished:
+                        await self.get_job_status()
+                    await asyncio.sleep(5)
+                except FolioDataImportBatchError as e:
+                    logger.error(
+                        f"Unhandled error posting batch {e.batch_id}: {e.message}"
+                    )
+                    await self.cancel_job()
+                    raise e
+                except FolioDataImportJobError as e:
+                    await self.cancel_job()
+                    if self._job_retries < self._max_job_retries:
+                        self._job_retries += 1
+                        logger.error(
+                            f"Unhandled error processing job {e.job_id}: {e.message}, cancelling and retrying."
+                        )
+                        await self.import_marc_file()
+                    else:
+                        logger.critical(
+                            f"Unhandled error processing job {e.job_id}: {e.message}, cancelling and exiting (maximum retries reached)."
+                        )
+                        raise e
             if self.finished:
                 await self.log_job_summary()
             self.last_current = 0
             self.finished = False
+    async def cancel_job(self) -> None:
+        """
+        Cancels the current job execution.
+        This method sends a request to cancel the job execution and logs the result.
+        Returns:
+            None
+        """
+        try:
+            cancel = self.http_client.delete(
+                self.folio_client.gateway_url
+                + f"/change-manager/jobExecutions/{self.job_id}/records",
+                headers=self.folio_client.okapi_headers,
+            )
+            cancel.raise_for_status()
+            self.finished = True
+            logger.info(f"Cancelled job: {self.job_id}")
+        except (httpx.ConnectTimeout, httpx.ReadTimeout):
+            logger.warning(
+                f"CONNECTION ERROR cancelling job {self.job_id}. Retrying..."
+            )
+            sleep(0.25)
+            await self.cancel_job()
     async def log_job_summary(self):
         if job_summary := await self.get_job_summary():
             job_id = job_summary.pop("jobExecutionId", None)
@@ -675,22 +696,22 @@ class MARCImportJob:
                 table_data.append(table_row)
             table_data.sort(key=lambda x: REPORT_SUMMARY_ORDERING.get(x[0], 99))
             columns = columns[:1] + [
-                        " ".join(decamelize(x).split("_")[:-1]) for x in columns[1:]
-                    ]
+                " ".join(decamelize(x).split("_")[:-1]) for x in columns[1:]
+            ]
             logger.info(
-                        f"Results for {'file' if len(self.current_file) == 1 else 'files'}: "
-                        f"{', '.join([os.path.basename(x.name) for x in self.current_file])}"
-                    )
+                f"Results for {'file' if len(self.current_file) == 1 else 'files'}: "
+                f"{', '.join([os.path.basename(x.name) for x in self.current_file])}"
+            )
             logger.info(
-                        "\n"
-                        + tabulate.tabulate(
-                            table_data, headers=columns, tablefmt="fancy_grid"
-                        ),
-                    )
+                "\n"
+                + tabulate.tabulate(table_data, headers=columns, tablefmt="fancy_grid"),
+            )
             if total_errors:
                 logger.info(f"Total errors: {total_errors}. Job ID: {job_id}.")
         else:
-            logger.error(f"No job summary available for job #{self.job_hrid}({self.job_id}).")
+            logger.error(
+                f"No job summary available for job #{self.job_hrid}({self.job_id})."
+            )
     async def get_job_summary(self) -> dict:
         """
@@ -715,8 +736,10 @@ class MARCImportJob:
             self.current_retry_timeout = None
         except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
             error_text = e.response.text if hasattr(e, "response") else str(e)
-            if (self._max_summary_retries > self._summary_retries) and (not hasattr(e, "response") or (
-                hasattr(e, "response") and e.response.status_code in [502, 504]) and not self.let_summary_fail
+            if (self._max_summary_retries > self._summary_retries) and (
+                not hasattr(e, "response")
+                or (hasattr(e, "response") and e.response.status_code in [502, 504])
+                and not self.let_summary_fail
             ):
                 logger.warning(f"SERVER ERROR fetching job summary: {e}. Retrying.")
                 sleep(0.25)
@@ -727,8 +750,9 @@ class MARCImportJob:
                     self.folio_client.httpx_client = temp_client
                     self._summary_retries += 1
                     return await self.get_job_summary()
-            elif (self._summary_retries >= self._max_summary_retries) or (hasattr(e, "response") and (
-                e.response.status_code in [502, 504] and self.let_summary_fail)
+            elif (self._summary_retries >= self._max_summary_retries) or (
+                hasattr(e, "response")
+                and (e.response.status_code in [502, 504] and self.let_summary_fail)
             ):
                 logger.warning(
                     f"SERVER ERROR fetching job summary: {error_text}. Skipping final summary check."
@@ -833,19 +857,10 @@ async def main() -> None:
             "to apply to each MARC record before sending to FOLIO. Function should take "
             "a pymarc.Record object as input and return a pymarc.Record object."
         ),
-        default=None,
-    )
-    # Add mutually exclusive group for consolidate and split-files options
-    group = parser.add_mutually_exclusive_group()
-    group.add_argument(
-        "--consolidate",
-        action="store_true",
-        help=(
-            "Consolidate records into a single job. "
-            "Default is to create a new job for each MARC file."
-        ),
+        default="",
     )
-    group.add_argument(
+    parser.add_argument(
         "--split-files",
         action="store_true",
         help="Split files into smaller parts before importing.",
@@ -856,6 +871,12 @@ async def main() -> None:
         help="The number of records to include in each split file.",
         default=1000,
     )
+    parser.add_argument(
+        "--split-offset",
+        type=int,
+        help="The number of record batches of <split-size> to skip before starting import.",
+        default=0,
+    )
     parser.add_argument(
         "--no-progress",
@@ -867,6 +888,16 @@ async def main() -> None:
         action="store_true",
         help="Do not retry fetching the final job summary if it fails",
     )
+    parser.add_argument(
+        "--preprocessor-config",
+        type=str,
+        help=(
+            "JSON file containing configuration for preprocessor functions. "
+            "This is passed to MARCPreprocessor class as a dict of dicts."
+        ),
+        default=None,
+    )
     args = parser.parse_args()
     if not args.password:
         args.password = getpass("Enter FOLIO password: ")
@@ -891,6 +922,12 @@ async def main() -> None:
     else:
         logger.info(marc_files)
+    if args.preprocessor_config:
+        with open(args.preprocessor_config, "r") as f:
+            preprocessor_args = json.load(f)
+    else:
+        preprocessor_args = {}
     if not args.import_profile_name:
         import_profiles = folio_client.folio_get(
             "/data-import-profiles/jobProfiles",
@@ -919,11 +956,12 @@ async def main() -> None:
             batch_size=args.batch_size,
             batch_delay=args.batch_delay,
             marc_record_preprocessor=args.preprocessor,
-            consolidate=bool(args.consolidate),
+            preprocessor_args=preprocessor_args,
             no_progress=bool(args.no_progress),
             let_summary_fail=bool(args.let_summary_fail),
             split_files=bool(args.split_files),
             split_size=args.split_size,
+            split_offset=args.split_offset,
         ).do_work()
     except Exception as e:
         logger.error("Error importing files: " + str(e))