folio-data-import 0.2.7-py3-none-any.whl → 0.2.8.post1-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.

Note: this release of folio-data-import has been flagged as potentially problematic.

@@ -1,18 +1,21 @@
  import argparse
  import asyncio
+ import datetime
  import glob
  import importlib
  import io
+ import logging
+ import math
  import os
  import sys
- from typing import List
  import uuid
  from contextlib import ExitStack
- import datetime
  from datetime import datetime as dt
+ from functools import cached_property
  from getpass import getpass
  from pathlib import Path
  from time import sleep
+ from typing import List, Union

  import folioclient
  import httpx
@@ -22,7 +25,6 @@ import tabulate
  from humps import decamelize
  from tqdm import tqdm

-
  try:
  datetime_utc = datetime.UTC
  except AttributeError:
@@ -36,6 +38,18 @@ REPORT_SUMMARY_ORDERING = {"created": 0, "updated": 1, "discarded": 2, "error":
  RETRY_TIMEOUT_START = 1
  RETRY_TIMEOUT_RETRY_FACTOR = 2

+ # Custom log level for data issues, set to 26
+ DATA_ISSUE_LVL_NUM = 26
+ logging.addLevelName(DATA_ISSUE_LVL_NUM, "DATA_ISSUES")
+
+ def data_issues(self, msg, *args, **kws):
+ if self.isEnabledFor(DATA_ISSUE_LVL_NUM):
+ self._log(DATA_ISSUE_LVL_NUM, msg, args, **kws)
+
+ logging.Logger.data_issues = data_issues
+
+ logger = logging.getLogger(__name__)
+
  class MARCImportJob:
  """
  Class to manage importing MARC data (Bib, Authority) into FOLIO using the Change Manager
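
The new module-level block above registers a custom DATA_ISSUES log level (numeric value 26) and attaches a data_issues() helper to logging.Logger, so data-quality problems can later be routed to their own handler. As a rough, standalone sketch of that pattern (illustrative only, not code from the package):

    import logging

    DATA_ISSUE_LVL_NUM = 26
    logging.addLevelName(DATA_ISSUE_LVL_NUM, "DATA_ISSUES")

    def data_issues(self, msg, *args, **kwargs):
        # Emit only when the custom level is enabled for this logger.
        if self.isEnabledFor(DATA_ISSUE_LVL_NUM):
            self._log(DATA_ISSUE_LVL_NUM, msg, args, **kwargs)

    logging.Logger.data_issues = data_issues

    log = logging.getLogger("demo")
    log.setLevel(logging.INFO)
    log.addHandler(logging.StreamHandler())
    # Tab-separated fields, mirroring the RECORD FAILED messages emitted later in the diff.
    log.data_issues("RECORD FAILED\t%s\t%s", "example.mrc:12", "unreadable record")
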
@@ -56,7 +70,6 @@ class MARCImportJob:
  bad_records_file: io.TextIOWrapper
  failed_batches_file: io.TextIOWrapper
  job_id: str
- job_import_profile: dict
  pbar_sent: tqdm
  pbar_imported: tqdm
  http_client: httpx.Client
@@ -66,6 +79,11 @@ class MARCImportJob:
  last_current: int = 0
  total_records_sent: int = 0
  finished: bool = False
+ job_id: str = ""
+ job_hrid: int = 0
+ current_file: Union[List[Path],List[io.BytesIO]] = []
+ _max_summary_retries: int = 2
+ _summary_retries: int = 0

  def __init__(
  self,
@@ -77,9 +95,19 @@ class MARCImportJob:
  marc_record_preprocessor=None,
  consolidate=False,
  no_progress=False,
+ let_summary_fail=False,
+ split_files=False,
+ split_size=1000,
+ split_offset=0,
  ) -> None:
  self.consolidate_files = consolidate
+ self.split_files = split_files
+ self.split_size = split_size
+ self.split_offset = split_offset
+ if self.split_files and self.consolidate_files:
+ raise ValueError("Cannot consolidate and split files at the same time.")
  self.no_progress = no_progress
+ self.let_summary_fail = let_summary_fail
  self.folio_client: folioclient.FolioClient = folio_client
  self.import_files = marc_files
  self.import_profile_name = import_profile_name
@@ -93,38 +121,69 @@ class MARCImportJob:
  Performs the necessary work for data import.

  This method initializes an HTTP client, files to store records that fail to send,
- and calls `self.import_marc_records` to import MARC files. If `consolidate_files` is True,
- it imports all the files specified in `import_files` as a single batch. Otherwise,
- it imports each file as a separate import job.
+ and calls the appropriate method to import MARC files based on the configuration.

  Returns:
  None
  """
- with httpx.Client() as http_client, open(
- self.import_files[0].parent.joinpath(
- f"bad_marc_records_{dt.now(tz=datetime_utc).strftime('%Y%m%d%H%M%S')}.mrc"
- ),
- "wb+",
- ) as bad_marc_file, open(
- self.import_files[0].parent.joinpath(
- f"failed_batches_{dt.now(tz=datetime_utc).strftime('%Y%m%d%H%M%S')}.mrc"
- ),
- "wb+",
- ) as failed_batches:
+ with (
+ httpx.Client() as http_client,
+ open(
+ self.import_files[0].parent.joinpath(
+ f"bad_marc_records_{dt.now(tz=datetime_utc).strftime('%Y%m%d%H%M%S')}.mrc"
+ ),
+ "wb+",
+ ) as bad_marc_file,
+ open(
+ self.import_files[0].parent.joinpath(
+ f"failed_batches_{dt.now(tz=datetime_utc).strftime('%Y%m%d%H%M%S')}.mrc"
+ ),
+ "wb+",
+ ) as failed_batches,
+ ):
  self.bad_records_file = bad_marc_file
- print(f"Writing bad records to {self.bad_records_file.name}")
+ logger.info(f"Writing bad records to {self.bad_records_file.name}")
  self.failed_batches_file = failed_batches
- print(f"Writing failed batches to {self.failed_batches_file.name}")
+ logger.info(f"Writing failed batches to {self.failed_batches_file.name}")
  self.http_client = http_client
  if self.consolidate_files:
- self.current_file = self.import_files
- await self.import_marc_file()
+ await self.process_consolidated_import()
+ elif self.split_files:
+ await self.process_split_files()
  else:
  for file in self.import_files:
  self.current_file = [file]
  await self.import_marc_file()
  await self.wrap_up()

+ async def process_split_files(self):
+ """
+ Process the import of files in smaller batches.
+ This method is called when `split_files` is set to True.
+ It splits each file into smaller chunks and processes them one by one.
+ """
+ for file in self.import_files:
+ with open(file, "rb") as f:
+ file_length = await self.read_total_records([f])
+ expected_batches = math.ceil(file_length /self.split_size)
+ logger.info(f"{file.name} contains {file_length} records. Splitting into {expected_batches} {self.split_size} record batches.")
+ zero_pad_parts = len(str(expected_batches)) if expected_batches > 1 else 2
+ for idx, batch in enumerate(self.split_marc_file(file, self.split_size), start=1):
+ if idx > self.split_offset:
+ batch.name = f"{file.name} (Part {idx:0{zero_pad_parts}})"
+ self.current_file = [batch]
+ await self.import_marc_file()
+ self.move_file_to_complete(file)
+
+ async def process_consolidated_import(self):
+ """
+ Process the import of files as a single batch.
+ This method is called when `consolidate_files` is set to True.
+ It creates a single job for all files and processes them together.
+ """
+ self.current_file = self.import_files
+ await self.import_marc_file()
+
  async def wrap_up(self) -> None:
  """
  Wraps up the data import process.
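
One small detail in process_split_files() above: part numbers in the generated batch names are zero-padded to the width of the expected batch count via a nested format spec. For example (illustrative):

    width = 3  # len(str(expected_batches))
    for idx in (1, 2, 10):
        print(f"Part {idx:0{width}}")  # -> Part 001, Part 002, Part 010

The inner {width} is substituted first, so the format spec becomes "03".
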
@@ -135,16 +194,16 @@ class MARCImportJob:
  Returns:
  None
  """
- self.bad_records_file.seek(0)
- if not self.bad_records_file.read(1):
- os.remove(self.bad_records_file.name)
- print("No bad records found. Removing bad records file.")
- self.failed_batches_file.seek(0)
- if not self.failed_batches_file.read(1):
- os.remove(self.failed_batches_file.name)
- print("No failed batches. Removing failed batches file.")
- print("Import complete.")
- print(f"Total records imported: {self.total_records_sent}")
+ with open(self.bad_records_file.name, "rb") as bad_records:
+ if not bad_records.read(1):
+ os.remove(bad_records.name)
+ logger.info("No bad records found. Removing bad records file.")
+ with open(self.failed_batches_file.name, "rb") as failed_batches:
+ if not failed_batches.read(1):
+ os.remove(failed_batches.name)
+ logger.info("No failed batches. Removing failed batches file.")
+ logger.info("Import complete.")
+ logger.info(f"Total records imported: {self.total_records_sent}")

  async def get_job_status(self) -> None:
  """
@@ -158,38 +217,69 @@ class MARCImportJob:
  """
  try:
  self.current_retry_timeout = (
- self.current_retry_timeout * RETRY_TIMEOUT_RETRY_FACTOR
- ) if self.current_retry_timeout else RETRY_TIMEOUT_START
- job_status = self.folio_client.folio_get(
- "/metadata-provider/jobExecutions?statusNot=DISCARDED&uiStatusAny"
- "=PREPARING_FOR_PREVIEW&uiStatusAny=READY_FOR_PREVIEW&uiStatusAny=RUNNING&limit=50"
+ (self.current_retry_timeout * RETRY_TIMEOUT_RETRY_FACTOR)
+ if self.current_retry_timeout
+ else RETRY_TIMEOUT_START
  )
- self.current_retry_timeout = None
- except httpx.ConnectTimeout:
- sleep(.25)
  with httpx.Client(
  timeout=self.current_retry_timeout,
- verify=self.folio_client.ssl_verify
+ verify=self.folio_client.ssl_verify,
  ) as temp_client:
- self.folio_client.httpx_client = temp_client
- return await self.get_job_status()
+ job_status = self.folio_client.folio_get(
+ "/metadata-provider/jobExecutions?statusNot=DISCARDED&uiStatusAny"
+ "=PREPARING_FOR_PREVIEW&uiStatusAny=READY_FOR_PREVIEW&uiStatusAny=RUNNING&limit=50"
+ )
+ self.current_retry_timeout = None
+ except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
+ if not hasattr(e, "response") or e.response.status_code in [502, 504]:
+ error_text = e.response.text if hasattr(e, "response") else str(e)
+ logger.warning(f"SERVER ERROR fetching job status: {error_text}. Retrying.")
+ sleep(0.25)
+ with httpx.Client(
+ timeout=self.current_retry_timeout,
+ verify=self.folio_client.ssl_verify,
+ ) as temp_client:
+ self.folio_client.httpx_client = temp_client
+ return await self.get_job_status()
+ else:
+ raise e
+ except Exception as e:
+ logger.error(f"Error fetching job status. {e}")
+
  try:
  status = [
  job for job in job_status["jobExecutions"] if job["id"] == self.job_id
  ][0]
  self.pbar_imported.update(status["progress"]["current"] - self.last_current)
  self.last_current = status["progress"]["current"]
- except IndexError:
- job_status = self.folio_client.folio_get(
- "/metadata-provider/jobExecutions?limit=100&sortBy=completed_date%2Cdesc&statusAny"
- "=COMMITTED&statusAny=ERROR&statusAny=CANCELLED"
- )
- status = [
- job for job in job_status["jobExecutions"] if job["id"] == self.job_id
- ][0]
- self.pbar_imported.update(status["progress"]["current"] - self.last_current)
- self.last_current = status["progress"]["current"]
- self.finished = True
+ except (IndexError, ValueError, KeyError):
+ logger.debug(f"No active job found with ID {self.job_id}. Checking for finished job.")
+ try:
+ job_status = self.folio_client.folio_get(
+ "/metadata-provider/jobExecutions?limit=100&sortBy=completed_date%2Cdesc&statusAny"
+ "=COMMITTED&statusAny=ERROR&statusAny=CANCELLED"
+ )
+ status = [
+ job for job in job_status["jobExecutions"] if job["id"] == self.job_id
+ ][0]
+ self.pbar_imported.update(status["progress"]["current"] - self.last_current)
+ self.last_current = status["progress"]["current"]
+ self.finished = True
+ except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
+ if not hasattr(e, "response") or e.response.status_code in [502, 504]:
+ error_text = e.response.text if hasattr(e, "response") else str(e)
+ logger.warning(
+ f"SERVER ERROR fetching job status: {error_text}. Retrying."
+ )
+ sleep(0.25)
+ with httpx.Client(
+ timeout=self.current_retry_timeout,
+ verify=self.folio_client.ssl_verify,
+ ) as temp_client:
+ self.folio_client.httpx_client = temp_client
+ return await self.get_job_status()
+ else:
+ raise e

  async def create_folio_import_job(self) -> None:
  """
@@ -201,26 +291,36 @@ class MARCImportJob:
  Raises:
  HTTPError: If there is an error creating the job.
  """
- create_job = self.http_client.post(
- self.folio_client.okapi_url + "/change-manager/jobExecutions",
- headers=self.folio_client.okapi_headers,
- json={"sourceType": "ONLINE", "userId": self.folio_client.current_user},
- )
  try:
- create_job.raise_for_status()
- except httpx.HTTPError as e:
- print(
- "Error creating job: "
- + str(e)
- + "\n"
- + getattr(getattr(e, "response", ""), "text", "")
+ create_job = self.http_client.post(
+ self.folio_client.gateway_url + "/change-manager/jobExecutions",
+ headers=self.folio_client.okapi_headers,
+ json={"sourceType": "ONLINE", "userId": self.folio_client.current_user},
  )
- raise e
+ create_job.raise_for_status()
+ except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
+ if not hasattr(e, "response") or e.response.status_code in [502, 504]:
+ logger.warning(f"SERVER ERROR creating job: {e}. Retrying.")
+ sleep(0.25)
+ return await self.create_folio_import_job()
+ else:
+ logger.error(
+ "Error creating job: "
+ + str(e)
+ + "\n"
+ + getattr(getattr(e, "response", ""), "text", "")
+ )
+ raise e
  self.job_id = create_job.json()["parentJobExecutionId"]
+ logger.info(f"Created job: {self.job_id}")

- async def get_import_profile(self) -> None:
+ @cached_property
+ def import_profile(self) -> dict:
  """
- Retrieves the import profile with the specified name.
+ Returns the import profile for the current job execution.
+
+ Returns:
+ dict: The import profile for the current job execution.
  """
  import_profiles = self.folio_client.folio_get(
  "/data-import-profiles/jobProfiles",
@@ -232,7 +332,7 @@ class MARCImportJob:
  for profile in import_profiles
  if profile["name"] == self.import_profile_name
  ][0]
- self.job_import_profile = profile
+ return profile

  async def set_job_profile(self) -> None:
  """
@@ -242,21 +342,23 @@ class MARCImportJob:
  The response from the HTTP request to set the job profile.
  """
  set_job_profile = self.http_client.put(
- self.folio_client.okapi_url
+ self.folio_client.gateway_url
  + "/change-manager/jobExecutions/"
  + self.job_id
  + "/jobProfile",
  headers=self.folio_client.okapi_headers,
  json={
- "id": self.job_import_profile["id"],
- "name": self.job_import_profile["name"],
+ "id": self.import_profile["id"],
+ "name": self.import_profile["name"],
  "dataType": "MARC",
  },
  )
  try:
  set_job_profile.raise_for_status()
+ self.job_hrid = set_job_profile.json()['hrId']
+ logger.info(f"Job HRID: {self.job_hrid}")
  except httpx.HTTPError as e:
- print(
+ logger.error(
  "Error creating job: "
  + str(e)
  + "\n"
@@ -264,7 +366,8 @@ class MARCImportJob:
  )
  raise e

- async def read_total_records(self, files) -> int:
+ @staticmethod
+ async def read_total_records(files) -> int:
  """
  Reads the total number of records from the given files.

@@ -277,7 +380,7 @@ class MARCImportJob:
  total_records = 0
  for import_file in files:
  while True:
- chunk = import_file.read(1024)
+ chunk = import_file.read(104857600)
  if not chunk:
  break
  total_records += chunk.count(b"\x1d")
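
read_total_records() counts records by counting MARC21 record terminators (the 0x1D byte); the change above simply reads 100 MiB chunks (104857600 bytes) instead of 1 KiB, which greatly reduces the number of read calls on large files. The same idea as a small standalone function (illustrative):

    def count_marc_records(path: str, chunk_size: int = 100 * 1024 * 1024) -> int:
        # Each MARC21 record ends with a record terminator byte, 0x1D.
        total = 0
        with open(path, "rb") as fh:
            while chunk := fh.read(chunk_size):
                total += chunk.count(b"\x1d")
        return total
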
@@ -291,24 +394,41 @@ class MARCImportJob:
  Args:
  batch_payload (dict): A records payload containing the current batch of MARC records.
  """
- post_batch = self.http_client.post(
- self.folio_client.okapi_url
- + f"/change-manager/jobExecutions/{self.job_id}/records",
- headers=self.folio_client.okapi_headers,
- json=batch_payload,
- )
+ try:
+ post_batch = self.http_client.post(
+ self.folio_client.gateway_url
+ + f"/change-manager/jobExecutions/{self.job_id}/records",
+ headers=self.folio_client.okapi_headers,
+ json=batch_payload,
+ )
+ # if batch_payload["recordsMetadata"]["last"]:
+ # logger.log(
+ # 25,
+ # f"Sending last batch of {batch_payload['recordsMetadata']['total']} records.",
+ # )
+ except (httpx.ConnectTimeout, httpx.ReadTimeout):
+ sleep(0.25)
+ return await self.process_record_batch(batch_payload)
  try:
  post_batch.raise_for_status()
  self.total_records_sent += len(self.record_batch)
  self.record_batch = []
  self.pbar_sent.update(len(batch_payload["initialRecords"]))
  except Exception as e:
- print("Error posting batch: " + str(e))
- for record in self.record_batch:
- self.failed_batches_file.write(record)
- self.error_records += len(self.record_batch)
- self.pbar_sent.total = self.pbar_sent.total - len(self.record_batch)
- self.record_batch = []
+ if (
+ hasattr(e, "response") and e.response.status_code in [500, 422]
+ ): # TODO: #26 Check for specific error code once https://folio-org.atlassian.net/browse/MODSOURMAN-1281 is resolved
+ self.total_records_sent += len(self.record_batch)
+ self.record_batch = []
+ self.pbar_sent.update(len(batch_payload["initialRecords"]))
+ else:
+ logger.error("Error posting batch: " + str(e))
+ for record in self.record_batch:
+ self.failed_batches_file.write(record)
+ self.error_records += len(self.record_batch)
+ self.pbar_sent.total = self.pbar_sent.total - len(self.record_batch)
+ self.record_batch = []
+ await self.get_job_status()
  sleep(self.batch_delay)

  async def process_records(self, files, total_records) -> None:
@@ -325,16 +445,21 @@ class MARCImportJob:
  """
  counter = 0
  for import_file in files:
+ file_path = Path(import_file.name)
  self.pbar_sent.set_description(
  f"Sent ({os.path.basename(import_file.name)}): "
  )
  reader = pymarc.MARCReader(import_file, hide_utf8_warnings=True)
- for record in reader:
+ for idx, record in enumerate(reader, start=1):
  if len(self.record_batch) == self.batch_size:
  await self.process_record_batch(
- await self.create_batch_payload(counter, total_records, False),
+ await self.create_batch_payload(
+ counter,
+ total_records,
+ (counter - self.error_records)
+ == (total_records - self.error_records),
+ ),
  )
- await self.get_job_status()
  sleep(0.25)
  if record:
  if self.marc_record_preprocessor:
344
469
  self.record_batch.append(record.as_marc())
345
470
  counter += 1
346
471
  else:
472
+ logger.data_issues(
473
+ "RECORD FAILED\t%s\t%s\t%s",
474
+ f"{file_path.name}:{idx}",
475
+ f"Error reading {idx} record from {file_path}. Skipping. Writing current chunk to {self.bad_records_file.name}.",
476
+ "",
477
+ )
347
478
  self.bad_records_file.write(reader.current_chunk)
348
479
  if self.record_batch:
349
480
  await self.process_record_batch(
350
- await self.create_batch_payload(counter, total_records, True),
481
+ await self.create_batch_payload(
482
+ counter,
483
+ total_records,
484
+ (counter - self.error_records)
485
+ == (total_records - self.error_records),
486
+ ),
351
487
  )
488
+ if not self.split_files:
489
+ self.move_file_to_complete(file_path)
490
+
491
+ def move_file_to_complete(self, file_path):
492
+ import_complete_path = file_path.parent.joinpath("import_complete")
493
+ if not import_complete_path.exists():
494
+ logger.debug(f"Creating import_complete directory: {import_complete_path.absolute()}")
495
+ import_complete_path.mkdir(exist_ok=True)
496
+ logger.debug(f"Moving {file_path} to {import_complete_path.absolute()}")
497
+ file_path.rename(
498
+ file_path.parent.joinpath("import_complete", file_path.name)
499
+ )
352
500
 
353
501
  @staticmethod
354
- async def apply_marc_record_preprocessing(record: pymarc.Record, func_or_path) -> pymarc.Record:
502
+ async def apply_marc_record_preprocessing(
503
+ record: pymarc.Record, func_or_path
504
+ ) -> pymarc.Record:
355
505
  """
356
506
  Apply preprocessing to the MARC record before sending it to FOLIO.
357
507
 
@@ -363,25 +513,42 @@ class MARCImportJob:
363
513
  pymarc.Record: The preprocessed MARC record.
364
514
  """
365
515
  if isinstance(func_or_path, str):
366
- try:
367
- path_parts = func_or_path.rsplit('.')
368
- module_path, func_name = ".".join(path_parts[:-1]), path_parts[-1]
369
- module = importlib.import_module(module_path)
370
- func = getattr(module, func_name)
371
- except (ImportError, AttributeError) as e:
372
- print(f"Error importing preprocessing function {func_or_path}: {e}. Skipping preprocessing.")
373
- return record
516
+ func_paths = func_or_path.split(",")
517
+ for func_path in func_paths:
518
+ record = await MARCImportJob._apply_single_marc_record_preprocessing_by_path(
519
+ record, func_path
520
+ )
374
521
  elif callable(func_or_path):
375
- func = func_or_path
522
+ record = func_or_path(record)
376
523
  else:
377
- print(f"Invalid preprocessing function: {func_or_path}. Skipping preprocessing.")
378
- return record
524
+ logger.warning(
525
+ f"Invalid preprocessing function: {func_or_path}. Skipping preprocessing."
526
+ )
527
+ return record
528
+
529
+ async def _apply_single_marc_record_preprocessing_by_path(
530
+ record: pymarc.Record, func_path: str
531
+ ) -> pymarc.Record:
532
+ """
533
+ Apply a single preprocessing function to the MARC record.
534
+
535
+ Args:
536
+ record (pymarc.Record): The MARC record to preprocess.
537
+ func_path (str): The path to the preprocessing function.
379
538
 
539
+ Returns:
540
+ pymarc.Record: The preprocessed MARC record.
541
+ """
380
542
  try:
381
- return func(record)
543
+ module_path, func_name = func_path.rsplit(".", 1)
544
+ module = importlib.import_module(module_path)
545
+ func = getattr(module, func_name)
546
+ record = func(record)
382
547
  except Exception as e:
383
- print(f"Error applying preprocessing function: {e}. Skipping preprocessing.")
384
- return record
548
+ logger.warning(
549
+ f"Error applying preprocessing function {func_path}: {e}. Skipping."
550
+ )
551
+ return record
385
552
 
386
553
  async def create_batch_payload(self, counter, total_records, is_last) -> dict:
387
554
  """
@@ -406,6 +573,46 @@ class MARCImportJob:
406
573
  "initialRecords": [{"record": x.decode()} for x in self.record_batch],
407
574
  }
408
575
 
576
+ @staticmethod
577
+ def split_marc_file(file_path, batch_size):
578
+ """Generator to iterate over MARC records in batches, yielding BytesIO objects."""
579
+ with open(file_path, "rb") as f:
580
+ batch = io.BytesIO()
581
+ count = 0
582
+
583
+ while True:
584
+ leader = f.read(24)
585
+ if not leader:
586
+ break # End of file
587
+
588
+ try:
589
+ record_length = int(leader[:5]) # Extract record length from leader
590
+ except ValueError:
591
+ raise ValueError("Invalid MARC record length encountered.")
592
+
593
+ record_body = f.read(record_length - 24)
594
+ if len(record_body) != record_length - 24:
595
+ raise ValueError("Unexpected end of file while reading MARC record.")
596
+
597
+ # Verify record terminator
598
+ if record_body[-1:] != b'\x1D':
599
+ raise ValueError("MARC record does not end with the expected terminator (0x1D).")
600
+
601
+ # Write the full record to the batch buffer
602
+ batch.write(leader + record_body)
603
+ count += 1
604
+
605
+ if count >= batch_size:
606
+ batch.seek(0)
607
+ yield batch
608
+ batch = io.BytesIO() # Reset buffer
609
+ count = 0
610
+
611
+ # Yield any remaining records
612
+ if count > 0:
613
+ batch.seek(0)
614
+ yield batch
615
+
409
616
  async def import_marc_file(self) -> None:
410
617
  """
411
618
  Imports MARC file into the system.
@@ -425,24 +632,37 @@ class MARCImportJob:
425
632
  None
426
633
  """
427
634
  await self.create_folio_import_job()
428
- await self.get_import_profile()
429
635
  await self.set_job_profile()
430
636
  with ExitStack() as stack:
431
- files = [
432
- stack.enter_context(open(file, "rb")) for file in self.current_file
433
- ]
637
+ try:
638
+ if isinstance(self.current_file[0], Path):
639
+ files = [
640
+ stack.enter_context(open(file, "rb")) for file in self.current_file
641
+ ]
642
+ elif isinstance(self.current_file[0], io.BytesIO):
643
+ files = [
644
+ stack.enter_context(file) for file in self.current_file
645
+ ]
646
+ else:
647
+ raise ValueError("Invalid file type. Must be Path or BytesIO.")
648
+ except IndexError as e:
649
+ logger.error(f"Error opening file: {e}")
650
+ raise e
434
651
  total_records = await self.read_total_records(files)
435
- with tqdm(
436
- desc="Imported: ",
437
- total=total_records,
438
- position=1,
439
- disable=self.no_progress,
440
- ) as pbar_imported, tqdm(
441
- desc="Sent: ()",
442
- total=total_records,
443
- position=0,
444
- disable=self.no_progress,
445
- ) as pbar_sent:
652
+ with (
653
+ tqdm(
654
+ desc=f"Imported ({self.job_hrid}): ",
655
+ total=total_records,
656
+ position=1,
657
+ disable=self.no_progress,
658
+ ) as pbar_imported,
659
+ tqdm(
660
+ desc="Sent: ()",
661
+ total=total_records,
662
+ position=0,
663
+ disable=self.no_progress,
664
+ ) as pbar_sent,
665
+ ):
446
666
  self.pbar_sent = pbar_sent
447
667
  self.pbar_imported = pbar_imported
448
668
  await self.process_records(files, total_records)
@@ -450,37 +670,45 @@ class MARCImportJob:
450
670
  await self.get_job_status()
451
671
  sleep(1)
452
672
  if self.finished:
453
- job_summary = await self.get_job_summary()
454
- job_summary.pop("jobExecutionId")
455
- job_summary.pop("totalErrors")
456
- columns = ["Summary"] + list(job_summary.keys())
457
- rows = set()
458
- for key in columns[1:]:
459
- rows.update(job_summary[key].keys())
460
-
461
- table_data = []
462
- for row in rows:
463
- metric_name = decamelize(row).split("_")[1]
464
- table_row = [metric_name]
465
- for col in columns[1:]:
466
- table_row.append(job_summary[col].get(row, "N/A"))
467
- table_data.append(table_row)
468
- table_data.sort(key=lambda x: REPORT_SUMMARY_ORDERING.get(x[0], 99))
469
- columns = columns[:1] + [
470
- " ".join(decamelize(x).split("_")[:-1]) for x in columns[1:]
471
- ]
472
- print(
473
- f"Results for {'file' if len(self.current_file) == 1 else 'files'}: "
474
- f"{', '.join([os.path.basename(x.name) for x in self.current_file])}"
475
- )
476
- print(
477
- tabulate.tabulate(
478
- table_data, headers=columns, tablefmt="fancy_grid"
479
- ),
480
- )
673
+ await self.log_job_summary()
481
674
  self.last_current = 0
482
675
  self.finished = False
483
676
 
677
+ async def log_job_summary(self):
678
+ if job_summary := await self.get_job_summary():
679
+ job_id = job_summary.pop("jobExecutionId", None)
680
+ total_errors = job_summary.pop("totalErrors", 0)
681
+ columns = ["Summary"] + list(job_summary.keys())
682
+ rows = set()
683
+ for key in columns[1:]:
684
+ rows.update(job_summary[key].keys())
685
+
686
+ table_data = []
687
+ for row in rows:
688
+ metric_name = decamelize(row).split("_")[1]
689
+ table_row = [metric_name]
690
+ for col in columns[1:]:
691
+ table_row.append(job_summary[col].get(row, "N/A"))
692
+ table_data.append(table_row)
693
+ table_data.sort(key=lambda x: REPORT_SUMMARY_ORDERING.get(x[0], 99))
694
+ columns = columns[:1] + [
695
+ " ".join(decamelize(x).split("_")[:-1]) for x in columns[1:]
696
+ ]
697
+ logger.info(
698
+ f"Results for {'file' if len(self.current_file) == 1 else 'files'}: "
699
+ f"{', '.join([os.path.basename(x.name) for x in self.current_file])}"
700
+ )
701
+ logger.info(
702
+ "\n"
703
+ + tabulate.tabulate(
704
+ table_data, headers=columns, tablefmt="fancy_grid"
705
+ ),
706
+ )
707
+ if total_errors:
708
+ logger.info(f"Total errors: {total_errors}. Job ID: {job_id}.")
709
+ else:
710
+ logger.error(f"No job summary available for job #{self.job_hrid}({self.job_id}).")
711
+
484
712
  async def get_job_summary(self) -> dict:
485
713
  """
486
714
  Retrieves the job summary for the current job execution.
@@ -490,23 +718,88 @@ class MARCImportJob:
490
718
  """
491
719
  try:
492
720
  self.current_retry_timeout = (
493
- self.current_retry_timeout * RETRY_TIMEOUT_RETRY_FACTOR
494
- ) if self.current_retry_timeout else RETRY_TIMEOUT_START
495
- job_summary = self.folio_client.folio_get(
496
- f"/metadata-provider/jobSummary/{self.job_id}"
721
+ (self.current_retry_timeout * RETRY_TIMEOUT_RETRY_FACTOR)
722
+ if self.current_retry_timeout
723
+ else RETRY_TIMEOUT_START
497
724
  )
498
- self.current_retry_timeout = None
499
- except httpx.ReadTimeout: #
500
- sleep(.25)
501
725
  with httpx.Client(
502
- timeout=self.current_retry_timeout,
503
- verify=self.folio_client.ssl_verify
726
+ timeout=self.current_retry_timeout, verify=self.folio_client.ssl_verify
504
727
  ) as temp_client:
505
728
  self.folio_client.httpx_client = temp_client
506
- return await self.get_job_summary()
729
+ job_summary = self.folio_client.folio_get(
730
+ f"/metadata-provider/jobSummary/{self.job_id}"
731
+ )
732
+ self.current_retry_timeout = None
733
+ except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
734
+ error_text = e.response.text if hasattr(e, "response") else str(e)
735
+ if (self._max_summary_retries > self._summary_retries) and (not hasattr(e, "response") or (
736
+ hasattr(e, "response") and e.response.status_code in [502, 504]) and not self.let_summary_fail
737
+ ):
738
+ logger.warning(f"SERVER ERROR fetching job summary: {e}. Retrying.")
739
+ sleep(0.25)
740
+ with httpx.Client(
741
+ timeout=self.current_retry_timeout,
742
+ verify=self.folio_client.ssl_verify,
743
+ ) as temp_client:
744
+ self.folio_client.httpx_client = temp_client
745
+ self._summary_retries += 1
746
+ return await self.get_job_summary()
747
+ elif (self._summary_retries >= self._max_summary_retries) or (hasattr(e, "response") and (
748
+ e.response.status_code in [502, 504] and self.let_summary_fail)
749
+ ):
750
+ logger.warning(
751
+ f"SERVER ERROR fetching job summary: {error_text}. Skipping final summary check."
752
+ )
753
+ job_summary = {}
754
+ else:
755
+ raise e
507
756
  return job_summary
508
757
 
509
758
 
759
+ def set_up_cli_logging():
760
+ """
761
+ This function sets up logging for the CLI.
762
+ """
763
+ logger.setLevel(logging.INFO)
764
+ logger.propagate = False
765
+
766
+ # Set up file and stream handlers
767
+ file_handler = logging.FileHandler(
768
+ "folio_data_import_{}.log".format(dt.now().strftime("%Y%m%d%H%M%S"))
769
+ )
770
+ file_handler.setLevel(logging.INFO)
771
+ file_handler.addFilter(ExcludeLevelFilter(DATA_ISSUE_LVL_NUM))
772
+ # file_handler.addFilter(IncludeLevelFilter(25))
773
+ file_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
774
+ file_handler.setFormatter(file_formatter)
775
+ logger.addHandler(file_handler)
776
+
777
+ if not any(
778
+ isinstance(h, logging.StreamHandler) and h.stream == sys.stderr
779
+ for h in logger.handlers
780
+ ):
781
+ stream_handler = logging.StreamHandler(sys.stdout)
782
+ stream_handler.setLevel(logging.INFO)
783
+ stream_handler.addFilter(ExcludeLevelFilter(DATA_ISSUE_LVL_NUM))
784
+ # stream_handler.addFilter(ExcludeLevelFilter(25))
785
+ stream_formatter = logging.Formatter("%(message)s")
786
+ stream_handler.setFormatter(stream_formatter)
787
+ logger.addHandler(stream_handler)
788
+
789
+ # Set up data issues logging
790
+ data_issues_handler = logging.FileHandler(
791
+ "marc_import_data_issues_{}.log".format(dt.now().strftime("%Y%m%d%H%M%S"))
792
+ )
793
+ data_issues_handler.setLevel(26)
794
+ data_issues_handler.addFilter(IncludeLevelFilter(DATA_ISSUE_LVL_NUM))
795
+ data_issues_formatter = logging.Formatter("%(message)s")
796
+ data_issues_handler.setFormatter(data_issues_formatter)
797
+ logger.addHandler(data_issues_handler)
798
+
799
+ # Stop httpx from logging info messages to the console
800
+ logging.getLogger("httpx").setLevel(logging.WARNING)
801
+
802
+
510
803
  async def main() -> None:
511
804
  """
512
805
  Main function to run the MARC import job.
@@ -514,6 +807,7 @@ async def main() -> None:
514
807
  This function parses command line arguments, initializes the FolioClient,
515
808
  and runs the MARCImportJob.
516
809
  """
810
+ set_up_cli_logging()
517
811
  parser = argparse.ArgumentParser()
518
812
  parser.add_argument("--gateway_url", type=str, help="The FOLIO API Gateway URL")
519
813
  parser.add_argument("--tenant_id", type=str, help="The FOLIO tenant ID")
@@ -552,12 +846,15 @@ async def main() -> None:
552
846
  "--preprocessor",
553
847
  type=str,
554
848
  help=(
555
- "The path to a Python module containing a preprocessing function "
556
- "to apply to each MARC record before sending to FOLIO."
849
+ "Comma-separated python import paths to Python function(s) "
850
+ "to apply to each MARC record before sending to FOLIO. Function should take "
851
+ "a pymarc.Record object as input and return a pymarc.Record object."
557
852
  ),
558
853
  default=None,
559
854
  )
560
- parser.add_argument(
855
+ # Add mutually exclusive group for consolidate and split-files options
856
+ group = parser.add_mutually_exclusive_group()
857
+ group.add_argument(
561
858
  "--consolidate",
562
859
  action="store_true",
563
860
  help=(
@@ -565,11 +862,34 @@ async def main() -> None:
565
862
  "Default is to create a new job for each MARC file."
566
863
  ),
567
864
  )
865
+ group.add_argument(
866
+ "--split-files",
867
+ action="store_true",
868
+ help="Split files into smaller parts before importing.",
869
+ )
870
+ parser.add_argument(
871
+ "--split-size",
872
+ type=int,
873
+ help="The number of records to include in each split file.",
874
+ default=1000,
875
+ )
876
+ parser.add_argument(
877
+ "--split-offset",
878
+ type=int,
879
+ help="The number of record batches of <split-size> to skip before starting import.",
880
+ default=0,
881
+ )
882
+
568
883
  parser.add_argument(
569
884
  "--no-progress",
570
885
  action="store_true",
571
886
  help="Disable progress bars (eg. for running in a CI environment)",
572
887
  )
888
+ parser.add_argument(
889
+ "--let-summary-fail",
890
+ action="store_true",
891
+ help="Do not retry fetching the final job summary if it fails",
892
+ )
573
893
  args = parser.parse_args()
574
894
  if not args.password:
575
895
  args.password = getpass("Enter FOLIO password: ")
@@ -586,11 +906,13 @@ async def main() -> None:
586
906
  else:
587
907
  marc_files = list(Path("./").glob(args.marc_file_path))
588
908
 
909
+ marc_files.sort()
910
+
589
911
  if len(marc_files) == 0:
590
- print(f"No files found matching {args.marc_file_path}. Exiting.")
912
+ logger.critical(f"No files found matching {args.marc_file_path}. Exiting.")
591
913
  sys.exit(1)
592
914
  else:
593
- print(marc_files)
915
+ logger.info(marc_files)
594
916
 
595
917
  if not args.import_profile_name:
596
918
  import_profiles = folio_client.folio_get(
@@ -622,12 +944,34 @@ async def main() -> None:
622
944
  marc_record_preprocessor=args.preprocessor,
623
945
  consolidate=bool(args.consolidate),
624
946
  no_progress=bool(args.no_progress),
947
+ let_summary_fail=bool(args.let_summary_fail),
948
+ split_files=bool(args.split_files),
949
+ split_size=args.split_size,
950
+ split_offset=args.split_offset,
625
951
  ).do_work()
626
952
  except Exception as e:
627
- print("Error importing files: " + str(e))
953
+ logger.error("Error importing files: " + str(e))
628
954
  raise
629
955
 
630
956
 
957
+ class ExcludeLevelFilter(logging.Filter):
958
+ def __init__(self, level):
959
+ super().__init__()
960
+ self.level = level
961
+
962
+ def filter(self, record):
963
+ return record.levelno != self.level
964
+
965
+
966
+ class IncludeLevelFilter(logging.Filter):
967
+ def __init__(self, level):
968
+ super().__init__()
969
+ self.level = level
970
+
971
+ def filter(self, record):
972
+ return record.levelno == self.level
973
+
974
+
631
975
  def sync_main() -> None:
632
976
  """
633
977
  Synchronous main function to run the MARC import job.