folio-data-import 0.3.2__py3-none-any.whl → 0.4.1__py3-none-any.whl
Potentially problematic release: this version of folio-data-import might be problematic.
- folio_data_import/MARCDataImport.py +295 -159
- folio_data_import/UserImport.py +386 -255
- folio_data_import/__main__.py +7 -110
- folio_data_import/_progress.py +27 -0
- folio_data_import/marc_preprocessors/_preprocessors.py +12 -8
- {folio_data_import-0.3.2.dist-info → folio_data_import-0.4.1.dist-info}/METADATA +58 -7
- folio_data_import-0.4.1.dist-info/RECORD +13 -0
- {folio_data_import-0.3.2.dist-info → folio_data_import-0.4.1.dist-info}/WHEEL +1 -1
- folio_data_import-0.4.1.dist-info/entry_points.txt +5 -0
- folio_data_import-0.3.2.dist-info/RECORD +0 -12
- folio_data_import-0.3.2.dist-info/entry_points.txt +0 -5
- {folio_data_import-0.3.2.dist-info → folio_data_import-0.4.1.dist-info/licenses}/LICENSE +0 -0
@@ -1,4 +1,4 @@
-import argparse
+import typer
 import asyncio
 import datetime
 import glob

@@ -12,10 +12,10 @@ import uuid
 from contextlib import ExitStack
 from datetime import datetime as dt
 from functools import cached_property
-from getpass import getpass
 from pathlib import Path
 from time import sleep
 from typing import BinaryIO, Callable, Dict, List, Union
+from typing_extensions import Annotated
 
 import folioclient
 import httpx

@@ -23,10 +23,21 @@ import inquirer
 import pymarc
 import tabulate
 from humps import decamelize
-from …
-…
-…
+from rich.progress import (
+    Progress,
+    TimeElapsedColumn,
+    BarColumn,
+    TimeRemainingColumn,
+    SpinnerColumn,
+    MofNCompleteColumn,
+)
+from rich.logging import RichHandler
+from folio_data_import.custom_exceptions import (
+    FolioDataImportBatchError,
+    FolioDataImportJobError,
+)
 from folio_data_import.marc_preprocessors._preprocessors import MARCPreprocessor
+from folio_data_import._progress import ItemsPerSecondColumn
 
 try:
     datetime_utc = datetime.UTC

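The progress handling moves to rich. `ItemsPerSecondColumn` is imported from the new `folio_data_import/_progress.py` module (+27 lines in this release), whose body is not shown in this diff; a custom throughput column for rich would typically look like the following sketch (an assumption about the implementation, not the package's actual code):

    from rich.progress import ProgressColumn, Task
    from rich.text import Text


    class ItemsPerSecondColumn(ProgressColumn):
        """Render a task's current throughput, e.g. '12.3 rec/s'."""

        def render(self, task: Task) -> Text:
            # task.speed is rich's rolling estimate in steps/second; it stays
            # None until enough samples have been collected.
            speed = task.finished_speed or task.speed
            if speed is None:
                return Text("-- rec/s", style="progress.data.speed")
            return Text(f"{speed:.1f} rec/s", style="progress.data.speed")
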
@@ -71,20 +82,32 @@ class MARCImportJob:
         batch_size (int): The number of source records to include in a record batch (default=10).
         batch_delay (float): The number of seconds to wait between record batches (default=0).
         no_progress (bool): Disable progress bars (eg. for running in a CI environment).
+        marc_record_preprocessor (list or str): A list of callables or a string representing
+            the MARC record preprocessor(s) to apply to each record before import.
+        preprocessor_args (dict): A dictionary of arguments to pass to the MARC record preprocessor(s).
+        let_summary_fail (bool): If True, will not retry or fail the import if the final job summary
+            cannot be retrieved (default=False).
+        split_files (bool): If True, will split each file into smaller jobs of size `split_size`
+        split_size (int): The number of records to include in each split file (default=1000).
+        split_offset (int): The number of split files to skip before starting processing (default=0).
+        job_ids_file_path (str): The path to the file where job IDs will be saved (default="marc_import_job_ids.txt").
+        show_file_names_in_data_import_logs (bool): If True, will set the file name for each job in the data import logs.
     """
 
     bad_records_file: io.TextIOWrapper
     failed_batches_file: io.TextIOWrapper
     job_id: str
-    …
-    …
+    progress: Progress
+    pbar_sent: int
+    pbar_imported: int
     http_client: httpx.Client
     current_file: List[Path]
-    record_batch: List[dict]
+    record_batch: List[dict]
     last_current: int = 0
     total_records_sent: int = 0
     finished: bool = False
     job_id: str = ""
+    job_ids: List[str]
     job_hrid: int = 0
     current_file: Union[List[Path], List[io.BytesIO]] = []
     _max_summary_retries: int = 2

@@ -99,13 +122,15 @@ class MARCImportJob:
         import_profile_name: str,
         batch_size=10,
         batch_delay=0,
-        marc_record_preprocessor: Union[List[Callable], str] = …,
-        preprocessor_args: Dict[str, Dict] = …,
+        marc_record_preprocessor: Union[List[Callable], str] = None,
+        preprocessor_args: Dict[str, Dict] = None,
         no_progress=False,
         let_summary_fail=False,
         split_files=False,
         split_size=1000,
         split_offset=0,
+        job_ids_file_path: str = "",
+        show_file_names_in_data_import_logs: bool = False,
     ) -> None:
         self.split_files = split_files
         self.split_size = split_size

@@ -121,6 +146,10 @@ class MARCImportJob:
         self.marc_record_preprocessor: MARCPreprocessor = MARCPreprocessor(
             marc_record_preprocessor, **preprocessor_args
         )
+        self.job_ids_file_path = job_ids_file_path or self.import_files[
+            0
+        ].parent.joinpath("marc_import_job_ids.txt")
+        self.show_file_names_in_data_import_logs = show_file_names_in_data_import_logs
 
     async def do_work(self) -> None:
         """

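Note the new default: when no `job_ids_file_path` is given, the job-IDs file is created next to the first input file. For example, assuming an input of `/data/loads/batch1.mrc` (an illustrative path):

    from pathlib import Path

    Path("/data/loads/batch1.mrc").parent.joinpath("marc_import_job_ids.txt")
    # -> PosixPath('/data/loads/marc_import_job_ids.txt') on POSIX systems
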
@@ -132,6 +161,8 @@ class MARCImportJob:
         Returns:
             None
         """
+        self.record_batch = []
+        self.job_ids = []
         with (
             httpx.Client() as http_client,
             open(

@@ -158,7 +189,6 @@ class MARCImportJob:
             for file in self.import_files:
                 self.current_file = [file]
                 await self.import_marc_file()
-        await self.wrap_up()
 
     async def process_split_files(self):
         """

@@ -201,6 +231,10 @@ class MARCImportJob:
             if not failed_batches.read(1):
                 os.remove(failed_batches.name)
                 logger.info("No failed batches. Removing failed batches file.")
+        with open(self.job_ids_file_path, "a+") as job_ids_file:
+            logger.info(f"Writing job IDs to {self.job_ids_file_path}")
+            for job_id in self.job_ids:
+                job_ids_file.write(f"{job_id}\n")
         logger.info("Import complete.")
         logger.info(f"Total records imported: {self.total_records_sent}")
 

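Since every job execution created during the run is collected in `self.job_ids`, wrap-up now appends them to the job-IDs file, one per line (mode `a+` preserves IDs from earlier runs). Reading them back for later processing is then trivial (illustrative):

    from pathlib import Path

    job_ids = Path("marc_import_job_ids.txt").read_text().splitlines()
    # e.g. poll GET /change-manager/jobExecutions/{job_id} for each saved ID
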
@@ -256,7 +290,10 @@ class MARCImportJob:
             status = [
                 job for job in job_status["jobExecutions"] if job["id"] == self.job_id
             ][0]
-            self.…
+            self.progress.update(
+                self.pbar_imported,
+                advance=status["progress"]["current"] - self.last_current,
+            )
             self.last_current = status["progress"]["current"]
         except (IndexError, ValueError, KeyError):
             logger.debug(

@@ -272,8 +309,9 @@ class MARCImportJob:
                 for job in job_status["jobExecutions"]
                 if job["id"] == self.job_id
             ][0]
-            self.…
-            …
+            self.progress.update(
+                self.pbar_imported,
+                advance=status["progress"]["current"] - self.last_current,
             )
             self.last_current = status["progress"]["current"]
             self.finished = True

@@ -297,6 +335,40 @@ class MARCImportJob:
             else:
                 raise e
 
+    async def set_job_file_name(self) -> None:
+        """
+        Sets the file name for the current job execution.
+
+        Returns:
+            None
+        """
+        try:
+            job_object = self.http_client.get(
+                self.folio_client.gateway_url
+                + "/change-manager/jobExecutions/"
+                + self.job_id,
+                headers=self.folio_client.okapi_headers,
+            )
+            job_object.raise_for_status()
+            job_object_json = job_object.json()
+            job_object_json.update({"fileName": self.current_file[0].name})
+            set_file_name = self.http_client.put(
+                self.folio_client.gateway_url
+                + "/change-manager/jobExecutions/"
+                + self.job_id,
+                headers=self.folio_client.okapi_headers,
+                json=job_object_json,
+            )
+            set_file_name.raise_for_status()
+        except httpx.HTTPError as e:
+            logger.error(
+                "Error setting job file name: "
+                + str(e)
+                + "\n"
+                + getattr(getattr(e, "response", ""), "text", "")
+            )
+            raise e
+
     async def create_folio_import_job(self) -> None:
         """
         Creates a job execution for importing data into FOLIO.

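The new `set_job_file_name()` is a read-modify-write round-trip against the change-manager API: GET the job execution, set `fileName`, PUT the whole object back. Extracted from the class, the same pattern looks like this (a sketch; `gateway_url`, `headers`, and `job_id` stand in for the values the class takes from its FolioClient):

    import httpx

    def set_file_name(gateway_url: str, headers: dict, job_id: str, file_name: str) -> None:
        url = f"{gateway_url}/change-manager/jobExecutions/{job_id}"
        with httpx.Client() as client:
            job = client.get(url, headers=headers)
            job.raise_for_status()
            body = job.json()
            body.update({"fileName": file_name})  # change only the one key
            client.put(url, headers=headers, json=body).raise_for_status()
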
@@ -328,6 +400,9 @@ class MARCImportJob:
             )
             raise e
         self.job_id = create_job.json()["parentJobExecutionId"]
+        if self.show_file_names_in_data_import_logs:
+            await self.set_job_file_name()
+        self.job_ids.append(self.job_id)
         logger.info(f"Created job: {self.job_id}")
 
     @cached_property

@@ -357,6 +432,9 @@ class MARCImportJob:
         Returns:
             The response from the HTTP request to set the job profile.
         """
+        logger.info(
+            f"Setting job profile: {self.import_profile['name']} ({self.import_profile['id']}) for job {self.job_id}"
+        )
         set_job_profile = self.http_client.put(
             self.folio_client.gateway_url
             + "/change-manager/jobExecutions/"

@@ -427,14 +505,18 @@ class MARCImportJob:
             post_batch.raise_for_status()
             self.total_records_sent += len(self.record_batch)
             self.record_batch = []
-            self.…
+            self.progress.update(
+                self.pbar_sent, advance=len(batch_payload["initialRecords"])
+            )
         except httpx.HTTPStatusError as e:
             if (
                 e.response.status_code in [500, 400, 422]
             ):  # TODO: Update once we no longer have to support < Sunflower to just be 400
                 self.total_records_sent += len(self.record_batch)
                 self.record_batch = []
-                self.…
+                self.progress.update(
+                    self.pbar_sent, advance=len(batch_payload["initialRecords"])
+                )
             else:
                 for record in self.record_batch:
                     self.failed_batches_file.write(record)

@@ -459,8 +541,9 @@ class MARCImportJob:
         counter = 0
         for import_file in files:
             file_path = Path(import_file.name)
-            self.…
-            …
+            self.progress.update(
+                self.pbar_sent,
+                description=f"Sent ({os.path.basename(import_file.name)}): ",
             )
             reader = pymarc.MARCReader(import_file, hide_utf8_warnings=True)
             for idx, record in enumerate(reader, start=1):

@@ -609,22 +692,30 @@ class MARCImportJob:
                 raise e
         total_records = await self.read_total_records(files)
         with (
-            … (12 lines)
+            Progress(
+                "{task.description}",
+                SpinnerColumn(),
+                BarColumn(),
+                MofNCompleteColumn(),
+                "[",
+                TimeElapsedColumn(),
+                "<",
+                TimeRemainingColumn(),
+                "/",
+                ItemsPerSecondColumn(),
+                "]",
+            ) as import_progress,
         ):
+            self.progress = import_progress
             try:
-                self.pbar_sent = …
-                …
+                self.pbar_sent = self.progress.add_task(
+                    "Sent: ", total=total_records, visible=not self.no_progress
+                )
+                self.pbar_imported = self.progress.add_task(
+                    f"Imported: ({self.job_hrid})",
+                    total=total_records,
+                    visible=not self.no_progress,
+                )
                 await self.process_records(files, total_records)
                 while not self.finished:
                     await self.get_job_status()

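This hunk is the heart of the rich migration: one `Progress` live display owns two tasks, `pbar_sent` advanced as record batches are posted and `pbar_imported` advanced by the delta reported in the job-execution status. Stripped of the FOLIO specifics, the two-task pattern works like this (illustrative totals and timing):

    import time

    from rich.progress import BarColumn, MofNCompleteColumn, Progress, SpinnerColumn

    with Progress(
        "{task.description}", SpinnerColumn(), BarColumn(), MofNCompleteColumn()
    ) as progress:
        sent = progress.add_task("Sent: ", total=100)  # add_task returns a TaskID
        imported = progress.add_task("Imported: ", total=100)
        for _ in range(10):
            progress.update(sent, advance=10)      # records posted to the server
            time.sleep(0.05)                       # simulated server-side processing
            progress.update(imported, advance=10)  # delta confirmed by status polling
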
@@ -785,7 +876,12 @@ def set_up_cli_logging():
         isinstance(h, logging.StreamHandler) and h.stream == sys.stderr
         for h in logger.handlers
     ):
-        stream_handler = …
+        stream_handler = RichHandler(
+            show_level=False,
+            show_time=False,
+            omit_repeated_times=False,
+            show_path=False,
+        )
         stream_handler.setLevel(logging.INFO)
         stream_handler.addFilter(ExcludeLevelFilter(DATA_ISSUE_LVL_NUM))
         # stream_handler.addFilter(ExcludeLevelFilter(25))

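Console logging switches from a plain `StreamHandler` to rich's `RichHandler`, which renders log lines through rich's console machinery. A minimal stand-alone version of this wiring (using the same handler options as the hunk above):

    import logging

    from rich.logging import RichHandler

    handler = RichHandler(show_level=False, show_time=False, show_path=False)
    handler.setLevel(logging.INFO)
    logging.basicConfig(level=logging.INFO, format="%(message)s", handlers=[handler])
    logging.getLogger(__name__).info("Rendered by rich instead of a bare StreamHandler")
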
@@ -807,165 +903,208 @@ def set_up_cli_logging():
     logging.getLogger("httpx").setLevel(logging.WARNING)
 
 
-…
-    """
-    Main function to run the MARC import job.
+app = typer.Typer()
 
-    … (29 lines)
+
+@app.command()
+def main(
+    gateway_url: Annotated[
+        str,
+        typer.Option(
+            prompt="Please enter the FOLIO API Gateway URL",
+            help="The FOLIO API Gateway URL",
+            envvar="FOLIO_GATEWAY_URL",
+        ),
+    ],
+    tenant_id: Annotated[
+        str,
+        typer.Option(
+            prompt="Please enter the FOLIO tenant id",
+            help="The tenant id",
+            envvar="FOLIO_TENANT_ID",
+        ),
+    ],
+    username: Annotated[
+        str,
+        typer.Option(
+            prompt="Please enter your FOLIO username",
+            help="The FOLIO username",
+            envvar="FOLIO_USERNAME",
+        ),
+    ],
+    password: Annotated[
+        str,
+        typer.Option(
+            prompt="Please enter your FOLIO Password",
+            hide_input=True,
+            help="The FOLIO password",
+            envvar="FOLIO_PASSWORD",
+        ),
+    ],
+    marc_file_path: str = typer.Option(
+        ..., help="The MARC file (or file glob, using shell globbing syntax) to import"
+    ),
+    member_tenant_id: Annotated[
+        str,
+        typer.Option(
+            help="The FOLIO ECS member tenant id (if applicable)",
+            envvar="FOLIO_MEMBER_TENANT_ID",
+        ),
+    ] = "",
+    import_profile_name: str = typer.Option(
+        "", help="The name of the data import job profile to use"
+    ),
+    batch_size: int = typer.Option(
+        10,
         help="The number of source records to include in a record batch sent to FOLIO.",
-    … (6 lines)
-        default=0.0,
-    )
-    parser.add_argument(
-        "--preprocessor",
-        type=str,
+    ),
+    batch_delay: float = typer.Option(
+        0.0, help="The number of seconds to wait between record batches."
+    ),
+    preprocessor: str = typer.Option(
+        "",
         help=(
             "Comma-separated python import paths to Python function(s) "
             "to apply to each MARC record before sending to FOLIO. Function should take "
             "a pymarc.Record object as input and return a pymarc.Record object."
         ),
-    … (14 lines)
-    parser.add_argument(
-        "--split-offset",
-        type=int,
+    ),
+    file_names_in_di_logs: bool = typer.Option(
+        False,
+        "--file-names-in-di-logs",
+        help="Show file names in FOLIO Data Import logs",
+    ),
+    split_files: bool = typer.Option(
+        False, "--split-files", help="Split files into smaller parts before importing."
+    ),
+    split_size: int = typer.Option(
+        1000, help="The number of records to include in each split file."
+    ),
+    split_offset: int = typer.Option(
+        0,
         help="The number of record batches of <split-size> to skip before starting import.",
-    … (3 lines)
-    parser.add_argument(
+    ),
+    no_progress: bool = typer.Option(
+        False,
         "--no-progress",
-        action="store_true",
         help="Disable progress bars (eg. for running in a CI environment)",
-    … (2 lines)
+        envvar="FOLIO_MARC_NO_PROGRESS",
+    ),
+    let_summary_fail: bool = typer.Option(
+        False,
         "--let-summary-fail",
-        action="store_true",
         help="Do not retry fetching the final job summary if it fails",
-    … (4 lines)
+        envvar="FOLIO_MARC_LET_SUMMARY_FAIL",
+    ),
+    preprocessor_config: str = typer.Option(
+        None,
         help=(
             "JSON file containing configuration for preprocessor functions. "
             "This is passed to MARCPreprocessor class as a dict of dicts."
         ),
-    … (8 lines)
-    )
+    ),
+    job_ids_file_path: str = typer.Option(
+        None, help="Path to a file to write job IDs to for later processing."
+    ),
+):
+    """
+    Command-line interface to batch import MARC records into FOLIO using FOLIO Data Import
+    """
+    set_up_cli_logging()
+    if not password:
+        password = typer.prompt("Enter FOLIO password: ", hide_input=True)
+    folio_client = folioclient.FolioClient(gateway_url, tenant_id, username, password)
 
-    …
-    …
-        folio_client.okapi_headers["x-okapi-tenant"] = args.member_tenant_id
+    if member_tenant_id:
+        folio_client.okapi_headers["x-okapi-tenant"] = member_tenant_id
 
-    if os.path.isabs(…
-        marc_files = [Path(x) for x in glob.glob(…
+    if os.path.isabs(marc_file_path):
+        marc_files = [Path(x) for x in glob.glob(marc_file_path)]
     else:
-        marc_files = list(Path("./").glob(…
+        marc_files = list(Path("./").glob(marc_file_path))
 
     marc_files.sort()
 
     if len(marc_files) == 0:
-        logger.critical(f"No files found matching {…
+        logger.critical(f"No files found matching {marc_file_path}. Exiting.")
         sys.exit(1)
     else:
         logger.info(marc_files)
 
-    if …
-        with open(…
+    if preprocessor_config:
+        with open(preprocessor_config, "r") as f:
             preprocessor_args = json.load(f)
     else:
         preprocessor_args = {}
 
-    if not …
-    … (15 lines)
+    if not import_profile_name:
+        try:
+            import_profiles = folio_client.folio_get(
+                "/data-import-profiles/jobProfiles",
+                "jobProfiles",
+                query_params={"limit": "1000"},
+            )
+            import_profile_names = [
+                profile["name"]
+                for profile in import_profiles
+                if "marc" in profile["dataType"].lower()
+            ]
+            questions = [
+                inquirer.List(
+                    "import_profile_name",
+                    message="Select an import profile",
+                    choices=import_profile_names,
+                )
+            ]
+            answers = inquirer.prompt(questions, raise_keyboard_interrupt=True)
+            import_profile_name = answers["import_profile_name"]
+        except httpx.HTTPStatusError as e:
+            logger.error(
+                f"HTTP Error fetching import profiles: {e}\n{getattr(getattr(e, 'response', ''), 'text', '')}\nExiting."
             )
-    … (3 lines)
+            sys.exit(1)
+        except KeyboardInterrupt:
+            logger.info("Keyboard interrupt received. Exiting.")
+            sys.exit(0)
+
+    job = None
     try:
-        …
+        job = MARCImportJob(
             folio_client,
             marc_files,
-            …
-            batch_size=…
-            batch_delay=…
-            marc_record_preprocessor=…
+            import_profile_name,
+            batch_size=batch_size,
+            batch_delay=batch_delay,
+            marc_record_preprocessor=preprocessor,
             preprocessor_args=preprocessor_args,
-            no_progress=…
-            let_summary_fail=…
-            split_files=…
-            split_size=…
-            split_offset=…
-        …
+            no_progress=no_progress,
+            let_summary_fail=let_summary_fail,
+            split_files=split_files,
+            split_size=split_size,
+            split_offset=split_offset,
+            job_ids_file_path=job_ids_file_path,
+            show_file_names_in_data_import_logs=file_names_in_di_logs,
+        )
+        asyncio.run(run_job(job))
+    except Exception as e:
+        logger.error("Could not initialize MARCImportJob: " + str(e))
+        raise typer.Exit(1)
+
+
+async def run_job(job):
+    try:
+        await job.do_work()
+    except httpx.HTTPStatusError as e:
+        logger.error(
+            f"HTTP Error importing files: {e}\n{getattr(getattr(e, 'response', ''), 'text', '')}\nExiting."
+        )
+        typer.Exit(1)
     except Exception as e:
         logger.error("Error importing files: " + str(e))
         raise
+    finally:
+        if job:
+            await job.wrap_up()
 
 
 class ExcludeLevelFilter(logging.Filter):

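The argparse CLI is rebuilt on typer: each `Annotated[str, typer.Option(...)]` parameter folds a command-line flag, an interactive prompt, and an environment-variable fallback into a single declaration, replacing the removed `parser.add_argument(...)` calls. The same pattern in miniature (names here are illustrative, not part of the package):

    import typer
    from typing_extensions import Annotated

    app = typer.Typer()


    @app.command()
    def greet(
        name: Annotated[
            str,
            typer.Option(prompt="Please enter a name", help="Who to greet", envvar="GREET_NAME"),
        ],
        shout: bool = typer.Option(False, "--shout", help="Uppercase the greeting"),
    ):
        message = f"Hello, {name}!"
        typer.echo(message.upper() if shout else message)


    if __name__ == "__main__":
        app()  # resolution order per option: CLI flag, then env var, then prompt
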
@@ -986,12 +1125,9 @@ class IncludeLevelFilter(logging.Filter):
         return record.levelno == self.level
 
 
-def …
-    """
-    Synchronous main function to run the MARC import job.
-    """
-    asyncio.run(main())
+def _main():
+    typer.run(main)
 
 
 if __name__ == "__main__":
-    …
+    app()