folio-data-import 0.2.8rc11__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- folio_data_import/MARCDataImport.py +240 -153
- folio_data_import/UserImport.py +11 -11
- folio_data_import/custom_exceptions.py +17 -0
- folio_data_import/marc_preprocessors/_preprocessors.py +218 -67
- {folio_data_import-0.2.8rc11.dist-info → folio_data_import-0.3.0.dist-info}/METADATA +2 -3
- folio_data_import-0.3.0.dist-info/RECORD +12 -0
- {folio_data_import-0.2.8rc11.dist-info → folio_data_import-0.3.0.dist-info}/WHEEL +1 -1
- folio_data_import-0.2.8rc11.dist-info/RECORD +0 -11
- {folio_data_import-0.2.8rc11.dist-info → folio_data_import-0.3.0.dist-info}/LICENSE +0 -0
- {folio_data_import-0.2.8rc11.dist-info → folio_data_import-0.3.0.dist-info}/entry_points.txt +0 -0
folio_data_import/MARCDataImport.py CHANGED

@@ -2,9 +2,10 @@ import argparse
 import asyncio
 import datetime
 import glob
-import importlib
 import io
+import json
 import logging
+import math
 import os
 import sys
 import uuid
@@ -14,7 +15,7 @@ from functools import cached_property
 from getpass import getpass
 from pathlib import Path
 from time import sleep
-from typing import List
+from typing import Any, BinaryIO, Callable, Dict, List, Union
 
 import folioclient
 import httpx
@@ -24,6 +25,9 @@ import tabulate
 from humps import decamelize
 from tqdm import tqdm
 
+from folio_data_import.custom_exceptions import FolioDataImportBatchError
+from folio_data_import.marc_preprocessors._preprocessors import MARCPreprocessor
+
 try:
     datetime_utc = datetime.UTC
 except AttributeError:
@@ -62,7 +66,6 @@ class MARCImportJob:
         import_profile_name (str): The name of the data import job profile to use.
         batch_size (int): The number of source records to include in a record batch (default=10).
         batch_delay (float): The number of seconds to wait between record batches (default=0).
-        consolidate (bool): Consolidate files into a single job. Default is one job for each file.
         no_progress (bool): Disable progress bars (eg. for running in a CI environment).
     """
 
@@ -74,10 +77,14 @@ class MARCImportJob:
     http_client: httpx.Client
     current_file: List[Path]
     record_batch: List[dict] = []
-    error_records: int = 0
     last_current: int = 0
     total_records_sent: int = 0
    finished: bool = False
+    job_id: str = ""
+    job_hrid: int = 0
+    current_file: Union[List[Path], List[io.BytesIO]] = []
+    _max_summary_retries: int = 2
+    _summary_retries: int = 0
 
    def __init__(
        self,
@@ -86,12 +93,17 @@ class MARCImportJob:
         import_profile_name: str,
         batch_size=10,
         batch_delay=0,
-        marc_record_preprocessor=
-
+        marc_record_preprocessor: Union[List[Callable], str] = [],
+        preprocessor_args: Dict[str, Dict] = {},
         no_progress=False,
         let_summary_fail=False,
+        split_files=False,
+        split_size=1000,
+        split_offset=0,
     ) -> None:
-        self.
+        self.split_files = split_files
+        self.split_size = split_size
+        self.split_offset = split_offset
         self.no_progress = no_progress
         self.let_summary_fail = let_summary_fail
         self.folio_client: folioclient.FolioClient = folio_client
@@ -100,20 +112,14 @@ class MARCImportJob:
         self.batch_size = batch_size
         self.batch_delay = batch_delay
         self.current_retry_timeout = None
-        self.marc_record_preprocessor = marc_record_preprocessor
-        self.pbar_sent: tqdm
-        self.pbar_imported: tqdm
-        self._max_summary_retries: int = 2
-        self._summary_retries: int = 0
+        self.marc_record_preprocessor: MARCPreprocessor = MARCPreprocessor(marc_record_preprocessor, **preprocessor_args)
 
     async def do_work(self) -> None:
         """
         Performs the necessary work for data import.
 
         This method initializes an HTTP client, files to store records that fail to send,
-        and calls
-        it imports all the files specified in `import_files` as a single batch. Otherwise,
-        it imports each file as a separate import job.
+        and calls the appropriate method to import MARC files based on the configuration.
 
         Returns:
             None
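
Note: the constructor now builds a MARCPreprocessor from `marc_record_preprocessor` and `preprocessor_args`, and adds `split_files`, `split_size`, and `split_offset`. A minimal usage sketch follows; the leading positional arguments (client, files, profile name) are not shown in this hunk, so their order here is an assumption, and the connection values and profile name are placeholders.

    import asyncio
    from pathlib import Path

    import folioclient

    from folio_data_import.MARCDataImport import MARCImportJob

    async def run() -> None:
        # Placeholder gateway/tenant/credentials.
        folio_client = folioclient.FolioClient(
            "https://folio-gateway.example.edu", "tenant", "import_user", "s3cret"
        )
        job = MARCImportJob(
            folio_client,                      # assumed leading parameters; the diff
            [Path("exports/records.mrc")],     # only shows the keyword arguments below
            "Example - Create MARC Bibs",      # placeholder job profile name
            batch_size=10,
            batch_delay=0,
            marc_record_preprocessor="prepend_prefix_001,clean_999_fields",
            preprocessor_args={"prepend_prefix_001": {"prefix": "LOCAL"}},  # example prefix
            no_progress=True,
            split_files=True,    # one import job per 1000-record chunk
            split_size=1000,
            split_offset=0,
        )
        await job.do_work()

    asyncio.run(run())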
@@ -138,15 +144,33 @@ class MARCImportJob:
             self.failed_batches_file = failed_batches
             logger.info(f"Writing failed batches to {self.failed_batches_file.name}")
             self.http_client = http_client
-            if self.
-
-                await self.import_marc_file()
+            if self.split_files:
+                await self.process_split_files()
             else:
                 for file in self.import_files:
                     self.current_file = [file]
                     await self.import_marc_file()
             await self.wrap_up()
 
+    async def process_split_files(self):
+        """
+        Process the import of files in smaller batches.
+        This method is called when `split_files` is set to True.
+        It splits each file into smaller chunks and processes them one by one.
+        """
+        for file in self.import_files:
+            with open(file, "rb") as f:
+                file_length = await self.read_total_records([f])
+            expected_batches = math.ceil(file_length / self.split_size)
+            logger.info(f"{file.name} contains {file_length} records. Splitting into {expected_batches} {self.split_size} record batches.")
+            zero_pad_parts = len(str(expected_batches)) if expected_batches > 1 else 2
+            for idx, batch in enumerate(self.split_marc_file(file, self.split_size), start=1):
+                if idx > self.split_offset:
+                    batch.name = f"{file.name} (Part {idx:0{zero_pad_parts}})"
+                    self.current_file = [batch]
+                    await self.import_marc_file()
+            self.move_file_to_complete(file)
+
     async def wrap_up(self) -> None:
         """
         Wraps up the data import process.
@@ -194,7 +218,7 @@ class MARCImportJob:
             )
             self.current_retry_timeout = None
         except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
-            if not hasattr(e, "response") or e.response.status_code in [502, 504]:
+            if not hasattr(e, "response") or e.response.status_code in [502, 504, 401]:
                 error_text = e.response.text if hasattr(e, "response") else str(e)
                 logger.warning(f"SERVER ERROR fetching job status: {error_text}. Retrying.")
                 sleep(0.25)
@@ -256,7 +280,7 @@ class MARCImportJob:
         """
         try:
             create_job = self.http_client.post(
-                self.folio_client.
+                self.folio_client.gateway_url + "/change-manager/jobExecutions",
                 headers=self.folio_client.okapi_headers,
                 json={"sourceType": "ONLINE", "userId": self.folio_client.current_user},
             )
@@ -275,7 +299,7 @@ class MARCImportJob:
             )
             raise e
         self.job_id = create_job.json()["parentJobExecutionId"]
-        logger.info("Created job:
+        logger.info(f"Created job: {self.job_id}")
 
     @cached_property
     def import_profile(self) -> dict:
@@ -305,7 +329,7 @@ class MARCImportJob:
             The response from the HTTP request to set the job profile.
         """
         set_job_profile = self.http_client.put(
-            self.folio_client.
+            self.folio_client.gateway_url
             + "/change-manager/jobExecutions/"
             + self.job_id
             + "/jobProfile",
@@ -318,6 +342,8 @@ class MARCImportJob:
         )
         try:
             set_job_profile.raise_for_status()
+            self.job_hrid = set_job_profile.json()['hrId']
+            logger.info(f"Job HRID: {self.job_hrid}")
         except httpx.HTTPError as e:
             logger.error(
                 "Error creating job: "
@@ -328,7 +354,7 @@ class MARCImportJob:
             raise e
 
     @staticmethod
-    async def read_total_records(files) -> int:
+    async def read_total_records(files: List[BinaryIO]) -> int:
        """
        Reads the total number of records from the given files.
 
@@ -357,17 +383,15 @@ class MARCImportJob:
         """
         try:
             post_batch = self.http_client.post(
-                self.folio_client.
+                self.folio_client.gateway_url
                 + f"/change-manager/jobExecutions/{self.job_id}/records",
                 headers=self.folio_client.okapi_headers,
                 json=batch_payload,
             )
-            # if batch_payload["recordsMetadata"]["last"]:
-            #     logger.log(
-            #         25,
-            #         f"Sending last batch of {batch_payload['recordsMetadata']['total']} records.",
-            #     )
         except (httpx.ConnectTimeout, httpx.ReadTimeout):
+            logger.warning(
+                f"CONNECTION ERROR posting batch {batch_payload['id']}. Retrying..."
+            )
             sleep(0.25)
             return await self.process_record_batch(batch_payload)
         try:
@@ -375,20 +399,21 @@ class MARCImportJob:
             self.total_records_sent += len(self.record_batch)
             self.record_batch = []
             self.pbar_sent.update(len(batch_payload["initialRecords"]))
-        except
+        except httpx.HTTPStatusError as e:
             if (
-
-            ):  # TODO:
+                e.response.status_code in [500, 400, 422]
+            ):  # TODO: Update once we no longer have to support < Sunflower to just be 400
                 self.total_records_sent += len(self.record_batch)
                 self.record_batch = []
                 self.pbar_sent.update(len(batch_payload["initialRecords"]))
             else:
-                logger.error("Error posting batch: " + str(e))
                 for record in self.record_batch:
                     self.failed_batches_file.write(record)
-
-
-
+                raise FolioDataImportBatchError(
+                    batch_payload['id'],
+                    f"{e}\n{e.response.text}",
+                    e
+                )
         await self.get_job_status()
         sleep(self.batch_delay)
@@ -417,16 +442,12 @@ class MARCImportJob:
                     await self.create_batch_payload(
                         counter,
                         total_records,
-
-                        == (total_records - self.error_records),
+                        counter == total_records,
                     ),
                 )
                 sleep(0.25)
             if record:
-
-                record = await self.apply_marc_record_preprocessing(
-                    record, self.marc_record_preprocessor
-                )
+                record = self.marc_record_preprocessor.do_work(record)
                 self.record_batch.append(record.as_marc())
                 counter += 1
             else:
@@ -437,75 +458,26 @@ class MARCImportJob:
                     "",
                 )
                 self.bad_records_file.write(reader.current_chunk)
-        if self.
-
-
-
-
-
-
-
-            )
-        import_complete_path = file_path.parent.joinpath("import_complete")
-        if not import_complete_path.exists():
-            logger.debug(f"Creating import_complete directory: {import_complete_path.absolute()}")
-            import_complete_path.mkdir(exist_ok=True)
-        logger.debug(f"Moving {file_path} to {import_complete_path.absolute()}")
-        file_path.rename(
-            file_path.parent.joinpath("import_complete", file_path.name)
-        )
-
-    @staticmethod
-    async def apply_marc_record_preprocessing(
-        record: pymarc.Record, func_or_path
-    ) -> pymarc.Record:
-        """
-        Apply preprocessing to the MARC record before sending it to FOLIO.
-
-        Args:
-            record (pymarc.Record): The MARC record to preprocess.
-            func_or_path (Union[Callable, str]): The preprocessing function or its import path.
-
-        Returns:
-            pymarc.Record: The preprocessed MARC record.
-        """
-        if isinstance(func_or_path, str):
-            func_paths = func_or_path.split(",")
-            for func_path in func_paths:
-                record = await MARCImportJob._apply_single_marc_record_preprocessing_by_path(
-                    record, func_path
-                )
-        elif callable(func_or_path):
-            record = func_or_path(record)
-        else:
-            logger.warning(
-                f"Invalid preprocessing function: {func_or_path}. Skipping preprocessing."
+        if not self.split_files:
+            self.move_file_to_complete(file_path)
+        if self.record_batch or not self.finished:
+            await self.process_record_batch(
+                await self.create_batch_payload(
+                    counter,
+                    total_records,
+                    counter == total_records,
+                ),
             )
-        return record
-
-
-
-
-
-
-
-
-
-            func_path (str): The path to the preprocessing function.
-
-        Returns:
-            pymarc.Record: The preprocessed MARC record.
-        """
-        try:
-            module_path, func_name = func_path.rsplit(".", 1)
-            module = importlib.import_module(module_path)
-            func = getattr(module, func_name)
-            record = func(record)
-        except Exception as e:
-            logger.warning(
-                f"Error applying preprocessing function {func_path}: {e}. Skipping."
+    def move_file_to_complete(self, file_path: Path):
+        import_complete_path = file_path.parent.joinpath("import_complete")
+        if not import_complete_path.exists():
+            logger.debug(f"Creating import_complete directory: {import_complete_path.absolute()}")
+            import_complete_path.mkdir(exist_ok=True)
+        logger.debug(f"Moving {file_path} to {import_complete_path.absolute()}")
+        file_path.rename(
+            file_path.parent.joinpath("import_complete", file_path.name)
         )
-        return record
 
     async def create_batch_payload(self, counter, total_records, is_last) -> dict:
         """
@@ -523,13 +495,53 @@ class MARCImportJob:
             "id": str(uuid.uuid4()),
             "recordsMetadata": {
                 "last": is_last,
-                "counter": counter
+                "counter": counter,
                 "contentType": "MARC_RAW",
-                "total": total_records
+                "total": total_records,
             },
             "initialRecords": [{"record": x.decode()} for x in self.record_batch],
         }
 
+    @staticmethod
+    def split_marc_file(file_path, batch_size):
+        """Generator to iterate over MARC records in batches, yielding BytesIO objects."""
+        with open(file_path, "rb") as f:
+            batch = io.BytesIO()
+            count = 0
+
+            while True:
+                leader = f.read(24)
+                if not leader:
+                    break  # End of file
+
+                try:
+                    record_length = int(leader[:5])  # Extract record length from leader
+                except ValueError:
+                    raise ValueError("Invalid MARC record length encountered.")
+
+                record_body = f.read(record_length - 24)
+                if len(record_body) != record_length - 24:
+                    raise ValueError("Unexpected end of file while reading MARC record.")
+
+                # Verify record terminator
+                if record_body[-1:] != b'\x1D':
+                    raise ValueError("MARC record does not end with the expected terminator (0x1D).")
+
+                # Write the full record to the batch buffer
+                batch.write(leader + record_body)
+                count += 1
+
+                if count >= batch_size:
+                    batch.seek(0)
+                    yield batch
+                    batch = io.BytesIO()  # Reset buffer
+                    count = 0
+
+            # Yield any remaining records
+            if count > 0:
+                batch.seek(0)
+                yield batch
+
     async def import_marc_file(self) -> None:
         """
         Imports MARC file into the system.
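
The `split_marc_file` generator leans on the MARC 21 binary layout: the first five bytes of each 24-byte leader hold the total record length as zero-padded ASCII digits, and every record ends with the 0x1D record terminator, so a file can be chunked without parsing any field data. A standalone sketch of the same walk (independent of the class, for illustration only):

    import io

    def iter_marc_record_lengths(raw: bytes):
        """Yield the declared length of each record in a raw MARC 21 byte stream."""
        buf = io.BytesIO(raw)
        while True:
            leader = buf.read(24)
            if not leader:
                break                                # end of stream
            length = int(leader[:5])                 # e.g. b"00714..." -> 714 bytes total
            body = buf.read(length - 24)             # remainder, including the terminator
            if not body.endswith(b"\x1d"):
                raise ValueError("record does not end with the MARC terminator (0x1D)")
            yield length

With `--split-size 1000`, each yielded BytesIO therefore holds 1000 whole records (plus a shorter final chunk), and each chunk becomes its own FOLIO import job named "file (Part NN)".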
@@ -551,13 +563,24 @@ class MARCImportJob:
         await self.create_folio_import_job()
         await self.set_job_profile()
         with ExitStack() as stack:
-
-
-
+            try:
+                if isinstance(self.current_file[0], Path):
+                    files = [
+                        stack.enter_context(open(file, "rb")) for file in self.current_file
+                    ]
+                elif isinstance(self.current_file[0], io.BytesIO):
+                    files = [
+                        stack.enter_context(file) for file in self.current_file
+                    ]
+                else:
+                    raise ValueError("Invalid file type. Must be Path or BytesIO.")
+            except IndexError as e:
+                logger.error(f"Error opening file: {e}")
+                raise e
             total_records = await self.read_total_records(files)
             with (
                 tqdm(
-                    desc="Imported: ",
+                    desc=f"Imported ({self.job_hrid}): ",
                     total=total_records,
                     position=1,
                     disable=self.no_progress,
@@ -569,48 +592,81 @@ class MARCImportJob:
                     disable=self.no_progress,
                 ) as pbar_sent,
             ):
-
-
-
-
-
+                try:
+                    self.pbar_sent = pbar_sent
+                    self.pbar_imported = pbar_imported
+                    await self.process_records(files, total_records)
+                    while not self.finished:
+                        await self.get_job_status()
+                        sleep(1)
+                except FolioDataImportBatchError as e:
+                    logger.error(
+                        f"Unhandled error posting batch {e.batch_id}: {e.message}"
+                    )
+                    await self.cancel_job()
+                    raise e
             if self.finished:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                await self.log_job_summary()
+                self.last_current = 0
+                self.finished = False
+
+    async def cancel_job(self) -> None:
+        """
+        Cancels the current job execution.
+
+        This method sends a request to cancel the job execution and logs the result.
+
+        Returns:
+            None
+        """
+        try:
+            cancel = self.http_client.delete(
+                self.folio_client.gateway_url
+                + f"/change-manager/jobExecutions/{self.job_id}/records",
+                headers=self.folio_client.okapi_headers,
+            )
+            cancel.raise_for_status()
+            self.finished = True
+            logger.info(f"Cancelled job: {self.job_id}")
+        except (httpx.ConnectTimeout, httpx.ReadTimeout):
+            logger.warning(f"CONNECTION ERROR cancelling job {self.job_id}. Retrying...")
+            sleep(0.25)
+            await self.cancel_job()
+
+    async def log_job_summary(self):
+        if job_summary := await self.get_job_summary():
+            job_id = job_summary.pop("jobExecutionId", None)
+            total_errors = job_summary.pop("totalErrors", 0)
+            columns = ["Summary"] + list(job_summary.keys())
+            rows = set()
+            for key in columns[1:]:
+                rows.update(job_summary[key].keys())
+
+            table_data = []
+            for row in rows:
+                metric_name = decamelize(row).split("_")[1]
+                table_row = [metric_name]
+                for col in columns[1:]:
+                    table_row.append(job_summary[col].get(row, "N/A"))
+                table_data.append(table_row)
+            table_data.sort(key=lambda x: REPORT_SUMMARY_ORDERING.get(x[0], 99))
+            columns = columns[:1] + [
                 " ".join(decamelize(x).split("_")[:-1]) for x in columns[1:]
             ]
-
+            logger.info(
                 f"Results for {'file' if len(self.current_file) == 1 else 'files'}: "
                 f"{', '.join([os.path.basename(x.name) for x in self.current_file])}"
             )
-
+            logger.info(
                 "\n"
                 + tabulate.tabulate(
                     table_data, headers=columns, tablefmt="fancy_grid"
                 ),
             )
-
-
-
-
-            self.last_current = 0
-            self.finished = False
+            if total_errors:
+                logger.info(f"Total errors: {total_errors}. Job ID: {job_id}.")
+        else:
+            logger.error(f"No job summary available for job #{self.job_hrid}({self.job_id}).")
 
     async def get_job_summary(self) -> dict:
         """
@@ -749,19 +805,31 @@ async def main() -> None:
         "--preprocessor",
         type=str,
         help=(
-            "
-            "to apply to each MARC record before sending to FOLIO."
+            "Comma-separated python import paths to Python function(s) "
+            "to apply to each MARC record before sending to FOLIO. Function should take "
+            "a pymarc.Record object as input and return a pymarc.Record object."
         ),
         default=None,
     )
+
     parser.add_argument(
-        "--
+        "--split-files",
         action="store_true",
-        help=
-        "Consolidate records into a single job. "
-        "Default is to create a new job for each MARC file."
-        ),
+        help="Split files into smaller parts before importing.",
     )
+    parser.add_argument(
+        "--split-size",
+        type=int,
+        help="The number of records to include in each split file.",
+        default=1000,
+    )
+    parser.add_argument(
+        "--split-offset",
+        type=int,
+        help="The number of record batches of <split-size> to skip before starting import.",
+        default=0,
+    )
+
     parser.add_argument(
         "--no-progress",
         action="store_true",
@@ -772,6 +840,16 @@ async def main() -> None:
         action="store_true",
         help="Do not retry fetching the final job summary if it fails",
     )
+    parser.add_argument(
+        "--preprocessor-config",
+        type=str,
+        help=(
+            "JSON file containing configuration for preprocessor functions. "
+            "This is passed to MARCPreprocessor class as a dict of dicts."
+        ),
+        default=None,
+    )
+
     args = parser.parse_args()
     if not args.password:
         args.password = getpass("Enter FOLIO password: ")
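
The new `--preprocessor-config` option points at a JSON file whose top-level keys are preprocessor names (bare function names, full `module.path.func` paths, or `"default"`) and whose values are keyword arguments for those functions; the parsed dict is handed to `MARCPreprocessor` as `preprocessor_args`. A plausible config written from Python (the `prefix` value is only an example):

    import json

    # Keys may be "default", a bare function name, or a full import path.
    preprocessor_config = {
        "default": {},
        "prepend_prefix_001": {"prefix": "LOCAL"},   # illustrative prefix value
    }

    with open("preprocessor_config.json", "w") as f:
        json.dump(preprocessor_config, f, indent=2)

The file is then passed alongside the preprocessor list, e.g. `--preprocessor prepend_prefix_001 --preprocessor-config preprocessor_config.json`.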
@@ -796,6 +874,12 @@ async def main() -> None:
     else:
         logger.info(marc_files)
 
+    if args.preprocessor_config:
+        with open(args.preprocessor_config, "r") as f:
+            preprocessor_args = json.load(f)
+    else:
+        preprocessor_args = {}
+
     if not args.import_profile_name:
         import_profiles = folio_client.folio_get(
             "/data-import-profiles/jobProfiles",
@@ -824,9 +908,12 @@ async def main() -> None:
             batch_size=args.batch_size,
             batch_delay=args.batch_delay,
             marc_record_preprocessor=args.preprocessor,
-
+            preprocessor_args=preprocessor_args,
             no_progress=bool(args.no_progress),
             let_summary_fail=bool(args.let_summary_fail),
+            split_files=bool(args.split_files),
+            split_size=args.split_size,
+            split_offset=args.split_offset,
         ).do_work()
     except Exception as e:
         logger.error("Error importing files: " + str(e))
folio_data_import/UserImport.py CHANGED

@@ -137,7 +137,7 @@ class UserImporter:  # noqa: R0902
         match_key = "id" if ("id" in user_obj) else self.match_key
         try:
             existing_user = await self.http_client.get(
-                self.folio_client.
+                self.folio_client.gateway_url + "/users",
                 headers=self.folio_client.okapi_headers,
                 params={"query": f"{match_key}=={user_obj[match_key]}"},
             )
@@ -161,7 +161,7 @@ class UserImporter:  # noqa: R0902
         """
         try:
             existing_rp = await self.http_client.get(
-                self.folio_client.
+                self.folio_client.gateway_url
                 + "/request-preference-storage/request-preference",
                 headers=self.folio_client.okapi_headers,
                 params={
@@ -188,7 +188,7 @@ class UserImporter:  # noqa: R0902
         """
         try:
             existing_pu = await self.http_client.get(
-                self.folio_client.
+                self.folio_client.gateway_url + "/perms/users",
                 headers=self.folio_client.okapi_headers,
                 params={
                     "query": f"userId=={existing_user.get('id', user_obj.get('id', ''))}"
@@ -369,7 +369,7 @@ class UserImporter:  # noqa: R0902
             else:
                 existing_user[key] = value
         create_update_user = await self.http_client.put(
-            self.folio_client.
+            self.folio_client.gateway_url + f"/users/{existing_user['id']}",
             headers=self.folio_client.okapi_headers,
             json=existing_user,
         )
@@ -389,7 +389,7 @@ class UserImporter:  # noqa: R0902
             HTTPError: If the HTTP request to create the user fails.
         """
         response = await self.http_client.post(
-            self.folio_client.
+            self.folio_client.gateway_url + "/users",
             headers=self.folio_client.okapi_headers,
             json=user_obj,
         )
@@ -589,7 +589,7 @@ class UserImporter:  # noqa: R0902
         rp_obj["userId"] = new_user_obj["id"]
         # print(rp_obj)
         response = await self.http_client.post(
-            self.folio_client.
+            self.folio_client.gateway_url
             + "/request-preference-storage/request-preference",
             headers=self.folio_client.okapi_headers,
             json=rp_obj,
@@ -613,7 +613,7 @@ class UserImporter:  # noqa: R0902
         existing_rp.update(rp_obj)
         # print(existing_rp)
         response = await self.http_client.put(
-            self.folio_client.
+            self.folio_client.gateway_url
             + f"/request-preference-storage/request-preference/{existing_rp['id']}",
             headers=self.folio_client.okapi_headers,
             json=existing_rp,
@@ -635,7 +635,7 @@ class UserImporter:  # noqa: R0902
         """
         perms_user_obj = {"userId": new_user_obj["id"], "permissions": []}
         response = await self.http_client.post(
-            self.folio_client.
+            self.folio_client.gateway_url + "/perms/users",
             headers=self.folio_client.okapi_headers,
             json=perms_user_obj,
         )
@@ -788,7 +788,7 @@ class UserImporter:  # noqa: R0902
         """
         try:
             existing_spu = await self.http_client.get(
-                self.folio_client.
+                self.folio_client.gateway_url + "/service-points-users",
                 headers=self.folio_client.okapi_headers,
                 params={"query": f"userId=={existing_user['id']}"},
             )
@@ -812,7 +812,7 @@ class UserImporter:  # noqa: R0902
         """
         spu_obj["userId"] = existing_user["id"]
         response = await self.http_client.post(
-            self.folio_client.
+            self.folio_client.gateway_url + "/service-points-users",
             headers=self.folio_client.okapi_headers,
             json=spu_obj,
         )
@@ -831,7 +831,7 @@ class UserImporter:  # noqa: R0902
         """
         existing_spu.update(spu_obj)
         response = await self.http_client.put(
-            self.folio_client.
+            self.folio_client.gateway_url + f"/service-points-users/{existing_spu['id']}",
            headers=self.folio_client.okapi_headers,
            json=existing_spu,
        )
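
Every UserImport.py change above is the same mechanical substitution: request URLs are now built from `FolioClient.gateway_url` (the removed lines are truncated in this rendering). A hedged sketch of the pattern, with placeholder connection values and query:

    import folioclient
    import httpx

    folio_client = folioclient.FolioClient(
        "https://folio-gateway.example.edu", "tenant", "user", "password"
    )

    with httpx.Client() as client:
        resp = client.get(
            folio_client.gateway_url + "/users",
            headers=folio_client.okapi_headers,
            params={"query": "username==jdoe"},   # example CQL query
        )
        resp.raise_for_status()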
folio_data_import/custom_exceptions.py ADDED

@@ -0,0 +1,17 @@
+"""Custom exceptions for the Folio Data Import module."""
+
+class FolioDataImportError(Exception):
+    """Base class for all exceptions in the Folio Data Import module."""
+    pass
+
+class FolioDataImportBatchError(FolioDataImportError):
+    """Exception raised for errors in the Folio Data Import batch process.
+
+    Attributes:
+        batch_id -- ID of the batch that caused the error
+        message -- explanation of the error
+    """
+    def __init__(self, batch_id, message, exception=None):
+        self.batch_id = batch_id
+        self.message = message
+        super().__init__(f"Unhandled error posting batch {batch_id}: {message}")
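
With the new exception type, `process_record_batch` raises `FolioDataImportBatchError` for unrecoverable batch failures and `import_marc_file` cancels the job before re-raising, so callers can catch it around `do_work()`. A brief sketch, assuming `job` is an already configured `MARCImportJob`:

    import logging

    from folio_data_import.custom_exceptions import FolioDataImportBatchError

    logger = logging.getLogger(__name__)

    async def run_import(job) -> None:
        # `job` is assumed to be a configured MARCImportJob instance.
        try:
            await job.do_work()
        except FolioDataImportBatchError as exc:
            # The failed records were already written to the job's failed-batches file.
            logger.error("Batch %s failed permanently: %s", exc.batch_id, exc.message)
            raise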
folio_data_import/marc_preprocessors/_preprocessors.py CHANGED

@@ -1,69 +1,168 @@
+import importlib
+import sys
+from typing import Callable, Dict, List, Tuple, Union
 import pymarc
 import logging
 
+from pymarc.record import Record
+
 logger = logging.getLogger("folio_data_import.MARCDataImport")
 
+class MARCPreprocessor:
+    """
+    A class to preprocess MARC records for data import into FOLIO.
+    """
+
+    def __init__(self, preprocessors: Union[str, List[Callable]], **kwargs):
+        """
+        Initialize the MARCPreprocessor with a list of preprocessors.
+
+        Args:
+            preprocessors (Union[str, List[Callable]]): A string of comma-separated function names or a list of callable preprocessor functions to apply.
+        """
+        self.preprocessor_args: Dict[str, Dict] = kwargs
+        self.preprocessors: List[Tuple[Callable, Dict]] = self._get_preprocessor_functions(
+            preprocessors
+        )
+        self.proc_kwargs = kwargs
+        self.record = None
+
+    def _get_preprocessor_args(self, func: Callable) -> Dict:
+        """
+        Get the arguments for the preprocessor function.
+
+        Args:
+            func (Callable): The preprocessor function.
+
+        Returns:
+            Dict: A dictionary of arguments for the preprocessor function.
+        """
+        func_path = f"{func.__module__}.{func.__name__}"
+        path_args: Dict = self.preprocessor_args.get("default", {})
+        path_args.update(self.preprocessor_args.get(func.__name__, {}))
+        path_args.update(self.preprocessor_args.get(func_path, {}))
+        return path_args
+
+    def _get_preprocessor_functions(self, func_list: Union[str, List[Callable]]) -> List[Callable]:
+        """
+        Get the preprocessor functions based on the provided names.
+
+        Returns:
+            List[callable]: A list of preprocessor functions.
+        """
+        preprocessors = []
+        if isinstance(func_list, str):
+            func_list = func_list.split(",")
+        else:
+            for f in func_list:
+                if not callable(f):
+                    logger.warning(
+                        f"Preprocessing function {f} is not callable. Skipping."
+                    )
+                else:
+                    preprocessors.append((f, self._get_preprocessor_args(f)))
+            return preprocessors
+        for f_path in func_list:
+            f_import = f_path.rsplit(".", 1)
+            if len(f_import) == 1:
+                # If the function is not a full path, assume it's in the current module
+                if func := getattr(sys.modules[__name__], f_import[0], None):
+                    if callable(func):
+                        preprocessors.append((func, self._get_preprocessor_args(func)))
+                    else:
+                        logger.warning(
+                            f"Preprocessing function {f_path} is not callable. Skipping."
+                        )
+                else:
+                    logger.warning(
+                        f"Preprocessing function {f_path} not found in current module. Skipping."
+                    )
+            elif len(f_import) == 2:
+                # If the function is a full path, import it
+                module_path, func_name = f_import
+                try:
+                    module = importlib.import_module(module_path)
+                    func = getattr(module, func_name)
+                    preprocessors.append((func, self._get_preprocessor_args(func)))
+                except ImportError as e:
+                    logger.warning(
+                        f"Error importing preprocessing function {f_path}: {e}. Skipping."
+                    )
+        return preprocessors
+
+    def do_work(self, record: Record) -> Record:
+        """
+        Preprocess the MARC record.
+        """
+        for proc, kwargs in self.preprocessors:
+            record = proc(record, **kwargs)
+        return record
 
-
+
+
+def prepend_prefix_001(record: Record, prefix: str) -> Record:
     """
     Prepend a prefix to the record's 001 field.
 
     Args:
-        record (
+        record (Record): The MARC record to preprocess.
         prefix (str): The prefix to prepend to the 001 field.
 
     Returns:
-
+        Record: The preprocessed MARC record.
     """
-
+    if "001" in record:
+        record["001"].data = f"({prefix})" + record["001"].data
+    else:
+        logger.warning("Field '001' not found in record. Skipping prefix prepend.")
     return record
 
 
-def prepend_ppn_prefix_001(record:
+def prepend_ppn_prefix_001(record: Record, **kwargs) -> Record:
     """
     Prepend the PPN prefix to the record's 001 field. Useful when
     importing records from the ABES SUDOC catalog
 
     Args:
-        record (
+        record (Record): The MARC record to preprocess.
 
     Returns:
-
+        Record: The preprocessed MARC record.
     """
     return prepend_prefix_001(record, "PPN")
 
 
-def prepend_abes_prefix_001(record:
+def prepend_abes_prefix_001(record: Record, **kwargs) -> Record:
     """
     Prepend the ABES prefix to the record's 001 field. Useful when
     importing records from the ABES SUDOC catalog
 
     Args:
-        record (
+        record (Record): The MARC record to preprocess.
 
     Returns:
-
+        Record: The preprocessed MARC record.
     """
     return prepend_prefix_001(record, "ABES")
 
 
-def strip_999_ff_fields(record:
+def strip_999_ff_fields(record: Record, **kwargs) -> Record:
     """
     Strip all 999 fields with ff indicators from the record.
     Useful when importing records exported from another FOLIO system
 
     Args:
-        record (
+        record (Record): The MARC record to preprocess.
 
     Returns:
-
+        Record: The preprocessed MARC record.
     """
     for field in record.get_fields("999"):
         if field.indicators == pymarc.Indicators(*["f", "f"]):
             record.remove_field(field)
     return record
 
-def clean_999_fields(record:
+def clean_999_fields(record: Record, **kwargs) -> Record:
     """
     The presence of 999 fields, with or without ff indicators, can cause
     issues with data import mapping in FOLIO. This function calls strip_999_ff_fields
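
A short sketch of driving the new `MARCPreprocessor` directly, mixing a bundled preprocessor with a custom callable (the custom function and the file name are illustrative):

    import pymarc

    from folio_data_import.marc_preprocessors._preprocessors import (
        MARCPreprocessor,
        strip_999_ff_fields,
    )

    def add_local_note(record: pymarc.Record, **kwargs) -> pymarc.Record:
        """Illustrative custom preprocessor: add a 907 note to every record."""
        record.add_ordered_field(
            pymarc.Field(
                tag="907",
                indicators=pymarc.Indicators(" ", " "),
                subfields=[pymarc.Subfield(code="a", value="loaded-by-folio-data-import")],
            )
        )
        return record

    # Callables are used as-is; per-function kwargs are looked up by function name
    # (see _get_preprocessor_args above). No kwargs are supplied here.
    pp = MARCPreprocessor([strip_999_ff_fields, add_local_note])

    with open("records.mrc", "rb") as fh:       # assumed local MARC file
        for record in pymarc.MARCReader(fh):
            record = pp.do_work(record)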
@@ -71,10 +170,10 @@ def clean_999_fields(record: pymarc.Record) -> pymarc.Record:
     to 945 fields.
 
     Args:
-        record (
+        record (Record): The MARC record to preprocess.
 
     Returns:
-
+        Record: The preprocessed MARC record.
     """
     record = strip_999_ff_fields(record)
     for field in record.get_fields("999"):
@@ -87,7 +186,31 @@ def clean_999_fields(record: pymarc.Record) -> pymarc.Record:
             record.remove_field(field)
     return record
 
-def
+def clean_non_ff_999_fields(record: Record, **kwargs) -> Record:
+    """
+    When loading migrated MARC records from folio_migration_tools, the presence of other 999 fields
+    than those set by the migration process can cause the record to fail to load properly. This preprocessor
+    function moves all 999 fields with non-ff indicators to 945 fields with 99 indicators.
+    """
+    for field in record.get_fields("999"):
+        if field.indicators != pymarc.Indicators(*["f", "f"]):
+            logger.log(
+                26,
+                "DATA ISSUE\t%s\t%s\t%s",
+                record["001"].value(),
+                "Record contains a 999 field with non-ff indicators: Moving field to a 945 with indicators \"99\"",
+                field,
+            )
+            _945 = pymarc.Field(
+                tag="945",
+                indicators=pymarc.Indicators("9", "9"),
+                subfields=field.subfields,
+            )
+            record.add_ordered_field(_945)
+            record.remove_field(field)
+    return record
+
+def sudoc_supercede_prep(record: Record, **kwargs) -> Record:
     """
     Preprocesses a record from the ABES SUDOC catalog to copy 035 fields
     with a $9 subfield value of 'sudoc' to 935 fields with a $a subfield
@@ -96,10 +219,10 @@ def sudoc_supercede_prep(record: pymarc.Record) -> pymarc.Record:
     in FOLIO. This also applyes the prepend_ppn_prefix_001 function to the record.
 
     Args:
-        record (
+        record (Record): The MARC record to preprocess.
 
     Returns:
-
+        Record: The preprocessed MARC record.
     """
     record = prepend_abes_prefix_001(record)
     for field in record.get_fields("035"):
@@ -113,7 +236,7 @@ def sudoc_supercede_prep(record: pymarc.Record) -> pymarc.Record:
     return record
 
 
-def clean_empty_fields(record:
+def clean_empty_fields(record: Record, **kwargs) -> Record:
     """
     Remove empty fields and subfields from the record. These can cause
     data import mapping issues in FOLIO. Removals are logged at custom
@@ -121,10 +244,10 @@ def clean_empty_fields(record: pymarc.Record) -> pymarc.Record:
     data issues report.
 
     Args:
-        record (
+        record (Record): The MARC record to preprocess.
 
     Returns:
-
+        Record: The preprocessed MARC record.
     """
     MAPPED_FIELDS = {
         "010": ["a", "z"],
@@ -233,73 +356,72 @@ def clean_empty_fields(record: pymarc.Record) -> pymarc.Record:
         "856": ["u", "y", "z"],
     }
 
-    for field in
+    for field in record.get_fields(*MAPPED_FIELDS.keys()):
         len_subs = len(field.subfields)
-        subfield_value = bool(field.subfields[0].value) if len_subs
-        if
-
+        subfield_value = bool(field.subfields[0].value) if len_subs else False
+        if int(field.tag) > 9 and len_subs == 0:
+            logger.log(
+                26,
+                "DATA ISSUE\t%s\t%s\t%s",
+                record["001"].value(),
+                f"{field.tag} is empty, removing field",
+                field,
+            )
+            record.remove_field(field)
+        elif len_subs == 1 and not subfield_value:
+            logger.log(
+                26,
+                "DATA ISSUE\t%s\t%s\t%s",
+                record["001"].value(),
+                f"{field.tag}${field.subfields[0].code} is empty, no other subfields present, removing field",
+                field,
+            )
+            record.remove_field(field)
+        else:
+            if len_subs > 1 and "a" in field and not field["a"].strip():
                 logger.log(
                     26,
                     "DATA ISSUE\t%s\t%s\t%s",
                     record["001"].value(),
-                    f"{field.tag} is empty, removing
+                    f"{field.tag}$a is empty, removing subfield",
                     field,
                 )
-
-
+                field.delete_subfield("a")
+            for idx, subfield in enumerate(list(field.subfields), start=1):
+                if (
+                    subfield.code in MAPPED_FIELDS.get(field.tag, [])
+                    and not subfield.value
+                ):
+                    logger.log(
+                        26,
+                        "DATA ISSUE\t%s\t%s\t%s",
+                        record["001"].value(),
+                        f"{field.tag}${subfield.code} ({ordinal(idx)} subfield) is empty, but other subfields have values, removing subfield",
+                        field,
+                    )
+                    field.delete_subfield(subfield.code)
+            if len(field.subfields) == 0:
                 logger.log(
                     26,
                     "DATA ISSUE\t%s\t%s\t%s",
                     record["001"].value(),
-                    f"{field.tag}
+                    f"{field.tag} has no non-empty subfields after cleaning, removing field",
                     field,
                 )
                 record.remove_field(field)
-        else:
-            if len_subs > 1 and "a" in field and not field["a"].strip():
-                logger.log(
-                    26,
-                    "DATA ISSUE\t%s\t%s\t%s",
-                    record["001"].value(),
-                    f"{field.tag}$a is empty, removing subfield",
-                    field,
-                )
-                field.delete_subfield("a")
-            for idx, subfield in enumerate(list(field.subfields), start=1):
-                if (
-                    subfield.code in MAPPED_FIELDS.get(field.tag, [])
-                    and not subfield.value
-                ):
-                    logger.log(
-                        26,
-                        "DATA ISSUE\t%s\t%s\t%s",
-                        record["001"].value(),
-                        f"{field.tag}${subfield.code} ({ordinal(idx)} subfield) is empty, but other subfields have values, removing subfield",
-                        field,
-                    )
-                    field.delete_subfield(subfield.code)
-            if len(field.subfields) == 0:
-                logger.log(
-                    26,
-                    "DATA ISSUE\t%s\t%s\t%s",
-                    record["001"].value(),
-                    f"{field.tag} has no non-empty subfields after cleaning, removing field",
-                    field,
-                )
-                record.remove_field(field)
     return record
 
 
-def fix_leader(record:
+def fix_leader(record: Record, **kwargs) -> Record:
     """
     Fixes the leader of the record by setting the record status to 'c' (modified
     record) and the type of record to 'a' (language material).
 
     Args:
-        record (
+        record (Record): The MARC record to preprocess.
 
     Returns:
-
+        Record: The preprocessed MARC record.
     """
     VALID_STATUSES = ["a", "c", "d", "n", "p"]
     VALID_TYPES = ["a", "c", "d", "e", "f", "g", "i", "j", "k", "m", "o", "p", "r", "t"]
@@ -309,7 +431,7 @@ def fix_leader(record: pymarc.Record) -> pymarc.Record:
             "DATA ISSUE\t%s\t%s\t%s",
             record["001"].value(),
             f"Invalid record status: {record.leader[5]}, setting to 'c'",
-            record,
+            record.leader,
         )
         record.leader = pymarc.Leader(record.leader[:5] + "c" + record.leader[6:])
     if record.leader[6] not in VALID_TYPES:
@@ -318,11 +440,40 @@ def fix_leader(record: pymarc.Record) -> pymarc.Record:
             "DATA ISSUE\t%s\t%s\t%s",
             record["001"].value(),
             f"Invalid record type: {record.leader[6]}, setting to 'a'",
-            record,
+            record.leader,
        )
         record.leader = pymarc.Leader(record.leader[:6] + "a" + record.leader[7:])
     return record
 
+def move_authority_subfield_9_to_0_all_controllable_fields(record: Record, **kwargs) -> Record:
+    """
+    Move subfield 9 from authority fields to subfield 0. This is useful when
+    importing records from the ABES SUDOC catalog.
+
+    Args:
+        record (Record): The MARC record to preprocess.
+
+    Returns:
+        Record: The preprocessed MARC record.
+    """
+    controlled_fields = [
+        "100", "110", "111", "130",
+        "600", "610", "611", "630", "650", "651", "655",
+        "700", "710", "711", "730",
+        "800", "810", "811", "830"
+    ]
+    for field in record.get_fields(*controlled_fields):
+        for subfield in list(field.get_subfields("9")):
+            field.add_subfield("0", subfield)
+            field.delete_subfield("9", subfield)
+            logger.log(
+                26,
+                "DATA ISSUE\t%s\t%s\t%s",
+                record["001"].value(),
+                f"Subfield 9 moved to subfield 0 in {field.tag}",
+                field,
+            )
+    return record
 
 def ordinal(n):
     s = ("th", "st", "nd", "rd") + ("th",) * 10
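
To see the 001-prefix and 999-cleanup preprocessors in isolation, a small hedged example (identifiers and field contents are invented):

    import pymarc

    from folio_data_import.marc_preprocessors._preprocessors import (
        prepend_ppn_prefix_001,
        strip_999_ff_fields,
    )

    record = pymarc.Record()
    record.add_ordered_field(pymarc.Field(tag="001", data="123456789"))
    record.add_ordered_field(
        pymarc.Field(
            tag="999",
            indicators=pymarc.Indicators("f", "f"),
            subfields=[pymarc.Subfield(code="i", value="11111111-2222-3333-4444-555555555555")],
        )
    )

    record = prepend_ppn_prefix_001(record)   # 001 becomes "(PPN)123456789"
    record = strip_999_ff_fields(record)      # the 999 ff field is removed
    print(record["001"].data, record.get_fields("999"))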
{folio_data_import-0.2.8rc11.dist-info → folio_data_import-0.3.0.dist-info}/METADATA RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: folio_data_import
-Version: 0.
+Version: 0.3.0
 Summary: A python module to interact with the data importing capabilities of the open-source FOLIO ILS
 License: MIT
 Author: Brooks Travis
@@ -19,8 +19,7 @@ Requires-Dist: flake8-black (>=0.3.6,<0.4.0)
 Requires-Dist: flake8-bugbear (>=24.8.19,<25.0.0)
 Requires-Dist: flake8-docstrings (>=1.7.0,<2.0.0)
 Requires-Dist: flake8-isort (>=6.1.1,<7.0.0)
-Requires-Dist: folioclient (>=0.
-Requires-Dist: httpx (>=0.27.2,<0.28.0)
+Requires-Dist: folioclient (>=0.70.1,<0.71.0)
 Requires-Dist: inquirer (>=3.4.0,<4.0.0)
 Requires-Dist: pyhumps (>=3.8.0,<4.0.0)
 Requires-Dist: pymarc (>=5.2.2,<6.0.0)
folio_data_import-0.3.0.dist-info/RECORD ADDED

@@ -0,0 +1,12 @@
+folio_data_import/MARCDataImport.py,sha256=je3TdCdaDR-gYA3Gh1k4AX9l3v83sCTt4Y9lOFxayu8,36220
+folio_data_import/UserImport.py,sha256=ZulGaGJhI_N5vmR69YF_qbzbGeVyzcthXklSjDpZCyA,40998
+folio_data_import/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+folio_data_import/__main__.py,sha256=kav_uUsnrIjGjVxQkk3exLKrc1mah9t2x3G6bGS-5I0,3710
+folio_data_import/custom_exceptions.py,sha256=xOeIbM86d2r5-z3ul4JFTJLT3vI3kwmEq62cWS-9dOc,646
+folio_data_import/marc_preprocessors/__init__.py,sha256=urExfNTQoZsDCtDPcUY9EEC5OFcUihxhYEQkQFVzbMY,30
+folio_data_import/marc_preprocessors/_preprocessors.py,sha256=4Zrp-9LdL7f5QqUTOjyMkK5IaHP2YOkmkqoY_4o585Q,16377
+folio_data_import-0.3.0.dist-info/LICENSE,sha256=qJX7wxMC7ky9Kq4v3zij8MjGEiC5wsB7pYeOhLj5TDk,1083
+folio_data_import-0.3.0.dist-info/METADATA,sha256=Aqf0PXhdwFyChMKvl9cOluKN60IyMAUPDKSpb8AOlXI,6069
+folio_data_import-0.3.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+folio_data_import-0.3.0.dist-info/entry_points.txt,sha256=498SxWVXeEMRNw3PUf-eoReZvKewmYwPBtZhIUPr_Jg,192
+folio_data_import-0.3.0.dist-info/RECORD,,

folio_data_import-0.2.8rc11.dist-info/RECORD DELETED

@@ -1,11 +0,0 @@
-folio_data_import/MARCDataImport.py,sha256=DjNIfnKSQ7d2IWP0x_R8NRDeDBHoAmalNMmsimeHf94,33164
-folio_data_import/UserImport.py,sha256=Y9ZjYoUP_vNJVftx_xUcbBqvC5CwWeuzlmCcSVQfzgo,40976
-folio_data_import/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-folio_data_import/__main__.py,sha256=kav_uUsnrIjGjVxQkk3exLKrc1mah9t2x3G6bGS-5I0,3710
-folio_data_import/marc_preprocessors/__init__.py,sha256=urExfNTQoZsDCtDPcUY9EEC5OFcUihxhYEQkQFVzbMY,30
-folio_data_import/marc_preprocessors/_preprocessors.py,sha256=4i1_lEnptzZDx3DojX9sfvJ_hmehwFJUC3aZsUADcwA,10851
-folio_data_import-0.2.8rc11.dist-info/LICENSE,sha256=qJX7wxMC7ky9Kq4v3zij8MjGEiC5wsB7pYeOhLj5TDk,1083
-folio_data_import-0.2.8rc11.dist-info/METADATA,sha256=xlq3E8A6c-dme1eF5GTNmskjrvqFBidPWL7Z7K1hsqs,6113
-folio_data_import-0.2.8rc11.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
-folio_data_import-0.2.8rc11.dist-info/entry_points.txt,sha256=498SxWVXeEMRNw3PUf-eoReZvKewmYwPBtZhIUPr_Jg,192
-folio_data_import-0.2.8rc11.dist-info/RECORD,,
{folio_data_import-0.2.8rc11.dist-info → folio_data_import-0.3.0.dist-info}/LICENSE RENAMED
File without changes

{folio_data_import-0.2.8rc11.dist-info → folio_data_import-0.3.0.dist-info}/entry_points.txt RENAMED
File without changes