folio-data-import 0.2.8rc11-py3-none-any.whl → 0.2.8rc12-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of folio-data-import might be problematic.

@@ -5,6 +5,7 @@ import glob
  import importlib
  import io
  import logging
+ import math
  import os
  import sys
  import uuid
@@ -14,7 +15,7 @@ from functools import cached_property
  from getpass import getpass
  from pathlib import Path
  from time import sleep
- from typing import List
+ from typing import List, Union
 
  import folioclient
  import httpx
@@ -78,6 +79,11 @@ class MARCImportJob:
  last_current: int = 0
  total_records_sent: int = 0
  finished: bool = False
+ job_id: str = ""
+ job_hrid: int = 0
+ current_file: Union[List[Path],List[io.BytesIO]] = []
+ _max_summary_retries: int = 2
+ _summary_retries: int = 0
 
  def __init__(
  self,
@@ -90,8 +96,14 @@ class MARCImportJob:
  consolidate=False,
  no_progress=False,
  let_summary_fail=False,
+ split_files=False,
+ split_size=1000,
  ) -> None:
  self.consolidate_files = consolidate
+ self.split_files = split_files
+ self.split_size = split_size
+ if self.split_files and self.consolidate_files:
+ raise ValueError("Cannot consolidate and split files at the same time.")
  self.no_progress = no_progress
  self.let_summary_fail = let_summary_fail
  self.folio_client: folioclient.FolioClient = folio_client
@@ -101,10 +113,6 @@ class MARCImportJob:
  self.batch_delay = batch_delay
  self.current_retry_timeout = None
  self.marc_record_preprocessor = marc_record_preprocessor
- self.pbar_sent: tqdm
- self.pbar_imported: tqdm
- self._max_summary_retries: int = 2
- self._summary_retries: int = 0
 
  async def do_work(self) -> None:
  """
@@ -141,6 +149,18 @@ class MARCImportJob:
  if self.consolidate_files:
  self.current_file = self.import_files
  await self.import_marc_file()
+ elif self.split_files:
+ for file in self.import_files:
+ with open(file, "rb") as f:
+ file_length = await self.read_total_records([f])
+ expected_batches = math.ceil(file_length /self.split_size)
+ logger.info(f"{file.name} contains {file_length} records. Splitting into {expected_batches} {self.split_size} record batches.")
+ zero_pad_parts = len(str(expected_batches)) if expected_batches > 1 else 2
+ for idx, batch in enumerate(self.split_marc_file(file, self.split_size), start=1):
+ batch.name = f"{file.name}_part{idx:0{zero_pad_parts}}"
+ self.current_file = [batch]
+ await self.import_marc_file()
+ self.move_file_to_complete(file)
  else:
  for file in self.import_files:
  self.current_file = [file]
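
As an illustration of the batch arithmetic in the new split branch (not part of the diffed code; the file name and record count below are hypothetical), the default split_size of 1000 applied to a 2,534-record file works out as follows:

import math

file_length = 2534                # hypothetical record count
split_size = 1000                 # default --split-size
expected_batches = math.ceil(file_length / split_size)                        # 3
zero_pad_parts = len(str(expected_batches)) if expected_batches > 1 else 2    # 1
part_names = [
    f"records.mrc_part{idx:0{zero_pad_parts}}"    # "records.mrc" is a placeholder name
    for idx in range(1, expected_batches + 1)
]
print(part_names)   # ['records.mrc_part1', 'records.mrc_part2', 'records.mrc_part3']

Larger batch counts widen the padding automatically (e.g. twelve batches yield _part01 through _part12), while a file that fits in a single batch is padded to two digits by the else branch.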
@@ -275,7 +295,7 @@ class MARCImportJob:
  )
  raise e
  self.job_id = create_job.json()["parentJobExecutionId"]
- logger.info("Created job: " + self.job_id)
+ logger.info(f"Created job: {self.job_id}")
 
  @cached_property
  def import_profile(self) -> dict:
@@ -318,6 +338,8 @@ class MARCImportJob:
  )
  try:
  set_job_profile.raise_for_status()
+ self.job_hrid = set_job_profile.json()['hrId']
+ logger.info(f"Job HRID: {self.job_hrid}")
  except httpx.HTTPError as e:
  logger.error(
  "Error creating job: "
@@ -446,12 +468,16 @@ class MARCImportJob:
  == (total_records - self.error_records),
  ),
  )
- import_complete_path = file_path.parent.joinpath("import_complete")
- if not import_complete_path.exists():
- logger.debug(f"Creating import_complete directory: {import_complete_path.absolute()}")
- import_complete_path.mkdir(exist_ok=True)
- logger.debug(f"Moving {file_path} to {import_complete_path.absolute()}")
- file_path.rename(
+ if not self.split_files:
+ self.move_file_to_complete(file_path)
+
+ def move_file_to_complete(self, file_path):
+ import_complete_path = file_path.parent.joinpath("import_complete")
+ if not import_complete_path.exists():
+ logger.debug(f"Creating import_complete directory: {import_complete_path.absolute()}")
+ import_complete_path.mkdir(exist_ok=True)
+ logger.debug(f"Moving {file_path} to {import_complete_path.absolute()}")
+ file_path.rename(
  file_path.parent.joinpath("import_complete", file_path.name)
  )
 
@@ -530,6 +556,46 @@ class MARCImportJob:
  "initialRecords": [{"record": x.decode()} for x in self.record_batch],
  }
 
+ @staticmethod
+ def split_marc_file(file_path, batch_size):
+ """Generator to iterate over MARC records in batches, yielding BytesIO objects."""
+ with open(file_path, "rb") as f:
+ batch = io.BytesIO()
+ count = 0
+
+ while True:
+ leader = f.read(24)
+ if not leader:
+ break # End of file
+
+ try:
+ record_length = int(leader[:5]) # Extract record length from leader
+ except ValueError:
+ raise ValueError("Invalid MARC record length encountered.")
+
+ record_body = f.read(record_length - 24)
+ if len(record_body) != record_length - 24:
+ raise ValueError("Unexpected end of file while reading MARC record.")
+
+ # Verify record terminator
+ if record_body[-1:] != b'\x1D':
+ raise ValueError("MARC record does not end with the expected terminator (0x1D).")
+
+ # Write the full record to the batch buffer
+ batch.write(leader + record_body)
+ count += 1
+
+ if count >= batch_size:
+ batch.seek(0)
+ yield batch
+ batch = io.BytesIO() # Reset buffer
+ count = 0
+
+ # Yield any remaining records
+ if count > 0:
+ batch.seek(0)
+ yield batch
+
  async def import_marc_file(self) -> None:
  """
  Imports MARC file into the system.
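
For orientation, a minimal usage sketch of the new generator (illustrative only; it assumes the package and its dependencies are installed and that a binary MARC file named records.mrc exists locally — neither the file name nor the record counting below is part of the package):

from folio_data_import.MARCDataImport import MARCImportJob

terminator = b"\x1d"  # ISO 2709 record terminator that split_marc_file verifies
for idx, batch in enumerate(MARCImportJob.split_marc_file("records.mrc", 1000), start=1):
    data = batch.getvalue()
    # Each yielded batch is an io.BytesIO holding up to 1000 complete records.
    print(f"part {idx}: {data.count(terminator)} records, {len(data)} bytes")

Because the generator validates both the record length taken from the leader and the trailing 0x1D terminator, malformed input raises ValueError rather than yielding truncated batches.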
@@ -551,13 +617,24 @@ class MARCImportJob:
  await self.create_folio_import_job()
  await self.set_job_profile()
  with ExitStack() as stack:
- files = [
- stack.enter_context(open(file, "rb")) for file in self.current_file
- ]
+ try:
+ if isinstance(self.current_file[0], Path):
+ files = [
+ stack.enter_context(open(file, "rb")) for file in self.current_file
+ ]
+ elif isinstance(self.current_file[0], io.BytesIO):
+ files = [
+ stack.enter_context(file) for file in self.current_file
+ ]
+ else:
+ raise ValueError("Invalid file type. Must be Path or BytesIO.")
+ except IndexError as e:
+ logger.error(f"Error opening file: {e}")
+ raise e
  total_records = await self.read_total_records(files)
  with (
  tqdm(
- desc="Imported: ",
+ desc=f"Imported ({self.job_hrid}): ",
  total=total_records,
  position=1,
  disable=self.no_progress,
@@ -576,41 +653,44 @@ class MARCImportJob:
  await self.get_job_status()
  sleep(1)
  if self.finished:
- if job_summary := await self.get_job_summary():
- job_id = job_summary.pop("jobExecutionId", None)
- total_errors = job_summary.pop("totalErrors", 0)
- columns = ["Summary"] + list(job_summary.keys())
- rows = set()
- for key in columns[1:]:
- rows.update(job_summary[key].keys())
-
- table_data = []
- for row in rows:
- metric_name = decamelize(row).split("_")[1]
- table_row = [metric_name]
- for col in columns[1:]:
- table_row.append(job_summary[col].get(row, "N/A"))
- table_data.append(table_row)
- table_data.sort(key=lambda x: REPORT_SUMMARY_ORDERING.get(x[0], 99))
- columns = columns[:1] + [
+ await self.log_job_summary()
+ self.last_current = 0
+ self.finished = False
+
+ async def log_job_summary(self):
+ if job_summary := await self.get_job_summary():
+ job_id = job_summary.pop("jobExecutionId", None)
+ total_errors = job_summary.pop("totalErrors", 0)
+ columns = ["Summary"] + list(job_summary.keys())
+ rows = set()
+ for key in columns[1:]:
+ rows.update(job_summary[key].keys())
+
+ table_data = []
+ for row in rows:
+ metric_name = decamelize(row).split("_")[1]
+ table_row = [metric_name]
+ for col in columns[1:]:
+ table_row.append(job_summary[col].get(row, "N/A"))
+ table_data.append(table_row)
+ table_data.sort(key=lambda x: REPORT_SUMMARY_ORDERING.get(x[0], 99))
+ columns = columns[:1] + [
  " ".join(decamelize(x).split("_")[:-1]) for x in columns[1:]
  ]
- logger.info(
+ logger.info(
  f"Results for {'file' if len(self.current_file) == 1 else 'files'}: "
  f"{', '.join([os.path.basename(x.name) for x in self.current_file])}"
  )
- logger.info(
+ logger.info(
  "\n"
  + tabulate.tabulate(
  table_data, headers=columns, tablefmt="fancy_grid"
  ),
  )
- if total_errors:
- logger.info(f"Total errors: {total_errors}. Job ID: {job_id}.")
- else:
- logger.error(f"No job summary available for job {self.job_id}.")
- self.last_current = 0
- self.finished = False
+ if total_errors:
+ logger.info(f"Total errors: {total_errors}. Job ID: {job_id}.")
+ else:
+ logger.error(f"No job summary available for job #{self.job_hrid}({self.job_id}).")
 
  async def get_job_summary(self) -> dict:
  """
@@ -749,12 +829,15 @@ async def main() -> None:
  "--preprocessor",
  type=str,
  help=(
- "The path to a Python module containing a preprocessing function "
- "to apply to each MARC record before sending to FOLIO."
+ "Comma-separated python import paths to Python function(s) "
+ "to apply to each MARC record before sending to FOLIO. Function should take "
+ "a pymarc.Record object as input and return a pymarc.Record object."
  ),
  default=None,
  )
- parser.add_argument(
+ # Add mutually exclusive group for consolidate and split-files options
+ group = parser.add_mutually_exclusive_group()
+ group.add_argument(
  "--consolidate",
  action="store_true",
  help=(
@@ -762,6 +845,18 @@ async def main() -> None:
  "Default is to create a new job for each MARC file."
  ),
  )
+ group.add_argument(
+ "--split-files",
+ action="store_true",
+ help="Split files into smaller parts before importing.",
+ )
+ parser.add_argument(
+ "--split-size",
+ type=int,
+ help="The number of records to include in each split file.",
+ default=1000,
+ )
+
  parser.add_argument(
  "--no-progress",
  action="store_true",
@@ -827,6 +922,8 @@ async def main() -> None:
  consolidate=bool(args.consolidate),
  no_progress=bool(args.no_progress),
  let_summary_fail=bool(args.let_summary_fail),
+ split_files=bool(args.split_files),
+ split_size=args.split_size,
  ).do_work()
  except Exception as e:
  logger.error("Error importing files: " + str(e))
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: folio_data_import
- Version: 0.2.8rc11
+ Version: 0.2.8rc12
  Summary: A python module to interact with the data importing capabilities of the open-source FOLIO ILS
  License: MIT
  Author: Brooks Travis
@@ -1,11 +1,11 @@
- folio_data_import/MARCDataImport.py,sha256=DjNIfnKSQ7d2IWP0x_R8NRDeDBHoAmalNMmsimeHf94,33164
+ folio_data_import/MARCDataImport.py,sha256=Rs8TuIrC--yXyr4oBQlV1b-pcyNq_6M_lB_SiKmnFb4,37135
  folio_data_import/UserImport.py,sha256=Y9ZjYoUP_vNJVftx_xUcbBqvC5CwWeuzlmCcSVQfzgo,40976
  folio_data_import/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  folio_data_import/__main__.py,sha256=kav_uUsnrIjGjVxQkk3exLKrc1mah9t2x3G6bGS-5I0,3710
  folio_data_import/marc_preprocessors/__init__.py,sha256=urExfNTQoZsDCtDPcUY9EEC5OFcUihxhYEQkQFVzbMY,30
  folio_data_import/marc_preprocessors/_preprocessors.py,sha256=4i1_lEnptzZDx3DojX9sfvJ_hmehwFJUC3aZsUADcwA,10851
- folio_data_import-0.2.8rc11.dist-info/LICENSE,sha256=qJX7wxMC7ky9Kq4v3zij8MjGEiC5wsB7pYeOhLj5TDk,1083
- folio_data_import-0.2.8rc11.dist-info/METADATA,sha256=xlq3E8A6c-dme1eF5GTNmskjrvqFBidPWL7Z7K1hsqs,6113
- folio_data_import-0.2.8rc11.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
- folio_data_import-0.2.8rc11.dist-info/entry_points.txt,sha256=498SxWVXeEMRNw3PUf-eoReZvKewmYwPBtZhIUPr_Jg,192
- folio_data_import-0.2.8rc11.dist-info/RECORD,,
+ folio_data_import-0.2.8rc12.dist-info/LICENSE,sha256=qJX7wxMC7ky9Kq4v3zij8MjGEiC5wsB7pYeOhLj5TDk,1083
+ folio_data_import-0.2.8rc12.dist-info/METADATA,sha256=LQbv4kjrCutjgvLXTHHNb6UfpKxvlNTCvVb7FbBOhk4,6113
+ folio_data_import-0.2.8rc12.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
+ folio_data_import-0.2.8rc12.dist-info/entry_points.txt,sha256=498SxWVXeEMRNw3PUf-eoReZvKewmYwPBtZhIUPr_Jg,192
+ folio_data_import-0.2.8rc12.dist-info/RECORD,,