folio-data-import 0.2.8rc11__tar.gz → 0.2.8rc12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {folio_data_import-0.2.8rc11 → folio_data_import-0.2.8rc12}/PKG-INFO +1 -1
- {folio_data_import-0.2.8rc11 → folio_data_import-0.2.8rc12}/pyproject.toml +1 -1
- {folio_data_import-0.2.8rc11 → folio_data_import-0.2.8rc12}/src/folio_data_import/MARCDataImport.py +141 -44
- {folio_data_import-0.2.8rc11 → folio_data_import-0.2.8rc12}/LICENSE +0 -0
- {folio_data_import-0.2.8rc11 → folio_data_import-0.2.8rc12}/README.md +0 -0
- {folio_data_import-0.2.8rc11 → folio_data_import-0.2.8rc12}/src/folio_data_import/UserImport.py +0 -0
- {folio_data_import-0.2.8rc11 → folio_data_import-0.2.8rc12}/src/folio_data_import/__init__.py +0 -0
- {folio_data_import-0.2.8rc11 → folio_data_import-0.2.8rc12}/src/folio_data_import/__main__.py +0 -0
- {folio_data_import-0.2.8rc11 → folio_data_import-0.2.8rc12}/src/folio_data_import/marc_preprocessors/__init__.py +0 -0
- {folio_data_import-0.2.8rc11 → folio_data_import-0.2.8rc12}/src/folio_data_import/marc_preprocessors/_preprocessors.py +0 -0
{folio_data_import-0.2.8rc11 → folio_data_import-0.2.8rc12}/pyproject.toml
RENAMED
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "folio_data_import"
-version = "0.2.8rc11"
+version = "0.2.8rc12"
 description = "A python module to interact with the data importing capabilities of the open-source FOLIO ILS"
 authors = ["Brooks Travis <brooks.travis@gmail.com>"]
 license = "MIT"
{folio_data_import-0.2.8rc11 → folio_data_import-0.2.8rc12}/src/folio_data_import/MARCDataImport.py
RENAMED
@@ -5,6 +5,7 @@ import glob
 import importlib
 import io
 import logging
+import math
 import os
 import sys
 import uuid
@@ -14,7 +15,7 @@ from functools import cached_property
 from getpass import getpass
 from pathlib import Path
 from time import sleep
-from typing import List
+from typing import List, Union
 
 import folioclient
 import httpx
@@ -78,6 +79,11 @@ class MARCImportJob:
     last_current: int = 0
     total_records_sent: int = 0
     finished: bool = False
+    job_id: str = ""
+    job_hrid: int = 0
+    current_file: Union[List[Path],List[io.BytesIO]] = []
+    _max_summary_retries: int = 2
+    _summary_retries: int = 0
 
     def __init__(
         self,
@@ -90,8 +96,14 @@ class MARCImportJob:
         consolidate=False,
         no_progress=False,
         let_summary_fail=False,
+        split_files=False,
+        split_size=1000,
     ) -> None:
         self.consolidate_files = consolidate
+        self.split_files = split_files
+        self.split_size = split_size
+        if self.split_files and self.consolidate_files:
+            raise ValueError("Cannot consolidate and split files at the same time.")
         self.no_progress = no_progress
         self.let_summary_fail = let_summary_fail
         self.folio_client: folioclient.FolioClient = folio_client
@@ -101,10 +113,6 @@ class MARCImportJob:
         self.batch_delay = batch_delay
         self.current_retry_timeout = None
         self.marc_record_preprocessor = marc_record_preprocessor
-        self.pbar_sent: tqdm
-        self.pbar_imported: tqdm
-        self._max_summary_retries: int = 2
-        self._summary_retries: int = 0
 
     async def do_work(self) -> None:
         """
@@ -141,6 +149,18 @@ class MARCImportJob:
         if self.consolidate_files:
             self.current_file = self.import_files
             await self.import_marc_file()
+        elif self.split_files:
+            for file in self.import_files:
+                with open(file, "rb") as f:
+                    file_length = await self.read_total_records([f])
+                expected_batches = math.ceil(file_length /self.split_size)
+                logger.info(f"{file.name} contains {file_length} records. Splitting into {expected_batches} {self.split_size} record batches.")
+                zero_pad_parts = len(str(expected_batches)) if expected_batches > 1 else 2
+                for idx, batch in enumerate(self.split_marc_file(file, self.split_size), start=1):
+                    batch.name = f"{file.name}_part{idx:0{zero_pad_parts}}"
+                    self.current_file = [batch]
+                    await self.import_marc_file()
+                self.move_file_to_complete(file)
         else:
             for file in self.import_files:
                 self.current_file = [file]
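The new split_files branch of do_work() names each generated part with simple zero-padding arithmetic. Below is a minimal, illustrative sketch of just that naming logic; the record count and file name are made up and nothing in it comes from the package itself.

```python
# Illustrative only: reproduces the batch-count and part-name arithmetic used
# by the new split_files branch. The record count and file name are invented.
import math

file_length = 2500          # hypothetical number of records in one MARC file
split_size = 1000           # records per batch (matches the --split-size default)
file_name = "example.mrc"   # hypothetical file name

expected_batches = math.ceil(file_length / split_size)  # -> 3
# Pad part numbers to the width of the largest part number (width 2 when there is one batch)
zero_pad_parts = len(str(expected_batches)) if expected_batches > 1 else 2

for idx in range(1, expected_batches + 1):
    print(f"{file_name}_part{idx:0{zero_pad_parts}}")
# example.mrc_part1, example.mrc_part2, example.mrc_part3
```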
@@ -275,7 +295,7 @@ class MARCImportJob:
             )
             raise e
         self.job_id = create_job.json()["parentJobExecutionId"]
-        logger.info("Created job:
+        logger.info(f"Created job: {self.job_id}")
 
     @cached_property
     def import_profile(self) -> dict:
@@ -318,6 +338,8 @@ class MARCImportJob:
         )
         try:
             set_job_profile.raise_for_status()
+            self.job_hrid = set_job_profile.json()['hrId']
+            logger.info(f"Job HRID: {self.job_hrid}")
         except httpx.HTTPError as e:
             logger.error(
                 "Error creating job: "
@@ -446,12 +468,16 @@ class MARCImportJob:
                 == (total_records - self.error_records),
             ),
         )
-
-
-
-
-
-
+        if not self.split_files:
+            self.move_file_to_complete(file_path)
+
+    def move_file_to_complete(self, file_path):
+        import_complete_path = file_path.parent.joinpath("import_complete")
+        if not import_complete_path.exists():
+            logger.debug(f"Creating import_complete directory: {import_complete_path.absolute()}")
+            import_complete_path.mkdir(exist_ok=True)
+        logger.debug(f"Moving {file_path} to {import_complete_path.absolute()}")
+        file_path.rename(
             file_path.parent.joinpath("import_complete", file_path.name)
         )
 
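The new move_file_to_complete() helper is plain pathlib. A standalone sketch of the same pattern follows; the file path is a placeholder.

```python
# Sketch of the pathlib pattern used by move_file_to_complete(): ensure an
# "import_complete" directory exists next to the source file, then rename the
# finished file into it. "marc_files/example.mrc" is a hypothetical path.
from pathlib import Path

file_path = Path("marc_files/example.mrc")
import_complete_path = file_path.parent.joinpath("import_complete")
import_complete_path.mkdir(exist_ok=True)  # no-op if the directory already exists
file_path.rename(import_complete_path.joinpath(file_path.name))
```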
@@ -530,6 +556,46 @@ class MARCImportJob:
             "initialRecords": [{"record": x.decode()} for x in self.record_batch],
         }
 
+    @staticmethod
+    def split_marc_file(file_path, batch_size):
+        """Generator to iterate over MARC records in batches, yielding BytesIO objects."""
+        with open(file_path, "rb") as f:
+            batch = io.BytesIO()
+            count = 0
+
+            while True:
+                leader = f.read(24)
+                if not leader:
+                    break  # End of file
+
+                try:
+                    record_length = int(leader[:5])  # Extract record length from leader
+                except ValueError:
+                    raise ValueError("Invalid MARC record length encountered.")
+
+                record_body = f.read(record_length - 24)
+                if len(record_body) != record_length - 24:
+                    raise ValueError("Unexpected end of file while reading MARC record.")
+
+                # Verify record terminator
+                if record_body[-1:] != b'\x1D':
+                    raise ValueError("MARC record does not end with the expected terminator (0x1D).")
+
+                # Write the full record to the batch buffer
+                batch.write(leader + record_body)
+                count += 1
+
+                if count >= batch_size:
+                    batch.seek(0)
+                    yield batch
+                    batch = io.BytesIO()  # Reset buffer
+                    count = 0
+
+            # Yield any remaining records
+            if count > 0:
+                batch.seek(0)
+                yield batch
+
     async def import_marc_file(self) -> None:
         """
         Imports MARC file into the system.
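split_marc_file() leans on the ISO 2709 binary MARC layout: the first 5 bytes of each 24-byte leader give the total record length, and every record ends with the 0x1D terminator, so a file can be carved into batches without parsing the records. A hedged usage sketch follows, assuming the package is installed; the file name is a placeholder, and pymarc (already referenced in the --preprocessor help text) is used here only to count the records in each yielded batch.

```python
# Not from the package: a sketch of consuming the new MARCImportJob.split_marc_file()
# static generator on its own. "records.mrc" is a placeholder path.
from pathlib import Path

import pymarc  # used here only to count records per batch

from folio_data_import.MARCDataImport import MARCImportJob

source = Path("records.mrc")
for idx, batch in enumerate(MARCImportJob.split_marc_file(source, batch_size=1000), start=1):
    records = list(pymarc.MARCReader(batch))  # each batch is a seekable BytesIO
    print(f"part {idx}: {len(records)} records")
```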
@@ -551,13 +617,24 @@ class MARCImportJob:
         await self.create_folio_import_job()
         await self.set_job_profile()
         with ExitStack() as stack:
-
-
-
+            try:
+                if isinstance(self.current_file[0], Path):
+                    files = [
+                        stack.enter_context(open(file, "rb")) for file in self.current_file
+                    ]
+                elif isinstance(self.current_file[0], io.BytesIO):
+                    files = [
+                        stack.enter_context(file) for file in self.current_file
+                    ]
+                else:
+                    raise ValueError("Invalid file type. Must be Path or BytesIO.")
+            except IndexError as e:
+                logger.error(f"Error opening file: {e}")
+                raise e
             total_records = await self.read_total_records(files)
             with (
                 tqdm(
-                    desc="Imported: ",
+                    desc=f"Imported ({self.job_hrid}): ",
                     total=total_records,
                     position=1,
                     disable=self.no_progress,
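The isinstance branching above exists because self.current_file now holds either Path objects (whole files) or BytesIO batches produced by split_marc_file(). A small self-contained illustration of why contextlib.ExitStack can manage both uniformly; the file names and data are placeholders.

```python
# Both open file handles and io.BytesIO objects are context managers, so
# stack.enter_context() can manage either; everything is closed when the
# ExitStack exits. The input data here is invented.
import io
from contextlib import ExitStack
from pathlib import Path

current_file = [io.BytesIO(b"in-memory MARC batch")]  # or e.g. [Path("example.mrc")]

with ExitStack() as stack:
    if isinstance(current_file[0], Path):
        files = [stack.enter_context(open(f, "rb")) for f in current_file]
    else:
        files = [stack.enter_context(f) for f in current_file]
    for fh in files:
        print(len(fh.read()), "bytes")
```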
@@ -576,41 +653,44 @@ class MARCImportJob:
                 await self.get_job_status()
                 sleep(1)
                 if self.finished:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    await self.log_job_summary()
+                    self.last_current = 0
+                    self.finished = False
+
+    async def log_job_summary(self):
+        if job_summary := await self.get_job_summary():
+            job_id = job_summary.pop("jobExecutionId", None)
+            total_errors = job_summary.pop("totalErrors", 0)
+            columns = ["Summary"] + list(job_summary.keys())
+            rows = set()
+            for key in columns[1:]:
+                rows.update(job_summary[key].keys())
+
+            table_data = []
+            for row in rows:
+                metric_name = decamelize(row).split("_")[1]
+                table_row = [metric_name]
+                for col in columns[1:]:
+                    table_row.append(job_summary[col].get(row, "N/A"))
+                table_data.append(table_row)
+            table_data.sort(key=lambda x: REPORT_SUMMARY_ORDERING.get(x[0], 99))
+            columns = columns[:1] + [
                 " ".join(decamelize(x).split("_")[:-1]) for x in columns[1:]
             ]
-
+            logger.info(
                 f"Results for {'file' if len(self.current_file) == 1 else 'files'}: "
                 f"{', '.join([os.path.basename(x.name) for x in self.current_file])}"
             )
-
+            logger.info(
                 "\n"
                 + tabulate.tabulate(
                     table_data, headers=columns, tablefmt="fancy_grid"
                 ),
             )
-
-
-
-
-            self.last_current = 0
-            self.finished = False
+            if total_errors:
+                logger.info(f"Total errors: {total_errors}. Job ID: {job_id}.")
+        else:
+            logger.error(f"No job summary available for job #{self.job_hrid}({self.job_id}).")
 
     async def get_job_summary(self) -> dict:
         """
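The new log_job_summary() pivots the job-summary payload into a table: one column per top-level summary section, one row per metric. The sketch below reproduces that pivot with clearly invented key names (the real field names come from FOLIO's job summary endpoint and are not shown in this diff); decamelize is assumed to come from the pyhumps package, and the REPORT_SUMMARY_ORDERING sort is omitted.

```python
# Illustrative data only; the nested keys below are placeholders, not the
# actual FOLIO job summary schema.
import tabulate
from humps import decamelize  # assumption: decamelize is provided by pyhumps

job_summary = {
    "sourceRecordSummary": {"totalCreatedEntities": 100, "totalDiscardedEntities": 0},
    "instanceSummary": {"totalCreatedEntities": 98, "totalDiscardedEntities": 2},
}

columns = ["Summary"] + list(job_summary.keys())
rows = set()
for key in columns[1:]:
    rows.update(job_summary[key].keys())

table_data = []
for row in sorted(rows):
    metric_name = decamelize(row).split("_")[1]  # "totalCreatedEntities" -> "created"
    table_data.append([metric_name] + [job_summary[col].get(row, "N/A") for col in columns[1:]])

# "sourceRecordSummary" -> "source record", "instanceSummary" -> "instance"
columns = columns[:1] + [" ".join(decamelize(x).split("_")[:-1]) for x in columns[1:]]
print(tabulate.tabulate(table_data, headers=columns, tablefmt="fancy_grid"))
```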
@@ -749,12 +829,15 @@ async def main() -> None:
         "--preprocessor",
         type=str,
         help=(
-            "
-            "to apply to each MARC record before sending to FOLIO."
+            "Comma-separated python import paths to Python function(s) "
+            "to apply to each MARC record before sending to FOLIO. Function should take "
+            "a pymarc.Record object as input and return a pymarc.Record object."
         ),
         default=None,
     )
-
+    # Add mutually exclusive group for consolidate and split-files options
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
         "--consolidate",
         action="store_true",
         help=(
@@ -762,6 +845,18 @@ async def main() -> None:
             "Default is to create a new job for each MARC file."
         ),
     )
+    group.add_argument(
+        "--split-files",
+        action="store_true",
+        help="Split files into smaller parts before importing.",
+    )
+    parser.add_argument(
+        "--split-size",
+        type=int,
+        help="The number of records to include in each split file.",
+        default=1000,
+    )
+
     parser.add_argument(
         "--no-progress",
         action="store_true",
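Because --consolidate and --split-files are registered on the same argparse mutually exclusive group, supplying both exits with a usage error before any import job is created. A minimal, self-contained sketch of that behaviour; only the flag names are taken from the diff, the parser below is a stand-in, not the module's real CLI.

```python
# Demonstrates the argparse mutual-exclusion behaviour added here.
import argparse

parser = argparse.ArgumentParser(prog="folio-marc-import-example")
group = parser.add_mutually_exclusive_group()
group.add_argument("--consolidate", action="store_true")
group.add_argument("--split-files", action="store_true")
parser.add_argument("--split-size", type=int, default=1000)

print(parser.parse_args(["--split-files", "--split-size", "500"]))
# Namespace(consolidate=False, split_files=True, split_size=500)

# parser.parse_args(["--consolidate", "--split-files"]) would exit with:
# "argument --split-files: not allowed with argument --consolidate"
```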
@@ -827,6 +922,8 @@ async def main() -> None:
             consolidate=bool(args.consolidate),
             no_progress=bool(args.no_progress),
             let_summary_fail=bool(args.let_summary_fail),
+            split_files=bool(args.split_files),
+            split_size=args.split_size,
         ).do_work()
     except Exception as e:
         logger.error("Error importing files: " + str(e))
{folio_data_import-0.2.8rc11 → folio_data_import-0.2.8rc12}/LICENSE
RENAMED
File without changes
{folio_data_import-0.2.8rc11 → folio_data_import-0.2.8rc12}/README.md
RENAMED
File without changes
{folio_data_import-0.2.8rc11 → folio_data_import-0.2.8rc12}/src/folio_data_import/UserImport.py
RENAMED
File without changes
{folio_data_import-0.2.8rc11 → folio_data_import-0.2.8rc12}/src/folio_data_import/__init__.py
RENAMED
File without changes
{folio_data_import-0.2.8rc11 → folio_data_import-0.2.8rc12}/src/folio_data_import/__main__.py
RENAMED
File without changes
{folio_data_import-0.2.8rc11 → folio_data_import-0.2.8rc12}/src/folio_data_import/marc_preprocessors/__init__.py
RENAMED
File without changes
{folio_data_import-0.2.8rc11 → folio_data_import-0.2.8rc12}/src/folio_data_import/marc_preprocessors/_preprocessors.py
RENAMED
File without changes