folio-data-import 0.2.8rc10.tar.gz → 0.2.8rc12.tar.gz
This diff represents the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of folio-data-import might be problematic.
- {folio_data_import-0.2.8rc10 → folio_data_import-0.2.8rc12}/PKG-INFO +1 -1
- {folio_data_import-0.2.8rc10 → folio_data_import-0.2.8rc12}/pyproject.toml +1 -1
- {folio_data_import-0.2.8rc10 → folio_data_import-0.2.8rc12}/src/folio_data_import/MARCDataImport.py +144 -46
- {folio_data_import-0.2.8rc10 → folio_data_import-0.2.8rc12}/LICENSE +0 -0
- {folio_data_import-0.2.8rc10 → folio_data_import-0.2.8rc12}/README.md +0 -0
- {folio_data_import-0.2.8rc10 → folio_data_import-0.2.8rc12}/src/folio_data_import/UserImport.py +0 -0
- {folio_data_import-0.2.8rc10 → folio_data_import-0.2.8rc12}/src/folio_data_import/__init__.py +0 -0
- {folio_data_import-0.2.8rc10 → folio_data_import-0.2.8rc12}/src/folio_data_import/__main__.py +0 -0
- {folio_data_import-0.2.8rc10 → folio_data_import-0.2.8rc12}/src/folio_data_import/marc_preprocessors/__init__.py +0 -0
- {folio_data_import-0.2.8rc10 → folio_data_import-0.2.8rc12}/src/folio_data_import/marc_preprocessors/_preprocessors.py +0 -0
{folio_data_import-0.2.8rc10 → folio_data_import-0.2.8rc12}/pyproject.toml
RENAMED
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "folio_data_import"
-version = "0.2.8rc10"
+version = "0.2.8rc12"
 description = "A python module to interact with the data importing capabilities of the open-source FOLIO ILS"
 authors = ["Brooks Travis <brooks.travis@gmail.com>"]
 license = "MIT"
{folio_data_import-0.2.8rc10 → folio_data_import-0.2.8rc12}/src/folio_data_import/MARCDataImport.py
RENAMED
@@ -5,6 +5,7 @@ import glob
 import importlib
 import io
 import logging
+import math
 import os
 import sys
 import uuid
@@ -14,7 +15,7 @@ from functools import cached_property
 from getpass import getpass
 from pathlib import Path
 from time import sleep
-from typing import List
+from typing import List, Union

 import folioclient
 import httpx
@@ -78,6 +79,11 @@ class MARCImportJob:
     last_current: int = 0
     total_records_sent: int = 0
     finished: bool = False
+    job_id: str = ""
+    job_hrid: int = 0
+    current_file: Union[List[Path], List[io.BytesIO]] = []
+    _max_summary_retries: int = 2
+    _summary_retries: int = 0

     def __init__(
         self,
@@ -90,8 +96,14 @@ class MARCImportJob:
         consolidate=False,
         no_progress=False,
         let_summary_fail=False,
+        split_files=False,
+        split_size=1000,
     ) -> None:
         self.consolidate_files = consolidate
+        self.split_files = split_files
+        self.split_size = split_size
+        if self.split_files and self.consolidate_files:
+            raise ValueError("Cannot consolidate and split files at the same time.")
         self.no_progress = no_progress
         self.let_summary_fail = let_summary_fail
         self.folio_client: folioclient.FolioClient = folio_client
@@ -101,10 +113,6 @@ class MARCImportJob:
         self.batch_delay = batch_delay
         self.current_retry_timeout = None
         self.marc_record_preprocessor = marc_record_preprocessor
-        self.pbar_sent: tqdm
-        self.pbar_imported: tqdm
-        self._max_summary_retries: int = 2
-        self._summary_retries: int = 0

     async def do_work(self) -> None:
         """
@@ -141,6 +149,18 @@ class MARCImportJob:
         if self.consolidate_files:
             self.current_file = self.import_files
             await self.import_marc_file()
+        elif self.split_files:
+            for file in self.import_files:
+                with open(file, "rb") as f:
+                    file_length = await self.read_total_records([f])
+                expected_batches = math.ceil(file_length / self.split_size)
+                logger.info(f"{file.name} contains {file_length} records. Splitting into {expected_batches} {self.split_size} record batches.")
+                zero_pad_parts = len(str(expected_batches)) if expected_batches > 1 else 2
+                for idx, batch in enumerate(self.split_marc_file(file, self.split_size), start=1):
+                    batch.name = f"{file.name}_part{idx:0{zero_pad_parts}}"
+                    self.current_file = [batch]
+                    await self.import_marc_file()
+                self.move_file_to_complete(file)
         else:
             for file in self.import_files:
                 self.current_file = [file]
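As a worked illustration of the batching arithmetic in the new elif branch (the record count and file name below are hypothetical):

import math

# Hypothetical input: a 2,500-record MARC file imported with split_size=1000.
file_length, split_size = 2500, 1000
expected_batches = math.ceil(file_length / split_size)  # 3
zero_pad_parts = len(str(expected_batches)) if expected_batches > 1 else 2
for idx in range(1, expected_batches + 1):
    # Prints bibs.mrc_part1, bibs.mrc_part2, bibs.mrc_part3; a file that fits
    # in a single batch would instead be named with two-digit padding (_part01).
    print(f"bibs.mrc_part{idx:0{zero_pad_parts}}")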
@@ -275,7 +295,7 @@ class MARCImportJob:
             )
             raise e
         self.job_id = create_job.json()["parentJobExecutionId"]
-        logger.info("Created job:
+        logger.info(f"Created job: {self.job_id}")

     @cached_property
     def import_profile(self) -> dict:
@@ -318,6 +338,8 @@ class MARCImportJob:
         )
         try:
             set_job_profile.raise_for_status()
+            self.job_hrid = set_job_profile.json()['hrId']
+            logger.info(f"Job HRID: {self.job_hrid}")
         except httpx.HTTPError as e:
             logger.error(
                 "Error creating job: "
@@ -327,7 +349,8 @@ class MARCImportJob:
             )
             raise e

-    async def read_total_records(self, files) -> int:
+    @staticmethod
+    async def read_total_records(files) -> int:
         """
         Reads the total number of records from the given files.

@@ -388,6 +411,7 @@ class MARCImportJob:
                 self.error_records += len(self.record_batch)
                 self.pbar_sent.total = self.pbar_sent.total - len(self.record_batch)
             self.record_batch = []
+            await self.get_job_status()
             sleep(self.batch_delay)

     async def process_records(self, files, total_records) -> None:
@@ -419,7 +443,6 @@ class MARCImportJob:
                         == (total_records - self.error_records),
                     ),
                 )
-                await self.get_job_status()
                 sleep(0.25)
             if record:
                 if self.marc_record_preprocessor:
@@ -445,12 +468,16 @@ class MARCImportJob:
                         == (total_records - self.error_records),
                     ),
                 )
-
-
-
-
-
-
+        if not self.split_files:
+            self.move_file_to_complete(file_path)
+
+    def move_file_to_complete(self, file_path):
+        import_complete_path = file_path.parent.joinpath("import_complete")
+        if not import_complete_path.exists():
+            logger.debug(f"Creating import_complete directory: {import_complete_path.absolute()}")
+            import_complete_path.mkdir(exist_ok=True)
+        logger.debug(f"Moving {file_path} to {import_complete_path.absolute()}")
+        file_path.rename(
             file_path.parent.joinpath("import_complete", file_path.name)
         )

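In effect, the new move_file_to_complete helper relocates a finished file into an import_complete directory next to it. A standalone sketch of the same pathlib calls, using a hypothetical temporary directory:

import tempfile
from pathlib import Path

# Hypothetical layout: a finished MARC file sitting in a data directory.
data_dir = Path(tempfile.mkdtemp())
file_path = data_dir / "bibs.mrc"
file_path.write_bytes(b"")

import_complete_path = file_path.parent.joinpath("import_complete")
import_complete_path.mkdir(exist_ok=True)
file_path.rename(import_complete_path / file_path.name)

print((data_dir / "import_complete" / "bibs.mrc").exists())  # True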
@@ -529,6 +556,46 @@ class MARCImportJob:
             "initialRecords": [{"record": x.decode()} for x in self.record_batch],
         }

+    @staticmethod
+    def split_marc_file(file_path, batch_size):
+        """Generator to iterate over MARC records in batches, yielding BytesIO objects."""
+        with open(file_path, "rb") as f:
+            batch = io.BytesIO()
+            count = 0
+
+            while True:
+                leader = f.read(24)
+                if not leader:
+                    break  # End of file
+
+                try:
+                    record_length = int(leader[:5])  # Extract record length from leader
+                except ValueError:
+                    raise ValueError("Invalid MARC record length encountered.")
+
+                record_body = f.read(record_length - 24)
+                if len(record_body) != record_length - 24:
+                    raise ValueError("Unexpected end of file while reading MARC record.")
+
+                # Verify record terminator
+                if record_body[-1:] != b'\x1D':
+                    raise ValueError("MARC record does not end with the expected terminator (0x1D).")
+
+                # Write the full record to the batch buffer
+                batch.write(leader + record_body)
+                count += 1
+
+                if count >= batch_size:
+                    batch.seek(0)
+                    yield batch
+                    batch = io.BytesIO()  # Reset buffer
+                    count = 0
+
+            # Yield any remaining records
+            if count > 0:
+                batch.seek(0)
+                yield batch
+
     async def import_marc_file(self) -> None:
         """
         Imports MARC file into the system.
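The splitter relies on the MARC21/ISO 2709 binary layout: the first five bytes of the 24-byte leader encode the total record length as ASCII digits, and every record ends with the 0x1D record terminator, so batches always break on record boundaries. A minimal sketch of consuming the generator (the input file path is hypothetical):

from pathlib import Path

from folio_data_import.MARCDataImport import MARCImportJob

# Hypothetical binary MARC21 export file.
marc_path = Path("exports/bibs.mrc")

for idx, batch in enumerate(MARCImportJob.split_marc_file(marc_path, 500), start=1):
    data = batch.getvalue()
    # Each batch is an io.BytesIO positioned at 0 and ending on a record
    # boundary, so its last byte is the MARC record terminator (0x1D).
    assert data.endswith(b"\x1d")
    print(f"batch {idx}: {len(data)} bytes")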
@@ -550,13 +617,24 @@ class MARCImportJob:
         await self.create_folio_import_job()
         await self.set_job_profile()
         with ExitStack() as stack:
-
-
-
+            try:
+                if isinstance(self.current_file[0], Path):
+                    files = [
+                        stack.enter_context(open(file, "rb")) for file in self.current_file
+                    ]
+                elif isinstance(self.current_file[0], io.BytesIO):
+                    files = [
+                        stack.enter_context(file) for file in self.current_file
+                    ]
+                else:
+                    raise ValueError("Invalid file type. Must be Path or BytesIO.")
+            except IndexError as e:
+                logger.error(f"Error opening file: {e}")
+                raise e
             total_records = await self.read_total_records(files)
             with (
                 tqdm(
-                    desc="Imported: ",
+                    desc=f"Imported ({self.job_hrid}): ",
                     total=total_records,
                     position=1,
                     disable=self.no_progress,
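The ExitStack lets one cleanup path manage either on-disk files or the in-memory BytesIO batches produced by --split-files. A minimal sketch of that pattern in isolation (the contents below are hypothetical):

import io
from contextlib import ExitStack
from pathlib import Path

current_file = [io.BytesIO(b"fake-marc-bytes")]  # or, e.g., [Path("records.mrc")]

with ExitStack() as stack:
    if isinstance(current_file[0], Path):
        files = [stack.enter_context(open(f, "rb")) for f in current_file]
    else:
        files = [stack.enter_context(f) for f in current_file]
    print([f.read(4) for f in files])
# All handles (file objects and BytesIO buffers) are closed when the stack exits.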
@@ -575,41 +653,44 @@ class MARCImportJob:
                 await self.get_job_status()
                 sleep(1)
                 if self.finished:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    await self.log_job_summary()
+                    self.last_current = 0
+                    self.finished = False
+
+    async def log_job_summary(self):
+        if job_summary := await self.get_job_summary():
+            job_id = job_summary.pop("jobExecutionId", None)
+            total_errors = job_summary.pop("totalErrors", 0)
+            columns = ["Summary"] + list(job_summary.keys())
+            rows = set()
+            for key in columns[1:]:
+                rows.update(job_summary[key].keys())
+
+            table_data = []
+            for row in rows:
+                metric_name = decamelize(row).split("_")[1]
+                table_row = [metric_name]
+                for col in columns[1:]:
+                    table_row.append(job_summary[col].get(row, "N/A"))
+                table_data.append(table_row)
+            table_data.sort(key=lambda x: REPORT_SUMMARY_ORDERING.get(x[0], 99))
+            columns = columns[:1] + [
                 " ".join(decamelize(x).split("_")[:-1]) for x in columns[1:]
             ]
-
+            logger.info(
                 f"Results for {'file' if len(self.current_file) == 1 else 'files'}: "
                 f"{', '.join([os.path.basename(x.name) for x in self.current_file])}"
             )
-
+            logger.info(
                 "\n"
                 + tabulate.tabulate(
                     table_data, headers=columns, tablefmt="fancy_grid"
                 ),
             )
-
-
-
-
-                    self.last_current = 0
-                    self.finished = False
+            if total_errors:
+                logger.info(f"Total errors: {total_errors}. Job ID: {job_id}.")
+        else:
+            logger.error(f"No job summary available for job #{self.job_hrid}({self.job_id}).")

     async def get_job_summary(self) -> dict:
         """
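The table headers and row labels come from the camelCase keys of the FOLIO job summary, reshaped with decamelize. A hedged sketch of just that string manipulation, assuming decamelize is pyhumps' decamelize and using hypothetical summary keys:

from humps import decamelize  # assumption: the module uses pyhumps' decamelize

# Hypothetical keys of the kind a job summary might contain.
column_key = "sourceRecordSummary"
row_key = "totalCreatedEntities"

column_label = " ".join(decamelize(column_key).split("_")[:-1])  # "source record"
metric_name = decamelize(row_key).split("_")[1]                  # "created"
print(column_label, metric_name)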
@@ -748,12 +829,15 @@ async def main() -> None:
         "--preprocessor",
         type=str,
         help=(
-            "
-            "to apply to each MARC record before sending to FOLIO."
+            "Comma-separated python import paths to Python function(s) "
+            "to apply to each MARC record before sending to FOLIO. Function should take "
+            "a pymarc.Record object as input and return a pymarc.Record object."
         ),
         default=None,
     )
-    parser.add_argument(
+    # Add mutually exclusive group for consolidate and split-files options
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
         "--consolidate",
         action="store_true",
         help=(
@@ -761,6 +845,18 @@ async def main() -> None:
             "Default is to create a new job for each MARC file."
         ),
     )
+    group.add_argument(
+        "--split-files",
+        action="store_true",
+        help="Split files into smaller parts before importing.",
+    )
+    parser.add_argument(
+        "--split-size",
+        type=int,
+        help="The number of records to include in each split file.",
+        default=1000,
+    )
+
     parser.add_argument(
         "--no-progress",
         action="store_true",
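Because --consolidate and --split-files are registered in the same mutually exclusive group, argparse rejects invocations that pass both. A stand-in parser showing that behavior (this is not the module's full argument parser):

import argparse

parser = argparse.ArgumentParser(prog="folio-data-import")
group = parser.add_mutually_exclusive_group()
group.add_argument("--consolidate", action="store_true")
group.add_argument("--split-files", action="store_true")
parser.add_argument("--split-size", type=int, default=1000)

print(parser.parse_args(["--split-files", "--split-size", "500"]))  # accepted
parser.parse_args(["--consolidate", "--split-files"])  # argparse error: not allowed together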
@@ -826,6 +922,8 @@ async def main() -> None:
             consolidate=bool(args.consolidate),
             no_progress=bool(args.no_progress),
             let_summary_fail=bool(args.let_summary_fail),
+            split_files=bool(args.split_files),
+            split_size=args.split_size,
         ).do_work()
     except Exception as e:
         logger.error("Error importing files: " + str(e))
{folio_data_import-0.2.8rc10 → folio_data_import-0.2.8rc12}/LICENSE
RENAMED
File without changes
{folio_data_import-0.2.8rc10 → folio_data_import-0.2.8rc12}/README.md
RENAMED
File without changes
{folio_data_import-0.2.8rc10 → folio_data_import-0.2.8rc12}/src/folio_data_import/UserImport.py
RENAMED
File without changes
{folio_data_import-0.2.8rc10 → folio_data_import-0.2.8rc12}/src/folio_data_import/__init__.py
RENAMED
File without changes
{folio_data_import-0.2.8rc10 → folio_data_import-0.2.8rc12}/src/folio_data_import/__main__.py
RENAMED
File without changes
{folio_data_import-0.2.8rc10 → folio_data_import-0.2.8rc12}/src/folio_data_import/marc_preprocessors/__init__.py
RENAMED
File without changes
{folio_data_import-0.2.8rc10 → folio_data_import-0.2.8rc12}/src/folio_data_import/marc_preprocessors/_preprocessors.py
RENAMED
File without changes