folio-data-import 0.2.8rc11__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of folio-data-import might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: folio_data_import
3
- Version: 0.2.8rc11
3
+ Version: 0.3.0
4
4
  Summary: A python module to interact with the data importing capabilities of the open-source FOLIO ILS
5
5
  License: MIT
6
6
  Author: Brooks Travis
@@ -19,8 +19,7 @@ Requires-Dist: flake8-black (>=0.3.6,<0.4.0)
19
19
  Requires-Dist: flake8-bugbear (>=24.8.19,<25.0.0)
20
20
  Requires-Dist: flake8-docstrings (>=1.7.0,<2.0.0)
21
21
  Requires-Dist: flake8-isort (>=6.1.1,<7.0.0)
22
- Requires-Dist: folioclient (>=0.61.0,<0.62.0)
23
- Requires-Dist: httpx (>=0.27.2,<0.28.0)
22
+ Requires-Dist: folioclient (>=0.70.1,<0.71.0)
24
23
  Requires-Dist: inquirer (>=3.4.0,<4.0.0)
25
24
  Requires-Dist: pyhumps (>=3.8.0,<4.0.0)
26
25
  Requires-Dist: pymarc (>=5.2.2,<6.0.0)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "folio_data_import"
3
- version = "0.2.8rc11"
3
+ version = "0.3.0"
4
4
  description = "A python module to interact with the data importing capabilities of the open-source FOLIO ILS"
5
5
  authors = ["Brooks Travis <brooks.travis@gmail.com>"]
6
6
  license = "MIT"
@@ -14,8 +14,7 @@ folio-user-import = "folio_data_import.UserImport:sync_main"
14
14
 
15
15
  [tool.poetry.dependencies]
16
16
  python = "^3.9"
17
- folioclient = "^0.61.0"
18
- httpx = "^0.27.2"
17
+ folioclient = "^0.70.1"
19
18
  pymarc = "^5.2.2"
20
19
  pyhumps = "^3.8.0"
21
20
  inquirer = "^3.4.0"
@@ -2,9 +2,10 @@ import argparse
2
2
  import asyncio
3
3
  import datetime
4
4
  import glob
5
- import importlib
6
5
  import io
6
+ import json
7
7
  import logging
8
+ import math
8
9
  import os
9
10
  import sys
10
11
  import uuid
@@ -14,7 +15,7 @@ from functools import cached_property
14
15
  from getpass import getpass
15
16
  from pathlib import Path
16
17
  from time import sleep
17
- from typing import List
18
+ from typing import Any, BinaryIO, Callable, Dict, List, Union
18
19
 
19
20
  import folioclient
20
21
  import httpx
@@ -24,6 +25,9 @@ import tabulate
24
25
  from humps import decamelize
25
26
  from tqdm import tqdm
26
27
 
28
+ from folio_data_import.custom_exceptions import FolioDataImportBatchError
29
+ from folio_data_import.marc_preprocessors._preprocessors import MARCPreprocessor
30
+
27
31
  try:
28
32
  datetime_utc = datetime.UTC
29
33
  except AttributeError:
@@ -62,7 +66,6 @@ class MARCImportJob:
62
66
  import_profile_name (str): The name of the data import job profile to use.
63
67
  batch_size (int): The number of source records to include in a record batch (default=10).
64
68
  batch_delay (float): The number of seconds to wait between record batches (default=0).
65
- consolidate (bool): Consolidate files into a single job. Default is one job for each file.
66
69
  no_progress (bool): Disable progress bars (eg. for running in a CI environment).
67
70
  """
68
71
 
@@ -74,10 +77,14 @@ class MARCImportJob:
74
77
  http_client: httpx.Client
75
78
  current_file: List[Path]
76
79
  record_batch: List[dict] = []
77
- error_records: int = 0
78
80
  last_current: int = 0
79
81
  total_records_sent: int = 0
80
82
  finished: bool = False
83
+ job_id: str = ""
84
+ job_hrid: int = 0
85
+ current_file: Union[List[Path],List[io.BytesIO]] = []
86
+ _max_summary_retries: int = 2
87
+ _summary_retries: int = 0
81
88
 
82
89
  def __init__(
83
90
  self,
@@ -86,12 +93,17 @@ class MARCImportJob:
86
93
  import_profile_name: str,
87
94
  batch_size=10,
88
95
  batch_delay=0,
89
- marc_record_preprocessor=None,
90
- consolidate=False,
96
+ marc_record_preprocessor: Union[List[Callable], str]=[],
97
+ preprocessor_args: Dict[str,Dict]={},
91
98
  no_progress=False,
92
99
  let_summary_fail=False,
100
+ split_files=False,
101
+ split_size=1000,
102
+ split_offset=0,
93
103
  ) -> None:
94
- self.consolidate_files = consolidate
104
+ self.split_files = split_files
105
+ self.split_size = split_size
106
+ self.split_offset = split_offset
95
107
  self.no_progress = no_progress
96
108
  self.let_summary_fail = let_summary_fail
97
109
  self.folio_client: folioclient.FolioClient = folio_client
@@ -100,20 +112,14 @@ class MARCImportJob:
100
112
  self.batch_size = batch_size
101
113
  self.batch_delay = batch_delay
102
114
  self.current_retry_timeout = None
103
- self.marc_record_preprocessor = marc_record_preprocessor
104
- self.pbar_sent: tqdm
105
- self.pbar_imported: tqdm
106
- self._max_summary_retries: int = 2
107
- self._summary_retries: int = 0
115
+ self.marc_record_preprocessor: MARCPreprocessor = MARCPreprocessor(marc_record_preprocessor, **preprocessor_args)
108
116
 
109
117
  async def do_work(self) -> None:
110
118
  """
111
119
  Performs the necessary work for data import.
112
120
 
113
121
  This method initializes an HTTP client, files to store records that fail to send,
114
- and calls `self.import_marc_records` to import MARC files. If `consolidate_files` is True,
115
- it imports all the files specified in `import_files` as a single batch. Otherwise,
116
- it imports each file as a separate import job.
122
+ and calls the appropriate method to import MARC files based on the configuration.
117
123
 
118
124
  Returns:
119
125
  None
@@ -138,15 +144,33 @@ class MARCImportJob:
138
144
  self.failed_batches_file = failed_batches
139
145
  logger.info(f"Writing failed batches to {self.failed_batches_file.name}")
140
146
  self.http_client = http_client
141
- if self.consolidate_files:
142
- self.current_file = self.import_files
143
- await self.import_marc_file()
147
+ if self.split_files:
148
+ await self.process_split_files()
144
149
  else:
145
150
  for file in self.import_files:
146
151
  self.current_file = [file]
147
152
  await self.import_marc_file()
148
153
  await self.wrap_up()
149
154
 
155
+ async def process_split_files(self):
156
+ """
157
+ Process the import of files in smaller batches.
158
+ This method is called when `split_files` is set to True.
159
+ It splits each file into smaller chunks and processes them one by one.
160
+ """
161
+ for file in self.import_files:
162
+ with open(file, "rb") as f:
163
+ file_length = await self.read_total_records([f])
164
+ expected_batches = math.ceil(file_length /self.split_size)
165
+ logger.info(f"{file.name} contains {file_length} records. Splitting into {expected_batches} {self.split_size} record batches.")
166
+ zero_pad_parts = len(str(expected_batches)) if expected_batches > 1 else 2
167
+ for idx, batch in enumerate(self.split_marc_file(file, self.split_size), start=1):
168
+ if idx > self.split_offset:
169
+ batch.name = f"{file.name} (Part {idx:0{zero_pad_parts}})"
170
+ self.current_file = [batch]
171
+ await self.import_marc_file()
172
+ self.move_file_to_complete(file)
173
+
150
174
  async def wrap_up(self) -> None:
151
175
  """
152
176
  Wraps up the data import process.
@@ -194,7 +218,7 @@ class MARCImportJob:
194
218
  )
195
219
  self.current_retry_timeout = None
196
220
  except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
197
- if not hasattr(e, "response") or e.response.status_code in [502, 504]:
221
+ if not hasattr(e, "response") or e.response.status_code in [502, 504, 401]:
198
222
  error_text = e.response.text if hasattr(e, "response") else str(e)
199
223
  logger.warning(f"SERVER ERROR fetching job status: {error_text}. Retrying.")
200
224
  sleep(0.25)
@@ -256,7 +280,7 @@ class MARCImportJob:
256
280
  """
257
281
  try:
258
282
  create_job = self.http_client.post(
259
- self.folio_client.okapi_url + "/change-manager/jobExecutions",
283
+ self.folio_client.gateway_url + "/change-manager/jobExecutions",
260
284
  headers=self.folio_client.okapi_headers,
261
285
  json={"sourceType": "ONLINE", "userId": self.folio_client.current_user},
262
286
  )
@@ -275,7 +299,7 @@ class MARCImportJob:
275
299
  )
276
300
  raise e
277
301
  self.job_id = create_job.json()["parentJobExecutionId"]
278
- logger.info("Created job: " + self.job_id)
302
+ logger.info(f"Created job: {self.job_id}")
279
303
 
280
304
  @cached_property
281
305
  def import_profile(self) -> dict:
@@ -305,7 +329,7 @@ class MARCImportJob:
305
329
  The response from the HTTP request to set the job profile.
306
330
  """
307
331
  set_job_profile = self.http_client.put(
308
- self.folio_client.okapi_url
332
+ self.folio_client.gateway_url
309
333
  + "/change-manager/jobExecutions/"
310
334
  + self.job_id
311
335
  + "/jobProfile",
@@ -318,6 +342,8 @@ class MARCImportJob:
318
342
  )
319
343
  try:
320
344
  set_job_profile.raise_for_status()
345
+ self.job_hrid = set_job_profile.json()['hrId']
346
+ logger.info(f"Job HRID: {self.job_hrid}")
321
347
  except httpx.HTTPError as e:
322
348
  logger.error(
323
349
  "Error creating job: "
@@ -328,7 +354,7 @@ class MARCImportJob:
328
354
  raise e
329
355
 
330
356
  @staticmethod
331
- async def read_total_records(files) -> int:
357
+ async def read_total_records(files: List[BinaryIO]) -> int:
332
358
  """
333
359
  Reads the total number of records from the given files.
334
360
 
@@ -357,17 +383,15 @@ class MARCImportJob:
357
383
  """
358
384
  try:
359
385
  post_batch = self.http_client.post(
360
- self.folio_client.okapi_url
386
+ self.folio_client.gateway_url
361
387
  + f"/change-manager/jobExecutions/{self.job_id}/records",
362
388
  headers=self.folio_client.okapi_headers,
363
389
  json=batch_payload,
364
390
  )
365
- # if batch_payload["recordsMetadata"]["last"]:
366
- # logger.log(
367
- # 25,
368
- # f"Sending last batch of {batch_payload['recordsMetadata']['total']} records.",
369
- # )
370
391
  except (httpx.ConnectTimeout, httpx.ReadTimeout):
392
+ logger.warning(
393
+ f"CONNECTION ERROR posting batch {batch_payload['id']}. Retrying..."
394
+ )
371
395
  sleep(0.25)
372
396
  return await self.process_record_batch(batch_payload)
373
397
  try:
@@ -375,20 +399,21 @@ class MARCImportJob:
375
399
  self.total_records_sent += len(self.record_batch)
376
400
  self.record_batch = []
377
401
  self.pbar_sent.update(len(batch_payload["initialRecords"]))
378
- except Exception as e:
402
+ except httpx.HTTPStatusError as e:
379
403
  if (
380
- hasattr(e, "response") and e.response.status_code in [500, 422]
381
- ): # TODO: #26 Check for specific error code once https://folio-org.atlassian.net/browse/MODSOURMAN-1281 is resolved
404
+ e.response.status_code in [500, 400, 422]
405
+ ): # TODO: Update once we no longer have to support < Sunflower to just be 400
382
406
  self.total_records_sent += len(self.record_batch)
383
407
  self.record_batch = []
384
408
  self.pbar_sent.update(len(batch_payload["initialRecords"]))
385
409
  else:
386
- logger.error("Error posting batch: " + str(e))
387
410
  for record in self.record_batch:
388
411
  self.failed_batches_file.write(record)
389
- self.error_records += len(self.record_batch)
390
- self.pbar_sent.total = self.pbar_sent.total - len(self.record_batch)
391
- self.record_batch = []
412
+ raise FolioDataImportBatchError(
413
+ batch_payload['id'],
414
+ f"{e}\n{e.response.text}",
415
+ e
416
+ )
392
417
  await self.get_job_status()
393
418
  sleep(self.batch_delay)
394
419
 
@@ -417,16 +442,12 @@ class MARCImportJob:
417
442
  await self.create_batch_payload(
418
443
  counter,
419
444
  total_records,
420
- (counter - self.error_records)
421
- == (total_records - self.error_records),
445
+ counter == total_records,
422
446
  ),
423
447
  )
424
448
  sleep(0.25)
425
449
  if record:
426
- if self.marc_record_preprocessor:
427
- record = await self.apply_marc_record_preprocessing(
428
- record, self.marc_record_preprocessor
429
- )
450
+ record = self.marc_record_preprocessor.do_work(record)
430
451
  self.record_batch.append(record.as_marc())
431
452
  counter += 1
432
453
  else:
@@ -437,75 +458,26 @@ class MARCImportJob:
437
458
  "",
438
459
  )
439
460
  self.bad_records_file.write(reader.current_chunk)
440
- if self.record_batch:
441
- await self.process_record_batch(
442
- await self.create_batch_payload(
443
- counter,
444
- total_records,
445
- (counter - self.error_records)
446
- == (total_records - self.error_records),
447
- ),
448
- )
449
- import_complete_path = file_path.parent.joinpath("import_complete")
450
- if not import_complete_path.exists():
451
- logger.debug(f"Creating import_complete directory: {import_complete_path.absolute()}")
452
- import_complete_path.mkdir(exist_ok=True)
453
- logger.debug(f"Moving {file_path} to {import_complete_path.absolute()}")
454
- file_path.rename(
455
- file_path.parent.joinpath("import_complete", file_path.name)
456
- )
457
-
458
- @staticmethod
459
- async def apply_marc_record_preprocessing(
460
- record: pymarc.Record, func_or_path
461
- ) -> pymarc.Record:
462
- """
463
- Apply preprocessing to the MARC record before sending it to FOLIO.
464
-
465
- Args:
466
- record (pymarc.Record): The MARC record to preprocess.
467
- func_or_path (Union[Callable, str]): The preprocessing function or its import path.
468
-
469
- Returns:
470
- pymarc.Record: The preprocessed MARC record.
471
- """
472
- if isinstance(func_or_path, str):
473
- func_paths = func_or_path.split(",")
474
- for func_path in func_paths:
475
- record = await MARCImportJob._apply_single_marc_record_preprocessing_by_path(
476
- record, func_path
477
- )
478
- elif callable(func_or_path):
479
- record = func_or_path(record)
480
- else:
481
- logger.warning(
482
- f"Invalid preprocessing function: {func_or_path}. Skipping preprocessing."
461
+ if not self.split_files:
462
+ self.move_file_to_complete(file_path)
463
+ if self.record_batch or not self.finished:
464
+ await self.process_record_batch(
465
+ await self.create_batch_payload(
466
+ counter,
467
+ total_records,
468
+ counter == total_records,
469
+ ),
483
470
  )
484
- return record
485
471
 
486
- async def _apply_single_marc_record_preprocessing_by_path(
487
- record: pymarc.Record, func_path: str
488
- ) -> pymarc.Record:
489
- """
490
- Apply a single preprocessing function to the MARC record.
491
-
492
- Args:
493
- record (pymarc.Record): The MARC record to preprocess.
494
- func_path (str): The path to the preprocessing function.
495
-
496
- Returns:
497
- pymarc.Record: The preprocessed MARC record.
498
- """
499
- try:
500
- module_path, func_name = func_path.rsplit(".", 1)
501
- module = importlib.import_module(module_path)
502
- func = getattr(module, func_name)
503
- record = func(record)
504
- except Exception as e:
505
- logger.warning(
506
- f"Error applying preprocessing function {func_path}: {e}. Skipping."
472
+ def move_file_to_complete(self, file_path: Path):
473
+ import_complete_path = file_path.parent.joinpath("import_complete")
474
+ if not import_complete_path.exists():
475
+ logger.debug(f"Creating import_complete directory: {import_complete_path.absolute()}")
476
+ import_complete_path.mkdir(exist_ok=True)
477
+ logger.debug(f"Moving {file_path} to {import_complete_path.absolute()}")
478
+ file_path.rename(
479
+ file_path.parent.joinpath("import_complete", file_path.name)
507
480
  )
508
- return record
509
481
 
510
482
  async def create_batch_payload(self, counter, total_records, is_last) -> dict:
511
483
  """
@@ -523,13 +495,53 @@ class MARCImportJob:
523
495
  "id": str(uuid.uuid4()),
524
496
  "recordsMetadata": {
525
497
  "last": is_last,
526
- "counter": counter - self.error_records,
498
+ "counter": counter,
527
499
  "contentType": "MARC_RAW",
528
- "total": total_records - self.error_records,
500
+ "total": total_records,
529
501
  },
530
502
  "initialRecords": [{"record": x.decode()} for x in self.record_batch],
531
503
  }
532
504
 
505
+ @staticmethod
506
+ def split_marc_file(file_path, batch_size):
507
+ """Generator to iterate over MARC records in batches, yielding BytesIO objects."""
508
+ with open(file_path, "rb") as f:
509
+ batch = io.BytesIO()
510
+ count = 0
511
+
512
+ while True:
513
+ leader = f.read(24)
514
+ if not leader:
515
+ break # End of file
516
+
517
+ try:
518
+ record_length = int(leader[:5]) # Extract record length from leader
519
+ except ValueError:
520
+ raise ValueError("Invalid MARC record length encountered.")
521
+
522
+ record_body = f.read(record_length - 24)
523
+ if len(record_body) != record_length - 24:
524
+ raise ValueError("Unexpected end of file while reading MARC record.")
525
+
526
+ # Verify record terminator
527
+ if record_body[-1:] != b'\x1D':
528
+ raise ValueError("MARC record does not end with the expected terminator (0x1D).")
529
+
530
+ # Write the full record to the batch buffer
531
+ batch.write(leader + record_body)
532
+ count += 1
533
+
534
+ if count >= batch_size:
535
+ batch.seek(0)
536
+ yield batch
537
+ batch = io.BytesIO() # Reset buffer
538
+ count = 0
539
+
540
+ # Yield any remaining records
541
+ if count > 0:
542
+ batch.seek(0)
543
+ yield batch
544
+
533
545
  async def import_marc_file(self) -> None:
534
546
  """
535
547
  Imports MARC file into the system.
@@ -551,13 +563,24 @@ class MARCImportJob:
551
563
  await self.create_folio_import_job()
552
564
  await self.set_job_profile()
553
565
  with ExitStack() as stack:
554
- files = [
555
- stack.enter_context(open(file, "rb")) for file in self.current_file
556
- ]
566
+ try:
567
+ if isinstance(self.current_file[0], Path):
568
+ files = [
569
+ stack.enter_context(open(file, "rb")) for file in self.current_file
570
+ ]
571
+ elif isinstance(self.current_file[0], io.BytesIO):
572
+ files = [
573
+ stack.enter_context(file) for file in self.current_file
574
+ ]
575
+ else:
576
+ raise ValueError("Invalid file type. Must be Path or BytesIO.")
577
+ except IndexError as e:
578
+ logger.error(f"Error opening file: {e}")
579
+ raise e
557
580
  total_records = await self.read_total_records(files)
558
581
  with (
559
582
  tqdm(
560
- desc="Imported: ",
583
+ desc=f"Imported ({self.job_hrid}): ",
561
584
  total=total_records,
562
585
  position=1,
563
586
  disable=self.no_progress,
@@ -569,48 +592,81 @@ class MARCImportJob:
569
592
  disable=self.no_progress,
570
593
  ) as pbar_sent,
571
594
  ):
572
- self.pbar_sent = pbar_sent
573
- self.pbar_imported = pbar_imported
574
- await self.process_records(files, total_records)
575
- while not self.finished:
576
- await self.get_job_status()
577
- sleep(1)
595
+ try:
596
+ self.pbar_sent = pbar_sent
597
+ self.pbar_imported = pbar_imported
598
+ await self.process_records(files, total_records)
599
+ while not self.finished:
600
+ await self.get_job_status()
601
+ sleep(1)
602
+ except FolioDataImportBatchError as e:
603
+ logger.error(
604
+ f"Unhandled error posting batch {e.batch_id}: {e.message}"
605
+ )
606
+ await self.cancel_job()
607
+ raise e
578
608
  if self.finished:
579
- if job_summary := await self.get_job_summary():
580
- job_id = job_summary.pop("jobExecutionId", None)
581
- total_errors = job_summary.pop("totalErrors", 0)
582
- columns = ["Summary"] + list(job_summary.keys())
583
- rows = set()
584
- for key in columns[1:]:
585
- rows.update(job_summary[key].keys())
586
-
587
- table_data = []
588
- for row in rows:
589
- metric_name = decamelize(row).split("_")[1]
590
- table_row = [metric_name]
591
- for col in columns[1:]:
592
- table_row.append(job_summary[col].get(row, "N/A"))
593
- table_data.append(table_row)
594
- table_data.sort(key=lambda x: REPORT_SUMMARY_ORDERING.get(x[0], 99))
595
- columns = columns[:1] + [
609
+ await self.log_job_summary()
610
+ self.last_current = 0
611
+ self.finished = False
612
+
613
+ async def cancel_job(self) -> None:
614
+ """
615
+ Cancels the current job execution.
616
+
617
+ This method sends a request to cancel the job execution and logs the result.
618
+
619
+ Returns:
620
+ None
621
+ """
622
+ try:
623
+ cancel = self.http_client.delete(
624
+ self.folio_client.gateway_url
625
+ + f"/change-manager/jobExecutions/{self.job_id}/records",
626
+ headers=self.folio_client.okapi_headers,
627
+ )
628
+ cancel.raise_for_status()
629
+ self.finished = True
630
+ logger.info(f"Cancelled job: {self.job_id}")
631
+ except (httpx.ConnectTimeout, httpx.ReadTimeout):
632
+ logger.warning(f"CONNECTION ERROR cancelling job {self.job_id}. Retrying...")
633
+ sleep(0.25)
634
+ await self.cancel_job()
635
+
636
+ async def log_job_summary(self):
637
+ if job_summary := await self.get_job_summary():
638
+ job_id = job_summary.pop("jobExecutionId", None)
639
+ total_errors = job_summary.pop("totalErrors", 0)
640
+ columns = ["Summary"] + list(job_summary.keys())
641
+ rows = set()
642
+ for key in columns[1:]:
643
+ rows.update(job_summary[key].keys())
644
+
645
+ table_data = []
646
+ for row in rows:
647
+ metric_name = decamelize(row).split("_")[1]
648
+ table_row = [metric_name]
649
+ for col in columns[1:]:
650
+ table_row.append(job_summary[col].get(row, "N/A"))
651
+ table_data.append(table_row)
652
+ table_data.sort(key=lambda x: REPORT_SUMMARY_ORDERING.get(x[0], 99))
653
+ columns = columns[:1] + [
596
654
  " ".join(decamelize(x).split("_")[:-1]) for x in columns[1:]
597
655
  ]
598
- logger.info(
656
+ logger.info(
599
657
  f"Results for {'file' if len(self.current_file) == 1 else 'files'}: "
600
658
  f"{', '.join([os.path.basename(x.name) for x in self.current_file])}"
601
659
  )
602
- logger.info(
660
+ logger.info(
603
661
  "\n"
604
662
  + tabulate.tabulate(
605
663
  table_data, headers=columns, tablefmt="fancy_grid"
606
664
  ),
607
665
  )
608
- if total_errors:
609
- logger.info(f"Total errors: {total_errors}. Job ID: {job_id}.")
610
- else:
611
- logger.error(f"No job summary available for job {self.job_id}.")
612
- self.last_current = 0
613
- self.finished = False
666
+ if total_errors:
667
+ logger.info(f"Total errors: {total_errors}. Job ID: {job_id}.")
668
+ else:
669
+ logger.error(f"No job summary available for job #{self.job_hrid}({self.job_id}).")
614
670
 
615
671
  async def get_job_summary(self) -> dict:
616
672
  """
@@ -749,19 +805,31 @@ async def main() -> None:
749
805
  "--preprocessor",
750
806
  type=str,
751
807
  help=(
752
- "The path to a Python module containing a preprocessing function "
753
- "to apply to each MARC record before sending to FOLIO."
808
+ "Comma-separated python import paths to Python function(s) "
809
+ "to apply to each MARC record before sending to FOLIO. Function should take "
810
+ "a pymarc.Record object as input and return a pymarc.Record object."
754
811
  ),
755
812
  default=None,
756
813
  )
814
+
757
815
  parser.add_argument(
758
- "--consolidate",
816
+ "--split-files",
759
817
  action="store_true",
760
- help=(
761
- "Consolidate records into a single job. "
762
- "Default is to create a new job for each MARC file."
763
- ),
818
+ help="Split files into smaller parts before importing.",
764
819
  )
820
+ parser.add_argument(
821
+ "--split-size",
822
+ type=int,
823
+ help="The number of records to include in each split file.",
824
+ default=1000,
825
+ )
826
+ parser.add_argument(
827
+ "--split-offset",
828
+ type=int,
829
+ help="The number of record batches of <split-size> to skip before starting import.",
830
+ default=0,
831
+ )
832
+
765
833
  parser.add_argument(
766
834
  "--no-progress",
767
835
  action="store_true",
@@ -772,6 +840,16 @@ async def main() -> None:
772
840
  action="store_true",
773
841
  help="Do not retry fetching the final job summary if it fails",
774
842
  )
843
+ parser.add_argument(
844
+ "--preprocessor-config",
845
+ type=str,
846
+ help=(
847
+ "JSON file containing configuration for preprocessor functions. "
848
+ "This is passed to MARCPreprocessor class as a dict of dicts."
849
+ ),
850
+ default=None,
851
+ )
852
+
775
853
  args = parser.parse_args()
776
854
  if not args.password:
777
855
  args.password = getpass("Enter FOLIO password: ")
@@ -796,6 +874,12 @@ async def main() -> None:
796
874
  else:
797
875
  logger.info(marc_files)
798
876
 
877
+ if args.preprocessor_config:
878
+ with open(args.preprocessor_config, "r") as f:
879
+ preprocessor_args = json.load(f)
880
+ else:
881
+ preprocessor_args = {}
882
+
799
883
  if not args.import_profile_name:
800
884
  import_profiles = folio_client.folio_get(
801
885
  "/data-import-profiles/jobProfiles",
@@ -824,9 +908,12 @@ async def main() -> None:
824
908
  batch_size=args.batch_size,
825
909
  batch_delay=args.batch_delay,
826
910
  marc_record_preprocessor=args.preprocessor,
827
- consolidate=bool(args.consolidate),
911
+ preprocessor_args=preprocessor_args,
828
912
  no_progress=bool(args.no_progress),
829
913
  let_summary_fail=bool(args.let_summary_fail),
914
+ split_files=bool(args.split_files),
915
+ split_size=args.split_size,
916
+ split_offset=args.split_offset,
830
917
  ).do_work()
831
918
  except Exception as e:
832
919
  logger.error("Error importing files: " + str(e))