folio-data-import 0.2.8rc12__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of folio-data-import might be problematic. Click here for more details.

@@ -2,8 +2,8 @@ import argparse
2
2
  import asyncio
3
3
  import datetime
4
4
  import glob
5
- import importlib
6
5
  import io
6
+ import json
7
7
  import logging
8
8
  import math
9
9
  import os
@@ -15,7 +15,7 @@ from functools import cached_property
15
15
  from getpass import getpass
16
16
  from pathlib import Path
17
17
  from time import sleep
18
- from typing import List, Union
18
+ from typing import Any, BinaryIO, Callable, Dict, List, Union
19
19
 
20
20
  import folioclient
21
21
  import httpx
@@ -25,6 +25,9 @@ import tabulate
25
25
  from humps import decamelize
26
26
  from tqdm import tqdm
27
27
 
28
+ from folio_data_import.custom_exceptions import FolioDataImportBatchError
29
+ from folio_data_import.marc_preprocessors._preprocessors import MARCPreprocessor
30
+
28
31
  try:
29
32
  datetime_utc = datetime.UTC
30
33
  except AttributeError:
@@ -63,7 +66,6 @@ class MARCImportJob:
63
66
  import_profile_name (str): The name of the data import job profile to use.
64
67
  batch_size (int): The number of source records to include in a record batch (default=10).
65
68
  batch_delay (float): The number of seconds to wait between record batches (default=0).
66
- consolidate (bool): Consolidate files into a single job. Default is one job for each file.
67
69
  no_progress (bool): Disable progress bars (eg. for running in a CI environment).
68
70
  """
69
71
 
@@ -75,7 +77,6 @@ class MARCImportJob:
75
77
  http_client: httpx.Client
76
78
  current_file: List[Path]
77
79
  record_batch: List[dict] = []
78
- error_records: int = 0
79
80
  last_current: int = 0
80
81
  total_records_sent: int = 0
81
82
  finished: bool = False
@@ -92,18 +93,17 @@ class MARCImportJob:
92
93
  import_profile_name: str,
93
94
  batch_size=10,
94
95
  batch_delay=0,
95
- marc_record_preprocessor=None,
96
- consolidate=False,
96
+ marc_record_preprocessor: Union[List[Callable], str]=[],
97
+ preprocessor_args: Dict[str,Dict]={},
97
98
  no_progress=False,
98
99
  let_summary_fail=False,
99
100
  split_files=False,
100
101
  split_size=1000,
102
+ split_offset=0,
101
103
  ) -> None:
102
- self.consolidate_files = consolidate
103
104
  self.split_files = split_files
104
105
  self.split_size = split_size
105
- if self.split_files and self.consolidate_files:
106
- raise ValueError("Cannot consolidate and split files at the same time.")
106
+ self.split_offset = split_offset
107
107
  self.no_progress = no_progress
108
108
  self.let_summary_fail = let_summary_fail
109
109
  self.folio_client: folioclient.FolioClient = folio_client
@@ -112,16 +112,14 @@ class MARCImportJob:
112
112
  self.batch_size = batch_size
113
113
  self.batch_delay = batch_delay
114
114
  self.current_retry_timeout = None
115
- self.marc_record_preprocessor = marc_record_preprocessor
115
+ self.marc_record_preprocessor: MARCPreprocessor = MARCPreprocessor(marc_record_preprocessor, **preprocessor_args)
116
116
 
117
117
  async def do_work(self) -> None:
118
118
  """
119
119
  Performs the necessary work for data import.
120
120
 
121
121
  This method initializes an HTTP client, files to store records that fail to send,
122
- and calls `self.import_marc_records` to import MARC files. If `consolidate_files` is True,
123
- it imports all the files specified in `import_files` as a single batch. Otherwise,
124
- it imports each file as a separate import job.
122
+ and calls the appropriate method to import MARC files based on the configuration.
125
123
 
126
124
  Returns:
127
125
  None
@@ -146,27 +144,33 @@ class MARCImportJob:
146
144
  self.failed_batches_file = failed_batches
147
145
  logger.info(f"Writing failed batches to {self.failed_batches_file.name}")
148
146
  self.http_client = http_client
149
- if self.consolidate_files:
150
- self.current_file = self.import_files
151
- await self.import_marc_file()
152
- elif self.split_files:
153
- for file in self.import_files:
154
- with open(file, "rb") as f:
155
- file_length = await self.read_total_records([f])
156
- expected_batches = math.ceil(file_length /self.split_size)
157
- logger.info(f"{file.name} contains {file_length} records. Splitting into {expected_batches} {self.split_size} record batches.")
158
- zero_pad_parts = len(str(expected_batches)) if expected_batches > 1 else 2
159
- for idx, batch in enumerate(self.split_marc_file(file, self.split_size), start=1):
160
- batch.name = f"{file.name}_part{idx:0{zero_pad_parts}}"
161
- self.current_file = [batch]
162
- await self.import_marc_file()
163
- self.move_file_to_complete(file)
147
+ if self.split_files:
148
+ await self.process_split_files()
164
149
  else:
165
150
  for file in self.import_files:
166
151
  self.current_file = [file]
167
152
  await self.import_marc_file()
168
153
  await self.wrap_up()
169
154
 
155
async def process_split_files(self):
    """
    Import each file in `self.import_files` as a series of smaller jobs.

    This method is called when `split_files` is set to True. For every file it
    counts the records, splits the file into parts of `self.split_size`
    records, and imports each part as its own job via `import_marc_file`.
    The first `self.split_offset` parts of every file are skipped. Once all
    parts of a file have been handled, the file is moved with
    `move_file_to_complete`.

    Returns:
        None
    """
    for file in self.import_files:
        # Count the records up front so the part names can be zero-padded.
        with open(file, "rb") as f:
            file_length = await self.read_total_records([f])
        expected_batches = math.ceil(file_length / self.split_size)
        logger.info(f"{file.name} contains {file_length} records. Splitting into {expected_batches} {self.split_size} record batches.")
        if 0 < expected_batches <= self.split_offset:
            # Every part is skipped below, yet the file is still moved to the
            # completed folder; say so instead of doing it silently.
            logger.warning(
                f"split_offset ({self.split_offset}) skips all {expected_batches} "
                f"parts of {file.name}; no records from this file will be imported."
            )
        zero_pad_parts = len(str(expected_batches)) if expected_batches > 1 else 2
        parts = self.split_marc_file(file, self.split_size)
        for idx, batch in enumerate(parts, start=1):
            if idx <= self.split_offset:
                # Part falls inside the requested offset; skip it.
                continue
            batch.name = f"{file.name} (Part {idx:0{zero_pad_parts}})"
            self.current_file = [batch]
            await self.import_marc_file()
        self.move_file_to_complete(file)
173
+
170
174
  async def wrap_up(self) -> None:
171
175
  """
172
176
  Wraps up the data import process.
@@ -214,7 +218,7 @@ class MARCImportJob:
214
218
  )
215
219
  self.current_retry_timeout = None
216
220
  except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
217
- if not hasattr(e, "response") or e.response.status_code in [502, 504]:
221
+ if not hasattr(e, "response") or e.response.status_code in [502, 504, 401]:
218
222
  error_text = e.response.text if hasattr(e, "response") else str(e)
219
223
  logger.warning(f"SERVER ERROR fetching job status: {error_text}. Retrying.")
220
224
  sleep(0.25)
@@ -276,7 +280,7 @@ class MARCImportJob:
276
280
  """
277
281
  try:
278
282
  create_job = self.http_client.post(
279
- self.folio_client.okapi_url + "/change-manager/jobExecutions",
283
+ self.folio_client.gateway_url + "/change-manager/jobExecutions",
280
284
  headers=self.folio_client.okapi_headers,
281
285
  json={"sourceType": "ONLINE", "userId": self.folio_client.current_user},
282
286
  )
@@ -325,7 +329,7 @@ class MARCImportJob:
325
329
  The response from the HTTP request to set the job profile.
326
330
  """
327
331
  set_job_profile = self.http_client.put(
328
- self.folio_client.okapi_url
332
+ self.folio_client.gateway_url
329
333
  + "/change-manager/jobExecutions/"
330
334
  + self.job_id
331
335
  + "/jobProfile",
@@ -350,7 +354,7 @@ class MARCImportJob:
350
354
  raise e
351
355
 
352
356
  @staticmethod
353
- async def read_total_records(files) -> int:
357
+ async def read_total_records(files: List[BinaryIO]) -> int:
354
358
  """
355
359
  Reads the total number of records from the given files.
356
360
 
@@ -379,17 +383,15 @@ class MARCImportJob:
379
383
  """
380
384
  try:
381
385
  post_batch = self.http_client.post(
382
- self.folio_client.okapi_url
386
+ self.folio_client.gateway_url
383
387
  + f"/change-manager/jobExecutions/{self.job_id}/records",
384
388
  headers=self.folio_client.okapi_headers,
385
389
  json=batch_payload,
386
390
  )
387
- # if batch_payload["recordsMetadata"]["last"]:
388
- # logger.log(
389
- # 25,
390
- # f"Sending last batch of {batch_payload['recordsMetadata']['total']} records.",
391
- # )
392
391
  except (httpx.ConnectTimeout, httpx.ReadTimeout):
392
+ logger.warning(
393
+ f"CONNECTION ERROR posting batch {batch_payload['id']}. Retrying..."
394
+ )
393
395
  sleep(0.25)
394
396
  return await self.process_record_batch(batch_payload)
395
397
  try:
@@ -397,20 +399,21 @@ class MARCImportJob:
397
399
  self.total_records_sent += len(self.record_batch)
398
400
  self.record_batch = []
399
401
  self.pbar_sent.update(len(batch_payload["initialRecords"]))
400
- except Exception as e:
402
+ except httpx.HTTPStatusError as e:
401
403
  if (
402
- hasattr(e, "response") and e.response.status_code in [500, 422]
403
- ): # TODO: #26 Check for specific error code once https://folio-org.atlassian.net/browse/MODSOURMAN-1281 is resolved
404
+ e.response.status_code in [500, 400, 422]
405
+ ): # TODO: Update once we no longer have to support < Sunflower to just be 400
404
406
  self.total_records_sent += len(self.record_batch)
405
407
  self.record_batch = []
406
408
  self.pbar_sent.update(len(batch_payload["initialRecords"]))
407
409
  else:
408
- logger.error("Error posting batch: " + str(e))
409
410
  for record in self.record_batch:
410
411
  self.failed_batches_file.write(record)
411
- self.error_records += len(self.record_batch)
412
- self.pbar_sent.total = self.pbar_sent.total - len(self.record_batch)
413
- self.record_batch = []
412
+ raise FolioDataImportBatchError(
413
+ batch_payload['id'],
414
+ f"{e}\n{e.response.text}",
415
+ e
416
+ )
414
417
  await self.get_job_status()
415
418
  sleep(self.batch_delay)
416
419
 
@@ -439,16 +442,12 @@ class MARCImportJob:
439
442
  await self.create_batch_payload(
440
443
  counter,
441
444
  total_records,
442
- (counter - self.error_records)
443
- == (total_records - self.error_records),
445
+ counter == total_records,
444
446
  ),
445
447
  )
446
448
  sleep(0.25)
447
449
  if record:
448
- if self.marc_record_preprocessor:
449
- record = await self.apply_marc_record_preprocessing(
450
- record, self.marc_record_preprocessor
451
- )
450
+ record = self.marc_record_preprocessor.do_work(record)
452
451
  self.record_batch.append(record.as_marc())
453
452
  counter += 1
454
453
  else:
@@ -459,19 +458,18 @@ class MARCImportJob:
459
458
  "",
460
459
  )
461
460
  self.bad_records_file.write(reader.current_chunk)
462
- if self.record_batch:
463
- await self.process_record_batch(
464
- await self.create_batch_payload(
465
- counter,
466
- total_records,
467
- (counter - self.error_records)
468
- == (total_records - self.error_records),
469
- ),
470
- )
471
461
  if not self.split_files:
472
462
  self.move_file_to_complete(file_path)
463
+ if self.record_batch or not self.finished:
464
+ await self.process_record_batch(
465
+ await self.create_batch_payload(
466
+ counter,
467
+ total_records,
468
+ counter == total_records,
469
+ ),
470
+ )
473
471
 
474
- def move_file_to_complete(self, file_path):
472
+ def move_file_to_complete(self, file_path: Path):
475
473
  import_complete_path = file_path.parent.joinpath("import_complete")
476
474
  if not import_complete_path.exists():
477
475
  logger.debug(f"Creating import_complete directory: {import_complete_path.absolute()}")
@@ -481,58 +479,6 @@ class MARCImportJob:
481
479
  file_path.parent.joinpath("import_complete", file_path.name)
482
480
  )
483
481
 
484
- @staticmethod
485
- async def apply_marc_record_preprocessing(
486
- record: pymarc.Record, func_or_path
487
- ) -> pymarc.Record:
488
- """
489
- Apply preprocessing to the MARC record before sending it to FOLIO.
490
-
491
- Args:
492
- record (pymarc.Record): The MARC record to preprocess.
493
- func_or_path (Union[Callable, str]): The preprocessing function or its import path.
494
-
495
- Returns:
496
- pymarc.Record: The preprocessed MARC record.
497
- """
498
- if isinstance(func_or_path, str):
499
- func_paths = func_or_path.split(",")
500
- for func_path in func_paths:
501
- record = await MARCImportJob._apply_single_marc_record_preprocessing_by_path(
502
- record, func_path
503
- )
504
- elif callable(func_or_path):
505
- record = func_or_path(record)
506
- else:
507
- logger.warning(
508
- f"Invalid preprocessing function: {func_or_path}. Skipping preprocessing."
509
- )
510
- return record
511
-
512
- async def _apply_single_marc_record_preprocessing_by_path(
513
- record: pymarc.Record, func_path: str
514
- ) -> pymarc.Record:
515
- """
516
- Apply a single preprocessing function to the MARC record.
517
-
518
- Args:
519
- record (pymarc.Record): The MARC record to preprocess.
520
- func_path (str): The path to the preprocessing function.
521
-
522
- Returns:
523
- pymarc.Record: The preprocessed MARC record.
524
- """
525
- try:
526
- module_path, func_name = func_path.rsplit(".", 1)
527
- module = importlib.import_module(module_path)
528
- func = getattr(module, func_name)
529
- record = func(record)
530
- except Exception as e:
531
- logger.warning(
532
- f"Error applying preprocessing function {func_path}: {e}. Skipping."
533
- )
534
- return record
535
-
536
482
  async def create_batch_payload(self, counter, total_records, is_last) -> dict:
537
483
  """
538
484
  Create a batch payload for data import.
@@ -549,9 +495,9 @@ class MARCImportJob:
549
495
  "id": str(uuid.uuid4()),
550
496
  "recordsMetadata": {
551
497
  "last": is_last,
552
- "counter": counter - self.error_records,
498
+ "counter": counter,
553
499
  "contentType": "MARC_RAW",
554
- "total": total_records - self.error_records,
500
+ "total": total_records,
555
501
  },
556
502
  "initialRecords": [{"record": x.decode()} for x in self.record_batch],
557
503
  }
@@ -646,17 +592,47 @@ class MARCImportJob:
646
592
  disable=self.no_progress,
647
593
  ) as pbar_sent,
648
594
  ):
649
- self.pbar_sent = pbar_sent
650
- self.pbar_imported = pbar_imported
651
- await self.process_records(files, total_records)
652
- while not self.finished:
653
- await self.get_job_status()
654
- sleep(1)
595
+ try:
596
+ self.pbar_sent = pbar_sent
597
+ self.pbar_imported = pbar_imported
598
+ await self.process_records(files, total_records)
599
+ while not self.finished:
600
+ await self.get_job_status()
601
+ sleep(1)
602
+ except FolioDataImportBatchError as e:
603
+ logger.error(
604
+ f"Unhandled error posting batch {e.batch_id}: {e.message}"
605
+ )
606
+ await self.cancel_job()
607
+ raise e
655
608
  if self.finished:
656
609
  await self.log_job_summary()
657
610
  self.last_current = 0
658
611
  self.finished = False
659
612
 
613
async def cancel_job(self) -> None:
    """
    Cancels the current job execution.

    Sends a DELETE request to the records endpoint of the current job
    execution (`self.job_id`). When the request succeeds, `self.finished` is
    set to True and the cancellation is logged.

    Connect and read timeouts are retried after a 0.25 second pause. The retry
    is a loop rather than a recursive call, so a long outage cannot exhaust
    the interpreter's recursion limit. Any other error, including a non-2xx
    response raised by `raise_for_status`, propagates to the caller.

    Returns:
        None
    """
    url = (
        self.folio_client.gateway_url
        + f"/change-manager/jobExecutions/{self.job_id}/records"
    )
    while True:
        try:
            cancel = self.http_client.delete(
                url,
                headers=self.folio_client.okapi_headers,
            )
            cancel.raise_for_status()
        except (httpx.ConnectTimeout, httpx.ReadTimeout):
            logger.warning(f"CONNECTION ERROR cancelling job {self.job_id}. Retrying...")
            sleep(0.25)
            continue
        self.finished = True
        logger.info(f"Cancelled job: {self.job_id}")
        return
635
+
660
636
  async def log_job_summary(self):
661
637
  if job_summary := await self.get_job_summary():
662
638
  job_id = job_summary.pop("jobExecutionId", None)
@@ -835,17 +811,8 @@ async def main() -> None:
835
811
  ),
836
812
  default=None,
837
813
  )
838
- # Add mutually exclusive group for consolidate and split-files options
839
- group = parser.add_mutually_exclusive_group()
840
- group.add_argument(
841
- "--consolidate",
842
- action="store_true",
843
- help=(
844
- "Consolidate records into a single job. "
845
- "Default is to create a new job for each MARC file."
846
- ),
847
- )
848
- group.add_argument(
814
+
815
+ parser.add_argument(
849
816
  "--split-files",
850
817
  action="store_true",
851
818
  help="Split files into smaller parts before importing.",
@@ -856,6 +823,12 @@ async def main() -> None:
856
823
  help="The number of records to include in each split file.",
857
824
  default=1000,
858
825
  )
826
+ parser.add_argument(
827
+ "--split-offset",
828
+ type=int,
829
+ help="The number of record batches of <split-size> to skip before starting import.",
830
+ default=0,
831
+ )
859
832
 
860
833
  parser.add_argument(
861
834
  "--no-progress",
@@ -867,6 +840,16 @@ async def main() -> None:
867
840
  action="store_true",
868
841
  help="Do not retry fetching the final job summary if it fails",
869
842
  )
843
+ parser.add_argument(
844
+ "--preprocessor-config",
845
+ type=str,
846
+ help=(
847
+ "JSON file containing configuration for preprocessor functions. "
848
+ "This is passed to MARCPreprocessor class as a dict of dicts."
849
+ ),
850
+ default=None,
851
+ )
852
+
870
853
  args = parser.parse_args()
871
854
  if not args.password:
872
855
  args.password = getpass("Enter FOLIO password: ")
@@ -891,6 +874,12 @@ async def main() -> None:
891
874
  else:
892
875
  logger.info(marc_files)
893
876
 
877
+ if args.preprocessor_config:
878
+ with open(args.preprocessor_config, "r") as f:
879
+ preprocessor_args = json.load(f)
880
+ else:
881
+ preprocessor_args = {}
882
+
894
883
  if not args.import_profile_name:
895
884
  import_profiles = folio_client.folio_get(
896
885
  "/data-import-profiles/jobProfiles",
@@ -919,11 +908,12 @@ async def main() -> None:
919
908
  batch_size=args.batch_size,
920
909
  batch_delay=args.batch_delay,
921
910
  marc_record_preprocessor=args.preprocessor,
922
- consolidate=bool(args.consolidate),
911
+ preprocessor_args=preprocessor_args,
923
912
  no_progress=bool(args.no_progress),
924
913
  let_summary_fail=bool(args.let_summary_fail),
925
914
  split_files=bool(args.split_files),
926
915
  split_size=args.split_size,
916
+ split_offset=args.split_offset,
927
917
  ).do_work()
928
918
  except Exception as e:
929
919
  logger.error("Error importing files: " + str(e))
@@ -137,7 +137,7 @@ class UserImporter: # noqa: R0902
137
137
  match_key = "id" if ("id" in user_obj) else self.match_key
138
138
  try:
139
139
  existing_user = await self.http_client.get(
140
- self.folio_client.okapi_url + "/users",
140
+ self.folio_client.gateway_url + "/users",
141
141
  headers=self.folio_client.okapi_headers,
142
142
  params={"query": f"{match_key}=={user_obj[match_key]}"},
143
143
  )
@@ -161,7 +161,7 @@ class UserImporter: # noqa: R0902
161
161
  """
162
162
  try:
163
163
  existing_rp = await self.http_client.get(
164
- self.folio_client.okapi_url
164
+ self.folio_client.gateway_url
165
165
  + "/request-preference-storage/request-preference",
166
166
  headers=self.folio_client.okapi_headers,
167
167
  params={
@@ -188,7 +188,7 @@ class UserImporter: # noqa: R0902
188
188
  """
189
189
  try:
190
190
  existing_pu = await self.http_client.get(
191
- self.folio_client.okapi_url + "/perms/users",
191
+ self.folio_client.gateway_url + "/perms/users",
192
192
  headers=self.folio_client.okapi_headers,
193
193
  params={
194
194
  "query": f"userId=={existing_user.get('id', user_obj.get('id', ''))}"
@@ -369,7 +369,7 @@ class UserImporter: # noqa: R0902
369
369
  else:
370
370
  existing_user[key] = value
371
371
  create_update_user = await self.http_client.put(
372
- self.folio_client.okapi_url + f"/users/{existing_user['id']}",
372
+ self.folio_client.gateway_url + f"/users/{existing_user['id']}",
373
373
  headers=self.folio_client.okapi_headers,
374
374
  json=existing_user,
375
375
  )
@@ -389,7 +389,7 @@ class UserImporter: # noqa: R0902
389
389
  HTTPError: If the HTTP request to create the user fails.
390
390
  """
391
391
  response = await self.http_client.post(
392
- self.folio_client.okapi_url + "/users",
392
+ self.folio_client.gateway_url + "/users",
393
393
  headers=self.folio_client.okapi_headers,
394
394
  json=user_obj,
395
395
  )
@@ -589,7 +589,7 @@ class UserImporter: # noqa: R0902
589
589
  rp_obj["userId"] = new_user_obj["id"]
590
590
  # print(rp_obj)
591
591
  response = await self.http_client.post(
592
- self.folio_client.okapi_url
592
+ self.folio_client.gateway_url
593
593
  + "/request-preference-storage/request-preference",
594
594
  headers=self.folio_client.okapi_headers,
595
595
  json=rp_obj,
@@ -613,7 +613,7 @@ class UserImporter: # noqa: R0902
613
613
  existing_rp.update(rp_obj)
614
614
  # print(existing_rp)
615
615
  response = await self.http_client.put(
616
- self.folio_client.okapi_url
616
+ self.folio_client.gateway_url
617
617
  + f"/request-preference-storage/request-preference/{existing_rp['id']}",
618
618
  headers=self.folio_client.okapi_headers,
619
619
  json=existing_rp,
@@ -635,7 +635,7 @@ class UserImporter: # noqa: R0902
635
635
  """
636
636
  perms_user_obj = {"userId": new_user_obj["id"], "permissions": []}
637
637
  response = await self.http_client.post(
638
- self.folio_client.okapi_url + "/perms/users",
638
+ self.folio_client.gateway_url + "/perms/users",
639
639
  headers=self.folio_client.okapi_headers,
640
640
  json=perms_user_obj,
641
641
  )
@@ -788,7 +788,7 @@ class UserImporter: # noqa: R0902
788
788
  """
789
789
  try:
790
790
  existing_spu = await self.http_client.get(
791
- self.folio_client.okapi_url + "/service-points-users",
791
+ self.folio_client.gateway_url + "/service-points-users",
792
792
  headers=self.folio_client.okapi_headers,
793
793
  params={"query": f"userId=={existing_user['id']}"},
794
794
  )
@@ -812,7 +812,7 @@ class UserImporter: # noqa: R0902
812
812
  """
813
813
  spu_obj["userId"] = existing_user["id"]
814
814
  response = await self.http_client.post(
815
- self.folio_client.okapi_url + "/service-points-users",
815
+ self.folio_client.gateway_url + "/service-points-users",
816
816
  headers=self.folio_client.okapi_headers,
817
817
  json=spu_obj,
818
818
  )
@@ -831,7 +831,7 @@ class UserImporter: # noqa: R0902
831
831
  """
832
832
  existing_spu.update(spu_obj)
833
833
  response = await self.http_client.put(
834
- self.folio_client.okapi_url + f"/service-points-users/{existing_spu['id']}",
834
+ self.folio_client.gateway_url + f"/service-points-users/{existing_spu['id']}",
835
835
  headers=self.folio_client.okapi_headers,
836
836
  json=existing_spu,
837
837
  )
@@ -0,0 +1,17 @@
1
+ """Custom exceptions for the Folio Data Import module."""
2
+
3
class FolioDataImportError(Exception):
    """Base class for all exceptions in the Folio Data Import module."""


class FolioDataImportBatchError(FolioDataImportError):
    """Exception raised for errors in the Folio Data Import batch process.

    Attributes:
        batch_id -- ID of the batch that caused the error
        message -- explanation of the error
        exception -- the underlying exception that triggered this error, or
            None when no cause was supplied
    """

    def __init__(self, batch_id, message, exception=None):
        self.batch_id = batch_id
        self.message = message
        # Keep the original cause so handlers can inspect it (for example the
        # HTTP response) instead of silently dropping the argument.
        self.exception = exception
        super().__init__(f"Unhandled error posting batch {batch_id}: {message}")
@@ -1,69 +1,168 @@
1
+ import importlib
2
+ import sys
3
+ from typing import Callable, Dict, List, Tuple, Union
1
4
  import pymarc
2
5
  import logging
3
6
 
7
+ from pymarc.record import Record
8
+
4
9
  logger = logging.getLogger("folio_data_import.MARCDataImport")
5
10
 
11
class MARCPreprocessor:
    """
    A class to preprocess MARC records for data import into FOLIO.

    Holds an ordered list of `(function, keyword-arguments)` pairs and applies
    them in sequence to each record passed to `do_work`.
    """

    def __init__(self, preprocessors: Union[str, List[Callable], None], **kwargs):
        """
        Initialize the MARCPreprocessor with a list of preprocessors.

        Args:
            preprocessors (Union[str, List[Callable], None]): A string of
                comma-separated function names (bare names are looked up in
                this module, dotted names are imported), or a list of callable
                preprocessor functions to apply. None or an empty value means
                no preprocessing.
            **kwargs: Per-preprocessor keyword arguments, keyed by "default",
                by function name, or by dotted "module.function" path. Each
                value is a dict passed to the matching function.
        """
        self.preprocessor_args: Dict[str, Dict] = kwargs
        self.preprocessors: List[Tuple[Callable, Dict]] = self._get_preprocessor_functions(
            preprocessors
        )
        self.proc_kwargs = kwargs
        self.record = None

    def _get_preprocessor_args(self, func: Callable) -> Dict:
        """
        Get the arguments for the preprocessor function.

        Arguments are layered: the "default" entry first, then the entry keyed
        by the function's name, then the entry keyed by its dotted path. Later
        layers override earlier ones.

        Args:
            func (Callable): The preprocessor function.

        Returns:
            Dict: A new dictionary of arguments for the preprocessor function.
        """
        # Copy the defaults: updating the stored dict in place would leak one
        # function's arguments into every preprocessor resolved after it.
        path_args: Dict = dict(self.preprocessor_args.get("default", {}))
        func_name = getattr(func, "__name__", None)
        if func_name is not None:
            func_path = f"{getattr(func, '__module__', None)}.{func_name}"
            path_args.update(self.preprocessor_args.get(func_name, {}))
            path_args.update(self.preprocessor_args.get(func_path, {}))
        return path_args

    def _resolve_preprocessor(self, f_path: str) -> Union[Callable, None]:
        """
        Resolve one preprocessor name to a callable.

        Args:
            f_path (str): A bare function name (looked up in this module) or a
                dotted "module.function" path (imported).

        Returns:
            Union[Callable, None]: The callable, or None when it cannot be
            found, imported, or is not callable (a warning is logged).
        """
        module_path, _, func_name = f_path.rpartition(".")
        if not module_path:
            # If the function is not a full path, assume it's in the current module
            func = getattr(sys.modules[__name__], func_name, None)
            if func is None:
                logger.warning(
                    f"Preprocessing function {f_path} not found in current module. Skipping."
                )
                return None
        else:
            # If the function is a full path, import it
            try:
                module = importlib.import_module(module_path)
                func = getattr(module, func_name)
            except (ImportError, AttributeError) as e:
                logger.warning(
                    f"Error importing preprocessing function {f_path}: {e}. Skipping."
                )
                return None
        if not callable(func):
            logger.warning(
                f"Preprocessing function {f_path} is not callable. Skipping."
            )
            return None
        return func

    def _get_preprocessor_functions(
        self, func_list: Union[str, List[Callable], None]
    ) -> List[Tuple[Callable, Dict]]:
        """
        Get the preprocessor functions based on the provided names.

        Args:
            func_list (Union[str, List[Callable], None]): Comma-separated
                function names, a list of callables, or None/empty for no
                preprocessing.

        Returns:
            List[Tuple[Callable, Dict]]: The preprocessor functions, each
            paired with its keyword arguments, in the order given.
        """
        preprocessors: List[Tuple[Callable, Dict]] = []
        if not func_list:
            # None, "" and [] all mean "no preprocessing".
            return preprocessors
        if isinstance(func_list, str):
            for raw_name in func_list.split(","):
                f_path = raw_name.strip()
                if not f_path:
                    # Tolerate stray commas and surrounding whitespace.
                    continue
                func = self._resolve_preprocessor(f_path)
                if func is not None:
                    preprocessors.append((func, self._get_preprocessor_args(func)))
            return preprocessors
        for f in func_list:
            if not callable(f):
                logger.warning(
                    f"Preprocessing function {f} is not callable. Skipping."
                )
            else:
                preprocessors.append((f, self._get_preprocessor_args(f)))
        return preprocessors

    def do_work(self, record: "Record") -> "Record":
        """
        Preprocess the MARC record.

        Args:
            record (Record): The MARC record to preprocess.

        Returns:
            Record: The record after every configured preprocessor has been
            applied in order.
        """
        for proc, kwargs in self.preprocessors:
            record = proc(record, **kwargs)
        return record
6
100
 
7
- def prepend_prefix_001(record: pymarc.Record, prefix: str) -> pymarc.Record:
101
+
102
+
103
+ def prepend_prefix_001(record: Record, prefix: str) -> Record:
8
104
  """
9
105
  Prepend a prefix to the record's 001 field.
10
106
 
11
107
  Args:
12
- record (pymarc.Record): The MARC record to preprocess.
108
+ record (Record): The MARC record to preprocess.
13
109
  prefix (str): The prefix to prepend to the 001 field.
14
110
 
15
111
  Returns:
16
- pymarc.Record: The preprocessed MARC record.
112
+ Record: The preprocessed MARC record.
17
113
  """
18
- record["001"].data = f"({prefix})" + record["001"].data
114
+ if "001" in record:
115
+ record["001"].data = f"({prefix})" + record["001"].data
116
+ else:
117
+ logger.warning("Field '001' not found in record. Skipping prefix prepend.")
19
118
  return record
20
119
 
21
120
 
22
- def prepend_ppn_prefix_001(record: pymarc.Record) -> pymarc.Record:
121
+ def prepend_ppn_prefix_001(record: Record, **kwargs) -> Record:
23
122
  """
24
123
  Prepend the PPN prefix to the record's 001 field. Useful when
25
124
  importing records from the ABES SUDOC catalog
26
125
 
27
126
  Args:
28
- record (pymarc.Record): The MARC record to preprocess.
127
+ record (Record): The MARC record to preprocess.
29
128
 
30
129
  Returns:
31
- pymarc.Record: The preprocessed MARC record.
130
+ Record: The preprocessed MARC record.
32
131
  """
33
132
  return prepend_prefix_001(record, "PPN")
34
133
 
35
134
 
36
- def prepend_abes_prefix_001(record: pymarc.Record) -> pymarc.Record:
135
+ def prepend_abes_prefix_001(record: Record, **kwargs) -> Record:
37
136
  """
38
137
  Prepend the ABES prefix to the record's 001 field. Useful when
39
138
  importing records from the ABES SUDOC catalog
40
139
 
41
140
  Args:
42
- record (pymarc.Record): The MARC record to preprocess.
141
+ record (Record): The MARC record to preprocess.
43
142
 
44
143
  Returns:
45
- pymarc.Record: The preprocessed MARC record.
144
+ Record: The preprocessed MARC record.
46
145
  """
47
146
  return prepend_prefix_001(record, "ABES")
48
147
 
49
148
 
50
- def strip_999_ff_fields(record: pymarc.Record) -> pymarc.Record:
149
+ def strip_999_ff_fields(record: Record, **kwargs) -> Record:
51
150
  """
52
151
  Strip all 999 fields with ff indicators from the record.
53
152
  Useful when importing records exported from another FOLIO system
54
153
 
55
154
  Args:
56
- record (pymarc.Record): The MARC record to preprocess.
155
+ record (Record): The MARC record to preprocess.
57
156
 
58
157
  Returns:
59
- pymarc.Record: The preprocessed MARC record.
158
+ Record: The preprocessed MARC record.
60
159
  """
61
160
  for field in record.get_fields("999"):
62
161
  if field.indicators == pymarc.Indicators(*["f", "f"]):
63
162
  record.remove_field(field)
64
163
  return record
65
164
 
66
- def clean_999_fields(record: pymarc.Record) -> pymarc.Record:
165
+ def clean_999_fields(record: Record, **kwargs) -> Record:
67
166
  """
68
167
  The presence of 999 fields, with or without ff indicators, can cause
69
168
  issues with data import mapping in FOLIO. This function calls strip_999_ff_fields
@@ -71,10 +170,10 @@ def clean_999_fields(record: pymarc.Record) -> pymarc.Record:
71
170
  to 945 fields.
72
171
 
73
172
  Args:
74
- record (pymarc.Record): The MARC record to preprocess.
173
+ record (Record): The MARC record to preprocess.
75
174
 
76
175
  Returns:
77
- pymarc.Record: The preprocessed MARC record.
176
+ Record: The preprocessed MARC record.
78
177
  """
79
178
  record = strip_999_ff_fields(record)
80
179
  for field in record.get_fields("999"):
@@ -87,7 +186,31 @@ def clean_999_fields(record: pymarc.Record) -> pymarc.Record:
87
186
  record.remove_field(field)
88
187
  return record
89
188
 
90
- def sudoc_supercede_prep(record: pymarc.Record) -> pymarc.Record:
189
def clean_non_ff_999_fields(record: Record, **kwargs) -> Record:
    """
    When loading migrated MARC records from folio_migration_tools, the presence of other 999 fields
    than those set by the migration process can cause the record to fail to load properly. This preprocessor
    function moves all 999 fields with non-ff indicators to 945 fields with 99 indicators.

    Each move is logged at custom level 26 together with the record's 001
    value. A record without a 001 field is still cleaned; its identifier is
    logged as an empty string.

    Args:
        record (Record): The MARC record to preprocess.

    Returns:
        Record: The preprocessed MARC record.
    """
    ff_indicators = pymarc.Indicators("f", "f")
    # Guard the 001 lookup: it is only used for the log line and must not stop
    # a record that lacks a 001 from being cleaned.
    record_id = record["001"].value() if "001" in record else ""
    for field in record.get_fields("999"):
        if field.indicators == ff_indicators:
            continue
        logger.log(
            26,
            "DATA ISSUE\t%s\t%s\t%s",
            record_id,
            "Record contains a 999 field with non-ff indicators: Moving field to a 945 with indicators \"99\"",
            field,
        )
        _945 = pymarc.Field(
            tag="945",
            indicators=pymarc.Indicators("9", "9"),
            subfields=field.subfields,
        )
        record.add_ordered_field(_945)
        record.remove_field(field)
    return record
212
+
213
+ def sudoc_supercede_prep(record: Record, **kwargs) -> Record:
91
214
  """
92
215
  Preprocesses a record from the ABES SUDOC catalog to copy 035 fields
93
216
  with a $9 subfield value of 'sudoc' to 935 fields with a $a subfield
@@ -96,10 +219,10 @@ def sudoc_supercede_prep(record: pymarc.Record) -> pymarc.Record:
96
219
  in FOLIO. This also applies the prepend_abes_prefix_001 function to the record.
97
220
 
98
221
  Args:
99
- record (pymarc.Record): The MARC record to preprocess.
222
+ record (Record): The MARC record to preprocess.
100
223
 
101
224
  Returns:
102
- pymarc.Record: The preprocessed MARC record.
225
+ Record: The preprocessed MARC record.
103
226
  """
104
227
  record = prepend_abes_prefix_001(record)
105
228
  for field in record.get_fields("035"):
@@ -113,7 +236,7 @@ def sudoc_supercede_prep(record: pymarc.Record) -> pymarc.Record:
113
236
  return record
114
237
 
115
238
 
116
- def clean_empty_fields(record: pymarc.Record) -> pymarc.Record:
239
+ def clean_empty_fields(record: Record, **kwargs) -> Record:
117
240
  """
118
241
  Remove empty fields and subfields from the record. These can cause
119
242
  data import mapping issues in FOLIO. Removals are logged at custom
@@ -121,10 +244,10 @@ def clean_empty_fields(record: pymarc.Record) -> pymarc.Record:
121
244
  data issues report.
122
245
 
123
246
  Args:
124
- record (pymarc.Record): The MARC record to preprocess.
247
+ record (Record): The MARC record to preprocess.
125
248
 
126
249
  Returns:
127
- pymarc.Record: The preprocessed MARC record.
250
+ Record: The preprocessed MARC record.
128
251
  """
129
252
  MAPPED_FIELDS = {
130
253
  "010": ["a", "z"],
@@ -233,73 +356,72 @@ def clean_empty_fields(record: pymarc.Record) -> pymarc.Record:
233
356
  "856": ["u", "y", "z"],
234
357
  }
235
358
 
236
- for field in list(record.get_fields()):
359
+ for field in record.get_fields(*MAPPED_FIELDS.keys()):
237
360
  len_subs = len(field.subfields)
238
- subfield_value = bool(field.subfields[0].value) if len_subs > 0 else False
239
- if not int(field.tag) >= 900 and field.tag in MAPPED_FIELDS:
240
- if int(field.tag) > 9 and len_subs == 0:
361
+ subfield_value = bool(field.subfields[0].value) if len_subs else False
362
+ if int(field.tag) > 9 and len_subs == 0:
363
+ logger.log(
364
+ 26,
365
+ "DATA ISSUE\t%s\t%s\t%s",
366
+ record["001"].value(),
367
+ f"{field.tag} is empty, removing field",
368
+ field,
369
+ )
370
+ record.remove_field(field)
371
+ elif len_subs == 1 and not subfield_value:
372
+ logger.log(
373
+ 26,
374
+ "DATA ISSUE\t%s\t%s\t%s",
375
+ record["001"].value(),
376
+ f"{field.tag}${field.subfields[0].code} is empty, no other subfields present, removing field",
377
+ field,
378
+ )
379
+ record.remove_field(field)
380
+ else:
381
+ if len_subs > 1 and "a" in field and not field["a"].strip():
241
382
  logger.log(
242
383
  26,
243
384
  "DATA ISSUE\t%s\t%s\t%s",
244
385
  record["001"].value(),
245
- f"{field.tag} is empty, removing field",
386
+ f"{field.tag}$a is empty, removing subfield",
246
387
  field,
247
388
  )
248
- record.remove_field(field)
249
- elif len_subs == 1 and not subfield_value:
389
+ field.delete_subfield("a")
390
+ for idx, subfield in enumerate(list(field.subfields), start=1):
391
+ if (
392
+ subfield.code in MAPPED_FIELDS.get(field.tag, [])
393
+ and not subfield.value
394
+ ):
395
+ logger.log(
396
+ 26,
397
+ "DATA ISSUE\t%s\t%s\t%s",
398
+ record["001"].value(),
399
+ f"{field.tag}${subfield.code} ({ordinal(idx)} subfield) is empty, but other subfields have values, removing subfield",
400
+ field,
401
+ )
402
+ field.delete_subfield(subfield.code)
403
+ if len(field.subfields) == 0:
250
404
  logger.log(
251
405
  26,
252
406
  "DATA ISSUE\t%s\t%s\t%s",
253
407
  record["001"].value(),
254
- f"{field.tag}${field.subfields[0].code} is empty, no other subfields present, removing field",
408
+ f"{field.tag} has no non-empty subfields after cleaning, removing field",
255
409
  field,
256
410
  )
257
411
  record.remove_field(field)
258
- else:
259
- if len_subs > 1 and "a" in field and not field["a"].strip():
260
- logger.log(
261
- 26,
262
- "DATA ISSUE\t%s\t%s\t%s",
263
- record["001"].value(),
264
- f"{field.tag}$a is empty, removing subfield",
265
- field,
266
- )
267
- field.delete_subfield("a")
268
- for idx, subfield in enumerate(list(field.subfields), start=1):
269
- if (
270
- subfield.code in MAPPED_FIELDS.get(field.tag, [])
271
- and not subfield.value
272
- ):
273
- logger.log(
274
- 26,
275
- "DATA ISSUE\t%s\t%s\t%s",
276
- record["001"].value(),
277
- f"{field.tag}${subfield.code} ({ordinal(idx)} subfield) is empty, but other subfields have values, removing subfield",
278
- field,
279
- )
280
- field.delete_subfield(subfield.code)
281
- if len(field.subfields) == 0:
282
- logger.log(
283
- 26,
284
- "DATA ISSUE\t%s\t%s\t%s",
285
- record["001"].value(),
286
- f"{field.tag} has no non-empty subfields after cleaning, removing field",
287
- field,
288
- )
289
- record.remove_field(field)
290
412
  return record
291
413
 
292
414
 
293
- def fix_leader(record: pymarc.Record) -> pymarc.Record:
415
+ def fix_leader(record: Record, **kwargs) -> Record:
294
416
  """
295
417
  Fixes the leader of the record by setting the record status to 'c' (modified
296
418
  record) and the type of record to 'a' (language material).
297
419
 
298
420
  Args:
299
- record (pymarc.Record): The MARC record to preprocess.
421
+ record (Record): The MARC record to preprocess.
300
422
 
301
423
  Returns:
302
- pymarc.Record: The preprocessed MARC record.
424
+ Record: The preprocessed MARC record.
303
425
  """
304
426
  VALID_STATUSES = ["a", "c", "d", "n", "p"]
305
427
  VALID_TYPES = ["a", "c", "d", "e", "f", "g", "i", "j", "k", "m", "o", "p", "r", "t"]
@@ -309,7 +431,7 @@ def fix_leader(record: pymarc.Record) -> pymarc.Record:
309
431
  "DATA ISSUE\t%s\t%s\t%s",
310
432
  record["001"].value(),
311
433
  f"Invalid record status: {record.leader[5]}, setting to 'c'",
312
- record,
434
+ record.leader,
313
435
  )
314
436
  record.leader = pymarc.Leader(record.leader[:5] + "c" + record.leader[6:])
315
437
  if record.leader[6] not in VALID_TYPES:
@@ -318,11 +440,40 @@ def fix_leader(record: pymarc.Record) -> pymarc.Record:
318
440
  "DATA ISSUE\t%s\t%s\t%s",
319
441
  record["001"].value(),
320
442
  f"Invalid record type: {record.leader[6]}, setting to 'a'",
321
- record,
443
+ record.leader,
322
444
  )
323
445
  record.leader = pymarc.Leader(record.leader[:6] + "a" + record.leader[7:])
324
446
  return record
325
447
 
448
+ def move_authority_subfield_9_to_0_all_controllable_fields(record: Record, **kwargs) -> Record:
449
+ """
450
+ Move subfield 9 from authority fields to subfield 0. This is useful when
451
+ importing records from the ABES SUDOC catalog.
452
+
453
+ Args:
454
+ record (Record): The MARC record to preprocess.
455
+
456
+ Returns:
457
+ Record: The preprocessed MARC record.
458
+ """
459
+ controlled_fields = [
460
+ "100", "110", "111", "130",
461
+ "600", "610", "611", "630", "650", "651", "655",
462
+ "700", "710", "711", "730",
463
+ "800", "810", "811", "830"
464
+ ]
465
+ for field in record.get_fields(*controlled_fields):
466
+ for subfield in list(field.get_subfields("9")):
467
+ field.add_subfield("0", subfield)
468
+ field.delete_subfield("9", subfield)
469
+ logger.log(
470
+ 26,
471
+ "DATA ISSUE\t%s\t%s\t%s",
472
+ record["001"].value(),
473
+ f"Subfield 9 moved to subfield 0 in {field.tag}",
474
+ field,
475
+ )
476
+ return record
326
477
 
327
478
  def ordinal(n):
328
479
  s = ("th", "st", "nd", "rd") + ("th",) * 10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: folio_data_import
3
- Version: 0.2.8rc12
3
+ Version: 0.3.0
4
4
  Summary: A python module to interact with the data importing capabilities of the open-source FOLIO ILS
5
5
  License: MIT
6
6
  Author: Brooks Travis
@@ -19,8 +19,7 @@ Requires-Dist: flake8-black (>=0.3.6,<0.4.0)
19
19
  Requires-Dist: flake8-bugbear (>=24.8.19,<25.0.0)
20
20
  Requires-Dist: flake8-docstrings (>=1.7.0,<2.0.0)
21
21
  Requires-Dist: flake8-isort (>=6.1.1,<7.0.0)
22
- Requires-Dist: folioclient (>=0.61.0,<0.62.0)
23
- Requires-Dist: httpx (>=0.27.2,<0.28.0)
22
+ Requires-Dist: folioclient (>=0.70.1,<0.71.0)
24
23
  Requires-Dist: inquirer (>=3.4.0,<4.0.0)
25
24
  Requires-Dist: pyhumps (>=3.8.0,<4.0.0)
26
25
  Requires-Dist: pymarc (>=5.2.2,<6.0.0)
@@ -0,0 +1,12 @@
1
+ folio_data_import/MARCDataImport.py,sha256=je3TdCdaDR-gYA3Gh1k4AX9l3v83sCTt4Y9lOFxayu8,36220
2
+ folio_data_import/UserImport.py,sha256=ZulGaGJhI_N5vmR69YF_qbzbGeVyzcthXklSjDpZCyA,40998
3
+ folio_data_import/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ folio_data_import/__main__.py,sha256=kav_uUsnrIjGjVxQkk3exLKrc1mah9t2x3G6bGS-5I0,3710
5
+ folio_data_import/custom_exceptions.py,sha256=xOeIbM86d2r5-z3ul4JFTJLT3vI3kwmEq62cWS-9dOc,646
6
+ folio_data_import/marc_preprocessors/__init__.py,sha256=urExfNTQoZsDCtDPcUY9EEC5OFcUihxhYEQkQFVzbMY,30
7
+ folio_data_import/marc_preprocessors/_preprocessors.py,sha256=4Zrp-9LdL7f5QqUTOjyMkK5IaHP2YOkmkqoY_4o585Q,16377
8
+ folio_data_import-0.3.0.dist-info/LICENSE,sha256=qJX7wxMC7ky9Kq4v3zij8MjGEiC5wsB7pYeOhLj5TDk,1083
9
+ folio_data_import-0.3.0.dist-info/METADATA,sha256=Aqf0PXhdwFyChMKvl9cOluKN60IyMAUPDKSpb8AOlXI,6069
10
+ folio_data_import-0.3.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
11
+ folio_data_import-0.3.0.dist-info/entry_points.txt,sha256=498SxWVXeEMRNw3PUf-eoReZvKewmYwPBtZhIUPr_Jg,192
12
+ folio_data_import-0.3.0.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 2.1.1
2
+ Generator: poetry-core 2.1.3
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,11 +0,0 @@
1
- folio_data_import/MARCDataImport.py,sha256=Rs8TuIrC--yXyr4oBQlV1b-pcyNq_6M_lB_SiKmnFb4,37135
2
- folio_data_import/UserImport.py,sha256=Y9ZjYoUP_vNJVftx_xUcbBqvC5CwWeuzlmCcSVQfzgo,40976
3
- folio_data_import/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- folio_data_import/__main__.py,sha256=kav_uUsnrIjGjVxQkk3exLKrc1mah9t2x3G6bGS-5I0,3710
5
- folio_data_import/marc_preprocessors/__init__.py,sha256=urExfNTQoZsDCtDPcUY9EEC5OFcUihxhYEQkQFVzbMY,30
6
- folio_data_import/marc_preprocessors/_preprocessors.py,sha256=4i1_lEnptzZDx3DojX9sfvJ_hmehwFJUC3aZsUADcwA,10851
7
- folio_data_import-0.2.8rc12.dist-info/LICENSE,sha256=qJX7wxMC7ky9Kq4v3zij8MjGEiC5wsB7pYeOhLj5TDk,1083
8
- folio_data_import-0.2.8rc12.dist-info/METADATA,sha256=LQbv4kjrCutjgvLXTHHNb6UfpKxvlNTCvVb7FbBOhk4,6113
9
- folio_data_import-0.2.8rc12.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
10
- folio_data_import-0.2.8rc12.dist-info/entry_points.txt,sha256=498SxWVXeEMRNw3PUf-eoReZvKewmYwPBtZhIUPr_Jg,192
11
- folio_data_import-0.2.8rc12.dist-info/RECORD,,