folio-data-import 0.2.7-py3-none-any.whl → 0.2.8.post1-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.

Note: this release of folio-data-import has been flagged as potentially problematic.

@@ -1,18 +1,21 @@
  import argparse
  import asyncio
+ import datetime
  import glob
  import importlib
  import io
+ import logging
+ import math
  import os
  import sys
- from typing import List
  import uuid
  from contextlib import ExitStack
- import datetime
  from datetime import datetime as dt
+ from functools import cached_property
  from getpass import getpass
  from pathlib import Path
  from time import sleep
+ from typing import List, Union

  import folioclient
  import httpx
@@ -22,7 +25,6 @@ import tabulate
  from humps import decamelize
  from tqdm import tqdm

-
  try:
  datetime_utc = datetime.UTC
  except AttributeError:
@@ -36,6 +38,18 @@ REPORT_SUMMARY_ORDERING = {"created": 0, "updated": 1, "discarded": 2, "error":
  RETRY_TIMEOUT_START = 1
  RETRY_TIMEOUT_RETRY_FACTOR = 2

+ # Custom log level for data issues, set to 26
+ DATA_ISSUE_LVL_NUM = 26
+ logging.addLevelName(DATA_ISSUE_LVL_NUM, "DATA_ISSUES")
+
+ def data_issues(self, msg, *args, **kws):
+ if self.isEnabledFor(DATA_ISSUE_LVL_NUM):
+ self._log(DATA_ISSUE_LVL_NUM, msg, args, **kws)
+
+ logging.Logger.data_issues = data_issues
+
+ logger = logging.getLogger(__name__)
+
  class MARCImportJob:
  """
  Class to manage importing MARC data (Bib, Authority) into FOLIO using the Change Manager
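
The new module-level block above registers a custom DATA_ISSUES log level (numeric value 26) and attaches a data_issues() helper to logging.Logger, so data-quality problems can later be routed to their own handler. As a rough, standalone sketch of that pattern (illustrative only, not code from the package):

    import logging

    DATA_ISSUE_LVL_NUM = 26
    logging.addLevelName(DATA_ISSUE_LVL_NUM, "DATA_ISSUES")

    def data_issues(self, msg, *args, **kwargs):
        # Emit only when the custom level is enabled for this logger.
        if self.isEnabledFor(DATA_ISSUE_LVL_NUM):
            self._log(DATA_ISSUE_LVL_NUM, msg, args, **kwargs)

    logging.Logger.data_issues = data_issues

    log = logging.getLogger("demo")
    log.setLevel(logging.INFO)
    log.addHandler(logging.StreamHandler())
    # Tab-separated fields, mirroring the RECORD FAILED messages emitted later in the diff.
    log.data_issues("RECORD FAILED\t%s\t%s", "example.mrc:12", "unreadable record")
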
@@ -56,7 +70,6 @@ class MARCImportJob:
  bad_records_file: io.TextIOWrapper
  failed_batches_file: io.TextIOWrapper
  job_id: str
- job_import_profile: dict
  pbar_sent: tqdm
  pbar_imported: tqdm
  http_client: httpx.Client
@@ -66,6 +79,11 @@ class MARCImportJob:
  last_current: int = 0
  total_records_sent: int = 0
  finished: bool = False
+ job_id: str = ""
+ job_hrid: int = 0
+ current_file: Union[List[Path],List[io.BytesIO]] = []
+ _max_summary_retries: int = 2
+ _summary_retries: int = 0

  def __init__(
  self,
@@ -77,9 +95,19 @@ class MARCImportJob:
  marc_record_preprocessor=None,
  consolidate=False,
  no_progress=False,
+ let_summary_fail=False,
+ split_files=False,
+ split_size=1000,
+ split_offset=0,
  ) -> None:
  self.consolidate_files = consolidate
+ self.split_files = split_files
+ self.split_size = split_size
+ self.split_offset = split_offset
+ if self.split_files and self.consolidate_files:
+ raise ValueError("Cannot consolidate and split files at the same time.")
  self.no_progress = no_progress
+ self.let_summary_fail = let_summary_fail
  self.folio_client: folioclient.FolioClient = folio_client
  self.import_files = marc_files
  self.import_profile_name = import_profile_name
@@ -93,38 +121,69 @@ class MARCImportJob:
  Performs the necessary work for data import.

  This method initializes an HTTP client, files to store records that fail to send,
- and calls `self.import_marc_records` to import MARC files. If `consolidate_files` is True,
- it imports all the files specified in `import_files` as a single batch. Otherwise,
- it imports each file as a separate import job.
+ and calls the appropriate method to import MARC files based on the configuration.

  Returns:
  None
  """
- with httpx.Client() as http_client, open(
- self.import_files[0].parent.joinpath(
- f"bad_marc_records_{dt.now(tz=datetime_utc).strftime('%Y%m%d%H%M%S')}.mrc"
- ),
- "wb+",
- ) as bad_marc_file, open(
- self.import_files[0].parent.joinpath(
- f"failed_batches_{dt.now(tz=datetime_utc).strftime('%Y%m%d%H%M%S')}.mrc"
- ),
- "wb+",
- ) as failed_batches:
+ with (
+ httpx.Client() as http_client,
+ open(
+ self.import_files[0].parent.joinpath(
+ f"bad_marc_records_{dt.now(tz=datetime_utc).strftime('%Y%m%d%H%M%S')}.mrc"
+ ),
+ "wb+",
+ ) as bad_marc_file,
+ open(
+ self.import_files[0].parent.joinpath(
+ f"failed_batches_{dt.now(tz=datetime_utc).strftime('%Y%m%d%H%M%S')}.mrc"
+ ),
+ "wb+",
+ ) as failed_batches,
+ ):
  self.bad_records_file = bad_marc_file
- print(f"Writing bad records to {self.bad_records_file.name}")
+ logger.info(f"Writing bad records to {self.bad_records_file.name}")
  self.failed_batches_file = failed_batches
- print(f"Writing failed batches to {self.failed_batches_file.name}")
+ logger.info(f"Writing failed batches to {self.failed_batches_file.name}")
  self.http_client = http_client
  if self.consolidate_files:
- self.current_file = self.import_files
- await self.import_marc_file()
+ await self.process_consolidated_import()
+ elif self.split_files:
+ await self.process_split_files()
  else:
  for file in self.import_files:
  self.current_file = [file]
  await self.import_marc_file()
  await self.wrap_up()

+ async def process_split_files(self):
+ """
+ Process the import of files in smaller batches.
+ This method is called when `split_files` is set to True.
+ It splits each file into smaller chunks and processes them one by one.
+ """
+ for file in self.import_files:
+ with open(file, "rb") as f:
+ file_length = await self.read_total_records([f])
+ expected_batches = math.ceil(file_length /self.split_size)
+ logger.info(f"{file.name} contains {file_length} records. Splitting into {expected_batches} {self.split_size} record batches.")
+ zero_pad_parts = len(str(expected_batches)) if expected_batches > 1 else 2
+ for idx, batch in enumerate(self.split_marc_file(file, self.split_size), start=1):
+ if idx > self.split_offset:
+ batch.name = f"{file.name} (Part {idx:0{zero_pad_parts}})"
+ self.current_file = [batch]
+ await self.import_marc_file()
+ self.move_file_to_complete(file)
+
+ async def process_consolidated_import(self):
+ """
+ Process the import of files as a single batch.
+ This method is called when `consolidate_files` is set to True.
+ It creates a single job for all files and processes them together.
+ """
+ self.current_file = self.import_files
+ await self.import_marc_file()
+
  async def wrap_up(self) -> None:
  """
  Wraps up the data import process.
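
One small detail in process_split_files() above: part numbers in the generated batch names are zero-padded to the width of the expected batch count via a nested format spec. For example (illustrative):

    width = 3  # len(str(expected_batches))
    for idx in (1, 2, 10):
        print(f"Part {idx:0{width}}")  # -> Part 001, Part 002, Part 010

The inner {width} is substituted first, so the format spec becomes "03".
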
@@ -135,16 +194,16 @@ class MARCImportJob:
  Returns:
  None
  """
- self.bad_records_file.seek(0)
- if not self.bad_records_file.read(1):
- os.remove(self.bad_records_file.name)
- print("No bad records found. Removing bad records file.")
- self.failed_batches_file.seek(0)
- if not self.failed_batches_file.read(1):
- os.remove(self.failed_batches_file.name)
- print("No failed batches. Removing failed batches file.")
- print("Import complete.")
- print(f"Total records imported: {self.total_records_sent}")
+ with open(self.bad_records_file.name, "rb") as bad_records:
+ if not bad_records.read(1):
+ os.remove(bad_records.name)
+ logger.info("No bad records found. Removing bad records file.")
+ with open(self.failed_batches_file.name, "rb") as failed_batches:
+ if not failed_batches.read(1):
+ os.remove(failed_batches.name)
+ logger.info("No failed batches. Removing failed batches file.")
+ logger.info("Import complete.")
+ logger.info(f"Total records imported: {self.total_records_sent}")

  async def get_job_status(self) -> None:
  """
@@ -158,38 +217,69 @@ class MARCImportJob:
  """
  try:
  self.current_retry_timeout = (
- self.current_retry_timeout * RETRY_TIMEOUT_RETRY_FACTOR
- ) if self.current_retry_timeout else RETRY_TIMEOUT_START
- job_status = self.folio_client.folio_get(
- "/metadata-provider/jobExecutions?statusNot=DISCARDED&uiStatusAny"
- "=PREPARING_FOR_PREVIEW&uiStatusAny=READY_FOR_PREVIEW&uiStatusAny=RUNNING&limit=50"
+ (self.current_retry_timeout * RETRY_TIMEOUT_RETRY_FACTOR)
+ if self.current_retry_timeout
+ else RETRY_TIMEOUT_START
  )
- self.current_retry_timeout = None
- except httpx.ConnectTimeout:
- sleep(.25)
  with httpx.Client(
  timeout=self.current_retry_timeout,
- verify=self.folio_client.ssl_verify
+ verify=self.folio_client.ssl_verify,
  ) as temp_client:
- self.folio_client.httpx_client = temp_client
- return await self.get_job_status()
+ job_status = self.folio_client.folio_get(
+ "/metadata-provider/jobExecutions?statusNot=DISCARDED&uiStatusAny"
+ "=PREPARING_FOR_PREVIEW&uiStatusAny=READY_FOR_PREVIEW&uiStatusAny=RUNNING&limit=50"
+ )
+ self.current_retry_timeout = None
+ except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
+ if not hasattr(e, "response") or e.response.status_code in [502, 504]:
+ error_text = e.response.text if hasattr(e, "response") else str(e)
+ logger.warning(f"SERVER ERROR fetching job status: {error_text}. Retrying.")
+ sleep(0.25)
+ with httpx.Client(
+ timeout=self.current_retry_timeout,
+ verify=self.folio_client.ssl_verify,
+ ) as temp_client:
+ self.folio_client.httpx_client = temp_client
+ return await self.get_job_status()
+ else:
+ raise e
+ except Exception as e:
+ logger.error(f"Error fetching job status. {e}")
+
  try:
  status = [
  job for job in job_status["jobExecutions"] if job["id"] == self.job_id
  ][0]
  self.pbar_imported.update(status["progress"]["current"] - self.last_current)
  self.last_current = status["progress"]["current"]
- except IndexError:
- job_status = self.folio_client.folio_get(
- "/metadata-provider/jobExecutions?limit=100&sortBy=completed_date%2Cdesc&statusAny"
- "=COMMITTED&statusAny=ERROR&statusAny=CANCELLED"
- )
- status = [
- job for job in job_status["jobExecutions"] if job["id"] == self.job_id
- ][0]
- self.pbar_imported.update(status["progress"]["current"] - self.last_current)
- self.last_current = status["progress"]["current"]
- self.finished = True
+ except (IndexError, ValueError, KeyError):
+ logger.debug(f"No active job found with ID {self.job_id}. Checking for finished job.")
+ try:
+ job_status = self.folio_client.folio_get(
+ "/metadata-provider/jobExecutions?limit=100&sortBy=completed_date%2Cdesc&statusAny"
+ "=COMMITTED&statusAny=ERROR&statusAny=CANCELLED"
+ )
+ status = [
+ job for job in job_status["jobExecutions"] if job["id"] == self.job_id
+ ][0]
+ self.pbar_imported.update(status["progress"]["current"] - self.last_current)
+ self.last_current = status["progress"]["current"]
+ self.finished = True
+ except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
+ if not hasattr(e, "response") or e.response.status_code in [502, 504]:
+ error_text = e.response.text if hasattr(e, "response") else str(e)
+ logger.warning(
+ f"SERVER ERROR fetching job status: {error_text}. Retrying."
+ )
+ sleep(0.25)
+ with httpx.Client(
+ timeout=self.current_retry_timeout,
+ verify=self.folio_client.ssl_verify,
+ ) as temp_client:
+ self.folio_client.httpx_client = temp_client
+ return await self.get_job_status()
+ else:
+ raise e

  async def create_folio_import_job(self) -> None:
  """
@@ -201,26 +291,36 @@ class MARCImportJob:
  Raises:
  HTTPError: If there is an error creating the job.
  """
- create_job = self.http_client.post(
- self.folio_client.okapi_url + "/change-manager/jobExecutions",
- headers=self.folio_client.okapi_headers,
- json={"sourceType": "ONLINE", "userId": self.folio_client.current_user},
- )
  try:
- create_job.raise_for_status()
- except httpx.HTTPError as e:
- print(
- "Error creating job: "
- + str(e)
- + "\n"
- + getattr(getattr(e, "response", ""), "text", "")
+ create_job = self.http_client.post(
+ self.folio_client.gateway_url + "/change-manager/jobExecutions",
+ headers=self.folio_client.okapi_headers,
+ json={"sourceType": "ONLINE", "userId": self.folio_client.current_user},
  )
- raise e
+ create_job.raise_for_status()
+ except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
+ if not hasattr(e, "response") or e.response.status_code in [502, 504]:
+ logger.warning(f"SERVER ERROR creating job: {e}. Retrying.")
+ sleep(0.25)
+ return await self.create_folio_import_job()
+ else:
+ logger.error(
+ "Error creating job: "
+ + str(e)
+ + "\n"
+ + getattr(getattr(e, "response", ""), "text", "")
+ )
+ raise e
  self.job_id = create_job.json()["parentJobExecutionId"]
+ logger.info(f"Created job: {self.job_id}")

- async def get_import_profile(self) -> None:
+ @cached_property
+ def import_profile(self) -> dict:
  """
- Retrieves the import profile with the specified name.
+ Returns the import profile for the current job execution.
+
+ Returns:
+ dict: The import profile for the current job execution.
  """
  import_profiles = self.folio_client.folio_get(
  "/data-import-profiles/jobProfiles",
@@ -232,7 +332,7 @@ class MARCImportJob:
  for profile in import_profiles
  if profile["name"] == self.import_profile_name
  ][0]
- self.job_import_profile = profile
+ return profile

  async def set_job_profile(self) -> None:
  """
@@ -242,21 +342,23 @@ class MARCImportJob:
  The response from the HTTP request to set the job profile.
  """
  set_job_profile = self.http_client.put(
- self.folio_client.okapi_url
+ self.folio_client.gateway_url
  + "/change-manager/jobExecutions/"
  + self.job_id
  + "/jobProfile",
  headers=self.folio_client.okapi_headers,
  json={
- "id": self.job_import_profile["id"],
- "name": self.job_import_profile["name"],
+ "id": self.import_profile["id"],
+ "name": self.import_profile["name"],
  "dataType": "MARC",
  },
  )
  try:
  set_job_profile.raise_for_status()
+ self.job_hrid = set_job_profile.json()['hrId']
+ logger.info(f"Job HRID: {self.job_hrid}")
  except httpx.HTTPError as e:
- print(
+ logger.error(
  "Error creating job: "
  + str(e)
  + "\n"
@@ -264,7 +366,8 @@ class MARCImportJob:
  )
  raise e

- async def read_total_records(self, files) -> int:
+ @staticmethod
+ async def read_total_records(files) -> int:
  """
  Reads the total number of records from the given files.

@@ -277,7 +380,7 @@ class MARCImportJob:
  total_records = 0
  for import_file in files:
  while True:
- chunk = import_file.read(1024)
+ chunk = import_file.read(104857600)
  if not chunk:
  break
  total_records += chunk.count(b"\x1d")
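
read_total_records() counts records by counting MARC21 record terminators (the 0x1D byte); the change above simply reads 100 MiB chunks (104857600 bytes) instead of 1 KiB, which greatly reduces the number of read calls on large files. The same idea as a small standalone function (illustrative):

    def count_marc_records(path: str, chunk_size: int = 100 * 1024 * 1024) -> int:
        # Each MARC21 record ends with a record terminator byte, 0x1D.
        total = 0
        with open(path, "rb") as fh:
            while chunk := fh.read(chunk_size):
                total += chunk.count(b"\x1d")
        return total
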
@@ -291,24 +394,41 @@ class MARCImportJob:
  Args:
  batch_payload (dict): A records payload containing the current batch of MARC records.
  """
- post_batch = self.http_client.post(
- self.folio_client.okapi_url
- + f"/change-manager/jobExecutions/{self.job_id}/records",
- headers=self.folio_client.okapi_headers,
- json=batch_payload,
- )
+ try:
+ post_batch = self.http_client.post(
+ self.folio_client.gateway_url
+ + f"/change-manager/jobExecutions/{self.job_id}/records",
+ headers=self.folio_client.okapi_headers,
+ json=batch_payload,
+ )
+ # if batch_payload["recordsMetadata"]["last"]:
+ # logger.log(
+ # 25,
+ # f"Sending last batch of {batch_payload['recordsMetadata']['total']} records.",
+ # )
+ except (httpx.ConnectTimeout, httpx.ReadTimeout):
+ sleep(0.25)
+ return await self.process_record_batch(batch_payload)
  try:
  post_batch.raise_for_status()
  self.total_records_sent += len(self.record_batch)
  self.record_batch = []
  self.pbar_sent.update(len(batch_payload["initialRecords"]))
  except Exception as e:
- print("Error posting batch: " + str(e))
- for record in self.record_batch:
- self.failed_batches_file.write(record)
- self.error_records += len(self.record_batch)
- self.pbar_sent.total = self.pbar_sent.total - len(self.record_batch)
- self.record_batch = []
+ if (
+ hasattr(e, "response") and e.response.status_code in [500, 422]
+ ): # TODO: #26 Check for specific error code once https://folio-org.atlassian.net/browse/MODSOURMAN-1281 is resolved
+ self.total_records_sent += len(self.record_batch)
+ self.record_batch = []
+ self.pbar_sent.update(len(batch_payload["initialRecords"]))
+ else:
+ logger.error("Error posting batch: " + str(e))
+ for record in self.record_batch:
+ self.failed_batches_file.write(record)
+ self.error_records += len(self.record_batch)
+ self.pbar_sent.total = self.pbar_sent.total - len(self.record_batch)
+ self.record_batch = []
+ await self.get_job_status()
  sleep(self.batch_delay)

  async def process_records(self, files, total_records) -> None:
@@ -325,16 +445,21 @@ class MARCImportJob:
  """
  counter = 0
  for import_file in files:
+ file_path = Path(import_file.name)
  self.pbar_sent.set_description(
  f"Sent ({os.path.basename(import_file.name)}): "
  )
  reader = pymarc.MARCReader(import_file, hide_utf8_warnings=True)
- for record in reader:
+ for idx, record in enumerate(reader, start=1):
  if len(self.record_batch) == self.batch_size:
  await self.process_record_batch(
- await self.create_batch_payload(counter, total_records, False),
+ await self.create_batch_payload(
+ counter,
+ total_records,
+ (counter - self.error_records)
+ == (total_records - self.error_records),
+ ),
  )
- await self.get_job_status()
  sleep(0.25)
  if record:
  if self.marc_record_preprocessor:
344
469
  self.record_batch.append(record.as_marc())
345
470
  counter += 1
346
471
  else:
472
+ logger.data_issues(
473
+ "RECORD FAILED\t%s\t%s\t%s",
474
+ f"{file_path.name}:{idx}",
475
+ f"Error reading {idx} record from {file_path}. Skipping. Writing current chunk to {self.bad_records_file.name}.",
476
+ "",
477
+ )
347
478
  self.bad_records_file.write(reader.current_chunk)
348
479
  if self.record_batch:
349
480
  await self.process_record_batch(
350
- await self.create_batch_payload(counter, total_records, True),
481
+ await self.create_batch_payload(
482
+ counter,
483
+ total_records,
484
+ (counter - self.error_records)
485
+ == (total_records - self.error_records),
486
+ ),
351
487
  )
488
+ if not self.split_files:
489
+ self.move_file_to_complete(file_path)
490
+
491
+ def move_file_to_complete(self, file_path):
492
+ import_complete_path = file_path.parent.joinpath("import_complete")
493
+ if not import_complete_path.exists():
494
+ logger.debug(f"Creating import_complete directory: {import_complete_path.absolute()}")
495
+ import_complete_path.mkdir(exist_ok=True)
496
+ logger.debug(f"Moving {file_path} to {import_complete_path.absolute()}")
497
+ file_path.rename(
498
+ file_path.parent.joinpath("import_complete", file_path.name)
499
+ )
352
500
 
353
501
  @staticmethod
354
- async def apply_marc_record_preprocessing(record: pymarc.Record, func_or_path) -> pymarc.Record:
502
+ async def apply_marc_record_preprocessing(
503
+ record: pymarc.Record, func_or_path
504
+ ) -> pymarc.Record:
355
505
  """
356
506
  Apply preprocessing to the MARC record before sending it to FOLIO.
357
507
 
@@ -363,25 +513,42 @@ class MARCImportJob:
363
513
  pymarc.Record: The preprocessed MARC record.
364
514
  """
365
515
  if isinstance(func_or_path, str):
366
- try:
367
- path_parts = func_or_path.rsplit('.')
368
- module_path, func_name = ".".join(path_parts[:-1]), path_parts[-1]
369
- module = importlib.import_module(module_path)
370
- func = getattr(module, func_name)
371
- except (ImportError, AttributeError) as e:
372
- print(f"Error importing preprocessing function {func_or_path}: {e}. Skipping preprocessing.")
373
- return record
516
+ func_paths = func_or_path.split(",")
517
+ for func_path in func_paths:
518
+ record = await MARCImportJob._apply_single_marc_record_preprocessing_by_path(
519
+ record, func_path
520
+ )
374
521
  elif callable(func_or_path):
375
- func = func_or_path
522
+ record = func_or_path(record)
376
523
  else:
377
- print(f"Invalid preprocessing function: {func_or_path}. Skipping preprocessing.")
378
- return record
524
+ logger.warning(
525
+ f"Invalid preprocessing function: {func_or_path}. Skipping preprocessing."
526
+ )
527
+ return record
528
+
529
+ async def _apply_single_marc_record_preprocessing_by_path(
530
+ record: pymarc.Record, func_path: str
531
+ ) -> pymarc.Record:
532
+ """
533
+ Apply a single preprocessing function to the MARC record.
534
+
535
+ Args:
536
+ record (pymarc.Record): The MARC record to preprocess.
537
+ func_path (str): The path to the preprocessing function.
379
538
 
539
+ Returns:
540
+ pymarc.Record: The preprocessed MARC record.
541
+ """
380
542
  try:
381
- return func(record)
543
+ module_path, func_name = func_path.rsplit(".", 1)
544
+ module = importlib.import_module(module_path)
545
+ func = getattr(module, func_name)
546
+ record = func(record)
382
547
  except Exception as e:
383
- print(f"Error applying preprocessing function: {e}. Skipping preprocessing.")
384
- return record
548
+ logger.warning(
549
+ f"Error applying preprocessing function {func_path}: {e}. Skipping."
550
+ )
551
+ return record
385
552
 
386
553
  async def create_batch_payload(self, counter, total_records, is_last) -> dict:
387
554
  """
@@ -406,6 +573,46 @@ class MARCImportJob:
406
573
  "initialRecords": [{"record": x.decode()} for x in self.record_batch],
407
574
  }
408
575
 
576
+ @staticmethod
577
+ def split_marc_file(file_path, batch_size):
578
+ """Generator to iterate over MARC records in batches, yielding BytesIO objects."""
579
+ with open(file_path, "rb") as f:
580
+ batch = io.BytesIO()
581
+ count = 0
582
+
583
+ while True:
584
+ leader = f.read(24)
585
+ if not leader:
586
+ break # End of file
587
+
588
+ try:
589
+ record_length = int(leader[:5]) # Extract record length from leader
590
+ except ValueError:
591
+ raise ValueError("Invalid MARC record length encountered.")
592
+
593
+ record_body = f.read(record_length - 24)
594
+ if len(record_body) != record_length - 24:
595
+ raise ValueError("Unexpected end of file while reading MARC record.")
596
+
597
+ # Verify record terminator
598
+ if record_body[-1:] != b'\x1D':
599
+ raise ValueError("MARC record does not end with the expected terminator (0x1D).")
600
+
601
+ # Write the full record to the batch buffer
602
+ batch.write(leader + record_body)
603
+ count += 1
604
+
605
+ if count >= batch_size:
606
+ batch.seek(0)
607
+ yield batch
608
+ batch = io.BytesIO() # Reset buffer
609
+ count = 0
610
+
611
+ # Yield any remaining records
612
+ if count > 0:
613
+ batch.seek(0)
614
+ yield batch
615
+
409
616
  async def import_marc_file(self) -> None:
410
617
  """
411
618
  Imports MARC file into the system.
@@ -425,24 +632,37 @@ class MARCImportJob:
425
632
  None
426
633
  """
427
634
  await self.create_folio_import_job()
428
- await self.get_import_profile()
429
635
  await self.set_job_profile()
430
636
  with ExitStack() as stack:
431
- files = [
432
- stack.enter_context(open(file, "rb")) for file in self.current_file
433
- ]
637
+ try:
638
+ if isinstance(self.current_file[0], Path):
639
+ files = [
640
+ stack.enter_context(open(file, "rb")) for file in self.current_file
641
+ ]
642
+ elif isinstance(self.current_file[0], io.BytesIO):
643
+ files = [
644
+ stack.enter_context(file) for file in self.current_file
645
+ ]
646
+ else:
647
+ raise ValueError("Invalid file type. Must be Path or BytesIO.")
648
+ except IndexError as e:
649
+ logger.error(f"Error opening file: {e}")
650
+ raise e
434
651
  total_records = await self.read_total_records(files)
435
- with tqdm(
436
- desc="Imported: ",
437
- total=total_records,
438
- position=1,
439
- disable=self.no_progress,
440
- ) as pbar_imported, tqdm(
441
- desc="Sent: ()",
442
- total=total_records,
443
- position=0,
444
- disable=self.no_progress,
445
- ) as pbar_sent:
652
+ with (
653
+ tqdm(
654
+ desc=f"Imported ({self.job_hrid}): ",
655
+ total=total_records,
656
+ position=1,
657
+ disable=self.no_progress,
658
+ ) as pbar_imported,
659
+ tqdm(
660
+ desc="Sent: ()",
661
+ total=total_records,
662
+ position=0,
663
+ disable=self.no_progress,
664
+ ) as pbar_sent,
665
+ ):
446
666
  self.pbar_sent = pbar_sent
447
667
  self.pbar_imported = pbar_imported
448
668
  await self.process_records(files, total_records)
@@ -450,37 +670,45 @@ class MARCImportJob:
450
670
  await self.get_job_status()
451
671
  sleep(1)
452
672
  if self.finished:
453
- job_summary = await self.get_job_summary()
454
- job_summary.pop("jobExecutionId")
455
- job_summary.pop("totalErrors")
456
- columns = ["Summary"] + list(job_summary.keys())
457
- rows = set()
458
- for key in columns[1:]:
459
- rows.update(job_summary[key].keys())
460
-
461
- table_data = []
462
- for row in rows:
463
- metric_name = decamelize(row).split("_")[1]
464
- table_row = [metric_name]
465
- for col in columns[1:]:
466
- table_row.append(job_summary[col].get(row, "N/A"))
467
- table_data.append(table_row)
468
- table_data.sort(key=lambda x: REPORT_SUMMARY_ORDERING.get(x[0], 99))
469
- columns = columns[:1] + [
470
- " ".join(decamelize(x).split("_")[:-1]) for x in columns[1:]
471
- ]
472
- print(
473
- f"Results for {'file' if len(self.current_file) == 1 else 'files'}: "
474
- f"{', '.join([os.path.basename(x.name) for x in self.current_file])}"
475
- )
476
- print(
477
- tabulate.tabulate(
478
- table_data, headers=columns, tablefmt="fancy_grid"
479
- ),
480
- )
673
+ await self.log_job_summary()
481
674
  self.last_current = 0
482
675
  self.finished = False
483
676
 
677
+ async def log_job_summary(self):
678
+ if job_summary := await self.get_job_summary():
679
+ job_id = job_summary.pop("jobExecutionId", None)
680
+ total_errors = job_summary.pop("totalErrors", 0)
681
+ columns = ["Summary"] + list(job_summary.keys())
682
+ rows = set()
683
+ for key in columns[1:]:
684
+ rows.update(job_summary[key].keys())
685
+
686
+ table_data = []
687
+ for row in rows:
688
+ metric_name = decamelize(row).split("_")[1]
689
+ table_row = [metric_name]
690
+ for col in columns[1:]:
691
+ table_row.append(job_summary[col].get(row, "N/A"))
692
+ table_data.append(table_row)
693
+ table_data.sort(key=lambda x: REPORT_SUMMARY_ORDERING.get(x[0], 99))
694
+ columns = columns[:1] + [
695
+ " ".join(decamelize(x).split("_")[:-1]) for x in columns[1:]
696
+ ]
697
+ logger.info(
698
+ f"Results for {'file' if len(self.current_file) == 1 else 'files'}: "
699
+ f"{', '.join([os.path.basename(x.name) for x in self.current_file])}"
700
+ )
701
+ logger.info(
702
+ "\n"
703
+ + tabulate.tabulate(
704
+ table_data, headers=columns, tablefmt="fancy_grid"
705
+ ),
706
+ )
707
+ if total_errors:
708
+ logger.info(f"Total errors: {total_errors}. Job ID: {job_id}.")
709
+ else:
710
+ logger.error(f"No job summary available for job #{self.job_hrid}({self.job_id}).")
711
+
484
712
  async def get_job_summary(self) -> dict:
485
713
  """
486
714
  Retrieves the job summary for the current job execution.
@@ -490,23 +718,88 @@ class MARCImportJob:
490
718
  """
491
719
  try:
492
720
  self.current_retry_timeout = (
493
- self.current_retry_timeout * RETRY_TIMEOUT_RETRY_FACTOR
494
- ) if self.current_retry_timeout else RETRY_TIMEOUT_START
495
- job_summary = self.folio_client.folio_get(
496
- f"/metadata-provider/jobSummary/{self.job_id}"
721
+ (self.current_retry_timeout * RETRY_TIMEOUT_RETRY_FACTOR)
722
+ if self.current_retry_timeout
723
+ else RETRY_TIMEOUT_START
497
724
  )
498
- self.current_retry_timeout = None
499
- except httpx.ReadTimeout: #
500
- sleep(.25)
501
725
  with httpx.Client(
502
- timeout=self.current_retry_timeout,
503
- verify=self.folio_client.ssl_verify
726
+ timeout=self.current_retry_timeout, verify=self.folio_client.ssl_verify
504
727
  ) as temp_client:
505
728
  self.folio_client.httpx_client = temp_client
506
- return await self.get_job_summary()
729
+ job_summary = self.folio_client.folio_get(
730
+ f"/metadata-provider/jobSummary/{self.job_id}"
731
+ )
732
+ self.current_retry_timeout = None
733
+ except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
734
+ error_text = e.response.text if hasattr(e, "response") else str(e)
735
+ if (self._max_summary_retries > self._summary_retries) and (not hasattr(e, "response") or (
736
+ hasattr(e, "response") and e.response.status_code in [502, 504]) and not self.let_summary_fail
737
+ ):
738
+ logger.warning(f"SERVER ERROR fetching job summary: {e}. Retrying.")
739
+ sleep(0.25)
740
+ with httpx.Client(
741
+ timeout=self.current_retry_timeout,
742
+ verify=self.folio_client.ssl_verify,
743
+ ) as temp_client:
744
+ self.folio_client.httpx_client = temp_client
745
+ self._summary_retries += 1
746
+ return await self.get_job_summary()
747
+ elif (self._summary_retries >= self._max_summary_retries) or (hasattr(e, "response") and (
748
+ e.response.status_code in [502, 504] and self.let_summary_fail)
749
+ ):
750
+ logger.warning(
751
+ f"SERVER ERROR fetching job summary: {error_text}. Skipping final summary check."
752
+ )
753
+ job_summary = {}
754
+ else:
755
+ raise e
507
756
  return job_summary
508
757
 
509
758
 
759
+ def set_up_cli_logging():
760
+ """
761
+ This function sets up logging for the CLI.
762
+ """
763
+ logger.setLevel(logging.INFO)
764
+ logger.propagate = False
765
+
766
+ # Set up file and stream handlers
767
+ file_handler = logging.FileHandler(
768
+ "folio_data_import_{}.log".format(dt.now().strftime("%Y%m%d%H%M%S"))
769
+ )
770
+ file_handler.setLevel(logging.INFO)
771
+ file_handler.addFilter(ExcludeLevelFilter(DATA_ISSUE_LVL_NUM))
772
+ # file_handler.addFilter(IncludeLevelFilter(25))
773
+ file_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
774
+ file_handler.setFormatter(file_formatter)
775
+ logger.addHandler(file_handler)
776
+
777
+ if not any(
778
+ isinstance(h, logging.StreamHandler) and h.stream == sys.stderr
779
+ for h in logger.handlers
780
+ ):
781
+ stream_handler = logging.StreamHandler(sys.stdout)
782
+ stream_handler.setLevel(logging.INFO)
783
+ stream_handler.addFilter(ExcludeLevelFilter(DATA_ISSUE_LVL_NUM))
784
+ # stream_handler.addFilter(ExcludeLevelFilter(25))
785
+ stream_formatter = logging.Formatter("%(message)s")
786
+ stream_handler.setFormatter(stream_formatter)
787
+ logger.addHandler(stream_handler)
788
+
789
+ # Set up data issues logging
790
+ data_issues_handler = logging.FileHandler(
791
+ "marc_import_data_issues_{}.log".format(dt.now().strftime("%Y%m%d%H%M%S"))
792
+ )
793
+ data_issues_handler.setLevel(26)
794
+ data_issues_handler.addFilter(IncludeLevelFilter(DATA_ISSUE_LVL_NUM))
795
+ data_issues_formatter = logging.Formatter("%(message)s")
796
+ data_issues_handler.setFormatter(data_issues_formatter)
797
+ logger.addHandler(data_issues_handler)
798
+
799
+ # Stop httpx from logging info messages to the console
800
+ logging.getLogger("httpx").setLevel(logging.WARNING)
801
+
802
+
510
803
  async def main() -> None:
511
804
  """
512
805
  Main function to run the MARC import job.
@@ -514,6 +807,7 @@ async def main() -> None:
514
807
  This function parses command line arguments, initializes the FolioClient,
515
808
  and runs the MARCImportJob.
516
809
  """
810
+ set_up_cli_logging()
517
811
  parser = argparse.ArgumentParser()
518
812
  parser.add_argument("--gateway_url", type=str, help="The FOLIO API Gateway URL")
519
813
  parser.add_argument("--tenant_id", type=str, help="The FOLIO tenant ID")
@@ -552,12 +846,15 @@ async def main() -> None:
552
846
  "--preprocessor",
553
847
  type=str,
554
848
  help=(
555
- "The path to a Python module containing a preprocessing function "
556
- "to apply to each MARC record before sending to FOLIO."
849
+ "Comma-separated python import paths to Python function(s) "
850
+ "to apply to each MARC record before sending to FOLIO. Function should take "
851
+ "a pymarc.Record object as input and return a pymarc.Record object."
557
852
  ),
558
853
  default=None,
559
854
  )
560
- parser.add_argument(
855
+ # Add mutually exclusive group for consolidate and split-files options
856
+ group = parser.add_mutually_exclusive_group()
857
+ group.add_argument(
561
858
  "--consolidate",
562
859
  action="store_true",
563
860
  help=(
@@ -565,11 +862,34 @@ async def main() -> None:
565
862
  "Default is to create a new job for each MARC file."
566
863
  ),
567
864
  )
865
+ group.add_argument(
866
+ "--split-files",
867
+ action="store_true",
868
+ help="Split files into smaller parts before importing.",
869
+ )
870
+ parser.add_argument(
871
+ "--split-size",
872
+ type=int,
873
+ help="The number of records to include in each split file.",
874
+ default=1000,
875
+ )
876
+ parser.add_argument(
877
+ "--split-offset",
878
+ type=int,
879
+ help="The number of record batches of <split-size> to skip before starting import.",
880
+ default=0,
881
+ )
882
+
568
883
  parser.add_argument(
569
884
  "--no-progress",
570
885
  action="store_true",
571
886
  help="Disable progress bars (eg. for running in a CI environment)",
572
887
  )
888
+ parser.add_argument(
889
+ "--let-summary-fail",
890
+ action="store_true",
891
+ help="Do not retry fetching the final job summary if it fails",
892
+ )
573
893
  args = parser.parse_args()
574
894
  if not args.password:
575
895
  args.password = getpass("Enter FOLIO password: ")
@@ -586,11 +906,13 @@ async def main() -> None:
586
906
  else:
587
907
  marc_files = list(Path("./").glob(args.marc_file_path))
588
908
 
909
+ marc_files.sort()
910
+
589
911
  if len(marc_files) == 0:
590
- print(f"No files found matching {args.marc_file_path}. Exiting.")
912
+ logger.critical(f"No files found matching {args.marc_file_path}. Exiting.")
591
913
  sys.exit(1)
592
914
  else:
593
- print(marc_files)
915
+ logger.info(marc_files)
594
916
 
595
917
  if not args.import_profile_name:
596
918
  import_profiles = folio_client.folio_get(
@@ -622,12 +944,34 @@ async def main() -> None:
622
944
  marc_record_preprocessor=args.preprocessor,
623
945
  consolidate=bool(args.consolidate),
624
946
  no_progress=bool(args.no_progress),
947
+ let_summary_fail=bool(args.let_summary_fail),
948
+ split_files=bool(args.split_files),
949
+ split_size=args.split_size,
950
+ split_offset=args.split_offset,
625
951
  ).do_work()
626
952
  except Exception as e:
627
- print("Error importing files: " + str(e))
953
+ logger.error("Error importing files: " + str(e))
628
954
  raise
629
955
 
630
956
 
957
+ class ExcludeLevelFilter(logging.Filter):
958
+ def __init__(self, level):
959
+ super().__init__()
960
+ self.level = level
961
+
962
+ def filter(self, record):
963
+ return record.levelno != self.level
964
+
965
+
966
+ class IncludeLevelFilter(logging.Filter):
967
+ def __init__(self, level):
968
+ super().__init__()
969
+ self.level = level
970
+
971
+ def filter(self, record):
972
+ return record.levelno == self.level
973
+
974
+
631
975
  def sync_main() -> None:
632
976
  """
633
977
  Synchronous main function to run the MARC import job.