folio-data-import 0.5.0b3__py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
@@ -0,0 +1,1265 @@
+ """
+ BatchPoster module for FOLIO inventory batch operations.
+
+ This module provides functionality for batch posting of Instances, Holdings, and Items
+ to FOLIO's inventory storage endpoints with support for upsert operations.
+ """
+
+ import asyncio
+ import glob as glob_module
+ import json
+ import logging
+ import sys
+ from datetime import datetime as dt
+ from io import TextIOWrapper
+ from pathlib import Path
+ from typing import Annotated, Any, Dict, Generator, List, Literal, Union
+
+ import cyclopts
+ import folioclient
+ from folioclient import FolioClient
+ import httpx
+ from pydantic import BaseModel, Field
+ from rich.logging import RichHandler
+
+ from folio_data_import import get_folio_connection_parameters
+ from folio_data_import._progress import (
+     NoOpProgressReporter,
+     ProgressReporter,
+     RichProgressReporter,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ class BatchPosterStats(BaseModel):
+     """Statistics for batch posting operations."""
+
+     records_processed: int = 0
+     records_posted: int = 0
+     records_created: int = 0
+     records_updated: int = 0
+     records_failed: int = 0
+     batches_posted: int = 0
+     batches_failed: int = 0
+
+
+ def get_api_info(object_type: str) -> Dict[str, Any]:
+     """
+     Get API endpoint information for a given object type.
+
+     Args:
+         object_type: The type of object (Instances, Holdings, Items)
+
+     Returns:
+         Dictionary containing API endpoint information
+
+     Raises:
+         ValueError: If object_type is not supported
+     """
+     api_info = {
+         "Items": {
+             "object_name": "items",
+             "api_endpoint": "/item-storage/batch/synchronous",
+             "query_endpoint": "/item-storage/items",
+             "is_batch": True,
+             "supports_upsert": True,
+         },
+         "Holdings": {
+             "object_name": "holdingsRecords",
+             "api_endpoint": "/holdings-storage/batch/synchronous",
+             "query_endpoint": "/holdings-storage/holdings",
+             "is_batch": True,
+             "supports_upsert": True,
+         },
+         "Instances": {
+             "object_name": "instances",
+             "api_endpoint": "/instance-storage/batch/synchronous",
+             "query_endpoint": "/instance-storage/instances",
+             "is_batch": True,
+             "supports_upsert": True,
+         },
+     }
+
+     if object_type not in api_info:
+         raise ValueError(
+             f"Unsupported object type: {object_type}. "
+             f"Supported types: {', '.join(api_info.keys())}"
+         )
+
+     return api_info[object_type]
+
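+ # For example, get_api_info("Holdings") returns the holdings-storage endpoints:
+ #     {
+ #         "object_name": "holdingsRecords",
+ #         "api_endpoint": "/holdings-storage/batch/synchronous",
+ #         "query_endpoint": "/holdings-storage/holdings",
+ #         "is_batch": True,
+ #         "supports_upsert": True,
+ #     }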
+
+ def deep_update(target: dict, source: dict) -> None:
+     """
+     Recursively update target dictionary with values from source dictionary.
+
+     Args:
+         target: The dictionary to update
+         source: The dictionary to merge into target
+     """
+     for key, value in source.items():
+         if isinstance(value, dict) and key in target and isinstance(target[key], dict):
+             deep_update(target[key], value)
+         else:
+             target[key] = value
+
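+ # For example, nested keys are merged rather than replaced wholesale:
+ #     target = {"status": {"name": "Available"}, "barcode": "123"}
+ #     deep_update(target, {"status": {"date": "2024-01-01"}})
+ #     # target == {"status": {"name": "Available", "date": "2024-01-01"}, "barcode": "123"}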
+
+ def extract_paths(record: dict, paths: List[str]) -> dict:
+     """
+     Extract specified paths from a record.
+
+     Args:
+         record: The record to extract from
+         paths: List of JSON paths to extract (e.g., ['statisticalCodeIds', 'status'])
+
+     Returns:
+         Dictionary containing only the specified paths
+     """
+     result = {}
+     for path in paths:
+         if path in record:
+             result[path] = record[path]
+     return result
+
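+ # For example, extracting a subset of item fields (only top-level keys are
+ # matched; dotted paths are not traversed):
+ #     extract_paths({"barcode": "123", "status": {"name": "Available"}}, ["barcode"])
+ #     # returns {"barcode": "123"}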
+
+ class BatchPoster:
+     """
+     Handles batch posting of inventory records to FOLIO.
+
+     This class provides functionality for posting Instances, Holdings, and Items
+     to FOLIO's batch inventory endpoints with support for upsert operations.
+     """
+
+     class Config(BaseModel):
+         """Configuration for BatchPoster operations."""
+
+         object_type: Annotated[
+             Literal["Instances", "Holdings", "Items"],
+             Field(
+                 title="Object type",
+                 description="The type of inventory object to post: Instances, Holdings, or Items",
+             ),
+         ]
+         batch_size: Annotated[
+             int,
+             Field(
+                 title="Batch size",
+                 description="Number of records to include in each batch (1-1000)",
+                 ge=1,
+                 le=1000,
+             ),
+         ] = 1
+         upsert: Annotated[
+             bool,
+             Field(
+                 title="Upsert",
+                 description=(
+                     "Enable upsert mode to create new records or update existing ones. "
+                     "When enabled, records with matching IDs will be updated instead "
+                     "of causing errors."
+                 ),
+             ),
+         ] = False
+         preserve_statistical_codes: Annotated[
+             bool,
+             Field(
+                 title="Preserve statistical codes",
+                 description=(
+                     "Preserve existing statistical codes during upsert. "
+                     "When enabled, statistical codes from existing records will be retained "
+                     "and merged with new codes."
+                 ),
+             ),
+         ] = False
+         preserve_administrative_notes: Annotated[
+             bool,
+             Field(
+                 title="Preserve administrative notes",
+                 description=(
+                     "Preserve existing administrative notes during upsert. "
+                     "When enabled, administrative notes from existing records will be retained "
+                     "and merged with new notes."
+                 ),
+             ),
+         ] = False
+         preserve_temporary_locations: Annotated[
+             bool,
+             Field(
+                 title="Preserve temporary locations",
+                 description=(
+                     "Preserve temporary location assignments on items during upsert. "
+                     "Only applicable when object_type is 'Items'."
+                 ),
+             ),
+         ] = False
+         preserve_temporary_loan_types: Annotated[
+             bool,
+             Field(
+                 title="Preserve temporary loan types",
+                 description=(
+                     "Preserve temporary loan type assignments on items during upsert. "
+                     "Only applicable when object_type is 'Items'."
+                 ),
+             ),
+         ] = False
+         preserve_item_status: Annotated[
+             bool,
+             Field(
+                 title="Preserve item status",
+                 description=(
+                     "Preserve item status during upsert. When enabled, the status "
+                     "field from existing records will be retained. Only applicable "
+                     "when object_type is 'Items'."
+                 ),
+             ),
+         ] = True
+         patch_existing_records: Annotated[
+             bool,
+             Field(
+                 title="Patch existing records",
+                 description=(
+                     "Enable selective field patching during upsert. When enabled, only fields "
+                     "specified in patch_paths will be updated, preserving all other fields."
+                 ),
+             ),
+         ] = False
+         patch_paths: Annotated[
+             List[str] | None,
+             Field(
+                 title="Patch paths",
+                 description=(
+                     "List of field paths to patch during upsert "
+                     "(e.g., ['barcode', 'status']). "
+                     "If empty and patch_existing_records is True, all fields "
+                     "will be patched. Use this to selectively update only "
+                     "specific fields while preserving others."
+                 ),
+             ),
+         ] = None
+         no_progress: Annotated[
+             bool,
+             Field(
+                 title="No progress bar",
+                 description="Disable the progress bar display (e.g., for CI environments)",
+             ),
+         ] = False
+
+     def __init__(
+         self,
+         folio_client: FolioClient,
+         config: "BatchPoster.Config",
+         failed_records_file: TextIOWrapper | str | Path | None = None,
+         reporter: ProgressReporter | None = None,
+     ):
+         """
+         Initialize BatchPoster.
+
+         Args:
+             folio_client: Authenticated FOLIO client
+             config: Configuration for batch posting
+             failed_records_file: Optional file handle or path for writing failed records.
+                 Can be an open file handle (managed by caller) or a string/Path
+                 (will be opened/closed by BatchPoster).
+             reporter: Optional progress reporter. If None, uses NoOpProgressReporter.
+         """
+         self.folio_client = folio_client
+         self.config = config
+         self.reporter = reporter or NoOpProgressReporter()
+         self.api_info = get_api_info(config.object_type)
+         self.stats = BatchPosterStats()
+
+         # Handle failed records file
+         self._failed_records_file_handle: TextIOWrapper | None = None
+         self._failed_records_path: Path | None = None
+         self._owns_file_handle = False
+
+         if failed_records_file:
+             if hasattr(failed_records_file, "write"):
+                 # It's a file handle - use it but don't close it
+                 self._failed_records_file_handle = failed_records_file
+                 self._owns_file_handle = False
+             else:
+                 # It's a path - we'll open and manage it
+                 self._failed_records_path = Path(failed_records_file)
+                 self._owns_file_handle = True
+
+         # Validate upsert configuration
+         if config.upsert and not self.api_info["supports_upsert"]:
+             raise ValueError(f"Upsert is not supported for {config.object_type}")
+
+     async def __aenter__(self):
+         """Async context manager entry."""
+         # Open the file if we own it and it's not already open
+         if (
+             self._owns_file_handle
+             and self._failed_records_path
+             and not self._failed_records_file_handle
+         ):
+             self._failed_records_file_handle = open(
+                 self._failed_records_path, "w", encoding="utf-8"
+             )
+             logger.info(f"Opened failed records file: {self._failed_records_path}")
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         """Async context manager exit."""
+         # Only close the file if we opened it
+         if self._owns_file_handle and self._failed_records_file_handle:
+             self._failed_records_file_handle.close()
+             if self._failed_records_path:
+                 logger.info(
+                     f"Wrote {self.stats.records_failed} failed records "
+                     f"to {self._failed_records_path}"
+                 )
+             self._failed_records_file_handle = None
+
+     def _write_failed_record(self, record: dict) -> None:
+         """
+         Write a single failed record to the file immediately.
+
+         Args:
+             record: The record that failed to post
+         """
+         if self._failed_records_file_handle:
+             self._failed_records_file_handle.write(json.dumps(record) + "\n")
+             self._failed_records_file_handle.flush()  # Ensure it's written immediately
+
+     def _write_failed_batch(self, batch: List[dict]) -> None:
+         """
+         Write a batch of failed records to the file immediately.
+
+         Args:
+             batch: List of records that failed to post
+         """
+         if self._failed_records_file_handle:
+             for record in batch:
+                 self._failed_records_file_handle.write(json.dumps(record) + "\n")
+             self._failed_records_file_handle.flush()  # Ensure they're written immediately
+
+     def handle_upsert_for_statistical_codes(self, updates: dict, keep_existing: dict) -> None:
+         """
+         Handle statistical codes during upsert based on configuration.
+
+         Args:
+             updates: Dictionary being prepared for update
+             keep_existing: Dictionary of fields to preserve from existing record
+         """
+         if not self.config.preserve_statistical_codes:
+             updates["statisticalCodeIds"] = []
+             keep_existing["statisticalCodeIds"] = []
+         else:
+             keep_existing["statisticalCodeIds"] = updates.pop("statisticalCodeIds", [])
+             updates["statisticalCodeIds"] = []
+
+     def handle_upsert_for_administrative_notes(self, updates: dict, keep_existing: dict) -> None:
+         """
+         Handle administrative notes during upsert based on configuration.
+
+         Args:
+             updates: Dictionary being prepared for update
+             keep_existing: Dictionary of fields to preserve from existing record
+         """
+         if not self.config.preserve_administrative_notes:
+             updates["administrativeNotes"] = []
+             keep_existing["administrativeNotes"] = []
+         else:
+             keep_existing["administrativeNotes"] = updates.pop("administrativeNotes", [])
+             updates["administrativeNotes"] = []
+
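+     # In patch_record below, the list-valued entries stashed in keep_existing by the
+     # two handlers above (statisticalCodeIds, administrativeNotes) are merged back
+     # into the final record and deduplicated against values supplied by the incoming
+     # record. For example, with preserve_statistical_codes=True, existing ["code-a"]
+     # merged with incoming ["code-a", "code-b"] yields ["code-a", "code-b"]
+     # (hypothetical code IDs).
+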
+     def handle_upsert_for_temporary_locations(self, updates: dict, keep_existing: dict) -> None:
+         """
+         Handle temporary locations during upsert based on configuration.
+
+         Args:
+             updates: Dictionary being prepared for update
+             keep_existing: Dictionary of fields to preserve from existing record
+         """
+         if self.config.preserve_temporary_locations:
+             keep_existing["temporaryLocationId"] = updates.pop("temporaryLocationId", None)
+
+     def handle_upsert_for_temporary_loan_types(self, updates: dict, keep_existing: dict) -> None:
+         """
+         Handle temporary loan types during upsert based on configuration.
+
+         Args:
+             updates: Dictionary being prepared for update
+             keep_existing: Dictionary of fields to preserve from existing record
+         """
+         if self.config.preserve_temporary_loan_types:
+             keep_existing["temporaryLoanTypeId"] = updates.pop("temporaryLoanTypeId", None)
+
+     def keep_existing_fields(self, updates: dict, existing_record: dict) -> None:
+         """
+         Preserve specific fields from existing record during upsert.
+
+         Args:
+             updates: Dictionary being prepared for update
+             existing_record: The existing record in FOLIO
+         """
+         if self.config.preserve_item_status and "status" in existing_record:
+             updates["status"] = existing_record["status"]
+
+     def patch_record(
+         self, new_record: dict, existing_record: dict, patch_paths: List[str]
+     ) -> None:
+         """
+         Merge selected fields from new_record into a copy of existing_record,
+         updating new_record in place with the result.
+
+         Args:
+             new_record: The new record to be updated
+             existing_record: The existing record to patch from
+             patch_paths: List of fields in JSON Path notation to patch during upsert
+         """
+         updates = {}
+         updates.update(existing_record)
+         keep_existing: Dict[str, Any] = {}
+
+         # Handle special field preservation rules
+         self.handle_upsert_for_administrative_notes(updates, keep_existing)
+         self.handle_upsert_for_statistical_codes(updates, keep_existing)
+
+         if self.config.object_type == "Items":
+             self.handle_upsert_for_temporary_locations(updates, keep_existing)
+             self.handle_upsert_for_temporary_loan_types(updates, keep_existing)
+
+         # Determine which fields to keep from new record
+         if not patch_paths:
+             keep_new = new_record
+         else:
+             keep_new = extract_paths(new_record, patch_paths)
+
+         # Special handling for instance status
+         if "instanceStatusId" in new_record:
+             updates["instanceStatusId"] = new_record["instanceStatusId"]
+
+         # Merge the updates
+         deep_update(updates, keep_new)
+
+         # Merge arrays from keep_existing, avoiding duplicates
+         for key, value in keep_existing.items():
+             if isinstance(value, list) and key in keep_new:
+                 # Combine arrays and remove duplicates
+                 updates[key] = list(dict.fromkeys(updates.get(key, []) + value))
+             elif key not in keep_new:
+                 updates[key] = value
+
+         # Apply item-specific preservation
+         if self.config.object_type == "Items":
+             self.keep_existing_fields(updates, existing_record)
+
+         # Update the new_record in place
+         new_record.clear()
+         new_record.update(updates)
+
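+     # Sketch of the patch flow for an Items upsert with patch_paths=["barcode"]
+     # (hypothetical records):
+     #     existing = {"id": "u1", "barcode": "old", "status": {"name": "Available"}}
+     #     incoming = {"id": "u1", "barcode": "new", "status": {"name": "In transit"}}
+     # Only "barcode" is taken from the incoming record, so the merged record keeps
+     # the existing status and receives barcode "new"; statistical codes and
+     # administrative notes are cleared or merged per the preserve_* settings.
+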
+     def prepare_record_for_upsert(self, new_record: dict, existing_record: dict) -> None:
+         """
+         Prepare a record for upsert by adding version and patching fields.
+
+         Args:
+             new_record: The new record to prepare
+             existing_record: The existing record in FOLIO
+         """
+         # Set the version for optimistic locking
+         new_record["_version"] = existing_record.get("_version", 1)
+
+         # Apply patching if configured
+         if self.config.patch_existing_records:
+             self.patch_record(new_record, existing_record, self.config.patch_paths or [])
+
+     async def fetch_existing_records(self, record_ids: List[str]) -> Dict[str, dict]:
+         """
+         Fetch existing records from FOLIO by their IDs.
+
+         Args:
+             record_ids: List of record IDs to fetch
+
+         Returns:
+             Dictionary mapping record IDs to their full records
+         """
+         existing_records: Dict[str, dict] = {}
+         query_endpoint = self.api_info["query_endpoint"]
+         object_name = self.api_info["object_name"]
+
+         # Fetch in batches of 90 (FOLIO CQL limit for OR queries)
+         fetch_batch_size = 90
+
+         async def fetch_batch(batch_ids: List[str]) -> list:
+             query = f"id==({' OR '.join(batch_ids)})"
+             params = {"query": query, "limit": fetch_batch_size}
+             try:
+                 return await self.folio_client.folio_get_async(
+                     query_endpoint, key=object_name, query_params=params
+                 )
+             except folioclient.FolioClientError as e:
+                 logger.error(f"FOLIO client error fetching existing records: {e}")
+                 raise
+             except folioclient.FolioConnectionError as e:
+                 logger.error(f"FOLIO connection error fetching existing records: {e}")
+                 raise
+             except Exception as e:
+                 logger.error(f"Failed to fetch existing records: {e}")
+                 raise
+
+         # Create tasks for all batches
+         tasks = []
+         for i in range(0, len(record_ids), fetch_batch_size):
+             batch_slice = record_ids[i : i + fetch_batch_size]
+             tasks.append(fetch_batch(batch_slice))
+
+         # Fetch all batches concurrently
+         results = await asyncio.gather(*tasks)
+
+         # Process results
+         for result in results:
+             if isinstance(result, list):
+                 for record in result:
+                     existing_records[record["id"]] = record
+
+         return existing_records
+
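+     # The generated CQL query has the form (hypothetical IDs for illustration):
+     #     id==(uuid-1 OR uuid-2 OR ... OR uuid-90)
+     # so each round trip resolves up to fetch_batch_size records, and all batches
+     # are fetched concurrently via asyncio.gather.
+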
+     async def set_versions_for_upsert(self, batch: List[dict]) -> None:
+         """
+         Fetch existing record versions and prepare batch for upsert.
+
+         Only records that already exist in FOLIO will have their _version set
+         and be prepared for update. New records will not have _version set.
+
+         Args:
+             batch: List of records to prepare for upsert
+         """
+         # Extract record IDs
+         record_ids = [record["id"] for record in batch if "id" in record]
+
+         if not record_ids:
+             return
+
+         # Fetch existing records
+         existing_records = await self.fetch_existing_records(record_ids)
+
+         # Only prepare records that already exist
+         for record in batch:
+             if "id" in record and record["id"] in existing_records:
+                 self.prepare_record_for_upsert(record, existing_records[record["id"]])
+
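+     # Because only records found in FOLIO receive a _version here, post_batch can
+     # classify each record after this call: records carrying _version are counted
+     # as updates, the rest as creates.
+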
+     async def post_batch(self, batch: List[dict]) -> tuple[httpx.Response, int, int]:
+         """
+         Post a batch of records to FOLIO.
+
+         Args:
+             batch: List of records to post
+
+         Returns:
+             Tuple of (HTTP response, number of creates, number of updates)
+
+         Raises:
+             folioclient.FolioClientError: If FOLIO API returns an error
+             folioclient.FolioConnectionError: If connection to FOLIO fails
+             httpx.HTTPStatusError: If the batch POST returns an error status code
+         """
+         # Track creates vs updates before posting
+         num_creates = 0
+         num_updates = 0
+
+         # If upsert mode, set versions and track which are updates
+         if self.config.upsert:
+             await self.set_versions_for_upsert(batch)
+             # Count records with _version as updates, others as creates
+             for record in batch:
+                 if "_version" in record:
+                     num_updates += 1
+                 else:
+                     num_creates += 1
+         else:
+             # In create-only mode, all are creates
+             num_creates = len(batch)
+
+         # Prepare payload
+         object_name = self.api_info["object_name"]
+         payload = {object_name: batch}
+
+         # Prepare query parameters
+         query_params = {}
+         if self.config.upsert:
+             query_params["upsert"] = "true"
+
+         # Make the request
+         api_endpoint = self.api_info["api_endpoint"]
+
+         response = await self.folio_client.async_httpx_client.post(
+             api_endpoint, json=payload, params=query_params
+         )
+         response.raise_for_status()
+         logger.info(
+             (
+                 "Posting successful! Total rows: %s Total failed: %s "
+                 "in %ss "
+                 "Batch Size: %s Request size: %s "
+             ),
+             self.stats.records_processed,
+             self.stats.records_failed,
+             response.elapsed.total_seconds(),
+             len(batch),
+             get_req_size(response),
+         )
+         self.stats.records_posted += len(batch)
+         self.stats.batches_posted += 1
+
+         return response, num_creates, num_updates
+
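+     # The synchronous batch APIs expect the records keyed by object name; e.g. for
+     # Items in upsert mode (hypothetical record shown), the request looks like:
+     #     POST /item-storage/batch/synchronous?upsert=true
+     #     {"items": [{"id": "uuid-1", "barcode": "123", ...}]}
+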
+     async def post_records(self, records) -> None:
+         """
+         Post records in batches.
+
+         Failed records will be written to the file handle provided during initialization.
+
+         Args:
+             records: Records to post. Can be:
+                 - List of dict records
+                 - File-like object containing JSON lines (one record per line)
+                 - String/Path to a file containing JSON lines
+         """
+         # Normalize input to an iterator
+         if isinstance(records, (str, Path)):
+             # It's a file path
+             record_iterator = self._read_records_from_path(records)
+         elif hasattr(records, "read"):
+             # It's a file-like object
+             record_iterator = self._read_records_from_file_handle(records)
+         elif isinstance(records, list):
+             # It's already a list - wrap it in an iterator
+             record_iterator = iter(records)
+         else:
+             raise TypeError(
+                 f"records must be a list, file path, or file-like object, got {type(records)}"
+             )
+
+         # Process records in batches
+         batch = []
+         for record in record_iterator:
+             batch.append(record)
+
+             # Post when batch is full
+             if len(batch) >= self.config.batch_size:
+                 await self._post_single_batch(batch)
+                 batch = []
+
+         # Post any remaining records
+         if batch:
+             await self._post_single_batch(batch)
+
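+     # A minimal usage sketch, assuming an authenticated FolioClient and a prepared
+     # Config (the records shown are hypothetical):
+     #     poster = BatchPoster(folio_client, config)
+     #     async with poster:
+     #         await poster.post_records([{"id": "uuid-1"}, {"id": "uuid-2"}])
+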
+     def _read_records_from_path(self, file_path: Union[str, Path]) -> Generator[dict, None, None]:
+         """
+         Generator that yields records from a file path.
+
+         Args:
+             file_path: Path to file containing JSON lines
+
+         Yields:
+             Parsed record dictionaries
+         """
+         file_path = Path(file_path)
+         if not file_path.exists():
+             raise FileNotFoundError(f"Input file not found: {file_path}")
+
+         logger.info(f"Reading records from {file_path}")
+
+         with open(file_path, "r", encoding="utf-8") as f:
+             yield from self._read_records_from_file_handle(f)
+
+     def _read_records_from_file_handle(self, file_handle) -> Generator[dict, None, None]:
+         """
+         Generator that yields records from a file handle.
+
+         If a line cannot be parsed as JSON, writes the problematic line and all
+         remaining lines to the failed records file (if configured) before raising
+         an exception.
+
+         Args:
+             file_handle: File-like object containing JSON lines
+
+         Yields:
+             Parsed record dictionaries
+
+         Raises:
+             ValueError: If a line cannot be parsed as JSON
+         """
+         for line_number, original_line in enumerate(file_handle, start=1):
+             line = original_line.strip()
+             if not line:
+                 continue
+
+             try:
+                 record = self._parse_json_line(line, line_number)
+                 yield record
+             except ValueError:
+                 # Write the failed line to failed records file
+                 if self._failed_records_file_handle:
+                     self._failed_records_file_handle.write(original_line)
+                     # Write all remaining lines as-is
+                     for remaining_line in file_handle:
+                         self._failed_records_file_handle.write(remaining_line)
+
+                     self._failed_records_file_handle.flush()
+
+                 # Re-raise the exception
+                 raise
+
+     async def _post_single_batch(self, batch: List[dict]) -> None:
+         """
+         Post a single batch with error handling.
+
+         Args:
+             batch: List of records to post
+         """
+         self.stats.records_processed += len(batch)
+
+         try:
+             _, num_creates, num_updates = await self.post_batch(batch)
+
+             # Success - update stats
+             self.stats.records_created += num_creates
+             self.stats.records_updated += num_updates
+         except folioclient.FolioClientError as e:
+             logger.error(f"Batch failed: {e} - {e.response.text}")
+             self._record_batch_failure(batch)
+         except folioclient.FolioConnectionError as e:
+             logger.error(f"Batch failed due to connection error: {e}")
+             self._record_batch_failure(batch)
+         except Exception as e:
+             logger.error(f"Unexpected error during batch post: {e}")
+             if hasattr(e, "request"):
+                 logger.debug(f"DEBUG: {e.request}, {e.request.content}")
+             self._record_batch_failure(batch)
+         finally:
+             # Update progress bar if available
+             self._update_progress(len(batch))
+
+     def _record_batch_failure(self, batch: List[dict]) -> None:
+         """Count a failed batch in the stats and write its records to the failed records file."""
+         self.stats.records_failed += len(batch)
+         self.stats.batches_failed += 1
+         self._write_failed_batch(batch)
+
+     def _update_progress(self, advance: int) -> None:
+         """Advance the progress reporter with current statistics, if a task has been started."""
+         if hasattr(self, "task_id"):
+             self.reporter.update_task(
+                 self.task_id,
+                 advance=advance,
+                 posted=self.stats.records_posted,
+                 created=self.stats.records_created,
+                 updated=self.stats.records_updated,
+                 failed=self.stats.records_failed,
+             )
+
+     def _parse_json_line(self, line: str, line_number: int) -> dict:
+         """
+         Parse a JSON line, handling both plain and tab-delimited formats.
+
+         Args:
+             line: Line to parse
+             line_number: Line number for error reporting
+
+         Returns:
+             Parsed record dictionary
+
+         Raises:
+             ValueError: If the line cannot be parsed as JSON
+         """
+         try:
+             # Handle both plain JSON and tab-delimited format
+             # (tab-delimited: last field is the JSON)
+             json_str = line.split("\t")[-1] if "\t" in line else line
+             return json.loads(json_str)
+         except json.JSONDecodeError as e:
+             raise ValueError(
+                 f"Invalid JSON at line {line_number}: {e}. Line content: {line[:100]}"
+             ) from e
+         except Exception as e:
+             raise ValueError(f"Error processing line {line_number}: {e}") from e
+
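+     # For example, both of these lines parse to the same record (tab written as \t):
+     #     {"id": "uuid-1", "barcode": "123"}
+     #     uuid-1\t{"id": "uuid-1", "barcode": "123"}
+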
+     async def do_work(
+         self,
+         file_paths: Union[str, Path, List[Union[str, Path]]],
+     ) -> BatchPosterStats:
+         """
+         Main orchestration method for processing files.
+
+         This is the primary entry point for batch posting from files. It handles:
+         - Single or multiple file processing
+         - Progress tracking and logging
+         - Failed record collection
+         - Statistics reporting
+
+         Mimics the folio_migration_tools BatchPoster.do_work() workflow.
+
+         Note: To write failed records, pass a file handle or path to the
+         BatchPoster constructor's `failed_records_file` parameter.
+
+         Args:
+             file_paths: Path(s) to JSONL file(s) to process
+
+         Returns:
+             Final statistics from the posting operation
+
+         Example:
+             ```python
+             config = BatchPoster.Config(
+                 object_type="Items",
+                 batch_size=100,
+                 upsert=True
+             )
+
+             reporter = RichProgressReporter(show_speed=True, show_time=True)
+
+             # With failed records file
+             with open("failed_items.jsonl", "w") as failed_file:
+                 poster = BatchPoster(folio_client, config, failed_records_file=failed_file, reporter=reporter)
+                 async with poster:
+                     stats = await poster.do_work(["items1.jsonl", "items2.jsonl"])
+
+             # Or let BatchPoster manage the file
+             poster = BatchPoster(folio_client, config, failed_records_file="failed_items.jsonl", reporter=reporter)
+             async with poster:
+                 stats = await poster.do_work("items.jsonl")
+
+             print(f"Posted: {stats.records_posted}, Failed: {stats.records_failed}")
+             ```
+         """  # noqa: E501
+         # Reset statistics
+         self.stats = BatchPosterStats()
+
+         # Normalize file_paths to list
+         if isinstance(file_paths, (str, Path)):
+             files_to_process = [Path(file_paths)]
+         else:
+             files_to_process = [Path(p) for p in file_paths]
+
+         # Log start
+         logger.info(
+             "Starting batch posting of %d file(s) with batch_size=%d",
+             len(files_to_process),
+             self.config.batch_size,
+         )
+         logger.info("Object type: %s", self.config.object_type)
+         logger.info("Upsert mode: %s", "On" if self.config.upsert else "Off")
+         if self.config.upsert:
+             logger.info(
+                 "Preservation settings: statistical_codes=%s, administrative_notes=%s, "
+                 "temporary_locations=%s, temporary_loan_types=%s",
+                 self.config.preserve_statistical_codes,
+                 self.config.preserve_administrative_notes,
+                 self.config.preserve_temporary_locations,
+                 self.config.preserve_temporary_loan_types,
+             )
+
+         # Count total lines across all files for progress bar
+         total_lines = 0
+         for file_path in files_to_process:
+             with open(file_path, "rb") as f:
+                 total_lines += sum(
+                     buf.count(b"\n") for buf in iter(lambda: f.read(1024 * 1024), b"")
+                 )
+
+         # Set up progress reporting
+         with self.reporter:
+             self.task_id = self.reporter.start_task(
+                 f"posting_{self.config.object_type}",
+                 total=total_lines,
+                 description=f"Posting {self.config.object_type}",
+             )
+
+             # Process each file
+             for idx, file_path in enumerate(files_to_process, start=1):
+                 logger.info(
+                     "Processing file %d of %d: %s",
+                     idx,
+                     len(files_to_process),
+                     file_path.name,
+                 )
+
+                 try:
+                     await self.post_records(file_path)
+                 except Exception as e:
+                     logger.error("Error processing file %s: %s", file_path, e, exc_info=True)
+                     raise
+
+         return self.stats
+
+     def get_stats(self) -> BatchPosterStats:
+         """
+         Get current posting statistics.
+
+         Returns:
+             Current statistics
+         """
+         return self.stats
+
+
+ def get_human_readable_size(size: int, precision: int = 2) -> str:
+     """
+     Convert bytes to human-readable format.
+
+     Args:
+         size: Size in bytes
+         precision: Number of decimal places
+
+     Returns:
+         Human-readable size string
+     """
+     suffixes = ["B", "KB", "MB", "GB", "TB"]
+     suffix_index = 0
+     size_float = float(size)
+
+     while size_float >= 1024 and suffix_index < len(suffixes) - 1:
+         suffix_index += 1
+         size_float = size_float / 1024.0
+
+     return f"{size_float:.{precision}f}{suffixes[suffix_index]}"
+
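+ # For example:
+ #     get_human_readable_size(512)      # "512.00B"
+ #     get_human_readable_size(1536)     # "1.50KB"
+ #     get_human_readable_size(1048576)  # "1.00MB"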
+
+ def get_req_size(response: httpx.Response) -> str:
+     """Estimate the size of the HTTP request that produced the given response."""
+     size = response.request.method
+     size += str(response.request.url)
+     size += "\r\n".join(f"{k}: {v}" for k, v in response.request.headers.items())
+     size += response.request.content.decode("utf-8")
+     return get_human_readable_size(len(size.encode("utf-8")))
+
+
+ def set_up_cli_logging() -> None:
+     """Set up file and console logging for the CLI."""
+     logger.setLevel(logging.INFO)
+     logger.propagate = False
+
+     # Set up file and stream handlers
+     file_handler = logging.FileHandler(
+         "folio_batch_poster_{}.log".format(dt.now().strftime("%Y%m%d%H%M%S"))
+     )
+     file_handler.setLevel(logging.INFO)
+     file_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+     file_handler.setFormatter(file_formatter)
+     logger.addHandler(file_handler)
+
+     if not any(
+         isinstance(h, logging.StreamHandler) and h.stream == sys.stderr for h in logger.handlers
+     ):
+         stream_handler = RichHandler(
+             show_level=False,
+             show_time=False,
+             omit_repeated_times=False,
+             show_path=False,
+         )
+         stream_handler.setLevel(logging.INFO)
+         stream_formatter = logging.Formatter("%(message)s")
+         stream_handler.setFormatter(stream_formatter)
+         logger.addHandler(stream_handler)
+
+     # Stop httpx from logging info messages to the console
+     logging.getLogger("httpx").setLevel(logging.WARNING)
+
+
+ app = cyclopts.App(default_parameter=cyclopts.Parameter(negative=()))
+
+
+ @app.default
+ def main(
+     config_file: Annotated[
+         Path | None, cyclopts.Parameter(group="Job Configuration Parameters")
+     ] = None,
+     *,
+     gateway_url: Annotated[
+         str | None,
+         cyclopts.Parameter(
+             env_var="FOLIO_GATEWAY_URL",
+             show_env_var=True,
+             group="FOLIO Connection Parameters",
+         ),
+     ] = None,
+     tenant_id: Annotated[
+         str | None,
+         cyclopts.Parameter(
+             env_var="FOLIO_TENANT_ID", show_env_var=True, group="FOLIO Connection Parameters"
+         ),
+     ] = None,
+     username: Annotated[
+         str | None,
+         cyclopts.Parameter(
+             env_var="FOLIO_USERNAME", show_env_var=True, group="FOLIO Connection Parameters"
+         ),
+     ] = None,
+     password: Annotated[
+         str | None,
+         cyclopts.Parameter(
+             env_var="FOLIO_PASSWORD", show_env_var=True, group="FOLIO Connection Parameters"
+         ),
+     ] = None,
+     member_tenant_id: Annotated[
+         str | None,
+         cyclopts.Parameter(
+             env_var="FOLIO_MEMBER_TENANT_ID",
+             show_env_var=True,
+             group="FOLIO Connection Parameters",
+         ),
+     ] = None,
+     object_type: Annotated[
+         Literal["Instances", "Holdings", "Items"] | None,
+         cyclopts.Parameter(group="Job Configuration Parameters"),
+     ] = None,
+     file_paths: Annotated[
+         tuple[Path, ...] | None,
+         cyclopts.Parameter(
+             name=["--file-paths", "--file-path"],
+             help="Path(s) to JSONL file(s). Accepts multiple values and glob patterns.",
+             group="Job Configuration Parameters",
+         ),
+     ] = None,
+     batch_size: Annotated[
+         int,
+         cyclopts.Parameter(group="Job Configuration Parameters"),
+     ] = 100,
+     upsert: Annotated[
+         bool,
+         cyclopts.Parameter(group="Job Configuration Parameters"),
+     ] = False,
+     preserve_statistical_codes: Annotated[
+         bool,
+         cyclopts.Parameter(group="Job Configuration Parameters: --upsert options"),
+     ] = False,
+     preserve_administrative_notes: Annotated[
+         bool,
+         cyclopts.Parameter(group="Job Configuration Parameters: --upsert options"),
+     ] = False,
+     preserve_temporary_locations: Annotated[
+         bool,
+         cyclopts.Parameter(group="Job Configuration Parameters: --upsert options"),
+     ] = False,
+     preserve_temporary_loan_types: Annotated[
+         bool,
+         cyclopts.Parameter(group="Job Configuration Parameters: --upsert options"),
+     ] = False,
+     overwrite_item_status: Annotated[
+         bool,
+         cyclopts.Parameter(group="Job Configuration Parameters: --upsert options"),
+     ] = False,
+     patch_existing_records: Annotated[
+         bool,
+         cyclopts.Parameter(group="Job Configuration Parameters: --upsert options"),
+     ] = False,
+     patch_paths: Annotated[
+         str | None,
+         cyclopts.Parameter(
+             help=(
+                 "Comma-separated list of field paths to patch during upsert (e.g., barcode,status)"
+             ),
+             group="Job Configuration Parameters: --upsert options",
+         ),
+     ] = None,
+     failed_records_file: Annotated[
+         Path | None,
+         cyclopts.Parameter(group="Job Configuration Parameters"),
+     ] = None,
+     no_progress: Annotated[
+         bool,
+         cyclopts.Parameter(group="Job Configuration Parameters"),
+     ] = False,
+ ) -> None:
+     """
+     Command-line interface for batch posting inventory records to FOLIO.
+
+     Parameters:
+         config_file: Path to JSON config file (overrides CLI parameters).
+         gateway_url: The FOLIO API Gateway URL.
+         tenant_id: The tenant id.
+         username: The FOLIO username.
+         password: The FOLIO password.
+         member_tenant_id: The FOLIO ECS member tenant id (if applicable).
+         object_type: Type of inventory object (Instances, Holdings, or Items).
+         file_paths: Path(s) to JSONL file(s) to post.
+         batch_size: Number of records to include in each batch (1-1000).
+         upsert: Enable upsert mode to update existing records.
+         preserve_statistical_codes: Preserve existing statistical codes during upsert.
+         preserve_administrative_notes: Preserve existing administrative notes during upsert.
+         preserve_temporary_locations: Preserve temporary location assignments during upsert.
+         preserve_temporary_loan_types: Preserve temporary loan type assignments during upsert.
+         overwrite_item_status: Overwrite item status during upsert.
+         patch_existing_records: Enable selective field patching during upsert.
+         patch_paths: Comma-separated list of field paths to patch.
+         failed_records_file: Path to file for writing failed records.
+         no_progress: Disable progress bar display.
+     """
+     set_up_cli_logging()
+
+     gateway_url, tenant_id, username, password = get_folio_connection_parameters(
+         gateway_url, tenant_id, username, password
+     )
+     folio_client = folioclient.FolioClient(gateway_url, tenant_id, username, password)
+
+     if member_tenant_id:
+         folio_client.tenant_id = member_tenant_id
+
+     # Handle file path expansion
+     expanded_file_paths = expand_file_paths(file_paths)
+     expanded_file_paths.sort()
+
+     # Parse patch_paths if provided
+     patch_paths_list = parse_patch_paths(patch_paths)
+
+     try:
+         if config_file:
+             config, files_to_process = parse_config_file(config_file)
+         else:
+             if not object_type:
+                 logger.critical("--object-type is required when not using a config file")
+                 sys.exit(1)
+
+             if not expanded_file_paths:
+                 logger.critical("No files found to process. Exiting.")
+                 sys.exit(1)
+
+             config = BatchPoster.Config(
+                 object_type=object_type,
+                 batch_size=batch_size,
+                 upsert=upsert,
+                 preserve_statistical_codes=preserve_statistical_codes,
+                 preserve_administrative_notes=preserve_administrative_notes,
+                 preserve_temporary_locations=preserve_temporary_locations,
+                 preserve_temporary_loan_types=preserve_temporary_loan_types,
+                 preserve_item_status=not overwrite_item_status,
+                 patch_existing_records=patch_existing_records,
+                 patch_paths=patch_paths_list,
+                 no_progress=no_progress,
+             )
+             files_to_process = expanded_file_paths
+
+         logger.info(f"Processing {len(files_to_process)} file(s)")
+         asyncio.run(run_batch_poster(folio_client, config, files_to_process, failed_records_file))
+
+     except Exception as e:
+         logger.critical(f"An error occurred: {e}", exc_info=True)
+         sys.exit(1)
+
+
+ def parse_config_file(config_file):
+     """Load a BatchPoster.Config (and optional file_paths list) from a JSON config file."""
+     with open(config_file, "r") as f:
+         config_data = json.load(f)
+     # Convert file_paths if present in config; pop it so it isn't passed to the model
+     files_to_process = [Path(p) for p in config_data.pop("file_paths", [])]
+     config = BatchPoster.Config(**config_data)
+     return config, files_to_process
+
+
+ def parse_patch_paths(patch_paths):
+     """Split a comma-separated patch-paths string into a list, or return None."""
+     patch_paths_list = None
+     if patch_paths:
+         patch_paths_list = [p.strip() for p in patch_paths.split(",") if p.strip()]
+     return patch_paths_list
+
+
+ def expand_file_paths(file_paths):
+     """Expand any glob patterns in the given paths into a flat list of Paths."""
+     expanded_paths: List[Path] = []
+     if file_paths:
+         for file_path in file_paths:
+             file_path_str = str(file_path)
+             if any(char in file_path_str for char in ["*", "?", "["]):
+                 # It's a glob pattern - expand it
+                 expanded = glob_module.glob(file_path_str)
+                 expanded_paths.extend([Path(x) for x in expanded])
+             else:
+                 # It's a regular path
+                 expanded_paths.append(file_path)
+     return expanded_paths
+
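+ # For example (hypothetical file names), expand_file_paths(("data/items_*.jsonl",))
+ # returns a Path for every file matching the glob, while non-glob entries are
+ # passed through unchanged; main() sorts the combined list before processing.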
+
+ async def run_batch_poster(
+     folio_client: FolioClient,
+     config: "BatchPoster.Config",
+     files_to_process: List[Path],
+     failed_records_file: Path | None,
+ ):
+     """
+     Run the batch poster operation.
+
+     Args:
+         folio_client: Authenticated FOLIO client
+         config: BatchPoster configuration
+         files_to_process: List of file paths to process
+         failed_records_file: Optional path for failed records
+     """
+     async with folio_client:
+         try:
+             # Create progress reporter
+             reporter = (
+                 NoOpProgressReporter()
+                 if config.no_progress
+                 else RichProgressReporter(show_speed=True, show_time=True)
+             )
+
+             poster = BatchPoster(
+                 folio_client, config, failed_records_file=failed_records_file, reporter=reporter
+             )
+             async with poster:
+                 await poster.do_work(files_to_process)
+             log_final_stats(poster)
+
+         except Exception as e:
+             logger.critical(f"Batch posting failed: {e}", exc_info=True)
+             raise
+
+
+ def log_final_stats(poster: BatchPoster) -> None:
+     """
+     Log the final statistics after batch posting.
+
+     Args:
+         poster: The BatchPoster instance containing the stats
+     """
+     logger.info("=" * 60)
+     logger.info("Batch posting complete!")
+     logger.info("=" * 60)
+     logger.info("Total records processed: %d", poster.stats.records_processed)
+     logger.info("Records posted successfully: %d", poster.stats.records_posted)
+     logger.info("Records created: %d", poster.stats.records_created)
+     logger.info("Records updated: %d", poster.stats.records_updated)
+     logger.info("Records failed: %d", poster.stats.records_failed)
+     logger.info("Total batches posted: %d", poster.stats.batches_posted)
+     logger.info("Total batches failed: %d", poster.stats.batches_failed)
+     if poster._failed_records_path:
+         logger.info("Failed records written to: %s", poster._failed_records_path)
+
+
+ if __name__ == "__main__":
+     app()