mirage-benchmark 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mirage-benchmark might be problematic. Click here for more details.

@@ -0,0 +1,598 @@
1
+ """
2
+ Docling-Based Document Processing and Table Image Extraction
3
+
4
+ Supports: single PDF/HTML file, folder of documents, or zip file containing documents.
5
+ Supported formats: PDF, HTML, XHTML (via Docling library)
6
+ Configuration via config.yaml under pdf_processing section.
7
+ """
8
+
9
+ import logging
10
+ import time
11
+ from pathlib import Path
12
+ import requests
13
+ import base64
14
+ from io import BytesIO
15
+ from PIL import Image
16
+ import matplotlib.pyplot as plt
17
+ import os
18
+ import json
19
+ import zipfile
20
+ import tempfile
21
+ import shutil
22
+ import yaml
23
+
24
+ import pandas as pd
25
+ import torch
26
+
27
+ from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
28
+ from docling_core.types.doc.document import DescriptionAnnotation
29
+ from docling.datamodel.base_models import InputFormat
30
+ from docling.datamodel.pipeline_options import (
31
+ PdfPipelineOptions,
32
+ AcceleratorDevice,
33
+ AcceleratorOptions,
34
+ PictureDescriptionApiOptions,
35
+ EasyOcrOptions
36
+ )
37
+ from docling.document_converter import DocumentConverter, PdfFormatOption, HTMLFormatOption
38
+ from docling.utils.export import generate_multimodal_pages
39
+
40
+
41
def load_config(config_path="config.yaml"):
    """Load configuration from a YAML file.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file content
        (``None`` for an empty document).

    Raises:
        OSError: If the file cannot be opened.
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # locale encoding (non-ASCII paths/prompts would otherwise break on
    # systems whose default encoding is not UTF-8).
    with open(config_path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)
45
+
46
# Load config once at import time. An empty config.yaml parses to None, and a
# section written as a bare key (e.g. "pdf_processing:") also parses to None,
# so both cases fall back to an empty mapping instead of failing on .get().
CONFIG = load_config() or {}
PDF_CONFIG = CONFIG.get("pdf_processing") or {}
BACKEND_CONFIG = CONFIG.get("backend") or {}

# --- Configuration from config.yaml (pdf_processing section) ---
IMAGE_RESOLUTION_SCALE = PDF_CONFIG.get("image_resolution_scale", 2.0)
INPUT_PATH = PDF_CONFIG.get("input_path", "data/documents")
OUTPUT_DIR = Path(PDF_CONFIG.get("output_dir", "trials/pdf2md/output"))
MODEL_NAME = PDF_CONFIG.get("model_name", "qwen2.5vl:32b")
NUM_THREADS = PDF_CONFIG.get("num_threads", 14)
CUDA_DEVICE_ID = PDF_CONFIG.get("cuda_device_id", 1)

# --- API configuration from environment variables, with built-in defaults ---
# GEMINI_API_KEY_PATH: file containing the API key.
# LLM_API_URL: endpoint used for picture/table description requests.
API_KEY_FILE = os.environ.get("GEMINI_API_KEY_PATH", os.path.expanduser("~/.config/gemini/api_key.txt"))
API_URL = os.environ.get("LLM_API_URL", "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent")
63
+
64
+ from prompt import PROMPTS_DESC
65
+
66
# Root logging at INFO; this module logs through its own named logger.
logging.basicConfig(level=logging.INFO)
_log = logging.getLogger(__name__)


# Lower-case file suffixes accepted as input: PDF plus the HTML family.
SUPPORTED_EXTENSIONS = {'.pdf'} | {'.html', '.htm', '.xhtml'}
72
+
73
def get_input_format(file_path: Path) -> InputFormat:
    """Return the ``InputFormat`` member implied by the suffix of *file_path*.

    The suffix comparison is case-insensitive.

    Args:
        file_path: Path whose suffix selects the format.

    Returns:
        ``InputFormat.PDF`` for ``.pdf``; ``InputFormat.HTML`` for ``.html``,
        ``.htm`` and ``.xhtml``.

    Raises:
        ValueError: For any other suffix.
    """
    suffix = file_path.suffix.lower()
    if suffix in ('.html', '.htm', '.xhtml'):
        return InputFormat.HTML
    if suffix == '.pdf':
        return InputFormat.PDF
    raise ValueError(f"Unsupported file extension: {suffix}")
82
+
83
def collect_input_files(input_path):
    """Collect document files (PDF, HTML) from an input path.

    Args:
        input_path: Path (str or Path) to a single document, a folder of
            documents (searched recursively), or a zip file of documents.

    Returns:
        tuple: ``(doc_files, temp_dir)`` where ``doc_files`` is a list of
        ``Path`` objects sorted by file size (smallest first, ties broken by
        path) and ``temp_dir`` is the directory a zip was extracted to, or
        ``None``. The caller is responsible for removing ``temp_dir``.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
        ValueError: If ``input_path`` is a file of an unsupported type, or is
            neither a regular file nor a directory.
    """
    input_path = Path(input_path)
    if not input_path.exists():
        raise FileNotFoundError(f"Input path does not exist: {input_path}")

    temp_dir = None
    search_root = None  # Directory to scan; None means "single document".

    if input_path.is_file():
        suffix = input_path.suffix.lower()
        if suffix == '.zip':
            temp_dir = tempfile.mkdtemp(prefix="doc_extract_")
            _log.info(f"Extracting zip file to: {temp_dir}")
            try:
                with zipfile.ZipFile(input_path, 'r') as zf:
                    zf.extractall(temp_dir)
            except Exception:
                # The caller never receives temp_dir when extraction fails,
                # so it has to be removed here to avoid leaking it.
                shutil.rmtree(temp_dir, ignore_errors=True)
                raise
            search_root = Path(temp_dir)
        elif suffix not in SUPPORTED_EXTENSIONS:
            raise ValueError(f"Unsupported file type: {input_path.suffix}. Supported: {SUPPORTED_EXTENSIONS}")
    elif input_path.is_dir():
        search_root = input_path
    else:
        raise ValueError(f"Invalid input path: {input_path}")

    if search_root is None:
        doc_files = [input_path]
    else:
        # Match on the lower-cased suffix so "REPORT.PDF" is found just like
        # in the single-file case, and skip directories that merely carry a
        # document-like name.
        doc_files = [
            candidate
            for candidate in search_root.rglob("*")
            if candidate.is_file() and candidate.suffix.lower() in SUPPORTED_EXTENSIONS
        ]

    # Smallest first for faster initial feedback; the path tie-breaker keeps
    # the order reproducible between runs.
    doc_files.sort(key=lambda p: (p.stat().st_size, str(p)))

    # Log counts by format
    pdf_count = sum(1 for f in doc_files if f.suffix.lower() == '.pdf')
    html_count = sum(1 for f in doc_files if f.suffix.lower() in {'.html', '.htm', '.xhtml'})
    _log.info(f"Found {len(doc_files)} document files to process (PDF: {pdf_count}, HTML: {html_count})")
    return doc_files, temp_dir
131
+
132
+ # Backward compatibility alias
133
def collect_pdf_files(input_path):
    """Legacy name for ``collect_input_files``.

    Forwards ``input_path`` unchanged and returns the same
    ``(doc_files, temp_dir)`` tuple.
    """
    files_and_temp_dir = collect_input_files(input_path)
    return files_and_temp_dir
136
+
137
def motormaven_vlm_options():
    """Build the ``PictureDescriptionApiOptions`` for the configured endpoint.

    Reads the API key from ``API_KEY_FILE`` (surrounding whitespace stripped)
    and sends it as a Bearer token. The request targets ``API_URL`` with
    ``MODEL_NAME`` as the model parameter and the image prompt from
    ``PROMPTS_DESC``.

    Returns:
        The configured ``PictureDescriptionApiOptions`` instance.
    """
    # Key is read on every call, so a rotated key file is picked up.
    api_key = Path(API_KEY_FILE).read_text().strip()

    request_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    return PictureDescriptionApiOptions(
        url=API_URL,
        params={"model": MODEL_NAME},
        headers=request_headers,
        prompt=PROMPTS_DESC["image"],
        timeout=120,
        retries=10,
    )
157
+
158
def check_cuda_memory(device_id=0):
    """Print a memory summary (in GiB) for one CUDA device.

    Prints total, reserved and allocated memory, the unallocated part of the
    reserved pool, and the memory not reserved at all. Prints a notice and
    returns when CUDA is not available.

    Args:
        device_id: Index of the CUDA device to inspect.
    """
    if not torch.cuda.is_available():
        print("CUDA is not available.")
        return

    gib = 1024 ** 3
    total = torch.cuda.get_device_properties(device_id).total_memory
    reserved = torch.cuda.memory_reserved(device_id)
    allocated = torch.cuda.memory_allocated(device_id)

    print(f"Total memory: {total / gib:.2f} GiB")
    print(f"Reserved memory: {reserved / gib:.2f} GiB")
    print(f"Allocated memory: {allocated / gib:.2f} GiB")
    # "Free" here means reserved but not currently handed out to tensors.
    print(f"Free memory: {(reserved - allocated) / gib:.2f} GiB (within reserved)")
    print(f"Unreserved memory: {(total - reserved) / gib:.2f} GiB")
173
+
174
+
175
def is_bbox_inside(inner_bbox, outer_bbox):
    """Check whether ``inner_bbox`` lies within ``outer_bbox`` (edges may touch).

    Both arguments are objects exposing ``l``, ``t``, ``r`` and ``b``
    attributes. The vertical test compares the lower and upper y-extent of
    each box, so it gives the same answer for bottom-left-origin boxes
    (``b < t``) and top-left-origin boxes (``t < b``). Both boxes must use
    the same coordinate origin; mixing origins is not detected.

    Args:
        inner_bbox: Candidate inner box.
        outer_bbox: Candidate enclosing box.

    Returns:
        bool: True if ``inner_bbox`` is fully contained in ``outer_bbox``.
    """
    # Normalise the vertical extent so the comparison does not depend on
    # which of t/b carries the smaller y value.
    inner_low, inner_high = sorted((inner_bbox.b, inner_bbox.t))
    outer_low, outer_high = sorted((outer_bbox.b, outer_bbox.t))
    return (inner_bbox.l >= outer_bbox.l and
            inner_bbox.r <= outer_bbox.r and
            inner_low >= outer_low and
            inner_high <= outer_high)
182
+
183
+
184
def get_pictures_inside_tables(conv_res):
    """Find the pictures whose bounding box lies inside a table on the same page.

    Args:
        conv_res: Conversion result exposing ``document.tables`` and
            ``document.pictures``; each item's ``prov`` entries carry
            ``page_no`` and ``bbox``.

    Returns:
        set: Indices into ``conv_res.document.pictures`` of the pictures that
        are contained in at least one table.
    """
    # page number -> bounding boxes of all tables located on that page
    tables_on_page = {}
    for table in conv_res.document.tables:
        for location in table.prov or ():
            tables_on_page.setdefault(location.page_no, []).append(location.bbox)

    contained = set()
    for index, picture in enumerate(conv_res.document.pictures):
        for location in picture.prov or ():
            pic_page = location.page_no
            pic_bbox = location.bbox
            page_tables = tables_on_page.get(pic_page, ())
            # any() stops at the first enclosing table, like the original break.
            if any(is_bbox_inside(pic_bbox, table_bbox) for table_bbox in page_tables):
                contained.add(index)
                print(f"Picture {index} on page {pic_page} is inside a table - will skip annotation")

    return contained
214
+
215
+
216
def make_api_call_with_retries(payload, api_url, headers, item_type, item_index, output_dir, max_retries=10):
    """POST ``payload`` to ``api_url``, retrying server and network failures.

    Server errors (HTTP >= 500) and ``requests`` transport errors are retried
    with exponential backoff (2, 4, 8, ... seconds, capped at 60). Any other
    non-200 status is not retried.

    Args:
        payload: JSON-serialisable request body.
        api_url: Endpoint URL.
        headers: HTTP headers to send.
        item_type: Label used in progress messages (e.g. "picture").
        item_index: Index used in progress messages.
        output_dir: Unused; kept so existing callers keep working (response
            recording is disabled).
        max_retries: Maximum number of retries after the first attempt.
            ``None`` retries indefinitely (the previous behaviour).

    Returns:
        The ``requests.Response`` on HTTP 200, otherwise ``None`` (client
        error, unexpected status, or retries exhausted).
    """
    retry_count = 0
    while True:
        try:
            response = requests.post(api_url, json=payload, headers=headers, timeout=120)
        except requests.exceptions.RequestException as e:
            # Only transport-level failures are retried. Programming errors
            # (e.g. a payload that cannot be serialised) propagate instead of
            # being retried forever.
            failure = f"Network error for {item_type} {item_index}: {str(e)}"
        else:
            status = response.status_code
            if status == 200:
                return response
            if 400 <= status < 500:
                # Client error - skip
                print(f"Client error for {item_type} {item_index}: HTTP {status} - skipping")
                return None
            if status < 500:
                print(f"Unexpected status for {item_type} {item_index}: HTTP {status} - skipping")
                return None
            failure = f"Server error for {item_type} {item_index}: HTTP {status}"

        retry_count += 1
        if max_retries is not None and retry_count > max_retries:
            print(f"{failure} - giving up after {max_retries} retries")
            return None

        wait_time = min(60, 2 ** min(retry_count, 6))  # Cap at 60 seconds
        print(f"{failure} - retrying in {wait_time}s (attempt {retry_count})")
        time.sleep(wait_time)
253
+
254
def annotate_items_with_images(conv_res, model_name=MODEL_NAME, api_url=API_URL, max_tokens=1000, use_batch=True):
    """Annotate un-annotated pictures and tables from their base64 image data.

    Pictures located inside a table are determined first and excluded from
    annotation; the work is then delegated to the batch or the sequential
    implementation.

    Args:
        conv_res: Conversion result from docling.
        model_name: VLM model name, recorded as annotation provenance.
        api_url: API endpoint URL (only passed to the sequential path).
        max_tokens: Accepted for compatibility; not used by this function.
        use_batch: If True, use the batch implementation, otherwise the
            sequential one.

    Returns:
        Set of picture indices that are inside tables (skipped).
    """
    skipped_pictures = get_pictures_inside_tables(conv_res)

    if not use_batch:
        return _annotate_items_sequential(conv_res, model_name, api_url, skipped_pictures)
    return _annotate_items_batch(conv_res, model_name, skipped_pictures)
275
+
276
+
277
def _annotate_items_batch(conv_res, model_name, pictures_to_skip):
    """Annotate pictures and tables with a single batched VLM call.

    Every picture (except those in ``pictures_to_skip``) and every table that
    has no annotation yet and carries base64 image data is queued. The queue
    is sent through ``call_llm.batch_call_vlm_base64`` and each usable
    response is appended to its item as a ``DescriptionAnnotation``.

    Args:
        conv_res: Conversion result whose document items are annotated in place.
        model_name: Stored as the annotation's provenance.
        pictures_to_skip: Picture indices to leave untouched.

    Returns:
        ``pictures_to_skip``, unchanged.
    """
    from call_llm import batch_call_vlm_base64

    queued_requests = []  # (prompt, base64, mime_type) per queued item
    queued_items = []     # matching ('picture'|'table', index, item) entries

    # Pictures: the base64 payload follows the first comma of the data URI.
    for pic_index, picture in enumerate(conv_res.document.pictures):
        if pic_index in pictures_to_skip:
            print(f"Picture {pic_index}: Skipping - inside a table")
            continue
        if picture.annotations:
            continue
        try:
            encoded = None
            if picture.image and picture.image.uri:
                encoded = str(picture.image.uri).split(',')[1]
            if not encoded:
                print(f"Picture {pic_index}: No base64 data available")
                continue
            print(f"Picture {pic_index}: Queued for batch (base64 len: {len(encoded)})")
            queued_requests.append((PROMPTS_DESC["image"], encoded, "image/png"))
            queued_items.append(('picture', pic_index, picture))
        except Exception as exc:
            print(f"Error preparing picture {pic_index}: {str(exc)}")

    # Tables: same idea, but the payload is taken from the URI's path part.
    for table_index, table in enumerate(conv_res.document.tables):
        if table.annotations:
            continue
        try:
            encoded = None
            if table.image and table.image.uri:
                encoded = str(table.image.uri.path).split(',')[1]
            if not encoded:
                print(f"Table {table_index}: No base64 data available")
                continue
            print(f"Table {table_index}: Queued for batch (base64 len: {len(encoded)})")
            queued_requests.append((PROMPTS_DESC["table"], encoded, "image/png"))
            queued_items.append(('table', table_index, table))
        except Exception as exc:
            print(f"Error preparing table {table_index}: {str(exc)}")

    if not queued_requests:
        print("No items to annotate")
        return pictures_to_skip

    print(f"\n⚡ Batch annotating {len(queued_requests)} items...")
    replies = batch_call_vlm_base64(queued_requests, show_progress=True)

    # Replies are paired with the queued items by position.
    annotated = 0
    for (item_type, idx, target), reply in zip(queued_items, replies):
        if not reply or reply.startswith("ERROR:"):
            print(f"❌ Failed to annotate {item_type} {idx}: {reply[:100] if reply else 'No response'}")
            continue
        target.annotations.append(
            DescriptionAnnotation(kind='description', text=reply, provenance=model_name)
        )
        print(f"✅ Added annotation to {item_type} {idx}")
        annotated += 1

    print(f"\n📊 Batch annotation complete: {annotated}/{len(queued_requests)} successful")
    return pictures_to_skip
341
+
342
+
343
def _annotate_single_item(item, item_type, index, base64_data, prompt, model_name, api_url, headers):
    """Request a description for one item's image and append it as an annotation."""
    label = item_type.capitalize()
    if not base64_data:
        print(f"{label} {index}: No base64 data available")
        return

    print(f"{label} {index}: Using base64 data, length: {len(base64_data)}")

    payload = {
        "model": model_name,
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_data}"}}
            ]
        }]
    }

    response = make_api_call_with_retries(payload, api_url, headers, item_type, index, OUTPUT_DIR)

    if response and response.status_code == 200:
        result = response.json()
        # A missing OR empty 'choices' list falls back to the placeholder
        # text; indexing an empty list would raise IndexError instead.
        choices = result.get('choices') or [{}]
        description_text = choices[0].get('message', {}).get('content', 'No description available')
        annotation = DescriptionAnnotation(kind='description', text=description_text, provenance=model_name)
        item.annotations.append(annotation)
        print(f"Added annotation to {item_type} {index}")
    else:
        print(f"Failed to annotate {item_type} {index}: HTTP {response.status_code if response else 'No response'}")


def _annotate_items_sequential(conv_res, model_name, api_url, pictures_to_skip):
    """Annotate pictures and tables one request at a time (fallback path).

    Every picture (except those in ``pictures_to_skip``) and every table that
    has no annotation yet is sent to ``api_url`` as a chat-style payload
    containing the prompt and the item's base64 PNG. The returned text is
    appended to the item as a ``DescriptionAnnotation``. A failure on one
    item is printed and does not stop the remaining items.

    Args:
        conv_res: Conversion result whose document items are annotated in place.
        model_name: Model name sent in the payload and stored as provenance.
        api_url: API endpoint URL.
        pictures_to_skip: Picture indices to leave untouched.

    Returns:
        ``pictures_to_skip``, unchanged.
    """
    # Load API key for authentication
    with open(API_KEY_FILE, 'r') as f:
        api_key = f.read().strip()
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    # Process pictures: the base64 payload follows the first comma of the data URI.
    for i, item in enumerate(conv_res.document.pictures):
        if i in pictures_to_skip:
            print(f"Picture {i}: Skipping - inside a table")
            continue
        if item.annotations:
            continue
        try:
            base64_data = str(item.image.uri).split(',')[1] if item.image and item.image.uri else None
            _annotate_single_item(item, "picture", i, base64_data, PROMPTS_DESC["image"],
                                  model_name, api_url, headers)
        except Exception as e:
            print(f"Error annotating picture {i}: {str(e)}")

    # Process tables: the payload is taken from the URI's path part.
    for i, item in enumerate(conv_res.document.tables):
        if item.annotations:
            continue
        try:
            base64_data = str(item.image.uri.path).split(',')[1] if item.image and item.image.uri else None
            _annotate_single_item(item, "table", i, base64_data, PROMPTS_DESC["table"],
                                  model_name, api_url, headers)
        except Exception as e:
            print(f"Error annotating table {i}: {str(e)}")

    return pictures_to_skip
427
+
428
+
429
def configure_pipeline_options(model_name:str="granite3.2-vision:latest", cuda_device_id:int=None):
    """Build the ``PdfPipelineOptions`` used for PDF conversion.

    Enables remote services, OCR (EasyOCR), code/formula enrichment, table
    structure with cell matching, picture description via the API options
    from ``motormaven_vlm_options()``, and picture/table image generation.

    Args:
        model_name: Accepted for compatibility; not read by this function.
        cuda_device_id: CUDA device index to pin the accelerator to. When
            ``None`` the accelerator device is ``AcceleratorDevice.AUTO``.

    Returns:
        The configured ``PdfPipelineOptions`` instance.
    """
    opts = PdfPipelineOptions()

    # Flat settings, applied in order.
    flat_settings = (
        ("enable_remote_services", True),
        ("images_scale", IMAGE_RESOLUTION_SCALE),
        ("generate_page_images", False),       # True for debugging
        ("generate_parsed_pages", False),      # True for debugging
        ("generate_picture_images", True),
        ("do_picture_classification", False),  # False to avoid CUDA OOM
        ("do_picture_description", True),      # set False to avoid CUDA OOM
        ("do_ocr", True),
        ("ocr_options", EasyOcrOptions()),
        ("do_code_enrichment", True),
        ("do_formula_enrichment", True),
        ("do_table_structure", True),
    )
    for option_name, option_value in flat_settings:
        setattr(opts, option_name, option_value)
    opts.table_structure_options.do_cell_matching = True
    opts.generate_table_images = True

    print(f"DEBUG: Pipeline options - generate_table_images: {opts.generate_table_images}")
    print(f"DEBUG: Pipeline options - do_table_structure: {opts.do_table_structure}")

    # Pin to a specific CUDA device when one is requested, else auto-select.
    if cuda_device_id is None:
        device = AcceleratorDevice.AUTO
    else:
        print(f"DEBUG: Using CUDA device {cuda_device_id}")
        device = f"cuda:{cuda_device_id}"
    opts.accelerator_options = AcceleratorOptions(num_threads=NUM_THREADS, device=device)

    # Picture descriptions are requested from the remote API endpoint.
    opts.picture_description_options = motormaven_vlm_options()

    print("DEBUG: Using manual table annotation (no built-in table description options)")

    return opts
466
+
467
def process_single_document(doc_path, doc_converter, output_dir, is_pdf=True):
    """Convert one document (PDF or HTML) and write its outputs.

    Outputs go to ``output_dir/<document stem>/``: table images under
    ``tables/``, a markdown file ``<stem>_ref.md`` and the images it
    references under ``ref_artifacts/``. Any exception is logged and
    reported through the return value rather than raised.

    Args:
        doc_path: Path to the document file.
        doc_converter: DocumentConverter instance.
        output_dir: Output directory path.
        is_pdf: Whether the document is a PDF; only PDFs get picture/table
            annotations.

    Returns:
        tuple: ``(True, document stem, seconds)`` on success, or
        ``(False, doc_path.name, seconds)`` on failure.
    """
    started_at = time.time()

    try:
        conv_res = doc_converter.convert(str(doc_path))
        doc_filename = conv_res.input.file.stem

        # Each document gets its own output folder.
        doc_output_dir = output_dir / doc_filename
        doc_output_dir.mkdir(parents=True, exist_ok=True)

        # Annotation is limited to PDFs; HTML input is left as converted.
        if is_pdf:
            annotate_items_with_images(conv_res)

        tables_dir = doc_output_dir / "tables"
        tables_dir.mkdir(parents=True, exist_ok=True)

        # Number tables from 1 in document order and save each as PNG.
        table_items = (
            element
            for element, _level in conv_res.document.iterate_items()
            if isinstance(element, TableItem)
        )
        for table_number, table in enumerate(table_items, start=1):
            image_path = tables_dir / f"{doc_filename}-table-{table_number}.png"
            _log.info(f"Table {table.self_ref} - Caption: {table.caption_text(doc=conv_res.document)}")
            try:
                with image_path.open("wb") as fp:
                    table.get_image(conv_res.document).save(fp, "PNG")
            except Exception as e:
                # A table without a usable image must not fail the document.
                _log.warning(f"Could not save table image: {e}")

        artifacts_dir = doc_output_dir / "ref_artifacts"
        artifacts_dir.mkdir(parents=True, exist_ok=True)

        # Markdown references pictures as external files in artifacts_dir.
        md_filename = doc_output_dir / f"{doc_filename}_ref.md"
        conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.REFERENCED, artifacts_dir=artifacts_dir)

        elapsed = time.time() - started_at
        file_type = "PDF" if is_pdf else "HTML"
        _log.info(f"✅ Processed {file_type} {doc_filename} in {elapsed:.1f}s")
        return True, doc_filename, elapsed

    except Exception as e:
        elapsed = time.time() - started_at
        _log.error(f"❌ Failed to process {doc_path.name}: {str(e)}")
        return False, doc_path.name, elapsed
525
+
526
+ # Backward compatibility alias
527
def process_single_pdf(pdf_path, doc_converter, output_dir):
    """Legacy entry point: process ``pdf_path`` as a PDF.

    Delegates to ``process_single_document`` with ``is_pdf=True`` and
    returns its ``(success, name, elapsed)`` tuple.
    """
    return process_single_document(
        doc_path=pdf_path,
        doc_converter=doc_converter,
        output_dir=output_dir,
        is_pdf=True,
    )
530
+
531
+
532
def create_multi_format_converter(cuda_device_id=None):
    """Create a ``DocumentConverter`` handling both PDF and HTML input.

    PDFs use the pipeline options from ``configure_pipeline_options``; HTML
    uses a default ``HTMLFormatOption``.

    Args:
        cuda_device_id: Forwarded to ``configure_pipeline_options``.

    Returns:
        The configured ``DocumentConverter``.
    """
    pdf_option = PdfFormatOption(
        pipeline_options=configure_pipeline_options(cuda_device_id=cuda_device_id)
    )
    html_option = HTMLFormatOption()  # defaults are sufficient for HTML

    per_format = {
        InputFormat.PDF: pdf_option,
        InputFormat.HTML: html_option,
    }
    return DocumentConverter(format_options=per_format)
545
+
546
+
547
if __name__ == "__main__":
    # Script entry point: convert every document found under INPUT_PATH,
    # write the results to OUTPUT_DIR and log a summary.
    start_time = time.time()

    # Only touch the CUDA runtime when a GPU is usable; the cache/IPC calls
    # may fail on CPU-only machines. Without CUDA the converter is left to
    # pick its device automatically instead of being pinned to "cuda:N".
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    check_cuda_memory(CUDA_DEVICE_ID)
    device_id = CUDA_DEVICE_ID if cuda_available else None

    # Collect document files (PDF and HTML) from input path
    doc_files, temp_dir = collect_input_files(INPUT_PATH)

    results = []
    try:
        if not doc_files:
            _log.error("No document files found to process")
            # SystemExit works without the site-provided exit() helper and
            # still runs the cleanup in the finally block below.
            raise SystemExit(1)

        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

        # Initialize document converter with multi-format support
        doc_converter = create_multi_format_converter(cuda_device_id=device_id)

        # Process all documents
        for i, doc_path in enumerate(doc_files, 1):
            is_pdf = doc_path.suffix.lower() == '.pdf'
            file_type = "PDF" if is_pdf else "HTML"

            _log.info(f"\n{'='*60}")
            _log.info(f"Processing {file_type} {i}/{len(doc_files)}: {doc_path.name}")
            _log.info(f"{'='*60}")

            success, name, elapsed = process_single_document(doc_path, doc_converter, OUTPUT_DIR, is_pdf=is_pdf)
            results.append((success, name, elapsed, file_type))
    finally:
        # Remove the zip extraction directory even when processing aborts.
        if temp_dir:
            _log.info(f"Cleaning up temp directory: {temp_dir}")
            shutil.rmtree(temp_dir, ignore_errors=True)

    # Summary
    total_time = time.time() - start_time
    successful = sum(1 for r in results if r[0])
    failed = len(results) - successful
    pdf_count = sum(1 for r in results if r[3] == "PDF")
    html_count = sum(1 for r in results if r[3] == "HTML")

    _log.info(f"\n{'='*60}")
    _log.info("PROCESSING COMPLETE")
    _log.info(f"{'='*60}")
    _log.info(f"Total documents: {len(results)} (PDF: {pdf_count}, HTML: {html_count})")
    _log.info(f"Successful: {successful}")
    _log.info(f"Failed: {failed}")
    _log.info(f"Total time: {total_time:.1f}s")
    _log.info(f"Output directory: {OUTPUT_DIR}")