mirage-benchmark 1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mirage-benchmark might be problematic. Click here for more details.
- mirage/__init__.py +83 -0
- mirage/cli.py +150 -0
- mirage/core/__init__.py +52 -0
- mirage/core/config.py +248 -0
- mirage/core/llm.py +1745 -0
- mirage/core/prompts.py +884 -0
- mirage/embeddings/__init__.py +31 -0
- mirage/embeddings/models.py +512 -0
- mirage/embeddings/rerankers_multimodal.py +766 -0
- mirage/embeddings/rerankers_text.py +149 -0
- mirage/evaluation/__init__.py +26 -0
- mirage/evaluation/metrics.py +2223 -0
- mirage/evaluation/metrics_optimized.py +2172 -0
- mirage/pipeline/__init__.py +45 -0
- mirage/pipeline/chunker.py +545 -0
- mirage/pipeline/context.py +1003 -0
- mirage/pipeline/deduplication.py +491 -0
- mirage/pipeline/domain.py +514 -0
- mirage/pipeline/pdf_processor.py +598 -0
- mirage/pipeline/qa_generator.py +798 -0
- mirage/utils/__init__.py +31 -0
- mirage/utils/ablation.py +360 -0
- mirage/utils/preflight.py +663 -0
- mirage/utils/stats.py +626 -0
- mirage_benchmark-1.0.4.dist-info/METADATA +490 -0
- mirage_benchmark-1.0.4.dist-info/RECORD +30 -0
- mirage_benchmark-1.0.4.dist-info/WHEEL +5 -0
- mirage_benchmark-1.0.4.dist-info/entry_points.txt +3 -0
- mirage_benchmark-1.0.4.dist-info/licenses/LICENSE +190 -0
- mirage_benchmark-1.0.4.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,598 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Docling-Based Document Processing and Table Image Extraction
|
|
3
|
+
|
|
4
|
+
Supports: single PDF/HTML file, folder of documents, or zip file containing documents.
|
|
5
|
+
Supported formats: PDF, HTML, XHTML (via Docling library)
|
|
6
|
+
Configuration via config.yaml under pdf_processing section.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
import time
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
import requests
|
|
13
|
+
import base64
|
|
14
|
+
from io import BytesIO
|
|
15
|
+
from PIL import Image
|
|
16
|
+
import matplotlib.pyplot as plt
|
|
17
|
+
import os
|
|
18
|
+
import json
|
|
19
|
+
import zipfile
|
|
20
|
+
import tempfile
|
|
21
|
+
import shutil
|
|
22
|
+
import yaml
|
|
23
|
+
|
|
24
|
+
import pandas as pd
|
|
25
|
+
import torch
|
|
26
|
+
|
|
27
|
+
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
|
|
28
|
+
from docling_core.types.doc.document import DescriptionAnnotation
|
|
29
|
+
from docling.datamodel.base_models import InputFormat
|
|
30
|
+
from docling.datamodel.pipeline_options import (
|
|
31
|
+
PdfPipelineOptions,
|
|
32
|
+
AcceleratorDevice,
|
|
33
|
+
AcceleratorOptions,
|
|
34
|
+
PictureDescriptionApiOptions,
|
|
35
|
+
EasyOcrOptions
|
|
36
|
+
)
|
|
37
|
+
from docling.document_converter import DocumentConverter, PdfFormatOption, HTMLFormatOption
|
|
38
|
+
from docling.utils.export import generate_multimodal_pages
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def load_config(config_path="config.yaml"):
    """Load configuration from a YAML file.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        The parsed YAML content. An empty file (which ``yaml.safe_load``
        parses as ``None``) yields an empty dict, so callers can call
        ``.get(...)`` on the result unconditionally.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        yaml.YAMLError: If the file is not valid YAML.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # locale-dependent default encoding.
    with open(config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
    return {} if config is None else config
|
|
45
|
+
|
|
46
|
+
# Load config once at import time (reads "config.yaml" relative to the current
# working directory); every constant below is derived from it.
CONFIG = load_config()
PDF_CONFIG = CONFIG.get("pdf_processing", {})
BACKEND_CONFIG = CONFIG.get("backend", {})

# --- Configuration from config.yaml (pdf_processing section) ---
# Each value falls back to the literal default when the key is absent.
IMAGE_RESOLUTION_SCALE = PDF_CONFIG.get("image_resolution_scale", 2.0)
INPUT_PATH = PDF_CONFIG.get("input_path", "data/documents")
OUTPUT_DIR = Path(PDF_CONFIG.get("output_dir", "trials/pdf2md/output"))
MODEL_NAME = PDF_CONFIG.get("model_name", "qwen2.5vl:32b")
NUM_THREADS = PDF_CONFIG.get("num_threads", 14)
CUDA_DEVICE_ID = PDF_CONFIG.get("cuda_device_id", 1)

# --- API configuration ---
# Both values are read from environment variables (not from config.yaml) and
# fall back to the defaults below when the variable is unset.
API_KEY_FILE = os.environ.get("GEMINI_API_KEY_PATH", os.path.expanduser("~/.config/gemini/api_key.txt"))
API_URL = os.environ.get("LLM_API_URL", "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent")

# Prompt templates; the "image" and "table" entries are used in this module.
from prompt import PROMPTS_DESC

logging.basicConfig(level=logging.INFO)
_log = logging.getLogger(__name__)


# Supported document extensions (lower-case, including the leading dot)
SUPPORTED_EXTENSIONS = {'.pdf', '.html', '.htm', '.xhtml'}
|
|
72
|
+
|
|
73
|
+
def get_input_format(file_path: Path) -> InputFormat:
    """Return the Docling ``InputFormat`` matching a file's extension.

    Args:
        file_path: Location of the document. A ``Path`` is expected, but a
            plain string is accepted as well. Only the suffix is inspected
            (case-insensitively); the file does not have to exist.

    Returns:
        ``InputFormat.PDF`` for ``.pdf``; ``InputFormat.HTML`` for ``.html``,
        ``.htm`` and ``.xhtml``.

    Raises:
        ValueError: If the extension is none of the above.
    """
    # Normalise to Path so string inputs work the same way they do in
    # collect_input_files().
    ext = Path(file_path).suffix.lower()
    if ext == '.pdf':
        return InputFormat.PDF
    if ext in {'.html', '.htm', '.xhtml'}:
        return InputFormat.HTML
    raise ValueError(f"Unsupported file extension: {ext}")
|
|
82
|
+
|
|
83
|
+
def collect_input_files(input_path):
    """Collect document files (PDF, HTML) from an input path.

    Args:
        input_path: Path (``str`` or ``Path``) to a single document, a folder
            of documents, or a zip file containing documents.

    Returns:
        tuple: ``(doc_files, temp_dir)`` where ``doc_files`` is a list of
        ``Path`` objects sorted by file size (smallest first) and ``temp_dir``
        is the extraction directory when a zip was unpacked, otherwise
        ``None``. The caller is responsible for removing ``temp_dir``.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
        ValueError: If ``input_path`` is a file of an unsupported type, or is
            neither a regular file nor a directory.
    """
    input_path = Path(input_path)
    temp_dir = None

    if not input_path.exists():
        raise FileNotFoundError(f"Input path does not exist: {input_path}")

    def _find_documents(root):
        # Compare the lower-cased suffix so "REPORT.PDF" is found on
        # case-sensitive file systems too (a "*.pdf" glob would miss it), and
        # skip directories whose name merely ends in a supported extension.
        return [
            candidate
            for candidate in Path(root).rglob("*")
            if candidate.is_file() and candidate.suffix.lower() in SUPPORTED_EXTENSIONS
        ]

    if input_path.is_file():
        suffix = input_path.suffix.lower()
        if suffix in SUPPORTED_EXTENSIONS:
            # Single document file
            doc_files = [input_path]
        elif suffix == '.zip':
            # Zip file - extract to temp directory
            temp_dir = tempfile.mkdtemp(prefix="doc_extract_")
            _log.info(f"Extracting zip file to: {temp_dir}")
            try:
                with zipfile.ZipFile(input_path, 'r') as zf:
                    zf.extractall(temp_dir)
                doc_files = _find_documents(temp_dir)
            except BaseException:
                # On failure the caller never receives temp_dir, so it has to
                # be removed here or it would be leaked.
                shutil.rmtree(temp_dir, ignore_errors=True)
                raise
        else:
            raise ValueError(f"Unsupported file type: {input_path.suffix}. Supported: {SUPPORTED_EXTENSIONS}")
    elif input_path.is_dir():
        # Folder - recursively find all supported documents
        doc_files = _find_documents(input_path)
    else:
        raise ValueError(f"Invalid input path: {input_path}")

    # Sort by file size (smallest first) for faster initial feedback; the path
    # is a tie-breaker so the order is deterministic.
    doc_files = sorted(doc_files, key=lambda p: (p.stat().st_size, str(p)))

    # Log counts by format
    pdf_count = sum(1 for f in doc_files if f.suffix.lower() == '.pdf')
    html_count = sum(1 for f in doc_files if f.suffix.lower() in {'.html', '.htm', '.xhtml'})
    _log.info(f"Found {len(doc_files)} document files to process (PDF: {pdf_count}, HTML: {html_count})")
    return doc_files, temp_dir
|
|
131
|
+
|
|
132
|
+
# Backward compatibility alias
|
|
133
|
+
def collect_pdf_files(input_path):
    """Backward compatibility wrapper for collect_input_files.

    Despite the name, this forwards ``input_path`` unchanged to
    ``collect_input_files`` and returns its result as-is, so the returned
    list may contain HTML documents as well as PDFs.
    """
    return collect_input_files(input_path)
|
|
136
|
+
|
|
137
|
+
def motormaven_vlm_options():
    """Build the PictureDescriptionApiOptions for the motormaven endpoint.

    The API key is read from ``API_KEY_FILE`` and sent as a bearer token to
    ``API_URL``; the request names ``MODEL_NAME`` as the model and uses the
    "image" prompt from ``PROMPTS_DESC``.

    Returns:
        A ``PictureDescriptionApiOptions`` instance configured with
        ``timeout=120`` and ``retries=10``.
    """
    # The key file holds the bare token; surrounding whitespace is dropped.
    with open(API_KEY_FILE, 'r') as key_file:
        token = key_file.read().strip()

    request_headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    }
    return PictureDescriptionApiOptions(
        url=API_URL,
        params={"model": MODEL_NAME},
        headers=request_headers,
        prompt=PROMPTS_DESC["image"],
        timeout=120,
        retries=10,
    )
|
|
157
|
+
|
|
158
|
+
def check_cuda_memory(device_id=0):
    """Print a memory usage summary for one CUDA device.

    Args:
        device_id: Index of the CUDA device to inspect.

    Prints total, reserved and allocated memory in GiB, plus two derived
    figures: "free" (reserved minus allocated) and "unreserved" (total minus
    reserved). Prints a single notice instead when CUDA is not available.
    """
    if not torch.cuda.is_available():
        print("CUDA is not available.")
        return

    gib = 1024 ** 3
    total = torch.cuda.get_device_properties(device_id).total_memory
    reserved = torch.cuda.memory_reserved(device_id)
    allocated = torch.cuda.memory_allocated(device_id)

    print(f"Total memory: {total / gib:.2f} GiB")
    print(f"Reserved memory: {reserved / gib:.2f} GiB")
    print(f"Allocated memory: {allocated / gib:.2f} GiB")
    print(f"Free memory: {(reserved - allocated) / gib:.2f} GiB (within reserved)")
    print(f"Unreserved memory: {(total - reserved) / gib:.2f} GiB")
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def is_bbox_inside(inner_bbox, outer_bbox):
    """Check whether inner_bbox lies entirely within outer_bbox.

    Args:
        inner_bbox: Object with numeric ``l``, ``t``, ``r`` and ``b``
            attributes (left, top, right, bottom).
        outer_bbox: Object with the same attributes.

    Returns:
        bool: True when the inner box is contained in the outer box; boxes
        that touch the outer edges count as inside.

    The vertical test compares the extents spanned by ``t`` and ``b`` rather
    than ``t`` and ``b`` directly, so the result is the same whether the
    coordinate origin is bottom-left (``b < t``) or top-left (``t < b``).
    Both boxes must use the same origin.
    """
    inner_low, inner_high = sorted((inner_bbox.t, inner_bbox.b))
    outer_low, outer_high = sorted((outer_bbox.t, outer_bbox.b))
    return (inner_bbox.l >= outer_bbox.l and
            inner_bbox.r <= outer_bbox.r and
            inner_low >= outer_low and
            inner_high <= outer_high)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def get_pictures_inside_tables(conv_res):
    """Return the set of picture indices that are inside tables.

    Args:
        conv_res: Conversion result whose ``document`` exposes ``tables`` and
            ``pictures``; each item carries ``prov`` entries with ``page_no``
            and ``bbox``.

    Returns:
        set: Positions (in ``conv_res.document.pictures``) of the pictures
        that have at least one provenance box contained in a table box on the
        same page.
    """
    # Group every table bounding box under the page it appears on.
    tables_on_page = {}
    for table in conv_res.document.tables:
        for prov in table.prov or ():
            tables_on_page.setdefault(prov.page_no, []).append(prov.bbox)

    nested = set()
    for index, picture in enumerate(conv_res.document.pictures):
        for prov in picture.prov or ():
            # Only tables on the picture's own page can contain it.
            candidates = tables_on_page.get(prov.page_no, ())
            if any(is_bbox_inside(prov.bbox, table_bbox) for table_bbox in candidates):
                nested.add(index)
                print(f"Picture {index} on page {prov.page_no} is inside a table - will skip annotation")

    return nested
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def make_api_call_with_retries(payload, api_url, headers, item_type, item_index, output_dir, max_retries=10):
    """POST a JSON payload, retrying transient failures with exponential backoff.

    Args:
        payload: JSON-serialisable request body.
        api_url: Endpoint URL.
        headers: HTTP headers to send.
        item_type: Label used in progress messages (e.g. "picture", "table").
        item_index: Index of the item, used in progress messages.
        output_dir: Unused; kept for backward compatibility (saving of
            individual responses has been disabled).
        max_retries: Maximum number of retries after the first attempt.
            ``None`` retries indefinitely.

    Returns:
        The ``requests.Response`` on HTTP 200, or ``None`` when the request is
        skipped (non-retryable status) or the retry budget is exhausted.

    Retries happen on network errors raised by ``requests``, on HTTP 5xx and
    on HTTP 429 (rate limiting). Other 4xx responses and any remaining
    non-200 status are not retried. Exceptions that do not originate from
    ``requests`` propagate to the caller instead of being retried.
    """
    retry_count = 0
    while True:
        try:
            response = requests.post(api_url, json=payload, headers=headers, timeout=120)
        except requests.RequestException as e:
            # Network/connection errors - retry
            failure = f"Network error for {item_type} {item_index}: {str(e)}"
        else:
            status = response.status_code
            if status == 200:
                return response
            if status == 429:
                # Rate limiting is transient, unlike the other 4xx codes.
                failure = f"Rate limited for {item_type} {item_index}: HTTP {status}"
            elif 400 <= status < 500:
                # Client error - skip
                print(f"Client error for {item_type} {item_index}: HTTP {status} - skipping")
                return None
            elif status >= 500:
                # Server error - retry with exponential backoff
                failure = f"Server error for {item_type} {item_index}: HTTP {status}"
            else:
                print(f"Unexpected status for {item_type} {item_index}: HTTP {status} - skipping")
                return None

        retry_count += 1
        if max_retries is not None and retry_count > max_retries:
            # Bounded retries keep a permanently failing endpoint from
            # blocking the whole run forever.
            print(f"{failure} - giving up after {max_retries} retries")
            return None

        wait_time = min(60, 2 ** min(retry_count, 6))  # Cap at 60 seconds
        print(f"{failure} - retrying in {wait_time}s (attempt {retry_count})")
        time.sleep(wait_time)
|
|
253
|
+
|
|
254
|
+
def annotate_items_with_images(conv_res, model_name=MODEL_NAME, api_url=API_URL, max_tokens=1000, use_batch=True):
    """Annotate pictures and tables that have no annotations yet.

    Pictures located inside tables are determined first and excluded from
    annotation; the actual work is delegated to the batch or the sequential
    implementation.

    Args:
        conv_res: Conversion result from docling
        model_name: VLM model name
        api_url: API endpoint URL (only passed to the sequential path)
        max_tokens: Max tokens for response (accepted but not used here)
        use_batch: If True, use batch processing for faster annotation

    Returns:
        Set of picture indices that are inside tables (skipped).
    """
    nested_pictures = get_pictures_inside_tables(conv_res)

    if not use_batch:
        return _annotate_items_sequential(conv_res, model_name, api_url, nested_pictures)
    return _annotate_items_batch(conv_res, model_name, nested_pictures)
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def _annotate_items_batch(conv_res, model_name, pictures_to_skip):
    """Annotate pictures and tables with a single batched VLM call.

    Args:
        conv_res: Conversion result whose document pictures/tables are
            annotated in place.
        model_name: Recorded as the provenance of each added annotation.
        pictures_to_skip: Indices of pictures that must not be annotated.

    Returns:
        ``pictures_to_skip``, unchanged.
    """
    from call_llm import batch_call_vlm_base64

    # One row per item family: (item type, document attribute, prompt key,
    # accessor that yields the data-URI text holding the base64 payload).
    families = (
        ('picture', 'pictures', "image", lambda element: str(element.image.uri)),
        ('table', 'tables', "table", lambda element: str(element.image.uri.path)),
    )

    queued = []   # (prompt, base64, mime_type) triples sent to the VLM
    targets = []  # parallel to `queued`: (item type, index, item)

    for item_type, attr, prompt_key, uri_text in families:
        label = item_type.capitalize()
        for idx, element in enumerate(getattr(conv_res.document, attr)):
            if item_type == 'picture' and idx in pictures_to_skip:
                print(f"{label} {idx}: Skipping - inside a table")
                continue
            if element.annotations:
                # Already annotated - nothing to do.
                continue
            try:
                encoded = None
                if element.image and element.image.uri:
                    # The payload is whatever follows the first comma.
                    encoded = uri_text(element).split(',')[1]
                if not encoded:
                    print(f"{label} {idx}: No base64 data available")
                    continue
                print(f"{label} {idx}: Queued for batch (base64 len: {len(encoded)})")
                queued.append((PROMPTS_DESC[prompt_key], encoded, "image/png"))
                targets.append((item_type, idx, element))
            except Exception as e:
                print(f"Error preparing {item_type} {idx}: {str(e)}")

    if not queued:
        print("No items to annotate")
        return pictures_to_skip

    print(f"\n⚡ Batch annotating {len(queued)} items...")
    replies = batch_call_vlm_base64(queued, show_progress=True)

    succeeded = 0
    for (item_type, idx, element), reply in zip(targets, replies):
        if not reply or reply.startswith("ERROR:"):
            print(f"❌ Failed to annotate {item_type} {idx}: {reply[:100] if reply else 'No response'}")
            continue
        element.annotations.append(
            DescriptionAnnotation(kind='description', text=reply, provenance=model_name)
        )
        print(f"✅ Added annotation to {item_type} {idx}")
        succeeded += 1

    print(f"\n📊 Batch annotation complete: {succeeded}/{len(queued)} successful")
    return pictures_to_skip
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
def _annotate_items_sequential(conv_res, model_name, api_url, pictures_to_skip):
    """Annotate pictures and tables one request at a time (fallback path).

    Args:
        conv_res: Conversion result whose document pictures/tables are
            annotated in place.
        model_name: Model named in each request and recorded as provenance.
        api_url: Endpoint each request is posted to.
        pictures_to_skip: Indices of pictures that must not be annotated.

    Returns:
        ``pictures_to_skip``, unchanged.
    """
    # The API key is read once and sent as a bearer token with every request.
    with open(API_KEY_FILE, 'r') as key_file:
        token = key_file.read().strip()
    auth_headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    }

    # One row per item family: (item type, document attribute, prompt key,
    # accessor that yields the data-URI text holding the base64 payload).
    families = (
        ('picture', 'pictures', "image", lambda element: str(element.image.uri)),
        ('table', 'tables', "table", lambda element: str(element.image.uri.path)),
    )

    for item_type, attr, prompt_key, uri_text in families:
        label = item_type.capitalize()
        for idx, element in enumerate(getattr(conv_res.document, attr)):
            if item_type == 'picture' and idx in pictures_to_skip:
                print(f"{label} {idx}: Skipping - inside a table")
                continue
            if element.annotations:
                # Already annotated - nothing to do.
                continue
            try:
                encoded = None
                if element.image and element.image.uri:
                    # The payload is whatever follows the first comma.
                    encoded = uri_text(element).split(',')[1]
                if not encoded:
                    print(f"{label} {idx}: No base64 data available")
                    continue

                print(f"{label} {idx}: Using base64 data, length: {len(encoded)}")

                image_part = {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded}"}}
                text_part = {"type": "text", "text": PROMPTS_DESC[prompt_key]}
                request_body = {
                    "model": model_name,
                    "messages": [{"role": "user", "content": [text_part, image_part]}],
                }

                reply = make_api_call_with_retries(request_body, api_url, auth_headers, item_type, idx, OUTPUT_DIR)

                if reply and reply.status_code == 200:
                    parsed = reply.json()
                    first_choice = parsed.get('choices', [{}])[0]
                    description = first_choice.get('message', {}).get('content', 'No description available')
                    element.annotations.append(
                        DescriptionAnnotation(kind='description', text=description, provenance=model_name)
                    )
                    print(f"Added annotation to {item_type} {idx}")
                else:
                    print(f"Failed to annotate {item_type} {idx}: HTTP {reply.status_code if reply else 'No response'}")
            except Exception as e:
                print(f"Error annotating {item_type} {idx}: {str(e)}")

    return pictures_to_skip
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
def configure_pipeline_options(model_name: str = "granite3.2-vision:latest", cuda_device_id: int = None):
    """Build the PdfPipelineOptions used for PDF conversion.

    Args:
        model_name: Accepted for API compatibility; not used by this function.
        cuda_device_id: CUDA device index to pin the accelerator to. When
            ``None`` the accelerator device is chosen automatically.

    Returns:
        A ``PdfPipelineOptions`` instance with OCR, table structure, code and
        formula enrichment and remote picture description enabled.
    """
    opts = PdfPipelineOptions()

    settings = {
        "enable_remote_services": True,
        "images_scale": IMAGE_RESOLUTION_SCALE,
        "generate_page_images": False,       # set True for debugging
        "generate_parsed_pages": False,      # set True for debugging
        "generate_picture_images": True,
        "do_picture_classification": False,  # kept off to avoid CUDA OOM
        "do_picture_description": True,      # set False to avoid CUDA OOM
        "do_ocr": True,
        "ocr_options": EasyOcrOptions(),
        "do_code_enrichment": True,
        "do_formula_enrichment": True,
        "do_table_structure": True,
        "generate_table_images": True,
    }
    for option_name, option_value in settings.items():
        setattr(opts, option_name, option_value)
    opts.table_structure_options.do_cell_matching = True

    print(f"DEBUG: Pipeline options - generate_table_images: {opts.generate_table_images}")
    print(f"DEBUG: Pipeline options - do_table_structure: {opts.do_table_structure}")

    # Pin the accelerator to one CUDA device when requested, else auto-select.
    if cuda_device_id is not None:
        print(f"DEBUG: Using CUDA device {cuda_device_id}")
        device = f"cuda:{cuda_device_id}"
    else:
        device = AcceleratorDevice.AUTO
    opts.accelerator_options = AcceleratorOptions(num_threads=NUM_THREADS, device=device)

    # Picture descriptions are produced through the motormaven API endpoint.
    opts.picture_description_options = motormaven_vlm_options()

    print("DEBUG: Using manual table annotation (no built-in table description options)")

    return opts
|
|
466
|
+
|
|
467
|
+
def process_single_document(doc_path, doc_converter, output_dir, is_pdf=True):
    """Process a single document file (PDF or HTML) and save outputs.

    For each document a sub-directory named after the file stem is created
    under ``output_dir``. It receives a ``tables/`` folder with one PNG per
    table, a ``ref_artifacts/`` folder for referenced images, and a
    ``<stem>_ref.md`` markdown export.

    Args:
        doc_path: Path to the document file (``str`` or ``Path``)
        doc_converter: DocumentConverter instance
        output_dir: Output directory path (``str`` or ``Path``)
        is_pdf: Whether the document is a PDF (affects annotation behavior)

    Returns:
        tuple: ``(success, name, elapsed_seconds)``. ``name`` is the document
        stem on success and the file name on failure. Errors are logged and
        reported through ``success`` rather than raised.
    """
    # Normalise both paths up front so string arguments work, including on
    # the failure path below which needs ``doc_path.name``.
    doc_path = Path(doc_path)
    output_dir = Path(output_dir)
    doc_start = time.time()

    try:
        conv_res = doc_converter.convert(str(doc_path))
        doc_filename = conv_res.input.file.stem

        # Create per-document output directory
        doc_output_dir = output_dir / doc_filename
        doc_output_dir.mkdir(parents=True, exist_ok=True)

        # Annotate images and tables (only for PDFs with complex layouts)
        # HTML files typically have simpler structure
        if is_pdf:
            annotate_items_with_images(conv_res)

        # Create folders for images
        tables_dir = doc_output_dir / "tables"
        tables_dir.mkdir(parents=True, exist_ok=True)

        # Save table images
        table_counter = 0
        for element, _level in conv_res.document.iterate_items():
            if isinstance(element, TableItem):
                table_counter += 1
                element_image_filename = tables_dir / f"{doc_filename}-table-{table_counter}.png"
                _log.info(f"Table {element.self_ref} - Caption: {element.caption_text(doc=conv_res.document)}")
                try:
                    # Fetch the image before opening the file so that a
                    # missing image does not leave a zero-byte PNG behind.
                    # NOTE(review): assumes get_image may return None when no
                    # image was generated - confirm against docling.
                    table_image = element.get_image(conv_res.document)
                    if table_image is None:
                        raise ValueError("no image available for this table")
                    with element_image_filename.open("wb") as fp:
                        table_image.save(fp, "PNG")
                except Exception as e:
                    _log.warning(f"Could not save table image: {e}")

        # Create artifacts directory for referenced images
        artifacts_dir = doc_output_dir / "ref_artifacts"
        artifacts_dir.mkdir(parents=True, exist_ok=True)

        # Save markdown with externally referenced pictures
        md_filename = doc_output_dir / f"{doc_filename}_ref.md"
        conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.REFERENCED, artifacts_dir=artifacts_dir)

        elapsed = time.time() - doc_start
        file_type = "PDF" if is_pdf else "HTML"
        _log.info(f"✅ Processed {file_type} {doc_filename} in {elapsed:.1f}s")
        return True, doc_filename, elapsed

    except Exception as e:
        # Best effort: one broken document must not abort a whole batch.
        elapsed = time.time() - doc_start
        _log.error(f"❌ Failed to process {doc_path.name}: {str(e)}")
        return False, doc_path.name, elapsed
|
|
525
|
+
|
|
526
|
+
# Backward compatibility alias
|
|
527
|
+
def process_single_pdf(pdf_path, doc_converter, output_dir):
    """Backward compatibility wrapper for process_single_document.

    Forwards all arguments to ``process_single_document`` with ``is_pdf=True``
    and returns its result unchanged.
    """
    return process_single_document(pdf_path, doc_converter, output_dir, is_pdf=True)
|
|
530
|
+
|
|
531
|
+
|
|
532
|
+
def create_multi_format_converter(cuda_device_id=None):
    """Create a DocumentConverter that handles both PDF and HTML input.

    Args:
        cuda_device_id: Forwarded to ``configure_pipeline_options`` to select
            the CUDA device for the PDF pipeline.

    Returns:
        A ``DocumentConverter`` whose PDF format uses the options built by
        ``configure_pipeline_options`` and whose HTML format uses defaults.
    """
    pdf_format = PdfFormatOption(
        pipeline_options=configure_pipeline_options(cuda_device_id=cuda_device_id)
    )
    per_format_options = {
        InputFormat.PDF: pdf_format,          # full processing pipeline
        InputFormat.HTML: HTMLFormatOption(),  # default options
    }
    return DocumentConverter(format_options=per_format_options)
|
|
545
|
+
|
|
546
|
+
|
|
547
|
+
if __name__ == "__main__":
    # Script entry point: convert every document found under INPUT_PATH and
    # log a summary. Exits with status 1 when no documents are found.
    start_time = time.time()

    # Clearing CUDA caches only makes sense when a GPU is present; the guard
    # keeps CPU-only hosts from failing before any document is processed.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    check_cuda_memory(CUDA_DEVICE_ID)

    # Collect document files (PDF and HTML) from input path
    doc_files, temp_dir = collect_input_files(INPUT_PATH)

    results = []
    try:
        if not doc_files:
            _log.error("No document files found to process")
            raise SystemExit(1)

        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

        # Initialize document converter with multi-format support
        doc_converter = create_multi_format_converter(cuda_device_id=CUDA_DEVICE_ID)

        # Process all documents
        for i, doc_path in enumerate(doc_files, 1):
            is_pdf = doc_path.suffix.lower() == '.pdf'
            file_type = "PDF" if is_pdf else "HTML"

            _log.info(f"\n{'='*60}")
            _log.info(f"Processing {file_type} {i}/{len(doc_files)}: {doc_path.name}")
            _log.info(f"{'='*60}")

            success, name, elapsed = process_single_document(doc_path, doc_converter, OUTPUT_DIR, is_pdf=is_pdf)
            results.append((success, name, elapsed, file_type))
    finally:
        # Cleanup temp directory if created from zip. Running this in
        # `finally` also covers the early exit above and unexpected errors.
        if temp_dir:
            _log.info(f"Cleaning up temp directory: {temp_dir}")
            shutil.rmtree(temp_dir, ignore_errors=True)

    # Summary
    total_time = time.time() - start_time
    successful = sum(1 for r in results if r[0])
    failed = len(results) - successful
    pdf_count = sum(1 for r in results if r[3] == "PDF")
    html_count = sum(1 for r in results if r[3] == "HTML")

    _log.info(f"\n{'='*60}")
    _log.info("PROCESSING COMPLETE")
    _log.info(f"{'='*60}")
    _log.info(f"Total documents: {len(results)} (PDF: {pdf_count}, HTML: {html_count})")
    _log.info(f"Successful: {successful}")
    _log.info(f"Failed: {failed}")
    _log.info(f"Total time: {total_time:.1f}s")
    _log.info(f"Output directory: {OUTPUT_DIR}")
|