mirage-benchmark 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


mirage/core/llm.py ADDED
@@ -0,0 +1,1745 @@
1
+
2
+ import time
3
+ import re
4
+ import requests
5
+ import logging
6
+ import base64
7
+ import asyncio
8
+ import aiohttp
9
+ from pathlib import Path
10
+ from typing import List, Tuple, Dict, Optional, Any, Callable
11
+ import sys
12
+ import os
13
+ from concurrent.futures import ThreadPoolExecutor
14
+ from functools import partial
15
+
16
+ # ============================================================================
17
+ # CONFIGURATION - Lazy loading to allow import without config file
18
+ # ============================================================================
19
+
20
+ # Default values - actual values loaded lazily when needed
21
+ _config_initialized = False
22
+ BACKEND = os.environ.get("LLM_BACKEND", "GEMINI")
23
+ API_URL = ""
24
+ LLM_MODEL_NAME = ""
25
+ VLM_MODEL_NAME = ""
26
+ API_KEY = ""
27
+ GEMINI_RPM = int(os.environ.get("GEMINI_RPM", "60"))
28
+ GEMINI_BURST = int(os.environ.get("GEMINI_BURST", "15"))
29
+ LOG_FILE = os.environ.get("LOG_FILE", "output/pipeline.log")
30
+ TERMINAL_LOG_FILE = os.environ.get("TERMINAL_LOG_FILE", "output/terminal_pipeline.log")
31
+ HEADERS = {"Content-Type": "application/json"}
32
+ GEMINI_URL = "https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent"
33
+
34
+ # Default URLs and Models
35
+ _DEFAULT_URLS = {
36
+ "OLLAMA": "http://127.0.0.1:11434/api/chat",
37
+ "GEMINI": "https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent",
38
+ "OPENAI": "https://api.openai.com/v1/chat/completions"
39
+ }
40
+
41
+ _DEFAULT_MODELS = {
42
+ "OLLAMA": ("llama3.1:8b", "llava:13b"),
43
+ "GEMINI": ("gemini-2.0-flash", "gemini-2.0-flash"),
44
+ "OPENAI": ("gpt-4o-mini", "gpt-4o")
45
+ }
46
+
47
+
48
+ def _initialize_config():
49
+ """Initialize configuration lazily on first use.
50
+
51
+ This is called before any LLM/VLM call to ensure config is loaded.
52
+ Allows the module to be imported without a config file.
53
+ """
54
+ global _config_initialized, BACKEND, API_URL, LLM_MODEL_NAME, VLM_MODEL_NAME
55
+ global API_KEY, GEMINI_RPM, GEMINI_BURST, LOG_FILE, TERMINAL_LOG_FILE, HEADERS
56
+
57
+ if _config_initialized:
58
+ return
59
+
60
+ try:
61
+ from mirage.core.config import get_backend_config, get_api_key, get_rate_limit_config, get_paths_config
62
+
63
+ _backend_cfg = get_backend_config()
64
+ _rate_cfg = get_rate_limit_config()
65
+ _paths_cfg = get_paths_config()
66
+
67
+ BACKEND = _backend_cfg['name']
68
+ API_URL = _backend_cfg.get('url', _DEFAULT_URLS.get(BACKEND, ''))
69
+ LLM_MODEL_NAME = _backend_cfg.get('llm_model', _DEFAULT_MODELS.get(BACKEND, ('', ''))[0])
70
+ VLM_MODEL_NAME = _backend_cfg.get('vlm_model', _DEFAULT_MODELS.get(BACKEND, ('', ''))[1])
71
+ API_KEY = get_api_key()
72
+
73
+ # Rate limiting from config
74
+ GEMINI_RPM = _rate_cfg.get('requests_per_minute', 60)
75
+ GEMINI_BURST = _rate_cfg.get('burst_size', 15)
76
+
77
+ # Auto-generate log file names from dataset name and LLM model
78
+ _output_dir = _paths_cfg.get('output_dir', 'output')
79
+ _input_pdf_dir = _paths_cfg.get('input_pdf_dir', 'data/documents')
80
+ _dataset_name = Path(_input_pdf_dir).name
81
+ _log_basename = f"{_dataset_name}_{LLM_MODEL_NAME}.log"
82
+
83
+ LOG_FILE = os.path.join(_output_dir, _log_basename)
84
+ TERMINAL_LOG_FILE = os.path.join(_output_dir, f"terminal_{_log_basename}")
85
+
86
+ except Exception:
87
+ # Use environment variables and defaults
88
+ BACKEND = os.environ.get("LLM_BACKEND", "GEMINI")
89
+ API_URL = _DEFAULT_URLS.get(BACKEND, _DEFAULT_URLS["GEMINI"])
90
+ LLM_MODEL_NAME, VLM_MODEL_NAME = _DEFAULT_MODELS.get(BACKEND, _DEFAULT_MODELS["GEMINI"])
91
+
92
+ # Try to load API key from environment
93
+ API_KEY = os.environ.get("GEMINI_API_KEY", "") or os.environ.get("OPENAI_API_KEY", "")
94
+
95
+ # Set headers based on backend
96
+ if BACKEND == "GEMINI":
97
+ HEADERS = {"Content-Type": "application/json"}
98
+ elif BACKEND == "OLLAMA":
99
+ HEADERS = {"Content-Type": "application/json"}
100
+ else:
101
+ HEADERS = {"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"}
102
+
103
+ _config_initialized = True
104
+
105
+
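+ # Illustrative sketch (assumes the environment-variable fallback above, with no
+ # mirage.core.config present): a minimal setup could be
+ #
+ #   export LLM_BACKEND=OPENAI        # or GEMINI / OLLAMA
+ #   export OPENAI_API_KEY=<your key>
+ #   export GEMINI_RPM=30             # optional rate-limit overrides
+ #   export GEMINI_BURST=5
+ #
+ # Importing this module never triggers the load; _initialize_config() runs
+ # lazily inside each call_* helper on first use.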
106
+ def test_llm_connection() -> bool:
107
+ """Test LLM API connection."""
108
+ _initialize_config()
109
+ print(f"Testing LLM connection to {BACKEND}...")
110
+ try:
111
+ response = call_llm_simple("Say 'Hello' in one word.")
112
+ print(f"LLM connection successful: {response[:50]}...")
113
+ return True
114
+ except Exception as e:
115
+ print(f"LLM connection failed: {e}")
116
+ return False
117
+
118
+ # ============================================================================
119
+ # CORE UTILITY FUNCTIONS
120
+ # ============================================================================
121
+
122
+ class TeeOutput:
123
+ """Tee output to both console and file"""
124
+ def __init__(self, file_path, stream):
125
+ self.file = open(file_path, 'a', encoding='utf-8')
126
+ self.stream = stream
127
+ self.encoding = getattr(stream, 'encoding', 'utf-8')
128
+
129
+ def write(self, data):
130
+ self.stream.write(data)
131
+ self.file.write(data)
132
+ self.file.flush()
133
+
134
+ def flush(self):
135
+ self.stream.flush()
136
+ self.file.flush()
137
+
138
+ def close(self):
139
+ self.file.close()
140
+
141
+
142
+ def setup_logging(enable_terminal_log=True):
143
+ """Setup logging to file and console, optionally capture all terminal output"""
144
+ # Create logs directory if it doesn't exist
145
+ Path(LOG_FILE).parent.mkdir(parents=True, exist_ok=True)
146
+ Path(TERMINAL_LOG_FILE).parent.mkdir(parents=True, exist_ok=True)
147
+
148
+ # Setup logging
149
+ logging.basicConfig(
150
+ level=logging.INFO,
151
+ format='%(asctime)s - %(message)s',
152
+ handlers=[
153
+ logging.FileHandler(LOG_FILE, encoding='utf-8'),
154
+ logging.StreamHandler()
155
+ ]
156
+ )
157
+
158
+ # Capture all terminal output (stdout and stderr) to terminal log file
159
+ if enable_terminal_log:
160
+ # Clear the terminal log file at start
161
+ with open(TERMINAL_LOG_FILE, 'w', encoding='utf-8') as f:
162
+ f.write(f"=== Terminal Log Started ===\n")
163
+
164
+ sys.stdout = TeeOutput(TERMINAL_LOG_FILE, sys.__stdout__)
165
+ sys.stderr = TeeOutput(TERMINAL_LOG_FILE, sys.__stderr__)
166
+ print(f"📝 Terminal output being captured to: {TERMINAL_LOG_FILE}")
167
+
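+ # Usage sketch: calling setup_logging() once at startup routes logging to
+ # LOG_FILE and, because sys.stdout/sys.stderr are replaced with TeeOutput,
+ # every print() is also copied to TERMINAL_LOG_FILE.
+ #
+ #   setup_logging(enable_terminal_log=True)
+ #   print("appears on the console and in the terminal log")
+ #   logging.info("goes to LOG_FILE and the console")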
168
+ def get_image_mime_type(image_path: str) -> str:
169
+ """Get MIME type based on file extension"""
170
+ ext = Path(image_path).suffix.lower()
171
+ mime_types = {
172
+ '.png': 'image/png',
173
+ '.jpg': 'image/jpeg',
174
+ '.jpeg': 'image/jpeg',
175
+ '.gif': 'image/gif',
176
+ '.bmp': 'image/bmp',
177
+ '.webp': 'image/webp'
178
+ }
179
+ return mime_types.get(ext, 'image/png') # Default to PNG if unknown
180
+
181
+ def encode_image_to_base64(image_path: str) -> str:
182
+ """Encode image file to base64 string"""
183
+ with open(image_path, "rb") as f:
184
+ return base64.b64encode(f.read()).decode('utf-8')
185
+
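+ # Minimal sketch of how these two helpers are combined later in this module
+ # (e.g. for Gemini inline_data parts); "figure.png" is a hypothetical path:
+ #
+ #   b64 = encode_image_to_base64("figure.png")
+ #   part = {"inline_data": {"mime_type": get_image_mime_type("figure.png"), "data": b64}}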
186
+ # ============================================================================
187
+ # VLM INTERACTION FUNCTIONS
188
+ # ============================================================================
189
+
190
+ def test_vlm_connection(test_image: str = None) -> bool:
191
+ """Test VLM API connection with a sample image
192
+
193
+ Args:
194
+ test_image: Path to a test image file. If None, returns True (skips image test).
195
+ """
196
+ print("🔍 Testing VLM API connection...")
197
+ try:
198
+ if test_image is None:
199
+ print("⚠️ No test image provided, skipping VLM image test")
200
+ return True
201
+
202
+ if not Path(test_image).exists():
203
+ print(f"❌ Test image not found: {test_image}")
204
+ return False
205
+
206
+ call_vlm_simple("Describe this image briefly.", test_image)
207
+ print("✅ VLM API connection successful!")
208
+ return True
209
+ except Exception as e:
210
+ print(f"❌ VLM API connection error: {e}")
211
+ return False
212
+
213
+ def call_llm_simple(prompt: str) -> str:
214
+ """Simple LLM call with text-only input. Supports OLLAMA, GEMINI, OPENAI."""
215
+ _initialize_config()
216
+ print(f"Calling LLM (text-only) via {BACKEND}...")
217
+ attempt = 0
218
+ wait_time = 2
219
+
220
+ # Log LLM request
221
+ content_preview = f"Content: {prompt[:50]}...{prompt[-50:]}" if len(prompt) > 100 else f"Content: {prompt}"
222
+ logging.info(f"LLM Request [{BACKEND}] - {content_preview}")
223
+
224
+ while True:
225
+ attempt += 1
226
+ try:
227
+ if BACKEND == "OLLAMA":
228
+ # Ollama API format
229
+ data = {
230
+ "model": LLM_MODEL_NAME,
231
+ "messages": [{"role": "user", "content": prompt}],
232
+ "stream": False,
233
+ "options": {"temperature": 0.0}
234
+ }
235
+ response = requests.post(API_URL, json=data, timeout=300)
236
+ if response.status_code == 200:
237
+ result = response.json()["message"]["content"]
238
+ else:
239
+ raise Exception(f"HTTP {response.status_code}")
240
+
241
+ elif BACKEND == "GEMINI":
242
+ # Google Gemini direct API format
243
+ url = GEMINI_URL.format(model=LLM_MODEL_NAME) + f"?key={API_KEY}"
244
+ data = {
245
+ "contents": [{"parts": [{"text": prompt}]}],
246
+ "generationConfig": {"temperature": 0.0}
247
+ }
248
+ response = requests.post(url, headers=HEADERS, json=data, timeout=300)
249
+ if response.status_code == 200:
250
+ resp_json = response.json()
251
+ result = resp_json["candidates"][0]["content"]["parts"][0]["text"]
252
+ else:
253
+ error_msg = response.text[:200] if response.text else f"HTTP {response.status_code}"
254
+ raise Exception(error_msg)
255
+
256
+ elif BACKEND == "OPENAI":
257
+ # OpenAI API format
258
+ data = {
259
+ "model": LLM_MODEL_NAME,
260
+ "messages": [{"role": "user", "content": prompt}],
261
+ "temperature": 0.0
262
+ }
263
+ response = requests.post(API_URL, headers=HEADERS, json=data, timeout=300)
264
+ if response.status_code == 200:
265
+ result = response.json()["choices"][0]["message"]["content"]
266
+ else:
267
+ raise Exception(f"HTTP {response.status_code}")
268
+
269
+ else: # OpenAI-compatible API
270
+ data = {
271
+ "model": LLM_MODEL_NAME,
272
+ "messages": [{"role": "user", "content": prompt}]
273
+ }
274
+ response = requests.post(API_URL, headers=HEADERS, json=data, timeout=300)
275
+ if response.status_code == 200:
276
+ response_data = response.json()
277
+ result = response_data["choices"][0]["message"]["content"]
278
+ else:
279
+ raise Exception(f"HTTP {response.status_code}")
280
+
281
+ print(f"✅ LLM response received ({len(result)} chars)")
282
+ logging.info(f"LLM Response (Complete) - {len(result)} chars")
283
+ logging.info(f"LLM Response Content: {result}")
284
+ logging.info("-" * 60)
285
+ return result
286
+
287
+ except Exception as e:
288
+ print(f"⚠️ LLM call error (attempt {attempt}): {e}")
289
+
290
+ # Wait with exponential backoff (capped at 60 seconds)
291
+ print(f" Waiting {wait_time}s before retry...")
292
+ time.sleep(wait_time)
293
+ wait_time = min(wait_time * 2, 60)
294
+
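+ # Usage sketch: call_llm_simple blocks until a response arrives; on failure it
+ # retries indefinitely with exponential backoff (2 s, 4 s, ... capped at 60 s),
+ # so callers needing a bounded wait should wrap it themselves.
+ #
+ #   answer = call_llm_simple("Summarize IEC 61800-9-2 in one sentence.")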
295
+ def call_vlm_simple(prompt: str, image_path: str) -> str:
296
+ """Simple VLM call with single image. Supports OLLAMA, GEMINI, OPENAI."""
297
+ _initialize_config()
298
+ print(f"Calling VLM (simple) via {BACKEND}...")
299
+ attempt = 0
300
+ wait_time = 2
301
+
302
+ # Log VLM request
303
+ content_preview = f"Content: {prompt[:50]}...{prompt[-50:]}" if len(prompt) > 100 else f"Content: {prompt}"
304
+ logging.info(f"VLM Request [{BACKEND}] - Image: {image_path}, {content_preview}")
305
+
306
+ while True:
307
+ attempt += 1
308
+ try:
309
+ base64_image = encode_image_to_base64(image_path)
310
+ mime_type = get_image_mime_type(image_path)
311
+
312
+ if BACKEND == "OLLAMA":
313
+ # Ollama VLM format - images as base64 list
314
+ data = {
315
+ "model": VLM_MODEL_NAME,
316
+ "messages": [{"role": "user", "content": prompt, "images": [base64_image]}],
317
+ "stream": False,
318
+ "options": {"temperature": 0.0}
319
+ }
320
+ response = requests.post(API_URL, json=data, timeout=300)
321
+ if response.status_code == 200:
322
+ result = response.json()["message"]["content"]
323
+ else:
324
+ raise Exception(f"HTTP {response.status_code}")
325
+
326
+ elif BACKEND == "GEMINI":
327
+ # Google Gemini direct API format with inline image
328
+ url = GEMINI_URL.format(model=VLM_MODEL_NAME) + f"?key={API_KEY}"
329
+ data = {
330
+ "contents": [{
331
+ "parts": [
332
+ {"text": prompt},
333
+ {"inline_data": {"mime_type": mime_type, "data": base64_image}}
334
+ ]
335
+ }],
336
+ "generationConfig": {"temperature": 0.0}
337
+ }
338
+ response = requests.post(url, headers=HEADERS, json=data, timeout=300)
339
+ if response.status_code == 200:
340
+ resp_json = response.json()
341
+ result = resp_json["candidates"][0]["content"]["parts"][0]["text"]
342
+ else:
343
+ error_msg = response.text[:200] if response.text else f"HTTP {response.status_code}"
344
+ raise Exception(error_msg)
345
+
346
+ elif BACKEND == "OPENAI":
347
+ # OpenAI Vision API format
348
+ image_url = f"data:{mime_type};base64,{base64_image}"
349
+ data = {
350
+ "model": VLM_MODEL_NAME,
351
+ "messages": [{
352
+ "role": "user",
353
+ "content": [
354
+ {"type": "text", "text": prompt},
355
+ {"type": "image_url", "image_url": {"url": image_url}}
356
+ ]
357
+ }],
358
+ "temperature": 0.0
359
+ }
360
+ response = requests.post(API_URL, headers=HEADERS, json=data, timeout=300)
361
+ if response.status_code == 200:
362
+ result = response.json()["choices"][0]["message"]["content"]
363
+ else:
364
+ raise Exception(f"HTTP {response.status_code}")
365
+
366
+ else: # OpenAI-compatible API
367
+ image_url = f"data:{mime_type};base64,{base64_image}"
368
+ data = {
369
+ "model": VLM_MODEL_NAME,
370
+ "messages": [{
371
+ "role": "user",
372
+ "content": [
373
+ {"type": "text", "text": prompt},
374
+ {"type": "image_url", "image_url": {"url": image_url}}
375
+ ]
376
+ }]
377
+ }
378
+ response = requests.post(API_URL, headers=HEADERS, json=data, timeout=300)
379
+ if response.status_code == 200:
380
+ result = response.json()["choices"][0]["message"]["content"]
381
+ else:
382
+ raise Exception(f"HTTP {response.status_code}")
383
+
384
+ print(f"✅ VLM response received ({len(result)} chars)")
385
+ logging.info(f"VLM Response (Complete) - Image: {image_path} - {len(result)} chars")
386
+ logging.info(f"VLM Response Content: {result}")
387
+ logging.info("-" * 60)
388
+ return result
389
+
390
+ except Exception as e:
391
+ print(f"⚠️ VLM call error (attempt {attempt}): {e}")
392
+
393
+ if attempt >= 3:
394
+ print(f"❌ VLM call failed after {attempt} attempts. Giving up.")
395
+ raise Exception(f"VLM call failed after {attempt} attempts")
396
+
397
+ print(f" Waiting {wait_time}s before retry...")
398
+ time.sleep(wait_time)
399
+ wait_time = min(wait_time * 2, 60)
400
+
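+ # Note: unlike call_llm_simple above, the VLM helpers give up after 3 attempts
+ # and raise, so callers should be prepared to catch the exception, e.g.
+ # (hypothetical path):
+ #
+ #   try:
+ #       desc = call_vlm_simple("Describe this image briefly.", "page_001.png")
+ #   except Exception:
+ #       desc = ""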
401
+ def call_vlm_with_examples(prompt: str, query_image_path: str, example_image_paths: List[str]) -> str:
402
+ """VLM call with multiple example images and query image"""
403
+ _initialize_config()
404
+ print(f"Calling VLM with examples via {BACKEND}...")
405
+ attempt = 0
406
+ wait_time = 2
407
+
408
+ logging.info(f"VLM Request [{BACKEND}] - Query Image: {query_image_path}")
409
+
410
+ while True:
411
+ attempt += 1
412
+ try:
413
+ if BACKEND == "OLLAMA":
414
+ # Ollama: use message chaining for multiple images
415
+ messages = []
416
+ for i, example_path in enumerate(example_image_paths):
417
+ if Path(example_path).exists():
418
+ messages.append({
419
+ "role": "user",
420
+ "content": f"Example {i+1}",
421
+ "images": [encode_image_to_base64(example_path)]
422
+ })
423
+ # Add query image
424
+ messages.append({
425
+ "role": "user",
426
+ "content": f"Query image:\n{prompt}",
427
+ "images": [encode_image_to_base64(query_image_path)]
428
+ })
429
+ data = {
430
+ "model": VLM_MODEL_NAME,
431
+ "messages": messages,
432
+ "stream": False,
433
+ "options": {"temperature": 0.0}
434
+ }
435
+ response = requests.post(API_URL, json=data, timeout=300)
436
+ if response.status_code == 200:
437
+ result = response.json()["message"]["content"]
438
+ else:
439
+ raise Exception(f"HTTP {response.status_code}")
440
+ else:
441
+ # OpenAI-compatible format
442
+ content = [{"type": "text", "text": prompt}]
443
+ for example_path in example_image_paths:
444
+ if Path(example_path).exists():
445
+ base64_image = encode_image_to_base64(example_path)
446
+ if "qwen" in VLM_MODEL_NAME.lower():
447
+ image_url = f"data:{get_image_mime_type(example_path)};base64,{base64_image}"
448
+ else:
449
+ image_url = f"data:image/png;base64,{base64_image}"
450
+ content.append({"type": "image_url", "image_url": {"url": image_url}})
451
+
452
+ base64_query = encode_image_to_base64(query_image_path)
453
+ if "qwen" in VLM_MODEL_NAME.lower():
454
+ query_url = f"data:{get_image_mime_type(query_image_path)};base64,{base64_query}"
455
+ else:
456
+ query_url = f"data:image/png;base64,{base64_query}"
457
+ content.append({"type": "image_url", "image_url": {"url": query_url}})
458
+
459
+ data = {"model": VLM_MODEL_NAME, "messages": [{"role": "user", "content": content}]}
460
+ response = requests.post(API_URL, headers=HEADERS, json=data, timeout=300)
461
+ if response.status_code == 200:
462
+ result = response.json()["choices"][0]["message"]["content"]
463
+ else:
464
+ raise Exception(f"HTTP {response.status_code}")
465
+
466
+ print(f"✅ VLM response received ({len(result)} chars)")
467
+ logging.info(f"VLM Response - Query Image: {query_image_path}")
468
+ logging.info(f"VLM Response Content: {result}")
469
+ logging.info("-" * 60)
470
+ return result
471
+
472
+ except Exception as e:
473
+ print(f"⚠️ VLM call error (attempt {attempt}): {e}")
474
+
475
+ if attempt >= 3:
476
+ print(f"❌ VLM call failed after {attempt} attempts. Giving up.")
477
+ raise Exception(f"VLM call failed after {attempt} attempts")
478
+
479
+ print(f" Waiting {wait_time}s before retry...")
480
+ time.sleep(wait_time)
481
+ wait_time = min(wait_time * 2, 60)
482
+
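+ # Usage sketch (paths are hypothetical): example images are sent first, then
+ # the query image together with the prompt.
+ #
+ #   result = call_vlm_with_examples(
+ #       "Classify the query figure like the examples.",
+ #       "query_fig.png",
+ #       ["example_a.png", "example_b.png"],
+ #   )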
483
+ def call_vlm_with_multiple_images(prompt: str, image_paths: List[str]) -> str:
484
+ """VLM call with multiple images for reranking"""
485
+ _initialize_config()
486
+ print(f"Calling VLM with {len(image_paths)} images via {BACKEND}...")
487
+ attempt = 0
488
+ wait_time = 2
489
+
490
+ logging.info(f"VLM Request [{BACKEND}] - {len(image_paths)} images")
491
+
492
+ while True:
493
+ attempt += 1
494
+ try:
495
+ if BACKEND == "OLLAMA":
496
+ # Ollama: message chaining for multiple images
497
+ messages = []
498
+ for i, image_path in enumerate(image_paths):
499
+ if Path(image_path).exists():
500
+ messages.append({
501
+ "role": "user",
502
+ "content": f"Image {i+1}",
503
+ "images": [encode_image_to_base64(image_path)]
504
+ })
505
+ messages.append({"role": "user", "content": prompt})
506
+
507
+ data = {
508
+ "model": VLM_MODEL_NAME,
509
+ "messages": messages,
510
+ "stream": False,
511
+ "options": {"num_ctx": 16384, "temperature": 0.0}
512
+ }
513
+ response = requests.post(API_URL, json=data, timeout=300)
514
+
515
+ if response.status_code == 200:
516
+ result = response.json()["message"]["content"]
517
+ elif 500 <= response.status_code < 600 or response.status_code == 429:
518
+ print(f"⚠️ Server error ({response.status_code}). Retrying...")
519
+ attempt -= 1
520
+ raise Exception(f"Server error {response.status_code}")
521
+ else:
522
+ raise Exception(f"HTTP {response.status_code}")
523
+
524
+ elif BACKEND == "GEMINI":
525
+ # Google Gemini API format with inline images
526
+ url = GEMINI_URL.format(model=VLM_MODEL_NAME) + f"?key={API_KEY}"
527
+ parts = [{"text": prompt}]
528
+ for image_path in image_paths:
529
+ if Path(image_path).exists():
530
+ base64_image = encode_image_to_base64(image_path)
531
+ mime_type = get_image_mime_type(image_path)
532
+ parts.append({"inline_data": {"mime_type": mime_type, "data": base64_image}})
533
+
534
+ data = {
535
+ "contents": [{"parts": parts}],
536
+ "generationConfig": {"temperature": 0.0}
537
+ }
538
+ response = requests.post(url, headers=HEADERS, json=data, timeout=300)
539
+
540
+ if response.status_code == 200:
541
+ resp_json = response.json()
542
+ result = resp_json["candidates"][0]["content"]["parts"][0]["text"]
543
+ elif 500 <= response.status_code < 600 or response.status_code == 429:
544
+ print(f"⚠️ Server error ({response.status_code}). Retrying...")
545
+ attempt -= 1
546
+ raise Exception(f"Server error {response.status_code}")
547
+ else:
548
+ error_msg = response.text[:200] if response.text else f"HTTP {response.status_code}"
549
+ raise Exception(error_msg)
550
+
551
+ else:
552
+ # OpenAI-compatible format
553
+ content = [{"type": "text", "text": prompt}]
554
+ for image_path in image_paths:
555
+ if Path(image_path).exists():
556
+ base64_image = encode_image_to_base64(image_path)
557
+ if "qwen" in VLM_MODEL_NAME.lower():
558
+ image_url = f"data:{get_image_mime_type(image_path)};base64,{base64_image}"
559
+ else:
560
+ image_url = f"data:image/png;base64,{base64_image}"
561
+ content.append({"type": "image_url", "image_url": {"url": image_url}})
562
+
563
+ data = {"model": VLM_MODEL_NAME, "messages": [{"role": "user", "content": content}]}
564
+ local_headers = HEADERS.copy()
565
+ local_headers["Connection"] = "close"
566
+ response = requests.post(API_URL, headers=local_headers, json=data, timeout=300)
567
+
568
+ if response.status_code == 200:
569
+ result = response.json()["choices"][0]["message"]["content"]
570
+ elif 500 <= response.status_code < 600 or response.status_code == 429:
571
+ print(f"⚠️ Server error ({response.status_code}). Retrying...")
572
+ attempt -= 1
573
+ raise Exception(f"Server error {response.status_code}")
574
+ else:
575
+ raise Exception(f"HTTP {response.status_code}")
576
+
577
+ print(f"✅ VLM response received ({len(result)} chars)")
578
+ logging.info(f"VLM Response - {len(image_paths)} images")
579
+ logging.info(f"VLM Response Content: {result}")
580
+ logging.info("-" * 60)
581
+ return result
582
+
583
+ except Exception as e:
584
+ print(f"⚠️ VLM call error (attempt {attempt}): {e}")
585
+
586
+ if attempt >= 3:
587
+ print(f"❌ VLM call failed after {attempt} attempts. Giving up.")
588
+ raise Exception(f"VLM call failed after {attempt} attempts")
589
+
590
+ # Wait with exponential backoff (capped at 60 seconds)
591
+ print(f" Waiting {wait_time}s before retry...")
592
+ time.sleep(wait_time)
593
+ wait_time = min(wait_time * 2, 60)
594
+
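+ # Note: in call_vlm_with_multiple_images above, HTTP 429 and 5xx responses
+ # decrement the attempt counter before raising, so transient server errors do
+ # not count toward the 3-attempt limit; only other failures (timeouts, parse
+ # errors, non-429 4xx responses) exhaust the retries.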
595
+ def call_vlm_multi_images_ollama(prompt: str, image_paths: List[str]) -> str:
596
+ """
597
+ Calls local Ollama with multiple images and a prompt.
598
+ Uses message chaining strategy for correct image ordering in context.
599
+ """
600
+ print(f"👁️ Calling VLM with {len(image_paths)} images (Ollama)...")
601
+
602
+ url = "http://127.0.0.1:11434/api/chat"
603
+ messages = []
604
+
605
+ # Add images as separate user messages
606
+ for i, image_path in enumerate(image_paths):
607
+ if Path(image_path).exists():
608
+ try:
609
+ base64_img = encode_image_to_base64(image_path)
610
+ messages.append({
611
+ "role": "user",
612
+ "content": f"Image {i+1}",
613
+ "images": [base64_img]
614
+ })
615
+ except Exception as e:
616
+ print(f"⚠️ Failed to encode image {image_path}: {e}")
617
+ else:
618
+ print(f"⚠️ Image not found: {image_path}")
619
+
620
+ # Add the actual prompt as the final message
621
+ messages.append({
622
+ "role": "user",
623
+ "content": prompt
624
+ })
625
+
626
+ payload = {
627
+ "model": "qwen3-vl:32b",
628
+ "messages": messages,
629
+ "options": {
630
+ "num_ctx": 16384,
631
+ "temperature": 0.0
632
+ },
633
+ "stream": False
634
+ }
635
+
636
+ try:
637
+ response = requests.post(url, json=payload)
638
+ response.raise_for_status()
639
+ result = response.json()['message']['content']
640
+ print(f"✅ VLM response received ({len(result)} chars)")
641
+ return result
642
+ except Exception as e:
643
+ print(f"❌ Ollama call failed: {e}")
644
+ if 'response' in locals():
645
+ print(f"Response: {response.text}")
646
+ return ""
647
+
648
+ def call_vlm_interweaved(prompt: str, chunks: List[Dict]) -> str:
649
+ """VLM call with interleaved text and images from chunks.
650
+ Supports OLLAMA, GEMINI, OPENAI.
651
+
652
+ Args:
653
+ prompt: System prompt / instruction
654
+ chunks: List of dicts - supports three formats:
655
+ 1. Old format: {'content': str, 'image_path': str|None}
656
+ 2. JSON format: {'chunk_type': str, 'content': str, 'artifact': str}
+ 3. New format: {'content': str, 'artifact': [list of image paths]} - first image is used
657
+ """
658
+ _initialize_config()
659
+ print(f"Calling VLM with {len(chunks)} chunks (interweaved) via {BACKEND}...")
660
+ attempt = 0
661
+ wait_time = 2
662
+
663
+ logging.info(f"VLM Request [{BACKEND}] - {len(chunks)} chunks")
664
+
665
+ def _extract_image_path(chunk):
666
+ """Helper to extract image path from chunk
667
+
668
+ Supports multiple formats:
669
+ 1. New format: {'artifact': [list of paths]} - uses first image
670
+ 2. Old format: {'image_path': str} - backward compatibility
671
+ 3. JSON format: {'chunk_type': str, 'artifact': str} - legacy format
672
+ """
673
+ # Check for artifact list (new format)
674
+ artifact = chunk.get('artifact', [])
675
+ if isinstance(artifact, list) and len(artifact) > 0:
676
+ return artifact[0] # Use first image
677
+
678
+ # Fallback: check image_path field (backward compatibility)
679
+ image_path = chunk.get('image_path')
680
+ if image_path:
681
+ return image_path
682
+
683
+ # Fallback: legacy format with chunk_type and artifact string
684
+ if 'chunk_type' in chunk:
685
+ chunk_type = chunk.get('chunk_type', '')
686
+ artifact_str = chunk.get('artifact', 'None')
687
+ if chunk_type in ['standalone image', 'image'] and artifact_str != 'None':
688
+ match = re.search(r'!\[Image\]\(([^)]+)\)', artifact_str)
689
+ if match:
690
+ return match.group(1)
691
+
692
+ return None
693
+
694
+ while True:
695
+ attempt += 1
696
+ try:
697
+ if BACKEND == "OLLAMA":
698
+ # Ollama: message chaining for interleaved content
699
+ messages = [{"role": "user", "content": prompt}]
700
+ for i, chunk in enumerate(chunks):
701
+ chunk_text = chunk.get('content', '')
702
+ image_path = _extract_image_path(chunk)
703
+ msg = {"role": "user", "content": f"CHUNK{i+1}: {chunk_text}"}
704
+ if image_path and Path(image_path).exists():
705
+ msg["images"] = [encode_image_to_base64(image_path)]
706
+ messages.append(msg)
707
+
708
+ data = {
709
+ "model": VLM_MODEL_NAME,
710
+ "messages": messages,
711
+ "stream": False,
712
+ "options": {"num_ctx": 16384, "temperature": 0.0}
713
+ }
714
+ response = requests.post(API_URL, json=data, timeout=300)
715
+ if response.status_code == 200:
716
+ result = response.json()["message"]["content"]
717
+ elif 500 <= response.status_code < 600 or response.status_code == 429:
718
+ attempt -= 1
719
+ raise Exception(f"Server error {response.status_code}")
720
+ else:
721
+ raise Exception(f"HTTP {response.status_code}")
722
+
723
+ elif BACKEND == "GEMINI":
724
+ # Google Gemini: parts array with text and inline_data
725
+ parts = [{"text": prompt}]
726
+ for i, chunk in enumerate(chunks):
727
+ chunk_text = chunk.get('content', '')
728
+ image_path = _extract_image_path(chunk)
729
+
730
+ parts.append({"text": f"\n\nCHUNK {i+1}:\n{chunk_text}"})
731
+
732
+ if image_path and Path(image_path).exists():
733
+ base64_image = encode_image_to_base64(image_path)
734
+ mime_type = get_image_mime_type(image_path)
735
+ parts.append({"inline_data": {"mime_type": mime_type, "data": base64_image}})
736
+
737
+ url = GEMINI_URL.format(model=VLM_MODEL_NAME) + f"?key={API_KEY}"
738
+ data = {
739
+ "contents": [{"parts": parts}],
740
+ "generationConfig": {"temperature": 0.0}
741
+ }
742
+ response = requests.post(url, headers=HEADERS, json=data, timeout=300)
743
+ if response.status_code == 200:
744
+ resp_json = response.json()
745
+ result = resp_json["candidates"][0]["content"]["parts"][0]["text"]
746
+ elif 500 <= response.status_code < 600 or response.status_code == 429:
747
+ attempt -= 1
748
+ raise Exception(f"Server error {response.status_code}")
749
+ else:
750
+ error_msg = response.text[:200] if response.text else f"HTTP {response.status_code}"
751
+ raise Exception(error_msg)
752
+
753
+ elif BACKEND == "OPENAI":
754
+ # OpenAI: content array with text and image_url types
755
+ content = [{"type": "text", "text": prompt}]
756
+ for i, chunk in enumerate(chunks):
757
+ chunk_text = chunk.get('content', '')
758
+ image_path = _extract_image_path(chunk)
759
+
760
+ content.append({"type": "text", "text": f"\n\nCHUNK {i+1}:\n{chunk_text}"})
761
+
762
+ if image_path and Path(image_path).exists():
763
+ base64_image = encode_image_to_base64(image_path)
764
+ mime_type = get_image_mime_type(image_path)
765
+ image_url = f"data:{mime_type};base64,{base64_image}"
766
+ content.append({"type": "image_url", "image_url": {"url": image_url}})
767
+
768
+ data = {
769
+ "model": VLM_MODEL_NAME,
770
+ "messages": [{"role": "user", "content": content}],
771
+ "temperature": 0.0
772
+ }
773
+ response = requests.post(API_URL, headers=HEADERS, json=data, timeout=300)
774
+ if response.status_code == 200:
775
+ result = response.json()["choices"][0]["message"]["content"]
776
+ elif 500 <= response.status_code < 600 or response.status_code == 429:
777
+ attempt -= 1
778
+ raise Exception(f"Server error {response.status_code}")
779
+ else:
780
+ raise Exception(f"HTTP {response.status_code}")
781
+
782
+ else: # OpenAI-compatible API
783
+ content = [{"type": "text", "text": prompt}]
784
+ for i, chunk in enumerate(chunks):
785
+ chunk_text = chunk.get('content', '')
786
+ image_path = _extract_image_path(chunk)
787
+
788
+ text_block = f"\n\n<|#|>CHUNK{i+1}<|#|>START<|#|>\n{chunk_text}\n"
789
+
790
+ if image_path and Path(image_path).exists():
791
+ text_block += "<|#|>Image<|#|>"
792
+ content.append({"type": "text", "text": text_block})
793
+ base64_image = encode_image_to_base64(image_path)
794
+ mime_type = get_image_mime_type(image_path)
795
+ image_url = f"data:{mime_type};base64,{base64_image}"
796
+ content.append({"type": "image_url", "image_url": {"url": image_url}})
797
+ content.append({"type": "text", "text": f"<|#|>CHUNK{i+1}<|#|>END<|#|>"})
798
+ else:
799
+ text_block += f"<|#|>Image<|#|>None<|#|>CHUNK{i+1}<|#|>END<|#|>"
800
+ content.append({"type": "text", "text": text_block})
801
+
802
+ data = {"model": VLM_MODEL_NAME, "messages": [{"role": "user", "content": content}]}
803
+ local_headers = {**HEADERS, "Connection": "close"}
804
+ response = requests.post(API_URL, headers=local_headers, json=data, timeout=300)
805
+ if response.status_code == 200:
806
+ result = response.json()["choices"][0]["message"]["content"]
807
+ elif 500 <= response.status_code < 600 or response.status_code == 429:
808
+ attempt -= 1
809
+ raise Exception(f"Server error {response.status_code}")
810
+ else:
811
+ raise Exception(f"HTTP {response.status_code}")
812
+
813
+ print(f"✅ VLM response received ({len(result)} chars)")
814
+ logging.info(f"VLM Response - {len(chunks)} chunks")
815
+ logging.info(f"VLM Response Content: {result}")
816
+ logging.info("-" * 60)
817
+ return result
818
+
819
+ except Exception as e:
820
+ print(f"⚠️ VLM call error (attempt {attempt}): {e}")
821
+
822
+ if attempt >= 3:
823
+ print(f"❌ VLM call failed after {attempt} attempts. Giving up.")
824
+ raise Exception(f"VLM call failed after {attempt} attempts")
825
+
826
+ print(f" Waiting {wait_time}s before retry...")
827
+ time.sleep(wait_time)
828
+ wait_time = min(wait_time * 2, 60)
829
+
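+ # Sketch of the chunk formats accepted by call_vlm_interweaved via
+ # _extract_image_path (paths are hypothetical):
+ #
+ #   chunks = [
+ #       {"content": "Intro text", "image_path": None},                    # old format
+ #       {"content": "See figure", "artifact": ["figs/fig1.png"]},         # artifact list
+ #       {"content": "Flowchart", "chunk_type": "standalone image",
+ #        "artifact": "![Image](figs/fig2.png)"},                          # legacy markdown
+ #   ]
+ #   reply = call_vlm_interweaved("Order these chunks logically.", chunks)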
830
+ # def extract_role_context(image_path: str) -> Tuple[str, str, str]:
831
+ # """Extract figure description, expert role, and figure category from image"""
832
+ # print("🔍 Extracting role context from image...")
833
+
834
+ # # Use the role_context prompt with example images
835
+ # example_paths = [fie_loc1, fie_loc2, fie_loc3, fie_loc4]
836
+ # prompt = PROMPTS_IMAGE["role_context"]
837
+
838
+ # response = call_vlm_with_examples(prompt, image_path, example_paths)
839
+
840
+ # # Parse structured response
841
+ # try:
842
+ # # Extract figure_description, expert_role, and figure_category from response
843
+ # # Try multiple patterns for flexibility
844
+ # figure_description_match = re.search(r'figure_description:\s*"([^"]*)"', response)
845
+ # if not figure_description_match:
846
+ # figure_description_match = re.search(r'figure_description:\s*(.*?)(?=\nexpert_role:|$)', response, re.DOTALL)
847
+
848
+ # expert_role_match = re.search(r'expert_role:\s*"([^"]*)"', response)
849
+ # if not expert_role_match:
850
+ # expert_role_match = re.search(r'expert_role:\s*(.*?)(?=\nfigure_category:|$)', response)
851
+
852
+ # figure_category_match = re.search(r'figure_category:\s*"([^"]*)"', response)
853
+ # if not figure_category_match:
854
+ # figure_category_match = re.search(r'figure_category:\s*(.*?)(?=\n|$)', response)
855
+
856
+ # if figure_description_match and expert_role_match and figure_category_match:
857
+ # figure_description = figure_description_match.group(1).strip()
858
+ # expert_role = expert_role_match.group(1).strip()
859
+ # figure_category = figure_category_match.group(1).strip()
860
+ # return figure_description, expert_role, figure_category
861
+ # else:
862
+ # print("⚠️ Could not parse structured response, returning raw response")
863
+ # print(f"Response: {response}")
864
+ # return response, "Technical expert", "Other"
865
+ # except Exception as e:
866
+ # print(f"⚠️ Error parsing response: {e}")
867
+ # return response, "Technical expert", "Other"
868
+
869
+ # def generate_qa_pair(image_path: str, figure_description: str, expert_role: str) -> Tuple[str, str]:
870
+ # """Generate Q&A pair using image, description, and role"""
871
+ # print("❓ Generating Q&A pair...")
872
+
873
+ # # Use the qa_single_artifact prompt with example images
874
+ # example_paths = [fie_loc1, fie_loc2, fie_loc3, fie_loc4]
875
+
876
+ # # Replace the placeholders in the prompt before formatting
877
+ # prompt_template = PROMPTS_IMAGE["qa_single_artifact"]
878
+ # prompt_template = prompt_template.replace("{fie_loc1}", fie_loc1)
879
+ # prompt_template = prompt_template.replace("{fie_loc2}", fie_loc2)
880
+ # prompt_template = prompt_template.replace("{fie_loc3}", fie_loc3)
881
+ # prompt_template = prompt_template.replace("{fie_loc4}", fie_loc4)
882
+
883
+ # prompt = prompt_template.format(
884
+ # expert_role=expert_role,
885
+ # figure_description=figure_description
886
+ # )
887
+
888
+ # response = call_vlm_with_examples(prompt, image_path, example_paths)
889
+
890
+ # # Parse Q&A from response
891
+ # try:
892
+ # question_match = re.search(r'Question:\s*(.*?)(?=\nAnswer:|\n\n|$)', response, re.DOTALL)
893
+ # answer_match = re.search(r'Answer:\s*(.*?)(?=\n\n|$)', response, re.DOTALL)
894
+
895
+ # if question_match and answer_match:
896
+ # question = question_match.group(1).strip()
897
+ # answer = answer_match.group(1).strip()
898
+ # return question, answer
899
+ # else:
900
+ # print("⚠️ Could not parse Q&A from response, returning raw response")
901
+ # return response, ""
902
+ # except Exception as e:
903
+ # print(f"⚠️ Error parsing Q&A: {e}")
904
+ # return response, ""
905
+
906
+ # def verify_qa_requires_image(image_path: str, question: str, answer: str) -> str:
907
+ # """Verify if the question requires the image to be answered"""
908
+ # print("🔍 Verifying if question requires image context...")
909
+
910
+ # prompt = f"""You are evaluating whether a question requires visual information from an image to be answered correctly.
911
+
912
+ # Question: {question}
913
+ # Answer: {answer}
914
+
915
+ # Please analyze if this question can be answered without seeing the image. Consider:
916
+ # 1. Does the question reference specific visual elements (shapes, colors, positions, labels, etc.)?
917
+ # 2. Does the answer rely on information that can only be obtained from the image?
918
+ # 3. Could someone answer this question accurately using only general knowledge?
919
+
920
+ # Respond with:
921
+ # - "REQUIRES_IMAGE" if the question cannot be answered without the image
922
+ # - "CAN_ANSWER_WITHOUT_IMAGE" if the question can be answered without the image
923
+ # - Brief explanation of your reasoning
924
+
925
+ # Your evaluation:"""
926
+
927
+ # response = call_vlm_simple(prompt, image_path)
928
+ # return response
929
+
930
+ # def process_image_for_qa_dataset(image_path: str) -> dict:
931
+ # """Complete pipeline: extract role context, generate Q&A pair, and verify"""
932
+ # print(f"🔄 Processing image for QA dataset: {image_path}")
933
+
934
+ # # Stage 1: Extract role context
935
+ # figure_description, expert_role, figure_category = extract_role_context(image_path)
936
+
937
+ # # Stage 2: Generate Q&A pair
938
+ # question, answer = generate_qa_pair(image_path, figure_description, expert_role)
939
+
940
+ # # Stage 3: Verify if question requires image
941
+ # verification_result = verify_qa_requires_image(image_path, question, answer)
942
+
943
+ # return {
944
+ # "image_path": image_path,
945
+ # "figure_description": figure_description,
946
+ # "expert_role": expert_role,
947
+ # "figure_category": figure_category,
948
+ # "question": question,
949
+ # "answer": answer,
950
+ # "verification_result": verification_result
951
+ # }
952
+
953
+ def call_vlm_interleaved_ollama(chunks: List[Dict], model: str = "qwen3-vl:32b", image_base_dir: str = "", prompt: str = None) -> str:
954
+ """
955
+ Calls local Ollama with interleaved text and images using message chaining strategy.
956
+
957
+ Args:
958
+ chunks: List of dicts with 'content' and 'artifact'/'chunk_type'
959
+ model: Ollama model name (default: qwen3-vl:32b)
960
+ image_base_dir: Base directory for resolving relative image paths
961
+ prompt: Optional instruction prompt to append after chunks
962
+ """
963
+ print(f"👁️ Calling VLM interleaved (Ollama) with {len(chunks)} chunks...")
964
+
965
+ # Local Ollama endpoint
966
+ url = "http://127.0.0.1:11434/api/chat"
967
+ messages = []
968
+
969
+ for chunk in chunks:
970
+ content = chunk.get("content", "")
971
+ artifact = chunk.get("artifact", "None")
972
+ chunk_type = chunk.get("chunk_type", "")
973
+
974
+ # Base message
975
+ msg = {
976
+ "role": "user",
977
+ "content": content
978
+ }
979
+
980
+ # Check for image
981
+ if chunk_type == "standalone image" and artifact and artifact != "None":
982
+ # Extract image path from markdown: ![Image](path)
983
+ match = re.search(r'\!\[.*?\]\((.*?)\)', artifact)
984
+ if match:
985
+ rel_path = match.group(1)
986
+ img_path = os.path.join(image_base_dir, rel_path) if image_base_dir else rel_path
987
+
988
+ if os.path.exists(img_path):
989
+ try:
990
+ msg["images"] = [encode_image_to_base64(img_path)]
991
+ except Exception as e:
992
+ print(f"⚠️ Failed to encode image {img_path}: {e}")
993
+ else:
994
+ print(f"⚠️ Image not found: {img_path}")
995
+
996
+ messages.append(msg)
997
+
998
+ # Append prompt if provided
999
+ if prompt:
1000
+ messages.append({
1001
+ "role": "user",
1002
+ "content": prompt
1003
+ })
1004
+
1005
+ payload = {
1006
+ "model": model,
1007
+ "messages": messages,
1008
+ "options": {
1009
+ "num_ctx": 16384, # High context window for images
1010
+ "temperature": 0.0
1011
+ },
1012
+ "stream": False
1013
+ }
1014
+
1015
+ try:
1016
+ print(f"Sending {len(messages)} interleaved blocks to Ollama...")
1017
+ response = requests.post(url, json=payload)
1018
+ response.raise_for_status()
1019
+ result = response.json()['message']['content']
1020
+
1021
+ print(f"✅ VLM response received ({len(result)} chars)")
1022
+ logging.info(f"VLM Response (Ollama Interleaved) - {len(result)} chars")
1023
+
1024
+ return result
1025
+
1026
+ except Exception as e:
1027
+ print(f"❌ Ollama call failed: {e}")
1028
+ if 'response' in locals():
1029
+ print(f"Response: {response.text}")
1030
+ return ""
1031
+
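+ # Usage sketch (assumes a local Ollama server on 127.0.0.1:11434 with the model
+ # pulled; chunks use the chunk_type/artifact format shown in __main__ below):
+ #
+ #   reply = call_vlm_interleaved_ollama(
+ #       chunks, model="qwen3-vl:32b", image_base_dir="",
+ #       prompt="Summarize the figures in order.")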
1032
+ if __name__ == "__main__":
1033
+ # Setup logging
1034
+ setup_logging()
1035
+
1036
+ chunks_interleaved = [
1037
+ {
1038
+ "content": "Chunk 1",
1039
+ "chunk_type": "standalone image",
1040
+ "artifact": r"![Image](Samples/VLM/Singles/Circuits/IEC60034-2-1{ed3.0}b_Fig14_Circuit.png)"
1041
+ },
1042
+ {
1043
+ "content": r"""Chunk 2: This flowchart illustrates the procedure for determining the IES classification and losses for a Power Drive System (PDS). The process begins by selecting a determination method: Test or Calculation.
1044
+
1045
+ The Test path (left) involves measuring input/output directly to find PDS losses (P_L,PDS), then adding required uncertainties according to Formula (22).
1046
+
1047
+ The Calculation path (right) uses datasheet values. It calculates absolute motor losses and CDM (Complete Drive Module) losses separately. Depending on whether the operating point is at full load (100; 100) or part load, different formulas (12 or 13) are used to sum these into the total absolute PDS losses.
1048
+
1049
+ Both paths converge to calculate the relative PDS losses by comparing absolute losses to the rated motor power (P_R,M). Finally, this relative value is used to assign the specific IES class via Formula (20). Notes indicate this cycle repeats for various speed and torque adjustments.""",
1050
+ "chunk_type": "text",
1051
+ "artifact": "None"
1052
+ },
1053
+ {
1054
+ "content": r"""Chunk 3: This flowchart outlines the procedure for determining the IE classification and losses for a Complete Drive Module (CDM). The process begins by establishing the rated output current (Ir,out​) and equivalent apparent power (Sr,equ​), either from specifications or mechanical power ratings.
1055
+
1056
+ Users then select a determination method: Test (measuring via calorimetric or input-output methods) or Calculation (using formulas or manufacturer models). Both paths involve verifying load conditions and power factors (cos Φ) to calculate absolute CDM losses (P_L,CDM), incorporating required uncertainties. Finally, losses are adjusted if the CDM has modified characteristics, and the result is used to determine the specific IE class. The process repeats for various part-load operating points.
1057
+ "chunk_type": "text",
1058
+ "artifact": "None"
1059
+ },
1060
+ {
1061
+ "content": "Chunk 4: Continuous operation periodic duty with related load/speed changes - Duty type S8",
1062
+ "chunk_type": "image",
1063
+ "artifact": r"![Image](Samples/VLM/Singles/CurvePlots/IEC60034-1{ed14.0}b_Fig8_Plot.png)"
1064
+ },
1065
+ {
1066
+ "content": "Chunk 5: Determination of IES classsification f PDS and loss determination for part loading operations",
1067
+ "chunk_type": "image",
1068
+ "artifact": r"![Image](Samples/VLM/Singles/Flowcharts/iec61800-9-2{ed2.0}b_Fig27_FlowChart.png)"
1069
+ }
1070
+ ]
1071
+
1072
+ ### Test LLM
1073
+ print("Testing LLM...")
1074
+ try:
1075
+ llm_response = call_llm_simple("Who is the president of the United States?")
1076
+ print(f"LLM Response: {llm_response[:100]}...")
1077
+ except Exception as e:
1078
+ print(f"LLM Test Failed: {e}")
1079
+
1080
+ ## Test VLM
1081
+ print("\nTesting VLM...")
1082
+ test_image = [
1083
+ "Samples/VLM/Singles/CurvePlots/IEC60034-2-2{ed2.0}b_Fig10_Plot.png", ### water density and specific heat vs temperature
1084
+ "Samples/VLM/Singles/Circuits/IEC60034-2-1{ed3.0}b_Fig14_Circuit.png", # Eh star test circuit
1085
+ "Samples/VLM/Singles/Flowcharts/iec61800-9-2{ed2.0}b_Fig27_FlowChart.png", # IES classification and loss determination
1086
+ "Samples/VLM/Singles/CurvePlots/IEC60034-1{ed14.0}b_Fig8_Plot.png", # Continuous operation periodic duty with related load/speed changes - Duty type S8
1087
+
1088
+ "Samples/VLM/Singles/CurvePlots/iec61800-9-2{ed2.0}b_FigE7_Plot.png", # Efficiency map of exemplary motor in Y or D
1089
+ "Samples/VLM/Singles/CurvePlots/IEC60034-2-3{ed2.0}b_FigC1_Plot.png", # torque vs speed in Y / D
1090
+ ]
1091
+
1092
+ if Path(test_image[0]).exists():
1093
+ ### 1. Simple VLM
1094
+ print("\n1. Testing call_vlm_simple...")
1095
+ try:
1096
+ vlm_response = call_vlm_simple("Describe this image briefly.", test_image[0])
1097
+ print(f"VLM Response: {vlm_response[:100]}...")
1098
+ except Exception as e:
1099
+ print(f"call_vlm_simple Failed: {e}")
1100
+
1101
+ # 3. VLM with Multiple Images
1102
+ print("\n3. Testing call_vlm_with_multiple_images...")
1103
+ try:
1104
+ resp = call_vlm_with_multiple_images("Are these images the same?", [test_image[4], test_image[5]])
1105
+ print(f"Response: {resp[:100]}...")
1106
+ except Exception as e:
1107
+ print(f"call_vlm_with_multiple_images Failed: {e}")
1108
+
1109
+ # 4. VLM Interweaved (Original)
1110
+ print("\n4. Testing call_vlm_interweaved...")
1111
+ try:
1112
+ instruction = """Analyze the sequence of chunks provided. Each chunk contains either text or an image related to motor efficiency and IES classification.
1113
+
1114
+ Your task is to:
1115
+ 1. Identify the logical flow and dependencies between the chunks
1116
+ 2. Determine if the current order makes sense for understanding motor classification procedures
1117
+ 3. Suggest the optimal sequence that would help someone learn about IES classification step-by-step
1118
+ 4. Return your analysis followed by the chunks in your recommended order, using the format: CHUNK<|#|><include chunk number><|#|>[textual content of the chunk]<|#|>
1119
+
1120
+ Consider how text explanations should relate to visual diagrams, flowcharts, and technical specifications."""
1121
+
1122
+ resp = call_vlm_interweaved(instruction, chunks_interleaved)
1123
+ print(f"Response: {resp}...")
1124
+ except Exception as e:
1125
+ print(f"call_vlm_interweaved Failed: {e}")
1126
+
1127
+ # 5. VLM Interleaved Ollama (New)
1128
+ # print("\n5. Testing call_vlm_interleaved_ollama...")
1129
+ # # Construct chunks in the expected format for Ollama function
1130
+
1131
+
1132
+ # try:
1133
+ # # Using empty image_base_dir because test_image is already a valid path
1134
+ # instruction = "Order these chunks based on their relevance to IES classification of motors. Return the chunks with just the text content separated by <|#|>"
1135
+ # resp = call_vlm_interleaved_ollama(chunks_interleaved, image_base_dir="", prompt=instruction)
1136
+ # print(f"Response:\n{resp}...")
1137
+ # except Exception as e:
1138
+ # print(f"call_vlm_interleaved_ollama Failed: {e}")
1139
+
1140
+ # # 6. VLM Multi Images Ollama (New)
1141
+ # print("\n6. Testing call_vlm_multi_images_ollama...")
1142
+ # try:
1143
+ # multi_images = [
1144
+ # "Samples/VLM/Singles/CurvePlots/IEC60034-2-2{ed2.0}b_Fig10_Plot.png",
1145
+ # "Samples/VLM/Singles/CurvePlots/IEC60034-1{ed14.0}b_Fig8_Plot.png",
1146
+ # "Samples/VLM/Singles/Flowcharts/iec61800-9-2{ed2.0}b_Fig27_FlowChart.png"
1147
+ # "Samples/VLM/Singles/Circuits/IEC60034-2-1{ed3.0}b_Fig14_Circuit.png",
1148
+ # "Samples/VLM/Singles/Equations/IEC60034-2-3{ed2.0}b_Eqn_10-11.png"
1149
+ # ]
1150
+ # resp = call_vlm_multi_images_ollama("Describe each image briefly in order.", multi_images)
1151
+ # print(f"Response:\n{resp}...")
1152
+ # except Exception as e:
1153
+ # print(f"call_vlm_multi_images_ollama Failed: {e}")
1154
+
1155
+ else:
1156
+ print(f"Test image not found: {test_image}")
1157
+
1158
+
1159
+ # ============================================================================
1160
+ # ASYNC RATE-LIMITED BATCH PROCESSING
1161
+ # ============================================================================
1162
+
1163
+ class RateLimiter:
1164
+ """Token bucket rate limiter for API calls with per-minute limits.
1165
+
1166
+ Creates asyncio primitives lazily to work with any event loop.
1167
+ """
1168
+
1169
+ def __init__(self, requests_per_minute: int = 60, burst_size: int = 10):
1170
+ """
1171
+ Args:
1172
+ requests_per_minute: Maximum requests allowed per minute
1173
+ burst_size: Maximum burst size (concurrent requests in a short period)
1174
+ """
1175
+ self.rpm = requests_per_minute
1176
+ self.burst_size = burst_size
1177
+ self.interval = 60.0 / requests_per_minute # Seconds between requests
1178
+ self.last_request_time = 0.0
1179
+ # Don't create asyncio primitives here - create them lazily per event loop
1180
+ self._semaphore: Optional[asyncio.Semaphore] = None
1181
+ self._lock: Optional[asyncio.Lock] = None
1182
+ self._loop_id: Optional[int] = None
1183
+
1184
+ def _ensure_primitives(self):
1185
+ """Ensure asyncio primitives exist for the current event loop."""
1186
+ try:
1187
+ current_loop = asyncio.get_running_loop()
1188
+ current_loop_id = id(current_loop)
1189
+ except RuntimeError:
1190
+ current_loop_id = None
1191
+
1192
+ # Recreate primitives if loop changed
1193
+ if self._loop_id != current_loop_id:
1194
+ self._semaphore = asyncio.Semaphore(self.burst_size)
1195
+ self._lock = asyncio.Lock()
1196
+ self._loop_id = current_loop_id
1197
+
1198
+ async def acquire(self):
1199
+ """Acquire permission to make a request"""
1200
+ self._ensure_primitives()
1201
+
1202
+ async with self._lock:
1203
+ now = time.monotonic()
1204
+ time_since_last = now - self.last_request_time
1205
+ if time_since_last < self.interval:
1206
+ await asyncio.sleep(self.interval - time_since_last)
1207
+ self.last_request_time = time.monotonic()
1208
+
1209
+ await self._semaphore.acquire()
1210
+
1211
+ def release(self):
1212
+ """Release the semaphore after request completes"""
1213
+ if self._semaphore is not None:
1214
+ self._semaphore.release()
1215
+
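+ # Usage sketch (mirrors how the async helpers below use the limiter):
+ #
+ #   limiter = get_rate_limiter()
+ #   await limiter.acquire()      # waits out the RPM interval and takes a burst slot
+ #   try:
+ #       ...                      # perform one API request
+ #   finally:
+ #       limiter.release()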
1216
+ # Rate limits are loaded from config.yaml above (GEMINI_RPM, GEMINI_BURST)
1217
+ # Override via environment if needed:
1218
+ GEMINI_RPM = int(os.environ.get("GEMINI_RPM", str(GEMINI_RPM)))
1219
+ GEMINI_BURST = int(os.environ.get("GEMINI_BURST", str(GEMINI_BURST)))
1220
+
1221
+ _rate_limiter: Optional[RateLimiter] = None
1222
+
1223
+ def get_rate_limiter() -> RateLimiter:
1224
+ """Get or create the global rate limiter"""
1225
+ global _rate_limiter
1226
+ if _rate_limiter is None:
1227
+ _rate_limiter = RateLimiter(
1228
+ requests_per_minute=GEMINI_RPM,
1229
+ burst_size=GEMINI_BURST
1230
+ )
1231
+ print(f"⚡ Rate limiter initialized: {GEMINI_RPM} RPM, burst={GEMINI_BURST}")
1232
+ return _rate_limiter
1233
+
1234
+
1235
+ async def _async_call_llm_simple(prompt: str, session: aiohttp.ClientSession,
1236
+ rate_limiter: RateLimiter, timeout: int = 300) -> str:
1237
+ """Async version of call_llm_simple. Supports all backends."""
1238
+ await rate_limiter.acquire()
1239
+ try:
1240
+ if BACKEND == "OLLAMA":
1241
+ data = {
1242
+ "model": LLM_MODEL_NAME,
1243
+ "messages": [{"role": "user", "content": prompt}],
1244
+ "stream": False,
1245
+ "options": {"temperature": 0.0}
1246
+ }
1247
+ async with session.post(API_URL, json=data, timeout=aiohttp.ClientTimeout(total=timeout)) as resp:
1248
+ if resp.status == 200:
1249
+ result_json = await resp.json()
1250
+ return result_json["message"]["content"]
1251
+ else:
1252
+ raise Exception(f"HTTP {resp.status}")
1253
+
1254
+ elif BACKEND == "GEMINI":
1255
+ url = GEMINI_URL.format(model=LLM_MODEL_NAME) + f"?key={API_KEY}"
1256
+ data = {
1257
+ "contents": [{"parts": [{"text": prompt}]}],
1258
+ "generationConfig": {"temperature": 0.0}
1259
+ }
1260
+ async with session.post(url, headers=HEADERS, json=data,
1261
+ timeout=aiohttp.ClientTimeout(total=timeout)) as resp:
1262
+ if resp.status == 200:
1263
+ resp_json = await resp.json()
1264
+ return resp_json["candidates"][0]["content"]["parts"][0]["text"]
1265
+ else:
1266
+ text = await resp.text()
1267
+ raise Exception(f"HTTP {resp.status}: {text[:100]}")
1268
+
1269
+ elif BACKEND == "OPENAI":
1270
+ data = {
1271
+ "model": LLM_MODEL_NAME,
1272
+ "messages": [{"role": "user", "content": prompt}],
1273
+ "temperature": 0.0
1274
+ }
1275
+ async with session.post(API_URL, headers=HEADERS, json=data,
1276
+ timeout=aiohttp.ClientTimeout(total=timeout)) as resp:
1277
+ if resp.status == 200:
1278
+ response_data = await resp.json()
1279
+ return response_data["choices"][0]["message"]["content"]
1280
+ else:
1281
+ raise Exception(f"HTTP {resp.status}")
1282
+
1283
+ else: # OpenAI-compatible
1284
+ data = {
1285
+ "model": LLM_MODEL_NAME,
1286
+ "messages": [{"role": "user", "content": prompt}]
1287
+ }
1288
+ async with session.post(API_URL, headers=HEADERS, json=data,
1289
+ timeout=aiohttp.ClientTimeout(total=timeout)) as resp:
1290
+ if resp.status == 200:
1291
+ response_data = await resp.json()
1292
+ return response_data["choices"][0]["message"]["content"]
1293
+ else:
1294
+ raise Exception(f"HTTP {resp.status}")
1295
+ finally:
1296
+ rate_limiter.release()
1297
+
1298
+
1299
+ async def _async_call_vlm_interweaved(prompt: str, chunks: List[Dict],
1300
+ session: aiohttp.ClientSession,
1301
+ rate_limiter: RateLimiter, timeout: int = 300) -> str:
1302
+ """Async version of call_vlm_interweaved. Supports all backends."""
1303
+ await rate_limiter.acquire()
1304
+
1305
+ def _extract_image_path(chunk):
1306
+ """Helper to extract image path from chunk"""
1307
+ if 'chunk_type' in chunk:
1308
+ chunk_type = chunk.get('chunk_type', '')
1309
+ artifact = chunk.get('artifact', 'None')
1310
+ if chunk_type in ['standalone image', 'image'] and artifact != 'None':
1311
+ match = re.search(r'!\[Image\]\(([^)]+)\)', artifact)
1312
+ if match:
1313
+ return match.group(1)
1314
+ return None
1315
+ return chunk.get('image_path')
1316
+
1317
+ try:
1318
+ if BACKEND == "OLLAMA":
1319
+ messages = [{"role": "user", "content": prompt}]
1320
+ for i, chunk in enumerate(chunks):
1321
+ chunk_text = chunk.get('content', '')
1322
+ image_path = _extract_image_path(chunk)
1323
+ msg = {"role": "user", "content": f"CHUNK{i+1}: {chunk_text}"}
1324
+ if image_path and Path(image_path).exists():
1325
+ msg["images"] = [encode_image_to_base64(image_path)]
1326
+ messages.append(msg)
1327
+
1328
+ data = {
1329
+ "model": VLM_MODEL_NAME,
1330
+ "messages": messages,
1331
+ "stream": False,
1332
+ "options": {"num_ctx": 16384, "temperature": 0.0}
1333
+ }
1334
+ async with session.post(API_URL, json=data,
1335
+ timeout=aiohttp.ClientTimeout(total=timeout)) as resp:
1336
+ if resp.status == 200:
1337
+ result_json = await resp.json()
1338
+ return result_json["message"]["content"]
1339
+ else:
1340
+ raise Exception(f"HTTP {resp.status}")
1341
+
1342
+ elif BACKEND == "GEMINI":
1343
+ # Gemini: parts array with text and inline_data
1344
+ parts = [{"text": prompt}]
1345
+ for i, chunk in enumerate(chunks):
1346
+ chunk_text = chunk.get('content', '')
1347
+ image_path = _extract_image_path(chunk)
1348
+ parts.append({"text": f"\n\nCHUNK {i+1}:\n{chunk_text}"})
1349
+ if image_path and Path(image_path).exists():
1350
+ base64_image = encode_image_to_base64(image_path)
1351
+ mime_type = get_image_mime_type(image_path)
1352
+ parts.append({"inline_data": {"mime_type": mime_type, "data": base64_image}})
1353
+
1354
+ url = GEMINI_URL.format(model=VLM_MODEL_NAME) + f"?key={API_KEY}"
1355
+ data = {
1356
+ "contents": [{"parts": parts}],
1357
+ "generationConfig": {"temperature": 0.0}
1358
+ }
1359
+ async with session.post(url, headers=HEADERS, json=data,
1360
+ timeout=aiohttp.ClientTimeout(total=timeout)) as resp:
1361
+ if resp.status == 200:
1362
+ resp_json = await resp.json()
1363
+ return resp_json["candidates"][0]["content"]["parts"][0]["text"]
1364
+ else:
1365
+ text = await resp.text()
1366
+ raise Exception(f"HTTP {resp.status}: {text[:100]}")
1367
+
1368
+ elif BACKEND == "OPENAI":
1369
+ # OpenAI: content array with text and image_url
1370
+ content = [{"type": "text", "text": prompt}]
1371
+ for i, chunk in enumerate(chunks):
1372
+ chunk_text = chunk.get('content', '')
1373
+ image_path = _extract_image_path(chunk)
1374
+ content.append({"type": "text", "text": f"\n\nCHUNK {i+1}:\n{chunk_text}"})
1375
+ if image_path and Path(image_path).exists():
1376
+ base64_image = encode_image_to_base64(image_path)
1377
+ mime_type = get_image_mime_type(image_path)
1378
+ image_url = f"data:{mime_type};base64,{base64_image}"
1379
+ content.append({"type": "image_url", "image_url": {"url": image_url}})
1380
+
1381
+ data = {
1382
+ "model": VLM_MODEL_NAME,
1383
+ "messages": [{"role": "user", "content": content}],
1384
+ "temperature": 0.0
1385
+ }
1386
+ async with session.post(API_URL, headers=HEADERS, json=data,
1387
+ timeout=aiohttp.ClientTimeout(total=timeout)) as resp:
1388
+ if resp.status == 200:
1389
+ response_data = await resp.json()
1390
+ return response_data["choices"][0]["message"]["content"]
1391
+ else:
1392
+ raise Exception(f"HTTP {resp.status}")
1393
+
1394
+ else: # OpenAI-compatible
1395
+ content = [{"type": "text", "text": prompt}]
1396
+ for i, chunk in enumerate(chunks):
1397
+ chunk_text = chunk.get('content', '')
1398
+ image_path = _extract_image_path(chunk)
1399
+ text_block = f"\n\n<|#|>CHUNK{i+1}<|#|>START<|#|>\n{chunk_text}\n"
1400
+
1401
+ if image_path and Path(image_path).exists():
1402
+ text_block += "<|#|>Image<|#|>"
1403
+ content.append({"type": "text", "text": text_block})
1404
+ base64_image = encode_image_to_base64(image_path)
1405
+ mime_type = get_image_mime_type(image_path)
1406
+ image_url = f"data:{mime_type};base64,{base64_image}"
1407
+ content.append({"type": "image_url", "image_url": {"url": image_url}})
1408
+ content.append({"type": "text", "text": f"<|#|>CHUNK{i+1}<|#|>END<|#|>"})
1409
+ else:
1410
+ text_block += f"<|#|>Image<|#|>None<|#|>CHUNK{i+1}<|#|>END<|#|>"
1411
+ content.append({"type": "text", "text": text_block})
1412
+
1413
+ data = {"model": VLM_MODEL_NAME, "messages": [{"role": "user", "content": content}]}
1414
+ local_headers = {**HEADERS, "Connection": "close"}
1415
+
1416
+ async with session.post(API_URL, headers=local_headers, json=data,
1417
+ timeout=aiohttp.ClientTimeout(total=timeout)) as resp:
1418
+ if resp.status == 200:
1419
+ response_data = await resp.json()
1420
+ return response_data["choices"][0]["message"]["content"]
1421
+ else:
1422
+ raise Exception(f"HTTP {resp.status}")
1423
+ finally:
1424
+ rate_limiter.release()
1425
+
1426
+
1427
+ async def _batch_llm_calls_async(prompts: List[str], max_retries: int = 3) -> List[str]:
1428
+ """Execute multiple LLM calls concurrently with rate limiting
1429
+
1430
+ Args:
1431
+ prompts: List of prompts to process
1432
+ max_retries: Maximum number of attempts per request
1433
+
1434
+ Returns:
1435
+ List of responses in same order as prompts
1436
+ """
1437
+ rate_limiter = get_rate_limiter()
1438
+ results = [None] * len(prompts)
1439
+
1440
+ async def process_single(idx: int, prompt: str, session: aiohttp.ClientSession):
1441
+ for attempt in range(max_retries):
1442
+ try:
1443
+ result = await _async_call_llm_simple(prompt, session, rate_limiter)
1444
+ results[idx] = result
1445
+ return
1446
+ except Exception as e:
1447
+ if attempt == max_retries - 1:
1448
+ logging.error(f"Batch LLM call {idx} failed after {max_retries} attempts: {e}")
1449
+ results[idx] = f"ERROR: {str(e)}"
1450
+ else:
1451
+ await asyncio.sleep(2 ** attempt) # Exponential backoff
1452
+
1453
+ connector = aiohttp.TCPConnector(limit=GEMINI_BURST)
1454
+ async with aiohttp.ClientSession(connector=connector) as session:
1455
+ tasks = [process_single(i, prompt, session) for i, prompt in enumerate(prompts)]
1456
+ await asyncio.gather(*tasks)
1457
+
1458
+ return results
1459
+
1460
+
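+ # Retry timing sketch for the helper above: with max_retries=3, a request that keeps
+ # failing sleeps 2**0 = 1s after the first failure and 2**1 = 2s after the second;
+ # the third failure is logged and stored as an "ERROR: ..." string rather than
+ # raised, so a single bad prompt cannot abort the whole batch.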
1461
+ async def _batch_vlm_calls_async(requests: List[Tuple[str, List[Dict]]],
1462
+ max_retries: int = 3) -> List[str]:
1463
+ """Execute multiple VLM calls concurrently with rate limiting
1464
+
1465
+ Args:
1466
+ requests: List of (prompt, chunks) tuples
1467
+ max_retries: Maximum number of attempts per request
1468
+
1469
+ Returns:
1470
+ List of responses in same order as requests
1471
+ """
1472
+ rate_limiter = get_rate_limiter()
1473
+ results = [None] * len(requests)
1474
+
1475
+ async def process_single(idx: int, prompt: str, chunks: List[Dict],
1476
+ session: aiohttp.ClientSession):
1477
+ for attempt in range(max_retries):
1478
+ try:
1479
+ result = await _async_call_vlm_interweaved(prompt, chunks, session, rate_limiter)
1480
+ results[idx] = result
1481
+ return
1482
+ except Exception as e:
1483
+ if attempt == max_retries - 1:
1484
+ logging.error(f"Batch VLM call {idx} failed after {max_retries} attempts: {e}")
1485
+ results[idx] = f"ERROR: {str(e)}"
1486
+ else:
1487
+ await asyncio.sleep(2 ** attempt)
1488
+
1489
+ connector = aiohttp.TCPConnector(limit=GEMINI_BURST)
1490
+ async with aiohttp.ClientSession(connector=connector) as session:
1491
+ tasks = [process_single(i, prompt, chunks, session)
1492
+ for i, (prompt, chunks) in enumerate(requests)]
1493
+ await asyncio.gather(*tasks)
1494
+
1495
+ return results
1496
+
1497
+
1498
+ def _run_async_batch(coro):
1499
+ """Helper to run async coroutine from sync context, handling various event loop states."""
1500
+ try:
1501
+ # Try to get existing loop
1502
+ try:
1503
+ asyncio.get_running_loop()  # raises RuntimeError if no loop is running
1504
+ # Loop is running - use thread executor
1505
+ # ThreadPoolExecutor is already imported at module level
1507
+ with ThreadPoolExecutor() as executor:
1507
+ future = executor.submit(asyncio.run, coro)
1508
+ return future.result()
1509
+ except RuntimeError:
1510
+ # No running loop - safe to use asyncio.run
1511
+ pass
1512
+
1513
+ return asyncio.run(coro)
1514
+
1515
+ except Exception as e:
1516
+ logging.error(f"Async batch execution failed: {e}")
1517
+ raise
1518
+
1519
+
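+ # Behavior sketch (illustrative): from plain synchronous code _run_async_batch just
+ # delegates to asyncio.run(); when an event loop is already running (e.g. inside a
+ # Jupyter cell or another coroutine), asyncio.run() would fail, so the coroutine is
+ # handed to a worker thread that runs its own event loop. For example:
+ #
+ #   results = _run_async_batch(_batch_llm_calls_async(["prompt A", "prompt B"]))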
1520
+ def batch_call_llm(prompts: List[str], show_progress: bool = True) -> List[str]:
1521
+ """Synchronous wrapper for batch LLM calls with rate limiting
1522
+
1523
+ Args:
1524
+ prompts: List of prompts to process concurrently
1525
+ show_progress: Whether to print progress
1526
+
1527
+ Returns:
1528
+ List of responses in same order as prompts
1529
+ """
1530
+ if not prompts:
1531
+ return []
1532
+
1533
+ if show_progress:
1534
+ print(f"⚡ Batch LLM: Processing {len(prompts)} requests (RPM={GEMINI_RPM}, burst={GEMINI_BURST})...")
1535
+
1536
+ start_time = time.time()
1537
+ results = _run_async_batch(_batch_llm_calls_async(prompts))
1538
+ elapsed = time.time() - start_time
1539
+
1540
+ if show_progress:
1541
+ print(f"✅ Batch LLM: Completed {len(prompts)} requests in {elapsed:.1f}s "
1542
+ f"({len(prompts)/elapsed:.1f} req/s)")
1543
+
1544
+ return results
1545
+
1546
+
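+ # Usage sketch (prompts are hypothetical):
+ #
+ #   prompts = [f"Summarize section {i}" for i in range(10)]
+ #   answers = batch_call_llm(prompts, show_progress=False)
+ #   failed = [i for i, a in enumerate(answers) if a.startswith("ERROR:")]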
1547
+ def batch_call_vlm_interweaved(requests: List[Tuple[str, List[Dict]]],
1548
+ show_progress: bool = True) -> List[str]:
1549
+ """Synchronous wrapper for batch VLM calls with rate limiting
1550
+
1551
+ Args:
1552
+ requests: List of (prompt, chunks) tuples
1553
+ show_progress: Whether to print progress
1554
+
1555
+ Returns:
1556
+ List of responses in same order as requests
1557
+ """
1558
+ if not requests:
1559
+ return []
1560
+
1561
+ if show_progress:
1562
+ print(f"⚡ Batch VLM: Processing {len(requests)} requests (RPM={GEMINI_RPM}, burst={GEMINI_BURST})...")
1563
+
1564
+ start_time = time.time()
1565
+ results = _run_async_batch(_batch_vlm_calls_async(requests))
1566
+ elapsed = time.time() - start_time
1567
+
1568
+ if show_progress:
1569
+ print(f"✅ Batch VLM: Completed {len(requests)} requests in {elapsed:.1f}s "
1570
+ f"({len(requests)/elapsed:.1f} req/s)")
1571
+
1572
+ return results
1573
+
1574
+
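+ # Usage sketch (chunk contents are hypothetical): each request pairs a prompt with
+ # chunk dicts carrying a 'content' string and, optionally, an image reference; chunks
+ # without a resolvable image path are sent as text only.
+ #
+ #   requests = [
+ #       ("Do these chunks support the claim?", [
+ #           {"content": "Table 2 reports the ablation results.", "image_path": "images/table_2.png"},
+ #           {"content": "The methods section describes the setup."},
+ #       ]),
+ #   ]
+ #   answers = batch_call_vlm_interweaved(requests)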
1575
+ # ============================================================================
1576
+ # BATCH VLM WITH BASE64 IMAGES (for pdf_to_md.py and similar use cases)
1577
+ # ============================================================================
1578
+
1579
+ async def _async_call_vlm_base64(prompt: str, base64_image: str,
1580
+ session: aiohttp.ClientSession,
1581
+ rate_limiter: RateLimiter,
1582
+ mime_type: str = "image/png",
1583
+ timeout: int = 300) -> str:
1584
+ """Async VLM call with base64-encoded image. Supports all backends."""
1585
+ await rate_limiter.acquire()
1586
+
1587
+ try:
1588
+ if BACKEND == "OLLAMA":
1589
+ data = {
1590
+ "model": VLM_MODEL_NAME,
1591
+ "messages": [{
1592
+ "role": "user",
1593
+ "content": prompt,
1594
+ "images": [base64_image]
1595
+ }],
1596
+ "stream": False,
1597
+ "options": {"num_ctx": 16384, "temperature": 0.0}
1598
+ }
1599
+ async with session.post(API_URL, json=data,
1600
+ timeout=aiohttp.ClientTimeout(total=timeout)) as resp:
1601
+ if resp.status == 200:
1602
+ result_json = await resp.json()
1603
+ return result_json["message"]["content"]
1604
+ else:
1605
+ raise Exception(f"HTTP {resp.status}")
1606
+
1607
+ elif BACKEND == "GEMINI":
1608
+ parts = [
1609
+ {"text": prompt},
1610
+ {"inline_data": {"mime_type": mime_type, "data": base64_image}}
1611
+ ]
1612
+ url = GEMINI_URL.format(model=VLM_MODEL_NAME) + f"?key={API_KEY}"
1613
+ data = {
1614
+ "contents": [{"parts": parts}],
1615
+ "generationConfig": {"temperature": 0.0}
1616
+ }
1617
+ async with session.post(url, headers=HEADERS, json=data,
1618
+ timeout=aiohttp.ClientTimeout(total=timeout)) as resp:
1619
+ if resp.status == 200:
1620
+ resp_json = await resp.json()
1621
+ return resp_json["candidates"][0]["content"]["parts"][0]["text"]
1622
+ else:
1623
+ text = await resp.text()
1624
+ raise Exception(f"HTTP {resp.status}: {text[:100]}")
1625
+
1626
+ elif BACKEND == "OPENAI":
1627
+ image_url = f"data:{mime_type};base64,{base64_image}"
1628
+ content = [
1629
+ {"type": "text", "text": prompt},
1630
+ {"type": "image_url", "image_url": {"url": image_url}}
1631
+ ]
1632
+ data = {
1633
+ "model": VLM_MODEL_NAME,
1634
+ "messages": [{"role": "user", "content": content}],
1635
+ "temperature": 0.0
1636
+ }
1637
+ async with session.post(API_URL, headers=HEADERS, json=data,
1638
+ timeout=aiohttp.ClientTimeout(total=timeout)) as resp:
1639
+ if resp.status == 200:
1640
+ response_data = await resp.json()
1641
+ return response_data["choices"][0]["message"]["content"]
1642
+ else:
1643
+ raise Exception(f"HTTP {resp.status}")
1644
+
1645
+ else: # OpenAI-compatible
1646
+ image_url = f"data:{mime_type};base64,{base64_image}"
1647
+ content = [
1648
+ {"type": "text", "text": prompt},
1649
+ {"type": "image_url", "image_url": {"url": image_url}}
1650
+ ]
1651
+ data = {
1652
+ "model": VLM_MODEL_NAME,
1653
+ "messages": [{"role": "user", "content": content}]
1654
+ }
1655
+ async with session.post(API_URL, headers=HEADERS, json=data,
1656
+ timeout=aiohttp.ClientTimeout(total=timeout)) as resp:
1657
+ if resp.status == 200:
1658
+ response_data = await resp.json()
1659
+ return response_data["choices"][0]["message"]["content"]
1660
+ else:
1661
+ raise Exception(f"HTTP {resp.status}")
1662
+ finally:
1663
+ rate_limiter.release()
1664
+
1665
+
1666
+ async def _batch_vlm_base64_calls_async(requests: List[Tuple[str, str, str]],
1667
+ max_retries: int = 3) -> List[str]:
1668
+ """Execute multiple VLM calls with base64 images concurrently.
1669
+
1670
+ Args:
1671
+ requests: List of (prompt, base64_image, mime_type) tuples
1672
+ max_retries: Maximum number of attempts per request
1673
+
1674
+ Returns:
1675
+ List of responses in same order as requests
1676
+ """
1677
+ rate_limiter = get_rate_limiter()
1678
+ results = [""] * len(requests)
1679
+
1680
+ async def process_single(idx: int, prompt: str, base64_image: str,
1681
+ mime_type: str, session: aiohttp.ClientSession):
1682
+ for attempt in range(max_retries):
1683
+ try:
1684
+ result = await _async_call_vlm_base64(
1685
+ prompt, base64_image, session, rate_limiter, mime_type
1686
+ )
1687
+ results[idx] = result
1688
+ return
1689
+ except Exception as e:
1690
+ if attempt == max_retries - 1:
1691
+ results[idx] = f"ERROR: {str(e)}"
1692
+ else:
1693
+ await asyncio.sleep(2 ** attempt)
1694
+
1695
+ async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(limit=GEMINI_BURST)) as session:  # cap concurrent connections like the other batch helpers
1696
+ tasks = [
1697
+ process_single(i, prompt, b64, mime, session)
1698
+ for i, (prompt, b64, mime) in enumerate(requests)
1699
+ ]
1700
+ await asyncio.gather(*tasks)
1701
+
1702
+ return results
1703
+
1704
+
1705
+ def batch_call_vlm_base64(requests: List[Tuple[str, str, str]],
1706
+ show_progress: bool = True) -> List[str]:
1707
+ """Synchronous wrapper for batch VLM calls with base64 images.
1708
+
1709
+ Args:
1710
+ requests: List of (prompt, base64_image, mime_type) tuples
1711
+ mime_type is typically "image/png" or "image/jpeg"
1712
+ show_progress: Whether to print progress
1713
+
1714
+ Returns:
1715
+ List of responses in same order as requests
1716
+
1717
+ Example:
1718
+ requests = [
1719
+ ("Describe this image:", base64_data_1, "image/png"),
1720
+ ("What's in this table:", base64_data_2, "image/png"),
1721
+ ]
1722
+ responses = batch_call_vlm_base64(requests)
1723
+ """
1724
+ if not requests:
1725
+ return []
1726
+
1727
+ if show_progress:
1728
+ print(f"⚡ Batch VLM (base64): Processing {len(requests)} requests "
1729
+ f"(RPM={GEMINI_RPM}, burst={GEMINI_BURST})...")
1730
+
1731
+ start_time = time.time()
1732
+ results = _run_async_batch(_batch_vlm_base64_calls_async(requests))
1733
+ elapsed = time.time() - start_time
1734
+
1735
+ if show_progress:
1736
+ success_count = sum(1 for r in results if not r.startswith("ERROR:"))
1737
+ print(f"✅ Batch VLM (base64): Completed {success_count}/{len(requests)} "
1738
+ f"in {elapsed:.1f}s ({len(requests)/elapsed:.1f} req/s)")
1739
+
1740
+ return results
1741
+
1742
+
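+ # Building the request tuples for batch_call_vlm_base64 (illustrative; the file path
+ # is hypothetical), reusing the module's encode_image_to_base64 and
+ # get_image_mime_type helpers:
+ #
+ #   page = "output/pages/page_001.png"
+ #   requests = [("Transcribe this page to markdown:",
+ #                encode_image_to_base64(page),
+ #                get_image_mime_type(page))]
+ #   md_pages = batch_call_vlm_base64(requests)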
1743
+ # Alias for convenience
1744
+ call_llm = call_llm_simple
1745
+ call_vlm = call_vlm_interweaved