amd-gaia 0.15.0__py3-none-any.whl → 0.15.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181) hide show
  1. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.1.dist-info}/METADATA +223 -223
  2. amd_gaia-0.15.1.dist-info/RECORD +178 -0
  3. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.1.dist-info}/entry_points.txt +1 -0
  4. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.1.dist-info}/licenses/LICENSE.md +20 -20
  5. gaia/__init__.py +29 -29
  6. gaia/agents/__init__.py +19 -19
  7. gaia/agents/base/__init__.py +9 -9
  8. gaia/agents/base/agent.py +2177 -2177
  9. gaia/agents/base/api_agent.py +120 -120
  10. gaia/agents/base/console.py +1841 -1841
  11. gaia/agents/base/errors.py +237 -237
  12. gaia/agents/base/mcp_agent.py +86 -86
  13. gaia/agents/base/tools.py +83 -83
  14. gaia/agents/blender/agent.py +556 -556
  15. gaia/agents/blender/agent_simple.py +133 -135
  16. gaia/agents/blender/app.py +211 -211
  17. gaia/agents/blender/app_simple.py +41 -41
  18. gaia/agents/blender/core/__init__.py +16 -16
  19. gaia/agents/blender/core/materials.py +506 -506
  20. gaia/agents/blender/core/objects.py +316 -316
  21. gaia/agents/blender/core/rendering.py +225 -225
  22. gaia/agents/blender/core/scene.py +220 -220
  23. gaia/agents/blender/core/view.py +146 -146
  24. gaia/agents/chat/__init__.py +9 -9
  25. gaia/agents/chat/agent.py +835 -835
  26. gaia/agents/chat/app.py +1058 -1058
  27. gaia/agents/chat/session.py +508 -508
  28. gaia/agents/chat/tools/__init__.py +15 -15
  29. gaia/agents/chat/tools/file_tools.py +96 -96
  30. gaia/agents/chat/tools/rag_tools.py +1729 -1729
  31. gaia/agents/chat/tools/shell_tools.py +436 -436
  32. gaia/agents/code/__init__.py +7 -7
  33. gaia/agents/code/agent.py +549 -549
  34. gaia/agents/code/cli.py +377 -0
  35. gaia/agents/code/models.py +135 -135
  36. gaia/agents/code/orchestration/__init__.py +24 -24
  37. gaia/agents/code/orchestration/checklist_executor.py +1763 -1763
  38. gaia/agents/code/orchestration/checklist_generator.py +713 -713
  39. gaia/agents/code/orchestration/factories/__init__.py +9 -9
  40. gaia/agents/code/orchestration/factories/base.py +63 -63
  41. gaia/agents/code/orchestration/factories/nextjs_factory.py +118 -118
  42. gaia/agents/code/orchestration/factories/python_factory.py +106 -106
  43. gaia/agents/code/orchestration/orchestrator.py +841 -841
  44. gaia/agents/code/orchestration/project_analyzer.py +391 -391
  45. gaia/agents/code/orchestration/steps/__init__.py +67 -67
  46. gaia/agents/code/orchestration/steps/base.py +188 -188
  47. gaia/agents/code/orchestration/steps/error_handler.py +314 -314
  48. gaia/agents/code/orchestration/steps/nextjs.py +828 -828
  49. gaia/agents/code/orchestration/steps/python.py +307 -307
  50. gaia/agents/code/orchestration/template_catalog.py +469 -469
  51. gaia/agents/code/orchestration/workflows/__init__.py +14 -14
  52. gaia/agents/code/orchestration/workflows/base.py +80 -80
  53. gaia/agents/code/orchestration/workflows/nextjs.py +186 -186
  54. gaia/agents/code/orchestration/workflows/python.py +94 -94
  55. gaia/agents/code/prompts/__init__.py +11 -11
  56. gaia/agents/code/prompts/base_prompt.py +77 -77
  57. gaia/agents/code/prompts/code_patterns.py +2036 -2036
  58. gaia/agents/code/prompts/nextjs_prompt.py +40 -40
  59. gaia/agents/code/prompts/python_prompt.py +109 -109
  60. gaia/agents/code/schema_inference.py +365 -365
  61. gaia/agents/code/system_prompt.py +41 -41
  62. gaia/agents/code/tools/__init__.py +42 -42
  63. gaia/agents/code/tools/cli_tools.py +1138 -1138
  64. gaia/agents/code/tools/code_formatting.py +319 -319
  65. gaia/agents/code/tools/code_tools.py +769 -769
  66. gaia/agents/code/tools/error_fixing.py +1347 -1347
  67. gaia/agents/code/tools/external_tools.py +180 -180
  68. gaia/agents/code/tools/file_io.py +845 -845
  69. gaia/agents/code/tools/prisma_tools.py +190 -190
  70. gaia/agents/code/tools/project_management.py +1016 -1016
  71. gaia/agents/code/tools/testing.py +321 -321
  72. gaia/agents/code/tools/typescript_tools.py +122 -122
  73. gaia/agents/code/tools/validation_parsing.py +461 -461
  74. gaia/agents/code/tools/validation_tools.py +806 -806
  75. gaia/agents/code/tools/web_dev_tools.py +1758 -1758
  76. gaia/agents/code/validators/__init__.py +16 -16
  77. gaia/agents/code/validators/antipattern_checker.py +241 -241
  78. gaia/agents/code/validators/ast_analyzer.py +197 -197
  79. gaia/agents/code/validators/requirements_validator.py +145 -145
  80. gaia/agents/code/validators/syntax_validator.py +171 -171
  81. gaia/agents/docker/__init__.py +7 -7
  82. gaia/agents/docker/agent.py +642 -642
  83. gaia/agents/emr/__init__.py +8 -8
  84. gaia/agents/emr/agent.py +1506 -1506
  85. gaia/agents/emr/cli.py +1322 -1322
  86. gaia/agents/emr/constants.py +475 -475
  87. gaia/agents/emr/dashboard/__init__.py +4 -4
  88. gaia/agents/emr/dashboard/server.py +1974 -1974
  89. gaia/agents/jira/__init__.py +11 -11
  90. gaia/agents/jira/agent.py +894 -894
  91. gaia/agents/jira/jql_templates.py +299 -299
  92. gaia/agents/routing/__init__.py +7 -7
  93. gaia/agents/routing/agent.py +567 -570
  94. gaia/agents/routing/system_prompt.py +75 -75
  95. gaia/agents/summarize/__init__.py +11 -0
  96. gaia/agents/summarize/agent.py +885 -0
  97. gaia/agents/summarize/prompts.py +129 -0
  98. gaia/api/__init__.py +23 -23
  99. gaia/api/agent_registry.py +238 -238
  100. gaia/api/app.py +305 -305
  101. gaia/api/openai_server.py +575 -575
  102. gaia/api/schemas.py +186 -186
  103. gaia/api/sse_handler.py +373 -373
  104. gaia/apps/__init__.py +4 -4
  105. gaia/apps/llm/__init__.py +6 -6
  106. gaia/apps/llm/app.py +173 -169
  107. gaia/apps/summarize/app.py +116 -633
  108. gaia/apps/summarize/html_viewer.py +133 -133
  109. gaia/apps/summarize/pdf_formatter.py +284 -284
  110. gaia/audio/__init__.py +2 -2
  111. gaia/audio/audio_client.py +439 -439
  112. gaia/audio/audio_recorder.py +269 -269
  113. gaia/audio/kokoro_tts.py +599 -599
  114. gaia/audio/whisper_asr.py +432 -432
  115. gaia/chat/__init__.py +16 -16
  116. gaia/chat/app.py +430 -430
  117. gaia/chat/prompts.py +522 -522
  118. gaia/chat/sdk.py +1228 -1225
  119. gaia/cli.py +5481 -5632
  120. gaia/database/__init__.py +10 -10
  121. gaia/database/agent.py +176 -176
  122. gaia/database/mixin.py +290 -290
  123. gaia/database/testing.py +64 -64
  124. gaia/eval/batch_experiment.py +2332 -2332
  125. gaia/eval/claude.py +542 -542
  126. gaia/eval/config.py +37 -37
  127. gaia/eval/email_generator.py +512 -512
  128. gaia/eval/eval.py +3179 -3179
  129. gaia/eval/groundtruth.py +1130 -1130
  130. gaia/eval/transcript_generator.py +582 -582
  131. gaia/eval/webapp/README.md +167 -167
  132. gaia/eval/webapp/package-lock.json +875 -875
  133. gaia/eval/webapp/package.json +20 -20
  134. gaia/eval/webapp/public/app.js +3402 -3402
  135. gaia/eval/webapp/public/index.html +87 -87
  136. gaia/eval/webapp/public/styles.css +3661 -3661
  137. gaia/eval/webapp/server.js +415 -415
  138. gaia/eval/webapp/test-setup.js +72 -72
  139. gaia/llm/__init__.py +9 -2
  140. gaia/llm/base_client.py +60 -0
  141. gaia/llm/exceptions.py +12 -0
  142. gaia/llm/factory.py +70 -0
  143. gaia/llm/lemonade_client.py +3236 -3221
  144. gaia/llm/lemonade_manager.py +294 -294
  145. gaia/llm/providers/__init__.py +9 -0
  146. gaia/llm/providers/claude.py +108 -0
  147. gaia/llm/providers/lemonade.py +120 -0
  148. gaia/llm/providers/openai_provider.py +79 -0
  149. gaia/llm/vlm_client.py +382 -382
  150. gaia/logger.py +189 -189
  151. gaia/mcp/agent_mcp_server.py +245 -245
  152. gaia/mcp/blender_mcp_client.py +138 -138
  153. gaia/mcp/blender_mcp_server.py +648 -648
  154. gaia/mcp/context7_cache.py +332 -332
  155. gaia/mcp/external_services.py +518 -518
  156. gaia/mcp/mcp_bridge.py +811 -550
  157. gaia/mcp/servers/__init__.py +6 -6
  158. gaia/mcp/servers/docker_mcp.py +83 -83
  159. gaia/perf_analysis.py +361 -0
  160. gaia/rag/__init__.py +10 -10
  161. gaia/rag/app.py +293 -293
  162. gaia/rag/demo.py +304 -304
  163. gaia/rag/pdf_utils.py +235 -235
  164. gaia/rag/sdk.py +2194 -2194
  165. gaia/security.py +163 -163
  166. gaia/talk/app.py +289 -289
  167. gaia/talk/sdk.py +538 -538
  168. gaia/testing/__init__.py +87 -87
  169. gaia/testing/assertions.py +330 -330
  170. gaia/testing/fixtures.py +333 -333
  171. gaia/testing/mocks.py +493 -493
  172. gaia/util.py +46 -46
  173. gaia/utils/__init__.py +33 -33
  174. gaia/utils/file_watcher.py +675 -675
  175. gaia/utils/parsing.py +223 -223
  176. gaia/version.py +100 -100
  177. amd_gaia-0.15.0.dist-info/RECORD +0 -168
  178. gaia/agents/code/app.py +0 -266
  179. gaia/llm/llm_client.py +0 -723
  180. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.1.dist-info}/WHEEL +0 -0
  181. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.1.dist-info}/top_level.txt +0 -0
gaia/llm/vlm_client.py CHANGED
@@ -1,382 +1,382 @@
1
- #!/usr/bin/env python3
2
- # Copyright(C) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
3
- # SPDX-License-Identifier: MIT
4
-
5
- """
6
- Vision-Language Model (VLM) client for extracting text from images.
7
-
8
- Handles model loading/unloading and image-to-text extraction via Lemonade server.
9
- """
10
-
11
- import base64
12
- import logging
13
- import os
14
- from typing import Optional
15
-
16
- from dotenv import load_dotenv
17
-
18
- # Load environment variables from .env file
19
- load_dotenv()
20
-
21
- # Default Lemonade server URL (can be overridden via LEMONADE_BASE_URL env var)
22
- DEFAULT_LEMONADE_URL = "http://localhost:8000/api/v1"
23
-
24
- logger = logging.getLogger(__name__)
25
-
26
- # Magic bytes for common image formats
27
- IMAGE_SIGNATURES = {
28
- b"\x89PNG\r\n\x1a\n": "image/png",
29
- b"\xff\xd8\xff": "image/jpeg",
30
- b"GIF87a": "image/gif",
31
- b"GIF89a": "image/gif",
32
- b"RIFF": "image/webp", # WebP starts with RIFF...WEBP
33
- b"BM": "image/bmp",
34
- }
35
-
36
-
37
- def detect_image_mime_type(image_bytes: bytes) -> str:
38
- """
39
- Detect MIME type from image bytes using magic number signatures.
40
-
41
- Args:
42
- image_bytes: Raw image bytes
43
-
44
- Returns:
45
- MIME type string (e.g., "image/jpeg", "image/png")
46
- Defaults to "image/png" if format not detected.
47
- """
48
- for signature, mime_type in IMAGE_SIGNATURES.items():
49
- if image_bytes.startswith(signature):
50
- # Special case: WebP needs additional check for WEBP marker
51
- if signature == b"RIFF" and len(image_bytes) >= 12:
52
- if image_bytes[8:12] != b"WEBP":
53
- continue
54
- return mime_type
55
-
56
- # Default to PNG if format not detected
57
- logger.debug("Could not detect image format, defaulting to image/png")
58
- return "image/png"
59
-
60
-
61
- class VLMClient:
62
- """
63
- VLM client for extracting text from images using Lemonade server.
64
-
65
- Handles:
66
- - Model loading (default: Qwen3-VL-4B-Instruct-GGUF)
67
- - Image-to-markdown conversion
68
- - State tracking for VLM processing
69
- """
70
-
71
- def __init__(
72
- self,
73
- vlm_model: str = "Qwen3-VL-4B-Instruct-GGUF",
74
- base_url: Optional[str] = None,
75
- auto_load: bool = True,
76
- ):
77
- """
78
- Initialize VLM client.
79
-
80
- Args:
81
- vlm_model: Vision model to use for image extraction
82
- base_url: Lemonade server API URL (defaults to LEMONADE_BASE_URL env var)
83
- auto_load: Automatically load VLM model on first use
84
- """
85
- # Use provided base_url, fall back to env var, then default
86
- if base_url is None:
87
- base_url = os.getenv("LEMONADE_BASE_URL", DEFAULT_LEMONADE_URL)
88
- from urllib.parse import urlparse
89
-
90
- from gaia.llm.lemonade_client import LemonadeClient
91
-
92
- self.vlm_model = vlm_model
93
- self.base_url = base_url
94
-
95
- # Parse base_url to extract host and port for LemonadeClient
96
- parsed = urlparse(base_url)
97
- host = parsed.hostname or "localhost"
98
- port = parsed.port or 8000
99
-
100
- # Get base server URL (without /api/v1) for user-facing messages
101
- self.server_url = f"http://{host}:{port}"
102
-
103
- self.client = LemonadeClient(model=vlm_model, host=host, port=port)
104
- self.auto_load = auto_load
105
- self.vlm_loaded = False
106
-
107
- logger.debug(f"VLM Client initialized: {self.vlm_model} at {self.server_url}")
108
-
109
- def check_availability(self) -> bool:
110
- """
111
- Check if VLM model is available on Lemonade server.
112
-
113
- Returns:
114
- True if model is available, False otherwise
115
- """
116
- try:
117
- models_response = self.client.list_models()
118
- available_models = [
119
- m.get("id", "") for m in models_response.get("data", [])
120
- ]
121
-
122
- if self.vlm_model in available_models:
123
- logger.debug(f"VLM model available: {self.vlm_model}")
124
- return True
125
- else:
126
- logger.warning(f"❌ VLM model not found: {self.vlm_model}")
127
- logger.warning("")
128
- logger.warning("📥 To download this model:")
129
- logger.warning(f" 1. Open Lemonade Model Manager ({self.server_url})")
130
- logger.warning(f" 2. Search for: {self.vlm_model}")
131
- logger.warning(" 3. Click 'Download' to install the model")
132
- logger.warning("")
133
- logger.warning(
134
- f" Available models: {', '.join(available_models[:3])}..."
135
- )
136
- return False
137
-
138
- except Exception as e:
139
- logger.error(f"Failed to check VLM availability: {e}")
140
- logger.error(
141
- f" Make sure Lemonade server is running at {self.server_url}"
142
- )
143
- return False
144
-
145
- def _ensure_vlm_loaded(self) -> bool:
146
- """
147
- Ensure VLM model is loaded, load it if necessary.
148
-
149
- The model will be automatically downloaded if not available (handled by
150
- lemonade_client.chat_completions with auto_download=True).
151
-
152
- Returns:
153
- True if VLM is loaded, False if loading failed
154
- """
155
- if self.vlm_loaded:
156
- return True
157
-
158
- if not self.auto_load:
159
- logger.warning("VLM not loaded and auto_load=False")
160
- return False
161
-
162
- try:
163
- logger.debug(f"Loading VLM model: {self.vlm_model}")
164
- # Load model (auto-download handled by lemonade_client, may take hours)
165
- self.client.load_model(self.vlm_model, timeout=60, auto_download=True)
166
- self.vlm_loaded = True
167
- logger.debug(f"VLM model loaded: {self.vlm_model}")
168
- return True
169
-
170
- except Exception as e:
171
- logger.error(f"Failed to load VLM model: {e}")
172
- logger.error(
173
- f" Make sure Lemonade server is running at {self.server_url}"
174
- )
175
- return False
176
-
177
- def extract_from_image(
178
- self,
179
- image_bytes: bytes,
180
- image_num: int = 1,
181
- page_num: int = 1,
182
- prompt: Optional[str] = None,
183
- ) -> str:
184
- """
185
- Extract text from an image using VLM.
186
-
187
- Args:
188
- image_bytes: Image as PNG/JPEG bytes
189
- image_num: Image number on page (for logging)
190
- page_num: Page number (for logging)
191
- prompt: Custom extraction prompt (optional)
192
-
193
- Returns:
194
- Extracted text in markdown format
195
- """
196
- # Ensure VLM is loaded
197
- if not self._ensure_vlm_loaded():
198
- error_msg = "VLM model not available"
199
- logger.error(error_msg)
200
- return f"[VLM extraction failed: {error_msg}]"
201
-
202
- # Encode image as base64 and detect MIME type
203
- # Note: Image size optimization happens in pdf_utils.py during extraction
204
- image_b64 = base64.b64encode(image_bytes).decode("utf-8")
205
- mime_type = detect_image_mime_type(image_bytes)
206
-
207
- # Default prompt for text extraction
208
- if not prompt:
209
- prompt = """You are an OCR system. Extract ALL visible text from this image exactly as it appears.
210
-
211
- Instructions:
212
- 1. Extract EVERY word you see - don't skip or paraphrase
213
- 2. Preserve exact formatting (headings, bold, bullets, tables)
214
- 3. If it's a table, format as markdown table
215
- 4. If it's a chart, describe what you see: [CHART: ...]
216
- 5. Do NOT add placeholders like "[Insert ...]" - only extract actual text
217
- 6. Do NOT generate or invent content - only extract what you see
218
-
219
- Output format: Clean markdown with the ACTUAL text from the image."""
220
-
221
- # Format message with image (OpenAI vision format)
222
- messages = [
223
- {
224
- "role": "user",
225
- "content": [
226
- {"type": "text", "text": prompt},
227
- {
228
- "type": "image_url",
229
- "image_url": {"url": f"data:{mime_type};base64,{image_b64}"},
230
- },
231
- ],
232
- }
233
- ]
234
-
235
- try:
236
- import time
237
-
238
- start_time = time.time()
239
-
240
- logger.debug(
241
- f"VLM extracting from image {image_num} on page {page_num} ({mime_type})..."
242
- )
243
- logger.debug(
244
- f" Image: {mime_type}, {len(image_b64)} chars base64 ({len(image_bytes)} bytes raw)"
245
- )
246
-
247
- # Call VLM using chat completions endpoint
248
- response = self.client.chat_completions(
249
- model=self.vlm_model,
250
- messages=messages,
251
- temperature=0.1, # Low temp for accurate extraction
252
- max_completion_tokens=2048, # Allow detailed extraction
253
- timeout=300, # VLM needs more time for complex forms (5 min)
254
- )
255
-
256
- elapsed = time.time() - start_time
257
-
258
- # Extract text from response
259
- if (
260
- isinstance(response, dict)
261
- and "choices" in response
262
- and len(response["choices"]) > 0
263
- ):
264
- extracted_text = response["choices"][0]["message"]["content"]
265
- size_kb = len(image_bytes) / 1024
266
- logger.debug(
267
- f"Extracted {len(extracted_text)} chars from image {image_num} "
268
- f"in {elapsed:.2f}s ({size_kb:.0f}KB image)"
269
- )
270
- return extracted_text
271
- else:
272
- # Check for specific error types and provide helpful messages
273
- error_msg = self._parse_vlm_error(response)
274
- logger.error(error_msg)
275
- return f"[VLM extraction failed: {error_msg}]"
276
-
277
- except Exception as e:
278
- logger.error(
279
- f"VLM extraction failed for page {page_num}, image {image_num}: {e}"
280
- )
281
- import traceback
282
-
283
- logger.debug(traceback.format_exc())
284
- return f"[VLM extraction failed: {str(e)}]"
285
-
286
- def _parse_vlm_error(self, response: dict) -> str:
287
- """Parse VLM error response and return a helpful error message."""
288
- if not isinstance(response, dict):
289
- return f"Unexpected response type: {type(response)}"
290
-
291
- # Check for nested error structure from Lemonade
292
- error = response.get("error", {})
293
- if isinstance(error, dict):
294
- details = error.get("details", {})
295
- inner_response = (
296
- details.get("response", {}) if isinstance(details, dict) else {}
297
- )
298
- inner_error = (
299
- inner_response.get("error", {})
300
- if isinstance(inner_response, dict)
301
- else {}
302
- )
303
-
304
- # Context size error
305
- if inner_error.get("type") == "exceed_context_size_error":
306
- n_ctx = inner_error.get("n_ctx", "unknown")
307
- n_prompt = inner_error.get("n_prompt_tokens", "unknown")
308
- return (
309
- f"Context size too small! Image requires {n_prompt} tokens "
310
- f"but model context is only {n_ctx}. "
311
- f"To fix: Right-click Lemonade tray icon → Settings → "
312
- f"set Context Size to 32768, then restart the model."
313
- )
314
-
315
- # Other backend errors
316
- if error.get("type") == "backend_error":
317
- msg = inner_error.get(
318
- "message", error.get("message", "Unknown backend error")
319
- )
320
- return f"Backend error: {msg}"
321
-
322
- return f"Unexpected response format: {response}"
323
-
324
- def extract_from_page_images(self, images: list, page_num: int) -> list:
325
- """
326
- Extract text from multiple images on a page.
327
-
328
- Args:
329
- images: List of image dicts with 'image_bytes', 'width', 'height', etc.
330
- page_num: Page number
331
-
332
- Returns:
333
- List of dicts:
334
- [
335
- {
336
- "image_num": 1,
337
- "text": "extracted markdown",
338
- "dimensions": "800x600",
339
- "size_kb": 45.2
340
- },
341
- ...
342
- ]
343
- """
344
- results = []
345
-
346
- for img_idx, img_data in enumerate(images, 1):
347
- extracted_text = self.extract_from_image(
348
- image_bytes=img_data["image_bytes"],
349
- image_num=img_idx,
350
- page_num=page_num,
351
- )
352
-
353
- results.append(
354
- {
355
- "image_num": img_idx,
356
- "text": extracted_text,
357
- "dimensions": f"{img_data['width']}x{img_data['height']}",
358
- "size_kb": img_data["size_kb"],
359
- }
360
- )
361
-
362
- return results
363
-
364
- def cleanup(self):
365
- """
366
- Cleanup VLM resources.
367
-
368
- Call this after batch processing to mark VLM as unloaded.
369
- Note: Model remains loaded on server; this just updates local state.
370
- """
371
- if self.vlm_loaded:
372
- logger.info("🧹 VLM processing complete")
373
- self.vlm_loaded = False
374
-
375
- def __enter__(self):
376
- """Context manager entry - ensure VLM loaded."""
377
- self._ensure_vlm_loaded()
378
- return self
379
-
380
- def __exit__(self, exc_type, exc_val, exc_tb):
381
- """Context manager exit - cleanup VLM state."""
382
- self.cleanup()
1
+ #!/usr/bin/env python3
2
+ # Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
3
+ # SPDX-License-Identifier: MIT
4
+
5
+ """
6
+ Vision-Language Model (VLM) client for extracting text from images.
7
+
8
+ Handles model loading/unloading and image-to-text extraction via Lemonade server.
9
+ """
10
+
11
+ import base64
12
+ import logging
13
+ import os
14
+ from typing import Optional
15
+
16
+ from dotenv import load_dotenv
17
+
18
+ # Load environment variables from .env file
19
+ load_dotenv()
20
+
21
+ # Default Lemonade server URL (can be overridden via LEMONADE_BASE_URL env var)
22
+ DEFAULT_LEMONADE_URL = "http://localhost:8000/api/v1"
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+ # Magic bytes for common image formats
27
+ IMAGE_SIGNATURES = {
28
+ b"\x89PNG\r\n\x1a\n": "image/png",
29
+ b"\xff\xd8\xff": "image/jpeg",
30
+ b"GIF87a": "image/gif",
31
+ b"GIF89a": "image/gif",
32
+ b"RIFF": "image/webp", # WebP starts with RIFF...WEBP
33
+ b"BM": "image/bmp",
34
+ }
35
+
36
+
37
+ def detect_image_mime_type(image_bytes: bytes) -> str:
38
+ """
39
+ Detect MIME type from image bytes using magic number signatures.
40
+
41
+ Args:
42
+ image_bytes: Raw image bytes
43
+
44
+ Returns:
45
+ MIME type string (e.g., "image/jpeg", "image/png")
46
+ Defaults to "image/png" if format not detected.
47
+ """
48
+ for signature, mime_type in IMAGE_SIGNATURES.items():
49
+ if image_bytes.startswith(signature):
50
+ # Special case: WebP needs additional check for WEBP marker
51
+ if signature == b"RIFF" and len(image_bytes) >= 12:
52
+ if image_bytes[8:12] != b"WEBP":
53
+ continue
54
+ return mime_type
55
+
56
+ # Default to PNG if format not detected
57
+ logger.debug("Could not detect image format, defaulting to image/png")
58
+ return "image/png"
59
+
60
+
61
+ class VLMClient:
62
+ """
63
+ VLM client for extracting text from images using Lemonade server.
64
+
65
+ Handles:
66
+ - Model loading (default: Qwen3-VL-4B-Instruct-GGUF)
67
+ - Image-to-markdown conversion
68
+ - State tracking for VLM processing
69
+ """
70
+
71
+ def __init__(
72
+ self,
73
+ vlm_model: str = "Qwen3-VL-4B-Instruct-GGUF",
74
+ base_url: Optional[str] = None,
75
+ auto_load: bool = True,
76
+ ):
77
+ """
78
+ Initialize VLM client.
79
+
80
+ Args:
81
+ vlm_model: Vision model to use for image extraction
82
+ base_url: Lemonade server API URL (defaults to LEMONADE_BASE_URL env var)
83
+ auto_load: Automatically load VLM model on first use
84
+ """
85
+ # Use provided base_url, fall back to env var, then default
86
+ if base_url is None:
87
+ base_url = os.getenv("LEMONADE_BASE_URL", DEFAULT_LEMONADE_URL)
88
+ from urllib.parse import urlparse
89
+
90
+ from gaia.llm.lemonade_client import LemonadeClient
91
+
92
+ self.vlm_model = vlm_model
93
+ self.base_url = base_url
94
+
95
+ # Parse base_url to extract host and port for LemonadeClient
96
+ parsed = urlparse(base_url)
97
+ host = parsed.hostname or "localhost"
98
+ port = parsed.port or 8000
99
+
100
+ # Get base server URL (without /api/v1) for user-facing messages
101
+ self.server_url = f"http://{host}:{port}"
102
+
103
+ self.client = LemonadeClient(model=vlm_model, host=host, port=port)
104
+ self.auto_load = auto_load
105
+ self.vlm_loaded = False
106
+
107
+ logger.debug(f"VLM Client initialized: {self.vlm_model} at {self.server_url}")
108
+
109
+ def check_availability(self) -> bool:
110
+ """
111
+ Check if VLM model is available on Lemonade server.
112
+
113
+ Returns:
114
+ True if model is available, False otherwise
115
+ """
116
+ try:
117
+ models_response = self.client.list_models()
118
+ available_models = [
119
+ m.get("id", "") for m in models_response.get("data", [])
120
+ ]
121
+
122
+ if self.vlm_model in available_models:
123
+ logger.debug(f"VLM model available: {self.vlm_model}")
124
+ return True
125
+ else:
126
+ logger.warning(f"❌ VLM model not found: {self.vlm_model}")
127
+ logger.warning("")
128
+ logger.warning("📥 To download this model:")
129
+ logger.warning(f" 1. Open Lemonade Model Manager ({self.server_url})")
130
+ logger.warning(f" 2. Search for: {self.vlm_model}")
131
+ logger.warning(" 3. Click 'Download' to install the model")
132
+ logger.warning("")
133
+ logger.warning(
134
+ f" Available models: {', '.join(available_models[:3])}..."
135
+ )
136
+ return False
137
+
138
+ except Exception as e:
139
+ logger.error(f"Failed to check VLM availability: {e}")
140
+ logger.error(
141
+ f" Make sure Lemonade server is running at {self.server_url}"
142
+ )
143
+ return False
144
+
145
+ def _ensure_vlm_loaded(self) -> bool:
146
+ """
147
+ Ensure VLM model is loaded, load it if necessary.
148
+
149
+ The model will be automatically downloaded if not available (handled by
150
+ lemonade_client.chat_completions with auto_download=True).
151
+
152
+ Returns:
153
+ True if VLM is loaded, False if loading failed
154
+ """
155
+ if self.vlm_loaded:
156
+ return True
157
+
158
+ if not self.auto_load:
159
+ logger.warning("VLM not loaded and auto_load=False")
160
+ return False
161
+
162
+ try:
163
+ logger.debug(f"Loading VLM model: {self.vlm_model}")
164
+ # Load model (auto-download handled by lemonade_client, may take hours)
165
+ self.client.load_model(self.vlm_model, timeout=60, auto_download=True)
166
+ self.vlm_loaded = True
167
+ logger.debug(f"VLM model loaded: {self.vlm_model}")
168
+ return True
169
+
170
+ except Exception as e:
171
+ logger.error(f"Failed to load VLM model: {e}")
172
+ logger.error(
173
+ f" Make sure Lemonade server is running at {self.server_url}"
174
+ )
175
+ return False
176
+
177
+ def extract_from_image(
178
+ self,
179
+ image_bytes: bytes,
180
+ image_num: int = 1,
181
+ page_num: int = 1,
182
+ prompt: Optional[str] = None,
183
+ ) -> str:
184
+ """
185
+ Extract text from an image using VLM.
186
+
187
+ Args:
188
+ image_bytes: Image as PNG/JPEG bytes
189
+ image_num: Image number on page (for logging)
190
+ page_num: Page number (for logging)
191
+ prompt: Custom extraction prompt (optional)
192
+
193
+ Returns:
194
+ Extracted text in markdown format
195
+ """
196
+ # Ensure VLM is loaded
197
+ if not self._ensure_vlm_loaded():
198
+ error_msg = "VLM model not available"
199
+ logger.error(error_msg)
200
+ return f"[VLM extraction failed: {error_msg}]"
201
+
202
+ # Encode image as base64 and detect MIME type
203
+ # Note: Image size optimization happens in pdf_utils.py during extraction
204
+ image_b64 = base64.b64encode(image_bytes).decode("utf-8")
205
+ mime_type = detect_image_mime_type(image_bytes)
206
+
207
+ # Default prompt for text extraction
208
+ if not prompt:
209
+ prompt = """You are an OCR system. Extract ALL visible text from this image exactly as it appears.
210
+
211
+ Instructions:
212
+ 1. Extract EVERY word you see - don't skip or paraphrase
213
+ 2. Preserve exact formatting (headings, bold, bullets, tables)
214
+ 3. If it's a table, format as markdown table
215
+ 4. If it's a chart, describe what you see: [CHART: ...]
216
+ 5. Do NOT add placeholders like "[Insert ...]" - only extract actual text
217
+ 6. Do NOT generate or invent content - only extract what you see
218
+
219
+ Output format: Clean markdown with the ACTUAL text from the image."""
220
+
221
+ # Format message with image (OpenAI vision format)
222
+ messages = [
223
+ {
224
+ "role": "user",
225
+ "content": [
226
+ {"type": "text", "text": prompt},
227
+ {
228
+ "type": "image_url",
229
+ "image_url": {"url": f"data:{mime_type};base64,{image_b64}"},
230
+ },
231
+ ],
232
+ }
233
+ ]
234
+
235
+ try:
236
+ import time
237
+
238
+ start_time = time.time()
239
+
240
+ logger.debug(
241
+ f"VLM extracting from image {image_num} on page {page_num} ({mime_type})..."
242
+ )
243
+ logger.debug(
244
+ f" Image: {mime_type}, {len(image_b64)} chars base64 ({len(image_bytes)} bytes raw)"
245
+ )
246
+
247
+ # Call VLM using chat completions endpoint
248
+ response = self.client.chat_completions(
249
+ model=self.vlm_model,
250
+ messages=messages,
251
+ temperature=0.1, # Low temp for accurate extraction
252
+ max_completion_tokens=2048, # Allow detailed extraction
253
+ timeout=300, # VLM needs more time for complex forms (5 min)
254
+ )
255
+
256
+ elapsed = time.time() - start_time
257
+
258
+ # Extract text from response
259
+ if (
260
+ isinstance(response, dict)
261
+ and "choices" in response
262
+ and len(response["choices"]) > 0
263
+ ):
264
+ extracted_text = response["choices"][0]["message"]["content"]
265
+ size_kb = len(image_bytes) / 1024
266
+ logger.debug(
267
+ f"Extracted {len(extracted_text)} chars from image {image_num} "
268
+ f"in {elapsed:.2f}s ({size_kb:.0f}KB image)"
269
+ )
270
+ return extracted_text
271
+ else:
272
+ # Check for specific error types and provide helpful messages
273
+ error_msg = self._parse_vlm_error(response)
274
+ logger.error(error_msg)
275
+ return f"[VLM extraction failed: {error_msg}]"
276
+
277
+ except Exception as e:
278
+ logger.error(
279
+ f"VLM extraction failed for page {page_num}, image {image_num}: {e}"
280
+ )
281
+ import traceback
282
+
283
+ logger.debug(traceback.format_exc())
284
+ return f"[VLM extraction failed: {str(e)}]"
285
+
286
+ def _parse_vlm_error(self, response: dict) -> str:
287
+ """Parse VLM error response and return a helpful error message."""
288
+ if not isinstance(response, dict):
289
+ return f"Unexpected response type: {type(response)}"
290
+
291
+ # Check for nested error structure from Lemonade
292
+ error = response.get("error", {})
293
+ if isinstance(error, dict):
294
+ details = error.get("details", {})
295
+ inner_response = (
296
+ details.get("response", {}) if isinstance(details, dict) else {}
297
+ )
298
+ inner_error = (
299
+ inner_response.get("error", {})
300
+ if isinstance(inner_response, dict)
301
+ else {}
302
+ )
303
+
304
+ # Context size error
305
+ if inner_error.get("type") == "exceed_context_size_error":
306
+ n_ctx = inner_error.get("n_ctx", "unknown")
307
+ n_prompt = inner_error.get("n_prompt_tokens", "unknown")
308
+ return (
309
+ f"Context size too small! Image requires {n_prompt} tokens "
310
+ f"but model context is only {n_ctx}. "
311
+ f"To fix: Right-click Lemonade tray icon → Settings → "
312
+ f"set Context Size to 32768, then restart the model."
313
+ )
314
+
315
+ # Other backend errors
316
+ if error.get("type") == "backend_error":
317
+ msg = inner_error.get(
318
+ "message", error.get("message", "Unknown backend error")
319
+ )
320
+ return f"Backend error: {msg}"
321
+
322
+ return f"Unexpected response format: {response}"
323
+
324
+ def extract_from_page_images(self, images: list, page_num: int) -> list:
325
+ """
326
+ Extract text from multiple images on a page.
327
+
328
+ Args:
329
+ images: List of image dicts with 'image_bytes', 'width', 'height', etc.
330
+ page_num: Page number
331
+
332
+ Returns:
333
+ List of dicts:
334
+ [
335
+ {
336
+ "image_num": 1,
337
+ "text": "extracted markdown",
338
+ "dimensions": "800x600",
339
+ "size_kb": 45.2
340
+ },
341
+ ...
342
+ ]
343
+ """
344
+ results = []
345
+
346
+ for img_idx, img_data in enumerate(images, 1):
347
+ extracted_text = self.extract_from_image(
348
+ image_bytes=img_data["image_bytes"],
349
+ image_num=img_idx,
350
+ page_num=page_num,
351
+ )
352
+
353
+ results.append(
354
+ {
355
+ "image_num": img_idx,
356
+ "text": extracted_text,
357
+ "dimensions": f"{img_data['width']}x{img_data['height']}",
358
+ "size_kb": img_data["size_kb"],
359
+ }
360
+ )
361
+
362
+ return results
363
+
364
+ def cleanup(self):
365
+ """
366
+ Cleanup VLM resources.
367
+
368
+ Call this after batch processing to mark VLM as unloaded.
369
+ Note: Model remains loaded on server; this just updates local state.
370
+ """
371
+ if self.vlm_loaded:
372
+ logger.info("🧹 VLM processing complete")
373
+ self.vlm_loaded = False
374
+
375
+ def __enter__(self):
376
+ """Context manager entry - ensure VLM loaded."""
377
+ self._ensure_vlm_loaded()
378
+ return self
379
+
380
+ def __exit__(self, exc_type, exc_val, exc_tb):
381
+ """Context manager exit - cleanup VLM state."""
382
+ self.cleanup()