amd-gaia 0.15.0__py3-none-any.whl → 0.15.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185) hide show
  1. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.2.dist-info}/METADATA +222 -223
  2. amd_gaia-0.15.2.dist-info/RECORD +182 -0
  3. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.2.dist-info}/WHEEL +1 -1
  4. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.2.dist-info}/entry_points.txt +1 -0
  5. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.2.dist-info}/licenses/LICENSE.md +20 -20
  6. gaia/__init__.py +29 -29
  7. gaia/agents/__init__.py +19 -19
  8. gaia/agents/base/__init__.py +9 -9
  9. gaia/agents/base/agent.py +2132 -2177
  10. gaia/agents/base/api_agent.py +119 -120
  11. gaia/agents/base/console.py +1967 -1841
  12. gaia/agents/base/errors.py +237 -237
  13. gaia/agents/base/mcp_agent.py +86 -86
  14. gaia/agents/base/tools.py +88 -83
  15. gaia/agents/blender/__init__.py +7 -0
  16. gaia/agents/blender/agent.py +553 -556
  17. gaia/agents/blender/agent_simple.py +133 -135
  18. gaia/agents/blender/app.py +211 -211
  19. gaia/agents/blender/app_simple.py +41 -41
  20. gaia/agents/blender/core/__init__.py +16 -16
  21. gaia/agents/blender/core/materials.py +506 -506
  22. gaia/agents/blender/core/objects.py +316 -316
  23. gaia/agents/blender/core/rendering.py +225 -225
  24. gaia/agents/blender/core/scene.py +220 -220
  25. gaia/agents/blender/core/view.py +146 -146
  26. gaia/agents/chat/__init__.py +9 -9
  27. gaia/agents/chat/agent.py +809 -835
  28. gaia/agents/chat/app.py +1065 -1058
  29. gaia/agents/chat/session.py +508 -508
  30. gaia/agents/chat/tools/__init__.py +15 -15
  31. gaia/agents/chat/tools/file_tools.py +96 -96
  32. gaia/agents/chat/tools/rag_tools.py +1744 -1729
  33. gaia/agents/chat/tools/shell_tools.py +437 -436
  34. gaia/agents/code/__init__.py +7 -7
  35. gaia/agents/code/agent.py +549 -549
  36. gaia/agents/code/cli.py +377 -0
  37. gaia/agents/code/models.py +135 -135
  38. gaia/agents/code/orchestration/__init__.py +24 -24
  39. gaia/agents/code/orchestration/checklist_executor.py +1763 -1763
  40. gaia/agents/code/orchestration/checklist_generator.py +713 -713
  41. gaia/agents/code/orchestration/factories/__init__.py +9 -9
  42. gaia/agents/code/orchestration/factories/base.py +63 -63
  43. gaia/agents/code/orchestration/factories/nextjs_factory.py +118 -118
  44. gaia/agents/code/orchestration/factories/python_factory.py +106 -106
  45. gaia/agents/code/orchestration/orchestrator.py +841 -841
  46. gaia/agents/code/orchestration/project_analyzer.py +391 -391
  47. gaia/agents/code/orchestration/steps/__init__.py +67 -67
  48. gaia/agents/code/orchestration/steps/base.py +188 -188
  49. gaia/agents/code/orchestration/steps/error_handler.py +314 -314
  50. gaia/agents/code/orchestration/steps/nextjs.py +828 -828
  51. gaia/agents/code/orchestration/steps/python.py +307 -307
  52. gaia/agents/code/orchestration/template_catalog.py +469 -469
  53. gaia/agents/code/orchestration/workflows/__init__.py +14 -14
  54. gaia/agents/code/orchestration/workflows/base.py +80 -80
  55. gaia/agents/code/orchestration/workflows/nextjs.py +186 -186
  56. gaia/agents/code/orchestration/workflows/python.py +94 -94
  57. gaia/agents/code/prompts/__init__.py +11 -11
  58. gaia/agents/code/prompts/base_prompt.py +77 -77
  59. gaia/agents/code/prompts/code_patterns.py +2034 -2036
  60. gaia/agents/code/prompts/nextjs_prompt.py +40 -40
  61. gaia/agents/code/prompts/python_prompt.py +109 -109
  62. gaia/agents/code/schema_inference.py +365 -365
  63. gaia/agents/code/system_prompt.py +41 -41
  64. gaia/agents/code/tools/__init__.py +42 -42
  65. gaia/agents/code/tools/cli_tools.py +1138 -1138
  66. gaia/agents/code/tools/code_formatting.py +319 -319
  67. gaia/agents/code/tools/code_tools.py +769 -769
  68. gaia/agents/code/tools/error_fixing.py +1347 -1347
  69. gaia/agents/code/tools/external_tools.py +180 -180
  70. gaia/agents/code/tools/file_io.py +845 -845
  71. gaia/agents/code/tools/prisma_tools.py +190 -190
  72. gaia/agents/code/tools/project_management.py +1016 -1016
  73. gaia/agents/code/tools/testing.py +321 -321
  74. gaia/agents/code/tools/typescript_tools.py +122 -122
  75. gaia/agents/code/tools/validation_parsing.py +461 -461
  76. gaia/agents/code/tools/validation_tools.py +806 -806
  77. gaia/agents/code/tools/web_dev_tools.py +1758 -1758
  78. gaia/agents/code/validators/__init__.py +16 -16
  79. gaia/agents/code/validators/antipattern_checker.py +241 -241
  80. gaia/agents/code/validators/ast_analyzer.py +197 -197
  81. gaia/agents/code/validators/requirements_validator.py +145 -145
  82. gaia/agents/code/validators/syntax_validator.py +171 -171
  83. gaia/agents/docker/__init__.py +7 -7
  84. gaia/agents/docker/agent.py +643 -642
  85. gaia/agents/emr/__init__.py +8 -8
  86. gaia/agents/emr/agent.py +1504 -1506
  87. gaia/agents/emr/cli.py +1322 -1322
  88. gaia/agents/emr/constants.py +475 -475
  89. gaia/agents/emr/dashboard/__init__.py +4 -4
  90. gaia/agents/emr/dashboard/server.py +1972 -1974
  91. gaia/agents/jira/__init__.py +11 -11
  92. gaia/agents/jira/agent.py +894 -894
  93. gaia/agents/jira/jql_templates.py +299 -299
  94. gaia/agents/routing/__init__.py +7 -7
  95. gaia/agents/routing/agent.py +567 -570
  96. gaia/agents/routing/system_prompt.py +75 -75
  97. gaia/agents/summarize/__init__.py +11 -0
  98. gaia/agents/summarize/agent.py +885 -0
  99. gaia/agents/summarize/prompts.py +129 -0
  100. gaia/api/__init__.py +23 -23
  101. gaia/api/agent_registry.py +238 -238
  102. gaia/api/app.py +305 -305
  103. gaia/api/openai_server.py +575 -575
  104. gaia/api/schemas.py +186 -186
  105. gaia/api/sse_handler.py +373 -373
  106. gaia/apps/__init__.py +4 -4
  107. gaia/apps/llm/__init__.py +6 -6
  108. gaia/apps/llm/app.py +184 -169
  109. gaia/apps/summarize/app.py +116 -633
  110. gaia/apps/summarize/html_viewer.py +133 -133
  111. gaia/apps/summarize/pdf_formatter.py +284 -284
  112. gaia/audio/__init__.py +2 -2
  113. gaia/audio/audio_client.py +439 -439
  114. gaia/audio/audio_recorder.py +269 -269
  115. gaia/audio/kokoro_tts.py +599 -599
  116. gaia/audio/whisper_asr.py +432 -432
  117. gaia/chat/__init__.py +16 -16
  118. gaia/chat/app.py +428 -430
  119. gaia/chat/prompts.py +522 -522
  120. gaia/chat/sdk.py +1228 -1225
  121. gaia/cli.py +5659 -5632
  122. gaia/database/__init__.py +10 -10
  123. gaia/database/agent.py +176 -176
  124. gaia/database/mixin.py +290 -290
  125. gaia/database/testing.py +64 -64
  126. gaia/eval/batch_experiment.py +2332 -2332
  127. gaia/eval/claude.py +542 -542
  128. gaia/eval/config.py +37 -37
  129. gaia/eval/email_generator.py +512 -512
  130. gaia/eval/eval.py +3179 -3179
  131. gaia/eval/groundtruth.py +1130 -1130
  132. gaia/eval/transcript_generator.py +582 -582
  133. gaia/eval/webapp/README.md +167 -167
  134. gaia/eval/webapp/package-lock.json +875 -875
  135. gaia/eval/webapp/package.json +20 -20
  136. gaia/eval/webapp/public/app.js +3402 -3402
  137. gaia/eval/webapp/public/index.html +87 -87
  138. gaia/eval/webapp/public/styles.css +3661 -3661
  139. gaia/eval/webapp/server.js +415 -415
  140. gaia/eval/webapp/test-setup.js +72 -72
  141. gaia/installer/__init__.py +23 -0
  142. gaia/installer/init_command.py +1275 -0
  143. gaia/installer/lemonade_installer.py +619 -0
  144. gaia/llm/__init__.py +10 -2
  145. gaia/llm/base_client.py +60 -0
  146. gaia/llm/exceptions.py +12 -0
  147. gaia/llm/factory.py +70 -0
  148. gaia/llm/lemonade_client.py +3421 -3221
  149. gaia/llm/lemonade_manager.py +294 -294
  150. gaia/llm/providers/__init__.py +9 -0
  151. gaia/llm/providers/claude.py +108 -0
  152. gaia/llm/providers/lemonade.py +118 -0
  153. gaia/llm/providers/openai_provider.py +79 -0
  154. gaia/llm/vlm_client.py +382 -382
  155. gaia/logger.py +189 -189
  156. gaia/mcp/agent_mcp_server.py +245 -245
  157. gaia/mcp/blender_mcp_client.py +138 -138
  158. gaia/mcp/blender_mcp_server.py +648 -648
  159. gaia/mcp/context7_cache.py +332 -332
  160. gaia/mcp/external_services.py +518 -518
  161. gaia/mcp/mcp_bridge.py +811 -550
  162. gaia/mcp/servers/__init__.py +6 -6
  163. gaia/mcp/servers/docker_mcp.py +83 -83
  164. gaia/perf_analysis.py +361 -0
  165. gaia/rag/__init__.py +10 -10
  166. gaia/rag/app.py +293 -293
  167. gaia/rag/demo.py +304 -304
  168. gaia/rag/pdf_utils.py +235 -235
  169. gaia/rag/sdk.py +2194 -2194
  170. gaia/security.py +183 -163
  171. gaia/talk/app.py +287 -289
  172. gaia/talk/sdk.py +538 -538
  173. gaia/testing/__init__.py +87 -87
  174. gaia/testing/assertions.py +330 -330
  175. gaia/testing/fixtures.py +333 -333
  176. gaia/testing/mocks.py +493 -493
  177. gaia/util.py +46 -46
  178. gaia/utils/__init__.py +33 -33
  179. gaia/utils/file_watcher.py +675 -675
  180. gaia/utils/parsing.py +223 -223
  181. gaia/version.py +100 -100
  182. amd_gaia-0.15.0.dist-info/RECORD +0 -168
  183. gaia/agents/code/app.py +0 -266
  184. gaia/llm/llm_client.py +0 -723
  185. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.2.dist-info}/top_level.txt +0 -0
gaia/llm/vlm_client.py CHANGED
@@ -1,382 +1,382 @@
1
- #!/usr/bin/env python3
2
- # Copyright(C) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
3
- # SPDX-License-Identifier: MIT
4
-
5
- """
6
- Vision-Language Model (VLM) client for extracting text from images.
7
-
8
- Handles model loading/unloading and image-to-text extraction via Lemonade server.
9
- """
10
-
11
- import base64
12
- import logging
13
- import os
14
- from typing import Optional
15
-
16
- from dotenv import load_dotenv
17
-
18
- # Load environment variables from .env file
19
- load_dotenv()
20
-
21
- # Default Lemonade server URL (can be overridden via LEMONADE_BASE_URL env var)
22
- DEFAULT_LEMONADE_URL = "http://localhost:8000/api/v1"
23
-
24
- logger = logging.getLogger(__name__)
25
-
26
- # Magic bytes for common image formats
27
- IMAGE_SIGNATURES = {
28
- b"\x89PNG\r\n\x1a\n": "image/png",
29
- b"\xff\xd8\xff": "image/jpeg",
30
- b"GIF87a": "image/gif",
31
- b"GIF89a": "image/gif",
32
- b"RIFF": "image/webp", # WebP starts with RIFF...WEBP
33
- b"BM": "image/bmp",
34
- }
35
-
36
-
37
- def detect_image_mime_type(image_bytes: bytes) -> str:
38
- """
39
- Detect MIME type from image bytes using magic number signatures.
40
-
41
- Args:
42
- image_bytes: Raw image bytes
43
-
44
- Returns:
45
- MIME type string (e.g., "image/jpeg", "image/png")
46
- Defaults to "image/png" if format not detected.
47
- """
48
- for signature, mime_type in IMAGE_SIGNATURES.items():
49
- if image_bytes.startswith(signature):
50
- # Special case: WebP needs additional check for WEBP marker
51
- if signature == b"RIFF" and len(image_bytes) >= 12:
52
- if image_bytes[8:12] != b"WEBP":
53
- continue
54
- return mime_type
55
-
56
- # Default to PNG if format not detected
57
- logger.debug("Could not detect image format, defaulting to image/png")
58
- return "image/png"
59
-
60
-
61
- class VLMClient:
62
- """
63
- VLM client for extracting text from images using Lemonade server.
64
-
65
- Handles:
66
- - Model loading (default: Qwen3-VL-4B-Instruct-GGUF)
67
- - Image-to-markdown conversion
68
- - State tracking for VLM processing
69
- """
70
-
71
- def __init__(
72
- self,
73
- vlm_model: str = "Qwen3-VL-4B-Instruct-GGUF",
74
- base_url: Optional[str] = None,
75
- auto_load: bool = True,
76
- ):
77
- """
78
- Initialize VLM client.
79
-
80
- Args:
81
- vlm_model: Vision model to use for image extraction
82
- base_url: Lemonade server API URL (defaults to LEMONADE_BASE_URL env var)
83
- auto_load: Automatically load VLM model on first use
84
- """
85
- # Use provided base_url, fall back to env var, then default
86
- if base_url is None:
87
- base_url = os.getenv("LEMONADE_BASE_URL", DEFAULT_LEMONADE_URL)
88
- from urllib.parse import urlparse
89
-
90
- from gaia.llm.lemonade_client import LemonadeClient
91
-
92
- self.vlm_model = vlm_model
93
- self.base_url = base_url
94
-
95
- # Parse base_url to extract host and port for LemonadeClient
96
- parsed = urlparse(base_url)
97
- host = parsed.hostname or "localhost"
98
- port = parsed.port or 8000
99
-
100
- # Get base server URL (without /api/v1) for user-facing messages
101
- self.server_url = f"http://{host}:{port}"
102
-
103
- self.client = LemonadeClient(model=vlm_model, host=host, port=port)
104
- self.auto_load = auto_load
105
- self.vlm_loaded = False
106
-
107
- logger.debug(f"VLM Client initialized: {self.vlm_model} at {self.server_url}")
108
-
109
- def check_availability(self) -> bool:
110
- """
111
- Check if VLM model is available on Lemonade server.
112
-
113
- Returns:
114
- True if model is available, False otherwise
115
- """
116
- try:
117
- models_response = self.client.list_models()
118
- available_models = [
119
- m.get("id", "") for m in models_response.get("data", [])
120
- ]
121
-
122
- if self.vlm_model in available_models:
123
- logger.debug(f"VLM model available: {self.vlm_model}")
124
- return True
125
- else:
126
- logger.warning(f"❌ VLM model not found: {self.vlm_model}")
127
- logger.warning("")
128
- logger.warning("📥 To download this model:")
129
- logger.warning(f" 1. Open Lemonade Model Manager ({self.server_url})")
130
- logger.warning(f" 2. Search for: {self.vlm_model}")
131
- logger.warning(" 3. Click 'Download' to install the model")
132
- logger.warning("")
133
- logger.warning(
134
- f" Available models: {', '.join(available_models[:3])}..."
135
- )
136
- return False
137
-
138
- except Exception as e:
139
- logger.error(f"Failed to check VLM availability: {e}")
140
- logger.error(
141
- f" Make sure Lemonade server is running at {self.server_url}"
142
- )
143
- return False
144
-
145
- def _ensure_vlm_loaded(self) -> bool:
146
- """
147
- Ensure VLM model is loaded, load it if necessary.
148
-
149
- The model will be automatically downloaded if not available (handled by
150
- lemonade_client.chat_completions with auto_download=True).
151
-
152
- Returns:
153
- True if VLM is loaded, False if loading failed
154
- """
155
- if self.vlm_loaded:
156
- return True
157
-
158
- if not self.auto_load:
159
- logger.warning("VLM not loaded and auto_load=False")
160
- return False
161
-
162
- try:
163
- logger.debug(f"Loading VLM model: {self.vlm_model}")
164
- # Load model (auto-download handled by lemonade_client, may take hours)
165
- self.client.load_model(self.vlm_model, timeout=60, auto_download=True)
166
- self.vlm_loaded = True
167
- logger.debug(f"VLM model loaded: {self.vlm_model}")
168
- return True
169
-
170
- except Exception as e:
171
- logger.error(f"Failed to load VLM model: {e}")
172
- logger.error(
173
- f" Make sure Lemonade server is running at {self.server_url}"
174
- )
175
- return False
176
-
177
- def extract_from_image(
178
- self,
179
- image_bytes: bytes,
180
- image_num: int = 1,
181
- page_num: int = 1,
182
- prompt: Optional[str] = None,
183
- ) -> str:
184
- """
185
- Extract text from an image using VLM.
186
-
187
- Args:
188
- image_bytes: Image as PNG/JPEG bytes
189
- image_num: Image number on page (for logging)
190
- page_num: Page number (for logging)
191
- prompt: Custom extraction prompt (optional)
192
-
193
- Returns:
194
- Extracted text in markdown format
195
- """
196
- # Ensure VLM is loaded
197
- if not self._ensure_vlm_loaded():
198
- error_msg = "VLM model not available"
199
- logger.error(error_msg)
200
- return f"[VLM extraction failed: {error_msg}]"
201
-
202
- # Encode image as base64 and detect MIME type
203
- # Note: Image size optimization happens in pdf_utils.py during extraction
204
- image_b64 = base64.b64encode(image_bytes).decode("utf-8")
205
- mime_type = detect_image_mime_type(image_bytes)
206
-
207
- # Default prompt for text extraction
208
- if not prompt:
209
- prompt = """You are an OCR system. Extract ALL visible text from this image exactly as it appears.
210
-
211
- Instructions:
212
- 1. Extract EVERY word you see - don't skip or paraphrase
213
- 2. Preserve exact formatting (headings, bold, bullets, tables)
214
- 3. If it's a table, format as markdown table
215
- 4. If it's a chart, describe what you see: [CHART: ...]
216
- 5. Do NOT add placeholders like "[Insert ...]" - only extract actual text
217
- 6. Do NOT generate or invent content - only extract what you see
218
-
219
- Output format: Clean markdown with the ACTUAL text from the image."""
220
-
221
- # Format message with image (OpenAI vision format)
222
- messages = [
223
- {
224
- "role": "user",
225
- "content": [
226
- {"type": "text", "text": prompt},
227
- {
228
- "type": "image_url",
229
- "image_url": {"url": f"data:{mime_type};base64,{image_b64}"},
230
- },
231
- ],
232
- }
233
- ]
234
-
235
- try:
236
- import time
237
-
238
- start_time = time.time()
239
-
240
- logger.debug(
241
- f"VLM extracting from image {image_num} on page {page_num} ({mime_type})..."
242
- )
243
- logger.debug(
244
- f" Image: {mime_type}, {len(image_b64)} chars base64 ({len(image_bytes)} bytes raw)"
245
- )
246
-
247
- # Call VLM using chat completions endpoint
248
- response = self.client.chat_completions(
249
- model=self.vlm_model,
250
- messages=messages,
251
- temperature=0.1, # Low temp for accurate extraction
252
- max_completion_tokens=2048, # Allow detailed extraction
253
- timeout=300, # VLM needs more time for complex forms (5 min)
254
- )
255
-
256
- elapsed = time.time() - start_time
257
-
258
- # Extract text from response
259
- if (
260
- isinstance(response, dict)
261
- and "choices" in response
262
- and len(response["choices"]) > 0
263
- ):
264
- extracted_text = response["choices"][0]["message"]["content"]
265
- size_kb = len(image_bytes) / 1024
266
- logger.debug(
267
- f"Extracted {len(extracted_text)} chars from image {image_num} "
268
- f"in {elapsed:.2f}s ({size_kb:.0f}KB image)"
269
- )
270
- return extracted_text
271
- else:
272
- # Check for specific error types and provide helpful messages
273
- error_msg = self._parse_vlm_error(response)
274
- logger.error(error_msg)
275
- return f"[VLM extraction failed: {error_msg}]"
276
-
277
- except Exception as e:
278
- logger.error(
279
- f"VLM extraction failed for page {page_num}, image {image_num}: {e}"
280
- )
281
- import traceback
282
-
283
- logger.debug(traceback.format_exc())
284
- return f"[VLM extraction failed: {str(e)}]"
285
-
286
- def _parse_vlm_error(self, response: dict) -> str:
287
- """Parse VLM error response and return a helpful error message."""
288
- if not isinstance(response, dict):
289
- return f"Unexpected response type: {type(response)}"
290
-
291
- # Check for nested error structure from Lemonade
292
- error = response.get("error", {})
293
- if isinstance(error, dict):
294
- details = error.get("details", {})
295
- inner_response = (
296
- details.get("response", {}) if isinstance(details, dict) else {}
297
- )
298
- inner_error = (
299
- inner_response.get("error", {})
300
- if isinstance(inner_response, dict)
301
- else {}
302
- )
303
-
304
- # Context size error
305
- if inner_error.get("type") == "exceed_context_size_error":
306
- n_ctx = inner_error.get("n_ctx", "unknown")
307
- n_prompt = inner_error.get("n_prompt_tokens", "unknown")
308
- return (
309
- f"Context size too small! Image requires {n_prompt} tokens "
310
- f"but model context is only {n_ctx}. "
311
- f"To fix: Right-click Lemonade tray icon → Settings → "
312
- f"set Context Size to 32768, then restart the model."
313
- )
314
-
315
- # Other backend errors
316
- if error.get("type") == "backend_error":
317
- msg = inner_error.get(
318
- "message", error.get("message", "Unknown backend error")
319
- )
320
- return f"Backend error: {msg}"
321
-
322
- return f"Unexpected response format: {response}"
323
-
324
- def extract_from_page_images(self, images: list, page_num: int) -> list:
325
- """
326
- Extract text from multiple images on a page.
327
-
328
- Args:
329
- images: List of image dicts with 'image_bytes', 'width', 'height', etc.
330
- page_num: Page number
331
-
332
- Returns:
333
- List of dicts:
334
- [
335
- {
336
- "image_num": 1,
337
- "text": "extracted markdown",
338
- "dimensions": "800x600",
339
- "size_kb": 45.2
340
- },
341
- ...
342
- ]
343
- """
344
- results = []
345
-
346
- for img_idx, img_data in enumerate(images, 1):
347
- extracted_text = self.extract_from_image(
348
- image_bytes=img_data["image_bytes"],
349
- image_num=img_idx,
350
- page_num=page_num,
351
- )
352
-
353
- results.append(
354
- {
355
- "image_num": img_idx,
356
- "text": extracted_text,
357
- "dimensions": f"{img_data['width']}x{img_data['height']}",
358
- "size_kb": img_data["size_kb"],
359
- }
360
- )
361
-
362
- return results
363
-
364
- def cleanup(self):
365
- """
366
- Cleanup VLM resources.
367
-
368
- Call this after batch processing to mark VLM as unloaded.
369
- Note: Model remains loaded on server; this just updates local state.
370
- """
371
- if self.vlm_loaded:
372
- logger.info("🧹 VLM processing complete")
373
- self.vlm_loaded = False
374
-
375
- def __enter__(self):
376
- """Context manager entry - ensure VLM loaded."""
377
- self._ensure_vlm_loaded()
378
- return self
379
-
380
- def __exit__(self, exc_type, exc_val, exc_tb):
381
- """Context manager exit - cleanup VLM state."""
382
- self.cleanup()
1
+ #!/usr/bin/env python3
2
+ # Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
3
+ # SPDX-License-Identifier: MIT
4
+
5
+ """
6
+ Vision-Language Model (VLM) client for extracting text from images.
7
+
8
+ Handles model loading/unloading and image-to-text extraction via Lemonade server.
9
+ """
10
+
11
+ import base64
12
+ import logging
13
+ import os
14
+ from typing import Optional
15
+
16
+ from dotenv import load_dotenv
17
+
18
+ # Load environment variables from .env file
19
+ load_dotenv()
20
+
21
+ # Default Lemonade server URL (can be overridden via LEMONADE_BASE_URL env var)
22
+ DEFAULT_LEMONADE_URL = "http://localhost:8000/api/v1"
23
+
24
logger = logging.getLogger(__name__)

# Magic bytes for common image formats
IMAGE_SIGNATURES = {
    b"\x89PNG\r\n\x1a\n": "image/png",
    b"\xff\xd8\xff": "image/jpeg",
    b"GIF87a": "image/gif",
    b"GIF89a": "image/gif",
    b"RIFF": "image/webp",  # WebP starts with RIFF...WEBP (confirmed below)
    b"BM": "image/bmp",
}


def detect_image_mime_type(image_bytes: bytes) -> str:
    """
    Detect MIME type from image bytes using magic number signatures.

    Args:
        image_bytes: Raw image bytes

    Returns:
        MIME type string (e.g., "image/jpeg", "image/png").
        Defaults to "image/png" if format not detected.
    """
    for signature, mime_type in IMAGE_SIGNATURES.items():
        if not image_bytes.startswith(signature):
            continue
        # Special case: "RIFF" is a generic container prefix (WebP, WAV, AVI all
        # use it). Only report WebP when the "WEBP" fourcc at bytes 8-12
        # confirms it. A payload too short to contain the marker can never
        # equal b"WEBP" here, so truncated RIFF data correctly falls through
        # to the default instead of being misreported as WebP.
        if signature == b"RIFF" and image_bytes[8:12] != b"WEBP":
            continue
        return mime_type

    # Default to PNG if format not detected
    logger.debug("Could not detect image format, defaulting to image/png")
    return "image/png"
59
+
60
+
61
class VLMClient:
    """
    VLM client for extracting text from images using Lemonade server.

    Handles:
    - Model loading (default: Qwen3-VL-4B-Instruct-GGUF)
    - Image-to-markdown conversion
    - State tracking for VLM processing
    """

    def __init__(
        self,
        vlm_model: str = "Qwen3-VL-4B-Instruct-GGUF",
        base_url: Optional[str] = None,
        auto_load: bool = True,
    ) -> None:
        """
        Initialize VLM client.

        Args:
            vlm_model: Vision model to use for image extraction
            base_url: Lemonade server API URL (defaults to LEMONADE_BASE_URL env var)
            auto_load: Automatically load VLM model on first use
        """
        # Use provided base_url, fall back to env var, then default
        if base_url is None:
            base_url = os.getenv("LEMONADE_BASE_URL", DEFAULT_LEMONADE_URL)
        # Imports are deferred to call time — presumably to avoid an import
        # cycle with gaia.llm.lemonade_client; TODO confirm.
        from urllib.parse import urlparse

        from gaia.llm.lemonade_client import LemonadeClient

        self.vlm_model = vlm_model
        self.base_url = base_url

        # Parse base_url to extract host and port for LemonadeClient
        parsed = urlparse(base_url)
        host = parsed.hostname or "localhost"
        port = parsed.port or 8000

        # Get base server URL (without /api/v1) for user-facing messages.
        # NOTE(review): scheme is hard-coded to "http" even if base_url was
        # given with https — confirm this is intended.
        self.server_url = f"http://{host}:{port}"

        self.client = LemonadeClient(model=vlm_model, host=host, port=port)
        self.auto_load = auto_load
        # Local bookkeeping only; does not query actual server-side state.
        self.vlm_loaded: bool = False

        logger.debug(f"VLM Client initialized: {self.vlm_model} at {self.server_url}")

    def check_availability(self) -> bool:
        """
        Check if VLM model is available on Lemonade server.

        Returns:
            True if model is available, False otherwise
        """
        try:
            models_response = self.client.list_models()
            # Response follows the OpenAI-style list shape: {"data": [{"id": ...}]}
            available_models = [
                m.get("id", "") for m in models_response.get("data", [])
            ]

            if self.vlm_model in available_models:
                logger.debug(f"VLM model available: {self.vlm_model}")
                return True
            else:
                # Emit step-by-step download instructions for the user.
                logger.warning(f"❌ VLM model not found: {self.vlm_model}")
                logger.warning("")
                logger.warning("📥 To download this model:")
                logger.warning(f" 1. Open Lemonade Model Manager ({self.server_url})")
                logger.warning(f" 2. Search for: {self.vlm_model}")
                logger.warning(" 3. Click 'Download' to install the model")
                logger.warning("")
                # Only the first three model ids are shown to keep the log short.
                logger.warning(
                    f" Available models: {', '.join(available_models[:3])}..."
                )
                return False

        except Exception as e:
            # Broad catch: any client/network failure is reported as "unavailable".
            logger.error(f"Failed to check VLM availability: {e}")
            logger.error(
                f" Make sure Lemonade server is running at {self.server_url}"
            )
            return False

    def _ensure_vlm_loaded(self) -> bool:
        """
        Ensure VLM model is loaded, load it if necessary.

        The model will be automatically downloaded if not available (handled by
        lemonade_client.chat_completions with auto_download=True).

        Returns:
            True if VLM is loaded, False if loading failed
        """
        if self.vlm_loaded:
            return True

        if not self.auto_load:
            logger.warning("VLM not loaded and auto_load=False")
            return False

        try:
            logger.debug(f"Loading VLM model: {self.vlm_model}")
            # Load model (auto-download handled by lemonade_client, may take hours)
            # NOTE(review): timeout=60 seems at odds with the "may take hours"
            # comment — confirm load_model's timeout excludes download time.
            self.client.load_model(self.vlm_model, timeout=60, auto_download=True)
            self.vlm_loaded = True
            logger.debug(f"VLM model loaded: {self.vlm_model}")
            return True

        except Exception as e:
            logger.error(f"Failed to load VLM model: {e}")
            logger.error(
                f" Make sure Lemonade server is running at {self.server_url}"
            )
            return False

    def extract_from_image(
        self,
        image_bytes: bytes,
        image_num: int = 1,
        page_num: int = 1,
        prompt: Optional[str] = None,
    ) -> str:
        """
        Extract text from an image using VLM.

        Args:
            image_bytes: Image as PNG/JPEG bytes
            image_num: Image number on page (for logging)
            page_num: Page number (for logging)
            prompt: Custom extraction prompt (optional)

        Returns:
            Extracted text in markdown format, or a
            "[VLM extraction failed: ...]" marker string on any failure
            (this method never raises).
        """
        # Ensure VLM is loaded
        if not self._ensure_vlm_loaded():
            error_msg = "VLM model not available"
            logger.error(error_msg)
            return f"[VLM extraction failed: {error_msg}]"

        # Encode image as base64 and detect MIME type
        # Note: Image size optimization happens in pdf_utils.py during extraction
        image_b64 = base64.b64encode(image_bytes).decode("utf-8")
        mime_type = detect_image_mime_type(image_bytes)

        # Default prompt for text extraction
        if not prompt:
            prompt = """You are an OCR system. Extract ALL visible text from this image exactly as it appears.

Instructions:
1. Extract EVERY word you see - don't skip or paraphrase
2. Preserve exact formatting (headings, bold, bullets, tables)
3. If it's a table, format as markdown table
4. If it's a chart, describe what you see: [CHART: ...]
5. Do NOT add placeholders like "[Insert ...]" - only extract actual text
6. Do NOT generate or invent content - only extract what you see

Output format: Clean markdown with the ACTUAL text from the image."""

        # Format message with image (OpenAI vision format)
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        # Image is inlined as a data: URI rather than a URL.
                        "image_url": {"url": f"data:{mime_type};base64,{image_b64}"},
                    },
                ],
            }
        ]

        try:
            import time

            start_time = time.time()

            logger.debug(
                f"VLM extracting from image {image_num} on page {page_num} ({mime_type})..."
            )
            logger.debug(
                f" Image: {mime_type}, {len(image_b64)} chars base64 ({len(image_bytes)} bytes raw)"
            )

            # Call VLM using chat completions endpoint
            response = self.client.chat_completions(
                model=self.vlm_model,
                messages=messages,
                temperature=0.1,  # Low temp for accurate extraction
                max_completion_tokens=2048,  # Allow detailed extraction
                timeout=300,  # VLM needs more time for complex forms (5 min)
            )

            elapsed = time.time() - start_time

            # Extract text from response
            if (
                isinstance(response, dict)
                and "choices" in response
                and len(response["choices"]) > 0
            ):
                extracted_text = response["choices"][0]["message"]["content"]
                size_kb = len(image_bytes) / 1024
                logger.debug(
                    f"Extracted {len(extracted_text)} chars from image {image_num} "
                    f"in {elapsed:.2f}s ({size_kb:.0f}KB image)"
                )
                return extracted_text
            else:
                # Check for specific error types and provide helpful messages
                error_msg = self._parse_vlm_error(response)
                logger.error(error_msg)
                return f"[VLM extraction failed: {error_msg}]"

        except Exception as e:
            logger.error(
                f"VLM extraction failed for page {page_num}, image {image_num}: {e}"
            )
            import traceback

            # Full traceback only at debug level to keep error logs readable.
            logger.debug(traceback.format_exc())
            return f"[VLM extraction failed: {str(e)}]"

    def _parse_vlm_error(self, response: dict) -> str:
        """Parse VLM error response and return a helpful error message.

        Walks the nested Lemonade error envelope
        (error -> details -> response -> error), tolerating any level being
        missing or not a dict.
        """
        if not isinstance(response, dict):
            return f"Unexpected response type: {type(response)}"

        # Check for nested error structure from Lemonade
        error = response.get("error", {})
        if isinstance(error, dict):
            details = error.get("details", {})
            inner_response = (
                details.get("response", {}) if isinstance(details, dict) else {}
            )
            inner_error = (
                inner_response.get("error", {})
                if isinstance(inner_response, dict)
                else {}
            )

            # Context size error
            if inner_error.get("type") == "exceed_context_size_error":
                n_ctx = inner_error.get("n_ctx", "unknown")
                n_prompt = inner_error.get("n_prompt_tokens", "unknown")
                return (
                    f"Context size too small! Image requires {n_prompt} tokens "
                    f"but model context is only {n_ctx}. "
                    f"To fix: Right-click Lemonade tray icon → Settings → "
                    f"set Context Size to 32768, then restart the model."
                )

            # Other backend errors
            if error.get("type") == "backend_error":
                # Prefer the inner (more specific) message over the outer one.
                msg = inner_error.get(
                    "message", error.get("message", "Unknown backend error")
                )
                return f"Backend error: {msg}"

        return f"Unexpected response format: {response}"

    def extract_from_page_images(self, images: list, page_num: int) -> list:
        """
        Extract text from multiple images on a page.

        Args:
            images: List of image dicts with 'image_bytes', 'width', 'height', etc.
            page_num: Page number

        Returns:
            List of dicts:
            [
                {
                    "image_num": 1,
                    "text": "extracted markdown",
                    "dimensions": "800x600",
                    "size_kb": 45.2
                },
                ...
            ]
        """
        results = []

        # Images are numbered starting at 1 for user-facing logs.
        for img_idx, img_data in enumerate(images, 1):
            extracted_text = self.extract_from_image(
                image_bytes=img_data["image_bytes"],
                image_num=img_idx,
                page_num=page_num,
            )

            results.append(
                {
                    "image_num": img_idx,
                    "text": extracted_text,
                    "dimensions": f"{img_data['width']}x{img_data['height']}",
                    "size_kb": img_data["size_kb"],
                }
            )

        return results

    def cleanup(self) -> None:
        """
        Cleanup VLM resources.

        Call this after batch processing to mark VLM as unloaded.
        Note: Model remains loaded on server; this just updates local state.
        """
        if self.vlm_loaded:
            logger.info("🧹 VLM processing complete")
            self.vlm_loaded = False

    def __enter__(self):
        """Context manager entry - ensure VLM loaded."""
        # Load failure is not raised here; extract_from_image re-checks and
        # returns a failure marker instead.
        self._ensure_vlm_loaded()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit - cleanup VLM state."""
        self.cleanup()