isa-model 0.3.9__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. isa_model/__init__.py +1 -1
  2. isa_model/client.py +732 -565
  3. isa_model/core/cache/redis_cache.py +401 -0
  4. isa_model/core/config/config_manager.py +53 -10
  5. isa_model/core/config.py +1 -1
  6. isa_model/core/database/__init__.py +1 -0
  7. isa_model/core/database/migrations.py +277 -0
  8. isa_model/core/database/supabase_client.py +123 -0
  9. isa_model/core/models/__init__.py +37 -0
  10. isa_model/core/models/model_billing_tracker.py +60 -88
  11. isa_model/core/models/model_manager.py +36 -18
  12. isa_model/core/models/model_repo.py +44 -38
  13. isa_model/core/models/model_statistics_tracker.py +234 -0
  14. isa_model/core/models/model_storage.py +0 -1
  15. isa_model/core/models/model_version_manager.py +959 -0
  16. isa_model/core/pricing_manager.py +2 -249
  17. isa_model/core/resilience/circuit_breaker.py +366 -0
  18. isa_model/core/security/secrets.py +358 -0
  19. isa_model/core/services/__init__.py +2 -4
  20. isa_model/core/services/intelligent_model_selector.py +101 -370
  21. isa_model/core/storage/hf_storage.py +1 -1
  22. isa_model/core/types.py +7 -0
  23. isa_model/deployment/cloud/modal/isa_audio_chatTTS_service.py +520 -0
  24. isa_model/deployment/cloud/modal/isa_audio_fish_service.py +0 -0
  25. isa_model/deployment/cloud/modal/isa_audio_openvoice_service.py +758 -0
  26. isa_model/deployment/cloud/modal/isa_audio_service_v2.py +1044 -0
  27. isa_model/deployment/cloud/modal/isa_embed_rerank_service.py +296 -0
  28. isa_model/deployment/cloud/modal/isa_video_hunyuan_service.py +423 -0
  29. isa_model/deployment/cloud/modal/isa_vision_ocr_service.py +519 -0
  30. isa_model/deployment/cloud/modal/isa_vision_qwen25_service.py +709 -0
  31. isa_model/deployment/cloud/modal/isa_vision_table_service.py +467 -323
  32. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +607 -180
  33. isa_model/deployment/cloud/modal/isa_vision_ui_service_optimized.py +660 -0
  34. isa_model/deployment/core/deployment_manager.py +6 -4
  35. isa_model/deployment/services/auto_hf_modal_deployer.py +894 -0
  36. isa_model/eval/benchmarks/__init__.py +27 -0
  37. isa_model/eval/benchmarks/multimodal_datasets.py +460 -0
  38. isa_model/eval/benchmarks.py +244 -12
  39. isa_model/eval/evaluators/__init__.py +8 -2
  40. isa_model/eval/evaluators/audio_evaluator.py +727 -0
  41. isa_model/eval/evaluators/embedding_evaluator.py +742 -0
  42. isa_model/eval/evaluators/vision_evaluator.py +564 -0
  43. isa_model/eval/example_evaluation.py +395 -0
  44. isa_model/eval/factory.py +272 -5
  45. isa_model/eval/isa_benchmarks.py +700 -0
  46. isa_model/eval/isa_integration.py +582 -0
  47. isa_model/eval/metrics.py +159 -6
  48. isa_model/eval/tests/unit/test_basic.py +396 -0
  49. isa_model/inference/ai_factory.py +44 -8
  50. isa_model/inference/services/audio/__init__.py +21 -0
  51. isa_model/inference/services/audio/base_realtime_service.py +225 -0
  52. isa_model/inference/services/audio/isa_tts_service.py +0 -0
  53. isa_model/inference/services/audio/openai_realtime_service.py +320 -124
  54. isa_model/inference/services/audio/openai_stt_service.py +32 -6
  55. isa_model/inference/services/base_service.py +17 -1
  56. isa_model/inference/services/embedding/__init__.py +13 -0
  57. isa_model/inference/services/embedding/base_embed_service.py +111 -8
  58. isa_model/inference/services/embedding/isa_embed_service.py +305 -0
  59. isa_model/inference/services/embedding/openai_embed_service.py +2 -4
  60. isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
  61. isa_model/inference/services/img/__init__.py +2 -2
  62. isa_model/inference/services/img/base_image_gen_service.py +24 -7
  63. isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
  64. isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
  65. isa_model/inference/services/img/services/replicate_flux.py +226 -0
  66. isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
  67. isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
  68. isa_model/inference/services/img/tests/test_img_client.py +297 -0
  69. isa_model/inference/services/llm/base_llm_service.py +30 -6
  70. isa_model/inference/services/llm/helpers/llm_adapter.py +63 -9
  71. isa_model/inference/services/llm/ollama_llm_service.py +2 -1
  72. isa_model/inference/services/llm/openai_llm_service.py +652 -55
  73. isa_model/inference/services/llm/yyds_llm_service.py +2 -1
  74. isa_model/inference/services/vision/__init__.py +5 -5
  75. isa_model/inference/services/vision/base_vision_service.py +118 -185
  76. isa_model/inference/services/vision/helpers/image_utils.py +11 -5
  77. isa_model/inference/services/vision/isa_vision_service.py +573 -0
  78. isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
  79. isa_model/serving/api/fastapi_server.py +88 -16
  80. isa_model/serving/api/middleware/auth.py +311 -0
  81. isa_model/serving/api/middleware/security.py +278 -0
  82. isa_model/serving/api/routes/analytics.py +486 -0
  83. isa_model/serving/api/routes/deployments.py +339 -0
  84. isa_model/serving/api/routes/evaluations.py +579 -0
  85. isa_model/serving/api/routes/logs.py +430 -0
  86. isa_model/serving/api/routes/settings.py +582 -0
  87. isa_model/serving/api/routes/unified.py +324 -165
  88. isa_model/serving/api/startup.py +304 -0
  89. isa_model/serving/modal_proxy_server.py +249 -0
  90. isa_model/training/__init__.py +100 -6
  91. isa_model/training/core/__init__.py +4 -1
  92. isa_model/training/examples/intelligent_training_example.py +281 -0
  93. isa_model/training/intelligent/__init__.py +25 -0
  94. isa_model/training/intelligent/decision_engine.py +643 -0
  95. isa_model/training/intelligent/intelligent_factory.py +888 -0
  96. isa_model/training/intelligent/knowledge_base.py +751 -0
  97. isa_model/training/intelligent/resource_optimizer.py +839 -0
  98. isa_model/training/intelligent/task_classifier.py +576 -0
  99. isa_model/training/storage/__init__.py +24 -0
  100. isa_model/training/storage/core_integration.py +439 -0
  101. isa_model/training/storage/training_repository.py +552 -0
  102. isa_model/training/storage/training_storage.py +628 -0
  103. {isa_model-0.3.9.dist-info → isa_model-0.4.0.dist-info}/METADATA +13 -1
  104. isa_model-0.4.0.dist-info/RECORD +182 -0
  105. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
  106. isa_model/deployment/cloud/modal/register_models.py +0 -321
  107. isa_model/inference/adapter/unified_api.py +0 -248
  108. isa_model/inference/services/helpers/stacked_config.py +0 -148
  109. isa_model/inference/services/img/flux_professional_service.py +0 -603
  110. isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
  111. isa_model/inference/services/others/table_transformer_service.py +0 -61
  112. isa_model/inference/services/vision/doc_analysis_service.py +0 -640
  113. isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
  114. isa_model/inference/services/vision/ui_analysis_service.py +0 -823
  115. isa_model/scripts/inference_tracker.py +0 -283
  116. isa_model/scripts/mlflow_manager.py +0 -379
  117. isa_model/scripts/model_registry.py +0 -465
  118. isa_model/scripts/register_models.py +0 -370
  119. isa_model/scripts/register_models_with_embeddings.py +0 -510
  120. isa_model/scripts/start_mlflow.py +0 -95
  121. isa_model/scripts/training_tracker.py +0 -257
  122. isa_model-0.3.9.dist-info/RECORD +0 -138
  123. {isa_model-0.3.9.dist-info → isa_model-0.4.0.dist-info}/WHEEL +0 -0
  124. {isa_model-0.3.9.dist-info → isa_model-0.4.0.dist-info}/top_level.txt +0 -0
isa_model/eval/evaluators/vision_evaluator.py
@@ -0,0 +1,564 @@
+ """
+ Vision Evaluator for ISA Model evaluation framework.
+
+ Provides comprehensive evaluation capabilities for vision tasks including:
+ - OCR (Optical Character Recognition) evaluation
+ - Table extraction evaluation
+ - UI detection evaluation
+ - Document analysis evaluation
+ - Image captioning evaluation
+ - Visual question answering evaluation
+
+ Supports ISA custom services and standard vision models.
+ """
+
+ import asyncio
+ import logging
+ import base64
+ import io
+ from typing import Dict, List, Any, Optional, Union, Tuple
+ from PIL import Image
+ import numpy as np
+ from pathlib import Path
+
+ from .base_evaluator import BaseEvaluator, EvaluationResult
+ from ..metrics import compute_text_metrics, compute_vision_metrics
+
+ logger = logging.getLogger(__name__)
+
+
+ class VisionEvaluator(BaseEvaluator):
+     """
+     Comprehensive vision model evaluator.
+
+     Supports evaluation of:
+     - OCR accuracy and multilingual capability
+     - Table extraction and structure recognition
+     - UI element detection and classification
+     - Document understanding and analysis
+     - Image captioning quality
+     - Visual question answering accuracy
+     """
+
+     def __init__(self,
+                  config: Optional[Dict[str, Any]] = None,
+                  experiment_tracker: Optional[Any] = None):
+         """
+         Initialize the vision evaluator.
+
+         Args:
+             config: Evaluation configuration
+             experiment_tracker: Optional experiment tracking instance
+         """
+         super().__init__(
+             evaluator_name="vision_evaluator",
+             config=config,
+             experiment_tracker=experiment_tracker
+         )
+
+         # Vision-specific configuration
+         self.supported_formats = self.config.get("supported_formats", ["png", "jpg", "jpeg", "pdf", "webp"])
+         self.max_image_size = self.config.get("max_image_size", (2048, 2048))
+         self.enable_multilingual = self.config.get("enable_multilingual", True)
+
+         # Evaluation task types
+         self.task_type = self.config.get("task_type", "ocr")  # ocr, table, ui, vqa, caption
+
+         logger.info(f"Initialized VisionEvaluator for task: {self.task_type}")
+
+     async def evaluate_sample(self,
+                               sample: Dict[str, Any],
+                               model_interface: Any) -> Dict[str, Any]:
+         """
+         Evaluate a single vision sample.
+
+         Args:
+             sample: Vision sample containing image and expected output
+             model_interface: Vision model interface
+
+         Returns:
+             Evaluation result for the sample
+         """
+         try:
+             # Extract sample data
+             image_data = sample.get("image")
+             expected_output = sample.get("expected_output", "")
+             task_type = sample.get("task_type", self.task_type)
+             prompt = sample.get("prompt", "")
+
+             # Process image
+             processed_image = await self._process_image(image_data)
+
+             # Get model prediction based on task type
+             prediction = await self._get_model_prediction(
+                 model_interface, processed_image, prompt, task_type
+             )
+
+             # Compute sample-level metrics
+             sample_metrics = self._compute_sample_metrics(
+                 prediction, expected_output, task_type
+             )
+
+             return {
+                 "prediction": prediction,
+                 "expected_output": expected_output,
+                 "task_type": task_type,
+                 "sample_metrics": sample_metrics,
+                 "image_info": self._get_image_info(processed_image)
+             }
+
+         except Exception as e:
+             logger.error(f"Error evaluating vision sample: {e}")
+             raise
+
+     async def _process_image(self, image_data: Union[str, bytes, Image.Image, Path]) -> Image.Image:
+         """
+         Process and validate image data.
+
+         Args:
+             image_data: Image in various formats
+
+         Returns:
+             Processed PIL Image
+         """
+         try:
+             if isinstance(image_data, str):
+                 # Handle base64 encoded images or file paths
+                 if image_data.startswith("data:"):
+                     # Base64 data URL
+                     header, encoded = image_data.split(",", 1)
+                     image_bytes = base64.b64decode(encoded)
+                     image = Image.open(io.BytesIO(image_bytes))
+                 elif Path(image_data).exists():
+                     # File path
+                     image = Image.open(image_data)
+                 else:
+                     # Assume base64 string
+                     image_bytes = base64.b64decode(image_data)
+                     image = Image.open(io.BytesIO(image_bytes))
+
+             elif isinstance(image_data, bytes):
+                 # Raw bytes
+                 image = Image.open(io.BytesIO(image_data))
+
+             elif isinstance(image_data, Path):
+                 # Path object
+                 image = Image.open(image_data)
+
+             elif isinstance(image_data, Image.Image):
+                 # PIL Image
+                 image = image_data
+
+             else:
+                 raise ValueError(f"Unsupported image data type: {type(image_data)}")
+
+             # Convert to RGB if needed
+             if image.mode != "RGB":
+                 image = image.convert("RGB")
+
+             # Resize if too large
+             if image.size[0] > self.max_image_size[0] or image.size[1] > self.max_image_size[1]:
+                 image.thumbnail(self.max_image_size, Image.Resampling.LANCZOS)
+                 logger.info(f"Resized image to {image.size}")
+
+             return image
+
+         except Exception as e:
+             logger.error(f"Error processing image: {e}")
+             raise
+
+     async def _get_model_prediction(self,
+                                     model_interface: Any,
+                                     image: Image.Image,
+                                     prompt: str,
+                                     task_type: str) -> str:
+         """
+         Get model prediction for vision task.
+
+         Args:
+             model_interface: Vision model interface
+             image: Processed PIL image
+             prompt: Task-specific prompt
+             task_type: Type of vision task
+
+         Returns:
+             Model prediction as string
+         """
+         try:
+             # Prepare task-specific prompt
+             if not prompt:
+                 prompt = self._get_default_prompt(task_type)
+
+             # Convert image to format expected by model
+             if hasattr(model_interface, 'process_image'):
+                 # ISA custom vision service
+                 result = await model_interface.process_image(image, prompt, task_type)
+                 prediction = result.get("text", "") if isinstance(result, dict) else str(result)
+
+             elif hasattr(model_interface, 'vision_completion'):
+                 # OpenAI-style vision API
+                 # Convert image to base64
+                 buffer = io.BytesIO()
+                 image.save(buffer, format="PNG")
+                 image_base64 = base64.b64encode(buffer.getvalue()).decode()
+
+                 result = await model_interface.vision_completion(
+                     prompt=prompt,
+                     image_base64=image_base64
+                 )
+                 prediction = result.get("content", "") if isinstance(result, dict) else str(result)
+
+             else:
+                 # Generic interface
+                 prediction = await model_interface.predict(image, prompt)
+                 prediction = str(prediction)
+
+             return prediction.strip()
+
+         except Exception as e:
+             logger.error(f"Error getting model prediction: {e}")
+             raise
+
+     def _get_default_prompt(self, task_type: str) -> str:
+         """Get default prompt for task type."""
+         prompts = {
+             "ocr": "Extract all text from this image. Preserve the original formatting and layout.",
+             "table": "Extract the table structure and content from this image. Provide the data in a structured format.",
+             "ui": "Analyze the UI elements in this image. Identify buttons, text fields, labels, and their relationships.",
+             "vqa": "Answer the question about this image accurately and concisely.",
+             "caption": "Generate a detailed and accurate caption describing this image.",
+             "document": "Analyze this document image and extract the key information, structure, and content."
+         }
+         return prompts.get(task_type, "Analyze this image and provide relevant information.")
+
+     def _compute_sample_metrics(self,
+                                 prediction: str,
+                                 expected_output: str,
+                                 task_type: str) -> Dict[str, float]:
+         """
+         Compute metrics for a single sample.
+
+         Args:
+             prediction: Model prediction
+             expected_output: Expected/reference output
+             task_type: Type of vision task
+
+         Returns:
+             Dictionary of sample-level metrics
+         """
+         try:
+             metrics = {}
+
+             # Common text-based metrics
+             text_metrics = compute_text_metrics(prediction, expected_output)
+             metrics.update(text_metrics)
+
+             # Task-specific metrics
+             if task_type == "ocr":
+                 metrics.update(self._compute_ocr_metrics(prediction, expected_output))
+             elif task_type == "table":
+                 metrics.update(self._compute_table_metrics(prediction, expected_output))
+             elif task_type == "ui":
+                 metrics.update(self._compute_ui_metrics(prediction, expected_output))
+             elif task_type in ["vqa", "caption"]:
+                 metrics.update(self._compute_semantic_metrics(prediction, expected_output))
+
+             return metrics
+
+         except Exception as e:
+             logger.error(f"Error computing sample metrics: {e}")
+             return {"error": 1.0}
+
+     def _compute_ocr_metrics(self, prediction: str, expected: str) -> Dict[str, float]:
+         """Compute OCR-specific metrics."""
+         try:
+             # Character-level accuracy
+             pred_chars = list(prediction.lower().replace(" ", ""))
+             exp_chars = list(expected.lower().replace(" ", ""))
+
+             char_accuracy = self._compute_sequence_accuracy(pred_chars, exp_chars)
+
+             # Word-level accuracy
+             pred_words = prediction.lower().split()
+             exp_words = expected.lower().split()
+
+             word_accuracy = self._compute_sequence_accuracy(pred_words, exp_words)
+
+             # Line-level accuracy (for formatted text)
+             pred_lines = prediction.strip().split("\n")
+             exp_lines = expected.strip().split("\n")
+
+             line_accuracy = self._compute_sequence_accuracy(pred_lines, exp_lines)
+
+             return {
+                 "char_accuracy": char_accuracy,
+                 "word_accuracy": word_accuracy,
+                 "line_accuracy": line_accuracy,
+                 "length_ratio": len(prediction) / max(len(expected), 1)
+             }
+
+         except Exception as e:
+             logger.error(f"Error computing OCR metrics: {e}")
+             return {"ocr_error": 1.0}
+
+     def _compute_table_metrics(self, prediction: str, expected: str) -> Dict[str, float]:
+         """Compute table extraction metrics."""
+         try:
+             # Simple table structure metrics
+             pred_rows = prediction.count("\n") + 1
+             exp_rows = expected.count("\n") + 1
+
+             pred_cells = prediction.count("|") + prediction.count("\t")
+             exp_cells = expected.count("|") + expected.count("\t")
+
+             row_accuracy = 1.0 - abs(pred_rows - exp_rows) / max(exp_rows, 1)
+             cell_count_accuracy = 1.0 - abs(pred_cells - exp_cells) / max(exp_cells, 1)
+
+             return {
+                 "row_accuracy": max(0.0, row_accuracy),
+                 "cell_count_accuracy": max(0.0, cell_count_accuracy),
+                 "structure_similarity": (row_accuracy + cell_count_accuracy) / 2
+             }
+
+         except Exception as e:
+             logger.error(f"Error computing table metrics: {e}")
+             return {"table_error": 1.0}
+
+     def _compute_ui_metrics(self, prediction: str, expected: str) -> Dict[str, float]:
+         """Compute UI detection metrics."""
+         try:
+             # Extract UI elements (simplified approach)
+             ui_keywords = ["button", "text", "input", "label", "image", "link", "menu", "icon"]
+
+             pred_elements = []
+             exp_elements = []
+
+             for keyword in ui_keywords:
+                 pred_count = prediction.lower().count(keyword)
+                 exp_count = expected.lower().count(keyword)
+                 pred_elements.extend([keyword] * pred_count)
+                 exp_elements.extend([keyword] * exp_count)
+
+             element_accuracy = self._compute_sequence_accuracy(pred_elements, exp_elements)
+
+             return {
+                 "element_detection_accuracy": element_accuracy,
+                 "element_count_ratio": len(pred_elements) / max(len(exp_elements), 1)
+             }
+
+         except Exception as e:
+             logger.error(f"Error computing UI metrics: {e}")
+             return {"ui_error": 1.0}
+
+     def _compute_semantic_metrics(self, prediction: str, expected: str) -> Dict[str, float]:
+         """Compute semantic similarity metrics for VQA/captioning."""
+         try:
+             # Simple semantic metrics
+             pred_words = set(prediction.lower().split())
+             exp_words = set(expected.lower().split())
+
+             if not exp_words:
+                 return {"semantic_error": 1.0}
+
+             intersection = pred_words.intersection(exp_words)
+             union = pred_words.union(exp_words)
+
+             jaccard_similarity = len(intersection) / len(union) if union else 0.0
+             word_overlap = len(intersection) / len(exp_words)
+
+             return {
+                 "jaccard_similarity": jaccard_similarity,
+                 "word_overlap": word_overlap,
+                 "semantic_score": (jaccard_similarity + word_overlap) / 2
+             }
+
+         except Exception as e:
+             logger.error(f"Error computing semantic metrics: {e}")
+             return {"semantic_error": 1.0}
+
+     def _compute_sequence_accuracy(self, pred_seq: List[str], exp_seq: List[str]) -> float:
+         """Compute sequence-level accuracy using edit distance."""
+         try:
+             if not exp_seq:
+                 return 1.0 if not pred_seq else 0.0
+
+             # Simple edit distance computation
+             m, n = len(pred_seq), len(exp_seq)
+             dp = [[0] * (n + 1) for _ in range(m + 1)]
+
+             for i in range(m + 1):
+                 dp[i][0] = i
+             for j in range(n + 1):
+                 dp[0][j] = j
+
+             for i in range(1, m + 1):
+                 for j in range(1, n + 1):
+                     if pred_seq[i-1] == exp_seq[j-1]:
+                         dp[i][j] = dp[i-1][j-1]
+                     else:
+                         dp[i][j] = 1 + min(dp[i-1][j], dp[i][j-1], dp[i-1][j-1])
+
+             edit_distance = dp[m][n]
+             accuracy = 1.0 - edit_distance / max(n, 1)
+             return max(0.0, accuracy)
+
+         except Exception as e:
+             logger.error(f"Error computing sequence accuracy: {e}")
+             return 0.0
+
+     def _get_image_info(self, image: Image.Image) -> Dict[str, Any]:
+         """Get image metadata for analysis."""
+         return {
+             "width": image.size[0],
+             "height": image.size[1],
+             "mode": image.mode,
+             "format": getattr(image, "format", "unknown"),
+             "has_transparency": image.mode in ("RGBA", "LA") or "transparency" in image.info
+         }
+
+     def compute_metrics(self,
+                         predictions: List[str],
+                         references: List[str],
+                         **kwargs) -> Dict[str, float]:
+         """
+         Compute aggregate vision evaluation metrics.
+
+         Args:
+             predictions: List of model predictions
+             references: List of reference outputs
+             **kwargs: Additional parameters
+
+         Returns:
+             Dictionary of computed metrics
+         """
+         try:
+             if not predictions or not references:
+                 logger.warning("Empty predictions or references provided")
+                 return {}
+
+             # Ensure equal lengths
+             min_len = min(len(predictions), len(references))
+             predictions = predictions[:min_len]
+             references = references[:min_len]
+
+             # Compute text-based metrics
+             metrics = compute_text_metrics(predictions, references, aggregate=True)
+
+             # Compute vision-specific metrics
+             vision_metrics = self._compute_vision_aggregate_metrics(predictions, references)
+             metrics.update(vision_metrics)
+
+             # Add evaluation metadata
+             metrics.update({
+                 "total_samples": len(predictions),
+                 "task_type": self.task_type,
+                 "multilingual_enabled": self.enable_multilingual
+             })
+
+             return metrics
+
+         except Exception as e:
+             logger.error(f"Error computing aggregate metrics: {e}")
+             return {"error_rate": 1.0}
+
+     def _compute_vision_aggregate_metrics(self,
+                                           predictions: List[str],
+                                           references: List[str]) -> Dict[str, float]:
+         """Compute aggregate vision-specific metrics."""
+         try:
+             task_type = self.task_type
+
+             if task_type == "ocr":
+                 return self._compute_aggregate_ocr_metrics(predictions, references)
+             elif task_type == "table":
+                 return self._compute_aggregate_table_metrics(predictions, references)
+             elif task_type == "ui":
+                 return self._compute_aggregate_ui_metrics(predictions, references)
+             elif task_type in ["vqa", "caption"]:
+                 return self._compute_aggregate_semantic_metrics(predictions, references)
+             else:
+                 return {}
+
+         except Exception as e:
+             logger.error(f"Error computing vision aggregate metrics: {e}")
+             return {}
+
+     def _compute_aggregate_ocr_metrics(self,
+                                        predictions: List[str],
+                                        references: List[str]) -> Dict[str, float]:
+         """Compute aggregate OCR metrics."""
+         char_accuracies = []
+         word_accuracies = []
+
+         for pred, ref in zip(predictions, references):
+             sample_metrics = self._compute_ocr_metrics(pred, ref)
+             char_accuracies.append(sample_metrics.get("char_accuracy", 0.0))
+             word_accuracies.append(sample_metrics.get("word_accuracy", 0.0))
+
+         return {
+             "avg_char_accuracy": np.mean(char_accuracies) if char_accuracies else 0.0,
+             "avg_word_accuracy": np.mean(word_accuracies) if word_accuracies else 0.0,
+             "ocr_score": np.mean(char_accuracies + word_accuracies) if char_accuracies else 0.0
+         }
+
+     def _compute_aggregate_table_metrics(self,
+                                          predictions: List[str],
+                                          references: List[str]) -> Dict[str, float]:
+         """Compute aggregate table metrics."""
+         structure_similarities = []
+
+         for pred, ref in zip(predictions, references):
+             sample_metrics = self._compute_table_metrics(pred, ref)
+             structure_similarities.append(sample_metrics.get("structure_similarity", 0.0))
+
+         return {
+             "avg_structure_similarity": np.mean(structure_similarities) if structure_similarities else 0.0,
+             "table_extraction_score": np.mean(structure_similarities) if structure_similarities else 0.0
+         }
+
+     def _compute_aggregate_ui_metrics(self,
+                                       predictions: List[str],
+                                       references: List[str]) -> Dict[str, float]:
+         """Compute aggregate UI metrics."""
+         detection_accuracies = []
+
+         for pred, ref in zip(predictions, references):
+             sample_metrics = self._compute_ui_metrics(pred, ref)
+             detection_accuracies.append(sample_metrics.get("element_detection_accuracy", 0.0))
+
+         return {
+             "avg_element_detection": np.mean(detection_accuracies) if detection_accuracies else 0.0,
+             "ui_detection_score": np.mean(detection_accuracies) if detection_accuracies else 0.0
+         }
+
+     def _compute_aggregate_semantic_metrics(self,
+                                             predictions: List[str],
+                                             references: List[str]) -> Dict[str, float]:
+         """Compute aggregate semantic metrics."""
+         semantic_scores = []
+
+         for pred, ref in zip(predictions, references):
+             sample_metrics = self._compute_semantic_metrics(pred, ref)
+             semantic_scores.append(sample_metrics.get("semantic_score", 0.0))
+
+         return {
+             "avg_semantic_similarity": np.mean(semantic_scores) if semantic_scores else 0.0,
+             "semantic_understanding_score": np.mean(semantic_scores) if semantic_scores else 0.0
+         }
+
+     def get_supported_metrics(self) -> List[str]:
+         """Get list of metrics supported by this evaluator."""
+         base_metrics = [
+             "exact_match", "f1_score", "bleu_score", "rouge_l",
+             "char_accuracy", "word_accuracy", "line_accuracy"
+         ]
+
+         task_specific_metrics = {
+             "ocr": ["char_accuracy", "word_accuracy", "ocr_score"],
+             "table": ["structure_similarity", "table_extraction_score"],
+             "ui": ["element_detection_accuracy", "ui_detection_score"],
+             "vqa": ["semantic_similarity", "semantic_understanding_score"],
+             "caption": ["semantic_similarity", "semantic_understanding_score"]
+         }
+
+         return base_metrics + task_specific_metrics.get(self.task_type, [])
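
For context, below is a minimal, hypothetical usage sketch of the new VisionEvaluator shown in this diff (the sketch itself is not part of the package). It assumes the class is importable from the path listed above, that BaseEvaluator stores the config dict it receives as self.config, and it substitutes a stub model that exposes the process_image hook the evaluator probes with hasattr(); sample keys (image, expected_output, task_type) follow evaluate_sample above.

import asyncio
from PIL import Image
# Assumed import path, matching the file list entry for vision_evaluator.py
from isa_model.eval.evaluators.vision_evaluator import VisionEvaluator


class StubOCRModel:
    """Hypothetical stand-in for an ISA vision service; returns a canned OCR result."""

    async def process_image(self, image, prompt, task_type):
        return {"text": "Invoice #1234\nTotal: $56.78"}


async def main():
    # "ocr" selects the OCR-specific metrics path in _compute_sample_metrics
    evaluator = VisionEvaluator(config={"task_type": "ocr"})
    sample = {
        "image": Image.new("RGB", (640, 480), "white"),  # a PIL image; a file path or base64 string also works
        "expected_output": "Invoice #1234\nTotal: $56.78",
    }
    result = await evaluator.evaluate_sample(sample, StubOCRModel())
    # char/word/line accuracy and length_ratio, plus the shared text metrics
    print(result["sample_metrics"])


asyncio.run(main())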