isa-model 0.3.9__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/__init__.py +1 -1
- isa_model/client.py +732 -565
- isa_model/core/cache/redis_cache.py +401 -0
- isa_model/core/config/config_manager.py +53 -10
- isa_model/core/config.py +1 -1
- isa_model/core/database/__init__.py +1 -0
- isa_model/core/database/migrations.py +277 -0
- isa_model/core/database/supabase_client.py +123 -0
- isa_model/core/models/__init__.py +37 -0
- isa_model/core/models/model_billing_tracker.py +60 -88
- isa_model/core/models/model_manager.py +36 -18
- isa_model/core/models/model_repo.py +44 -38
- isa_model/core/models/model_statistics_tracker.py +234 -0
- isa_model/core/models/model_storage.py +0 -1
- isa_model/core/models/model_version_manager.py +959 -0
- isa_model/core/pricing_manager.py +2 -249
- isa_model/core/resilience/circuit_breaker.py +366 -0
- isa_model/core/security/secrets.py +358 -0
- isa_model/core/services/__init__.py +2 -4
- isa_model/core/services/intelligent_model_selector.py +101 -370
- isa_model/core/storage/hf_storage.py +1 -1
- isa_model/core/types.py +7 -0
- isa_model/deployment/cloud/modal/isa_audio_chatTTS_service.py +520 -0
- isa_model/deployment/cloud/modal/isa_audio_fish_service.py +0 -0
- isa_model/deployment/cloud/modal/isa_audio_openvoice_service.py +758 -0
- isa_model/deployment/cloud/modal/isa_audio_service_v2.py +1044 -0
- isa_model/deployment/cloud/modal/isa_embed_rerank_service.py +296 -0
- isa_model/deployment/cloud/modal/isa_video_hunyuan_service.py +423 -0
- isa_model/deployment/cloud/modal/isa_vision_ocr_service.py +519 -0
- isa_model/deployment/cloud/modal/isa_vision_qwen25_service.py +709 -0
- isa_model/deployment/cloud/modal/isa_vision_table_service.py +467 -323
- isa_model/deployment/cloud/modal/isa_vision_ui_service.py +607 -180
- isa_model/deployment/cloud/modal/isa_vision_ui_service_optimized.py +660 -0
- isa_model/deployment/core/deployment_manager.py +6 -4
- isa_model/deployment/services/auto_hf_modal_deployer.py +894 -0
- isa_model/eval/benchmarks/__init__.py +27 -0
- isa_model/eval/benchmarks/multimodal_datasets.py +460 -0
- isa_model/eval/benchmarks.py +244 -12
- isa_model/eval/evaluators/__init__.py +8 -2
- isa_model/eval/evaluators/audio_evaluator.py +727 -0
- isa_model/eval/evaluators/embedding_evaluator.py +742 -0
- isa_model/eval/evaluators/vision_evaluator.py +564 -0
- isa_model/eval/example_evaluation.py +395 -0
- isa_model/eval/factory.py +272 -5
- isa_model/eval/isa_benchmarks.py +700 -0
- isa_model/eval/isa_integration.py +582 -0
- isa_model/eval/metrics.py +159 -6
- isa_model/eval/tests/unit/test_basic.py +396 -0
- isa_model/inference/ai_factory.py +44 -8
- isa_model/inference/services/audio/__init__.py +21 -0
- isa_model/inference/services/audio/base_realtime_service.py +225 -0
- isa_model/inference/services/audio/isa_tts_service.py +0 -0
- isa_model/inference/services/audio/openai_realtime_service.py +320 -124
- isa_model/inference/services/audio/openai_stt_service.py +32 -6
- isa_model/inference/services/base_service.py +17 -1
- isa_model/inference/services/embedding/__init__.py +13 -0
- isa_model/inference/services/embedding/base_embed_service.py +111 -8
- isa_model/inference/services/embedding/isa_embed_service.py +305 -0
- isa_model/inference/services/embedding/openai_embed_service.py +2 -4
- isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
- isa_model/inference/services/img/__init__.py +2 -2
- isa_model/inference/services/img/base_image_gen_service.py +24 -7
- isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
- isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
- isa_model/inference/services/img/services/replicate_flux.py +226 -0
- isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
- isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
- isa_model/inference/services/img/tests/test_img_client.py +297 -0
- isa_model/inference/services/llm/base_llm_service.py +30 -6
- isa_model/inference/services/llm/helpers/llm_adapter.py +63 -9
- isa_model/inference/services/llm/ollama_llm_service.py +2 -1
- isa_model/inference/services/llm/openai_llm_service.py +652 -55
- isa_model/inference/services/llm/yyds_llm_service.py +2 -1
- isa_model/inference/services/vision/__init__.py +5 -5
- isa_model/inference/services/vision/base_vision_service.py +118 -185
- isa_model/inference/services/vision/helpers/image_utils.py +11 -5
- isa_model/inference/services/vision/isa_vision_service.py +573 -0
- isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
- isa_model/serving/api/fastapi_server.py +88 -16
- isa_model/serving/api/middleware/auth.py +311 -0
- isa_model/serving/api/middleware/security.py +278 -0
- isa_model/serving/api/routes/analytics.py +486 -0
- isa_model/serving/api/routes/deployments.py +339 -0
- isa_model/serving/api/routes/evaluations.py +579 -0
- isa_model/serving/api/routes/logs.py +430 -0
- isa_model/serving/api/routes/settings.py +582 -0
- isa_model/serving/api/routes/unified.py +324 -165
- isa_model/serving/api/startup.py +304 -0
- isa_model/serving/modal_proxy_server.py +249 -0
- isa_model/training/__init__.py +100 -6
- isa_model/training/core/__init__.py +4 -1
- isa_model/training/examples/intelligent_training_example.py +281 -0
- isa_model/training/intelligent/__init__.py +25 -0
- isa_model/training/intelligent/decision_engine.py +643 -0
- isa_model/training/intelligent/intelligent_factory.py +888 -0
- isa_model/training/intelligent/knowledge_base.py +751 -0
- isa_model/training/intelligent/resource_optimizer.py +839 -0
- isa_model/training/intelligent/task_classifier.py +576 -0
- isa_model/training/storage/__init__.py +24 -0
- isa_model/training/storage/core_integration.py +439 -0
- isa_model/training/storage/training_repository.py +552 -0
- isa_model/training/storage/training_storage.py +628 -0
- {isa_model-0.3.9.dist-info → isa_model-0.4.0.dist-info}/METADATA +13 -1
- isa_model-0.4.0.dist-info/RECORD +182 -0
- isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
- isa_model/deployment/cloud/modal/register_models.py +0 -321
- isa_model/inference/adapter/unified_api.py +0 -248
- isa_model/inference/services/helpers/stacked_config.py +0 -148
- isa_model/inference/services/img/flux_professional_service.py +0 -603
- isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/others/table_transformer_service.py +0 -61
- isa_model/inference/services/vision/doc_analysis_service.py +0 -640
- isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/vision/ui_analysis_service.py +0 -823
- isa_model/scripts/inference_tracker.py +0 -283
- isa_model/scripts/mlflow_manager.py +0 -379
- isa_model/scripts/model_registry.py +0 -465
- isa_model/scripts/register_models.py +0 -370
- isa_model/scripts/register_models_with_embeddings.py +0 -510
- isa_model/scripts/start_mlflow.py +0 -95
- isa_model/scripts/training_tracker.py +0 -257
- isa_model-0.3.9.dist-info/RECORD +0 -138
- {isa_model-0.3.9.dist-info → isa_model-0.4.0.dist-info}/WHEEL +0 -0
- {isa_model-0.3.9.dist-info → isa_model-0.4.0.dist-info}/top_level.txt +0 -0
isa_model/eval/benchmarks.py
CHANGED
@@ -11,13 +11,106 @@ This module provides implementations of standard AI benchmarks:
 import os
 import json
 import logging
+import requests
+import zipfile
+import tarfile
+from pathlib import Path
 from typing import Dict, List, Any, Optional
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
+import pandas as pd

 logger = logging.getLogger(__name__)


+class DatasetDownloader:
+    """Utility class for downloading and caching benchmark datasets."""
+
+    def __init__(self, cache_dir: str = "~/.isa_model/datasets"):
+        self.cache_dir = Path(cache_dir).expanduser()
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+        # Dataset URLs and info
+        self.dataset_info = {
+            "mmlu": {
+                "url": "https://people.eecs.berkeley.edu/~hendrycks/data.tar",
+                "filename": "mmlu_data.tar",
+                "extracted_dir": "data"
+            },
+            "hellaswag": {
+                "url": "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl",
+                "filename": "hellaswag_val.jsonl"
+            },
+            "arc": {
+                "url": "https://s3-us-west-2.amazonaws.com/ai2-website/data/ARC-V1-Feb2018.zip",
+                "filename": "arc_data.zip",
+                "extracted_dir": "ARC-V1-Feb2018-2"
+            },
+            "gsm8k": {
+                "url": "https://github.com/openai/grade-school-math/raw/master/grade_school_math/data/test.jsonl",
+                "filename": "gsm8k_test.jsonl"
+            }
+        }
+
+    def download_dataset(self, dataset_name: str, force_download: bool = False) -> Path:
+        """Download and cache a dataset."""
+        if dataset_name not in self.dataset_info:
+            raise ValueError(f"Unknown dataset: {dataset_name}")
+
+        info = self.dataset_info[dataset_name]
+        dataset_dir = self.cache_dir / dataset_name
+        dataset_dir.mkdir(exist_ok=True)
+
+        file_path = dataset_dir / info["filename"]
+
+        # Check if already downloaded
+        if file_path.exists() and not force_download:
+            logger.info(f"Using cached {dataset_name} dataset at {file_path}")
+            return self._get_data_path(dataset_name, file_path)
+
+        # Download the dataset
+        logger.info(f"Downloading {dataset_name} dataset from {info['url']}")
+        try:
+            response = requests.get(info["url"], stream=True)
+            response.raise_for_status()
+
+            with open(file_path, 'wb') as f:
+                for chunk in response.iter_content(chunk_size=8192):
+                    f.write(chunk)
+
+            logger.info(f"Downloaded {dataset_name} dataset to {file_path}")
+
+            # Extract if needed
+            return self._get_data_path(dataset_name, file_path)
+
+        except Exception as e:
+            logger.error(f"Failed to download {dataset_name}: {e}")
+            # Fall back to placeholder data
+            return None
+
+    def _get_data_path(self, dataset_name: str, file_path: Path) -> Path:
+        """Get the actual data path, extracting archives if needed."""
+        info = self.dataset_info[dataset_name]
+
+        if "extracted_dir" in info:
+            # Need to extract
+            extract_dir = file_path.parent / info["extracted_dir"]
+
+            if not extract_dir.exists():
+                logger.info(f"Extracting {file_path}")
+
+                if file_path.suffix == ".zip":
+                    with zipfile.ZipFile(file_path, 'r') as zip_ref:
+                        zip_ref.extractall(file_path.parent)
+                elif file_path.suffix == ".tar" or ".tar." in file_path.name:
+                    with tarfile.open(file_path, 'r') as tar_ref:
+                        tar_ref.extractall(file_path.parent)
+
+            return extract_dir
+        else:
+            return file_path
+
+
 @dataclass
 class BenchmarkConfig:
     """Configuration for benchmark evaluation."""
@@ -36,6 +129,8 @@ class BaseBenchmark(ABC):
         self.config = config
         self.name = config.name
         self.data = None
+        self.downloader = DatasetDownloader()
+        self.use_real_data = True  # Flag to control real vs placeholder data

     @abstractmethod
     def load_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
@@ -111,10 +206,62 @@ class MMLU(BaseBenchmark):
         self.subjects = subjects or self.all_subjects[:10]  # Use first 10 subjects by default

     def load_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
-        """Load MMLU data
-
-
+        """Load MMLU data with real dataset support."""
+        if self.use_real_data:
+            try:
+                return self._load_real_mmlu_data(max_samples)
+            except Exception as e:
+                logger.warning(f"Failed to load real MMLU data: {e}. Falling back to placeholder data.")
+                return self._load_placeholder_mmlu_data(max_samples)
+        else:
+            return self._load_placeholder_mmlu_data(max_samples)
+
+    def _load_real_mmlu_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
+        """Load real MMLU dataset."""
+        data_path = self.downloader.download_dataset("mmlu")
+        if not data_path or not data_path.exists():
+            raise FileNotFoundError("MMLU dataset not found")
+
+        data = []
+        samples_per_subject = max_samples // len(self.subjects) if max_samples else None
+
+        for subject in self.subjects:
+            subject_file = data_path / "test" / f"{subject}_test.csv"
+            if not subject_file.exists():
+                logger.warning(f"Subject file not found: {subject_file}")
+                continue
+
+            try:
+                # Load CSV data
+                df = pd.read_csv(subject_file, header=None,
+                                 names=["question", "A", "B", "C", "D", "answer"])
+
+                # Convert to our format
+                for idx, row in df.iterrows():
+                    if samples_per_subject and len([d for d in data if d["subject"] == subject]) >= samples_per_subject:
+                        break
+
+                    sample = {
+                        "subject": subject,
+                        "question": row["question"],
+                        "choices": [row["A"], row["B"], row["C"], row["D"]],
+                        "answer": str(row["answer"]).strip().upper(),
+                        "id": f"{subject}_{idx}"
+                    }
+                    data.append(sample)
+
+            except Exception as e:
+                logger.error(f"Error loading subject {subject}: {e}")
+                continue

+        if max_samples:
+            data = data[:max_samples]
+
+        logger.info(f"Loaded {len(data)} real MMLU samples across {len(self.subjects)} subjects")
+        return data
+
+    def _load_placeholder_mmlu_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
+        """Load placeholder MMLU data."""
         data = []

         for subject in self.subjects:
@@ -137,7 +284,7 @@ class MMLU(BaseBenchmark):
         if max_samples:
             data = data[:max_samples]

-        logger.info(f"Loaded {len(data)} MMLU samples across {len(self.subjects)} subjects")
+        logger.info(f"Loaded {len(data)} placeholder MMLU samples across {len(self.subjects)} subjects")
         return data

     def evaluate_sample(self, sample: Dict[str, Any], prediction: str) -> bool:
@@ -194,12 +341,52 @@ class HellaSwag(BaseBenchmark):
         super().__init__(config)

     def load_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
-        """Load HellaSwag data
-
-
+        """Load HellaSwag data with real dataset support."""
+        if self.use_real_data:
+            try:
+                return self._load_real_hellaswag_data(max_samples)
+            except Exception as e:
+                logger.warning(f"Failed to load real HellaSwag data: {e}. Falling back to placeholder data.")
+                return self._load_placeholder_hellaswag_data(max_samples)
+        else:
+            return self._load_placeholder_hellaswag_data(max_samples)
+
+    def _load_real_hellaswag_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
+        """Load real HellaSwag dataset."""
+        data_path = self.downloader.download_dataset("hellaswag")
+        if not data_path or not data_path.exists():
+            raise FileNotFoundError("HellaSwag dataset not found")

         data = []

+        try:
+            with open(data_path, 'r', encoding='utf-8') as f:
+                for i, line in enumerate(f):
+                    if max_samples and i >= max_samples:
+                        break
+
+                    item = json.loads(line.strip())
+
+                    sample = {
+                        "context": item["ctx"],
+                        "question": "What happens next?",
+                        "choices": item["endings"],
+                        "answer": chr(65 + int(item["label"])),  # Convert 0,1,2,3 to A,B,C,D
+                        "id": f"hellaswag_{item.get('ind', i)}"
+                    }
+                    data.append(sample)
+
+        except Exception as e:
+            logger.error(f"Error loading HellaSwag data: {e}")
+            raise
+
+        logger.info(f"Loaded {len(data)} real HellaSwag samples")
+        return data
+
+    def _load_placeholder_hellaswag_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
+        """Load placeholder HellaSwag data."""
+        data = []
+
         sample_contexts = [
             "A person is washing dishes in the kitchen",
             "Someone is riding a bicycle down a hill",
@@ -226,7 +413,7 @@ class HellaSwag(BaseBenchmark):
             }
             data.append(sample)

-        logger.info(f"Loaded {len(data)} HellaSwag samples")
+        logger.info(f"Loaded {len(data)} placeholder HellaSwag samples")
         return data

     def evaluate_sample(self, sample: Dict[str, Any], prediction: str) -> bool:
@@ -377,12 +564,57 @@ class GSM8K(BaseBenchmark):
         super().__init__(config)

     def load_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
-        """Load GSM8K data
-
-
+        """Load GSM8K data with real dataset support."""
+        if self.use_real_data:
+            try:
+                return self._load_real_gsm8k_data(max_samples)
+            except Exception as e:
+                logger.warning(f"Failed to load real GSM8K data: {e}. Falling back to placeholder data.")
+                return self._load_placeholder_gsm8k_data(max_samples)
+        else:
+            return self._load_placeholder_gsm8k_data(max_samples)
+
+    def _load_real_gsm8k_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
+        """Load real GSM8K dataset."""
+        data_path = self.downloader.download_dataset("gsm8k")
+        if not data_path or not data_path.exists():
+            raise FileNotFoundError("GSM8K dataset not found")

         data = []

+        try:
+            with open(data_path, 'r', encoding='utf-8') as f:
+                for i, line in enumerate(f):
+                    if max_samples and i >= max_samples:
+                        break
+
+                    item = json.loads(line.strip())
+
+                    # Extract numerical answer from solution
+                    answer_text = item["answer"]
+                    import re
+                    numbers = re.findall(r'\d+', answer_text)
+                    answer = numbers[-1] if numbers else "0"
+
+                    sample = {
+                        "question": item["question"],
+                        "answer": answer,
+                        "solution": answer_text,  # Keep full solution for reference
+                        "id": f"gsm8k_{i}"
+                    }
+                    data.append(sample)
+
+        except Exception as e:
+            logger.error(f"Error loading GSM8K data: {e}")
+            raise
+
+        logger.info(f"Loaded {len(data)} real GSM8K samples")
+        return data
+
+    def _load_placeholder_gsm8k_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
+        """Load placeholder GSM8K data."""
+        data = []
+
         sample_problems = [
             {
                 "question": "Janet has 12 apples. She gives 3 apples to her friend and eats 2 apples. How many apples does Janet have left?",
@@ -417,7 +649,7 @@ class GSM8K(BaseBenchmark):
             }
             data.append(sample)

-        logger.info(f"Loaded {len(data)} GSM8K samples")
+        logger.info(f"Loaded {len(data)} placeholder GSM8K samples")
         return data

     def evaluate_sample(self, sample: Dict[str, Any], prediction: str) -> bool:
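For orientation, the sketch below shows how the new DatasetDownloader added in this diff could be exercised directly. It is a hypothetical usage example, not code from the package: it assumes the class is importable at module level from isa_model.eval.benchmarks (as the diff suggests), that the dataset URL is reachable, and that the default cache directory is writable.

# Minimal usage sketch (assumptions noted above).
from isa_model.eval.benchmarks import DatasetDownloader

downloader = DatasetDownloader()  # caches under ~/.isa_model/datasets by default
data_path = downloader.download_dataset("hellaswag")  # returns a Path, or None if the download fails

if data_path is not None and data_path.exists():
    with open(data_path, "r", encoding="utf-8") as f:
        print(f.readline()[:120])  # first HellaSwag record (JSONL)

Since benchmarks also set use_real_data = True by default, a failed download or parse falls back to the placeholder loaders rather than raising.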
isa_model/eval/evaluators/__init__.py
CHANGED
@@ -7,12 +7,18 @@ Provides specialized evaluators for different model types and evaluation tasks.
 from .base_evaluator import BaseEvaluator, EvaluationResult
 from .llm_evaluator import LLMEvaluator
 from .vision_evaluator import VisionEvaluator
-from .
+from .audio_evaluator import AudioEvaluator
+from .embedding_evaluator import EmbeddingEvaluator
+
+# MultimodalEvaluator will be implemented later
+# from .multimodal_evaluator import MultimodalEvaluator

 __all__ = [
     "BaseEvaluator",
     "EvaluationResult",
     "LLMEvaluator",
     "VisionEvaluator",
-    "
+    "AudioEvaluator",
+    "EmbeddingEvaluator"
+    # "MultimodalEvaluator"  # TODO: Implement later
 ]