isa-model 0.3.9__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/__init__.py +1 -1
- isa_model/client.py +732 -565
- isa_model/core/cache/redis_cache.py +401 -0
- isa_model/core/config/config_manager.py +53 -10
- isa_model/core/config.py +1 -1
- isa_model/core/database/__init__.py +1 -0
- isa_model/core/database/migrations.py +277 -0
- isa_model/core/database/supabase_client.py +123 -0
- isa_model/core/models/__init__.py +37 -0
- isa_model/core/models/model_billing_tracker.py +60 -88
- isa_model/core/models/model_manager.py +36 -18
- isa_model/core/models/model_repo.py +44 -38
- isa_model/core/models/model_statistics_tracker.py +234 -0
- isa_model/core/models/model_storage.py +0 -1
- isa_model/core/models/model_version_manager.py +959 -0
- isa_model/core/pricing_manager.py +2 -249
- isa_model/core/resilience/circuit_breaker.py +366 -0
- isa_model/core/security/secrets.py +358 -0
- isa_model/core/services/__init__.py +2 -4
- isa_model/core/services/intelligent_model_selector.py +101 -370
- isa_model/core/storage/hf_storage.py +1 -1
- isa_model/core/types.py +7 -0
- isa_model/deployment/cloud/modal/isa_audio_chatTTS_service.py +520 -0
- isa_model/deployment/cloud/modal/isa_audio_fish_service.py +0 -0
- isa_model/deployment/cloud/modal/isa_audio_openvoice_service.py +758 -0
- isa_model/deployment/cloud/modal/isa_audio_service_v2.py +1044 -0
- isa_model/deployment/cloud/modal/isa_embed_rerank_service.py +296 -0
- isa_model/deployment/cloud/modal/isa_video_hunyuan_service.py +423 -0
- isa_model/deployment/cloud/modal/isa_vision_ocr_service.py +519 -0
- isa_model/deployment/cloud/modal/isa_vision_qwen25_service.py +709 -0
- isa_model/deployment/cloud/modal/isa_vision_table_service.py +467 -323
- isa_model/deployment/cloud/modal/isa_vision_ui_service.py +607 -180
- isa_model/deployment/cloud/modal/isa_vision_ui_service_optimized.py +660 -0
- isa_model/deployment/core/deployment_manager.py +6 -4
- isa_model/deployment/services/auto_hf_modal_deployer.py +894 -0
- isa_model/eval/benchmarks/__init__.py +27 -0
- isa_model/eval/benchmarks/multimodal_datasets.py +460 -0
- isa_model/eval/benchmarks.py +244 -12
- isa_model/eval/evaluators/__init__.py +8 -2
- isa_model/eval/evaluators/audio_evaluator.py +727 -0
- isa_model/eval/evaluators/embedding_evaluator.py +742 -0
- isa_model/eval/evaluators/vision_evaluator.py +564 -0
- isa_model/eval/example_evaluation.py +395 -0
- isa_model/eval/factory.py +272 -5
- isa_model/eval/isa_benchmarks.py +700 -0
- isa_model/eval/isa_integration.py +582 -0
- isa_model/eval/metrics.py +159 -6
- isa_model/eval/tests/unit/test_basic.py +396 -0
- isa_model/inference/ai_factory.py +44 -8
- isa_model/inference/services/audio/__init__.py +21 -0
- isa_model/inference/services/audio/base_realtime_service.py +225 -0
- isa_model/inference/services/audio/isa_tts_service.py +0 -0
- isa_model/inference/services/audio/openai_realtime_service.py +320 -124
- isa_model/inference/services/audio/openai_stt_service.py +32 -6
- isa_model/inference/services/base_service.py +17 -1
- isa_model/inference/services/embedding/__init__.py +13 -0
- isa_model/inference/services/embedding/base_embed_service.py +111 -8
- isa_model/inference/services/embedding/isa_embed_service.py +305 -0
- isa_model/inference/services/embedding/openai_embed_service.py +2 -4
- isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
- isa_model/inference/services/img/__init__.py +2 -2
- isa_model/inference/services/img/base_image_gen_service.py +24 -7
- isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
- isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
- isa_model/inference/services/img/services/replicate_flux.py +226 -0
- isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
- isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
- isa_model/inference/services/img/tests/test_img_client.py +297 -0
- isa_model/inference/services/llm/base_llm_service.py +30 -6
- isa_model/inference/services/llm/helpers/llm_adapter.py +63 -9
- isa_model/inference/services/llm/ollama_llm_service.py +2 -1
- isa_model/inference/services/llm/openai_llm_service.py +652 -55
- isa_model/inference/services/llm/yyds_llm_service.py +2 -1
- isa_model/inference/services/vision/__init__.py +5 -5
- isa_model/inference/services/vision/base_vision_service.py +118 -185
- isa_model/inference/services/vision/helpers/image_utils.py +11 -5
- isa_model/inference/services/vision/isa_vision_service.py +573 -0
- isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
- isa_model/serving/api/fastapi_server.py +88 -16
- isa_model/serving/api/middleware/auth.py +311 -0
- isa_model/serving/api/middleware/security.py +278 -0
- isa_model/serving/api/routes/analytics.py +486 -0
- isa_model/serving/api/routes/deployments.py +339 -0
- isa_model/serving/api/routes/evaluations.py +579 -0
- isa_model/serving/api/routes/logs.py +430 -0
- isa_model/serving/api/routes/settings.py +582 -0
- isa_model/serving/api/routes/unified.py +324 -165
- isa_model/serving/api/startup.py +304 -0
- isa_model/serving/modal_proxy_server.py +249 -0
- isa_model/training/__init__.py +100 -6
- isa_model/training/core/__init__.py +4 -1
- isa_model/training/examples/intelligent_training_example.py +281 -0
- isa_model/training/intelligent/__init__.py +25 -0
- isa_model/training/intelligent/decision_engine.py +643 -0
- isa_model/training/intelligent/intelligent_factory.py +888 -0
- isa_model/training/intelligent/knowledge_base.py +751 -0
- isa_model/training/intelligent/resource_optimizer.py +839 -0
- isa_model/training/intelligent/task_classifier.py +576 -0
- isa_model/training/storage/__init__.py +24 -0
- isa_model/training/storage/core_integration.py +439 -0
- isa_model/training/storage/training_repository.py +552 -0
- isa_model/training/storage/training_storage.py +628 -0
- {isa_model-0.3.9.dist-info → isa_model-0.4.0.dist-info}/METADATA +13 -1
- isa_model-0.4.0.dist-info/RECORD +182 -0
- isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
- isa_model/deployment/cloud/modal/register_models.py +0 -321
- isa_model/inference/adapter/unified_api.py +0 -248
- isa_model/inference/services/helpers/stacked_config.py +0 -148
- isa_model/inference/services/img/flux_professional_service.py +0 -603
- isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/others/table_transformer_service.py +0 -61
- isa_model/inference/services/vision/doc_analysis_service.py +0 -640
- isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/vision/ui_analysis_service.py +0 -823
- isa_model/scripts/inference_tracker.py +0 -283
- isa_model/scripts/mlflow_manager.py +0 -379
- isa_model/scripts/model_registry.py +0 -465
- isa_model/scripts/register_models.py +0 -370
- isa_model/scripts/register_models_with_embeddings.py +0 -510
- isa_model/scripts/start_mlflow.py +0 -95
- isa_model/scripts/training_tracker.py +0 -257
- isa_model-0.3.9.dist-info/RECORD +0 -138
- {isa_model-0.3.9.dist-info → isa_model-0.4.0.dist-info}/WHEEL +0 -0
- {isa_model-0.3.9.dist-info → isa_model-0.4.0.dist-info}/top_level.txt +0 -0
isa_model/eval/benchmarks/__init__.py
@@ -0,0 +1,27 @@
+"""
+Benchmarks module for ISA Model evaluation framework.
+
+Contains benchmark implementations and dataset loaders.
+"""
+
+from .multimodal_datasets import (
+    VQAv2Dataset,
+    COCOCaptionsDataset,
+    DocVQADataset,
+    AudioDatasetLoader,
+    create_vqa_dataset,
+    create_coco_captions_dataset,
+    create_docvqa_dataset,
+    create_audio_dataset_loader
+)
+
+__all__ = [
+    "VQAv2Dataset",
+    "COCOCaptionsDataset",
+    "DocVQADataset",
+    "AudioDatasetLoader",
+    "create_vqa_dataset",
+    "create_coco_captions_dataset",
+    "create_docvqa_dataset",
+    "create_audio_dataset_loader"
+]
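
This `__init__.py` simply re-exports the loader classes and convenience constructors from `multimodal_datasets` (the next hunk), so callers can import everything from `isa_model.eval.benchmarks` directly. A minimal sketch of the resulting import surface, assuming the 0.4.0 wheel is installed:

    # Sketch only; assumes `pip install isa-model==0.4.0`.
    from isa_model.eval.benchmarks import VQAv2Dataset, create_docvqa_dataset

    vqa = VQAv2Dataset()              # caches under ~/.isa_model/multimodal_datasets by default
    docvqa = create_docvqa_dataset()  # equivalent convenience constructor for DocVQADataset
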
isa_model/eval/benchmarks/multimodal_datasets.py
@@ -0,0 +1,460 @@
+"""
+Multimodal Dataset Support for ISA Model evaluation framework.
+
+Provides dataset loaders for:
+- VQA v2.0 (Visual Question Answering)
+- COCO Captions (Image Captioning)
+- DocVQA (Document Visual Question Answering)
+- Audio datasets (LibriSpeech, Common Voice)
+"""
+
+import os
+import json
+import logging
+import requests
+import zipfile
+from pathlib import Path
+from typing import Dict, List, Any, Optional, Union
+import pandas as pd
+from PIL import Image
+import base64
+from io import BytesIO
+
+logger = logging.getLogger(__name__)
+
+
+class MultimodalDatasetDownloader:
+    """Utility class for downloading multimodal datasets."""
+
+    def __init__(self, cache_dir: str = "~/.isa_model/multimodal_datasets"):
+        self.cache_dir = Path(cache_dir).expanduser()
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+        self.dataset_info = {
+            "vqa_v2": {
+                "annotations_url": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Val_mscoco.zip",
+                "questions_url": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Val_mscoco.zip",
+                "images_url": "http://images.cocodataset.org/zips/val2014.zip",
+                "description": "VQA v2.0 validation set"
+            },
+            "coco_captions": {
+                "annotations_url": "http://images.cocodataset.org/annotations/annotations_trainval2014.zip",
+                "images_url": "http://images.cocodataset.org/zips/val2014.zip",
+                "description": "COCO Captions validation set"
+            },
+            "docvqa": {
+                "url": "https://datasets.cvc.uab.es/rrc/DocVQA/train.tar.gz",
+                "description": "DocVQA training set"
+            },
+            "librispeech": {
+                "url": "http://www.openslr.org/resources/12/test-clean.tar.gz",
+                "description": "LibriSpeech test-clean set"
+            }
+        }
+
+    def download_dataset(self, dataset_name: str, subset: str = "val", force_download: bool = False) -> Optional[Path]:
+        """Download and cache a multimodal dataset."""
+        if dataset_name not in self.dataset_info:
+            raise ValueError(f"Unknown dataset: {dataset_name}")
+
+        dataset_dir = self.cache_dir / dataset_name
+        dataset_dir.mkdir(exist_ok=True)
+
+        try:
+            if dataset_name == "vqa_v2":
+                return self._download_vqa_v2(dataset_dir, force_download)
+            elif dataset_name == "coco_captions":
+                return self._download_coco_captions(dataset_dir, force_download)
+            elif dataset_name == "docvqa":
+                return self._download_docvqa(dataset_dir, force_download)
+            elif dataset_name == "librispeech":
+                return self._download_librispeech(dataset_dir, force_download)
+        except Exception as e:
+            logger.error(f"Failed to download {dataset_name}: {e}")
+            return None
+
+    def _download_vqa_v2(self, dataset_dir: Path, force_download: bool) -> Path:
+        """Download VQA v2.0 dataset."""
+        annotations_file = dataset_dir / "v2_mscoco_val2014_annotations.json"
+        questions_file = dataset_dir / "v2_OpenEnded_mscoco_val2014_questions.json"
+
+        if annotations_file.exists() and questions_file.exists() and not force_download:
+            logger.info("Using cached VQA v2.0 dataset")
+            return dataset_dir
+
+        info = self.dataset_info["vqa_v2"]
+
+        # Download annotations
+        if not annotations_file.exists() or force_download:
+            logger.info("Downloading VQA v2.0 annotations")
+            self._download_and_extract(info["annotations_url"], dataset_dir)
+
+        # Download questions
+        if not questions_file.exists() or force_download:
+            logger.info("Downloading VQA v2.0 questions")
+            self._download_and_extract(info["questions_url"], dataset_dir)
+
+        return dataset_dir
+
+    def _download_coco_captions(self, dataset_dir: Path, force_download: bool) -> Path:
+        """Download COCO Captions dataset."""
+        captions_file = dataset_dir / "annotations" / "captions_val2014.json"
+
+        if captions_file.exists() and not force_download:
+            logger.info("Using cached COCO Captions dataset")
+            return dataset_dir
+
+        info = self.dataset_info["coco_captions"]
+
+        # Download annotations
+        logger.info("Downloading COCO Captions annotations")
+        self._download_and_extract(info["annotations_url"], dataset_dir)
+
+        return dataset_dir
+
+    def _download_docvqa(self, dataset_dir: Path, force_download: bool) -> Path:
+        """Download DocVQA dataset (placeholder implementation)."""
+        # This would require actual DocVQA dataset access
+        logger.warning("DocVQA dataset download not implemented - using placeholder")
+        return dataset_dir
+
+    def _download_librispeech(self, dataset_dir: Path, force_download: bool) -> Path:
+        """Download LibriSpeech dataset (placeholder implementation)."""
+        # This would require actual LibriSpeech dataset download
+        logger.warning("LibriSpeech dataset download not implemented - using placeholder")
+        return dataset_dir
+
+    def _download_and_extract(self, url: str, extract_dir: Path):
+        """Download and extract a file."""
+        filename = url.split('/')[-1]
+        file_path = extract_dir / filename
+
+        # Download
+        response = requests.get(url, stream=True)
+        response.raise_for_status()
+
+        with open(file_path, 'wb') as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                f.write(chunk)
+
+        # Extract
+        if filename.endswith('.zip'):
+            with zipfile.ZipFile(file_path, 'r') as zip_ref:
+                zip_ref.extractall(extract_dir)
+        elif filename.endswith('.tar.gz'):
+            import tarfile
+            with tarfile.open(file_path, 'r:gz') as tar_ref:
+                tar_ref.extractall(extract_dir)
+
+        # Clean up archive file
+        file_path.unlink()
+
+
+class VQAv2Dataset:
+    """VQA v2.0 Dataset loader."""
+
+    def __init__(self, cache_dir: str = "~/.isa_model/multimodal_datasets"):
+        self.downloader = MultimodalDatasetDownloader(cache_dir)
+        self.dataset_dir = None
+        self.annotations = None
+        self.questions = None
+
+    def load_data(self, max_samples: Optional[int] = None, use_real_data: bool = True) -> List[Dict[str, Any]]:
+        """Load VQA v2.0 data."""
+        if use_real_data:
+            try:
+                return self._load_real_data(max_samples)
+            except Exception as e:
+                logger.warning(f"Failed to load real VQA data: {e}. Using placeholder data.")
+                return self._load_placeholder_data(max_samples)
+        else:
+            return self._load_placeholder_data(max_samples)
+
+    def _load_real_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
+        """Load real VQA v2.0 data."""
+        self.dataset_dir = self.downloader.download_dataset("vqa_v2")
+        if not self.dataset_dir:
+            raise FileNotFoundError("VQA v2.0 dataset not found")
+
+        # Load annotations and questions
+        annotations_file = self.dataset_dir / "v2_mscoco_val2014_annotations.json"
+        questions_file = self.dataset_dir / "v2_OpenEnded_mscoco_val2014_questions.json"
+
+        with open(annotations_file, 'r') as f:
+            annotations_data = json.load(f)
+
+        with open(questions_file, 'r') as f:
+            questions_data = json.load(f)
+
+        # Create question_id -> annotation mapping
+        annotations_dict = {ann['question_id']: ann for ann in annotations_data['annotations']}
+
+        data = []
+        for i, question in enumerate(questions_data['questions']):
+            if max_samples and i >= max_samples:
+                break
+
+            question_id = question['question_id']
+            if question_id in annotations_dict:
+                annotation = annotations_dict[question_id]
+
+                # Get the most common answer
+                answers = [ans['answer'] for ans in annotation['answers']]
+                most_common_answer = max(set(answers), key=answers.count)
+
+                sample = {
+                    "image_id": question['image_id'],
+                    "question": question['question'],
+                    "expected_output": most_common_answer,
+                    "task_type": "vqa",
+                    "id": f"vqa_{question_id}",
+                    "image": f"COCO_val2014_{question['image_id']:012d}.jpg"  # COCO image filename format
+                }
+                data.append(sample)
+
+        logger.info(f"Loaded {len(data)} real VQA v2.0 samples")
+        return data
+
+    def _load_placeholder_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
+        """Load placeholder VQA data."""
+        sample_questions = [
+            {"question": "What color is the cat?", "answer": "orange"},
+            {"question": "How many people are in the image?", "answer": "3"},
+            {"question": "What is the weather like?", "answer": "sunny"},
+            {"question": "What vehicle is shown?", "answer": "car"},
+            {"question": "What room is this?", "answer": "kitchen"}
+        ]
+
+        data = []
+        for i, item in enumerate(sample_questions):
+            if max_samples and i >= max_samples:
+                break
+
+            sample = {
+                "image_id": f"placeholder_{i}",
+                "question": item["question"],
+                "expected_output": item["answer"],
+                "task_type": "vqa",
+                "id": f"vqa_placeholder_{i}",
+                "image": None  # Placeholder - no actual image
+            }
+            data.append(sample)
+
+        logger.info(f"Loaded {len(data)} placeholder VQA samples")
+        return data
+
+
+class COCOCaptionsDataset:
+    """COCO Captions Dataset loader."""
+
+    def __init__(self, cache_dir: str = "~/.isa_model/multimodal_datasets"):
+        self.downloader = MultimodalDatasetDownloader(cache_dir)
+        self.dataset_dir = None
+
+    def load_data(self, max_samples: Optional[int] = None, use_real_data: bool = True) -> List[Dict[str, Any]]:
+        """Load COCO Captions data."""
+        if use_real_data:
+            try:
+                return self._load_real_data(max_samples)
+            except Exception as e:
+                logger.warning(f"Failed to load real COCO Captions data: {e}. Using placeholder data.")
+                return self._load_placeholder_data(max_samples)
+        else:
+            return self._load_placeholder_data(max_samples)
+
+    def _load_real_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
+        """Load real COCO Captions data."""
+        self.dataset_dir = self.downloader.download_dataset("coco_captions")
+        if not self.dataset_dir:
+            raise FileNotFoundError("COCO Captions dataset not found")
+
+        # Load captions
+        captions_file = self.dataset_dir / "annotations" / "captions_val2014.json"
+
+        with open(captions_file, 'r') as f:
+            captions_data = json.load(f)
+
+        # Group captions by image_id
+        image_captions = {}
+        for annotation in captions_data['annotations']:
+            image_id = annotation['image_id']
+            if image_id not in image_captions:
+                image_captions[image_id] = []
+            image_captions[image_id].append(annotation['caption'])
+
+        data = []
+        for i, (image_id, captions) in enumerate(image_captions.items()):
+            if max_samples and i >= max_samples:
+                break
+
+            # Use the first caption as the expected output
+            sample = {
+                "image_id": image_id,
+                "expected_output": captions[0],
+                "all_captions": captions,  # Keep all captions for evaluation
+                "task_type": "caption",
+                "prompt": "Generate a detailed caption describing this image.",
+                "id": f"coco_caption_{image_id}",
+                "image": f"COCO_val2014_{image_id:012d}.jpg"
+            }
+            data.append(sample)
+
+        logger.info(f"Loaded {len(data)} real COCO Captions samples")
+        return data
+
+    def _load_placeholder_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
+        """Load placeholder captions data."""
+        sample_captions = [
+            "A cat sitting on a windowsill looking outside",
+            "Three people walking in a park on a sunny day",
+            "A red car parked on a city street",
+            "A kitchen with modern appliances and granite countertops",
+            "A dog playing fetch in a grassy field"
+        ]
+
+        data = []
+        for i, caption in enumerate(sample_captions):
+            if max_samples and i >= max_samples:
+                break
+
+            sample = {
+                "image_id": f"placeholder_{i}",
+                "expected_output": caption,
+                "task_type": "caption",
+                "prompt": "Generate a detailed caption describing this image.",
+                "id": f"coco_caption_placeholder_{i}",
+                "image": None  # Placeholder - no actual image
+            }
+            data.append(sample)
+
+        logger.info(f"Loaded {len(data)} placeholder caption samples")
+        return data
+
+
+class DocVQADataset:
+    """DocVQA Dataset loader."""
+
+    def __init__(self, cache_dir: str = "~/.isa_model/multimodal_datasets"):
+        self.downloader = MultimodalDatasetDownloader(cache_dir)
+        self.dataset_dir = None
+
+    def load_data(self, max_samples: Optional[int] = None, use_real_data: bool = False) -> List[Dict[str, Any]]:
+        """Load DocVQA data (currently placeholder only)."""
+        # For now, only placeholder data since DocVQA requires special access
+        return self._load_placeholder_data(max_samples)
+
+    def _load_placeholder_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
+        """Load placeholder DocVQA data."""
+        sample_doc_questions = [
+            {"question": "What is the title of this document?", "answer": "Annual Report 2023"},
+            {"question": "Who is the author?", "answer": "John Smith"},
+            {"question": "What is the total revenue?", "answer": "$1.2 million"},
+            {"question": "How many pages does this document have?", "answer": "45"},
+            {"question": "What year was this published?", "answer": "2023"}
+        ]
+
+        data = []
+        for i, item in enumerate(sample_doc_questions):
+            if max_samples and i >= max_samples:
+                break
+
+            sample = {
+                "document_id": f"doc_{i}",
+                "question": item["question"],
+                "expected_output": item["answer"],
+                "task_type": "document_vqa",
+                "id": f"docvqa_placeholder_{i}",
+                "image": None  # Placeholder - no actual document image
+            }
+            data.append(sample)
+
+        logger.info(f"Loaded {len(data)} placeholder DocVQA samples")
+        return data
+
+
+class AudioDatasetLoader:
+    """Audio dataset loader for speech tasks."""
+
+    def __init__(self, cache_dir: str = "~/.isa_model/multimodal_datasets"):
+        self.downloader = MultimodalDatasetDownloader(cache_dir)
+
+    def load_librispeech_data(self, max_samples: Optional[int] = None, use_real_data: bool = False) -> List[Dict[str, Any]]:
+        """Load LibriSpeech data (currently placeholder only)."""
+        return self._load_placeholder_speech_data(max_samples)
+
+    def _load_placeholder_speech_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
+        """Load placeholder speech data."""
+        sample_transcripts = [
+            "The quick brown fox jumps over the lazy dog",
+            "Machine learning is transforming artificial intelligence",
+            "Natural language processing enables computers to understand human speech",
+            "Deep learning models require large amounts of training data",
+            "Speech recognition technology has improved significantly in recent years"
+        ]
+
+        data = []
+        for i, transcript in enumerate(sample_transcripts):
+            if max_samples and i >= max_samples:
+                break
+
+            sample = {
+                "audio_id": f"speech_{i}",
+                "expected_output": transcript,
+                "task_type": "stt",
+                "id": f"librispeech_placeholder_{i}",
+                "audio": None,  # Placeholder - no actual audio file
+                "metadata": {
+                    "speaker": f"speaker_{i % 3}",
+                    "gender": "male" if i % 2 == 0 else "female",
+                    "duration": 3.5 + i * 0.5
+                }
+            }
+            data.append(sample)
+
+        logger.info(f"Loaded {len(data)} placeholder speech samples")
+        return data
+
+    def load_emotion_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
+        """Load placeholder emotion recognition data."""
+        emotions = ["happy", "sad", "angry", "neutral", "surprised"]
+
+        data = []
+        for i in range(min(max_samples or 20, 20)):
+            emotion = emotions[i % len(emotions)]
+
+            sample = {
+                "audio_id": f"emotion_{i}",
+                "expected_output": emotion,
+                "task_type": "emotion",
+                "id": f"emotion_placeholder_{i}",
+                "audio": None,  # Placeholder - no actual audio file
+                "metadata": {
+                    "speaker": f"speaker_{i % 5}",
+                    "intensity": "medium"
+                }
+            }
+            data.append(sample)
+
+        logger.info(f"Loaded {len(data)} placeholder emotion samples")
+        return data
+
+
+# Convenience functions
+def create_vqa_dataset(cache_dir: str = "~/.isa_model/multimodal_datasets") -> VQAv2Dataset:
+    """Create VQA v2.0 dataset instance."""
+    return VQAv2Dataset(cache_dir)
+
+
+def create_coco_captions_dataset(cache_dir: str = "~/.isa_model/multimodal_datasets") -> COCOCaptionsDataset:
+    """Create COCO Captions dataset instance."""
+    return COCOCaptionsDataset(cache_dir)
+
+
+def create_docvqa_dataset(cache_dir: str = "~/.isa_model/multimodal_datasets") -> DocVQADataset:
+    """Create DocVQA dataset instance."""
+    return DocVQADataset(cache_dir)
+
+
+def create_audio_dataset_loader(cache_dir: str = "~/.isa_model/multimodal_datasets") -> AudioDatasetLoader:
+    """Create audio dataset loader instance."""
+    return AudioDatasetLoader(cache_dir)