isa-model 0.0.2__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/__init__.py +1 -1
- isa_model/core/model_manager.py +69 -4
- isa_model/core/model_registry.py +273 -46
- isa_model/core/storage/hf_storage.py +419 -0
- isa_model/deployment/__init__.py +52 -0
- isa_model/deployment/core/__init__.py +34 -0
- isa_model/deployment/core/deployment_config.py +356 -0
- isa_model/deployment/core/deployment_manager.py +549 -0
- isa_model/deployment/core/isa_deployment_service.py +401 -0
- isa_model/eval/factory.py +381 -140
- isa_model/inference/ai_factory.py +427 -236
- isa_model/inference/billing_tracker.py +406 -0
- isa_model/inference/providers/base_provider.py +51 -4
- isa_model/inference/providers/ml_provider.py +50 -0
- isa_model/inference/providers/ollama_provider.py +37 -18
- isa_model/inference/providers/openai_provider.py +65 -36
- isa_model/inference/providers/replicate_provider.py +42 -30
- isa_model/inference/services/audio/base_stt_service.py +21 -2
- isa_model/inference/services/audio/openai_realtime_service.py +353 -0
- isa_model/inference/services/audio/openai_stt_service.py +252 -0
- isa_model/inference/services/audio/openai_tts_service.py +149 -9
- isa_model/inference/services/audio/replicate_tts_service.py +239 -0
- isa_model/inference/services/base_service.py +36 -1
- isa_model/inference/services/embedding/base_embed_service.py +112 -0
- isa_model/inference/services/embedding/ollama_embed_service.py +28 -2
- isa_model/inference/services/embedding/openai_embed_service.py +223 -0
- isa_model/inference/services/llm/__init__.py +2 -0
- isa_model/inference/services/llm/base_llm_service.py +158 -86
- isa_model/inference/services/llm/llm_adapter.py +414 -0
- isa_model/inference/services/llm/ollama_llm_service.py +252 -63
- isa_model/inference/services/llm/openai_llm_service.py +231 -93
- isa_model/inference/services/llm/triton_llm_service.py +481 -0
- isa_model/inference/services/ml/base_ml_service.py +78 -0
- isa_model/inference/services/ml/sklearn_ml_service.py +140 -0
- isa_model/inference/services/vision/__init__.py +3 -3
- isa_model/inference/services/vision/base_image_gen_service.py +161 -0
- isa_model/inference/services/vision/base_vision_service.py +177 -0
- isa_model/inference/services/vision/helpers/image_utils.py +4 -3
- isa_model/inference/services/vision/ollama_vision_service.py +151 -17
- isa_model/inference/services/vision/openai_vision_service.py +275 -41
- isa_model/inference/services/vision/replicate_image_gen_service.py +278 -118
- isa_model/training/__init__.py +62 -32
- isa_model/training/cloud/__init__.py +22 -0
- isa_model/training/cloud/job_orchestrator.py +402 -0
- isa_model/training/cloud/runpod_trainer.py +454 -0
- isa_model/training/cloud/storage_manager.py +482 -0
- isa_model/training/core/__init__.py +23 -0
- isa_model/training/core/config.py +181 -0
- isa_model/training/core/dataset.py +222 -0
- isa_model/training/core/trainer.py +720 -0
- isa_model/training/core/utils.py +213 -0
- isa_model/training/factory.py +229 -198
- isa_model-0.3.1.dist-info/METADATA +465 -0
- isa_model-0.3.1.dist-info/RECORD +91 -0
- isa_model/core/model_router.py +0 -226
- isa_model/core/model_version.py +0 -0
- isa_model/core/resource_manager.py +0 -202
- isa_model/deployment/gpu_fp16_ds8/models/deepseek_r1/1/model.py +0 -120
- isa_model/deployment/gpu_fp16_ds8/scripts/download_model.py +0 -18
- isa_model/training/engine/llama_factory/__init__.py +0 -39
- isa_model/training/engine/llama_factory/config.py +0 -115
- isa_model/training/engine/llama_factory/data_adapter.py +0 -284
- isa_model/training/engine/llama_factory/examples/__init__.py +0 -6
- isa_model/training/engine/llama_factory/examples/finetune_with_tracking.py +0 -185
- isa_model/training/engine/llama_factory/examples/rlhf_with_tracking.py +0 -163
- isa_model/training/engine/llama_factory/factory.py +0 -331
- isa_model/training/engine/llama_factory/rl.py +0 -254
- isa_model/training/engine/llama_factory/trainer.py +0 -171
- isa_model/training/image_model/configs/create_config.py +0 -37
- isa_model/training/image_model/configs/create_flux_config.py +0 -26
- isa_model/training/image_model/configs/create_lora_config.py +0 -21
- isa_model/training/image_model/prepare_massed_compute.py +0 -97
- isa_model/training/image_model/prepare_upload.py +0 -17
- isa_model/training/image_model/raw_data/create_captions.py +0 -16
- isa_model/training/image_model/raw_data/create_lora_captions.py +0 -20
- isa_model/training/image_model/raw_data/pre_processing.py +0 -200
- isa_model/training/image_model/train/train.py +0 -42
- isa_model/training/image_model/train/train_flux.py +0 -41
- isa_model/training/image_model/train/train_lora.py +0 -57
- isa_model/training/image_model/train_main.py +0 -25
- isa_model-0.0.2.dist-info/METADATA +0 -327
- isa_model-0.0.2.dist-info/RECORD +0 -92
- isa_model-0.0.2.dist-info/licenses/LICENSE +0 -21
- /isa_model/training/{llm_model/annotation → annotation}/annotation_schema.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/processors/annotation_processor.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/storage/dataset_manager.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/storage/dataset_schema.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/tests/test_annotation_flow.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/tests/test_minio copy.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/tests/test_minio_upload.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/views/annotation_controller.py +0 -0
- {isa_model-0.0.2.dist-info → isa_model-0.3.1.dist-info}/WHEEL +0 -0
- {isa_model-0.0.2.dist-info → isa_model-0.3.1.dist-info}/top_level.txt +0 -0
isa_model/training/core/dataset.py
@@ -0,0 +1,222 @@
+"""
+Dataset Management
+
+Handles loading and preprocessing of datasets for training.
+"""
+
+import json
+import logging
+from typing import Optional, Tuple, Dict, Any, List, Union
+from pathlib import Path
+from datasets import Dataset, load_dataset
+
+logger = logging.getLogger(__name__)
+
+
+class DatasetManager:
+    """Manages dataset loading and preprocessing."""
+
+    def __init__(self, tokenizer, max_length: int = 1024):
+        """
+        Initialize dataset manager.
+
+        Args:
+            tokenizer: HuggingFace tokenizer
+            max_length: Maximum sequence length
+        """
+        self.tokenizer = tokenizer
+        self.max_length = max_length
+
+    def prepare_dataset(
+        self,
+        dataset_path: str,
+        dataset_format: str = "alpaca",
+        validation_split: float = 0.1
+    ) -> Tuple[Dataset, Optional[Dataset]]:
+        """
+        Prepare training and validation datasets.
+
+        Args:
+            dataset_path: Path to dataset file or HuggingFace dataset name
+            dataset_format: Format of the dataset (alpaca, sharegpt, custom)
+            validation_split: Fraction of data to use for validation
+
+        Returns:
+            Tuple of (train_dataset, eval_dataset)
+        """
+        logger.info(f"Preparing dataset: {dataset_path}")
+
+        # Load raw dataset
+        raw_dataset = self._load_raw_dataset(dataset_path)
+
+        # Convert to training format
+        if dataset_format == "alpaca":
+            processed_dataset = self._process_alpaca_format(raw_dataset)
+        elif dataset_format == "sharegpt":
+            processed_dataset = self._process_sharegpt_format(raw_dataset)
+        else:
+            processed_dataset = self._process_custom_format(raw_dataset)
+
+        # Tokenize dataset
+        tokenized_dataset = processed_dataset.map(
+            self._tokenize_function,
+            batched=True,
+            remove_columns=processed_dataset.column_names
+        )
+
+        # Split into train/eval
+        if validation_split > 0:
+            split_dataset = tokenized_dataset.train_test_split(
+                test_size=validation_split,
+                seed=42
+            )
+            train_dataset = split_dataset["train"]
+            eval_dataset = split_dataset["test"]
+        else:
+            train_dataset = tokenized_dataset
+            eval_dataset = None
+
+        logger.info(f"Dataset prepared: {len(train_dataset)} training samples")
+        if eval_dataset:
+            logger.info(f"Validation samples: {len(eval_dataset)}")
+
+        return train_dataset, eval_dataset
+
+    def _load_raw_dataset(self, dataset_path: str) -> Dataset:
+        """Load raw dataset from file or HuggingFace."""
+        try:
+            # Check if it's a local file
+            if Path(dataset_path).exists():
+                logger.info(f"Loading local dataset: {dataset_path}")
+
+                if dataset_path.endswith('.json'):
+                    with open(dataset_path, 'r') as f:
+                        data = json.load(f)
+                    return Dataset.from_list(data)
+                elif dataset_path.endswith('.jsonl'):
+                    data = []
+                    with open(dataset_path, 'r') as f:
+                        for line in f:
+                            data.append(json.loads(line))
+                    return Dataset.from_list(data)
+                else:
+                    raise ValueError(f"Unsupported file format: {dataset_path}")
+
+            else:
+                # Try loading from HuggingFace Hub
+                logger.info(f"Loading HuggingFace dataset: {dataset_path}")
+                dataset = load_dataset(dataset_path, split="train")
+                return dataset
+
+        except Exception as e:
+            logger.error(f"Failed to load dataset: {e}")
+            raise
+
+    def _process_alpaca_format(self, dataset: Dataset) -> Dataset:
+        """Process Alpaca format dataset."""
+        def format_alpaca(example):
+            instruction = example.get("instruction", "")
+            input_text = example.get("input", "")
+            output = example.get("output", "")
+
+            # Format prompt
+            if input_text:
+                prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n"
+            else:
+                prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"
+
+            # Combine prompt and response
+            text = prompt + output
+
+            return {"text": text}
+
+        return dataset.map(format_alpaca)
+
+    def _process_sharegpt_format(self, dataset: Dataset) -> Dataset:
+        """Process ShareGPT format dataset."""
+        def format_sharegpt(example):
+            conversations = example.get("conversations", [])
+
+            text = ""
+            for conv in conversations:
+                role = conv.get("from", "")
+                content = conv.get("value", "")
+
+                if role == "human":
+                    text += f"### Human:\n{content}\n\n"
+                elif role == "gpt":
+                    text += f"### Assistant:\n{content}\n\n"
+
+            return {"text": text.strip()}
+
+        return dataset.map(format_sharegpt)
+
+    def _process_custom_format(self, dataset: Dataset) -> Dataset:
+        """Process custom format dataset."""
+        # Assume the dataset already has a 'text' column
+        if "text" not in dataset.column_names:
+            raise ValueError("Custom format dataset must have a 'text' column")
+
+        return dataset
+
+    def _tokenize_function(self, examples):
+        """Tokenize examples for training."""
+        # Tokenize inputs
+        tokenized = self.tokenizer(
+            examples["text"],
+            truncation=True,
+            padding=False,
+            max_length=self.max_length,
+            return_tensors=None,
+        )
+
+        # For language modeling, labels are the same as input_ids
+        tokenized["labels"] = tokenized["input_ids"].copy()
+
+        return tokenized
+
+    @staticmethod
+    def convert_hf_dataset_to_alpaca(
+        dataset_name: str,
+        output_path: str,
+        instruction_column: str = "instruction",
+        input_column: str = "input",
+        output_column: str = "output"
+    ) -> str:
+        """
+        Convert a HuggingFace dataset to Alpaca format.
+
+        Args:
+            dataset_name: Name of the HuggingFace dataset
+            output_path: Path to save the converted dataset
+            instruction_column: Column name for instructions
+            input_column: Column name for inputs
+            output_column: Column name for outputs
+
+        Returns:
+            Path to the saved dataset
+        """
+        logger.info(f"Converting {dataset_name} to Alpaca format")
+
+        # Load dataset
+        dataset = load_dataset(dataset_name, split="train")
+
+        # Convert to Alpaca format
+        alpaca_data = []
+        for example in dataset:
+            alpaca_example = {
+                "instruction": example.get(instruction_column, ""),
+                "input": example.get(input_column, ""),
+                "output": example.get(output_column, "")
+            }
+            alpaca_data.append(alpaca_example)
+
+        # Save to file
+        output_path = Path(output_path)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        with open(output_path, 'w') as f:
+            json.dump(alpaca_data, f, indent=2)
+
+        logger.info(f"Dataset converted and saved to: {output_path}")
+        return str(output_path)
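The hunk above is the entire new `DatasetManager` module. For orientation, here is a minimal usage sketch (not part of the diff): it writes a one-record Alpaca-format file, then runs it through `prepare_dataset`. The tokenizer choice, file name, and final `print` are illustrative assumptions, not anything the package prescribes; it only assumes `transformers` and `datasets` are installed alongside `isa-model`.

```python
# Hypothetical usage sketch for the new DatasetManager (not part of the diff).
import json
from transformers import AutoTokenizer
from isa_model.training.core.dataset import DatasetManager

# An Alpaca-format record has "instruction", "input", and "output" keys,
# which _process_alpaca_format folds into a single "text" field.
records = [
    {
        "instruction": "Translate to French.",
        "input": "Good morning.",
        "output": "Bonjour.",
    }
]
with open("toy_alpaca.json", "w") as f:
    json.dump(records, f)

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # any HF tokenizer works
manager = DatasetManager(tokenizer, max_length=512)
train_ds, eval_ds = manager.prepare_dataset(
    "toy_alpaca.json",
    dataset_format="alpaca",
    validation_split=0.0,  # one record, so skip the train/test split
)
print(train_ds[0].keys())  # input_ids, attention_mask, labels
```

Note that `validation_split=0.0` takes the `else` branch in `prepare_dataset`, so no `train_test_split` is attempted on the single record and `eval_ds` comes back as `None`.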