isa-model 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/__init__.py +5 -0
- isa_model/core/model_manager.py +143 -0
- isa_model/core/model_registry.py +115 -0
- isa_model/core/model_router.py +226 -0
- isa_model/core/model_storage.py +133 -0
- isa_model/core/model_version.py +0 -0
- isa_model/core/resource_manager.py +202 -0
- isa_model/core/storage/hf_storage.py +0 -0
- isa_model/core/storage/local_storage.py +0 -0
- isa_model/core/storage/minio_storage.py +0 -0
- isa_model/deployment/mlflow_gateway/__init__.py +8 -0
- isa_model/deployment/mlflow_gateway/start_gateway.py +65 -0
- isa_model/deployment/unified_multimodal_client.py +341 -0
- isa_model/inference/__init__.py +11 -0
- isa_model/inference/adapter/triton_adapter.py +453 -0
- isa_model/inference/adapter/unified_api.py +248 -0
- isa_model/inference/ai_factory.py +354 -0
- isa_model/inference/backends/Pytorch/bge_embed_backend.py +188 -0
- isa_model/inference/backends/Pytorch/gemma_backend.py +167 -0
- isa_model/inference/backends/Pytorch/llama_backend.py +166 -0
- isa_model/inference/backends/Pytorch/whisper_backend.py +194 -0
- isa_model/inference/backends/__init__.py +53 -0
- isa_model/inference/backends/base_backend_client.py +26 -0
- isa_model/inference/backends/container_services.py +104 -0
- isa_model/inference/backends/local_services.py +72 -0
- isa_model/inference/backends/openai_client.py +130 -0
- isa_model/inference/backends/replicate_client.py +197 -0
- isa_model/inference/backends/third_party_services.py +239 -0
- isa_model/inference/backends/triton_client.py +97 -0
- isa_model/inference/base.py +46 -0
- isa_model/inference/client_sdk/__init__.py +0 -0
- isa_model/inference/client_sdk/client.py +134 -0
- isa_model/inference/client_sdk/client_data_std.py +34 -0
- isa_model/inference/client_sdk/client_sdk_schema.py +16 -0
- isa_model/inference/client_sdk/exceptions.py +0 -0
- isa_model/inference/engine/triton/model_repository/bge/1/model.py +174 -0
- isa_model/inference/engine/triton/model_repository/gemma/1/model.py +250 -0
- isa_model/inference/engine/triton/model_repository/llama/1/model.py +76 -0
- isa_model/inference/engine/triton/model_repository/whisper/1/model.py +195 -0
- isa_model/inference/providers/__init__.py +19 -0
- isa_model/inference/providers/base_provider.py +30 -0
- isa_model/inference/providers/model_cache_manager.py +341 -0
- isa_model/inference/providers/ollama_provider.py +73 -0
- isa_model/inference/providers/openai_provider.py +87 -0
- isa_model/inference/providers/replicate_provider.py +94 -0
- isa_model/inference/providers/triton_provider.py +439 -0
- isa_model/inference/providers/vllm_provider.py +0 -0
- isa_model/inference/providers/yyds_provider.py +83 -0
- isa_model/inference/services/__init__.py +14 -0
- isa_model/inference/services/audio/fish_speech/handler.py +215 -0
- isa_model/inference/services/audio/runpod_tts_fish_service.py +212 -0
- isa_model/inference/services/audio/triton_speech_service.py +138 -0
- isa_model/inference/services/audio/whisper_service.py +186 -0
- isa_model/inference/services/audio/yyds_audio_service.py +71 -0
- isa_model/inference/services/base_service.py +106 -0
- isa_model/inference/services/base_tts_service.py +66 -0
- isa_model/inference/services/embedding/bge_service.py +183 -0
- isa_model/inference/services/embedding/ollama_embed_service.py +85 -0
- isa_model/inference/services/embedding/ollama_rerank_service.py +118 -0
- isa_model/inference/services/embedding/onnx_rerank_service.py +73 -0
- isa_model/inference/services/llm/__init__.py +16 -0
- isa_model/inference/services/llm/gemma_service.py +143 -0
- isa_model/inference/services/llm/llama_service.py +143 -0
- isa_model/inference/services/llm/ollama_llm_service.py +108 -0
- isa_model/inference/services/llm/openai_llm_service.py +129 -0
- isa_model/inference/services/llm/replicate_llm_service.py +179 -0
- isa_model/inference/services/llm/triton_llm_service.py +230 -0
- isa_model/inference/services/others/table_transformer_service.py +61 -0
- isa_model/inference/services/vision/__init__.py +12 -0
- isa_model/inference/services/vision/helpers/image_utils.py +58 -0
- isa_model/inference/services/vision/helpers/text_splitter.py +46 -0
- isa_model/inference/services/vision/ollama_vision_service.py +60 -0
- isa_model/inference/services/vision/replicate_vision_service.py +241 -0
- isa_model/inference/services/vision/triton_vision_service.py +199 -0
- isa_model/inference/services/vision/yyds_vision_service.py +80 -0
- isa_model/inference/utils/conversion/bge_rerank_convert.py +73 -0
- isa_model/inference/utils/conversion/onnx_converter.py +0 -0
- isa_model/inference/utils/conversion/torch_converter.py +0 -0
- isa_model/scripts/inference_tracker.py +283 -0
- isa_model/scripts/mlflow_manager.py +379 -0
- isa_model/scripts/model_registry.py +465 -0
- isa_model/scripts/start_mlflow.py +95 -0
- isa_model/scripts/training_tracker.py +257 -0
- isa_model/training/engine/llama_factory/__init__.py +39 -0
- isa_model/training/engine/llama_factory/config.py +115 -0
- isa_model/training/engine/llama_factory/data_adapter.py +284 -0
- isa_model/training/engine/llama_factory/examples/__init__.py +6 -0
- isa_model/training/engine/llama_factory/examples/finetune_with_tracking.py +185 -0
- isa_model/training/engine/llama_factory/examples/rlhf_with_tracking.py +163 -0
- isa_model/training/engine/llama_factory/factory.py +331 -0
- isa_model/training/engine/llama_factory/rl.py +254 -0
- isa_model/training/engine/llama_factory/trainer.py +171 -0
- isa_model/training/image_model/configs/create_config.py +37 -0
- isa_model/training/image_model/configs/create_flux_config.py +26 -0
- isa_model/training/image_model/configs/create_lora_config.py +21 -0
- isa_model/training/image_model/prepare_massed_compute.py +97 -0
- isa_model/training/image_model/prepare_upload.py +17 -0
- isa_model/training/image_model/raw_data/create_captions.py +16 -0
- isa_model/training/image_model/raw_data/create_lora_captions.py +20 -0
- isa_model/training/image_model/raw_data/pre_processing.py +200 -0
- isa_model/training/image_model/train/train.py +42 -0
- isa_model/training/image_model/train/train_flux.py +41 -0
- isa_model/training/image_model/train/train_lora.py +57 -0
- isa_model/training/image_model/train_main.py +25 -0
- isa_model/training/llm_model/annotation/annotation_schema.py +47 -0
- isa_model/training/llm_model/annotation/processors/annotation_processor.py +126 -0
- isa_model/training/llm_model/annotation/storage/dataset_manager.py +131 -0
- isa_model/training/llm_model/annotation/storage/dataset_schema.py +44 -0
- isa_model/training/llm_model/annotation/tests/test_annotation_flow.py +109 -0
- isa_model/training/llm_model/annotation/tests/test_minio copy.py +113 -0
- isa_model/training/llm_model/annotation/tests/test_minio_upload.py +43 -0
- isa_model/training/llm_model/annotation/views/annotation_controller.py +158 -0
- isa_model-0.1.0.dist-info/METADATA +116 -0
- isa_model-0.1.0.dist-info/RECORD +117 -0
- isa_model-0.1.0.dist-info/WHEEL +5 -0
- isa_model-0.1.0.dist-info/licenses/LICENSE +21 -0
- isa_model-0.1.0.dist-info/top_level.txt +1 -0
isa_model/inference/utils/conversion/bge_rerank_convert.py
@@ -0,0 +1,73 @@
+import os
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from pathlib import Path
+
+def convert_bge_to_onnx(save_dir: str):
+    """Convert BGE reranker to ONNX format"""
+    try:
+        # Create save directory if it doesn't exist
+        save_dir = Path(save_dir).resolve()  # Get absolute path
+        save_dir.mkdir(parents=True, exist_ok=True)
+
+        model_name = "BAAI/bge-reranker-v2-m3"
+        save_path = str(save_dir / "model.onnx")  # Convert to string for absolute path
+
+        print(f"Loading model {model_name}...")
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        model = AutoModelForSequenceClassification.from_pretrained(model_name)
+        model.eval()
+
+        # Save tokenizer for later use
+        print("Saving tokenizer...")
+        tokenizer.save_pretrained(save_dir)
+
+        # Create dummy input
+        print("Creating dummy input...")
+        dummy_input = tokenizer(
+            [["what is panda?", "The giant panda is a bear species."]],
+            padding=True,
+            truncation=True,
+            return_tensors='pt',
+            max_length=512
+        )
+
+        # Export to ONNX with external data storage
+        print(f"Exporting to ONNX: {save_path}")
+        torch.onnx.export(
+            model,
+            (dummy_input['input_ids'], dummy_input['attention_mask']),
+            save_path,  # Using string absolute path
+            input_names=['input_ids', 'attention_mask'],
+            output_names=['logits'],
+            dynamic_axes={
+                'input_ids': {0: 'batch', 1: 'sequence'},
+                'attention_mask': {0: 'batch', 1: 'sequence'},
+                'logits': {0: 'batch'}
+            },
+            opset_version=16,
+            export_params=True,  # Export the trained parameter weights
+            do_constant_folding=True,  # Optimize constant-folding
+            verbose=True,
+            use_external_data_format=True  # Enable external data storage
+        )
+        print("Conversion completed successfully!")
+        return True
+
+    except Exception as e:
+        print(f"Error during conversion: {e}")
+        return False
+
+if __name__ == "__main__":
+    # Get the absolute path to the model directory
+    current_dir = Path(__file__).parent.parent
+    model_dir = current_dir / "model_converted" / "bge-reranker-v2-m3"
+
+    success = convert_bge_to_onnx(str(model_dir))
+    if success:
+        print(f"Model saved to: {model_dir}")
+        print("Files created:")
+        for file in model_dir.glob('*'):
+            print(f"- {file.name}")
+    else:
+        print("Conversion failed!")
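For orientation, the sketch below shows one way the exported reranker could be scored with ONNX Runtime. It is not part of the package: the output directory and the input/output names simply follow the `convert_bge_to_onnx` export above, and `onnxruntime` is assumed to be installed.

```python
# Hypothetical usage sketch (not shipped in isa-model): load the model.onnx
# written by convert_bge_to_onnx() and score a query/passage pair.
import onnxruntime as ort
from transformers import AutoTokenizer

model_dir = "model_converted/bge-reranker-v2-m3"  # assumed output directory of the script above
tokenizer = AutoTokenizer.from_pretrained(model_dir)
session = ort.InferenceSession(f"{model_dir}/model.onnx")

# Encode a (query, passage) pair the same way the dummy input was encoded,
# but as NumPy arrays for ONNX Runtime.
inputs = tokenizer(
    [["what is panda?", "The giant panda is a bear species."]],
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="np",
)

# The export above names its inputs 'input_ids'/'attention_mask' and its output 'logits'.
logits = session.run(
    ["logits"],
    {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"]},
)[0]

print(f"relevance logit: {logits[0][0]:.4f}")  # higher means more relevant
```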
isa_model/inference/utils/conversion/onnx_converter.py: File without changes
isa_model/inference/utils/conversion/torch_converter.py: File without changes
isa_model/scripts/inference_tracker.py
@@ -0,0 +1,283 @@
+"""
+MLflow tracker for inference workflows.
+"""
+
+import os
+import json
+import time
+import logging
+from typing import Dict, List, Optional, Any, Union
+from contextlib import contextmanager
+
+from .mlflow_manager import MLflowManager, ExperimentType
+from .model_registry import ModelRegistry, ModelStage, ModelVersion
+
+
+logger = logging.getLogger(__name__)
+
+
+class InferenceTracker:
+    """
+    Tracker for model inference workflows.
+
+    This class provides utilities to track model inference using MLflow,
+    including performance metrics and input/output logging.
+
+    Example:
+        ```python
+        # Initialize tracker
+        tracker = InferenceTracker(
+            tracking_uri="http://localhost:5000"
+        )
+
+        # Get model from registry
+        model_version = tracker.get_production_model("llama-7b")
+
+        # Track inference
+        with tracker.track_inference(
+            model_name="llama-7b",
+            model_version=model_version.version
+        ):
+            # Start timer
+            start_time = time.time()
+
+            # Generate text
+            output = model.generate(prompt)
+
+            # Log inference
+            tracker.log_inference(
+                input=prompt,
+                output=output,
+                latency_ms=(time.time() - start_time) * 1000
+            )
+        ```
+    """
+
+    def __init__(
+        self,
+        tracking_uri: Optional[str] = None,
+        artifact_uri: Optional[str] = None,
+        registry_uri: Optional[str] = None
+    ):
+        """
+        Initialize the inference tracker.
+
+        Args:
+            tracking_uri: URI for MLflow tracking server
+            artifact_uri: URI for MLflow artifacts
+            registry_uri: URI for MLflow model registry
+        """
+        self.mlflow_manager = MLflowManager(
+            tracking_uri=tracking_uri,
+            artifact_uri=artifact_uri,
+            registry_uri=registry_uri
+        )
+        self.model_registry = ModelRegistry(
+            tracking_uri=tracking_uri,
+            registry_uri=registry_uri
+        )
+        self.current_run_info = {}
+        self.inference_samples = []
+
+    def get_production_model(self, model_name: str) -> Optional[ModelVersion]:
+        """
+        Get the production version of a model.
+
+        Args:
+            model_name: Name of the model
+
+        Returns:
+            Production ModelVersion or None if not found
+        """
+        return self.model_registry.get_latest_model_version(
+            name=model_name,
+            stage=ModelStage.PRODUCTION
+        )
+
+    def get_staging_model(self, model_name: str) -> Optional[ModelVersion]:
+        """
+        Get the staging version of a model.
+
+        Args:
+            model_name: Name of the model
+
+        Returns:
+            Staging ModelVersion or None if not found
+        """
+        return self.model_registry.get_latest_model_version(
+            name=model_name,
+            stage=ModelStage.STAGING
+        )
+
+    @contextmanager
+    def track_inference(
+        self,
+        model_name: str,
+        model_version: Optional[str] = None,
+        batch_size: Optional[int] = None,
+        tags: Optional[Dict[str, str]] = None
+    ):
+        """
+        Track model inference with MLflow.
+
+        Args:
+            model_name: Name of the model
+            model_version: Version of the model
+            batch_size: Batch size for inference
+            tags: Tags for the run
+
+        Yields:
+            Dictionary with run information
+        """
+        run_info = {
+            "model_name": model_name,
+            "model_version": model_version,
+            "batch_size": batch_size,
+            "start_time": time.time(),
+            "metrics": {}
+        }
+
+        # Prepare tags
+        if tags is None:
+            tags = {}
+
+        tags["model_name"] = model_name
+        if model_version:
+            tags["model_version"] = model_version
+
+        if batch_size:
+            tags["batch_size"] = str(batch_size)
+
+        # Start the MLflow run
+        with self.mlflow_manager.start_run(
+            experiment_type=ExperimentType.INFERENCE,
+            model_name=model_name,
+            tags=tags
+        ) as run:
+            run_info["run_id"] = run.info.run_id
+            run_info["experiment_id"] = run.info.experiment_id
+
+            # Reset inference samples
+            self.inference_samples = []
+
+            self.current_run_info = run_info
+            try:
+                yield run_info
+
+                # Calculate and log summary metrics
+                self._log_summary_metrics()
+
+                # Save inference samples
+                if self.inference_samples:
+                    self._save_inference_samples()
+
+            finally:
+                run_info["end_time"] = time.time()
+                run_info["duration"] = run_info["end_time"] - run_info["start_time"]
+
+                # Log duration
+                self.mlflow_manager.log_metrics({
+                    "duration_seconds": run_info["duration"]
+                })
+
+                self.current_run_info = {}
+
+    def log_inference(
+        self,
+        input: str,
+        output: str,
+        latency_ms: Optional[float] = None,
+        token_count: Optional[int] = None,
+        tokens_per_second: Optional[float] = None,
+        metadata: Optional[Dict[str, Any]] = None
+    ) -> None:
+        """
+        Log an inference sample.
+
+        Args:
+            input: Input prompt
+            output: Generated output
+            latency_ms: Latency in milliseconds
+            token_count: Number of tokens generated
+            tokens_per_second: Tokens per second
+            metadata: Additional metadata
+        """
+        if not self.current_run_info:
+            logger.warning("No active run. Inference will not be logged.")
+            return
+
+        sample = {
+            "input": input,
+            "output": output,
+            "timestamp": time.time()
+        }
+
+        if latency_ms is not None:
+            sample["latency_ms"] = latency_ms
+
+        if token_count is not None:
+            sample["token_count"] = token_count
+
+        if tokens_per_second is not None:
+            sample["tokens_per_second"] = tokens_per_second
+
+        if metadata:
+            sample["metadata"] = metadata
+
+        self.inference_samples.append(sample)
+
+        # Log individual metrics
+        metrics = {}
+        if latency_ms is not None:
+            metrics["latency_ms"] = latency_ms
+
+        if token_count is not None:
+            metrics["token_count"] = token_count
+
+        if tokens_per_second is not None:
+            metrics["tokens_per_second"] = tokens_per_second
+
+        if metrics:
+            self.mlflow_manager.log_metrics(metrics)
+
+    def _log_summary_metrics(self) -> None:
+        """Log summary metrics based on all inference samples."""
+        if not self.inference_samples:
+            return
+
+        latencies = [s.get("latency_ms") for s in self.inference_samples if "latency_ms" in s]
+        token_counts = [s.get("token_count") for s in self.inference_samples if "token_count" in s]
+        tokens_per_second = [s.get("tokens_per_second") for s in self.inference_samples if "tokens_per_second" in s]
+
+        metrics = {
+            "inference_count": len(self.inference_samples)
+        }
+
+        if latencies:
+            metrics["avg_latency_ms"] = sum(latencies) / len(latencies)
+            metrics["min_latency_ms"] = min(latencies)
+            metrics["max_latency_ms"] = max(latencies)
+
+        if token_counts:
+            metrics["avg_token_count"] = sum(token_counts) / len(token_counts)
+            metrics["total_tokens"] = sum(token_counts)
+
+        if tokens_per_second:
+            metrics["avg_tokens_per_second"] = sum(tokens_per_second) / len(tokens_per_second)
+
+        self.mlflow_manager.log_metrics(metrics)
+
+    def _save_inference_samples(self) -> None:
+        """Save inference samples as an artifact."""
+        import tempfile
+
+        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as f:
+            json.dump(self.inference_samples, f, indent=2)
+            temp_path = f.name
+
+        self.mlflow_manager.log_artifact(temp_path, "inference_samples.json")
+
+        try:
+            os.remove(temp_path)
+        except:
+            pass