isa-model 0.4.0__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/client.py +466 -43
- isa_model/core/cache/redis_cache.py +12 -3
- isa_model/core/config/config_manager.py +230 -3
- isa_model/core/config.py +90 -0
- isa_model/core/database/direct_db_client.py +114 -0
- isa_model/core/database/migration_manager.py +563 -0
- isa_model/core/database/migrations.py +21 -1
- isa_model/core/database/supabase_client.py +154 -19
- isa_model/core/dependencies.py +316 -0
- isa_model/core/discovery/__init__.py +19 -0
- isa_model/core/discovery/consul_discovery.py +190 -0
- isa_model/core/logging/__init__.py +54 -0
- isa_model/core/logging/influx_logger.py +523 -0
- isa_model/core/logging/loki_logger.py +160 -0
- isa_model/core/models/__init__.py +27 -18
- isa_model/core/models/config_models.py +625 -0
- isa_model/core/models/deployment_billing_tracker.py +430 -0
- isa_model/core/models/model_manager.py +35 -80
- isa_model/core/models/model_metadata.py +690 -0
- isa_model/core/models/model_repo.py +174 -18
- isa_model/core/models/system_models.py +857 -0
- isa_model/core/repositories/__init__.py +9 -0
- isa_model/core/repositories/config_repository.py +912 -0
- isa_model/core/services/intelligent_model_selector.py +399 -21
- isa_model/core/types.py +1 -0
- isa_model/deployment/__init__.py +5 -48
- isa_model/deployment/core/__init__.py +2 -31
- isa_model/deployment/core/deployment_manager.py +1278 -370
- isa_model/deployment/modal/__init__.py +8 -0
- isa_model/deployment/modal/config.py +136 -0
- isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
- isa_model/deployment/modal/services/__init__.py +3 -0
- isa_model/deployment/modal/services/audio/__init__.py +1 -0
- isa_model/deployment/modal/services/embedding/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
- isa_model/deployment/modal/services/video/__init__.py +1 -0
- isa_model/deployment/modal/services/vision/__init__.py +1 -0
- isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
- isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
- isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/storage/__init__.py +5 -0
- isa_model/deployment/storage/deployment_repository.py +824 -0
- isa_model/deployment/triton/__init__.py +10 -0
- isa_model/deployment/triton/config.py +196 -0
- isa_model/deployment/triton/configs/__init__.py +1 -0
- isa_model/deployment/triton/provider.py +512 -0
- isa_model/deployment/triton/scripts/__init__.py +1 -0
- isa_model/deployment/triton/templates/__init__.py +1 -0
- isa_model/inference/__init__.py +47 -1
- isa_model/inference/ai_factory.py +137 -10
- isa_model/inference/legacy_services/__init__.py +21 -0
- isa_model/inference/legacy_services/model_evaluation.py +637 -0
- isa_model/inference/legacy_services/model_service.py +573 -0
- isa_model/inference/legacy_services/model_serving.py +717 -0
- isa_model/inference/legacy_services/model_training.py +561 -0
- isa_model/inference/models/__init__.py +21 -0
- isa_model/inference/models/inference_config.py +551 -0
- isa_model/inference/models/inference_record.py +675 -0
- isa_model/inference/models/performance_models.py +714 -0
- isa_model/inference/repositories/__init__.py +9 -0
- isa_model/inference/repositories/inference_repository.py +828 -0
- isa_model/inference/services/audio/base_stt_service.py +184 -11
- isa_model/inference/services/audio/openai_stt_service.py +22 -6
- isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
- isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
- isa_model/inference/services/llm/__init__.py +10 -2
- isa_model/inference/services/llm/base_llm_service.py +335 -24
- isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
- isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
- isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
- isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
- isa_model/inference/services/llm/ollama_llm_service.py +9 -2
- isa_model/inference/services/llm/openai_llm_service.py +33 -16
- isa_model/inference/services/llm/yyds_llm_service.py +8 -2
- isa_model/inference/services/vision/__init__.py +22 -1
- isa_model/inference/services/vision/helpers/image_utils.py +8 -5
- isa_model/inference/services/vision/isa_vision_service.py +65 -4
- isa_model/inference/services/vision/openai_vision_service.py +19 -10
- isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
- isa_model/serving/api/cache_manager.py +245 -0
- isa_model/serving/api/dependencies/__init__.py +1 -0
- isa_model/serving/api/dependencies/auth.py +194 -0
- isa_model/serving/api/dependencies/database.py +139 -0
- isa_model/serving/api/error_handlers.py +284 -0
- isa_model/serving/api/fastapi_server.py +172 -22
- isa_model/serving/api/middleware/auth.py +8 -2
- isa_model/serving/api/middleware/security.py +23 -33
- isa_model/serving/api/middleware/tenant_context.py +414 -0
- isa_model/serving/api/routes/analytics.py +4 -1
- isa_model/serving/api/routes/config.py +645 -0
- isa_model/serving/api/routes/deployment_billing.py +315 -0
- isa_model/serving/api/routes/deployments.py +138 -2
- isa_model/serving/api/routes/gpu_gateway.py +440 -0
- isa_model/serving/api/routes/health.py +32 -12
- isa_model/serving/api/routes/inference_monitoring.py +486 -0
- isa_model/serving/api/routes/local_deployments.py +448 -0
- isa_model/serving/api/routes/tenants.py +575 -0
- isa_model/serving/api/routes/unified.py +680 -18
- isa_model/serving/api/routes/webhooks.py +479 -0
- isa_model/serving/api/startup.py +68 -54
- isa_model/utils/gpu_utils.py +311 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/METADATA +71 -24
- isa_model-0.4.4.dist-info/RECORD +180 -0
- isa_model/core/security/secrets.py +0 -358
- isa_model/core/storage/hf_storage.py +0 -419
- isa_model/core/storage/minio_storage.py +0 -0
- isa_model/deployment/cloud/__init__.py +0 -9
- isa_model/deployment/cloud/modal/__init__.py +0 -10
- isa_model/deployment/core/deployment_config.py +0 -356
- isa_model/deployment/core/isa_deployment_service.py +0 -401
- isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
- isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
- isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
- isa_model/deployment/runtime/deployed_service.py +0 -338
- isa_model/deployment/services/__init__.py +0 -9
- isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
- isa_model/deployment/services/model_service.py +0 -332
- isa_model/deployment/services/service_monitor.py +0 -356
- isa_model/deployment/services/service_registry.py +0 -527
- isa_model/eval/__init__.py +0 -92
- isa_model/eval/benchmarks/__init__.py +0 -27
- isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
- isa_model/eval/benchmarks.py +0 -701
- isa_model/eval/config/__init__.py +0 -10
- isa_model/eval/config/evaluation_config.py +0 -108
- isa_model/eval/evaluators/__init__.py +0 -24
- isa_model/eval/evaluators/audio_evaluator.py +0 -727
- isa_model/eval/evaluators/base_evaluator.py +0 -503
- isa_model/eval/evaluators/embedding_evaluator.py +0 -742
- isa_model/eval/evaluators/llm_evaluator.py +0 -472
- isa_model/eval/evaluators/vision_evaluator.py +0 -564
- isa_model/eval/example_evaluation.py +0 -395
- isa_model/eval/factory.py +0 -798
- isa_model/eval/infrastructure/__init__.py +0 -24
- isa_model/eval/infrastructure/experiment_tracker.py +0 -466
- isa_model/eval/isa_benchmarks.py +0 -700
- isa_model/eval/isa_integration.py +0 -582
- isa_model/eval/metrics.py +0 -951
- isa_model/eval/tests/unit/test_basic.py +0 -396
- isa_model/serving/api/routes/evaluations.py +0 -579
- isa_model/training/__init__.py +0 -168
- isa_model/training/annotation/annotation_schema.py +0 -47
- isa_model/training/annotation/processors/annotation_processor.py +0 -126
- isa_model/training/annotation/storage/dataset_manager.py +0 -131
- isa_model/training/annotation/storage/dataset_schema.py +0 -44
- isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
- isa_model/training/annotation/tests/test_minio copy.py +0 -113
- isa_model/training/annotation/tests/test_minio_upload.py +0 -43
- isa_model/training/annotation/views/annotation_controller.py +0 -158
- isa_model/training/cloud/__init__.py +0 -22
- isa_model/training/cloud/job_orchestrator.py +0 -402
- isa_model/training/cloud/runpod_trainer.py +0 -454
- isa_model/training/cloud/storage_manager.py +0 -482
- isa_model/training/core/__init__.py +0 -26
- isa_model/training/core/config.py +0 -181
- isa_model/training/core/dataset.py +0 -222
- isa_model/training/core/trainer.py +0 -720
- isa_model/training/core/utils.py +0 -213
- isa_model/training/examples/intelligent_training_example.py +0 -281
- isa_model/training/factory.py +0 -424
- isa_model/training/intelligent/__init__.py +0 -25
- isa_model/training/intelligent/decision_engine.py +0 -643
- isa_model/training/intelligent/intelligent_factory.py +0 -888
- isa_model/training/intelligent/knowledge_base.py +0 -751
- isa_model/training/intelligent/resource_optimizer.py +0 -839
- isa_model/training/intelligent/task_classifier.py +0 -576
- isa_model/training/storage/__init__.py +0 -24
- isa_model/training/storage/core_integration.py +0 -439
- isa_model/training/storage/training_repository.py +0 -552
- isa_model/training/storage/training_storage.py +0 -628
- isa_model-0.4.0.dist-info/RECORD +0 -182
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
- /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/WHEEL +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/top_level.txt +0 -0

isa_model/deployment/modal/__init__.py
@@ -0,0 +1,8 @@
+"""
+Modal deployment services and utilities
+"""
+
+from .deployer import ModalDeployer
+from .config import ModalConfig, ModalServiceType, create_llm_config, create_vision_config, create_audio_config, create_embedding_config
+
+__all__ = ["ModalDeployer", "ModalConfig", "ModalServiceType", "create_llm_config", "create_vision_config", "create_audio_config", "create_embedding_config"]
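
For context, the new `isa_model.deployment.modal` package re-exports the deployer and the config helpers listed in `__all__`, so callers can import them from a single place. A minimal import sketch (package path taken from the file list above; anything beyond the import itself is an assumption):

# Hypothetical consumer-side import; names come from the __all__ above.
from isa_model.deployment.modal import ModalDeployer, ModalConfig, create_llm_config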

isa_model/deployment/modal/config.py
@@ -0,0 +1,136 @@
+"""
+Modal deployment configuration
+
+Simplified configuration for Modal-specific deployments.
+"""
+
+from dataclasses import dataclass, field
+from typing import Dict, Any, Optional
+from enum import Enum
+
+
+class ModalServiceType(Enum):
+    """Modal service types"""
+    LLM = "llm"
+    VISION = "vision"
+    AUDIO = "audio"
+    EMBEDDING = "embedding"
+    VIDEO = "video"
+
+
+@dataclass
+class ModalConfig:
+    """Configuration for Modal deployment"""
+
+    # Service identification
+    service_name: str
+    service_type: ModalServiceType
+    model_id: str
+
+    # Modal-specific settings
+    image_tag: str = "latest"
+    cpu_cores: int = 2
+    memory_gb: int = 8
+    gpu_type: Optional[str] = None  # e.g., "A10G", "T4", "A100"
+    timeout_seconds: int = 300
+
+    # Scaling configuration
+    min_instances: int = 0
+    max_instances: int = 10
+    concurrency_limit: int = 1
+
+    # Environment variables
+    environment: Dict[str, str] = field(default_factory=dict)
+
+    # Service-specific configuration
+    service_config: Dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary for serialization"""
+        return {
+            "service_name": self.service_name,
+            "service_type": self.service_type.value,
+            "model_id": self.model_id,
+            "image_tag": self.image_tag,
+            "cpu_cores": self.cpu_cores,
+            "memory_gb": self.memory_gb,
+            "gpu_type": self.gpu_type,
+            "timeout_seconds": self.timeout_seconds,
+            "min_instances": self.min_instances,
+            "max_instances": self.max_instances,
+            "concurrency_limit": self.concurrency_limit,
+            "environment": self.environment,
+            "service_config": self.service_config
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "ModalConfig":
+        """Create from dictionary"""
+        return cls(
+            service_name=data["service_name"],
+            service_type=ModalServiceType(data["service_type"]),
+            model_id=data["model_id"],
+            image_tag=data.get("image_tag", "latest"),
+            cpu_cores=data.get("cpu_cores", 2),
+            memory_gb=data.get("memory_gb", 8),
+            gpu_type=data.get("gpu_type"),
+            timeout_seconds=data.get("timeout_seconds", 300),
+            min_instances=data.get("min_instances", 0),
+            max_instances=data.get("max_instances", 10),
+            concurrency_limit=data.get("concurrency_limit", 1),
+            environment=data.get("environment", {}),
+            service_config=data.get("service_config", {})
+        )
+
+
+# Predefined configurations for common service types
+def create_llm_config(service_name: str, model_id: str, gpu_type: str = "A10G") -> ModalConfig:
+    """Create configuration for LLM service"""
+    return ModalConfig(
+        service_name=service_name,
+        service_type=ModalServiceType.LLM,
+        model_id=model_id,
+        gpu_type=gpu_type,
+        memory_gb=16,
+        timeout_seconds=600,
+        max_instances=5
+    )
+
+
+def create_vision_config(service_name: str, model_id: str, gpu_type: str = "T4") -> ModalConfig:
+    """Create configuration for vision service"""
+    return ModalConfig(
+        service_name=service_name,
+        service_type=ModalServiceType.VISION,
+        model_id=model_id,
+        gpu_type=gpu_type,
+        memory_gb=12,
+        timeout_seconds=300,
+        max_instances=10
+    )
+
+
+def create_audio_config(service_name: str, model_id: str, gpu_type: str = "T4") -> ModalConfig:
+    """Create configuration for audio service"""
+    return ModalConfig(
+        service_name=service_name,
+        service_type=ModalServiceType.AUDIO,
+        model_id=model_id,
+        gpu_type=gpu_type,
+        memory_gb=8,
+        timeout_seconds=300,
+        max_instances=8
+    )
+
+
+def create_embedding_config(service_name: str, model_id: str, gpu_type: str = "T4") -> ModalConfig:
+    """Create configuration for embedding service"""
+    return ModalConfig(
+        service_name=service_name,
+        service_type=ModalServiceType.EMBEDDING,
+        model_id=model_id,
+        gpu_type=gpu_type,
+        memory_gb=6,
+        timeout_seconds=120,
+        max_instances=15
+    )

isa_model/deployment/modal/services/audio/__init__.py
@@ -0,0 +1 @@
+"""Audio services for Modal deployment"""

isa_model/deployment/modal/services/embedding/__init__.py
@@ -0,0 +1 @@
+"""Embedding services for Modal deployment"""

isa_model/deployment/modal/services/llm/__init__.py
@@ -0,0 +1 @@
+"""LLM services for Modal deployment"""

isa_model/deployment/modal/services/llm/isa_llm_service.py
@@ -0,0 +1,424 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+ISA LLM Service - Modal deployment for HuggingFace trained models
+Provides inference API for custom trained models
+"""
+
+import os
+import logging
+from typing import Dict, Any, List, Optional
+import modal
+
+# Modal app configuration
+app = modal.App("isa-llm-inference")
+
+# GPU configuration for inference
+GPU_CONFIG = modal.gpu.A10G()
+
+# Base image with HuggingFace transformers
+image = (
+    modal.Image.debian_slim(python_version="3.11")
+    .pip_install([
+        "torch>=2.0.0",
+        "transformers>=4.35.0",
+        "accelerate>=0.20.0",
+        "huggingface_hub>=0.17.0",
+        "peft>=0.5.0",  # For LoRA models
+        "bitsandbytes>=0.41.0",  # For quantization
+        "sentencepiece>=0.1.99",  # For tokenizers
+    ])
+)
+
+logger = logging.getLogger(__name__)
+
+@app.cls(
+    image=image,
+    gpu=GPU_CONFIG,
+    cpu=2.0,
+    memory=16384,  # 16GB memory
+    timeout=300,  # 5 minute timeout
+    container_idle_timeout=60,  # Keep warm for 1 minute
+    allow_concurrent_inputs=5,  # Allow concurrent requests
+)
+class ISALLMService:
+    """
+    ISA LLM Service for inference on HuggingFace trained models
+    Designed to work with models trained through ISA training pipeline
+    """
+
+    def __init__(self):
+        """Initialize the service (runs on container startup)"""
+        import torch
+        from transformers import AutoTokenizer, AutoModelForCausalLM
+
+        # Model will be loaded when first requested
+        self.model = None
+        self.tokenizer = None
+        self.current_model_id = None
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        logger.info(f"ISA LLM Service initialized on {self.device}")
+
+    def _load_model(self, model_id: str, hf_token: str = None):
+        """Load a specific model"""
+        import torch
+        from transformers import AutoTokenizer, AutoModelForCausalLM
+
+        if self.current_model_id == model_id and self.model is not None:
+            logger.info(f"Model {model_id} already loaded")
+            return
+
+        logger.info(f"Loading model: {model_id}")
+
+        try:
+            # Load tokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                model_id,
+                token=hf_token,
+                trust_remote_code=True
+            )
+
+            # Set pad token if not exists
+            if self.tokenizer.pad_token is None:
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+
+            # Load model with GPU optimization
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                token=hf_token,
+                torch_dtype=torch.float16,
+                device_map="auto",
+                trust_remote_code=True,
+                low_cpu_mem_usage=True
+            )
+
+            self.current_model_id = model_id
+            logger.info(f"Successfully loaded model {model_id}")
+
+        except Exception as e:
+            logger.error(f"Failed to load model {model_id}: {e}")
+            raise
+
+    @modal.method
+    def generate_text(
+        self,
+        prompt: str,
+        model_id: str,
+        hf_token: str = None,
+        max_length: int = 100,
+        temperature: float = 0.7,
+        do_sample: bool = True,
+        top_p: float = 0.9,
+        repetition_penalty: float = 1.1,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Generate text using the specified model
+
+        Args:
+            prompt: Input text prompt
+            model_id: HuggingFace model ID (e.g., "xenobordom/dialogpt-isa-trained-xxx")
+            hf_token: HuggingFace token for private models
+            max_length: Maximum generation length
+            temperature: Sampling temperature
+            do_sample: Whether to use sampling
+            top_p: Top-p sampling parameter
+            repetition_penalty: Repetition penalty
+            **kwargs: Additional generation parameters
+
+        Returns:
+            Dictionary containing generated text and metadata
+        """
+        import torch
+        import time
+
+        start_time = time.time()
+
+        try:
+            # Load model if needed
+            self._load_model(model_id, hf_token)
+
+            if self.model is None or self.tokenizer is None:
+                raise RuntimeError("Model not properly loaded")
+
+            # Tokenize input
+            inputs = self.tokenizer(
+                prompt,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=512
+            ).to(self.device)
+
+            # Generate
+            with torch.no_grad():
+                outputs = self.model.generate(
+                    **inputs,
+                    max_length=inputs["input_ids"].shape[1] + max_length,
+                    temperature=temperature,
+                    do_sample=do_sample,
+                    top_p=top_p,
+                    repetition_penalty=repetition_penalty,
+                    pad_token_id=self.tokenizer.pad_token_id,
+                    eos_token_id=self.tokenizer.eos_token_id,
+                    **kwargs
+                )
+
+            # Decode generated text
+            full_text = self.tokenizer.decode(
+                outputs[0],
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=True
+            )
+
+            # Extract only the new generated part
+            generated_text = full_text
+            if generated_text.startswith(prompt):
+                generated_text = generated_text[len(prompt):].strip()
+
+            processing_time = time.time() - start_time
+
+            return {
+                "success": True,
+                "text": generated_text,
+                "full_text": full_text,
+                "prompt": prompt,
+                "model_id": model_id,
+                "provider": "ISA",
+                "service": "isa-llm",
+                "generation_config": {
+                    "max_length": max_length,
+                    "temperature": temperature,
+                    "do_sample": do_sample,
+                    "top_p": top_p,
+                    "repetition_penalty": repetition_penalty
+                },
+                "metadata": {
+                    "processing_time": processing_time,
+                    "device": str(self.device),
+                    "input_tokens": inputs["input_ids"].shape[1],
+                    "output_tokens": outputs.shape[1]
+                }
+            }
+
+        except Exception as e:
+            logger.error(f"Error during text generation: {e}")
+            return {
+                "success": False,
+                "error": str(e),
+                "prompt": prompt,
+                "model_id": model_id,
+                "provider": "ISA",
+                "service": "isa-llm"
+            }
+
+    @modal.method
+    def chat_completion(
+        self,
+        messages: List[Dict[str, str]],
+        model_id: str,
+        hf_token: str = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Chat completion with conversation history
+
+        Args:
+            messages: List of message dictionaries with 'role' and 'content'
+            model_id: HuggingFace model ID
+            hf_token: HuggingFace token
+            **kwargs: Additional generation parameters
+
+        Returns:
+            Dictionary containing generated response and metadata
+        """
+        try:
+            # Convert messages to a single prompt
+            conversation = ""
+            for msg in messages:
+                role = msg.get("role", "user")
+                content = msg.get("content", "")
+                if role == "user":
+                    conversation += f"User: {content}\n"
+                elif role == "assistant":
+                    conversation += f"Assistant: {content}\n"
+                elif role == "system":
+                    conversation += f"System: {content}\n"
+
+            conversation += "Assistant: "
+
+            # Generate response
+            result = self.generate_text(
+                prompt=conversation,
+                model_id=model_id,
+                hf_token=hf_token,
+                **kwargs
+            )
+
+            # Format as chat response
+            if result.get("success"):
+                result["role"] = "assistant"
+                result["conversation"] = conversation
+                result["messages"] = messages
+
+            return result
+
+        except Exception as e:
+            logger.error(f"Error during chat completion: {e}")
+            return {
+                "success": False,
+                "error": str(e),
+                "messages": messages,
+                "model_id": model_id,
+                "provider": "ISA",
+                "service": "isa-llm"
+            }
+
+    @modal.method
+    def get_model_info(self, model_id: str, hf_token: str = None) -> Dict[str, Any]:
+        """Get information about the loaded model"""
+        try:
+            # Load model if needed
+            self._load_model(model_id, hf_token)
+
+            if self.model is None:
+                return {
+                    "success": False,
+                    "error": "Model not loaded"
+                }
+
+            # Get model config
+            config = self.model.config
+
+            # Count parameters
+            total_params = sum(p.numel() for p in self.model.parameters())
+            trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
+
+            return {
+                "success": True,
+                "model_id": model_id,
+                "provider": "ISA",
+                "service": "isa-llm",
+                "architecture": config.model_type if hasattr(config, 'model_type') else "unknown",
+                "vocab_size": config.vocab_size if hasattr(config, 'vocab_size') else None,
+                "hidden_size": config.hidden_size if hasattr(config, 'hidden_size') else None,
+                "num_layers": getattr(config, 'num_layers', getattr(config, 'n_layer', None)),
+                "num_attention_heads": getattr(config, 'num_attention_heads', getattr(config, 'n_head', None)),
+                "total_parameters": total_params,
+                "trainable_parameters": trainable_params,
+                "device": str(self.device),
+                "dtype": str(next(self.model.parameters()).dtype)
+            }
+
+        except Exception as e:
+            logger.error(f"Error getting model info: {e}")
+            return {
+                "success": False,
+                "error": str(e)
+            }
+
+    @modal.method
+    def health_check(self) -> Dict[str, Any]:
+        """Health check endpoint"""
+        import torch
+
+        try:
+            gpu_available = torch.cuda.is_available()
+            gpu_count = torch.cuda.device_count() if gpu_available else 0
+
+            return {
+                "success": True,
+                "status": "healthy",
+                "service": "isa-llm",
+                "provider": "ISA",
+                "device": str(self.device),
+                "gpu_available": gpu_available,
+                "gpu_count": gpu_count,
+                "current_model": self.current_model_id,
+                "memory_info": {
+                    "allocated": torch.cuda.memory_allocated() if gpu_available else 0,
+                    "cached": torch.cuda.memory_reserved() if gpu_available else 0
+                } if gpu_available else None
+            }
+
+        except Exception as e:
+            return {
+                "success": False,
+                "status": "error",
+                "error": str(e)
+            }
+
+# Deployment functions
+@app.function(
+    image=image,
+    schedule=modal.Cron("0 2 * * *"),  # Deploy daily at 2 AM
+    timeout=300
+)
+def deploy_service():
+    """Deploy the ISA LLM service"""
+    print("ISA LLM Service deployed successfully!")
+    return {"status": "deployed", "service": "isa-llm"}
+
+# Local testing function
+@app.local_entrypoint()
+def test_service():
+    """Test the ISA LLM service locally"""
+
+    # Test with our trained model
+    test_model_id = "xenobordom/dialogpt-isa-trained-1755493402"
+    test_prompt = "你好"
+
+    # Get HF token from environment
+    hf_token = os.getenv("HF_TOKEN")
+    if not hf_token:
+        print("❌ HF_TOKEN not found in environment")
+        return
+
+    print(f"🧪 Testing ISA LLM Service with model: {test_model_id}")
+
+    # Create service instance
+    service = ISALLMService()
+
+    # Test health check
+    print("📋 Testing health check...")
+    health = service.health_check.remote()
+    print(f"Health: {health}")
+
+    # Test model info
+    print("📊 Testing model info...")
+    info = service.get_model_info.remote(test_model_id, hf_token)
+    print(f"Model info: {info}")
+
+    # Test text generation
+    print("🤖 Testing text generation...")
+    result = service.generate_text.remote(
+        prompt=test_prompt,
+        model_id=test_model_id,
+        hf_token=hf_token,
+        max_length=30,
+        temperature=0.7
+    )
+    print(f"Generation result: {result}")
+
+    # Test chat completion
+    print("💬 Testing chat completion...")
+    messages = [
+        {"role": "user", "content": "你好"},
+        {"role": "assistant", "content": "你好!很高兴见到你。"},
+        {"role": "user", "content": "你能帮我做什么?"}
+    ]
+    chat_result = service.chat_completion.remote(
+        messages=messages,
+        model_id=test_model_id,
+        hf_token=hf_token,
+        max_length=30
+    )
+    print(f"Chat result: {chat_result}")
+
+    print("✅ ISA LLM Service test completed!")
+
+if __name__ == "__main__":
+    # For local development
+    test_service()

isa_model/deployment/modal/services/video/__init__.py
@@ -0,0 +1 @@
+"""Video services for Modal deployment"""

isa_model/deployment/modal/services/vision/__init__.py
@@ -0,0 +1 @@
+"""Vision services for Modal deployment"""

isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py
@@ -0,0 +1,48 @@
+"""
+tenant-a-service LLM Service for Modal
+
+Auto-generated service for model: gpt2
+Architecture: gpt
+"""
+
+import modal
+from typing import Dict, Any, List
+
+app = modal.App("tenant-a-service")
+
+image = modal.Image.debian_slim().pip_install(
+    "accelerate>=0.24.0", "transformers>=4.35.0", "httpx>=0.26.0", "torch>=2.0.0", "requests>=2.31.0", "numpy>=1.24.0", "pydantic>=2.0.0"
+)
+
+@app.cls(
+    image=image,
+    gpu=modal.gpu.A10G(count=1),
+    container_idle_timeout=300,
+    memory=32768
+)
+class Tenant_A_ServiceService:
+
+    @modal.enter()
+    def load_model(self):
+        import torch
+        from transformers import AutoTokenizer, AutoModelForCausalLM
+
+        self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        self.model = AutoModelForCausalLM.from_pretrained(
+            "gpt2",
+            torch_dtype=torch.float16,
+            device_map="auto",
+            trust_remote_code=True
+        )
+
+    @modal.method()
+    def generate(self, messages: List[Dict[str, str]], **kwargs):
+        # Generate response (simplified)
+        prompt = messages[-1]["content"] if messages else ""
+        return {"response": f"Generated response for: {prompt}", "model": "gpt2"}
+
+@app.function(image=image)
+@modal.web_endpoint(method="POST")
+def inference_endpoint(item: Dict[str, Any]):
+    service = Tenant_A_ServiceService()
+    return service.generate(**item)

isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py
@@ -0,0 +1,48 @@
+"""
+prefix-test-service LLM Service for Modal
+
+Auto-generated service for model: gpt2
+Architecture: gpt
+"""
+
+import modal
+from typing import Dict, Any, List
+
+app = modal.App("prefix-test-service")
+
+image = modal.Image.debian_slim().pip_install(
+    "accelerate>=0.24.0", "transformers>=4.35.0", "httpx>=0.26.0", "torch>=2.0.0", "requests>=2.31.0", "numpy>=1.24.0", "pydantic>=2.0.0"
+)
+
+@app.cls(
+    image=image,
+    gpu=modal.gpu.A10G(count=1),
+    container_idle_timeout=300,
+    memory=32768
+)
+class Prefix_Test_ServiceService:
+
+    @modal.enter()
+    def load_model(self):
+        import torch
+        from transformers import AutoTokenizer, AutoModelForCausalLM
+
+        self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        self.model = AutoModelForCausalLM.from_pretrained(
+            "gpt2",
+            torch_dtype=torch.float16,
+            device_map="auto",
+            trust_remote_code=True
+        )
+
+    @modal.method()
+    def generate(self, messages: List[Dict[str, str]], **kwargs):
+        # Generate response (simplified)
+        prompt = messages[-1]["content"] if messages else ""
+        return {"response": f"Generated response for: {prompt}", "model": "gpt2"}
+
+@app.function(image=image)
+@modal.web_endpoint(method="POST")
+def inference_endpoint(item: Dict[str, Any]):
+    service = Prefix_Test_ServiceService()
+    return service.generate(**item)
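
Both auto-generated services above expose the same `inference_endpoint` web endpoint, which takes a JSON body matching the keyword arguments of `generate`. A minimal client sketch follows; the URL is a placeholder, since actual Modal web-endpoint URLs depend on the workspace and app name and are printed at deploy time.

import requests

# Placeholder URL: substitute the real *.modal.run URL shown by `modal deploy`.
url = "https://<workspace>--tenant-a-service-inference-endpoint.modal.run"

payload = {"messages": [{"role": "user", "content": "Hello"}]}
resp = requests.post(url, json=payload, timeout=60)
print(resp.json())  # expected shape per generate(): {"response": "...", "model": "gpt2"}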