isa-model 0.4.0-py3-none-any.whl → 0.4.4-py3-none-any.whl

This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions exactly as they appear in the public registry.
Files changed (189)
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +35 -80
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/types.py +1 -0
  26. isa_model/deployment/__init__.py +5 -48
  27. isa_model/deployment/core/__init__.py +2 -31
  28. isa_model/deployment/core/deployment_manager.py +1278 -370
  29. isa_model/deployment/modal/__init__.py +8 -0
  30. isa_model/deployment/modal/config.py +136 -0
  31. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  32. isa_model/deployment/modal/services/__init__.py +3 -0
  33. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  34. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  35. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  36. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  37. isa_model/deployment/modal/services/video/__init__.py +1 -0
  38. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  39. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  40. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  41. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  42. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  43. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  44. isa_model/deployment/storage/__init__.py +5 -0
  45. isa_model/deployment/storage/deployment_repository.py +824 -0
  46. isa_model/deployment/triton/__init__.py +10 -0
  47. isa_model/deployment/triton/config.py +196 -0
  48. isa_model/deployment/triton/configs/__init__.py +1 -0
  49. isa_model/deployment/triton/provider.py +512 -0
  50. isa_model/deployment/triton/scripts/__init__.py +1 -0
  51. isa_model/deployment/triton/templates/__init__.py +1 -0
  52. isa_model/inference/__init__.py +47 -1
  53. isa_model/inference/ai_factory.py +137 -10
  54. isa_model/inference/legacy_services/__init__.py +21 -0
  55. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  56. isa_model/inference/legacy_services/model_service.py +573 -0
  57. isa_model/inference/legacy_services/model_serving.py +717 -0
  58. isa_model/inference/legacy_services/model_training.py +561 -0
  59. isa_model/inference/models/__init__.py +21 -0
  60. isa_model/inference/models/inference_config.py +551 -0
  61. isa_model/inference/models/inference_record.py +675 -0
  62. isa_model/inference/models/performance_models.py +714 -0
  63. isa_model/inference/repositories/__init__.py +9 -0
  64. isa_model/inference/repositories/inference_repository.py +828 -0
  65. isa_model/inference/services/audio/base_stt_service.py +184 -11
  66. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  67. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  68. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  69. isa_model/inference/services/llm/__init__.py +10 -2
  70. isa_model/inference/services/llm/base_llm_service.py +335 -24
  71. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  72. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  73. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  74. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  75. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  76. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  77. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  78. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  79. isa_model/inference/services/vision/__init__.py +22 -1
  80. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  81. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  82. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  83. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  84. isa_model/serving/api/cache_manager.py +245 -0
  85. isa_model/serving/api/dependencies/__init__.py +1 -0
  86. isa_model/serving/api/dependencies/auth.py +194 -0
  87. isa_model/serving/api/dependencies/database.py +139 -0
  88. isa_model/serving/api/error_handlers.py +284 -0
  89. isa_model/serving/api/fastapi_server.py +172 -22
  90. isa_model/serving/api/middleware/auth.py +8 -2
  91. isa_model/serving/api/middleware/security.py +23 -33
  92. isa_model/serving/api/middleware/tenant_context.py +414 -0
  93. isa_model/serving/api/routes/analytics.py +4 -1
  94. isa_model/serving/api/routes/config.py +645 -0
  95. isa_model/serving/api/routes/deployment_billing.py +315 -0
  96. isa_model/serving/api/routes/deployments.py +138 -2
  97. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  98. isa_model/serving/api/routes/health.py +32 -12
  99. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  100. isa_model/serving/api/routes/local_deployments.py +448 -0
  101. isa_model/serving/api/routes/tenants.py +575 -0
  102. isa_model/serving/api/routes/unified.py +680 -18
  103. isa_model/serving/api/routes/webhooks.py +479 -0
  104. isa_model/serving/api/startup.py +68 -54
  105. isa_model/utils/gpu_utils.py +311 -0
  106. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/METADATA +71 -24
  107. isa_model-0.4.4.dist-info/RECORD +180 -0
  108. isa_model/core/security/secrets.py +0 -358
  109. isa_model/core/storage/hf_storage.py +0 -419
  110. isa_model/core/storage/minio_storage.py +0 -0
  111. isa_model/deployment/cloud/__init__.py +0 -9
  112. isa_model/deployment/cloud/modal/__init__.py +0 -10
  113. isa_model/deployment/core/deployment_config.py +0 -356
  114. isa_model/deployment/core/isa_deployment_service.py +0 -401
  115. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  116. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  117. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  118. isa_model/deployment/runtime/deployed_service.py +0 -338
  119. isa_model/deployment/services/__init__.py +0 -9
  120. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  121. isa_model/deployment/services/model_service.py +0 -332
  122. isa_model/deployment/services/service_monitor.py +0 -356
  123. isa_model/deployment/services/service_registry.py +0 -527
  124. isa_model/eval/__init__.py +0 -92
  125. isa_model/eval/benchmarks/__init__.py +0 -27
  126. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  127. isa_model/eval/benchmarks.py +0 -701
  128. isa_model/eval/config/__init__.py +0 -10
  129. isa_model/eval/config/evaluation_config.py +0 -108
  130. isa_model/eval/evaluators/__init__.py +0 -24
  131. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  132. isa_model/eval/evaluators/base_evaluator.py +0 -503
  133. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  134. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  135. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  136. isa_model/eval/example_evaluation.py +0 -395
  137. isa_model/eval/factory.py +0 -798
  138. isa_model/eval/infrastructure/__init__.py +0 -24
  139. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  140. isa_model/eval/isa_benchmarks.py +0 -700
  141. isa_model/eval/isa_integration.py +0 -582
  142. isa_model/eval/metrics.py +0 -951
  143. isa_model/eval/tests/unit/test_basic.py +0 -396
  144. isa_model/serving/api/routes/evaluations.py +0 -579
  145. isa_model/training/__init__.py +0 -168
  146. isa_model/training/annotation/annotation_schema.py +0 -47
  147. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  148. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  149. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  150. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  151. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  152. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  153. isa_model/training/annotation/views/annotation_controller.py +0 -158
  154. isa_model/training/cloud/__init__.py +0 -22
  155. isa_model/training/cloud/job_orchestrator.py +0 -402
  156. isa_model/training/cloud/runpod_trainer.py +0 -454
  157. isa_model/training/cloud/storage_manager.py +0 -482
  158. isa_model/training/core/__init__.py +0 -26
  159. isa_model/training/core/config.py +0 -181
  160. isa_model/training/core/dataset.py +0 -222
  161. isa_model/training/core/trainer.py +0 -720
  162. isa_model/training/core/utils.py +0 -213
  163. isa_model/training/examples/intelligent_training_example.py +0 -281
  164. isa_model/training/factory.py +0 -424
  165. isa_model/training/intelligent/__init__.py +0 -25
  166. isa_model/training/intelligent/decision_engine.py +0 -643
  167. isa_model/training/intelligent/intelligent_factory.py +0 -888
  168. isa_model/training/intelligent/knowledge_base.py +0 -751
  169. isa_model/training/intelligent/resource_optimizer.py +0 -839
  170. isa_model/training/intelligent/task_classifier.py +0 -576
  171. isa_model/training/storage/__init__.py +0 -24
  172. isa_model/training/storage/core_integration.py +0 -439
  173. isa_model/training/storage/training_repository.py +0 -552
  174. isa_model/training/storage/training_storage.py +0 -628
  175. isa_model-0.4.0.dist-info/RECORD +0 -182
  176. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  177. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  178. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  179. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  180. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  181. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  182. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  183. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  184. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  185. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  186. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  187. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  188. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/WHEEL +0 -0
  189. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/top_level.txt +0 -0
isa_model/deployment/modal/__init__.py
@@ -0,0 +1,8 @@
+"""
+Modal deployment services and utilities
+"""
+
+from .deployer import ModalDeployer
+from .config import ModalConfig, ModalServiceType, create_llm_config, create_vision_config, create_audio_config, create_embedding_config
+
+__all__ = ["ModalDeployer", "ModalConfig", "ModalServiceType", "create_llm_config", "create_vision_config", "create_audio_config", "create_embedding_config"]
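For orientation, the re-exports above define the subpackage's public surface. A minimal import sketch (hypothetical usage, assuming isa-model 0.4.4 is installed):

    # Hypothetical usage; the names come from the __all__ list above.
    from isa_model.deployment.modal import ModalDeployer, ModalConfig, create_llm_config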
isa_model/deployment/modal/config.py
@@ -0,0 +1,136 @@
+"""
+Modal deployment configuration
+
+Simplified configuration for Modal-specific deployments.
+"""
+
+from dataclasses import dataclass, field
+from typing import Dict, Any, Optional
+from enum import Enum
+
+
+class ModalServiceType(Enum):
+    """Modal service types"""
+    LLM = "llm"
+    VISION = "vision"
+    AUDIO = "audio"
+    EMBEDDING = "embedding"
+    VIDEO = "video"
+
+
+@dataclass
+class ModalConfig:
+    """Configuration for Modal deployment"""
+
+    # Service identification
+    service_name: str
+    service_type: ModalServiceType
+    model_id: str
+
+    # Modal-specific settings
+    image_tag: str = "latest"
+    cpu_cores: int = 2
+    memory_gb: int = 8
+    gpu_type: Optional[str] = None  # e.g., "A10G", "T4", "A100"
+    timeout_seconds: int = 300
+
+    # Scaling configuration
+    min_instances: int = 0
+    max_instances: int = 10
+    concurrency_limit: int = 1
+
+    # Environment variables
+    environment: Dict[str, str] = field(default_factory=dict)
+
+    # Service-specific configuration
+    service_config: Dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary for serialization"""
+        return {
+            "service_name": self.service_name,
+            "service_type": self.service_type.value,
+            "model_id": self.model_id,
+            "image_tag": self.image_tag,
+            "cpu_cores": self.cpu_cores,
+            "memory_gb": self.memory_gb,
+            "gpu_type": self.gpu_type,
+            "timeout_seconds": self.timeout_seconds,
+            "min_instances": self.min_instances,
+            "max_instances": self.max_instances,
+            "concurrency_limit": self.concurrency_limit,
+            "environment": self.environment,
+            "service_config": self.service_config
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "ModalConfig":
+        """Create from dictionary"""
+        return cls(
+            service_name=data["service_name"],
+            service_type=ModalServiceType(data["service_type"]),
+            model_id=data["model_id"],
+            image_tag=data.get("image_tag", "latest"),
+            cpu_cores=data.get("cpu_cores", 2),
+            memory_gb=data.get("memory_gb", 8),
+            gpu_type=data.get("gpu_type"),
+            timeout_seconds=data.get("timeout_seconds", 300),
+            min_instances=data.get("min_instances", 0),
+            max_instances=data.get("max_instances", 10),
+            concurrency_limit=data.get("concurrency_limit", 1),
+            environment=data.get("environment", {}),
+            service_config=data.get("service_config", {})
+        )
+
+
+# Predefined configurations for common service types
+def create_llm_config(service_name: str, model_id: str, gpu_type: str = "A10G") -> ModalConfig:
+    """Create configuration for LLM service"""
+    return ModalConfig(
+        service_name=service_name,
+        service_type=ModalServiceType.LLM,
+        model_id=model_id,
+        gpu_type=gpu_type,
+        memory_gb=16,
+        timeout_seconds=600,
+        max_instances=5
+    )
+
+
+def create_vision_config(service_name: str, model_id: str, gpu_type: str = "T4") -> ModalConfig:
+    """Create configuration for vision service"""
+    return ModalConfig(
+        service_name=service_name,
+        service_type=ModalServiceType.VISION,
+        model_id=model_id,
+        gpu_type=gpu_type,
+        memory_gb=12,
+        timeout_seconds=300,
+        max_instances=10
+    )
+
+
+def create_audio_config(service_name: str, model_id: str, gpu_type: str = "T4") -> ModalConfig:
+    """Create configuration for audio service"""
+    return ModalConfig(
+        service_name=service_name,
+        service_type=ModalServiceType.AUDIO,
+        model_id=model_id,
+        gpu_type=gpu_type,
+        memory_gb=8,
+        timeout_seconds=300,
+        max_instances=8
+    )
+
+
+def create_embedding_config(service_name: str, model_id: str, gpu_type: str = "T4") -> ModalConfig:
+    """Create configuration for embedding service"""
+    return ModalConfig(
+        service_name=service_name,
+        service_type=ModalServiceType.EMBEDDING,
+        model_id=model_id,
+        gpu_type=gpu_type,
+        memory_gb=6,
+        timeout_seconds=120,
+        max_instances=15
+    )
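A short sketch of how these helpers compose, based only on the definitions above; the service name and model ID are placeholders:

    from isa_model.deployment.modal.config import ModalConfig, ModalServiceType, create_llm_config

    # create_llm_config applies the LLM preset (16 GB memory, 600 s timeout, max 5 instances).
    config = create_llm_config(service_name="demo-llm", model_id="gpt2", gpu_type="A10G")
    assert config.memory_gb == 16 and config.timeout_seconds == 600

    # to_dict()/from_dict() round-trip; the enum is serialized via its string value.
    payload = config.to_dict()
    restored = ModalConfig.from_dict(payload)
    assert restored.service_type is ModalServiceType.LLM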
isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py}
@@ -34,7 +34,7 @@ class ModelConfig:
     max_tokens: int = 2048
     estimated_cost_per_hour: float = 0.0
 
-class HuggingFaceModalDeployer:
+class ModalDeployer:
     """
     Service to automatically deploy HuggingFace models to Modal
     """
isa_model/deployment/modal/services/__init__.py
@@ -0,0 +1,3 @@
+"""
+Modal service implementations organized by capability
+"""
isa_model/deployment/modal/services/audio/__init__.py
@@ -0,0 +1 @@
+"""Audio services for Modal deployment"""
isa_model/deployment/modal/services/embedding/__init__.py
@@ -0,0 +1 @@
+"""Embedding services for Modal deployment"""
isa_model/deployment/modal/services/llm/__init__.py
@@ -0,0 +1 @@
+"""LLM services for Modal deployment"""
isa_model/deployment/modal/services/llm/isa_llm_service.py
@@ -0,0 +1,424 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+ISA LLM Service - Modal deployment for HuggingFace trained models
+Provides inference API for custom trained models
+"""
+
+import os
+import logging
+from typing import Dict, Any, List, Optional
+import modal
+
+# Modal app configuration
+app = modal.App("isa-llm-inference")
+
+# GPU configuration for inference
+GPU_CONFIG = modal.gpu.A10G()
+
+# Base image with HuggingFace transformers
+image = (
+    modal.Image.debian_slim(python_version="3.11")
+    .pip_install([
+        "torch>=2.0.0",
+        "transformers>=4.35.0",
+        "accelerate>=0.20.0",
+        "huggingface_hub>=0.17.0",
+        "peft>=0.5.0",  # For LoRA models
+        "bitsandbytes>=0.41.0",  # For quantization
+        "sentencepiece>=0.1.99",  # For tokenizers
+    ])
+)
+
+logger = logging.getLogger(__name__)
+
+@app.cls(
+    image=image,
+    gpu=GPU_CONFIG,
+    cpu=2.0,
+    memory=16384,  # 16GB memory
+    timeout=300,  # 5 minute timeout
+    container_idle_timeout=60,  # Keep warm for 1 minute
+    allow_concurrent_inputs=5,  # Allow concurrent requests
+)
+class ISALLMService:
+    """
+    ISA LLM Service for inference on HuggingFace trained models
+    Designed to work with models trained through ISA training pipeline
+    """
+
+    def __init__(self):
+        """Initialize the service (runs on container startup)"""
+        import torch
+        from transformers import AutoTokenizer, AutoModelForCausalLM
+
+        # Model will be loaded when first requested
+        self.model = None
+        self.tokenizer = None
+        self.current_model_id = None
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        logger.info(f"ISA LLM Service initialized on {self.device}")
+
+    def _load_model(self, model_id: str, hf_token: str = None):
+        """Load a specific model"""
+        import torch
+        from transformers import AutoTokenizer, AutoModelForCausalLM
+
+        if self.current_model_id == model_id and self.model is not None:
+            logger.info(f"Model {model_id} already loaded")
+            return
+
+        logger.info(f"Loading model: {model_id}")
+
+        try:
+            # Load tokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                model_id,
+                token=hf_token,
+                trust_remote_code=True
+            )
+
+            # Set pad token if not exists
+            if self.tokenizer.pad_token is None:
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+
+            # Load model with GPU optimization
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                token=hf_token,
+                torch_dtype=torch.float16,
+                device_map="auto",
+                trust_remote_code=True,
+                low_cpu_mem_usage=True
+            )
+
+            self.current_model_id = model_id
+            logger.info(f"Successfully loaded model {model_id}")
+
+        except Exception as e:
+            logger.error(f"Failed to load model {model_id}: {e}")
+            raise
+
+    @modal.method
+    def generate_text(
+        self,
+        prompt: str,
+        model_id: str,
+        hf_token: str = None,
+        max_length: int = 100,
+        temperature: float = 0.7,
+        do_sample: bool = True,
+        top_p: float = 0.9,
+        repetition_penalty: float = 1.1,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Generate text using the specified model
+
+        Args:
+            prompt: Input text prompt
+            model_id: HuggingFace model ID (e.g., "xenobordom/dialogpt-isa-trained-xxx")
+            hf_token: HuggingFace token for private models
+            max_length: Maximum generation length
+            temperature: Sampling temperature
+            do_sample: Whether to use sampling
+            top_p: Top-p sampling parameter
+            repetition_penalty: Repetition penalty
+            **kwargs: Additional generation parameters
+
+        Returns:
+            Dictionary containing generated text and metadata
+        """
+        import torch
+        import time
+
+        start_time = time.time()
+
+        try:
+            # Load model if needed
+            self._load_model(model_id, hf_token)
+
+            if self.model is None or self.tokenizer is None:
+                raise RuntimeError("Model not properly loaded")
+
+            # Tokenize input
+            inputs = self.tokenizer(
+                prompt,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=512
+            ).to(self.device)
+
+            # Generate
+            with torch.no_grad():
+                outputs = self.model.generate(
+                    **inputs,
+                    max_length=inputs["input_ids"].shape[1] + max_length,
+                    temperature=temperature,
+                    do_sample=do_sample,
+                    top_p=top_p,
+                    repetition_penalty=repetition_penalty,
+                    pad_token_id=self.tokenizer.pad_token_id,
+                    eos_token_id=self.tokenizer.eos_token_id,
+                    **kwargs
+                )
+
+            # Decode generated text
+            full_text = self.tokenizer.decode(
+                outputs[0],
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=True
+            )
+
+            # Extract only the new generated part
+            generated_text = full_text
+            if generated_text.startswith(prompt):
+                generated_text = generated_text[len(prompt):].strip()
+
+            processing_time = time.time() - start_time
+
+            return {
+                "success": True,
+                "text": generated_text,
+                "full_text": full_text,
+                "prompt": prompt,
+                "model_id": model_id,
+                "provider": "ISA",
+                "service": "isa-llm",
+                "generation_config": {
+                    "max_length": max_length,
+                    "temperature": temperature,
+                    "do_sample": do_sample,
+                    "top_p": top_p,
+                    "repetition_penalty": repetition_penalty
+                },
+                "metadata": {
+                    "processing_time": processing_time,
+                    "device": str(self.device),
+                    "input_tokens": inputs["input_ids"].shape[1],
+                    "output_tokens": outputs.shape[1]
+                }
+            }
+
+        except Exception as e:
+            logger.error(f"Error during text generation: {e}")
+            return {
+                "success": False,
+                "error": str(e),
+                "prompt": prompt,
+                "model_id": model_id,
+                "provider": "ISA",
+                "service": "isa-llm"
+            }
+
+    @modal.method
+    def chat_completion(
+        self,
+        messages: List[Dict[str, str]],
+        model_id: str,
+        hf_token: str = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Chat completion with conversation history
+
+        Args:
+            messages: List of message dictionaries with 'role' and 'content'
+            model_id: HuggingFace model ID
+            hf_token: HuggingFace token
+            **kwargs: Additional generation parameters
+
+        Returns:
+            Dictionary containing generated response and metadata
+        """
+        try:
+            # Convert messages to a single prompt
+            conversation = ""
+            for msg in messages:
+                role = msg.get("role", "user")
+                content = msg.get("content", "")
+                if role == "user":
+                    conversation += f"User: {content}\n"
+                elif role == "assistant":
+                    conversation += f"Assistant: {content}\n"
+                elif role == "system":
+                    conversation += f"System: {content}\n"
+
+            conversation += "Assistant: "
+
+            # Generate response
+            result = self.generate_text(
+                prompt=conversation,
+                model_id=model_id,
+                hf_token=hf_token,
+                **kwargs
+            )
+
+            # Format as chat response
+            if result.get("success"):
+                result["role"] = "assistant"
+                result["conversation"] = conversation
+                result["messages"] = messages
+
+            return result
+
+        except Exception as e:
+            logger.error(f"Error during chat completion: {e}")
+            return {
+                "success": False,
+                "error": str(e),
+                "messages": messages,
+                "model_id": model_id,
+                "provider": "ISA",
+                "service": "isa-llm"
+            }
+
+    @modal.method
+    def get_model_info(self, model_id: str, hf_token: str = None) -> Dict[str, Any]:
+        """Get information about the loaded model"""
+        try:
+            # Load model if needed
+            self._load_model(model_id, hf_token)
+
+            if self.model is None:
+                return {
+                    "success": False,
+                    "error": "Model not loaded"
+                }
+
+            # Get model config
+            config = self.model.config
+
+            # Count parameters
+            total_params = sum(p.numel() for p in self.model.parameters())
+            trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
+
+            return {
+                "success": True,
+                "model_id": model_id,
+                "provider": "ISA",
+                "service": "isa-llm",
+                "architecture": config.model_type if hasattr(config, 'model_type') else "unknown",
+                "vocab_size": config.vocab_size if hasattr(config, 'vocab_size') else None,
+                "hidden_size": config.hidden_size if hasattr(config, 'hidden_size') else None,
+                "num_layers": getattr(config, 'num_layers', getattr(config, 'n_layer', None)),
+                "num_attention_heads": getattr(config, 'num_attention_heads', getattr(config, 'n_head', None)),
+                "total_parameters": total_params,
+                "trainable_parameters": trainable_params,
+                "device": str(self.device),
+                "dtype": str(next(self.model.parameters()).dtype)
+            }
+
+        except Exception as e:
+            logger.error(f"Error getting model info: {e}")
+            return {
+                "success": False,
+                "error": str(e)
+            }
+
+    @modal.method
+    def health_check(self) -> Dict[str, Any]:
+        """Health check endpoint"""
+        import torch
+
+        try:
+            gpu_available = torch.cuda.is_available()
+            gpu_count = torch.cuda.device_count() if gpu_available else 0
+
+            return {
+                "success": True,
+                "status": "healthy",
+                "service": "isa-llm",
+                "provider": "ISA",
+                "device": str(self.device),
+                "gpu_available": gpu_available,
+                "gpu_count": gpu_count,
+                "current_model": self.current_model_id,
+                "memory_info": {
+                    "allocated": torch.cuda.memory_allocated() if gpu_available else 0,
+                    "cached": torch.cuda.memory_reserved() if gpu_available else 0
+                } if gpu_available else None
+            }
+
+        except Exception as e:
+            return {
+                "success": False,
+                "status": "error",
+                "error": str(e)
+            }
+
+# Deployment functions
+@app.function(
+    image=image,
+    schedule=modal.Cron("0 2 * * *"),  # Deploy daily at 2 AM
+    timeout=300
+)
+def deploy_service():
+    """Deploy the ISA LLM service"""
+    print("ISA LLM Service deployed successfully!")
+    return {"status": "deployed", "service": "isa-llm"}
+
+# Local testing function
+@app.local_entrypoint()
+def test_service():
+    """Test the ISA LLM service locally"""
+
+    # Test with our trained model
+    test_model_id = "xenobordom/dialogpt-isa-trained-1755493402"
+    test_prompt = "你好"
+
+    # Get HF token from environment
+    hf_token = os.getenv("HF_TOKEN")
+    if not hf_token:
+        print("❌ HF_TOKEN not found in environment")
+        return
+
+    print(f"🧪 Testing ISA LLM Service with model: {test_model_id}")
+
+    # Create service instance
+    service = ISALLMService()
+
+    # Test health check
+    print("📋 Testing health check...")
+    health = service.health_check.remote()
+    print(f"Health: {health}")
+
+    # Test model info
+    print("📊 Testing model info...")
+    info = service.get_model_info.remote(test_model_id, hf_token)
+    print(f"Model info: {info}")
+
+    # Test text generation
+    print("🤖 Testing text generation...")
+    result = service.generate_text.remote(
+        prompt=test_prompt,
+        model_id=test_model_id,
+        hf_token=hf_token,
+        max_length=30,
+        temperature=0.7
+    )
+    print(f"Generation result: {result}")
+
+    # Test chat completion
+    print("💬 Testing chat completion...")
+    messages = [
+        {"role": "user", "content": "你好"},
+        {"role": "assistant", "content": "你好!很高兴见到你。"},
+        {"role": "user", "content": "你能帮我做什么?"}
+    ]
+    chat_result = service.chat_completion.remote(
+        messages=messages,
+        model_id=test_model_id,
+        hf_token=hf_token,
+        max_length=30
+    )
+    print(f"Chat result: {chat_result}")
+
+    print("✅ ISA LLM Service test completed!")
+
+if __name__ == "__main__":
+    # For local development
+    test_service()
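A hedged client-side sketch for the service above: after the app is deployed (e.g. via `modal deploy`), the class methods can be invoked with .remote(). The `modal.Cls.lookup` call is an assumption about the installed Modal SDK version (newer releases expose `modal.Cls.from_name` instead), and the model ID is the example one from the test entrypoint:

    import os
    import modal

    # Assumes the app was deployed under the name "isa-llm-inference" (see modal.App above).
    # modal.Cls.lookup exists in older Modal SDKs; newer ones use modal.Cls.from_name.
    ISALLMService = modal.Cls.lookup("isa-llm-inference", "ISALLMService")
    service = ISALLMService()

    result = service.generate_text.remote(
        prompt="Hello",
        model_id="xenobordom/dialogpt-isa-trained-1755493402",  # example model from test_service
        hf_token=os.getenv("HF_TOKEN"),
        max_length=50,
    )
    print(result["text"] if result.get("success") else result["error"])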
isa_model/deployment/modal/services/video/__init__.py
@@ -0,0 +1 @@
+"""Video services for Modal deployment"""
isa_model/deployment/modal/services/vision/__init__.py
@@ -0,0 +1 @@
+"""Vision services for Modal deployment"""
isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py
@@ -0,0 +1,48 @@
+"""
+tenant-a-service LLM Service for Modal
+
+Auto-generated service for model: gpt2
+Architecture: gpt
+"""
+
+import modal
+from typing import Dict, Any, List
+
+app = modal.App("tenant-a-service")
+
+image = modal.Image.debian_slim().pip_install(
+    "accelerate>=0.24.0", "transformers>=4.35.0", "httpx>=0.26.0", "torch>=2.0.0", "requests>=2.31.0", "numpy>=1.24.0", "pydantic>=2.0.0"
+)
+
+@app.cls(
+    image=image,
+    gpu=modal.gpu.A10G(count=1),
+    container_idle_timeout=300,
+    memory=32768
+)
+class Tenant_A_ServiceService:
+
+    @modal.enter()
+    def load_model(self):
+        import torch
+        from transformers import AutoTokenizer, AutoModelForCausalLM
+
+        self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        self.model = AutoModelForCausalLM.from_pretrained(
+            "gpt2",
+            torch_dtype=torch.float16,
+            device_map="auto",
+            trust_remote_code=True
+        )
+
+    @modal.method()
+    def generate(self, messages: List[Dict[str, str]], **kwargs):
+        # Generate response (simplified)
+        prompt = messages[-1]["content"] if messages else ""
+        return {"response": f"Generated response for: {prompt}", "model": "gpt2"}
+
+@app.function(image=image)
+@modal.web_endpoint(method="POST")
+def inference_endpoint(item: Dict[str, Any]):
+    service = Tenant_A_ServiceService()
+    return service.generate(**item)
isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py
@@ -0,0 +1,48 @@
+"""
+prefix-test-service LLM Service for Modal
+
+Auto-generated service for model: gpt2
+Architecture: gpt
+"""
+
+import modal
+from typing import Dict, Any, List
+
+app = modal.App("prefix-test-service")
+
+image = modal.Image.debian_slim().pip_install(
+    "accelerate>=0.24.0", "transformers>=4.35.0", "httpx>=0.26.0", "torch>=2.0.0", "requests>=2.31.0", "numpy>=1.24.0", "pydantic>=2.0.0"
+)
+
+@app.cls(
+    image=image,
+    gpu=modal.gpu.A10G(count=1),
+    container_idle_timeout=300,
+    memory=32768
+)
+class Prefix_Test_ServiceService:
+
+    @modal.enter()
+    def load_model(self):
+        import torch
+        from transformers import AutoTokenizer, AutoModelForCausalLM
+
+        self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        self.model = AutoModelForCausalLM.from_pretrained(
+            "gpt2",
+            torch_dtype=torch.float16,
+            device_map="auto",
+            trust_remote_code=True
+        )
+
+    @modal.method()
+    def generate(self, messages: List[Dict[str, str]], **kwargs):
+        # Generate response (simplified)
+        prompt = messages[-1]["content"] if messages else ""
+        return {"response": f"Generated response for: {prompt}", "model": "gpt2"}
+
+@app.function(image=image)
+@modal.web_endpoint(method="POST")
+def inference_endpoint(item: Dict[str, Any]):
+    service = Prefix_Test_ServiceService()
+    return service.generate(**item)
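Both auto-generated services expose the same POST web endpoint, which unpacks the JSON body into generate(**item), so the payload must carry a "messages" list. A minimal client sketch; the URL is hypothetical, since Modal assigns workspace-specific hostnames when the endpoint is deployed:

    import requests

    # Hypothetical URL of the form https://<workspace>--<app>-inference-endpoint.modal.run
    url = "https://example-workspace--tenant-a-service-inference-endpoint.modal.run"

    # The body is unpacked into generate(**item), so it must provide "messages".
    payload = {"messages": [{"role": "user", "content": "Hello"}]}
    resp = requests.post(url, json=payload, timeout=30)
    print(resp.json())  # e.g. {"response": "Generated response for: Hello", "model": "gpt2"}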