isa-model 0.4.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff compares the contents of publicly available package versions that were released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (199)
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +40 -17
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/storage/hf_storage.py +1 -1
  26. isa_model/core/types.py +1 -0
  27. isa_model/deployment/__init__.py +5 -48
  28. isa_model/deployment/core/__init__.py +2 -31
  29. isa_model/deployment/core/deployment_manager.py +1278 -370
  30. isa_model/deployment/local/__init__.py +31 -0
  31. isa_model/deployment/local/config.py +248 -0
  32. isa_model/deployment/local/gpu_gateway.py +607 -0
  33. isa_model/deployment/local/health_checker.py +428 -0
  34. isa_model/deployment/local/provider.py +586 -0
  35. isa_model/deployment/local/tensorrt_service.py +621 -0
  36. isa_model/deployment/local/transformers_service.py +644 -0
  37. isa_model/deployment/local/vllm_service.py +527 -0
  38. isa_model/deployment/modal/__init__.py +8 -0
  39. isa_model/deployment/modal/config.py +136 -0
  40. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  41. isa_model/deployment/modal/services/__init__.py +3 -0
  42. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  43. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  44. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  45. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  46. isa_model/deployment/modal/services/video/__init__.py +1 -0
  47. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  48. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  49. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  50. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  51. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  52. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  53. isa_model/deployment/storage/__init__.py +5 -0
  54. isa_model/deployment/storage/deployment_repository.py +824 -0
  55. isa_model/deployment/triton/__init__.py +10 -0
  56. isa_model/deployment/triton/config.py +196 -0
  57. isa_model/deployment/triton/configs/__init__.py +1 -0
  58. isa_model/deployment/triton/provider.py +512 -0
  59. isa_model/deployment/triton/scripts/__init__.py +1 -0
  60. isa_model/deployment/triton/templates/__init__.py +1 -0
  61. isa_model/inference/__init__.py +47 -1
  62. isa_model/inference/ai_factory.py +137 -10
  63. isa_model/inference/legacy_services/__init__.py +21 -0
  64. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  65. isa_model/inference/legacy_services/model_service.py +573 -0
  66. isa_model/inference/legacy_services/model_serving.py +717 -0
  67. isa_model/inference/legacy_services/model_training.py +561 -0
  68. isa_model/inference/models/__init__.py +21 -0
  69. isa_model/inference/models/inference_config.py +551 -0
  70. isa_model/inference/models/inference_record.py +675 -0
  71. isa_model/inference/models/performance_models.py +714 -0
  72. isa_model/inference/repositories/__init__.py +9 -0
  73. isa_model/inference/repositories/inference_repository.py +828 -0
  74. isa_model/inference/services/audio/base_stt_service.py +184 -11
  75. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  76. isa_model/inference/services/custom_model_manager.py +277 -0
  77. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  78. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  79. isa_model/inference/services/llm/__init__.py +10 -2
  80. isa_model/inference/services/llm/base_llm_service.py +335 -24
  81. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  82. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  83. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  84. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  85. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  86. isa_model/inference/services/llm/local_llm_service.py +747 -0
  87. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  88. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  89. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  90. isa_model/inference/services/vision/__init__.py +22 -1
  91. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  92. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  93. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  94. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  95. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  96. isa_model/serving/api/cache_manager.py +245 -0
  97. isa_model/serving/api/dependencies/__init__.py +1 -0
  98. isa_model/serving/api/dependencies/auth.py +194 -0
  99. isa_model/serving/api/dependencies/database.py +139 -0
  100. isa_model/serving/api/error_handlers.py +284 -0
  101. isa_model/serving/api/fastapi_server.py +172 -22
  102. isa_model/serving/api/middleware/auth.py +8 -2
  103. isa_model/serving/api/middleware/security.py +23 -33
  104. isa_model/serving/api/middleware/tenant_context.py +414 -0
  105. isa_model/serving/api/routes/analytics.py +4 -1
  106. isa_model/serving/api/routes/config.py +645 -0
  107. isa_model/serving/api/routes/deployment_billing.py +315 -0
  108. isa_model/serving/api/routes/deployments.py +138 -2
  109. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  110. isa_model/serving/api/routes/health.py +32 -12
  111. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  112. isa_model/serving/api/routes/local_deployments.py +448 -0
  113. isa_model/serving/api/routes/tenants.py +575 -0
  114. isa_model/serving/api/routes/unified.py +680 -18
  115. isa_model/serving/api/routes/webhooks.py +479 -0
  116. isa_model/serving/api/startup.py +68 -54
  117. isa_model/utils/gpu_utils.py +311 -0
  118. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
  119. isa_model-0.4.3.dist-info/RECORD +193 -0
  120. isa_model/core/storage/minio_storage.py +0 -0
  121. isa_model/deployment/cloud/__init__.py +0 -9
  122. isa_model/deployment/cloud/modal/__init__.py +0 -10
  123. isa_model/deployment/core/deployment_config.py +0 -356
  124. isa_model/deployment/core/isa_deployment_service.py +0 -401
  125. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  126. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  127. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  128. isa_model/deployment/runtime/deployed_service.py +0 -338
  129. isa_model/deployment/services/__init__.py +0 -9
  130. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  131. isa_model/deployment/services/model_service.py +0 -332
  132. isa_model/deployment/services/service_monitor.py +0 -356
  133. isa_model/deployment/services/service_registry.py +0 -527
  134. isa_model/eval/__init__.py +0 -92
  135. isa_model/eval/benchmarks/__init__.py +0 -27
  136. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  137. isa_model/eval/benchmarks.py +0 -701
  138. isa_model/eval/config/__init__.py +0 -10
  139. isa_model/eval/config/evaluation_config.py +0 -108
  140. isa_model/eval/evaluators/__init__.py +0 -24
  141. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  142. isa_model/eval/evaluators/base_evaluator.py +0 -503
  143. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  144. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  145. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  146. isa_model/eval/example_evaluation.py +0 -395
  147. isa_model/eval/factory.py +0 -798
  148. isa_model/eval/infrastructure/__init__.py +0 -24
  149. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  150. isa_model/eval/isa_benchmarks.py +0 -700
  151. isa_model/eval/isa_integration.py +0 -582
  152. isa_model/eval/metrics.py +0 -951
  153. isa_model/eval/tests/unit/test_basic.py +0 -396
  154. isa_model/serving/api/routes/evaluations.py +0 -579
  155. isa_model/training/__init__.py +0 -168
  156. isa_model/training/annotation/annotation_schema.py +0 -47
  157. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  158. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  159. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  160. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  161. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  162. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  163. isa_model/training/annotation/views/annotation_controller.py +0 -158
  164. isa_model/training/cloud/__init__.py +0 -22
  165. isa_model/training/cloud/job_orchestrator.py +0 -402
  166. isa_model/training/cloud/runpod_trainer.py +0 -454
  167. isa_model/training/cloud/storage_manager.py +0 -482
  168. isa_model/training/core/__init__.py +0 -26
  169. isa_model/training/core/config.py +0 -181
  170. isa_model/training/core/dataset.py +0 -222
  171. isa_model/training/core/trainer.py +0 -720
  172. isa_model/training/core/utils.py +0 -213
  173. isa_model/training/examples/intelligent_training_example.py +0 -281
  174. isa_model/training/factory.py +0 -424
  175. isa_model/training/intelligent/__init__.py +0 -25
  176. isa_model/training/intelligent/decision_engine.py +0 -643
  177. isa_model/training/intelligent/intelligent_factory.py +0 -888
  178. isa_model/training/intelligent/knowledge_base.py +0 -751
  179. isa_model/training/intelligent/resource_optimizer.py +0 -839
  180. isa_model/training/intelligent/task_classifier.py +0 -576
  181. isa_model/training/storage/__init__.py +0 -24
  182. isa_model/training/storage/core_integration.py +0 -439
  183. isa_model/training/storage/training_repository.py +0 -552
  184. isa_model/training/storage/training_storage.py +0 -628
  185. isa_model-0.4.0.dist-info/RECORD +0 -182
  186. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  187. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  188. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  189. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  190. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  191. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  192. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  193. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  194. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  195. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  196. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  197. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  198. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  199. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,424 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ """
+ ISA LLM Service - Modal deployment for HuggingFace trained models
+ Provides an inference API for custom trained models
+ """
+
+ import os
+ import logging
+ from typing import Dict, Any, List, Optional
+ import modal
+
+ # Modal app configuration
+ app = modal.App("isa-llm-inference")
+
+ # GPU configuration for inference
+ GPU_CONFIG = modal.gpu.A10G()
+
+ # Base image with HuggingFace transformers
+ image = (
+     modal.Image.debian_slim(python_version="3.11")
+     .pip_install([
+         "torch>=2.0.0",
+         "transformers>=4.35.0",
+         "accelerate>=0.20.0",
+         "huggingface_hub>=0.17.0",
+         "peft>=0.5.0",  # For LoRA models
+         "bitsandbytes>=0.41.0",  # For quantization
+         "sentencepiece>=0.1.99",  # For tokenizers
+     ])
+ )
+
+ logger = logging.getLogger(__name__)
+
+ @app.cls(
+     image=image,
+     gpu=GPU_CONFIG,
+     cpu=2.0,
+     memory=16384,  # 16GB memory
+     timeout=300,  # 5 minute timeout
+     container_idle_timeout=60,  # Keep warm for 1 minute
+     allow_concurrent_inputs=5,  # Allow concurrent requests
+ )
+ class ISALLMService:
+     """
+     ISA LLM Service for inference on HuggingFace trained models
+     Designed to work with models trained through the ISA training pipeline
+     """
+
+     def __init__(self):
+         """Initialize the service (runs on container startup)"""
+         import torch
+         from transformers import AutoTokenizer, AutoModelForCausalLM
+
+         # Model will be loaded when first requested
+         self.model = None
+         self.tokenizer = None
+         self.current_model_id = None
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+         logger.info(f"ISA LLM Service initialized on {self.device}")
+
+     def _load_model(self, model_id: str, hf_token: Optional[str] = None):
+         """Load a specific model"""
+         import torch
+         from transformers import AutoTokenizer, AutoModelForCausalLM
+
+         if self.current_model_id == model_id and self.model is not None:
+             logger.info(f"Model {model_id} already loaded")
+             return
+
+         logger.info(f"Loading model: {model_id}")
+
+         try:
+             # Load tokenizer
+             self.tokenizer = AutoTokenizer.from_pretrained(
+                 model_id,
+                 token=hf_token,
+                 trust_remote_code=True
+             )
+
+             # Set pad token if it is missing
+             if self.tokenizer.pad_token is None:
+                 self.tokenizer.pad_token = self.tokenizer.eos_token
+
+             # Load model with GPU optimization
+             self.model = AutoModelForCausalLM.from_pretrained(
+                 model_id,
+                 token=hf_token,
+                 torch_dtype=torch.float16,
+                 device_map="auto",
+                 trust_remote_code=True,
+                 low_cpu_mem_usage=True
+             )
+
+             self.current_model_id = model_id
+             logger.info(f"Successfully loaded model {model_id}")
+
+         except Exception as e:
+             logger.error(f"Failed to load model {model_id}: {e}")
+             raise
+
+     @modal.method()
+     def generate_text(
+         self,
+         prompt: str,
+         model_id: str,
+         hf_token: Optional[str] = None,
+         max_length: int = 100,
+         temperature: float = 0.7,
+         do_sample: bool = True,
+         top_p: float = 0.9,
+         repetition_penalty: float = 1.1,
+         **kwargs
+     ) -> Dict[str, Any]:
+         """
+         Generate text using the specified model
+
+         Args:
+             prompt: Input text prompt
+             model_id: HuggingFace model ID (e.g., "xenobordom/dialogpt-isa-trained-xxx")
+             hf_token: HuggingFace token for private models
+             max_length: Maximum generation length
+             temperature: Sampling temperature
+             do_sample: Whether to use sampling
+             top_p: Top-p sampling parameter
+             repetition_penalty: Repetition penalty
+             **kwargs: Additional generation parameters
+
+         Returns:
+             Dictionary containing generated text and metadata
+         """
+         import torch
+         import time
+
+         start_time = time.time()
+
+         try:
+             # Load model if needed
+             self._load_model(model_id, hf_token)
+
+             if self.model is None or self.tokenizer is None:
+                 raise RuntimeError("Model not properly loaded")
+
+             # Tokenize input
+             inputs = self.tokenizer(
+                 prompt,
+                 return_tensors="pt",
+                 padding=True,
+                 truncation=True,
+                 max_length=512
+             ).to(self.device)
+
+             # Generate
+             with torch.no_grad():
+                 outputs = self.model.generate(
+                     **inputs,
+                     max_length=inputs["input_ids"].shape[1] + max_length,
+                     temperature=temperature,
+                     do_sample=do_sample,
+                     top_p=top_p,
+                     repetition_penalty=repetition_penalty,
+                     pad_token_id=self.tokenizer.pad_token_id,
+                     eos_token_id=self.tokenizer.eos_token_id,
+                     **kwargs
+                 )
+
+             # Decode generated text
+             full_text = self.tokenizer.decode(
+                 outputs[0],
+                 skip_special_tokens=True,
+                 clean_up_tokenization_spaces=True
+             )
+
+             # Extract only the newly generated part
+             generated_text = full_text
+             if generated_text.startswith(prompt):
+                 generated_text = generated_text[len(prompt):].strip()
+
+             processing_time = time.time() - start_time
+
+             return {
+                 "success": True,
+                 "text": generated_text,
+                 "full_text": full_text,
+                 "prompt": prompt,
+                 "model_id": model_id,
+                 "provider": "ISA",
+                 "service": "isa-llm",
+                 "generation_config": {
+                     "max_length": max_length,
+                     "temperature": temperature,
+                     "do_sample": do_sample,
+                     "top_p": top_p,
+                     "repetition_penalty": repetition_penalty
+                 },
+                 "metadata": {
+                     "processing_time": processing_time,
+                     "device": str(self.device),
+                     "input_tokens": inputs["input_ids"].shape[1],
+                     "output_tokens": outputs.shape[1]
+                 }
+             }
+
+         except Exception as e:
+             logger.error(f"Error during text generation: {e}")
+             return {
+                 "success": False,
+                 "error": str(e),
+                 "prompt": prompt,
+                 "model_id": model_id,
+                 "provider": "ISA",
+                 "service": "isa-llm"
+             }
+
+     @modal.method()
+     def chat_completion(
+         self,
+         messages: List[Dict[str, str]],
+         model_id: str,
+         hf_token: Optional[str] = None,
+         **kwargs
+     ) -> Dict[str, Any]:
+         """
+         Chat completion with conversation history
+
+         Args:
+             messages: List of message dictionaries with 'role' and 'content'
+             model_id: HuggingFace model ID
+             hf_token: HuggingFace token
+             **kwargs: Additional generation parameters
+
+         Returns:
+             Dictionary containing generated response and metadata
+         """
+         try:
+             # Convert messages to a single prompt
+             conversation = ""
+             for msg in messages:
+                 role = msg.get("role", "user")
+                 content = msg.get("content", "")
+                 if role == "user":
+                     conversation += f"User: {content}\n"
+                 elif role == "assistant":
+                     conversation += f"Assistant: {content}\n"
+                 elif role == "system":
+                     conversation += f"System: {content}\n"
+
+             conversation += "Assistant: "
+
+             # Generate response
+             result = self.generate_text(
+                 prompt=conversation,
+                 model_id=model_id,
+                 hf_token=hf_token,
+                 **kwargs
+             )
+
+             # Format as chat response
+             if result.get("success"):
+                 result["role"] = "assistant"
+                 result["conversation"] = conversation
+                 result["messages"] = messages
+
+             return result
+
+         except Exception as e:
+             logger.error(f"Error during chat completion: {e}")
+             return {
+                 "success": False,
+                 "error": str(e),
+                 "messages": messages,
+                 "model_id": model_id,
+                 "provider": "ISA",
+                 "service": "isa-llm"
+             }
+
+     @modal.method()
+     def get_model_info(self, model_id: str, hf_token: Optional[str] = None) -> Dict[str, Any]:
+         """Get information about the loaded model"""
+         try:
+             # Load model if needed
+             self._load_model(model_id, hf_token)
+
+             if self.model is None:
+                 return {
+                     "success": False,
+                     "error": "Model not loaded"
+                 }
+
+             # Get model config
+             config = self.model.config
+
+             # Count parameters
+             total_params = sum(p.numel() for p in self.model.parameters())
+             trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
+
+             return {
+                 "success": True,
+                 "model_id": model_id,
+                 "provider": "ISA",
+                 "service": "isa-llm",
+                 "architecture": config.model_type if hasattr(config, 'model_type') else "unknown",
+                 "vocab_size": config.vocab_size if hasattr(config, 'vocab_size') else None,
+                 "hidden_size": config.hidden_size if hasattr(config, 'hidden_size') else None,
+                 "num_layers": getattr(config, 'num_layers', getattr(config, 'n_layer', None)),
+                 "num_attention_heads": getattr(config, 'num_attention_heads', getattr(config, 'n_head', None)),
+                 "total_parameters": total_params,
+                 "trainable_parameters": trainable_params,
+                 "device": str(self.device),
+                 "dtype": str(next(self.model.parameters()).dtype)
+             }
+
+         except Exception as e:
+             logger.error(f"Error getting model info: {e}")
+             return {
+                 "success": False,
+                 "error": str(e)
+             }
+
+     @modal.method()
+     def health_check(self) -> Dict[str, Any]:
+         """Health check endpoint"""
+         import torch
+
+         try:
+             gpu_available = torch.cuda.is_available()
+             gpu_count = torch.cuda.device_count() if gpu_available else 0
+
+             return {
+                 "success": True,
+                 "status": "healthy",
+                 "service": "isa-llm",
+                 "provider": "ISA",
+                 "device": str(self.device),
+                 "gpu_available": gpu_available,
+                 "gpu_count": gpu_count,
+                 "current_model": self.current_model_id,
+                 "memory_info": {
+                     "allocated": torch.cuda.memory_allocated() if gpu_available else 0,
+                     "cached": torch.cuda.memory_reserved() if gpu_available else 0
+                 } if gpu_available else None
+             }
+
+         except Exception as e:
+             return {
+                 "success": False,
+                 "status": "error",
+                 "error": str(e)
+             }
+
+ # Deployment functions
+ @app.function(
+     image=image,
+     schedule=modal.Cron("0 2 * * *"),  # Deploy daily at 2 AM
+     timeout=300
+ )
+ def deploy_service():
+     """Deploy the ISA LLM service"""
+     print("ISA LLM Service deployed successfully!")
+     return {"status": "deployed", "service": "isa-llm"}
+
+ # Local testing function
+ @app.local_entrypoint()
+ def test_service():
+     """Test the ISA LLM service locally"""
+
+     # Test with our trained model
+     test_model_id = "xenobordom/dialogpt-isa-trained-1755493402"
+     test_prompt = "你好"
+
+     # Get HF token from environment
+     hf_token = os.getenv("HF_TOKEN")
+     if not hf_token:
+         print("❌ HF_TOKEN not found in environment")
+         return
+
+     print(f"🧪 Testing ISA LLM Service with model: {test_model_id}")
+
+     # Create service instance
+     service = ISALLMService()
+
+     # Test health check
+     print("📋 Testing health check...")
+     health = service.health_check.remote()
+     print(f"Health: {health}")
+
+     # Test model info
+     print("📊 Testing model info...")
+     info = service.get_model_info.remote(test_model_id, hf_token)
+     print(f"Model info: {info}")
+
+     # Test text generation
+     print("🤖 Testing text generation...")
+     result = service.generate_text.remote(
+         prompt=test_prompt,
+         model_id=test_model_id,
+         hf_token=hf_token,
+         max_length=30,
+         temperature=0.7
+     )
+     print(f"Generation result: {result}")
+
+     # Test chat completion
+     print("💬 Testing chat completion...")
+     messages = [
+         {"role": "user", "content": "你好"},
+         {"role": "assistant", "content": "你好!很高兴见到你。"},
+         {"role": "user", "content": "你能帮我做什么?"}
+     ]
+     chat_result = service.chat_completion.remote(
+         messages=messages,
+         model_id=test_model_id,
+         hf_token=hf_token,
+         max_length=30
+     )
+     print(f"Chat result: {chat_result}")
+
+     print("✅ ISA LLM Service test completed!")
+
+ if __name__ == "__main__":
+     # For local development
+     test_service()
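
Deployed, the class above is consumed through Modal's remote-call interface. Below is a minimal client-side sketch; the app name "isa-llm-inference", the class and method names, and the test model ID all come from the file above, while the lookup call and the prompt are illustrative (Cls.lookup is the classic modal client API; newer clients expose modal.Cls.from_name instead):

    import os
    import modal

    # Look up the deployed class by app name and class name.
    ISALLMService = modal.Cls.lookup("isa-llm-inference", "ISALLMService")
    service = ISALLMService()

    # Remote call mirroring the generate_text signature defined above.
    result = service.generate_text.remote(
        prompt="Hello",
        model_id="xenobordom/dialogpt-isa-trained-1755493402",
        hf_token=os.getenv("HF_TOKEN"),
        max_length=50,
    )
    print(result["text"] if result.get("success") else result["error"])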
@@ -0,0 +1 @@
+ """Video services for Modal deployment"""
@@ -0,0 +1 @@
+ """Vision services for Modal deployment"""
@@ -0,0 +1,48 @@
+ """
+ tenant-a-service LLM Service for Modal
+
+ Auto-generated service for model: gpt2
+ Architecture: gpt
+ """
+
+ import modal
+ from typing import Dict, Any, List
+
+ app = modal.App("tenant-a-service")
+
+ image = modal.Image.debian_slim().pip_install(
+     "accelerate>=0.24.0", "transformers>=4.35.0", "httpx>=0.26.0", "torch>=2.0.0", "requests>=2.31.0", "numpy>=1.24.0", "pydantic>=2.0.0"
+ )
+
+ @app.cls(
+     image=image,
+     gpu=modal.gpu.A10G(count=1),
+     container_idle_timeout=300,
+     memory=32768
+ )
+ class Tenant_A_ServiceService:
+
+     @modal.enter()
+     def load_model(self):
+         import torch
+         from transformers import AutoTokenizer, AutoModelForCausalLM
+
+         self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
+         self.model = AutoModelForCausalLM.from_pretrained(
+             "gpt2",
+             torch_dtype=torch.float16,
+             device_map="auto",
+             trust_remote_code=True
+         )
+
+     @modal.method()
+     def generate(self, messages: List[Dict[str, str]], **kwargs):
+         # Generate response (simplified)
+         prompt = messages[-1]["content"] if messages else ""
+         return {"response": f"Generated response for: {prompt}", "model": "gpt2"}
+
+ @app.function(image=image)
+ @modal.web_endpoint(method="POST")
+ def inference_endpoint(item: Dict[str, Any]):
+     service = Tenant_A_ServiceService()
+     return service.generate(**item)
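
This generated service and the near-identical ones below all expose the same POST endpoint, so one client sketch covers them. The URL is a placeholder; Modal assigns the real hostname (typically workspace--app-name-function-name.modal.run) at deploy time:

    import requests

    # The payload dict is unpacked into generate(**item) by inference_endpoint.
    payload = {"messages": [{"role": "user", "content": "Hello"}]}

    resp = requests.post(
        "https://<workspace>--tenant-a-service-inference-endpoint.modal.run",  # placeholder
        json=payload,
        timeout=30,
    )
    print(resp.json())  # e.g. {"response": "Generated response for: Hello", "model": "gpt2"}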
@@ -0,0 +1,48 @@
+ """
+ prefix-test-service LLM Service for Modal
+
+ Auto-generated service for model: gpt2
+ Architecture: gpt
+ """
+
+ import modal
+ from typing import Dict, Any, List
+
+ app = modal.App("prefix-test-service")
+
+ image = modal.Image.debian_slim().pip_install(
+     "accelerate>=0.24.0", "transformers>=4.35.0", "httpx>=0.26.0", "torch>=2.0.0", "requests>=2.31.0", "numpy>=1.24.0", "pydantic>=2.0.0"
+ )
+
+ @app.cls(
+     image=image,
+     gpu=modal.gpu.A10G(count=1),
+     container_idle_timeout=300,
+     memory=32768
+ )
+ class Prefix_Test_ServiceService:
+
+     @modal.enter()
+     def load_model(self):
+         import torch
+         from transformers import AutoTokenizer, AutoModelForCausalLM
+
+         self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
+         self.model = AutoModelForCausalLM.from_pretrained(
+             "gpt2",
+             torch_dtype=torch.float16,
+             device_map="auto",
+             trust_remote_code=True
+         )
+
+     @modal.method()
+     def generate(self, messages: List[Dict[str, str]], **kwargs):
+         # Generate response (simplified)
+         prompt = messages[-1]["content"] if messages else ""
+         return {"response": f"Generated response for: {prompt}", "model": "gpt2"}
+
+ @app.function(image=image)
+ @modal.web_endpoint(method="POST")
+ def inference_endpoint(item: Dict[str, Any]):
+     service = Prefix_Test_ServiceService()
+     return service.generate(**item)
@@ -0,0 +1,48 @@
+ """
+ test-llm-service LLM Service for Modal
+
+ Auto-generated service for model: gpt2
+ Architecture: gpt
+ """
+
+ import modal
+ from typing import Dict, Any, List
+
+ app = modal.App("test-llm-service")
+
+ image = modal.Image.debian_slim().pip_install(
+     "torch>=2.0.0", "httpx>=0.26.0", "transformers>=4.35.0", "requests>=2.31.0", "pydantic>=2.0.0", "numpy>=1.24.0", "accelerate>=0.24.0"
+ )
+
+ @app.cls(
+     image=image,
+     gpu=modal.gpu.A10G(count=1),
+     container_idle_timeout=300,
+     memory=32768
+ )
+ class Test_Llm_ServiceService:
+
+     @modal.enter()
+     def load_model(self):
+         import torch
+         from transformers import AutoTokenizer, AutoModelForCausalLM
+
+         self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
+         self.model = AutoModelForCausalLM.from_pretrained(
+             "gpt2",
+             torch_dtype=torch.float16,
+             device_map="auto",
+             trust_remote_code=True
+         )
+
+     @modal.method()
+     def generate(self, messages: List[Dict[str, str]], **kwargs):
+         # Generate response (simplified)
+         prompt = messages[-1]["content"] if messages else ""
+         return {"response": f"Generated response for: {prompt}", "model": "gpt2"}
+
+ @app.function(image=image)
+ @modal.web_endpoint(method="POST")
+ def inference_endpoint(item: Dict[str, Any]):
+     service = Test_Llm_ServiceService()
+     return service.generate(**item)
@@ -0,0 +1,48 @@
+ """
+ test-monitoring-gpt2 LLM Service for Modal
+
+ Auto-generated service for model: gpt2
+ Architecture: gpt
+ """
+
+ import modal
+ from typing import Dict, Any, List
+
+ app = modal.App("test-monitoring-gpt2")
+
+ image = modal.Image.debian_slim().pip_install(
+     "numpy>=1.24.0", "requests>=2.31.0", "accelerate>=0.24.0", "httpx>=0.26.0", "pydantic>=2.0.0", "transformers>=4.35.0", "torch>=2.0.0"
+ )
+
+ @app.cls(
+     image=image,
+     gpu=modal.gpu.A10G(count=1),
+     container_idle_timeout=300,
+     memory=32768
+ )
+ class Test_Monitoring_Gpt2Service:
+
+     @modal.enter()
+     def load_model(self):
+         import torch
+         from transformers import AutoTokenizer, AutoModelForCausalLM
+
+         self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
+         self.model = AutoModelForCausalLM.from_pretrained(
+             "gpt2",
+             torch_dtype=torch.float16,
+             device_map="auto",
+             trust_remote_code=True
+         )
+
+     @modal.method()
+     def generate(self, messages: List[Dict[str, str]], **kwargs):
+         # Generate response (simplified)
+         prompt = messages[-1]["content"] if messages else ""
+         return {"response": f"Generated response for: {prompt}", "model": "gpt2"}
+
+ @app.function(image=image)
+ @modal.web_endpoint(method="POST")
+ def inference_endpoint(item: Dict[str, Any]):
+     service = Test_Monitoring_Gpt2Service()
+     return service.generate(**item)
@@ -0,0 +1,48 @@
+ """
+ test-monitoring-gpt2 LLM Service for Modal
+
+ Auto-generated service for model: gpt2
+ Architecture: gpt
+ """
+
+ import modal
+ from typing import Dict, Any, List
+
+ app = modal.App("test-monitoring-gpt2")
+
+ image = modal.Image.debian_slim().pip_install(
+     "transformers>=4.35.0", "torch>=2.0.0", "accelerate>=0.24.0", "httpx>=0.26.0", "numpy>=1.24.0", "requests>=2.31.0", "pydantic>=2.0.0"
+ )
+
+ @app.cls(
+     image=image,
+     gpu=modal.gpu.A10G(count=1),
+     container_idle_timeout=300,
+     memory=32768
+ )
+ class Test_Monitoring_Gpt2Service:
+
+     @modal.enter()
+     def load_model(self):
+         import torch
+         from transformers import AutoTokenizer, AutoModelForCausalLM
+
+         self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
+         self.model = AutoModelForCausalLM.from_pretrained(
+             "gpt2",
+             torch_dtype=torch.float16,
+             device_map="auto",
+             trust_remote_code=True
+         )
+
+     @modal.method()
+     def generate(self, messages: List[Dict[str, str]], **kwargs):
+         # Generate response (simplified)
+         prompt = messages[-1]["content"] if messages else ""
+         return {"response": f"Generated response for: {prompt}", "model": "gpt2"}
+
+ @app.function(image=image)
+ @modal.web_endpoint(method="POST")
+ def inference_endpoint(item: Dict[str, Any]):
+     service = Test_Monitoring_Gpt2Service()
+     return service.generate(**item)
@@ -0,0 +1,5 @@
+ """Storage and persistence for deployments"""
+
+ from .deployment_repository import DeploymentRepository
+
+ __all__ = ["DeploymentRepository"]
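
The new package re-exports the repository at a stable path. A usage sketch: only the import path is established by the __init__.py above; constructor arguments are not shown in this diff and are therefore omitted here.

    from isa_model.deployment.storage import DeploymentRepository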