isa-model 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. isa_model/client.py +1166 -584
  2. isa_model/core/cache/redis_cache.py +410 -0
  3. isa_model/core/config/config_manager.py +282 -12
  4. isa_model/core/config.py +91 -1
  5. isa_model/core/database/__init__.py +1 -0
  6. isa_model/core/database/direct_db_client.py +114 -0
  7. isa_model/core/database/migration_manager.py +563 -0
  8. isa_model/core/database/migrations.py +297 -0
  9. isa_model/core/database/supabase_client.py +258 -0
  10. isa_model/core/dependencies.py +316 -0
  11. isa_model/core/discovery/__init__.py +19 -0
  12. isa_model/core/discovery/consul_discovery.py +190 -0
  13. isa_model/core/logging/__init__.py +54 -0
  14. isa_model/core/logging/influx_logger.py +523 -0
  15. isa_model/core/logging/loki_logger.py +160 -0
  16. isa_model/core/models/__init__.py +46 -0
  17. isa_model/core/models/config_models.py +625 -0
  18. isa_model/core/models/deployment_billing_tracker.py +430 -0
  19. isa_model/core/models/model_billing_tracker.py +60 -88
  20. isa_model/core/models/model_manager.py +66 -25
  21. isa_model/core/models/model_metadata.py +690 -0
  22. isa_model/core/models/model_repo.py +217 -55
  23. isa_model/core/models/model_statistics_tracker.py +234 -0
  24. isa_model/core/models/model_storage.py +0 -1
  25. isa_model/core/models/model_version_manager.py +959 -0
  26. isa_model/core/models/system_models.py +857 -0
  27. isa_model/core/pricing_manager.py +2 -249
  28. isa_model/core/repositories/__init__.py +9 -0
  29. isa_model/core/repositories/config_repository.py +912 -0
  30. isa_model/core/resilience/circuit_breaker.py +366 -0
  31. isa_model/core/security/secrets.py +358 -0
  32. isa_model/core/services/__init__.py +2 -4
  33. isa_model/core/services/intelligent_model_selector.py +479 -370
  34. isa_model/core/storage/hf_storage.py +2 -2
  35. isa_model/core/types.py +8 -0
  36. isa_model/deployment/__init__.py +5 -48
  37. isa_model/deployment/core/__init__.py +2 -31
  38. isa_model/deployment/core/deployment_manager.py +1278 -368
  39. isa_model/deployment/local/__init__.py +31 -0
  40. isa_model/deployment/local/config.py +248 -0
  41. isa_model/deployment/local/gpu_gateway.py +607 -0
  42. isa_model/deployment/local/health_checker.py +428 -0
  43. isa_model/deployment/local/provider.py +586 -0
  44. isa_model/deployment/local/tensorrt_service.py +621 -0
  45. isa_model/deployment/local/transformers_service.py +644 -0
  46. isa_model/deployment/local/vllm_service.py +527 -0
  47. isa_model/deployment/modal/__init__.py +8 -0
  48. isa_model/deployment/modal/config.py +136 -0
  49. isa_model/deployment/modal/deployer.py +894 -0
  50. isa_model/deployment/modal/services/__init__.py +3 -0
  51. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  52. isa_model/deployment/modal/services/audio/isa_audio_chatTTS_service.py +520 -0
  53. isa_model/deployment/modal/services/audio/isa_audio_openvoice_service.py +758 -0
  54. isa_model/deployment/modal/services/audio/isa_audio_service_v2.py +1044 -0
  55. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  56. isa_model/deployment/modal/services/embedding/isa_embed_rerank_service.py +296 -0
  57. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  58. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  59. isa_model/deployment/modal/services/video/__init__.py +1 -0
  60. isa_model/deployment/modal/services/video/isa_video_hunyuan_service.py +423 -0
  61. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  62. isa_model/deployment/modal/services/vision/isa_vision_ocr_service.py +519 -0
  63. isa_model/deployment/modal/services/vision/isa_vision_qwen25_service.py +709 -0
  64. isa_model/deployment/modal/services/vision/isa_vision_table_service.py +676 -0
  65. isa_model/deployment/modal/services/vision/isa_vision_ui_service.py +833 -0
  66. isa_model/deployment/modal/services/vision/isa_vision_ui_service_optimized.py +660 -0
  67. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  68. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  69. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  70. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  71. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  72. isa_model/deployment/storage/__init__.py +5 -0
  73. isa_model/deployment/storage/deployment_repository.py +824 -0
  74. isa_model/deployment/triton/__init__.py +10 -0
  75. isa_model/deployment/triton/config.py +196 -0
  76. isa_model/deployment/triton/configs/__init__.py +1 -0
  77. isa_model/deployment/triton/provider.py +512 -0
  78. isa_model/deployment/triton/scripts/__init__.py +1 -0
  79. isa_model/deployment/triton/templates/__init__.py +1 -0
  80. isa_model/inference/__init__.py +47 -1
  81. isa_model/inference/ai_factory.py +179 -16
  82. isa_model/inference/legacy_services/__init__.py +21 -0
  83. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  84. isa_model/inference/legacy_services/model_service.py +573 -0
  85. isa_model/inference/legacy_services/model_serving.py +717 -0
  86. isa_model/inference/legacy_services/model_training.py +561 -0
  87. isa_model/inference/models/__init__.py +21 -0
  88. isa_model/inference/models/inference_config.py +551 -0
  89. isa_model/inference/models/inference_record.py +675 -0
  90. isa_model/inference/models/performance_models.py +714 -0
  91. isa_model/inference/repositories/__init__.py +9 -0
  92. isa_model/inference/repositories/inference_repository.py +828 -0
  93. isa_model/inference/services/audio/__init__.py +21 -0
  94. isa_model/inference/services/audio/base_realtime_service.py +225 -0
  95. isa_model/inference/services/audio/base_stt_service.py +184 -11
  96. isa_model/inference/services/audio/isa_tts_service.py +0 -0
  97. isa_model/inference/services/audio/openai_realtime_service.py +320 -124
  98. isa_model/inference/services/audio/openai_stt_service.py +53 -11
  99. isa_model/inference/services/base_service.py +17 -1
  100. isa_model/inference/services/custom_model_manager.py +277 -0
  101. isa_model/inference/services/embedding/__init__.py +13 -0
  102. isa_model/inference/services/embedding/base_embed_service.py +111 -8
  103. isa_model/inference/services/embedding/isa_embed_service.py +305 -0
  104. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  105. isa_model/inference/services/embedding/openai_embed_service.py +2 -4
  106. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  107. isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
  108. isa_model/inference/services/img/__init__.py +2 -2
  109. isa_model/inference/services/img/base_image_gen_service.py +24 -7
  110. isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
  111. isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
  112. isa_model/inference/services/img/services/replicate_flux.py +226 -0
  113. isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
  114. isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
  115. isa_model/inference/services/img/tests/test_img_client.py +297 -0
  116. isa_model/inference/services/llm/__init__.py +10 -2
  117. isa_model/inference/services/llm/base_llm_service.py +361 -26
  118. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  119. isa_model/inference/services/llm/helpers/llm_adapter.py +71 -12
  120. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  121. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  122. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  123. isa_model/inference/services/llm/local_llm_service.py +747 -0
  124. isa_model/inference/services/llm/ollama_llm_service.py +11 -3
  125. isa_model/inference/services/llm/openai_llm_service.py +670 -56
  126. isa_model/inference/services/llm/yyds_llm_service.py +10 -3
  127. isa_model/inference/services/vision/__init__.py +27 -6
  128. isa_model/inference/services/vision/base_vision_service.py +118 -185
  129. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  130. isa_model/inference/services/vision/helpers/image_utils.py +19 -10
  131. isa_model/inference/services/vision/isa_vision_service.py +634 -0
  132. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  133. isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
  134. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  135. isa_model/serving/api/cache_manager.py +245 -0
  136. isa_model/serving/api/dependencies/__init__.py +1 -0
  137. isa_model/serving/api/dependencies/auth.py +194 -0
  138. isa_model/serving/api/dependencies/database.py +139 -0
  139. isa_model/serving/api/error_handlers.py +284 -0
  140. isa_model/serving/api/fastapi_server.py +240 -18
  141. isa_model/serving/api/middleware/auth.py +317 -0
  142. isa_model/serving/api/middleware/security.py +268 -0
  143. isa_model/serving/api/middleware/tenant_context.py +414 -0
  144. isa_model/serving/api/routes/analytics.py +489 -0
  145. isa_model/serving/api/routes/config.py +645 -0
  146. isa_model/serving/api/routes/deployment_billing.py +315 -0
  147. isa_model/serving/api/routes/deployments.py +475 -0
  148. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  149. isa_model/serving/api/routes/health.py +32 -12
  150. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  151. isa_model/serving/api/routes/local_deployments.py +448 -0
  152. isa_model/serving/api/routes/logs.py +430 -0
  153. isa_model/serving/api/routes/settings.py +582 -0
  154. isa_model/serving/api/routes/tenants.py +575 -0
  155. isa_model/serving/api/routes/unified.py +992 -171
  156. isa_model/serving/api/routes/webhooks.py +479 -0
  157. isa_model/serving/api/startup.py +318 -0
  158. isa_model/serving/modal_proxy_server.py +249 -0
  159. isa_model/utils/gpu_utils.py +311 -0
  160. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/METADATA +76 -22
  161. isa_model-0.4.3.dist-info/RECORD +193 -0
  162. isa_model/deployment/cloud/__init__.py +0 -9
  163. isa_model/deployment/cloud/modal/__init__.py +0 -10
  164. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
  165. isa_model/deployment/cloud/modal/isa_vision_table_service.py +0 -532
  166. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +0 -406
  167. isa_model/deployment/cloud/modal/register_models.py +0 -321
  168. isa_model/deployment/core/deployment_config.py +0 -356
  169. isa_model/deployment/core/isa_deployment_service.py +0 -401
  170. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  171. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  172. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  173. isa_model/deployment/runtime/deployed_service.py +0 -338
  174. isa_model/deployment/services/__init__.py +0 -9
  175. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  176. isa_model/deployment/services/model_service.py +0 -332
  177. isa_model/deployment/services/service_monitor.py +0 -356
  178. isa_model/deployment/services/service_registry.py +0 -527
  179. isa_model/eval/__init__.py +0 -92
  180. isa_model/eval/benchmarks.py +0 -469
  181. isa_model/eval/config/__init__.py +0 -10
  182. isa_model/eval/config/evaluation_config.py +0 -108
  183. isa_model/eval/evaluators/__init__.py +0 -18
  184. isa_model/eval/evaluators/base_evaluator.py +0 -503
  185. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  186. isa_model/eval/factory.py +0 -531
  187. isa_model/eval/infrastructure/__init__.py +0 -24
  188. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  189. isa_model/eval/metrics.py +0 -798
  190. isa_model/inference/adapter/unified_api.py +0 -248
  191. isa_model/inference/services/helpers/stacked_config.py +0 -148
  192. isa_model/inference/services/img/flux_professional_service.py +0 -603
  193. isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
  194. isa_model/inference/services/others/table_transformer_service.py +0 -61
  195. isa_model/inference/services/vision/doc_analysis_service.py +0 -640
  196. isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
  197. isa_model/inference/services/vision/ui_analysis_service.py +0 -823
  198. isa_model/scripts/inference_tracker.py +0 -283
  199. isa_model/scripts/mlflow_manager.py +0 -379
  200. isa_model/scripts/model_registry.py +0 -465
  201. isa_model/scripts/register_models.py +0 -370
  202. isa_model/scripts/register_models_with_embeddings.py +0 -510
  203. isa_model/scripts/start_mlflow.py +0 -95
  204. isa_model/scripts/training_tracker.py +0 -257
  205. isa_model/training/__init__.py +0 -74
  206. isa_model/training/annotation/annotation_schema.py +0 -47
  207. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  208. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  209. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  210. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  211. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  212. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  213. isa_model/training/annotation/views/annotation_controller.py +0 -158
  214. isa_model/training/cloud/__init__.py +0 -22
  215. isa_model/training/cloud/job_orchestrator.py +0 -402
  216. isa_model/training/cloud/runpod_trainer.py +0 -454
  217. isa_model/training/cloud/storage_manager.py +0 -482
  218. isa_model/training/core/__init__.py +0 -23
  219. isa_model/training/core/config.py +0 -181
  220. isa_model/training/core/dataset.py +0 -222
  221. isa_model/training/core/trainer.py +0 -720
  222. isa_model/training/core/utils.py +0 -213
  223. isa_model/training/factory.py +0 -424
  224. isa_model-0.3.91.dist-info/RECORD +0 -138
  225. /isa_model/{core/storage/minio_storage.py → deployment/modal/services/audio/isa_audio_fish_service.py} +0 -0
  226. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  227. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  228. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
@@ -1,401 +0,0 @@
1
- """
2
- ISA Model Deployment Service
3
-
4
- Complete deployment pipeline that:
5
- 1. Downloads fine-tuned models from HuggingFace storage
6
- 2. Quantizes models using open-source TensorRT-LLM
7
- 3. Builds optimized engines
8
- 4. Deploys as custom container service on RunPod
9
- """
10
-
11
- import os
12
- import json
13
- import logging
14
- import asyncio
15
- from typing import Dict, Any, Optional, List
16
- from pathlib import Path
17
- import shutil
18
- from datetime import datetime
19
-
20
- logger = logging.getLogger(__name__)
21
-
22
-
23
- class ISADeploymentService:
24
- """
25
- Complete deployment service for ISA Model SDK.
26
-
27
- Example:
28
- ```python
29
- from isa_model.deployment.core import ISADeploymentService
30
-
31
- service = ISADeploymentService()
32
-
33
- # Complete deployment pipeline
34
- deployment = await service.deploy_finetuned_model(
35
- model_id="gemma-4b-alpaca-v1",
36
- quantization="int8"
37
- )
38
- ```
39
- """
40
-
41
- def __init__(self,
42
- work_dir: str = "./isa_deployment_work",
43
- hf_username: str = "xenobordom"):
44
- """Initialize ISA deployment service."""
45
- self.work_dir = Path(work_dir)
46
- self.work_dir.mkdir(parents=True, exist_ok=True)
47
- self.hf_username = hf_username
48
-
49
- # Create subdirectories
50
- (self.work_dir / "models").mkdir(exist_ok=True)
51
- (self.work_dir / "containers").mkdir(exist_ok=True)
52
- (self.work_dir / "deployments").mkdir(exist_ok=True)
53
-
54
- logger.info(f"ISA Deployment Service initialized with work_dir: {self.work_dir}")
55
-
56
- async def deploy_finetuned_model(self,
57
- model_id: str,
58
- quantization: str = "int8",
59
- container_registry: str = "docker.io") -> Dict[str, Any]:
60
- """Complete deployment pipeline for fine-tuned models."""
61
- deployment_id = f"{model_id}-{quantization}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
62
- logger.info(f"Starting deployment pipeline: {deployment_id}")
63
-
64
- deployment_info = {
65
- "deployment_id": deployment_id,
66
- "model_id": model_id,
67
- "quantization": quantization,
68
- "status": "starting",
69
- "steps": []
70
- }
71
-
72
- try:
73
- # Step 1: Download model
74
- model_path = await self._download_finetuned_model(model_id)
75
- deployment_info["steps"].append({
76
- "step": 1,
77
- "name": "download_model",
78
- "status": "completed",
79
- "model_path": str(model_path)
80
- })
81
-
82
- # Step 2: Build container
83
- container_image = await self._build_deployment_container(
84
- model_id=model_id,
85
- model_path=model_path,
86
- quantization=quantization,
87
- container_registry=container_registry
88
- )
89
- deployment_info["steps"].append({
90
- "step": 2,
91
- "name": "build_container",
92
- "status": "completed",
93
- "container_image": container_image
94
- })
95
-
96
- deployment_info["status"] = "completed"
97
- deployment_info["completed_at"] = datetime.now().isoformat()
98
-
99
- # Save configuration
100
- config_file = self.work_dir / "deployments" / f"{deployment_id}.json"
101
- with open(config_file, 'w') as f:
102
- json.dump(deployment_info, f, indent=2)
103
-
104
- logger.info(f"✅ Deployment completed: {deployment_id}")
105
- return deployment_info
106
-
107
- except Exception as e:
108
- deployment_info["status"] = "failed"
109
- deployment_info["error"] = str(e)
110
- logger.error(f"❌ Deployment failed: {e}")
111
- raise
112
-
113
- async def _download_finetuned_model(self, model_id: str) -> Path:
114
- """Download fine-tuned model from HuggingFace storage."""
115
- from ...core.storage.hf_storage import HuggingFaceStorage
116
-
117
- logger.info(f"Downloading model {model_id}...")
118
-
119
- storage = HuggingFaceStorage(username=self.hf_username)
120
- model_path = await storage.load_model(model_id)
121
-
122
- if not model_path:
123
- raise ValueError(f"Failed to download model {model_id}")
124
-
125
- # Copy to work directory
126
- local_model_path = self.work_dir / "models" / model_id
127
- if local_model_path.exists():
128
- shutil.rmtree(local_model_path)
129
-
130
- shutil.copytree(model_path, local_model_path)
131
- logger.info(f"Model downloaded to: {local_model_path}")
132
-
133
- return local_model_path
134
-
135
- async def _build_deployment_container(self,
136
- model_id: str,
137
- model_path: Path,
138
- quantization: str,
139
- container_registry: str) -> str:
140
- """Build custom deployment container."""
141
- container_name = f"isa-model-{model_id}"
142
- container_tag = f"{container_registry}/{container_name}:latest"
143
-
144
- logger.info(f"Building container: {container_tag}")
145
-
146
- container_dir = self.work_dir / "containers" / model_id
147
- container_dir.mkdir(parents=True, exist_ok=True)
148
-
149
- # Create Dockerfile
150
- dockerfile_content = self._create_deployment_dockerfile(quantization)
151
- with open(container_dir / "Dockerfile", 'w') as f:
152
- f.write(dockerfile_content)
153
-
154
- # Copy model files
155
- model_dst = container_dir / "hf_model"
156
- if model_dst.exists():
157
- shutil.rmtree(model_dst)
158
- shutil.copytree(model_path, model_dst)
159
-
160
- # Create server.py
161
- server_content = self._create_server_py()
162
- with open(container_dir / "server.py", 'w') as f:
163
- f.write(server_content)
164
-
165
- # Build container
166
- process = await asyncio.create_subprocess_exec(
167
- "docker", "build", "-t", container_tag, str(container_dir),
168
- stdout=asyncio.subprocess.PIPE,
169
- stderr=asyncio.subprocess.PIPE
170
- )
171
-
172
- stdout, stderr = await process.communicate()
173
-
174
- if process.returncode != 0:
175
- raise RuntimeError(f"Container build failed: {stderr.decode()}")
176
-
177
- logger.info(f"Container built: {container_tag}")
178
- return container_tag
179
-
180
- def _create_deployment_dockerfile(self, quantization: str) -> str:
181
- """Create Dockerfile for deployment."""
182
- return f'''# ISA Model Deployment Container
183
- FROM nvcr.io/nvidia/pytorch:24.05-py3
184
-
185
- # Install dependencies
186
- RUN apt-get update && apt-get install -y git-lfs curl && rm -rf /var/lib/apt/lists/*
187
-
188
- # Install Python packages
189
- RUN pip install fastapi uvicorn transformers torch
190
-
191
- # Clone TensorRT-LLM for quantization and inference
192
- RUN git clone https://github.com/NVIDIA/TensorRT-LLM.git /opt/TensorRT-LLM
193
- WORKDIR /opt/TensorRT-LLM
194
- RUN pip install -r requirements.txt
195
-
196
- # Set up application
197
- WORKDIR /app
198
- COPY hf_model/ /app/hf_model/
199
- COPY server.py /app/server.py
200
-
201
- # Environment variables
202
- ENV QUANTIZATION={quantization}
203
- ENV MODEL_PATH=/app/hf_model
204
- ENV PYTHONPATH=/opt/TensorRT-LLM:$PYTHONPATH
205
-
206
- # Expose port
207
- EXPOSE 8000
208
-
209
- # Health check
210
- HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \\
211
- CMD curl -f http://localhost:8000/health || exit 1
212
-
213
- # Start server
214
- CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"]
215
- '''
216
-
217
- def _create_server_py(self) -> str:
218
- """Create FastAPI server."""
219
- return '''"""
220
- ISA Model Deployment Server
221
- """
222
-
223
- import os
224
- import logging
225
- import asyncio
226
- from pathlib import Path
227
- from fastapi import FastAPI, HTTPException
228
- from pydantic import BaseModel
229
- from contextlib import asynccontextmanager
230
- from transformers import AutoTokenizer, AutoModelForCausalLM
231
- import torch
232
-
233
- logging.basicConfig(level=logging.INFO)
234
- logger = logging.getLogger(__name__)
235
-
236
- # Global variables
237
- MODEL_PATH = os.getenv("MODEL_PATH", "/app/hf_model")
238
- QUANTIZATION = os.getenv("QUANTIZATION", "int8")
239
-
240
- model = None
241
- tokenizer = None
242
-
243
- @asynccontextmanager
244
- async def lifespan(app: FastAPI):
245
- """FastAPI lifespan events."""
246
- global model, tokenizer
247
-
248
- logger.info("Starting ISA Model Deployment Service...")
249
- logger.info(f"Loading model from: {MODEL_PATH}")
250
- logger.info(f"Quantization: {QUANTIZATION}")
251
-
252
- try:
253
- # Load tokenizer
254
- tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
255
-
256
- # Load model with appropriate settings
257
- model = AutoModelForCausalLM.from_pretrained(
258
- MODEL_PATH,
259
- torch_dtype=torch.float16,
260
- device_map="auto",
261
- trust_remote_code=True
262
- )
263
-
264
- logger.info("🚀 Model loaded successfully!")
265
-
266
- except Exception as e:
267
- logger.error(f"Failed to load model: {e}")
268
- raise
269
-
270
- yield
271
-
272
- logger.info("Shutting down...")
273
- model = None
274
- tokenizer = None
275
-
276
- app = FastAPI(
277
- title="ISA Model Deployment Service",
278
- description="Quantized model inference service",
279
- version="1.0.0",
280
- lifespan=lifespan
281
- )
282
-
283
- class GenerateRequest(BaseModel):
284
- prompt: str
285
- max_new_tokens: int = 256
286
- temperature: float = 0.7
287
- top_p: float = 0.9
288
-
289
- class GenerateResponse(BaseModel):
290
- text: str
291
- quantization: str
292
- backend: str
293
-
294
- @app.post("/generate", response_model=GenerateResponse)
295
- async def generate(request: GenerateRequest):
296
- """Generate text."""
297
- if model is None or tokenizer is None:
298
- raise HTTPException(status_code=503, detail="Model not loaded")
299
-
300
- try:
301
- # Tokenize input
302
- inputs = tokenizer(request.prompt, return_tensors="pt").to(model.device)
303
-
304
- # Generate response
305
- with torch.no_grad():
306
- outputs = model.generate(
307
- **inputs,
308
- max_new_tokens=request.max_new_tokens,
309
- temperature=request.temperature,
310
- top_p=request.top_p,
311
- do_sample=True,
312
- eos_token_id=tokenizer.eos_token_id,
313
- pad_token_id=tokenizer.pad_token_id,
314
- )
315
-
316
- # Decode response
317
- generated_text = tokenizer.decode(
318
- outputs[0][len(inputs.input_ids[0]):],
319
- skip_special_tokens=True
320
- )
321
-
322
- return GenerateResponse(
323
- text=generated_text,
324
- quantization=QUANTIZATION,
325
- backend="Transformers"
326
- )
327
-
328
- except Exception as e:
329
- logger.error(f"Generation failed: {e}")
330
- raise HTTPException(status_code=500, detail=str(e))
331
-
332
- @app.get("/health")
333
- async def health_check():
334
- """Health check."""
335
- return {
336
- "status": "healthy" if (model is not None and tokenizer is not None) else "loading",
337
- "quantization": QUANTIZATION,
338
- "backend": "Transformers"
339
- }
340
-
341
- @app.get("/info")
342
- async def model_info():
343
- """Model information."""
344
- return {
345
- "model_path": MODEL_PATH,
346
- "quantization": QUANTIZATION,
347
- "framework": "ISA Model SDK",
348
- "backend": "Transformers"
349
- }
350
- '''
351
-
352
- def get_deployment_instructions(self, deployment_info: Dict[str, Any]) -> str:
353
- """Generate deployment instructions."""
354
- container_image = None
355
-
356
- for step in deployment_info.get("steps", []):
357
- if step["name"] == "build_container":
358
- container_image = step.get("container_image")
359
-
360
- return f'''# ISA Model Deployment Instructions
361
-
362
- ## Deployment ID: {deployment_info['deployment_id']}
363
- ## Model: {deployment_info['model_id']}
364
- ## Quantization: {deployment_info['quantization']}
365
-
366
- ### Container Image
367
- ```
368
- {container_image or 'Not built yet'}
369
- ```
370
-
371
- ### RunPod Configuration
372
- - **Container Image**: {container_image}
373
- - **GPU Type**: NVIDIA RTX A6000
374
- - **Container Disk**: 30GB
375
- - **Ports**: 8000 (HTTP API)
376
-
377
- ### Testing the Deployment
378
- ```python
379
- import requests
380
-
381
- # Health check
382
- response = requests.get("http://your-endpoint/health")
383
- print(response.json())
384
-
385
- # Generate text
386
- payload = {{
387
- "prompt": "What is machine learning?",
388
- "max_new_tokens": 100,
389
- "temperature": 0.7
390
- }}
391
-
392
- response = requests.post("http://your-endpoint/generate", json=payload)
393
- print(response.json())
394
- ```
395
-
396
- ### Features
397
- - ✅ Automatic model download from HuggingFace
398
- - ✅ {deployment_info['quantization'].upper()} quantization for efficiency
399
- - ✅ FastAPI REST interface
400
- - ✅ Health monitoring
401
- '''
@@ -1,66 +0,0 @@
1
- import os
2
- from fastapi import FastAPI
3
- from pydantic import BaseModel
4
- from contextlib import asynccontextmanager
5
- from pathlib import Path
6
- from threading import Thread
7
- from transformers import AutoTokenizer
8
- from tensorrt_llm.runtime import ModelRunner
9
-
10
- # --- 全局变量 ---
11
- ENGINE_PATH = "/app/built_engine/deepseek_engine"
12
- TOKENIZER_PATH = "/app/hf_model" # 我们需要原始HF模型中的tokenizer
13
- runner = None
14
- tokenizer = None
15
-
16
- # --- FastAPI生命周期事件 ---
17
- @asynccontextmanager
18
- async def lifespan(app: FastAPI):
19
- global runner, tokenizer
20
- print("--- 正在加载模型引擎和Tokenizer... ---")
21
- tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
22
- runner = ModelRunner.from_dir(engine_dir=ENGINE_PATH, rank=0, stream=True)
23
- print("--- ✅ 模型加载完毕,服务准备就绪 ---")
24
- yield
25
- print("--- 正在清理资源... ---")
26
- runner = None
27
- tokenizer = None
28
-
29
- app = FastAPI(lifespan=lifespan)
30
-
31
- # --- API请求和响应模型 ---
32
- class GenerateRequest(BaseModel):
33
- prompt: str
34
- max_new_tokens: int = 256
35
- temperature: float = 0.7
36
-
37
- class GenerateResponse(BaseModel):
38
- text: str
39
-
40
- # --- API端点 ---
41
- @app.post("/generate", response_model=GenerateResponse)
42
- async def generate(request: GenerateRequest):
43
- print(f"收到请求: {request.prompt}")
44
-
45
- # 准备输入
46
- input_ids = tokenizer.encode(request.prompt, return_tensors="pt").to("cuda")
47
-
48
- # 执行推理
49
- output_ids = runner.generate(
50
- input_ids,
51
- max_new_tokens=request.max_new_tokens,
52
- temperature=request.temperature,
53
- eos_token_id=tokenizer.eos_token_id,
54
- pad_token_id=tokenizer.pad_token_id,
55
- )
56
-
57
- # 清理并解码输出
58
- # output_ids[0] 的形状是 [beam_width, seq_length]
59
- generated_text = tokenizer.decode(output_ids[0, 0, len(input_ids[0]):], skip_special_tokens=True)
60
-
61
- print(f"生成响应: {generated_text}")
62
- return GenerateResponse(text=generated_text)
63
-
64
- @app.get("/health")
65
- async def health_check():
66
- return {"status": "ok" if runner is not None else "loading"}
@@ -1,43 +0,0 @@
1
- import requests
2
- import json
3
-
4
- # --- 配置 ---
5
- TRITON_SERVER_URL = "http://localhost:8000"
6
- MODEL_NAME = "deepseek_trtllm"
7
- PROMPT = "请给我讲一个关于人工智能的笑话。"
8
- MAX_TOKENS = 256
9
- STREAM = False
10
- # ----------------------------------------------------
11
-
12
- def main():
13
- """向Triton服务器发送请求并打印结果。"""
14
- url = f"{TRITON_SERVER_URL}/v2/models/{MODEL_NAME}/generate"
15
- payload = {
16
- "text_input": PROMPT,
17
- "max_new_tokens": MAX_TOKENS,
18
- "temperature": 0.7,
19
- "stream": STREAM
20
- }
21
- print(f"Sending request to: {url}")
22
- print(f"Payload: {json.dumps(payload, indent=2, ensure_ascii=False)}")
23
- print("-" * 30)
24
-
25
- try:
26
- response = requests.post(url, json=payload, headers={"Accept": "application/json"})
27
- response.raise_for_status()
28
- response_data = response.json()
29
- generated_text = response_data.get('text_output', 'Error: "text_output" key not found.')
30
-
31
- print("✅ Request successful!")
32
- print("-" * 30)
33
- print("Prompt:", PROMPT)
34
- print("\nGenerated Text:", generated_text)
35
-
36
- except requests.exceptions.RequestException as e:
37
- print(f"❌ Error making request to Triton server: {e}")
38
- if e.response:
39
- print(f"Response Status Code: {e.response.status_code}")
40
- print(f"Response Body: {e.response.text}")
41
-
42
- if __name__ == '__main__':
43
- main()
@@ -1,35 +0,0 @@
1
- import requests
2
- import json
3
-
4
- PROMPT = "请给我讲一个关于人工智能的笑话。"
5
- API_URL = "http://localhost:8000/generate"
6
-
7
- def main():
8
- payload = {
9
- "prompt": PROMPT,
10
- "max_new_tokens": 100
11
- }
12
-
13
- print(f"Sending request to: {API_URL}")
14
- print(f"Payload: {json.dumps(payload, ensure_ascii=False)}")
15
- print("-" * 30)
16
-
17
- try:
18
- response = requests.post(API_URL, json=payload)
19
- response.raise_for_status()
20
-
21
- response_data = response.json()
22
- generated_text = response_data.get('text')
23
-
24
- print("✅ Request successful!")
25
- print("-" * 30)
26
- print("Prompt:", PROMPT)
27
- print("\nGenerated Text:", generated_text)
28
-
29
- except requests.exceptions.RequestException as e:
30
- print(f"❌ Error making request: {e}")
31
- if e.response:
32
- print(f"Response Body: {e.response.text}")
33
-
34
- if __name__ == '__main__':
35
- main()