isa-model 0.3.91__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. isa_model/client.py +732 -573
  2. isa_model/core/cache/redis_cache.py +401 -0
  3. isa_model/core/config/config_manager.py +53 -10
  4. isa_model/core/config.py +1 -1
  5. isa_model/core/database/__init__.py +1 -0
  6. isa_model/core/database/migrations.py +277 -0
  7. isa_model/core/database/supabase_client.py +123 -0
  8. isa_model/core/models/__init__.py +37 -0
  9. isa_model/core/models/model_billing_tracker.py +60 -88
  10. isa_model/core/models/model_manager.py +36 -18
  11. isa_model/core/models/model_repo.py +44 -38
  12. isa_model/core/models/model_statistics_tracker.py +234 -0
  13. isa_model/core/models/model_storage.py +0 -1
  14. isa_model/core/models/model_version_manager.py +959 -0
  15. isa_model/core/pricing_manager.py +2 -249
  16. isa_model/core/resilience/circuit_breaker.py +366 -0
  17. isa_model/core/security/secrets.py +358 -0
  18. isa_model/core/services/__init__.py +2 -4
  19. isa_model/core/services/intelligent_model_selector.py +101 -370
  20. isa_model/core/storage/hf_storage.py +1 -1
  21. isa_model/core/types.py +7 -0
  22. isa_model/deployment/cloud/modal/isa_audio_chatTTS_service.py +520 -0
  23. isa_model/deployment/cloud/modal/isa_audio_fish_service.py +0 -0
  24. isa_model/deployment/cloud/modal/isa_audio_openvoice_service.py +758 -0
  25. isa_model/deployment/cloud/modal/isa_audio_service_v2.py +1044 -0
  26. isa_model/deployment/cloud/modal/isa_embed_rerank_service.py +296 -0
  27. isa_model/deployment/cloud/modal/isa_video_hunyuan_service.py +423 -0
  28. isa_model/deployment/cloud/modal/isa_vision_ocr_service.py +519 -0
  29. isa_model/deployment/cloud/modal/isa_vision_qwen25_service.py +709 -0
  30. isa_model/deployment/cloud/modal/isa_vision_table_service.py +467 -323
  31. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +607 -180
  32. isa_model/deployment/cloud/modal/isa_vision_ui_service_optimized.py +660 -0
  33. isa_model/deployment/core/deployment_manager.py +6 -4
  34. isa_model/deployment/services/auto_hf_modal_deployer.py +894 -0
  35. isa_model/eval/benchmarks/__init__.py +27 -0
  36. isa_model/eval/benchmarks/multimodal_datasets.py +460 -0
  37. isa_model/eval/benchmarks.py +244 -12
  38. isa_model/eval/evaluators/__init__.py +8 -2
  39. isa_model/eval/evaluators/audio_evaluator.py +727 -0
  40. isa_model/eval/evaluators/embedding_evaluator.py +742 -0
  41. isa_model/eval/evaluators/vision_evaluator.py +564 -0
  42. isa_model/eval/example_evaluation.py +395 -0
  43. isa_model/eval/factory.py +272 -5
  44. isa_model/eval/isa_benchmarks.py +700 -0
  45. isa_model/eval/isa_integration.py +582 -0
  46. isa_model/eval/metrics.py +159 -6
  47. isa_model/eval/tests/unit/test_basic.py +396 -0
  48. isa_model/inference/ai_factory.py +44 -8
  49. isa_model/inference/services/audio/__init__.py +21 -0
  50. isa_model/inference/services/audio/base_realtime_service.py +225 -0
  51. isa_model/inference/services/audio/isa_tts_service.py +0 -0
  52. isa_model/inference/services/audio/openai_realtime_service.py +320 -124
  53. isa_model/inference/services/audio/openai_stt_service.py +32 -6
  54. isa_model/inference/services/base_service.py +17 -1
  55. isa_model/inference/services/embedding/__init__.py +13 -0
  56. isa_model/inference/services/embedding/base_embed_service.py +111 -8
  57. isa_model/inference/services/embedding/isa_embed_service.py +305 -0
  58. isa_model/inference/services/embedding/openai_embed_service.py +2 -4
  59. isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
  60. isa_model/inference/services/img/__init__.py +2 -2
  61. isa_model/inference/services/img/base_image_gen_service.py +24 -7
  62. isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
  63. isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
  64. isa_model/inference/services/img/services/replicate_flux.py +226 -0
  65. isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
  66. isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
  67. isa_model/inference/services/img/tests/test_img_client.py +297 -0
  68. isa_model/inference/services/llm/base_llm_service.py +30 -6
  69. isa_model/inference/services/llm/helpers/llm_adapter.py +63 -9
  70. isa_model/inference/services/llm/ollama_llm_service.py +2 -1
  71. isa_model/inference/services/llm/openai_llm_service.py +652 -55
  72. isa_model/inference/services/llm/yyds_llm_service.py +2 -1
  73. isa_model/inference/services/vision/__init__.py +5 -5
  74. isa_model/inference/services/vision/base_vision_service.py +118 -185
  75. isa_model/inference/services/vision/helpers/image_utils.py +11 -5
  76. isa_model/inference/services/vision/isa_vision_service.py +573 -0
  77. isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
  78. isa_model/serving/api/fastapi_server.py +88 -16
  79. isa_model/serving/api/middleware/auth.py +311 -0
  80. isa_model/serving/api/middleware/security.py +278 -0
  81. isa_model/serving/api/routes/analytics.py +486 -0
  82. isa_model/serving/api/routes/deployments.py +339 -0
  83. isa_model/serving/api/routes/evaluations.py +579 -0
  84. isa_model/serving/api/routes/logs.py +430 -0
  85. isa_model/serving/api/routes/settings.py +582 -0
  86. isa_model/serving/api/routes/unified.py +324 -165
  87. isa_model/serving/api/startup.py +304 -0
  88. isa_model/serving/modal_proxy_server.py +249 -0
  89. isa_model/training/__init__.py +100 -6
  90. isa_model/training/core/__init__.py +4 -1
  91. isa_model/training/examples/intelligent_training_example.py +281 -0
  92. isa_model/training/intelligent/__init__.py +25 -0
  93. isa_model/training/intelligent/decision_engine.py +643 -0
  94. isa_model/training/intelligent/intelligent_factory.py +888 -0
  95. isa_model/training/intelligent/knowledge_base.py +751 -0
  96. isa_model/training/intelligent/resource_optimizer.py +839 -0
  97. isa_model/training/intelligent/task_classifier.py +576 -0
  98. isa_model/training/storage/__init__.py +24 -0
  99. isa_model/training/storage/core_integration.py +439 -0
  100. isa_model/training/storage/training_repository.py +552 -0
  101. isa_model/training/storage/training_storage.py +628 -0
  102. {isa_model-0.3.91.dist-info → isa_model-0.4.0.dist-info}/METADATA +13 -1
  103. isa_model-0.4.0.dist-info/RECORD +182 -0
  104. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
  105. isa_model/deployment/cloud/modal/register_models.py +0 -321
  106. isa_model/inference/adapter/unified_api.py +0 -248
  107. isa_model/inference/services/helpers/stacked_config.py +0 -148
  108. isa_model/inference/services/img/flux_professional_service.py +0 -603
  109. isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
  110. isa_model/inference/services/others/table_transformer_service.py +0 -61
  111. isa_model/inference/services/vision/doc_analysis_service.py +0 -640
  112. isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
  113. isa_model/inference/services/vision/ui_analysis_service.py +0 -823
  114. isa_model/scripts/inference_tracker.py +0 -283
  115. isa_model/scripts/mlflow_manager.py +0 -379
  116. isa_model/scripts/model_registry.py +0 -465
  117. isa_model/scripts/register_models.py +0 -370
  118. isa_model/scripts/register_models_with_embeddings.py +0 -510
  119. isa_model/scripts/start_mlflow.py +0 -95
  120. isa_model/scripts/training_tracker.py +0 -257
  121. isa_model-0.3.91.dist-info/RECORD +0 -138
  122. {isa_model-0.3.91.dist-info → isa_model-0.4.0.dist-info}/WHEEL +0 -0
  123. {isa_model-0.3.91.dist-info → isa_model-0.4.0.dist-info}/top_level.txt +0 -0
isa_model/deployment/cloud/modal/isa_audio_chatTTS_service.py
@@ -0,0 +1,520 @@
+ """
+ ISA ChatTTS Service
+
+ ChatTTS text-to-speech service optimized for dialogue scenarios
+ - High-quality Chinese and English speech synthesis
+ - Support for prosody control (laughter, pauses, etc.)
+ - Fast inference and deployment
+ - Professional dialogue scene optimization
+ """
+
+ import modal
+ import time
+ import json
+ import os
+ import logging
+ import base64
+ import tempfile
+ import io
+ from typing import Dict, List, Optional, Any, Union
+ from pathlib import Path
+ import numpy as np
+
+ # Define Modal application
+ app = modal.App("isa-audio-chatTTS")
+
+ # Define Modal container image with ChatTTS dependencies
+ image = (
+     modal.Image.debian_slim(python_version="3.10")
+     .pip_install([
+         "torch>=2.0.0",
+         "torchaudio>=2.0.0",
+         "transformers>=4.41.0",
+         "accelerate>=0.26.0",
+         "numpy>=1.24.0",
+         "soundfile>=0.12.0",
+         "librosa>=0.10.0",
+         "scipy>=1.11.0",
+         "omegaconf>=2.3.0",
+         "hydra-core>=1.3.0",
+         "pydantic>=2.0.0",
+         "requests>=2.31.0",
+         "httpx>=0.26.0",
+         "python-dotenv>=1.0.0",
+         "ChatTTS", # ChatTTS main package
+         "pyopenjtalk", # For better text processing
+         "pypinyin", # For Chinese pronunciation
+         "jieba", # Chinese word segmentation
+         "opencc-python-reimplemented", # Chinese text conversion
+         "vocos", # Neural vocoder
+         "vector-quantize-pytorch", # Vector quantization
+         "einops", # Tensor operations
+         "pydub", # Audio processing
+         "ffmpeg-python", # Audio conversion
+     ])
+     .apt_install([
+         "ffmpeg",
+         "libsndfile1",
+         "libsox-dev",
+         "sox",
+         "espeak-ng",
+         "libmecab-dev",
+         "mecab-ipadic-utf8",
+         "git-lfs"
+     ])
+     .env({
+         "TRANSFORMERS_CACHE": "/models",
+         "TORCH_HOME": "/models/torch",
+         "HF_HOME": "/models",
+         "CUDA_VISIBLE_DEVICES": "0",
+         "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512"
+     })
+ )
+
+ # ChatTTS Service - Optimized for A10G GPU
+ @app.cls(
+     gpu="A10G", # 24GB A10G for ChatTTS
+     image=image,
+     memory=16384, # 16GB RAM
+     timeout=1800, # 30 minutes
+     scaledown_window=300, # 5 minutes idle timeout
+     min_containers=0, # Scale to zero
+     max_containers=8, # Support multiple concurrent requests
+     # secrets=[modal.Secret.from_name("huggingface-secret")], # Optional HF token
+ )
+ class ISAudioChatTTSService:
+     """
+     ISA ChatTTS Service
+
+     ChatTTS text-to-speech service:
+     - Model: ChatTTS (2Noise/ChatTTS)
+     - Architecture: Transformer-based TTS
+     - Capabilities: Chinese/English TTS, prosody control, dialogue optimization
+     - Performance: Fast inference, high-quality output
+     """
+
+     @modal.enter()
+     def load_models(self):
+         """Load ChatTTS model and dependencies"""
+         print("Loading ChatTTS model...")
+         start_time = time.time()
+
+         # Initialize instance variables
+         self.chat_tts = None
+         self.logger = logging.getLogger(__name__)
+         self.request_count = 0
+         self.total_processing_time = 0.0
+
+         try:
+             import torch
+             import ChatTTS
+
+             # Initialize ChatTTS
+             self.chat_tts = ChatTTS.Chat()
+
+             # Load models - use HF models for better stability
+             print("Loading ChatTTS models from HuggingFace...")
+             self.chat_tts.load(
+                 compile=False, # Disable compilation for compatibility
+                 source="huggingface" # Use HuggingFace models
+             )
+
+             # Remove default params that might cause issues
+             self.default_params = {}
+
+             # Test model with a simple generation
+             print("Testing ChatTTS model...")
+             test_text = "Hello world, this is a test."
+             test_audio = self.chat_tts.infer([test_text], use_decoder=True)
+
+             if test_audio and len(test_audio) > 0:
+                 print("ChatTTS model test successful")
+                 self.models_loaded = True
+             else:
+                 print("ChatTTS model test failed")
+                 self.models_loaded = False
+
+             load_time = time.time() - start_time
+             print(f"ChatTTS loaded successfully in {load_time:.2f}s")
+
+         except Exception as e:
+             print(f"ChatTTS loading failed: {e}")
+             import traceback
+             traceback.print_exc()
+             self.models_loaded = False
+             self.chat_tts = None
+
+     @modal.method()
+     def synthesize_speech(
+         self,
+         text: str,
+         speaker_id: Optional[str] = None,
+         language: str = "auto",
+         speed: float = 1.0,
+         temperature: float = 0.3,
+         top_p: float = 0.7,
+         top_k: int = 20,
+         audio_seed: int = 2,
+         text_seed: int = 42,
+         enable_enhancement: bool = True,
+         output_format: str = "wav"
+     ) -> Dict[str, Any]:
+         """
+         Synthesize speech using ChatTTS
+
+         Args:
+             text: Text to synthesize
+             speaker_id: Optional speaker ID for voice consistency
+             language: Language code ("zh", "en", "auto")
+             speed: Speech speed multiplier (0.5-2.0)
+             temperature: Sampling temperature (0.01-1.0)
+             top_p: Top-p sampling (0.1-1.0)
+             top_k: Top-k sampling (1-100)
+             audio_seed: Audio generation seed
+             text_seed: Text processing seed
+             enable_enhancement: Enable audio enhancement
+             output_format: Output format ("wav", "mp3", "flac")
+
+         Returns:
+             Speech synthesis results
+         """
+         start_time = time.time()
+         self.request_count += 1
+
+         try:
+             # Validate model loading status
+             if not self.models_loaded or not self.chat_tts:
+                 raise RuntimeError("ChatTTS model not loaded")
+
+             # Validate input parameters
+             if not text or not text.strip():
+                 raise ValueError("Text cannot be empty")
+
+             # Preprocess text
+             processed_text = self._preprocess_text(text, language)
+
+             print(f"Synthesizing: '{processed_text[:50]}...'")
+
+             # Generate speech using correct ChatTTS API
+             import torch
+             import ChatTTS
+
+             with torch.no_grad():
+                 # Sample random speaker if speaker_id is provided
+                 if speaker_id:
+                     spk_emb = self.chat_tts.sample_random_speaker()
+                 else:
+                     spk_emb = None
+
+                 # Configure inference parameters using correct API
+                 params_infer_code = ChatTTS.Chat.InferCodeParams(
+                     spk_emb=spk_emb,
+                     temperature=temperature,
+                     top_P=top_p,
+                     top_K=top_k
+                 )
+
+                 # Configure text refinement parameters
+                 params_refine_text = ChatTTS.Chat.RefineTextParams(
+                     prompt='[oral_2][laugh_0][break_4]' # Default prosody control
+                 )
+
+                 # Generate audio with proper parameters
+                 audio_data = self.chat_tts.infer(
+                     [processed_text],
+                     use_decoder=True,
+                     params_infer_code=params_infer_code,
+                     params_refine_text=params_refine_text
+                 )
+
+             if not audio_data or len(audio_data) == 0:
+                 raise RuntimeError("Speech synthesis failed - no audio generated")
+
+             # Process audio output
+             audio_array = audio_data[0] # Get first (and only) audio
+
+             # Apply speed adjustment
+             if speed != 1.0:
+                 audio_array = self._adjust_speed(audio_array, speed)
+
+             # Apply enhancement if enabled
+             if enable_enhancement:
+                 audio_array = self._enhance_audio(audio_array)
+
+             # Convert to desired format and encode
+             audio_b64 = self._encode_audio(audio_array, output_format)
+
+             processing_time = time.time() - start_time
+             self.total_processing_time += processing_time
+
+             # Calculate cost (A10G GPU: ~$1.20/hour)
+             gpu_cost = (processing_time / 3600) * 1.20
+
+             # Calculate audio metrics
+             sample_rate = 24000 # ChatTTS default sample rate
+             duration = len(audio_array) / sample_rate
+
+             result = {
+                 'success': True,
+                 'service': 'isa-audio-chatTTS',
+                 'operation': 'speech_synthesis',
+                 'provider': 'ISA',
+                 'audio_b64': audio_b64,
+                 'text': text,
+                 'processed_text': processed_text,
+                 'model': 'ChatTTS',
+                 'architecture': 'Transformer-based TTS',
+                 'parameters': {
+                     'speaker_id': speaker_id,
+                     'language': language,
+                     'speed': speed,
+                     'temperature': temperature,
+                     'top_p': top_p,
+                     'top_k': top_k,
+                     'audio_seed': audio_seed,
+                     'text_seed': text_seed,
+                     'enhancement': enable_enhancement,
+                     'output_format': output_format
+                 },
+                 'audio_info': {
+                     'sample_rate': sample_rate,
+                     'duration': round(duration, 2),
+                     'channels': 1,
+                     'format': output_format,
+                     'quality': 'high'
+                 },
+                 'processing_time': processing_time,
+                 'billing': {
+                     'request_id': f"tts_{self.request_count}_{int(time.time())}",
+                     'gpu_seconds': processing_time,
+                     'estimated_cost_usd': round(gpu_cost, 4),
+                     'gpu_type': 'A10G'
+                 },
+                 'model_info': {
+                     'model_name': 'ChatTTS',
+                     'provider': 'ISA',
+                     'architecture': 'Transformer-based TTS',
+                     'specialization': 'dialogue_optimized',
+                     'gpu': 'A10G',
+                     'capabilities': ['chinese_tts', 'english_tts', 'prosody_control', 'dialogue_tts'],
+                     'container_id': os.environ.get('MODAL_TASK_ID', 'unknown')
+                 }
+             }
+
+             # Output JSON results
+             print("=== JSON_RESULT_START ===")
+             print(json.dumps(result, default=str, ensure_ascii=False))
+             print("=== JSON_RESULT_END ===")
+
+             return result
+
+         except Exception as e:
+             processing_time = time.time() - start_time
+             error_result = {
+                 'success': False,
+                 'service': 'isa-audio-chatTTS',
+                 'operation': 'speech_synthesis',
+                 'provider': 'ISA',
+                 'error': str(e),
+                 'text': text,
+                 'processing_time': processing_time,
+                 'billing': {
+                     'request_id': f"tts_{self.request_count}_{int(time.time())}",
+                     'gpu_seconds': processing_time,
+                     'estimated_cost_usd': round((processing_time / 3600) * 1.20, 4),
+                     'gpu_type': 'A10G'
+                 }
+             }
+
+             print("=== JSON_RESULT_START ===")
+             print(json.dumps(error_result, default=str, ensure_ascii=False))
+             print("=== JSON_RESULT_END ===")
+
+             return error_result
+
+     @modal.method()
+     def health_check(self) -> Dict[str, Any]:
+         """Health check endpoint"""
+         return {
+             'status': 'healthy',
+             'service': 'isa-audio-chatTTS',
+             'provider': 'ISA',
+             'models_loaded': self.models_loaded,
+             'model': 'ChatTTS',
+             'architecture': 'Transformer-based TTS',
+             'timestamp': time.time(),
+             'gpu': 'A10G',
+             'memory_usage': '16GB',
+             'request_count': self.request_count,
+             'capabilities': ['chinese_tts', 'english_tts', 'prosody_control', 'dialogue_tts']
+         }
+
+     # ==================== UTILITY METHODS ====================
+
+     def _preprocess_text(self, text: str, language: str) -> str:
+         """Preprocess text for TTS"""
+         # Basic text cleaning
+         text = text.strip()
+
+         # Language-specific preprocessing
+         if language == "zh" or self._is_chinese(text):
+             return self._preprocess_chinese(text)
+         elif language == "en" or self._is_english(text):
+             return self._preprocess_english(text)
+         else:
+             # Auto-detect and process
+             if self._is_chinese(text):
+                 return self._preprocess_chinese(text)
+             else:
+                 return self._preprocess_english(text)
+
+     def _preprocess_chinese(self, text: str) -> str:
+         """Preprocess Chinese text"""
+         try:
+             # Traditional to Simplified conversion
+             from opencc import OpenCC
+             cc = OpenCC('t2s')
+             text = cc.convert(text)
+             return text
+         except Exception:
+             return text
+
+     def _preprocess_english(self, text: str) -> str:
+         """Preprocess English text"""
+         # Basic normalization
+         text = text.replace('&', ' and ')
+         text = text.replace('@', ' at ')
+         text = text.replace('#', ' number ')
+         text = text.replace('%', ' percent ')
+         return text
+
+     def _is_chinese(self, text: str) -> bool:
+         """Check if text contains Chinese characters"""
+         for char in text:
+             if '\u4e00' <= char <= '\u9fff':
+                 return True
+         return False
+
+     def _is_english(self, text: str) -> bool:
+         """Check if text is primarily English"""
+         english_chars = sum(1 for char in text if char.isalpha() and ord(char) < 128)
+         total_chars = sum(1 for char in text if char.isalpha())
+         return total_chars > 0 and english_chars / total_chars > 0.8
+
+     def _get_speaker_embedding(self, speaker_id: Optional[str]) -> Optional[Any]:
+         """Get speaker embedding for voice consistency"""
+         if not speaker_id:
+             return None
+
+         try:
+             import torch
+             # Sample a random speaker embedding
+             rand_spk = self.chat_tts.sample_random_speaker()
+             return rand_spk
+         except Exception as e:
+             print(f"Speaker embedding error: {e}")
+             return None
+
+     def _adjust_speed(self, audio: np.ndarray, speed: float) -> np.ndarray:
+         """Adjust audio speed"""
+         try:
+             import librosa
+             return librosa.effects.time_stretch(audio, rate=speed)
+         except Exception:
+             return audio
+
+     def _enhance_audio(self, audio: np.ndarray) -> np.ndarray:
+         """Apply audio enhancement"""
+         try:
+             import scipy.signal
+             # Simple audio enhancement
+             audio = scipy.signal.wiener(audio)
+             audio = audio / np.max(np.abs(audio))
+             return audio
+         except Exception:
+             return audio
+
+     def _encode_audio(self, audio: np.ndarray, format: str) -> str:
+         """Encode audio to base64"""
+         try:
+             import soundfile as sf
+             import io
+
+             # Convert to 16-bit PCM
+             audio_int16 = (audio * 32767).astype(np.int16)
+
+             # Save to bytes
+             buffer = io.BytesIO()
+             sf.write(buffer, audio_int16, 24000, format=format.upper())
+             buffer.seek(0)
+
+             # Encode to base64
+             audio_b64 = base64.b64encode(buffer.read()).decode('utf-8')
+             return audio_b64
+
+         except Exception as e:
+             print(f"Audio encoding error: {e}")
+             return ""
+
+ # Deployment functions
+ @app.function()
+ def deploy_info():
+     """Deployment information"""
+     return {
+         'service': 'isa-audio-chatTTS',
+         'version': '1.0.0',
+         'description': 'ISA ChatTTS service - Dialogue-optimized TTS',
+         'model': 'ChatTTS',
+         'architecture': 'Transformer-based TTS',
+         'gpu': 'A10G',
+         'capabilities': ['chinese_tts', 'english_tts', 'prosody_control', 'dialogue_tts'],
+         'deployment_time': time.time()
+     }
+
+ @app.function()
+ def register_service():
+     """Register service to model repository"""
+     try:
+         from isa_model.core.models.model_repo import ModelRepository
+
+         repo = ModelRepository()
+
+         # Register ChatTTS service
+         repo.register_model({
+             'model_id': 'isa-chatTTS-service',
+             'model_type': 'audio',
+             'provider': 'isa',
+             'endpoint': 'https://isa-audio-chatTTS.modal.run',
+             'capabilities': ['chinese_tts', 'english_tts', 'prosody_control', 'dialogue_tts'],
+             'pricing': {'gpu_type': 'A10G', 'cost_per_hour': 1.20},
+             'metadata': {
+                 'model': 'ChatTTS',
+                 'architecture': 'Transformer-based TTS',
+                 'specialization': 'dialogue_optimized',
+                 'languages': ['zh', 'en'],
+                 'sample_rate': 24000,
+                 'max_text_length': 1000
+             }
+         })
+
+         print("ChatTTS service registered successfully")
+         return {'status': 'registered'}
+
+     except Exception as e:
+         print(f"Service registration failed: {e}")
+         return {'status': 'failed', 'error': str(e)}
+
+ if __name__ == "__main__":
+     print("ISA ChatTTS Service - Modal Deployment")
+     print("Deploy with: modal deploy isa_audio_chatTTS_service.py")
+     print()
+     print("Model: ChatTTS")
+     print("Architecture: Transformer-based TTS")
+     print("Capabilities: Chinese/English TTS, prosody control, dialogue optimization")
+     print("GPU: A10G (24GB)")
+     print()
+     print("Usage:")
+     print("# Speech synthesis")
+     print("service.synthesize_speech('Hello world!', language='en')")
+     print("# Health check")
+     print("service.health_check()")