isa-model 0.3.9__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/__init__.py +1 -1
- isa_model/client.py +732 -565
- isa_model/core/cache/redis_cache.py +401 -0
- isa_model/core/config/config_manager.py +53 -10
- isa_model/core/config.py +1 -1
- isa_model/core/database/__init__.py +1 -0
- isa_model/core/database/migrations.py +277 -0
- isa_model/core/database/supabase_client.py +123 -0
- isa_model/core/models/__init__.py +37 -0
- isa_model/core/models/model_billing_tracker.py +60 -88
- isa_model/core/models/model_manager.py +36 -18
- isa_model/core/models/model_repo.py +44 -38
- isa_model/core/models/model_statistics_tracker.py +234 -0
- isa_model/core/models/model_storage.py +0 -1
- isa_model/core/models/model_version_manager.py +959 -0
- isa_model/core/pricing_manager.py +2 -249
- isa_model/core/resilience/circuit_breaker.py +366 -0
- isa_model/core/security/secrets.py +358 -0
- isa_model/core/services/__init__.py +2 -4
- isa_model/core/services/intelligent_model_selector.py +101 -370
- isa_model/core/storage/hf_storage.py +1 -1
- isa_model/core/types.py +7 -0
- isa_model/deployment/cloud/modal/isa_audio_chatTTS_service.py +520 -0
- isa_model/deployment/cloud/modal/isa_audio_fish_service.py +0 -0
- isa_model/deployment/cloud/modal/isa_audio_openvoice_service.py +758 -0
- isa_model/deployment/cloud/modal/isa_audio_service_v2.py +1044 -0
- isa_model/deployment/cloud/modal/isa_embed_rerank_service.py +296 -0
- isa_model/deployment/cloud/modal/isa_video_hunyuan_service.py +423 -0
- isa_model/deployment/cloud/modal/isa_vision_ocr_service.py +519 -0
- isa_model/deployment/cloud/modal/isa_vision_qwen25_service.py +709 -0
- isa_model/deployment/cloud/modal/isa_vision_table_service.py +467 -323
- isa_model/deployment/cloud/modal/isa_vision_ui_service.py +607 -180
- isa_model/deployment/cloud/modal/isa_vision_ui_service_optimized.py +660 -0
- isa_model/deployment/core/deployment_manager.py +6 -4
- isa_model/deployment/services/auto_hf_modal_deployer.py +894 -0
- isa_model/eval/benchmarks/__init__.py +27 -0
- isa_model/eval/benchmarks/multimodal_datasets.py +460 -0
- isa_model/eval/benchmarks.py +244 -12
- isa_model/eval/evaluators/__init__.py +8 -2
- isa_model/eval/evaluators/audio_evaluator.py +727 -0
- isa_model/eval/evaluators/embedding_evaluator.py +742 -0
- isa_model/eval/evaluators/vision_evaluator.py +564 -0
- isa_model/eval/example_evaluation.py +395 -0
- isa_model/eval/factory.py +272 -5
- isa_model/eval/isa_benchmarks.py +700 -0
- isa_model/eval/isa_integration.py +582 -0
- isa_model/eval/metrics.py +159 -6
- isa_model/eval/tests/unit/test_basic.py +396 -0
- isa_model/inference/ai_factory.py +44 -8
- isa_model/inference/services/audio/__init__.py +21 -0
- isa_model/inference/services/audio/base_realtime_service.py +225 -0
- isa_model/inference/services/audio/isa_tts_service.py +0 -0
- isa_model/inference/services/audio/openai_realtime_service.py +320 -124
- isa_model/inference/services/audio/openai_stt_service.py +32 -6
- isa_model/inference/services/base_service.py +17 -1
- isa_model/inference/services/embedding/__init__.py +13 -0
- isa_model/inference/services/embedding/base_embed_service.py +111 -8
- isa_model/inference/services/embedding/isa_embed_service.py +305 -0
- isa_model/inference/services/embedding/openai_embed_service.py +2 -4
- isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
- isa_model/inference/services/img/__init__.py +2 -2
- isa_model/inference/services/img/base_image_gen_service.py +24 -7
- isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
- isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
- isa_model/inference/services/img/services/replicate_flux.py +226 -0
- isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
- isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
- isa_model/inference/services/img/tests/test_img_client.py +297 -0
- isa_model/inference/services/llm/base_llm_service.py +30 -6
- isa_model/inference/services/llm/helpers/llm_adapter.py +63 -9
- isa_model/inference/services/llm/ollama_llm_service.py +2 -1
- isa_model/inference/services/llm/openai_llm_service.py +652 -55
- isa_model/inference/services/llm/yyds_llm_service.py +2 -1
- isa_model/inference/services/vision/__init__.py +5 -5
- isa_model/inference/services/vision/base_vision_service.py +118 -185
- isa_model/inference/services/vision/helpers/image_utils.py +11 -5
- isa_model/inference/services/vision/isa_vision_service.py +573 -0
- isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
- isa_model/serving/api/fastapi_server.py +88 -16
- isa_model/serving/api/middleware/auth.py +311 -0
- isa_model/serving/api/middleware/security.py +278 -0
- isa_model/serving/api/routes/analytics.py +486 -0
- isa_model/serving/api/routes/deployments.py +339 -0
- isa_model/serving/api/routes/evaluations.py +579 -0
- isa_model/serving/api/routes/logs.py +430 -0
- isa_model/serving/api/routes/settings.py +582 -0
- isa_model/serving/api/routes/unified.py +324 -165
- isa_model/serving/api/startup.py +304 -0
- isa_model/serving/modal_proxy_server.py +249 -0
- isa_model/training/__init__.py +100 -6
- isa_model/training/core/__init__.py +4 -1
- isa_model/training/examples/intelligent_training_example.py +281 -0
- isa_model/training/intelligent/__init__.py +25 -0
- isa_model/training/intelligent/decision_engine.py +643 -0
- isa_model/training/intelligent/intelligent_factory.py +888 -0
- isa_model/training/intelligent/knowledge_base.py +751 -0
- isa_model/training/intelligent/resource_optimizer.py +839 -0
- isa_model/training/intelligent/task_classifier.py +576 -0
- isa_model/training/storage/__init__.py +24 -0
- isa_model/training/storage/core_integration.py +439 -0
- isa_model/training/storage/training_repository.py +552 -0
- isa_model/training/storage/training_storage.py +628 -0
- {isa_model-0.3.9.dist-info → isa_model-0.4.0.dist-info}/METADATA +13 -1
- isa_model-0.4.0.dist-info/RECORD +182 -0
- isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
- isa_model/deployment/cloud/modal/register_models.py +0 -321
- isa_model/inference/adapter/unified_api.py +0 -248
- isa_model/inference/services/helpers/stacked_config.py +0 -148
- isa_model/inference/services/img/flux_professional_service.py +0 -603
- isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/others/table_transformer_service.py +0 -61
- isa_model/inference/services/vision/doc_analysis_service.py +0 -640
- isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/vision/ui_analysis_service.py +0 -823
- isa_model/scripts/inference_tracker.py +0 -283
- isa_model/scripts/mlflow_manager.py +0 -379
- isa_model/scripts/model_registry.py +0 -465
- isa_model/scripts/register_models.py +0 -370
- isa_model/scripts/register_models_with_embeddings.py +0 -510
- isa_model/scripts/start_mlflow.py +0 -95
- isa_model/scripts/training_tracker.py +0 -257
- isa_model-0.3.9.dist-info/RECORD +0 -138
- {isa_model-0.3.9.dist-info → isa_model-0.4.0.dist-info}/WHEEL +0 -0
- {isa_model-0.3.9.dist-info → isa_model-0.4.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,520 @@
|
|
1
|
+
"""
|
2
|
+
ISA ChatTTS Service
|
3
|
+
|
4
|
+
ChatTTS text-to-speech service optimized for dialogue scenarios
|
5
|
+
- High-quality Chinese and English speech synthesis
|
6
|
+
- Support for prosody control (laughter, pauses, etc.)
|
7
|
+
- Fast inference and deployment
|
8
|
+
- Professional dialogue scene optimization
|
9
|
+
"""
|
10
|
+
|
11
|
+
import modal
|
12
|
+
import time
|
13
|
+
import json
|
14
|
+
import os
|
15
|
+
import logging
|
16
|
+
import base64
|
17
|
+
import tempfile
|
18
|
+
import io
|
19
|
+
from typing import Dict, List, Optional, Any, Union
|
20
|
+
from pathlib import Path
|
21
|
+
import numpy as np
|
22
|
+
|
23
|
+
# Define Modal application
|
24
|
+
# Modal application that the service class and helper functions attach to.
app = modal.App("isa-audio-chatTTS")

# Define Modal container image with ChatTTS dependencies.
# The image is assembled one layer at a time: base OS, Python packages,
# system packages, then environment variables.
image = modal.Image.debian_slim(python_version="3.10")

# Python dependencies.
image = image.pip_install([
    "torch>=2.0.0",
    "torchaudio>=2.0.0",
    "transformers>=4.41.0",
    "accelerate>=0.26.0",
    "numpy>=1.24.0",
    "soundfile>=0.12.0",
    "librosa>=0.10.0",
    "scipy>=1.11.0",
    "omegaconf>=2.3.0",
    "hydra-core>=1.3.0",
    "pydantic>=2.0.0",
    "requests>=2.31.0",
    "httpx>=0.26.0",
    "python-dotenv>=1.0.0",
    "ChatTTS",  # ChatTTS main package
    "pyopenjtalk",  # For better text processing
    "pypinyin",  # For Chinese pronunciation
    "jieba",  # Chinese word segmentation
    "opencc-python-reimplemented",  # Chinese text conversion
    "vocos",  # Neural vocoder
    "vector-quantize-pytorch",  # Vector quantization
    "einops",  # Tensor operations
    "pydub",  # Audio processing
    "ffmpeg-python",  # Audio conversion
])

# System libraries for audio I/O and text front-ends.
image = image.apt_install([
    "ffmpeg",
    "libsndfile1",
    "libsox-dev",
    "sox",
    "espeak-ng",
    "libmecab-dev",
    "mecab-ipadic-utf8",
    "git-lfs"
])

# Cache locations and CUDA settings used inside the container.
image = image.env({
    "TRANSFORMERS_CACHE": "/models",
    "TORCH_HOME": "/models/torch",
    "HF_HOME": "/models",
    "CUDA_VISIBLE_DEVICES": "0",
    "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512"
})
|
73
|
+
|
74
|
+
# ChatTTS Service - Optimized for A10G GPU
|
75
|
+
@app.cls(
|
76
|
+
gpu="A10G", # 24GB A10G for ChatTTS
|
77
|
+
image=image,
|
78
|
+
memory=16384, # 16GB RAM
|
79
|
+
timeout=1800, # 30 minutes
|
80
|
+
scaledown_window=300, # 5 minutes idle timeout
|
81
|
+
min_containers=0, # Scale to zero
|
82
|
+
max_containers=8, # Support multiple concurrent requests
|
83
|
+
# secrets=[modal.Secret.from_name("huggingface-secret")], # Optional HF token
|
84
|
+
)
|
85
|
+
class ISAudioChatTTSService:
|
86
|
+
"""
|
87
|
+
ISA ChatTTS Service
|
88
|
+
|
89
|
+
ChatTTS text-to-speech service:
|
90
|
+
- Model: ChatTTS (2Noise/ChatTTS)
|
91
|
+
- Architecture: Transformer-based TTS
|
92
|
+
- Capabilities: Chinese/English TTS, prosody control, dialogue optimization
|
93
|
+
- Performance: Fast inference, high-quality output
|
94
|
+
"""
|
95
|
+
|
96
|
+
@modal.enter()
|
97
|
+
def load_models(self):
    """Load the ChatTTS model and run a one-sentence smoke test.

    Initializes the per-instance state (``chat_tts``, ``logger``,
    ``request_count``, ``total_processing_time``, ``models_loaded``,
    ``default_params``), loads the model weights from HuggingFace and
    synthesizes a short test sentence.

    ``models_loaded`` is True only when the smoke test produced audio.
    Any exception is printed with its traceback and leaves the instance
    with ``models_loaded = False`` and ``chat_tts = None``; nothing is
    re-raised.
    """
    print("Loading ChatTTS model...")
    start_time = time.time()

    # Initialize instance variables. models_loaded is set up front so the
    # attribute exists on every code path below.
    self.chat_tts = None
    self.logger = logging.getLogger(__name__)
    self.request_count = 0
    self.total_processing_time = 0.0
    self.models_loaded = False

    try:
        import torch  # noqa: F401 - kept so a missing torch fails here
        import ChatTTS

        # Initialize ChatTTS
        self.chat_tts = ChatTTS.Chat()

        # Load models - use HF models for better stability
        print("Loading ChatTTS models from HuggingFace...")
        self.chat_tts.load(
            compile=False,  # Disable compilation for compatibility
            source="huggingface"  # Use HuggingFace models
        )

        # Remove default params that might cause issues
        self.default_params = {}

        # Test model with a simple generation
        print("Testing ChatTTS model...")
        test_text = "Hello world, this is a test."
        test_audio = self.chat_tts.infer([test_text], use_decoder=True)

        # Explicit None/len check: plain truthiness raises ValueError when
        # the result is a multi-element numpy array.
        self.models_loaded = test_audio is not None and len(test_audio) > 0

        load_time = time.time() - start_time
        if self.models_loaded:
            print("ChatTTS model test successful")
            print(f"ChatTTS loaded successfully in {load_time:.2f}s")
        else:
            # Do not claim success when the smoke test produced no audio.
            print("ChatTTS model test failed")
            print(f"ChatTTS loaded in {load_time:.2f}s but produced no test audio")

    except Exception as e:
        print(f"ChatTTS loading failed: {e}")
        import traceback
        traceback.print_exc()
        self.models_loaded = False
        self.chat_tts = None
|
146
|
+
|
147
|
+
@modal.method()
|
148
|
+
def synthesize_speech(
    self,
    text: str,
    speaker_id: Optional[str] = None,
    language: str = "auto",
    speed: float = 1.0,
    temperature: float = 0.3,
    top_p: float = 0.7,
    top_k: int = 20,
    audio_seed: int = 2,
    text_seed: int = 42,
    enable_enhancement: bool = True,
    output_format: str = "wav"
) -> Dict[str, Any]:
    """
    Synthesize speech using ChatTTS

    Args:
        text: Text to synthesize
        speaker_id: Optional speaker ID. Any non-empty value makes the
            call use a sampled speaker embedding; the voice is selected
            by ``audio_seed`` (the ID's value itself is not interpreted).
        language: Language code ("zh", "en", "auto")
        speed: Speech speed multiplier (0.5-2.0)
        temperature: Sampling temperature (0.01-1.0)
        top_p: Top-p sampling (0.1-1.0)
        top_k: Top-k sampling (1-100)
        audio_seed: Seed applied before sampling the speaker embedding
        text_seed: Seed applied before text refinement / generation
        enable_enhancement: Enable audio enhancement
        output_format: Output format ("wav", "mp3", "flac")

    Returns:
        A result dict. On success ``success`` is True and the dict holds
        the base64 audio (``audio_b64``), the echoed parameters, audio
        info, billing and model info. On any failure ``success`` is False
        and ``error`` holds the exception message; exceptions are never
        propagated to the caller.
    """
    start_time = time.time()
    self.request_count += 1

    def _billing(gpu_seconds: float) -> Dict[str, Any]:
        # Shared by the success and error results (A10G GPU: ~$1.20/hour).
        return {
            'request_id': f"tts_{self.request_count}_{int(time.time())}",
            'gpu_seconds': gpu_seconds,
            'estimated_cost_usd': round((gpu_seconds / 3600) * 1.20, 4),
            'gpu_type': 'A10G'
        }

    try:
        # Validate model loading status
        if not self.models_loaded or not self.chat_tts:
            raise RuntimeError("ChatTTS model not loaded")

        # Validate input parameters
        if not text or not text.strip():
            raise ValueError("Text cannot be empty")

        # Preprocess text
        processed_text = self._preprocess_text(text, language)

        print(f"Synthesizing: '{processed_text[:50]}...'")

        # Generate speech using correct ChatTTS API
        import torch
        import ChatTTS

        with torch.no_grad():
            # Sample a speaker embedding only when a speaker_id is given.
            # Seeding first makes the same audio_seed yield the same voice;
            # previously audio_seed was accepted but never applied.
            if speaker_id:
                torch.manual_seed(audio_seed)
                spk_emb = self.chat_tts.sample_random_speaker()
            else:
                spk_emb = None

            # Configure inference parameters using correct API
            params_infer_code = ChatTTS.Chat.InferCodeParams(
                spk_emb=spk_emb,
                temperature=temperature,
                top_P=top_p,
                top_K=top_k
            )

            # Configure text refinement parameters
            params_refine_text = ChatTTS.Chat.RefineTextParams(
                prompt='[oral_2][laugh_0][break_4]'  # Default prosody control
            )

            # Apply text_seed before generation (it was ignored before).
            torch.manual_seed(text_seed)

            # Generate audio with proper parameters
            audio_data = self.chat_tts.infer(
                [processed_text],
                use_decoder=True,
                params_infer_code=params_infer_code,
                params_refine_text=params_refine_text
            )

        # Explicit None/len check: `not audio_data` raises ValueError when
        # the result is a multi-element numpy array.
        if audio_data is None or len(audio_data) == 0:
            raise RuntimeError("Speech synthesis failed - no audio generated")

        # Process audio output: first (and only) utterance, flattened to
        # 1-D mono so len() counts samples.
        # NOTE(review): assumes infer() yields numpy audio, possibly shaped
        # (1, N) depending on the ChatTTS version - confirm.
        audio_array = np.asarray(audio_data[0]).reshape(-1)

        # Apply speed adjustment
        if speed != 1.0:
            audio_array = self._adjust_speed(audio_array, speed)

        # Apply enhancement if enabled
        if enable_enhancement:
            audio_array = self._enhance_audio(audio_array)

        # Convert to desired format and encode
        audio_b64 = self._encode_audio(audio_array, output_format)
        if not audio_b64:
            # _encode_audio returns "" on failure; do not report success
            # with an empty payload.
            raise RuntimeError(
                f"Audio encoding failed for output format '{output_format}'"
            )

        processing_time = time.time() - start_time
        self.total_processing_time += processing_time

        # Calculate audio metrics
        sample_rate = 24000  # ChatTTS default sample rate
        duration = len(audio_array) / sample_rate

        result = {
            'success': True,
            'service': 'isa-audio-chatTTS',
            'operation': 'speech_synthesis',
            'provider': 'ISA',
            'audio_b64': audio_b64,
            'text': text,
            'processed_text': processed_text,
            'model': 'ChatTTS',
            'architecture': 'Transformer-based TTS',
            'parameters': {
                'speaker_id': speaker_id,
                'language': language,
                'speed': speed,
                'temperature': temperature,
                'top_p': top_p,
                'top_k': top_k,
                'audio_seed': audio_seed,
                'text_seed': text_seed,
                'enhancement': enable_enhancement,
                'output_format': output_format
            },
            'audio_info': {
                'sample_rate': sample_rate,
                'duration': round(duration, 2),
                'channels': 1,
                'format': output_format,
                'quality': 'high'
            },
            'processing_time': processing_time,
            'billing': _billing(processing_time),
            'model_info': {
                'model_name': 'ChatTTS',
                'provider': 'ISA',
                'architecture': 'Transformer-based TTS',
                'specialization': 'dialogue_optimized',
                'gpu': 'A10G',
                'capabilities': ['chinese_tts', 'english_tts', 'prosody_control', 'dialogue_tts'],
                'container_id': os.environ.get('MODAL_TASK_ID', 'unknown')
            }
        }

        # Output JSON results
        print("=== JSON_RESULT_START ===")
        print(json.dumps(result, default=str, ensure_ascii=False))
        print("=== JSON_RESULT_END ===")

        return result

    except Exception as e:
        # Every failure is converted into a structured error result.
        processing_time = time.time() - start_time
        error_result = {
            'success': False,
            'service': 'isa-audio-chatTTS',
            'operation': 'speech_synthesis',
            'provider': 'ISA',
            'error': str(e),
            'text': text,
            'processing_time': processing_time,
            'billing': _billing(processing_time)
        }

        print("=== JSON_RESULT_START ===")
        print(json.dumps(error_result, default=str, ensure_ascii=False))
        print("=== JSON_RESULT_END ===")

        return error_result
|
334
|
+
|
335
|
+
@modal.method()
|
336
|
+
def health_check(self) -> Dict[str, Any]:
    """Health check endpoint.

    Returns:
        A dict describing the service. ``status`` is ``'healthy'`` only
        when the model finished loading (``self.models_loaded`` is true)
        and ``'unhealthy'`` otherwise, so a container whose model failed
        to load no longer reports itself as healthy. The remaining fields
        are static service metadata plus the current timestamp and the
        per-container request count.
    """
    models_loaded = self.models_loaded
    return {
        'status': 'healthy' if models_loaded else 'unhealthy',
        'service': 'isa-audio-chatTTS',
        'provider': 'ISA',
        'models_loaded': models_loaded,
        'model': 'ChatTTS',
        'architecture': 'Transformer-based TTS',
        'timestamp': time.time(),
        'gpu': 'A10G',
        'memory_usage': '16GB',
        'request_count': self.request_count,
        'capabilities': ['chinese_tts', 'english_tts', 'prosody_control', 'dialogue_tts']
    }
|
351
|
+
|
352
|
+
# ==================== UTILITY METHODS ====================
|
353
|
+
|
354
|
+
def _preprocess_text(self, text: str, language: str) -> str:
|
355
|
+
"""Preprocess text for TTS"""
|
356
|
+
# Basic text cleaning
|
357
|
+
text = text.strip()
|
358
|
+
|
359
|
+
# Language-specific preprocessing
|
360
|
+
if language == "zh" or self._is_chinese(text):
|
361
|
+
return self._preprocess_chinese(text)
|
362
|
+
elif language == "en" or self._is_english(text):
|
363
|
+
return self._preprocess_english(text)
|
364
|
+
else:
|
365
|
+
# Auto-detect and process
|
366
|
+
if self._is_chinese(text):
|
367
|
+
return self._preprocess_chinese(text)
|
368
|
+
else:
|
369
|
+
return self._preprocess_english(text)
|
370
|
+
|
371
|
+
def _preprocess_chinese(self, text: str) -> str:
|
372
|
+
"""Preprocess Chinese text"""
|
373
|
+
try:
|
374
|
+
# Traditional to Simplified conversion
|
375
|
+
from opencc import OpenCC
|
376
|
+
cc = OpenCC('t2s')
|
377
|
+
text = cc.convert(text)
|
378
|
+
return text
|
379
|
+
except:
|
380
|
+
return text
|
381
|
+
|
382
|
+
def _preprocess_english(self, text: str) -> str:
|
383
|
+
"""Preprocess English text"""
|
384
|
+
# Basic normalization
|
385
|
+
text = text.replace('&', ' and ')
|
386
|
+
text = text.replace('@', ' at ')
|
387
|
+
text = text.replace('#', ' number ')
|
388
|
+
text = text.replace('%', ' percent ')
|
389
|
+
return text
|
390
|
+
|
391
|
+
def _is_chinese(self, text: str) -> bool:
|
392
|
+
"""Check if text contains Chinese characters"""
|
393
|
+
for char in text:
|
394
|
+
if '\u4e00' <= char <= '\u9fff':
|
395
|
+
return True
|
396
|
+
return False
|
397
|
+
|
398
|
+
def _is_english(self, text: str) -> bool:
|
399
|
+
"""Check if text is primarily English"""
|
400
|
+
english_chars = sum(1 for char in text if char.isalpha() and ord(char) < 128)
|
401
|
+
total_chars = sum(1 for char in text if char.isalpha())
|
402
|
+
return total_chars > 0 and english_chars / total_chars > 0.8
|
403
|
+
|
404
|
+
def _get_speaker_embedding(self, speaker_id: Optional[str]) -> Optional[Any]:
|
405
|
+
"""Get speaker embedding for voice consistency"""
|
406
|
+
if not speaker_id:
|
407
|
+
return None
|
408
|
+
|
409
|
+
try:
|
410
|
+
import torch
|
411
|
+
# Sample a random speaker embedding
|
412
|
+
rand_spk = self.chat_tts.sample_random_speaker()
|
413
|
+
return rand_spk
|
414
|
+
except Exception as e:
|
415
|
+
print(f"Speaker embedding error: {e}")
|
416
|
+
return None
|
417
|
+
|
418
|
+
def _adjust_speed(self, audio: np.ndarray, speed: float) -> np.ndarray:
|
419
|
+
"""Adjust audio speed"""
|
420
|
+
try:
|
421
|
+
import librosa
|
422
|
+
return librosa.effects.time_stretch(audio, rate=speed)
|
423
|
+
except:
|
424
|
+
return audio
|
425
|
+
|
426
|
+
def _enhance_audio(self, audio: np.ndarray) -> np.ndarray:
|
427
|
+
"""Apply audio enhancement"""
|
428
|
+
try:
|
429
|
+
import scipy.signal
|
430
|
+
# Simple audio enhancement
|
431
|
+
audio = scipy.signal.wiener(audio)
|
432
|
+
audio = audio / np.max(np.abs(audio))
|
433
|
+
return audio
|
434
|
+
except:
|
435
|
+
return audio
|
436
|
+
|
437
|
+
def _encode_audio(self, audio: np.ndarray, format: str) -> str:
|
438
|
+
"""Encode audio to base64"""
|
439
|
+
try:
|
440
|
+
import soundfile as sf
|
441
|
+
import io
|
442
|
+
|
443
|
+
# Convert to 16-bit PCM
|
444
|
+
audio_int16 = (audio * 32767).astype(np.int16)
|
445
|
+
|
446
|
+
# Save to bytes
|
447
|
+
buffer = io.BytesIO()
|
448
|
+
sf.write(buffer, audio_int16, 24000, format=format.upper())
|
449
|
+
buffer.seek(0)
|
450
|
+
|
451
|
+
# Encode to base64
|
452
|
+
audio_b64 = base64.b64encode(buffer.read()).decode('utf-8')
|
453
|
+
return audio_b64
|
454
|
+
|
455
|
+
except Exception as e:
|
456
|
+
print(f"Audio encoding error: {e}")
|
457
|
+
return ""
|
458
|
+
|
459
|
+
# Deployment functions
|
460
|
+
@app.function()
|
461
|
+
def deploy_info():
    """Deployment information.

    Returns:
        A dict with static service metadata (name, version, description,
        model, architecture, GPU type, capabilities) plus
        ``deployment_time``, the current time as returned by
        ``time.time()``.
    """
    info = {
        'service': 'isa-audio-chatTTS',
        'version': '1.0.0',
        'description': 'ISA ChatTTS service - Dialogue-optimized TTS',
        'model': 'ChatTTS',
        'architecture': 'Transformer-based TTS',
        'gpu': 'A10G',
        'capabilities': ['chinese_tts', 'english_tts', 'prosody_control', 'dialogue_tts'],
    }
    # Stamped last so key order matches the static fields above.
    info['deployment_time'] = time.time()
    return info
|
473
|
+
|
474
|
+
@app.function()
|
475
|
+
def register_service():
    """Register service to model repository.

    Builds the service description and passes it to
    ``ModelRepository.register_model``.

    Returns:
        ``{'status': 'registered'}`` on success, or
        ``{'status': 'failed', 'error': <message>}`` when the import or
        the registration raises; the exception is printed, not re-raised.
    """
    # Description of the ChatTTS service handed to the repository.
    service_spec = {
        'model_id': 'isa-chatTTS-service',
        'model_type': 'audio',
        'provider': 'isa',
        'endpoint': 'https://isa-audio-chatTTS.modal.run',
        'capabilities': ['chinese_tts', 'english_tts', 'prosody_control', 'dialogue_tts'],
        'pricing': {'gpu_type': 'A10G', 'cost_per_hour': 1.20},
        'metadata': {
            'model': 'ChatTTS',
            'architecture': 'Transformer-based TTS',
            'specialization': 'dialogue_optimized',
            'languages': ['zh', 'en'],
            'sample_rate': 24000,
            'max_text_length': 1000
        }
    }

    try:
        from isa_model.core.models.model_repo import ModelRepository

        # Register ChatTTS service
        ModelRepository().register_model(service_spec)

        print("ChatTTS service registered successfully")
        return {'status': 'registered'}

    except Exception as exc:
        print(f"Service registration failed: {exc}")
        return {'status': 'failed', 'error': str(exc)}
|
506
|
+
|
507
|
+
if __name__ == "__main__":
    # Running the file directly only prints deployment and usage hints;
    # it does not deploy anything.
    usage_lines = (
        "ISA ChatTTS Service - Modal Deployment",
        "Deploy with: modal deploy isa_audio_chatTTS_service.py",
        "",
        "Model: ChatTTS",
        "Architecture: Transformer-based TTS",
        "Capabilities: Chinese/English TTS, prosody control, dialogue optimization",
        "GPU: A10G (24GB)",
        "",
        "Usage:",
        "# Speech synthesis",
        "service.synthesize_speech('Hello world!', language='en')",
        "# Health check",
        "service.health_check()",
    )
    for usage_line in usage_lines:
        print(usage_line)
|
File without changes
|