langchain-camb 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langchain_camb/__init__.py +81 -0
- langchain_camb/toolkits/__init__.py +5 -0
- langchain_camb/toolkits/camb_toolkit.py +148 -0
- langchain_camb/tools/__init__.py +40 -0
- langchain_camb/tools/audio_separation.py +189 -0
- langchain_camb/tools/base.py +161 -0
- langchain_camb/tools/text_to_sound.py +156 -0
- langchain_camb/tools/transcription.py +189 -0
- langchain_camb/tools/translated_tts.py +340 -0
- langchain_camb/tools/translation.py +150 -0
- langchain_camb/tools/tts.py +182 -0
- langchain_camb/tools/voice_clone.py +152 -0
- langchain_camb/tools/voice_list.py +108 -0
- langchain_camb/version.py +3 -0
- langchain_camb-0.1.0.dist-info/METADATA +307 -0
- langchain_camb-0.1.0.dist-info/RECORD +17 -0
- langchain_camb-0.1.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""Text-to-Sound tool for CAMB AI."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import base64
|
|
6
|
+
import tempfile
|
|
7
|
+
from typing import Any, Literal, Optional, Type, Union
|
|
8
|
+
|
|
9
|
+
from langchain_core.callbacks import (
|
|
10
|
+
AsyncCallbackManagerForToolRun,
|
|
11
|
+
CallbackManagerForToolRun,
|
|
12
|
+
)
|
|
13
|
+
from pydantic import BaseModel, Field
|
|
14
|
+
|
|
15
|
+
from langchain_camb.tools.base import CambBaseTool
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class TextToSoundInput(BaseModel):
    """Arguments accepted by the text-to-sound tool."""

    # Required free-text description of the desired audio.
    prompt: str = Field(..., description="Description of the sound or music to generate.")
    # Optional length of the generated clip.
    duration: Optional[float] = Field(default=None, description="Duration of the audio in seconds.")
    # Optional kind of audio to generate.
    audio_type: Optional[Literal["music", "sound"]] = Field(default=None, description="Type of audio: 'music' or 'sound'.")
    # How the resulting audio is returned to the caller.
    output_format: Literal["file_path", "base64"] = Field(default="file_path", description="Output format: 'file_path' or 'base64'.")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class CambTextToSoundTool(CambBaseTool):
    """Tool for generating sounds and music from text using CAMB AI.

    This tool creates audio from text descriptions. It can generate
    music, sound effects, or ambient soundscapes.

    Example:
        ```python
        from langchain_camb import CambTextToSoundTool

        sound_gen = CambTextToSoundTool()
        result = sound_gen.invoke({
            "prompt": "Upbeat electronic music with a driving beat",
            "duration": 30,
            "audio_type": "music"
        })
        print(result)  # File path to generated audio
        ```
    """

    name: str = "camb_text_to_sound"
    # The advertised audio types must match TextToSoundInput.audio_type
    # (Literal["music", "sound"]); the previous text offered "sound_effect"
    # and "ambient", values the schema would reject.
    description: str = (
        "Generate sounds, music, or soundscapes from text descriptions using CAMB AI. "
        "Describe the audio you want and optionally specify duration and type "
        "('music' or 'sound'). Returns audio file."
    )
    args_schema: Type[BaseModel] = TextToSoundInput

    def _build_request_kwargs(
        self,
        prompt: str,
        duration: Optional[float],
        audio_type: Optional[str],
    ) -> dict[str, Any]:
        """Assemble the create_text_to_audio payload, omitting unset options."""
        kwargs: dict[str, Any] = {"prompt": prompt}
        # Explicit None checks so falsy-but-set values (e.g. 0) are not dropped.
        if duration is not None:
            kwargs["duration"] = duration
        if audio_type is not None:
            kwargs["audio_type"] = audio_type
        return kwargs

    def _run(
        self,
        prompt: str,
        duration: Optional[float] = None,
        audio_type: Optional[str] = None,
        output_format: str = "file_path",
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> str:
        """Generate sound synchronously.

        Args:
            prompt: Description of the audio to generate.
            duration: Optional length of the audio in seconds.
            audio_type: Optional kind of audio ('music' or 'sound').
            output_format: 'file_path' or 'base64'.
            run_manager: LangChain callback manager (unused here).

        Returns:
            File path or base64 encoded audio.
        """
        kwargs = self._build_request_kwargs(prompt, duration, audio_type)

        # Create the generation task, then poll until the backend completes it.
        result = self.sync_client.text_to_audio.create_text_to_audio(**kwargs)
        status = self._poll_task_status_sync(
            self.sync_client.text_to_audio.get_text_to_audio_status,
            result.task_id,
        )

        # The result endpoint streams audio in chunks; collect them all.
        audio_chunks: list[bytes] = []
        for chunk in self.sync_client.text_to_audio.get_text_to_audio_result(
            status.run_id
        ):
            audio_chunks.append(chunk)

        return self._format_output(b"".join(audio_chunks), output_format)

    async def _arun(
        self,
        prompt: str,
        duration: Optional[float] = None,
        audio_type: Optional[str] = None,
        output_format: str = "file_path",
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
    ) -> str:
        """Generate sound asynchronously. See ``_run`` for parameter details.

        Returns:
            File path or base64 encoded audio.
        """
        kwargs = self._build_request_kwargs(prompt, duration, audio_type)

        # Create the generation task, then poll until the backend completes it.
        result = await self.async_client.text_to_audio.create_text_to_audio(**kwargs)
        status = await self._poll_task_status(
            self.async_client.text_to_audio.get_text_to_audio_status,
            result.task_id,
        )

        # The result endpoint streams audio in chunks; collect them all.
        audio_chunks: list[bytes] = []
        async for chunk in self.async_client.text_to_audio.get_text_to_audio_result(
            status.run_id
        ):
            audio_chunks.append(chunk)

        return self._format_output(b"".join(audio_chunks), output_format)

    def _format_output(self, audio_data: bytes, output_format: str) -> str:
        """Return audio as base64 text or as the path of a temp .wav file.

        The temp file is created with delete=False so it survives this call;
        the caller is responsible for cleaning it up.
        """
        if output_format == "base64":
            return base64.b64encode(audio_data).decode("utf-8")
        # file_path: persist to a named temporary file and hand back the path.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(audio_data)
            return f.name
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
"""Transcription tool for CAMB AI."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Any, Optional, Type
|
|
7
|
+
|
|
8
|
+
from langchain_core.callbacks import (
|
|
9
|
+
AsyncCallbackManagerForToolRun,
|
|
10
|
+
CallbackManagerForToolRun,
|
|
11
|
+
)
|
|
12
|
+
from pydantic import BaseModel, Field, model_validator
|
|
13
|
+
|
|
14
|
+
from langchain_camb.tools.base import CambBaseTool
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class TranscriptionInput(BaseModel):
    """Arguments accepted by the transcription tool."""

    # Integer language code of the spoken audio.
    language: int = Field(
        ...,
        description="Language code (integer) for the audio. Common codes: 1=English, 2=Spanish, 3=French, 4=German, 5=Italian.",
    )
    # Remote audio source; mutually exclusive with audio_file_path.
    audio_url: Optional[str] = Field(
        default=None,
        description="URL of the audio file to transcribe. Provide either audio_url or audio_file_path.",
    )
    # Local audio source; mutually exclusive with audio_url.
    audio_file_path: Optional[str] = Field(
        default=None,
        description="Local file path to the audio file. Provide either audio_url or audio_file_path.",
    )

    @model_validator(mode="after")
    def validate_audio_source(self) -> "TranscriptionInput":
        """Reject inputs that supply neither or both audio sources."""
        has_url = bool(self.audio_url)
        has_file = bool(self.audio_file_path)
        if not has_url and not has_file:
            raise ValueError("Either audio_url or audio_file_path must be provided.")
        if has_url and has_file:
            raise ValueError(
                "Provide only one of audio_url or audio_file_path, not both."
            )
        return self
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class CambTranscriptionTool(CambBaseTool):
    """Tool for transcribing audio using CAMB AI.

    This tool converts speech to text with speaker identification.
    Supports audio URLs or local files. Returns transcription with
    segments and speaker information.

    Example:
        ```python
        from langchain_camb import CambTranscriptionTool

        transcriber = CambTranscriptionTool()
        result = transcriber.invoke({
            "audio_url": "https://example.com/audio.mp3",
            "language": 1  # English
        })
        print(result)  # JSON with text, segments, speakers
        ```
    """

    name: str = "camb_transcription"
    description: str = (
        "Transcribe audio to text using CAMB AI. "
        "Supports audio URLs or local files. "
        "Returns transcription with segments and speaker identification. "
        "Provide language code (1=English, 2=Spanish, etc.) and audio source."
    )
    args_schema: Type[BaseModel] = TranscriptionInput

    def _run(
        self,
        language: int,
        audio_url: Optional[str] = None,
        audio_file_path: Optional[str] = None,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> str:
        """Transcribe audio synchronously.

        Args:
            language: Integer language code of the audio.
            audio_url: Remote audio source (exclusive with audio_file_path).
            audio_file_path: Local audio source (exclusive with audio_url).
            run_manager: LangChain callback manager (unused here).

        Returns:
            JSON string with text, segments, and speakers.

        Raises:
            ValueError: If neither audio source is given.
        """
        kwargs: dict[str, Any] = {"language": language}

        # Create the transcription task. For local files the handle must stay
        # open while the request is made, hence the call inside the `with`.
        if audio_url:
            kwargs["audio_url"] = audio_url
            result = self.sync_client.transcription.create_transcription(**kwargs)
        elif audio_file_path:
            with open(audio_file_path, "rb") as f:
                kwargs["media_file"] = f
                result = self.sync_client.transcription.create_transcription(**kwargs)
        else:
            raise ValueError("No audio source provided")

        # Poll for completion; the final status carries the run_id needed to
        # fetch the transcription itself.
        status = self._poll_task_status_sync(
            self.sync_client.transcription.get_transcription_task_status,
            result.task_id,
        )

        transcription = self.sync_client.transcription.get_transcription_result(
            status.run_id
        )
        return self._format_result(transcription)

    async def _arun(
        self,
        language: int,
        audio_url: Optional[str] = None,
        audio_file_path: Optional[str] = None,
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
    ) -> str:
        """Transcribe audio asynchronously. See ``_run`` for parameter details.

        Returns:
            JSON string with text, segments, and speakers.

        Raises:
            ValueError: If neither audio source is given.
        """
        kwargs: dict[str, Any] = {"language": language}

        # Create the transcription task. For local files the handle must stay
        # open while the request is made, hence the call inside the `with`.
        if audio_url:
            kwargs["audio_url"] = audio_url
            result = await self.async_client.transcription.create_transcription(
                **kwargs
            )
        elif audio_file_path:
            with open(audio_file_path, "rb") as f:
                kwargs["media_file"] = f
                result = await self.async_client.transcription.create_transcription(
                    **kwargs
                )
        else:
            raise ValueError("No audio source provided")

        # Poll for completion; the final status carries the run_id needed to
        # fetch the transcription itself.
        status = await self._poll_task_status(
            self.async_client.transcription.get_transcription_task_status,
            result.task_id,
        )

        transcription = await self.async_client.transcription.get_transcription_result(
            status.run_id
        )
        return self._format_result(transcription)

    def _format_result(self, transcription: Any) -> str:
        """Format transcription result as a pretty-printed JSON string.

        The JSON object always has 'text', 'segments', and 'speakers' keys;
        attributes missing on the SDK object degrade to empty defaults.
        """
        result: dict[str, Any] = {
            "text": getattr(transcription, "text", ""),
            "segments": [],
            "speakers": [],
        }

        # Flatten SDK segment objects into plain dicts.
        if hasattr(transcription, "segments"):
            for seg in transcription.segments:
                result["segments"].append(
                    {
                        "start": getattr(seg, "start", 0),
                        "end": getattr(seg, "end", 0),
                        "text": getattr(seg, "text", ""),
                        "speaker": getattr(seg, "speaker", None),
                    }
                )

        # Prefer an explicit speaker list; otherwise derive it from segments.
        if hasattr(transcription, "speakers"):
            result["speakers"] = list(transcription.speakers)
        elif result["segments"]:
            speakers = set()
            for seg in result["segments"]:
                if seg.get("speaker"):
                    speakers.add(seg["speaker"])
            result["speakers"] = list(speakers)

        return json.dumps(result, indent=2)
|
|
@@ -0,0 +1,340 @@
|
|
|
1
|
+
"""Translated TTS tool for CAMB AI."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import base64
|
|
6
|
+
import tempfile
|
|
7
|
+
from typing import Any, Literal, Optional, Type, Union
|
|
8
|
+
|
|
9
|
+
from langchain_core.callbacks import (
|
|
10
|
+
AsyncCallbackManagerForToolRun,
|
|
11
|
+
CallbackManagerForToolRun,
|
|
12
|
+
)
|
|
13
|
+
from pydantic import BaseModel, Field
|
|
14
|
+
|
|
15
|
+
from langchain_camb.tools.base import CambBaseTool
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class TranslatedTTSInput(BaseModel):
    """Arguments accepted by the translated-TTS tool."""

    # Text in the source language to translate and speak.
    text: str = Field(..., description="Text to translate and convert to speech.")
    # Integer code of the language the text is written in.
    source_language: int = Field(..., description="Source language code (integer). Common codes: 1=English, 2=Spanish, 3=French.")
    # Integer code of the language the speech should be produced in.
    target_language: int = Field(..., description="Target language code (integer) for the output speech.")
    # Which CAMB voice renders the speech.
    voice_id: int = Field(default=147320, description="Voice ID for TTS. Get available voices with CambVoiceListTool.")
    # How the resulting audio is returned to the caller.
    output_format: Literal["file_path", "base64"] = Field(default="file_path", description="Output format: 'file_path' or 'base64'.")
    # Optional register of the translation.
    formality: Optional[int] = Field(default=None, description="Translation formality: 1=formal, 2=informal.")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class CambTranslatedTTSTool(CambBaseTool):
    """Tool for translating text and converting to speech using CAMB AI.

    This tool combines translation and TTS in a single operation.
    It translates text to the target language and generates speech.

    Example:
        ```python
        from langchain_camb import CambTranslatedTTSTool

        translated_tts = CambTranslatedTTSTool()
        result = translated_tts.invoke({
            "text": "Hello, how are you?",
            "source_language": 1,  # English
            "target_language": 2,  # Spanish
            "voice_id": 147320
        })
        print(result)  # File path to Spanish audio
        ```
    """

    name: str = "camb_translated_tts"
    description: str = (
        "Translate text and convert to speech in one step. "
        "Provide source text, source language, target language, and voice ID. "
        "Returns audio file of the translated text spoken in the target language."
    )
    args_schema: Type[BaseModel] = TranslatedTTSInput

    def _run(
        self,
        text: str,
        source_language: int,
        target_language: int,
        voice_id: int = 147320,
        output_format: str = "file_path",
        formality: Optional[int] = None,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> str:
        """Translate and convert to speech synchronously.

        Args:
            text: Source-language text to translate and speak.
            source_language: Integer code of the input language.
            target_language: Integer code of the output language.
            voice_id: CAMB voice to render the speech with.
            output_format: 'file_path' or 'base64'.
            formality: Optional translation formality (1=formal, 2=informal).
            run_manager: LangChain callback manager (unused here).

        Returns:
            File path or base64 encoded audio.
        """
        kwargs: dict[str, Any] = {
            "text": text,
            "voice_id": voice_id,
            "source_language": source_language,
            "target_language": target_language,
        }
        # Explicit None check so a falsy-but-set formality is not dropped.
        if formality is not None:
            kwargs["formality"] = formality

        # Create the translated-TTS task and poll until it completes.
        result = self.sync_client.translated_tts.create_translated_tts(**kwargs)
        status = self._poll_task_status_sync(
            self.sync_client.translated_tts.get_translated_tts_task_status,
            result.task_id,
        )

        # Fetch the audio via run_id (or a URL embedded in the status message).
        audio_data, audio_format = self._get_audio_from_status(status)
        return self._format_output(audio_data, output_format, audio_format)

    async def _arun(
        self,
        text: str,
        source_language: int,
        target_language: int,
        voice_id: int = 147320,
        output_format: str = "file_path",
        formality: Optional[int] = None,
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
    ) -> str:
        """Translate and convert to speech asynchronously. See ``_run`` for details.

        Returns:
            File path or base64 encoded audio.
        """
        kwargs: dict[str, Any] = {
            "text": text,
            "voice_id": voice_id,
            "source_language": source_language,
            "target_language": target_language,
        }
        # Explicit None check so a falsy-but-set formality is not dropped.
        if formality is not None:
            kwargs["formality"] = formality

        # Create the translated-TTS task and poll until it completes.
        result = await self.async_client.translated_tts.create_translated_tts(**kwargs)
        status = await self._poll_task_status(
            self.async_client.translated_tts.get_translated_tts_task_status,
            result.task_id,
        )

        # Fetch the audio via run_id (or a URL embedded in the status message).
        audio_data, audio_format = await self._get_audio_from_status_async(status)
        return self._format_output(audio_data, output_format, audio_format)

    def _result_endpoint(self, client: Any, run_id: Any) -> str:
        """Build the tts-result URL for run_id from the SDK's configured base URL.

        Falls back to the public CAMB endpoint when the SDK wrapper does not
        expose a base URL (``_client_wrapper`` is a private attribute and may
        change between SDK versions).
        """
        wrapper = getattr(client, "_client_wrapper", None)
        if wrapper is not None and hasattr(wrapper, "base_url"):
            return f"{wrapper.base_url}/tts-result/{run_id}"
        return f"https://client.camb.ai/apis/tts-result/{run_id}"

    @staticmethod
    def _url_from_message(message: Any) -> Optional[str]:
        """Extract an audio download URL from a task status message, if any."""
        if isinstance(message, dict):
            return (
                message.get("output_url")
                or message.get("audio_url")
                or message.get("url")
            )
        if isinstance(message, str) and message.startswith("http"):
            return message
        return None

    def _get_audio_from_status(self, status: Any) -> tuple[bytes, str]:
        """Extract audio from a status response.

        Tries the tts-result endpoint keyed by run_id first, then any URL
        embedded in the status message.

        Returns:
            Tuple of (audio_data, detected_format) where format is 'wav',
            'mp3', 'flac', 'ogg', or 'pcm'. Empty bytes with 'pcm' means no
            audio could be retrieved.
        """
        import httpx

        run_id = getattr(status, "run_id", None)
        if run_id:
            result_url = self._result_endpoint(self.sync_client, run_id)
            with httpx.Client() as client:
                response = client.get(
                    result_url,
                    headers={"x-api-key": self.api_key},
                )
                if response.status_code == 200:
                    audio_data = response.content
                    audio_format = self._detect_audio_format(
                        audio_data, response.headers.get("content-type", "")
                    )
                    return audio_data, audio_format

        # Fallback: the status message may carry a direct download URL.
        url = self._url_from_message(getattr(status, "message", None))
        if url:
            with httpx.Client() as client:
                response = client.get(url)
                audio_data = response.content
                audio_format = self._detect_audio_format(
                    audio_data, response.headers.get("content-type", "")
                )
                return audio_data, audio_format

        return b"", "pcm"

    async def _get_audio_from_status_async(self, status: Any) -> tuple[bytes, str]:
        """Async counterpart of ``_get_audio_from_status``.

        Returns:
            Tuple of (audio_data, detected_format) where format is 'wav',
            'mp3', 'flac', 'ogg', or 'pcm'. Empty bytes with 'pcm' means no
            audio could be retrieved.
        """
        import httpx

        run_id = getattr(status, "run_id", None)
        if run_id:
            result_url = self._result_endpoint(self.async_client, run_id)
            async with httpx.AsyncClient() as client:
                response = await client.get(
                    result_url,
                    headers={"x-api-key": self.api_key},
                )
                if response.status_code == 200:
                    audio_data = response.content
                    audio_format = self._detect_audio_format(
                        audio_data, response.headers.get("content-type", "")
                    )
                    return audio_data, audio_format

        # Fallback: the status message may carry a direct download URL.
        url = self._url_from_message(getattr(status, "message", None))
        if url:
            async with httpx.AsyncClient() as client:
                response = await client.get(url)
                audio_data = response.content
                audio_format = self._detect_audio_format(
                    audio_data, response.headers.get("content-type", "")
                )
                return audio_data, audio_format

        return b"", "pcm"

    def _detect_audio_format(self, audio_data: bytes, content_type: str) -> str:
        """Detect audio format from data bytes and content-type header.

        Returns:
            Detected format: 'wav', 'mp3', 'flac', 'ogg', or 'pcm' (raw).
        """
        # Magic bytes are the most reliable signal; check them first.
        if audio_data.startswith(b"RIFF"):
            return "wav"
        if audio_data.startswith(b"ID3"):
            return "mp3"
        # MPEG audio frame sync: 0xFF then a byte with the top three bits set.
        # Covers 0xFB/0xFA as before, plus 0xF3/0xF2 MPEG-2 frames.
        if len(audio_data) >= 2 and audio_data[0] == 0xFF and audio_data[1] & 0xE0 == 0xE0:
            return "mp3"
        if audio_data.startswith(b"fLaC"):
            return "flac"
        if audio_data.startswith(b"OggS"):
            return "ogg"

        # Fall back to the content-type header.
        content_type = content_type.lower()
        if "wav" in content_type or "wave" in content_type:
            return "wav"
        if "mpeg" in content_type or "mp3" in content_type:
            return "mp3"
        if "flac" in content_type:
            return "flac"
        if "ogg" in content_type:
            return "ogg"

        # Unknown: assume raw PCM so a WAV header gets added downstream.
        return "pcm"

    def _format_output(self, audio_data: bytes, output_format: str, audio_format: str = "pcm") -> str:
        """Format audio data according to output_format.

        Args:
            audio_data: Raw audio bytes.
            output_format: Desired output format ('file_path' or 'base64').
            audio_format: Detected audio format ('wav', 'mp3', 'flac', 'ogg', or 'pcm').

        Returns:
            Base64 text, or the path of a temp file (delete=False; caller
            is responsible for cleanup).
        """
        # Raw PCM gets a WAV header so ordinary players can interpret it.
        if audio_format == "pcm" and audio_data:
            audio_data = self._add_wav_header(audio_data)
            audio_format = "wav"

        ext_map = {"wav": ".wav", "mp3": ".mp3", "flac": ".flac", "ogg": ".ogg", "pcm": ".wav"}
        extension = ext_map.get(audio_format, ".wav")

        if output_format == "base64":
            return base64.b64encode(audio_data).decode("utf-8")
        # file_path: persist to a named temporary file and hand back the path.
        with tempfile.NamedTemporaryFile(suffix=extension, delete=False) as f:
            f.write(audio_data)
            return f.name

    def _add_wav_header(self, pcm_data: bytes) -> bytes:
        """Add WAV header to raw PCM data (16-bit, 24kHz, mono).

        NOTE(review): the 24 kHz / mono / 16-bit assumption is not established
        anywhere in this module — confirm against the CAMB API's raw-PCM spec.
        """
        import struct

        sample_rate = 24000
        num_channels = 1
        bits_per_sample = 16
        byte_rate = sample_rate * num_channels * bits_per_sample // 8
        block_align = num_channels * bits_per_sample // 8
        data_size = len(pcm_data)

        # Standard 44-byte RIFF/WAVE header (PCM format tag = 1, fmt size = 16).
        header = struct.pack(
            "<4sI4s4sIHHIIHH4sI",
            b"RIFF",
            36 + data_size,
            b"WAVE",
            b"fmt ",
            16,
            1,
            num_channels,
            sample_rate,
            byte_rate,
            block_align,
            bits_per_sample,
            b"data",
            data_size,
        )

        return header + pcm_data
|