chatterer 0.1.18__py3-none-any.whl → 0.1.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatterer/__init__.py +93 -93
- chatterer/common_types/__init__.py +21 -21
- chatterer/common_types/io.py +19 -19
- chatterer/examples/__init__.py +0 -0
- chatterer/examples/anything_to_markdown.py +95 -91
- chatterer/examples/get_code_snippets.py +64 -62
- chatterer/examples/login_with_playwright.py +171 -167
- chatterer/examples/make_ppt.py +499 -497
- chatterer/examples/pdf_to_markdown.py +107 -107
- chatterer/examples/pdf_to_text.py +60 -56
- chatterer/examples/transcription_api.py +127 -123
- chatterer/examples/upstage_parser.py +95 -100
- chatterer/examples/webpage_to_markdown.py +79 -79
- chatterer/interactive.py +354 -354
- chatterer/language_model.py +533 -533
- chatterer/messages.py +21 -21
- chatterer/strategies/__init__.py +13 -13
- chatterer/strategies/atom_of_thoughts.py +975 -975
- chatterer/strategies/base.py +14 -14
- chatterer/tools/__init__.py +46 -46
- chatterer/tools/caption_markdown_images.py +384 -384
- chatterer/tools/citation_chunking/__init__.py +3 -3
- chatterer/tools/citation_chunking/chunks.py +53 -53
- chatterer/tools/citation_chunking/citation_chunker.py +118 -118
- chatterer/tools/citation_chunking/citations.py +285 -285
- chatterer/tools/citation_chunking/prompt.py +157 -157
- chatterer/tools/citation_chunking/reference.py +26 -26
- chatterer/tools/citation_chunking/utils.py +138 -138
- chatterer/tools/convert_pdf_to_markdown.py +302 -302
- chatterer/tools/convert_to_text.py +447 -447
- chatterer/tools/upstage_document_parser.py +705 -705
- chatterer/tools/webpage_to_markdown.py +739 -739
- chatterer/tools/youtube.py +146 -146
- chatterer/utils/__init__.py +15 -15
- chatterer/utils/base64_image.py +285 -285
- chatterer/utils/bytesio.py +59 -59
- chatterer/utils/code_agent.py +237 -237
- chatterer/utils/imghdr.py +148 -148
- {chatterer-0.1.18.dist-info → chatterer-0.1.19.dist-info}/METADATA +392 -392
- chatterer-0.1.19.dist-info/RECORD +44 -0
- {chatterer-0.1.18.dist-info → chatterer-0.1.19.dist-info}/WHEEL +1 -1
- chatterer-0.1.19.dist-info/entry_points.txt +10 -0
- chatterer-0.1.18.dist-info/RECORD +0 -42
- {chatterer-0.1.18.dist-info → chatterer-0.1.19.dist-info}/top_level.txt +0 -0
@@ -1,107 +1,107 @@
|
|
1
|
-
def resolve_import_path_and_get_logger():
|
2
|
-
# ruff: noqa: E402
|
3
|
-
import logging
|
4
|
-
import sys
|
5
|
-
|
6
|
-
if __name__ == "__main__" and "." not in sys.path:
|
7
|
-
sys.path.append(".")
|
8
|
-
|
9
|
-
logger = logging.getLogger(__name__)
|
10
|
-
return logger
|
11
|
-
|
12
|
-
|
13
|
-
logger = resolve_import_path_and_get_logger()
|
14
|
-
import sys
|
15
|
-
from pathlib import Path
|
16
|
-
from typing import Optional
|
17
|
-
|
18
|
-
from spargear import ArgumentSpec, BaseArguments
|
19
|
-
|
20
|
-
from chatterer import Chatterer, PdfToMarkdown
|
21
|
-
|
22
|
-
|
23
|
-
class PdfToMarkdownArgs(BaseArguments):
|
24
|
-
in_path: ArgumentSpec[str] = ArgumentSpec(
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
)
|
36
|
-
|
37
|
-
"""
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
out_base =
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
start
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
if __name__ == "__main__":
|
107
|
-
|
1
|
+
def resolve_import_path_and_get_logger():
    """Return this module's logger, adding "." to ``sys.path`` for direct script runs.

    The path entry is appended only when the module runs as ``__main__`` and
    ``"."`` is not already present in ``sys.path``.
    """
    # ruff: noqa: E402
    import logging
    import sys

    running_as_script = __name__ == "__main__"
    if running_as_script and "." not in sys.path:
        sys.path.append(".")

    return logging.getLogger(__name__)
|
11
|
+
|
12
|
+
|
13
|
+
logger = resolve_import_path_and_get_logger()
|
14
|
+
import sys
|
15
|
+
from pathlib import Path
|
16
|
+
from typing import Optional
|
17
|
+
|
18
|
+
from spargear import ArgumentSpec, BaseArguments
|
19
|
+
|
20
|
+
from chatterer import Chatterer, PdfToMarkdown
|
21
|
+
|
22
|
+
|
23
|
+
class PdfToMarkdownArgs(BaseArguments):
    # Arguments and driver for converting one PDF, or every PDF in a directory, to markdown.
    in_path: ArgumentSpec[str] = ArgumentSpec(["in-path"], help="Path to the input PDF file or a directory containing PDF files.")
    out_path: Optional[str] = None
    """Output path. For a file, path to the output markdown file. For a directory, output directory for .md files."""
    chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
        ["--chatterer"],
        default=None,
        help="Chatterer instance for communication.",
        type=Chatterer.from_provider,
        required=True,
    )
    pages: Optional[str] = None
    """Page indices to convert (e.g., '1,3,5-9')."""
    recursive: bool = False
    """If input is a directory, search for PDFs recursively."""

    def run(self) -> list[dict[str, str]]:
        """Convert the selected PDF(s) to markdown files.

        Returns:
            One dict per converted PDF with the keys ``"input"``, ``"output"``
            and ``"result"`` (the markdown text).

        Exits the process with status 1 when the input is a non-PDF file or is
        neither a file nor a directory, and with status 0 when a directory
        contains no PDF files.
        """
        in_path = Path(self.in_path.unwrap()).resolve()
        page_indices = parse_page_indices(self.pages) if self.pages else None
        pdf_files: list[Path] = []
        is_dir = False
        if in_path.is_file():
            if in_path.suffix.lower() != ".pdf":
                # Explain the failure instead of exiting silently.
                logger.error(f"Input file is not a PDF: `{in_path}`")
                sys.exit(1)
            pdf_files.append(in_path)
        elif in_path.is_dir():
            is_dir = True
            pattern = "*.pdf"
            candidates = in_path.rglob(pattern) if self.recursive else in_path.glob(pattern)
            pdf_files = sorted(f for f in candidates if f.is_file())
            if not pdf_files:
                logger.warning(f"No PDF files found in `{in_path}`.")
                sys.exit(0)
        else:
            logger.error(f"Input path is neither a file nor a directory: `{in_path}`")
            sys.exit(1)

        if self.out_path:
            out_base = Path(self.out_path).resolve()
        elif is_dir:
            out_base = in_path
        else:
            out_base = in_path.with_suffix(".md")

        if is_dir:
            out_base.mkdir(parents=True, exist_ok=True)
        else:
            out_base.parent.mkdir(parents=True, exist_ok=True)

        converter = PdfToMarkdown(chatterer=self.chatterer.unwrap())
        results: list[dict[str, str]] = []
        for pdf in pdf_files:
            # NOTE(review): directory outputs are flattened to `<stem>.md`, so with
            # --recursive two PDFs sharing a stem in different subfolders overwrite
            # each other. Layout kept unchanged here for compatibility.
            out_path = (out_base / (pdf.stem + ".md")) if is_dir else out_base
            md = converter.convert(str(pdf), page_indices)
            out_path.parent.mkdir(parents=True, exist_ok=True)
            out_path.write_text(md, encoding="utf-8")
            results.append({"input": pdf.as_posix(), "output": out_path.as_posix(), "result": md})
        logger.info(f"Converted {len(pdf_files)} PDF(s) to markdown and saved to `{out_base}`.")
        return results
|
78
|
+
|
79
|
+
|
80
|
+
def parse_page_indices(pages_str: str) -> list[int] | None:
|
81
|
+
if not pages_str:
|
82
|
+
return None
|
83
|
+
indices: set[int] = set()
|
84
|
+
for part in pages_str.split(","):
|
85
|
+
part = part.strip()
|
86
|
+
if not part:
|
87
|
+
continue
|
88
|
+
if "-" in part:
|
89
|
+
start_str, end_str = part.split("-", 1)
|
90
|
+
start = int(start_str.strip())
|
91
|
+
end = int(end_str.strip())
|
92
|
+
if start > end:
|
93
|
+
raise ValueError
|
94
|
+
indices.update(range(start, end + 1))
|
95
|
+
else:
|
96
|
+
indices.add(int(part))
|
97
|
+
if not indices:
|
98
|
+
raise ValueError
|
99
|
+
return sorted(indices)
|
100
|
+
|
101
|
+
|
102
|
+
def main() -> None:
    """Instantiate ``PdfToMarkdownArgs`` and run the conversion."""
    arguments = PdfToMarkdownArgs()
    arguments.run()


if __name__ == "__main__":
    main()
|
@@ -1,56 +1,60 @@
|
|
1
|
-
def resolve_import_path_and_get_logger():
|
2
|
-
# ruff: noqa: E402
|
3
|
-
import logging
|
4
|
-
import sys
|
5
|
-
|
6
|
-
if __name__ == "__main__" and "." not in sys.path:
|
7
|
-
sys.path.append(".")
|
8
|
-
|
9
|
-
logger = logging.getLogger(__name__)
|
10
|
-
return logger
|
11
|
-
|
12
|
-
|
13
|
-
logger = resolve_import_path_and_get_logger()
|
14
|
-
import sys
|
15
|
-
from pathlib import Path
|
16
|
-
|
17
|
-
from spargear import ArgumentSpec, BaseArguments
|
18
|
-
|
19
|
-
from chatterer.tools.convert_to_text import pdf_to_text
|
20
|
-
|
21
|
-
|
22
|
-
class PdfToTextArgs(BaseArguments):
|
23
|
-
in_path: ArgumentSpec[Path] = ArgumentSpec(["in-path"], help="Path to the PDF file.")
|
24
|
-
out_path: ArgumentSpec[Path] = ArgumentSpec(["--out-path"], default=None, help="Output file path.")
|
25
|
-
pages: ArgumentSpec[str] = ArgumentSpec(["--pages"], default=None, help="Page indices to extract, e.g. '1,3,5-9'.")
|
26
|
-
|
27
|
-
def run(self) -> None:
|
28
|
-
input = self.in_path.unwrap().resolve()
|
29
|
-
out = self.out_path.value or input.with_suffix(".txt")
|
30
|
-
if not input.is_file():
|
31
|
-
sys.exit(1)
|
32
|
-
out.write_text(
|
33
|
-
pdf_to_text(input, parse_page_indices(pages_arg) if (pages_arg := self.pages.value) else None),
|
34
|
-
encoding="utf-8",
|
35
|
-
)
|
36
|
-
logger.info(f"Extracted text from `{input}` to `{out}`")
|
37
|
-
|
38
|
-
|
39
|
-
def parse_page_indices(pages_str: str) -> list[int]:
|
40
|
-
indices: set[int] = set()
|
41
|
-
for part in pages_str.split(","):
|
42
|
-
part = part.strip()
|
43
|
-
if "-" in part:
|
44
|
-
start_str, end_str = part.split("-", 1)
|
45
|
-
start = int(start_str)
|
46
|
-
end = int(end_str)
|
47
|
-
if start > end:
|
48
|
-
raise ValueError
|
49
|
-
indices.update(range(start, end + 1))
|
50
|
-
else:
|
51
|
-
indices.add(int(part))
|
52
|
-
return sorted(indices)
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
PdfToTextArgs().run()
|
1
|
+
def resolve_import_path_and_get_logger():
    """Return this module's logger, adding "." to ``sys.path`` for direct script runs.

    The path entry is appended only when the module runs as ``__main__`` and
    ``"."`` is not already present in ``sys.path``.
    """
    # ruff: noqa: E402
    import logging
    import sys

    running_as_script = __name__ == "__main__"
    if running_as_script and "." not in sys.path:
        sys.path.append(".")

    return logging.getLogger(__name__)
|
11
|
+
|
12
|
+
|
13
|
+
logger = resolve_import_path_and_get_logger()
|
14
|
+
import sys
|
15
|
+
from pathlib import Path
|
16
|
+
|
17
|
+
from spargear import ArgumentSpec, BaseArguments
|
18
|
+
|
19
|
+
from chatterer.tools.convert_to_text import pdf_to_text
|
20
|
+
|
21
|
+
|
22
|
+
class PdfToTextArgs(BaseArguments):
    # Arguments and driver for extracting the text of a single PDF file.
    in_path: ArgumentSpec[Path] = ArgumentSpec(["in-path"], help="Path to the PDF file.")
    out_path: ArgumentSpec[Path] = ArgumentSpec(["--out-path"], default=None, help="Output file path.")
    pages: ArgumentSpec[str] = ArgumentSpec(["--pages"], default=None, help="Page indices to extract, e.g. '1,3,5-9'.")

    def run(self) -> None:
        """Extract text from the input PDF and write it to the output file.

        The output defaults to the input path with a ``.txt`` suffix. Exits the
        process with status 1 when the input path is not an existing file.
        """
        # Named `pdf_path` rather than `input` to avoid shadowing the builtin.
        pdf_path = self.in_path.unwrap().resolve()
        if not pdf_path.is_file():
            # Explain the failure instead of exiting silently.
            logger.error(f"Input path is not an existing file: `{pdf_path}`")
            sys.exit(1)

        out = self.out_path.value or pdf_path.with_suffix(".txt")
        pages_arg = self.pages.value
        page_indices = parse_page_indices(pages_arg) if pages_arg else None
        out.write_text(pdf_to_text(pdf_path, page_indices), encoding="utf-8")
        logger.info(f"Extracted text from `{pdf_path}` to `{out}`")
|
37
|
+
|
38
|
+
|
39
|
+
def parse_page_indices(pages_str: str) -> list[int]:
    """Parse a page selection such as ``"1,3,5-9"`` into sorted, unique integers.

    Args:
        pages_str: Comma-separated page numbers and inclusive ``start-end``
            ranges. Whitespace around items is ignored; empty items (for
            example a trailing comma) are skipped.

    Returns:
        The sorted unique indices.

    Raises:
        ValueError: If an item is not an integer or a valid range, if a range
            has ``start > end``, or if no indices are found at all.
    """
    indices: set[int] = set()
    for part in pages_str.split(","):
        part = part.strip()
        if not part:
            continue
        # Split on the first "-" only; a single number has no separator.
        start_str, sep, end_str = part.partition("-")
        try:
            start = int(start_str)
            end = int(end_str) if sep else start
        except ValueError:
            raise ValueError(f"Invalid page specification: {part!r}") from None
        if start > end:
            raise ValueError(f"Invalid page range {part!r}: start is greater than end")
        indices.update(range(start, end + 1))
    if not indices:
        # An empty selection was already an error (int('') failed); keep it one.
        raise ValueError(f"No page indices found in {pages_str!r}")
    return sorted(indices)
|
53
|
+
|
54
|
+
|
55
|
+
def main() -> None:
    """Instantiate ``PdfToTextArgs`` and run the extraction."""
    arguments = PdfToTextArgs()
    arguments.run()


if __name__ == "__main__":
    main()
|
@@ -1,123 +1,127 @@
|
|
1
|
-
# pyright: reportUnknownVariableType=false, reportUnknownMemberType=false, reportArgumentType=false, reportMissingTypeStubs=false
|
2
|
-
|
3
|
-
from io import BytesIO
|
4
|
-
from pathlib import Path
|
5
|
-
from typing import cast
|
6
|
-
|
7
|
-
from openai import OpenAI
|
8
|
-
from pydub import AudioSegment
|
9
|
-
from spargear import ArgumentSpec, BaseArguments
|
10
|
-
|
11
|
-
# Maximum chunk length in seconds
|
12
|
-
MAX_CHUNK_DURATION = 600
|
13
|
-
|
14
|
-
|
15
|
-
class TranscriptionApiArguments(BaseArguments):
|
16
|
-
in_path = ArgumentSpec(
|
17
|
-
["in-path"],
|
18
|
-
type=Path,
|
19
|
-
help="The audio file to transcribe.",
|
20
|
-
)
|
21
|
-
out_path = ArgumentSpec(
|
22
|
-
["--out-path"],
|
23
|
-
type=Path,
|
24
|
-
default=None,
|
25
|
-
help="Path to save the transcription output.",
|
26
|
-
)
|
27
|
-
model: ArgumentSpec[str] = ArgumentSpec(
|
28
|
-
["--model"],
|
29
|
-
default="gpt-4o-transcribe",
|
30
|
-
help="The model to use for transcription.",
|
31
|
-
)
|
32
|
-
api_key: ArgumentSpec[str] = ArgumentSpec(
|
33
|
-
["--api-key"],
|
34
|
-
default=None,
|
35
|
-
help="The API key for authentication.",
|
36
|
-
)
|
37
|
-
base_url: ArgumentSpec[str] = ArgumentSpec(
|
38
|
-
["--base-url"],
|
39
|
-
default="https://api.openai.com/v1",
|
40
|
-
help="The base URL for the API.",
|
41
|
-
)
|
42
|
-
|
43
|
-
def run(self) -> None:
|
44
|
-
audio_path = self.in_path.unwrap()
|
45
|
-
model = self.model.unwrap()
|
46
|
-
|
47
|
-
client = OpenAI(api_key=self.api_key.value, base_url=self.base_url.value)
|
48
|
-
|
49
|
-
audio = load_audio_segment(audio_path)
|
50
|
-
|
51
|
-
segments = split_audio(audio, MAX_CHUNK_DURATION)
|
52
|
-
print(f"[i] Audio duration: {len(audio) / 1000:.1f}s; splitting into {len(segments)} segment(s)")
|
53
|
-
|
54
|
-
transcripts: list[str] = []
|
55
|
-
for idx, seg in enumerate(segments, start=1):
|
56
|
-
print(f"[i] Transcribing segment {idx}/{len(segments)}...")
|
57
|
-
transcripts.append(transcribe_segment(seg, client, model))
|
58
|
-
|
59
|
-
full_transcript = "\n\n".join(transcripts)
|
60
|
-
output_path: Path = self.out_path.value or audio_path.with_suffix(".txt")
|
61
|
-
output_path.write_text(full_transcript, encoding="utf-8")
|
62
|
-
print(f"[✓] Transcription saved to: {output_path}")
|
63
|
-
|
64
|
-
|
65
|
-
def load_audio_segment(file_path: Path) -> AudioSegment:
|
66
|
-
"""
|
67
|
-
Load an audio file as an AudioSegment. Convert to mp3 format in-memory if needed.
|
68
|
-
"""
|
69
|
-
ext = file_path.suffix.lower()[1:]
|
70
|
-
audio = AudioSegment.from_file(file_path.as_posix(), format=ext if ext != "mp3" else None)
|
71
|
-
if ext != "mp3":
|
72
|
-
buffer = BytesIO()
|
73
|
-
audio.export(buffer, format="mp3")
|
74
|
-
buffer.seek(0)
|
75
|
-
audio = AudioSegment.from_file(buffer, format="mp3")
|
76
|
-
return audio
|
77
|
-
|
78
|
-
|
79
|
-
def split_audio(audio: AudioSegment, max_duration_s: int) -> list[AudioSegment]:
|
80
|
-
"""
|
81
|
-
Split the AudioSegment into chunks no longer than max_duration_s seconds.
|
82
|
-
"""
|
83
|
-
chunk_length_ms = (max_duration_s - 1) * 1000
|
84
|
-
duration_ms = len(audio)
|
85
|
-
segments: list[AudioSegment] = []
|
86
|
-
segment_idx: int = 0
|
87
|
-
for start_ms in range(0, duration_ms, chunk_length_ms):
|
88
|
-
end_ms = min(start_ms + chunk_length_ms, duration_ms)
|
89
|
-
segment = cast(AudioSegment, audio[start_ms:end_ms])
|
90
|
-
segments.append(segment)
|
91
|
-
# with open(f"segment_{segment_idx}.mp3", "wb") as f:
|
92
|
-
# segment.export(f, format="mp3")
|
93
|
-
segment_idx += 1
|
94
|
-
return segments
|
95
|
-
|
96
|
-
|
97
|
-
def transcribe_segment(segment: AudioSegment, client: OpenAI, model: str) -> str:
|
98
|
-
"""
|
99
|
-
Transcribe a single AudioSegment chunk and return its text.
|
100
|
-
"""
|
101
|
-
buffer = BytesIO()
|
102
|
-
segment.export(buffer, format="mp3")
|
103
|
-
buffer.seek(0)
|
104
|
-
mp3_bytes = buffer.read()
|
105
|
-
response = client.audio.transcriptions.create(
|
106
|
-
model=model,
|
107
|
-
prompt="Transcribe whole text from audio.",
|
108
|
-
file=("audio.mp3", mp3_bytes),
|
109
|
-
response_format="text",
|
110
|
-
stream=True,
|
111
|
-
)
|
112
|
-
for res in response:
|
113
|
-
if res.type == "transcript.text.delta":
|
114
|
-
print(res.delta, end="", flush=True)
|
115
|
-
if res.type == "transcript.text.done":
|
116
|
-
print()
|
117
|
-
return res.text
|
118
|
-
else:
|
119
|
-
raise RuntimeError("No transcription result found.")
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
TranscriptionApiArguments().run()
|
1
|
+
# pyright: reportUnknownVariableType=false, reportUnknownMemberType=false, reportArgumentType=false, reportMissingTypeStubs=false
|
2
|
+
|
3
|
+
from io import BytesIO
|
4
|
+
from pathlib import Path
|
5
|
+
from typing import cast
|
6
|
+
|
7
|
+
from openai import OpenAI
|
8
|
+
from pydub import AudioSegment
|
9
|
+
from spargear import ArgumentSpec, BaseArguments
|
10
|
+
|
11
|
+
# Maximum chunk length in seconds
|
12
|
+
MAX_CHUNK_DURATION = 600
|
13
|
+
|
14
|
+
|
15
|
+
class TranscriptionApiArguments(BaseArguments):
    # Arguments and driver for transcribing an audio file in fixed-length chunks.
    in_path = ArgumentSpec(
        ["in-path"],
        type=Path,
        help="The audio file to transcribe.",
    )
    out_path = ArgumentSpec(
        ["--out-path"],
        type=Path,
        default=None,
        help="Path to save the transcription output.",
    )
    model: ArgumentSpec[str] = ArgumentSpec(
        ["--model"],
        default="gpt-4o-transcribe",
        help="The model to use for transcription.",
    )
    api_key: ArgumentSpec[str] = ArgumentSpec(
        ["--api-key"],
        default=None,
        help="The API key for authentication.",
    )
    base_url: ArgumentSpec[str] = ArgumentSpec(
        ["--base-url"],
        default="https://api.openai.com/v1",
        help="The base URL for the API.",
    )

    def run(self) -> None:
        """Transcribe the input audio chunk by chunk and save the joined text.

        Chunks are transcribed in order and joined with blank lines. The output
        defaults to the input path with a ``.txt`` suffix.
        """
        audio_path = self.in_path.unwrap()
        model = self.model.unwrap()

        client = OpenAI(api_key=self.api_key.value, base_url=self.base_url.value)

        audio = load_audio_segment(audio_path)
        segments = split_audio(audio, MAX_CHUNK_DURATION)
        total = len(segments)
        print(f"[i] Audio duration: {len(audio) / 1000:.1f}s; splitting into {total} segment(s)")

        transcripts: list[str] = []
        for number, chunk in enumerate(segments, start=1):
            print(f"[i] Transcribing segment {number}/{total}...")
            text = transcribe_segment(chunk, client, model)
            transcripts.append(text)

        output_path: Path = self.out_path.value or audio_path.with_suffix(".txt")
        output_path.write_text("\n\n".join(transcripts), encoding="utf-8")
        print(f"[✓] Transcription saved to: {output_path}")
|
63
|
+
|
64
|
+
|
65
|
+
def load_audio_segment(file_path: Path) -> AudioSegment:
    """
    Load an audio file as an AudioSegment.

    Files whose extension is not ``mp3`` are decoded using that extension as
    the format, exported to mp3 in memory, and reloaded from the mp3 buffer.
    """
    ext = file_path.suffix.lower()[1:]
    source = file_path.as_posix()
    if ext == "mp3":
        return AudioSegment.from_file(source, format=None)

    decoded = AudioSegment.from_file(source, format=ext)
    mp3_buffer = BytesIO()
    decoded.export(mp3_buffer, format="mp3")
    mp3_buffer.seek(0)
    return AudioSegment.from_file(mp3_buffer, format="mp3")
|
77
|
+
|
78
|
+
|
79
|
+
def split_audio(audio: "AudioSegment", max_duration_s: int) -> "list[AudioSegment]":
    """
    Split the AudioSegment into consecutive chunks of ``max_duration_s - 1`` seconds.

    Args:
        audio: Audio to split; its length and slice indices are in milliseconds.
        max_duration_s: Upper bound on chunk duration in seconds. Must be
            greater than 1 because each chunk spans ``max_duration_s - 1`` seconds.

    Returns:
        The chunks in order; the last one may be shorter. Empty audio yields
        an empty list.

    Raises:
        ValueError: If ``max_duration_s`` is not greater than 1.
    """
    chunk_length_ms = (max_duration_s - 1) * 1000
    if chunk_length_ms <= 0:
        # A zero step crashes range(); a negative step silently yields nothing.
        raise ValueError(f"max_duration_s must be greater than 1, got {max_duration_s}")

    duration_ms = len(audio)
    return [
        cast("AudioSegment", audio[start_ms : min(start_ms + chunk_length_ms, duration_ms)])
        for start_ms in range(0, duration_ms, chunk_length_ms)
    ]
|
95
|
+
|
96
|
+
|
97
|
+
def transcribe_segment(segment: AudioSegment, client: OpenAI, model: str) -> str:
    """
    Transcribe a single AudioSegment chunk and return its text.

    The chunk is exported to mp3 in memory and sent as a streaming
    transcription request. Text deltas are printed as they arrive, and the
    text of the ``transcript.text.done`` event is returned.

    Raises:
        RuntimeError: If the stream ends without a ``transcript.text.done`` event.
    """
    buffer = BytesIO()
    segment.export(buffer, format="mp3")
    mp3_bytes = buffer.getvalue()

    events = client.audio.transcriptions.create(
        model=model,
        prompt="Transcribe whole text from audio.",
        file=("audio.mp3", mp3_bytes),
        response_format="text",
        stream=True,
    )
    for event in events:
        if event.type == "transcript.text.delta":
            print(event.delta, end="", flush=True)
        elif event.type == "transcript.text.done":
            print()
            return event.text

    # Reached only when the stream finished without a final "done" event.
    raise RuntimeError("No transcription result found.")
|
120
|
+
|
121
|
+
|
122
|
+
def main() -> None:
    """Instantiate ``TranscriptionApiArguments`` and run the transcription."""
    arguments = TranscriptionApiArguments()
    arguments.run()


if __name__ == "__main__":
    main()
|