chatterer 0.1.26__py3-none-any.whl → 0.1.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatterer/__init__.py +87 -87
- chatterer/common_types/__init__.py +21 -21
- chatterer/common_types/io.py +19 -19
- chatterer/constants.py +5 -0
- chatterer/examples/__main__.py +75 -75
- chatterer/examples/any2md.py +83 -85
- chatterer/examples/pdf2md.py +231 -338
- chatterer/examples/pdf2txt.py +52 -54
- chatterer/examples/ppt.py +487 -486
- chatterer/examples/pw.py +141 -143
- chatterer/examples/snippet.py +54 -56
- chatterer/examples/transcribe.py +192 -192
- chatterer/examples/upstage.py +87 -89
- chatterer/examples/web2md.py +80 -80
- chatterer/interactive.py +422 -354
- chatterer/language_model.py +530 -536
- chatterer/messages.py +21 -21
- chatterer/tools/__init__.py +46 -46
- chatterer/tools/caption_markdown_images.py +388 -384
- chatterer/tools/citation_chunking/__init__.py +3 -3
- chatterer/tools/citation_chunking/chunks.py +51 -53
- chatterer/tools/citation_chunking/citation_chunker.py +117 -118
- chatterer/tools/citation_chunking/citations.py +284 -285
- chatterer/tools/citation_chunking/prompt.py +157 -157
- chatterer/tools/citation_chunking/reference.py +26 -26
- chatterer/tools/citation_chunking/utils.py +138 -138
- chatterer/tools/convert_pdf_to_markdown.py +636 -645
- chatterer/tools/convert_to_text.py +446 -446
- chatterer/tools/upstage_document_parser.py +704 -705
- chatterer/tools/webpage_to_markdown.py +739 -739
- chatterer/tools/youtube.py +146 -147
- chatterer/utils/__init__.py +15 -15
- chatterer/utils/base64_image.py +349 -350
- chatterer/utils/bytesio.py +59 -59
- chatterer/utils/code_agent.py +237 -237
- chatterer/utils/imghdr.py +145 -145
- {chatterer-0.1.26.dist-info → chatterer-0.1.27.dist-info}/METADATA +377 -390
- chatterer-0.1.27.dist-info/RECORD +43 -0
- chatterer-0.1.26.dist-info/RECORD +0 -42
- {chatterer-0.1.26.dist-info → chatterer-0.1.27.dist-info}/WHEEL +0 -0
- {chatterer-0.1.26.dist-info → chatterer-0.1.27.dist-info}/entry_points.txt +0 -0
- {chatterer-0.1.26.dist-info → chatterer-0.1.27.dist-info}/top_level.txt +0 -0
chatterer/examples/transcribe.py
CHANGED
@@ -1,192 +1,192 @@
The file is rewritten in place; its content is line-for-line identical before and after, so it is shown once:

```python
# pyright: reportUnknownVariableType=false, reportUnknownMemberType=false, reportArgumentType=false, reportMissingTypeStubs=false

from io import BytesIO
from pathlib import Path
from typing import List, Optional

from openai import OpenAI
from pydub import AudioSegment
from spargear import RunnableArguments


# -------------------------------------------------------------------
# Helper functions for timestamp parsing & segment selection
# -------------------------------------------------------------------
def parse_timestamp(ts: str) -> float:
    """
    Parse a timestamp string into seconds.
    Supports:
      - "SS" or "SS.sss"
      - "MM:SS" or "MM:SS.sss"
      - "HH:MM:SS" or "HH:MM:SS.sss"
    """
    parts = ts.split(":")
    seconds = 0.0
    for idx, part in enumerate(reversed(parts)):
        if not part:
            value = 0.0
        else:
            value = float(part)
        if idx == 0:
            seconds += value
        elif idx == 1:
            seconds += value * 60
        elif idx == 2:
            seconds += value * 3600
        else:
            raise ValueError(f"Timestamp '{ts}' is too long (use H:MM:SS at most)")
    return seconds


def get_selected_audio(audio: AudioSegment, segments_str: str) -> AudioSegment:
    """
    Given full audio and a segments string (e.g. "650-750,16:50-17:30,800-"),
    extract those subranges and concatenate them.
    """
    duration_ms = len(audio)
    duration_s = duration_ms / 1000.0
    subsegments: List[AudioSegment] = []

    for part in segments_str.split(","):
        if "-" not in part:
            raise ValueError(f"Invalid segment '{part}' (must contain '-')")
        start_str, end_str = part.split("-", 1)
        start_s = parse_timestamp(start_str) if start_str.strip() else 0.0
        end_s = parse_timestamp(end_str) if end_str.strip() else duration_s

        # clamp
        start_s = max(0.0, min(start_s, duration_s))
        end_s = max(0.0, min(end_s, duration_s))
        if end_s <= start_s:
            print(f"[!] Warning: segment '{part}' yields non-positive duration; skipping.")
            continue

        start_ms = int(start_s * 1000)
        end_ms = int(end_s * 1000)
        sub = audio[start_ms:end_ms]
        subsegments.append(sub)
        print(f"[i] Selected segment {start_s:.2f}s–{end_s:.2f}s ({end_s - start_s:.2f}s)")

    if not subsegments:
        raise RuntimeError("No valid segments were specified.")
    # concatenate
    combined = subsegments[0]
    for seg in subsegments[1:]:
        combined += seg
    return combined


# -------------------------------------------------------------------
# Main transcription logic
# -------------------------------------------------------------------
class Arguments(RunnableArguments[None]):
    AUDIO_PATH: Path
    """The audio file to transcribe."""
    output: Optional[Path] = None
    """Path to save the transcription output."""
    model: str = "gpt-4o-transcribe"
    """The model to use for transcription."""
    api_key: Optional[str] = None
    """The API key for authentication."""
    base_url: str = "https://api.openai.com/v1"
    """The base URL for the API."""
    prompt: str = "Transcribe whole text from audio."
    """The prompt to use for transcription."""
    segments: Optional[str] = None
    """
    Comma-separated list of time ranges to include (e.g. "650-750,16:50-17:30,800-").
    Each range is start-end; start or end may be omitted.
    Supports seconds or H:MM:SS formats.
    """
    max_chunk_duration: int = 600
    """Maximum duration of each chunk in seconds."""

    def run(self) -> None:
        client = OpenAI(api_key=self.api_key, base_url=self.base_url)

        # 1) Load entire audio
        original_audio = load_audio_segment(self.AUDIO_PATH)

        # 2) If segments specified, extract & combine
        if self.segments:
            audio = get_selected_audio(original_audio, self.segments)
            print(f"[i] Combined audio duration: {len(audio) / 1000:.1f}s (from segments)")
        else:
            audio = original_audio
            print(f"[i] Audio duration: {len(audio) / 1000:.1f}s (full audio)")

        # 3) Split into chunks
        segments = split_audio(audio, self.max_chunk_duration)
        print(f"[i] Splitting into {len(segments)} segment(s) for transcription")

        # 4) Transcribe each chunk
        transcripts: List[str] = []
        for idx, seg in enumerate(segments, start=1):
            print(f"[i] Transcribing segment {idx}/{len(segments)}...")
            transcripts.append(transcribe_segment(seg, client, self.model, self.prompt))

        # 5) Write out
        full = "\n\n".join(transcripts)
        out_path = self.output or self.AUDIO_PATH.with_suffix(".txt")
        out_path.write_text(full, encoding="utf-8")
        print(f"[✓] Transcription saved to: {out_path}")


def load_audio_segment(file_path: Path) -> AudioSegment:
    """
    Load an audio file as an AudioSegment. Convert to mp3 format in-memory if needed.
    """
    ext = file_path.suffix.lower()[1:]
    audio = AudioSegment.from_file(file_path.as_posix(), format=None if ext == "mp3" else ext)
    if ext != "mp3":
        buffer = BytesIO()
        audio.export(buffer, format="mp3")
        buffer.seek(0)
        audio = AudioSegment.from_file(buffer, format="mp3")
    return audio


def split_audio(audio: AudioSegment, max_duration_s: int) -> List[AudioSegment]:
    """
    Split the AudioSegment into chunks no longer than max_duration_s seconds.
    """
    chunk_ms = (max_duration_s - 1) * 1000
    duration_ms = len(audio)
    segments: List[AudioSegment] = []
    for start in range(0, duration_ms, chunk_ms):
        end = min(start + chunk_ms, duration_ms)
        segments.append(audio[start:end])
    return segments


def transcribe_segment(segment: AudioSegment, client: OpenAI, model: str, prompt: str) -> str:
    """
    Transcribe a single AudioSegment chunk and return its text.
    """
    buffer = BytesIO()
    segment.export(buffer, format="mp3")
    buffer.seek(0)
    mp3_bytes = buffer.read()

    response = client.audio.transcriptions.create(
        model=model,
        prompt=prompt,
        file=("audio.mp3", mp3_bytes),
        response_format="text",
        stream=True,
    )
    for res in response:
        if res.type == "transcript.text.delta":
            print(res.delta, end="", flush=True)
        elif res.type == "transcript.text.done":
            print()
            return res.text
    raise RuntimeError("No transcription result found.")


def main() -> None:
    Arguments().run()


if __name__ == "__main__":
    main()
```
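The `--segments` option composes ranges out of the timestamps that `parse_timestamp` accepts. A small sketch of the conversions, assuming the wheel's `chatterer/examples/transcribe.py` is importable under the matching module path:

```python
# Sketch: timestamp formats accepted by parse_timestamp above.
# Assumes chatterer/examples/transcribe.py imports as chatterer.examples.transcribe.
from chatterer.examples.transcribe import parse_timestamp

print(parse_timestamp("650"))        # 650.0  (bare seconds)
print(parse_timestamp("16:50"))      # 1010.0 (MM:SS)
print(parse_timestamp("1:02:03.5"))  # 3723.5 (H:MM:SS.sss)

# In a segments string, an omitted start or end falls back to 0.0 or the
# audio's full duration, so "650-750,16:50-17:30,800-" selects
# 650s-750s, 1010s-1050s, and 800s-to-end before concatenation.
```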
chatterer/examples/upstage.py
CHANGED
@@ -1,89 +1,87 @@
The previous 89-line version survives only as fragments in this view and is omitted. The rewritten file in 0.1.27:

```python
from pathlib import Path
from typing import Optional

from langchain_core.documents.base import Blob
from loguru import logger
from spargear import ArgumentSpec, BaseArguments

from chatterer import Chatterer, UpstageDocumentParseParser
from chatterer.tools.upstage_document_parser import (
    DEFAULT_IMAGE_DIR,
    DOCUMENT_PARSE_BASE_URL,
    DOCUMENT_PARSE_DEFAULT_MODEL,
    OCR,
    Category,
    OutputFormat,
    SplitType,
)


class Arguments(BaseArguments):
    INPUT_PATH: Path
    """Input file to parse. Can be a PDF, image, or other supported formats."""
    output: Optional[Path] = None
    """Output file path for the parsed content. Defaults to input file with .md suffix if not provided."""
    api_key: Optional[str] = None
    """API key for the Upstage API."""
    base_url: str = DOCUMENT_PARSE_BASE_URL
    """Base URL for the Upstage API."""
    model: str = DOCUMENT_PARSE_DEFAULT_MODEL
    """Model to use for parsing."""
    split: SplitType = "none"
    """Split type for the parsed content."""
    ocr: OCR = "auto"
    """OCR type for parsing."""
    output_format: OutputFormat = "markdown"
    """Output format for the parsed content."""
    coordinates: bool = False
    """Whether to include coordinates in the output."""
    base64_encoding: list[Category] = ["figure"]
    """Base64 encoding for specific categories in the parsed content."""
    image_description_instruction: str = "Describe the image in detail."
    """Instruction for generating image descriptions."""
    image_dir: str = DEFAULT_IMAGE_DIR
    """Directory to save images extracted from the document."""
    chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
        ["--chatterer"],
        default=None,
        help="Chatterer instance for communication.",
        type=Chatterer.from_provider,
    )

    def run(self) -> None:
        input = self.INPUT_PATH.resolve()
        out = self.output or input.with_suffix(".md")

        parser = UpstageDocumentParseParser(
            api_key=self.api_key,
            base_url=self.base_url,
            model=self.model,
            split=self.split,
            ocr=self.ocr,
            output_format=self.output_format,
            coordinates=self.coordinates,
            base64_encoding=self.base64_encoding,
            image_description_instruction=self.image_description_instruction,
            image_dir=self.image_dir,
            chatterer=self.chatterer.value,
        )
        docs = parser.parse(Blob.from_path(input))  # pyright: ignore[reportUnknownMemberType]

        if self.image_dir:
            for path, image in parser.image_data.items():
                (path := Path(path)).parent.mkdir(parents=True, exist_ok=True)
                path.write_bytes(image)
                logger.info(f"Saved image to `{path}`")

        markdown: str = "\n\n".join(f"<!--- page {i} -->\n{doc.page_content}" for i, doc in enumerate(docs, 1))
        out.write_text(markdown, encoding="utf-8")
        logger.info(f"Parsed `{input}` to `{out}`")


def main() -> None:
    Arguments().run()


if __name__ == "__main__":
    main()
```
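For reference, the same parsing flow can be driven without the CLI wrapper. A minimal sketch that reuses only the calls appearing in `run()` above; the file name and API key are placeholders, and every other `UpstageDocumentParseParser` parameter is left at its default:

```python
# Minimal sketch mirroring run() above, outside the spargear CLI wrapper.
# "document.pdf" and the API key are placeholders.
from pathlib import Path

from langchain_core.documents.base import Blob

from chatterer import UpstageDocumentParseParser

parser = UpstageDocumentParseParser(api_key="YOUR_UPSTAGE_API_KEY", output_format="markdown")
docs = parser.parse(Blob.from_path(Path("document.pdf")))

# Join pages with the same page-comment convention used by upstage.py above.
markdown = "\n\n".join(f"<!--- page {i} -->\n{doc.page_content}" for i, doc in enumerate(docs, 1))
Path("document.md").write_text(markdown, encoding="utf-8")
```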