chatterer 0.1.26__py3-none-any.whl → 0.1.27__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (42)
  1. chatterer/__init__.py +87 -87
  2. chatterer/common_types/__init__.py +21 -21
  3. chatterer/common_types/io.py +19 -19
  4. chatterer/constants.py +5 -0
  5. chatterer/examples/__main__.py +75 -75
  6. chatterer/examples/any2md.py +83 -85
  7. chatterer/examples/pdf2md.py +231 -338
  8. chatterer/examples/pdf2txt.py +52 -54
  9. chatterer/examples/ppt.py +487 -486
  10. chatterer/examples/pw.py +141 -143
  11. chatterer/examples/snippet.py +54 -56
  12. chatterer/examples/transcribe.py +192 -192
  13. chatterer/examples/upstage.py +87 -89
  14. chatterer/examples/web2md.py +80 -80
  15. chatterer/interactive.py +422 -354
  16. chatterer/language_model.py +530 -536
  17. chatterer/messages.py +21 -21
  18. chatterer/tools/__init__.py +46 -46
  19. chatterer/tools/caption_markdown_images.py +388 -384
  20. chatterer/tools/citation_chunking/__init__.py +3 -3
  21. chatterer/tools/citation_chunking/chunks.py +51 -53
  22. chatterer/tools/citation_chunking/citation_chunker.py +117 -118
  23. chatterer/tools/citation_chunking/citations.py +284 -285
  24. chatterer/tools/citation_chunking/prompt.py +157 -157
  25. chatterer/tools/citation_chunking/reference.py +26 -26
  26. chatterer/tools/citation_chunking/utils.py +138 -138
  27. chatterer/tools/convert_pdf_to_markdown.py +636 -645
  28. chatterer/tools/convert_to_text.py +446 -446
  29. chatterer/tools/upstage_document_parser.py +704 -705
  30. chatterer/tools/webpage_to_markdown.py +739 -739
  31. chatterer/tools/youtube.py +146 -147
  32. chatterer/utils/__init__.py +15 -15
  33. chatterer/utils/base64_image.py +349 -350
  34. chatterer/utils/bytesio.py +59 -59
  35. chatterer/utils/code_agent.py +237 -237
  36. chatterer/utils/imghdr.py +145 -145
  37. {chatterer-0.1.26.dist-info → chatterer-0.1.27.dist-info}/METADATA +377 -390
  38. chatterer-0.1.27.dist-info/RECORD +43 -0
  39. chatterer-0.1.26.dist-info/RECORD +0 -42
  40. {chatterer-0.1.26.dist-info → chatterer-0.1.27.dist-info}/WHEEL +0 -0
  41. {chatterer-0.1.26.dist-info → chatterer-0.1.27.dist-info}/entry_points.txt +0 -0
  42. {chatterer-0.1.26.dist-info → chatterer-0.1.27.dist-info}/top_level.txt +0 -0
chatterer/examples/transcribe.py
@@ -1,192 +1,192 @@
- # pyright: reportUnknownVariableType=false, reportUnknownMemberType=false, reportArgumentType=false, reportMissingTypeStubs=false
-
- from io import BytesIO
- from pathlib import Path
- from typing import List, Optional
-
- from openai import OpenAI
- from pydub import AudioSegment
- from spargear import RunnableArguments
-
-
- # -------------------------------------------------------------------
- # Helper functions for timestamp parsing & segment selection
- # -------------------------------------------------------------------
- def parse_timestamp(ts: str) -> float:
-     """
-     Parse a timestamp string into seconds.
-     Supports:
-     "SS" or "SS.sss"
-     "MM:SS" or "MM:SS.sss"
-     "HH:MM:SS" or "HH:MM:SS.sss"
-     """
-     parts = ts.split(":")
-     seconds = 0.0
-     for idx, part in enumerate(reversed(parts)):
-         if not part:
-             value = 0.0
-         else:
-             value = float(part)
-         if idx == 0:
-             seconds += value
-         elif idx == 1:
-             seconds += value * 60
-         elif idx == 2:
-             seconds += value * 3600
-         else:
-             raise ValueError(f"Timestamp '{ts}' is too long (use H:MM:SS at most)")
-     return seconds
-
-
- def get_selected_audio(audio: AudioSegment, segments_str: str) -> AudioSegment:
-     """
-     Given full audio and a segments string (e.g. "650-750,16:50-17:30,800-"),
-     extract those subranges and concatenate them.
-     """
-     duration_ms = len(audio)
-     duration_s = duration_ms / 1000.0
-     subsegments: List[AudioSegment] = []
-
-     for part in segments_str.split(","):
-         if "-" not in part:
-             raise ValueError(f"Invalid segment '{part}' (must contain '-')")
-         start_str, end_str = part.split("-", 1)
-         start_s = parse_timestamp(start_str) if start_str.strip() else 0.0
-         end_s = parse_timestamp(end_str) if end_str.strip() else duration_s
-
-         # clamp
-         start_s = max(0.0, min(start_s, duration_s))
-         end_s = max(0.0, min(end_s, duration_s))
-         if end_s <= start_s:
-             print(f"[!] Warning: segment '{part}' yields non-positive duration; skipping.")
-             continue
-
-         start_ms = int(start_s * 1000)
-         end_ms = int(end_s * 1000)
-         sub = audio[start_ms:end_ms]
-         subsegments.append(sub)
-         print(f"[i] Selected segment {start_s:.2f}s–{end_s:.2f}s ({end_s - start_s:.2f}s)")
-
-     if not subsegments:
-         raise RuntimeError("No valid segments were specified.")
-     # concatenate
-     combined = subsegments[0]
-     for seg in subsegments[1:]:
-         combined += seg
-     return combined
-
-
- # -------------------------------------------------------------------
- # Main transcription logic
- # -------------------------------------------------------------------
- class Arguments(RunnableArguments[None]):
-     AUDIO_PATH: Path
-     """The audio file to transcribe."""
-     output: Optional[Path] = None
-     """Path to save the transcription output."""
-     model: str = "gpt-4o-transcribe"
-     """The model to use for transcription."""
-     api_key: Optional[str] = None
-     """The API key for authentication."""
-     base_url: str = "https://api.openai.com/v1"
-     """The base URL for the API."""
-     prompt: str = "Transcribe whole text from audio."
-     """The prompt to use for transcription."""
-     segments: Optional[str] = None
-     """
-     Comma-separated list of time ranges to include (e.g. "650-750,16:50-17:30,800-").
-     Each range is start-end; start or end may be omitted.
-     Supports seconds or H:MM:SS formats.
-     """
-     max_chunk_duration: int = 600
-     """Maximum duration of each chunk in seconds."""
-
-     def run(self) -> None:
-         client = OpenAI(api_key=self.api_key, base_url=self.base_url)
-
-         # 1) Load entire audio
-         original_audio = load_audio_segment(self.AUDIO_PATH)
-
-         # 2) If segments specified, extract & combine
-         if self.segments:
-             audio = get_selected_audio(original_audio, self.segments)
-             print(f"[i] Combined audio duration: {len(audio) / 1000:.1f}s (from segments)")
-         else:
-             audio = original_audio
-             print(f"[i] Audio duration: {len(audio) / 1000:.1f}s (full audio)")
-
-         # 3) Split into chunks
-         segments = split_audio(audio, self.max_chunk_duration)
-         print(f"[i] Splitting into {len(segments)} segment(s) for transcription")
-
-         # 4) Transcribe each chunk
-         transcripts: List[str] = []
-         for idx, seg in enumerate(segments, start=1):
-             print(f"[i] Transcribing segment {idx}/{len(segments)}...")
-             transcripts.append(transcribe_segment(seg, client, self.model, self.prompt))
-
-         # 5) Write out
-         full = "\n\n".join(transcripts)
-         out_path = self.output or self.AUDIO_PATH.with_suffix(".txt")
-         out_path.write_text(full, encoding="utf-8")
-         print(f"[✓] Transcription saved to: {out_path}")
-
-
- def load_audio_segment(file_path: Path) -> AudioSegment:
-     """
-     Load an audio file as an AudioSegment. Convert to mp3 format in-memory if needed.
-     """
-     ext = file_path.suffix.lower()[1:]
-     audio = AudioSegment.from_file(file_path.as_posix(), format=None if ext == "mp3" else ext)
-     if ext != "mp3":
-         buffer = BytesIO()
-         audio.export(buffer, format="mp3")
-         buffer.seek(0)
-         audio = AudioSegment.from_file(buffer, format="mp3")
-     return audio
-
-
- def split_audio(audio: AudioSegment, max_duration_s: int) -> List[AudioSegment]:
-     """
-     Split the AudioSegment into chunks no longer than max_duration_s seconds.
-     """
-     chunk_ms = (max_duration_s - 1) * 1000
-     duration_ms = len(audio)
-     segments: List[AudioSegment] = []
-     for start in range(0, duration_ms, chunk_ms):
-         end = min(start + chunk_ms, duration_ms)
-         segments.append(audio[start:end])
-     return segments
-
-
- def transcribe_segment(segment: AudioSegment, client: OpenAI, model: str, prompt: str) -> str:
-     """
-     Transcribe a single AudioSegment chunk and return its text.
-     """
-     buffer = BytesIO()
-     segment.export(buffer, format="mp3")
-     buffer.seek(0)
-     mp3_bytes = buffer.read()
-
-     response = client.audio.transcriptions.create(
-         model=model,
-         prompt=prompt,
-         file=("audio.mp3", mp3_bytes),
-         response_format="text",
-         stream=True,
-     )
-     for res in response:
-         if res.type == "transcript.text.delta":
-             print(res.delta, end="", flush=True)
-         elif res.type == "transcript.text.done":
-             print()
-             return res.text
-     raise RuntimeError("No transcription result found.")
-
-
- def main() -> None:
-     Arguments().run()
-
-
- if __name__ == "__main__":
-     main()
+ # pyright: reportUnknownVariableType=false, reportUnknownMemberType=false, reportArgumentType=false, reportMissingTypeStubs=false
+
+ from io import BytesIO
+ from pathlib import Path
+ from typing import List, Optional
+
+ from openai import OpenAI
+ from pydub import AudioSegment
+ from spargear import RunnableArguments
+
+
+ # -------------------------------------------------------------------
+ # Helper functions for timestamp parsing & segment selection
+ # -------------------------------------------------------------------
+ def parse_timestamp(ts: str) -> float:
+     """
+     Parse a timestamp string into seconds.
+     Supports:
+     "SS" or "SS.sss"
+     "MM:SS" or "MM:SS.sss"
+     "HH:MM:SS" or "HH:MM:SS.sss"
+     """
+     parts = ts.split(":")
+     seconds = 0.0
+     for idx, part in enumerate(reversed(parts)):
+         if not part:
+             value = 0.0
+         else:
+             value = float(part)
+         if idx == 0:
+             seconds += value
+         elif idx == 1:
+             seconds += value * 60
+         elif idx == 2:
+             seconds += value * 3600
+         else:
+             raise ValueError(f"Timestamp '{ts}' is too long (use H:MM:SS at most)")
+     return seconds
+
+
+ def get_selected_audio(audio: AudioSegment, segments_str: str) -> AudioSegment:
+     """
+     Given full audio and a segments string (e.g. "650-750,16:50-17:30,800-"),
+     extract those subranges and concatenate them.
+     """
+     duration_ms = len(audio)
+     duration_s = duration_ms / 1000.0
+     subsegments: List[AudioSegment] = []
+
+     for part in segments_str.split(","):
+         if "-" not in part:
+             raise ValueError(f"Invalid segment '{part}' (must contain '-')")
+         start_str, end_str = part.split("-", 1)
+         start_s = parse_timestamp(start_str) if start_str.strip() else 0.0
+         end_s = parse_timestamp(end_str) if end_str.strip() else duration_s
+
+         # clamp
+         start_s = max(0.0, min(start_s, duration_s))
+         end_s = max(0.0, min(end_s, duration_s))
+         if end_s <= start_s:
+             print(f"[!] Warning: segment '{part}' yields non-positive duration; skipping.")
+             continue
+
+         start_ms = int(start_s * 1000)
+         end_ms = int(end_s * 1000)
+         sub = audio[start_ms:end_ms]
+         subsegments.append(sub)
+         print(f"[i] Selected segment {start_s:.2f}s–{end_s:.2f}s ({end_s - start_s:.2f}s)")
+
+     if not subsegments:
+         raise RuntimeError("No valid segments were specified.")
+     # concatenate
+     combined = subsegments[0]
+     for seg in subsegments[1:]:
+         combined += seg
+     return combined
+
+
+ # -------------------------------------------------------------------
+ # Main transcription logic
+ # -------------------------------------------------------------------
+ class Arguments(RunnableArguments[None]):
+     AUDIO_PATH: Path
+     """The audio file to transcribe."""
+     output: Optional[Path] = None
+     """Path to save the transcription output."""
+     model: str = "gpt-4o-transcribe"
+     """The model to use for transcription."""
+     api_key: Optional[str] = None
+     """The API key for authentication."""
+     base_url: str = "https://api.openai.com/v1"
+     """The base URL for the API."""
+     prompt: str = "Transcribe whole text from audio."
+     """The prompt to use for transcription."""
+     segments: Optional[str] = None
+     """
+     Comma-separated list of time ranges to include (e.g. "650-750,16:50-17:30,800-").
+     Each range is start-end; start or end may be omitted.
+     Supports seconds or H:MM:SS formats.
+     """
+     max_chunk_duration: int = 600
+     """Maximum duration of each chunk in seconds."""
+
+     def run(self) -> None:
+         client = OpenAI(api_key=self.api_key, base_url=self.base_url)
+
+         # 1) Load entire audio
+         original_audio = load_audio_segment(self.AUDIO_PATH)
+
+         # 2) If segments specified, extract & combine
+         if self.segments:
+             audio = get_selected_audio(original_audio, self.segments)
+             print(f"[i] Combined audio duration: {len(audio) / 1000:.1f}s (from segments)")
+         else:
+             audio = original_audio
+             print(f"[i] Audio duration: {len(audio) / 1000:.1f}s (full audio)")
+
+         # 3) Split into chunks
+         segments = split_audio(audio, self.max_chunk_duration)
+         print(f"[i] Splitting into {len(segments)} segment(s) for transcription")
+
+         # 4) Transcribe each chunk
+         transcripts: List[str] = []
+         for idx, seg in enumerate(segments, start=1):
+             print(f"[i] Transcribing segment {idx}/{len(segments)}...")
+             transcripts.append(transcribe_segment(seg, client, self.model, self.prompt))
+
+         # 5) Write out
+         full = "\n\n".join(transcripts)
+         out_path = self.output or self.AUDIO_PATH.with_suffix(".txt")
+         out_path.write_text(full, encoding="utf-8")
+         print(f"[✓] Transcription saved to: {out_path}")
+
+
+ def load_audio_segment(file_path: Path) -> AudioSegment:
+     """
+     Load an audio file as an AudioSegment. Convert to mp3 format in-memory if needed.
+     """
+     ext = file_path.suffix.lower()[1:]
+     audio = AudioSegment.from_file(file_path.as_posix(), format=None if ext == "mp3" else ext)
+     if ext != "mp3":
+         buffer = BytesIO()
+         audio.export(buffer, format="mp3")
+         buffer.seek(0)
+         audio = AudioSegment.from_file(buffer, format="mp3")
+     return audio
+
+
+ def split_audio(audio: AudioSegment, max_duration_s: int) -> List[AudioSegment]:
+     """
+     Split the AudioSegment into chunks no longer than max_duration_s seconds.
+     """
+     chunk_ms = (max_duration_s - 1) * 1000
+     duration_ms = len(audio)
+     segments: List[AudioSegment] = []
+     for start in range(0, duration_ms, chunk_ms):
+         end = min(start + chunk_ms, duration_ms)
+         segments.append(audio[start:end])
+     return segments
+
+
+ def transcribe_segment(segment: AudioSegment, client: OpenAI, model: str, prompt: str) -> str:
+     """
+     Transcribe a single AudioSegment chunk and return its text.
+     """
+     buffer = BytesIO()
+     segment.export(buffer, format="mp3")
+     buffer.seek(0)
+     mp3_bytes = buffer.read()
+
+     response = client.audio.transcriptions.create(
+         model=model,
+         prompt=prompt,
+         file=("audio.mp3", mp3_bytes),
+         response_format="text",
+         stream=True,
+     )
+     for res in response:
+         if res.type == "transcript.text.delta":
+             print(res.delta, end="", flush=True)
+         elif res.type == "transcript.text.done":
+             print()
+             return res.text
+     raise RuntimeError("No transcription result found.")
+
+
+ def main() -> None:
+     Arguments().run()
+
+
+ if __name__ == "__main__":
+     main()
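
For reference, the timestamp grammar and segment syntax used by transcribe.py behave as follows. This is a minimal sketch; the import path chatterer.examples.transcribe is an assumption based on the file list above, and the expected values follow directly from the parsing rules shown in the diff.

from chatterer.examples.transcribe import parse_timestamp  # assumed import path

# The rightmost colon-separated component is seconds, then minutes, then hours.
assert parse_timestamp("90") == 90.0             # bare seconds
assert parse_timestamp("16:50") == 1010.0        # MM:SS -> 16 * 60 + 50
assert parse_timestamp("1:02:03.5") == 3723.5    # H:MM:SS.sss -> 3600 + 120 + 3.5

In a segments string such as "650-750,16:50-17:30,800-", an omitted start defaults to 0.0 and an omitted end defaults to the full audio duration, so "800-" selects from 800 s to the end of the file.
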
chatterer/examples/upstage.py
@@ -1,89 +1,87 @@
- import logging
- from pathlib import Path
- from typing import Optional
-
- from langchain_core.documents.base import Blob
- from spargear import ArgumentSpec, BaseArguments
-
- from chatterer import Chatterer, UpstageDocumentParseParser
- from chatterer.tools.upstage_document_parser import (
-     DEFAULT_IMAGE_DIR,
-     DOCUMENT_PARSE_BASE_URL,
-     DOCUMENT_PARSE_DEFAULT_MODEL,
-     OCR,
-     Category,
-     OutputFormat,
-     SplitType,
- )
-
- logger = logging.getLogger(__name__)
-
-
- class Arguments(BaseArguments):
-     INPUT_PATH: Path
-     """Input file to parse. Can be a PDF, image, or other supported formats."""
-     output: Optional[Path] = None
-     """Output file path for the parsed content. Defaults to input file with .md suffix if not provided."""
-     api_key: Optional[str] = None
-     """API key for the Upstage API."""
-     base_url: str = DOCUMENT_PARSE_BASE_URL
-     """Base URL for the Upstage API."""
-     model: str = DOCUMENT_PARSE_DEFAULT_MODEL
-     """Model to use for parsing."""
-     split: SplitType = "none"
-     """Split type for the parsed content."""
-     ocr: OCR = "auto"
-     """OCR type for parsing."""
-     output_format: OutputFormat = "markdown"
-     """Output format for the parsed content."""
-     coordinates: bool = False
-     """Whether to include coordinates in the output."""
-     base64_encoding: list[Category] = ["figure"]
-     """Base64 encoding for specific categories in the parsed content."""
-     image_description_instruction: str = "Describe the image in detail."
-     """Instruction for generating image descriptions."""
-     image_dir: str = DEFAULT_IMAGE_DIR
-     """Directory to save images extracted from the document."""
-     chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
-         ["--chatterer"],
-         default=None,
-         help="Chatterer instance for communication.",
-         type=Chatterer.from_provider,
-     )
-
-     def run(self) -> None:
-         input = self.INPUT_PATH.resolve()
-         out = self.output or input.with_suffix(".md")
-
-         parser = UpstageDocumentParseParser(
-             api_key=self.api_key,
-             base_url=self.base_url,
-             model=self.model,
-             split=self.split,
-             ocr=self.ocr,
-             output_format=self.output_format,
-             coordinates=self.coordinates,
-             base64_encoding=self.base64_encoding,
-             image_description_instruction=self.image_description_instruction,
-             image_dir=self.image_dir,
-             chatterer=self.chatterer.value,
-         )
-         docs = parser.parse(Blob.from_path(input)) # pyright: ignore[reportUnknownMemberType]
-
-         if self.image_dir:
-             for path, image in parser.image_data.items():
-                 (path := Path(path)).parent.mkdir(parents=True, exist_ok=True)
-                 path.write_bytes(image)
-                 logger.info(f"Saved image to `{path}`")
-
-         markdown: str = "\n\n".join(f"<!--- page {i} -->\n{doc.page_content}" for i, doc in enumerate(docs, 1))
-         out.write_text(markdown, encoding="utf-8")
-         logger.info(f"Parsed `{input}` to `{out}`")
-
-
- def main() -> None:
-     Arguments().run()
-
-
- if __name__ == "__main__":
-     main()
+ from pathlib import Path
+ from typing import Optional
+
+ from langchain_core.documents.base import Blob
+ from loguru import logger
+ from spargear import ArgumentSpec, BaseArguments
+
+ from chatterer import Chatterer, UpstageDocumentParseParser
+ from chatterer.tools.upstage_document_parser import (
+     DEFAULT_IMAGE_DIR,
+     DOCUMENT_PARSE_BASE_URL,
+     DOCUMENT_PARSE_DEFAULT_MODEL,
+     OCR,
+     Category,
+     OutputFormat,
+     SplitType,
+ )
+
+
+ class Arguments(BaseArguments):
+     INPUT_PATH: Path
+     """Input file to parse. Can be a PDF, image, or other supported formats."""
+     output: Optional[Path] = None
+     """Output file path for the parsed content. Defaults to input file with .md suffix if not provided."""
+     api_key: Optional[str] = None
+     """API key for the Upstage API."""
+     base_url: str = DOCUMENT_PARSE_BASE_URL
+     """Base URL for the Upstage API."""
+     model: str = DOCUMENT_PARSE_DEFAULT_MODEL
+     """Model to use for parsing."""
+     split: SplitType = "none"
+     """Split type for the parsed content."""
+     ocr: OCR = "auto"
+     """OCR type for parsing."""
+     output_format: OutputFormat = "markdown"
+     """Output format for the parsed content."""
+     coordinates: bool = False
+     """Whether to include coordinates in the output."""
+     base64_encoding: list[Category] = ["figure"]
+     """Base64 encoding for specific categories in the parsed content."""
+     image_description_instruction: str = "Describe the image in detail."
+     """Instruction for generating image descriptions."""
+     image_dir: str = DEFAULT_IMAGE_DIR
+     """Directory to save images extracted from the document."""
+     chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
+         ["--chatterer"],
+         default=None,
+         help="Chatterer instance for communication.",
+         type=Chatterer.from_provider,
+     )
+
+     def run(self) -> None:
+         input = self.INPUT_PATH.resolve()
+         out = self.output or input.with_suffix(".md")
+
+         parser = UpstageDocumentParseParser(
+             api_key=self.api_key,
+             base_url=self.base_url,
+             model=self.model,
+             split=self.split,
+             ocr=self.ocr,
+             output_format=self.output_format,
+             coordinates=self.coordinates,
+             base64_encoding=self.base64_encoding,
+             image_description_instruction=self.image_description_instruction,
+             image_dir=self.image_dir,
+             chatterer=self.chatterer.value,
+         )
+         docs = parser.parse(Blob.from_path(input)) # pyright: ignore[reportUnknownMemberType]
+
+         if self.image_dir:
+             for path, image in parser.image_data.items():
+                 (path := Path(path)).parent.mkdir(parents=True, exist_ok=True)
+                 path.write_bytes(image)
+                 logger.info(f"Saved image to `{path}`")
+
+         markdown: str = "\n\n".join(f"<!--- page {i} -->\n{doc.page_content}" for i, doc in enumerate(docs, 1))
+         out.write_text(markdown, encoding="utf-8")
+         logger.info(f"Parsed `{input}` to `{out}`")
+
+
+ def main() -> None:
+     Arguments().run()
+
+
+ if __name__ == "__main__":
+     main()
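
The only functional change to upstage.py is the logging backend: the module-level logging.getLogger(__name__) setup is replaced with loguru's ready-made logger. The practical effect for a CLI script is the default output behavior, sketched below (illustrative, not taken from the package).

import logging
from loguru import logger as loguru_logger

# stdlib: with no handler or level configured, INFO records are discarded
# (the root logger defaults to the WARNING level).
logging.getLogger("demo").info("this line produces no output")

# loguru: ships with a default stderr sink, so this prints immediately.
loguru_logger.info("this line appears on stderr")
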