chatterer 0.1.23__py3-none-any.whl → 0.1.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47):
  1. chatterer/__init__.py +97 -93
  2. chatterer/common_types/__init__.py +21 -21
  3. chatterer/common_types/io.py +19 -19
  4. chatterer/examples/__main__.py +75 -0
  5. chatterer/examples/{anything_to_markdown.py → any2md.py} +85 -85
  6. chatterer/examples/{pdf_to_markdown.py → pdf2md.py} +338 -338
  7. chatterer/examples/{pdf_to_text.py → pdf2txt.py} +54 -54
  8. chatterer/examples/{make_ppt.py → ppt.py} +486 -488
  9. chatterer/examples/pw.py +143 -0
  10. chatterer/examples/{get_code_snippets.py → snippet.py} +56 -55
  11. chatterer/examples/transcribe.py +192 -0
  12. chatterer/examples/{upstage_parser.py → upstage.py} +89 -89
  13. chatterer/examples/{webpage_to_markdown.py → web2md.py} +80 -70
  14. chatterer/interactive.py +354 -354
  15. chatterer/language_model.py +536 -536
  16. chatterer/messages.py +21 -21
  17. chatterer/strategies/__init__.py +13 -13
  18. chatterer/strategies/atom_of_thoughts.py +975 -975
  19. chatterer/strategies/base.py +14 -14
  20. chatterer/tools/__init__.py +46 -46
  21. chatterer/tools/caption_markdown_images.py +384 -384
  22. chatterer/tools/citation_chunking/__init__.py +3 -3
  23. chatterer/tools/citation_chunking/chunks.py +53 -53
  24. chatterer/tools/citation_chunking/citation_chunker.py +118 -118
  25. chatterer/tools/citation_chunking/citations.py +285 -285
  26. chatterer/tools/citation_chunking/prompt.py +157 -157
  27. chatterer/tools/citation_chunking/reference.py +26 -26
  28. chatterer/tools/citation_chunking/utils.py +138 -138
  29. chatterer/tools/convert_pdf_to_markdown.py +645 -625
  30. chatterer/tools/convert_to_text.py +446 -446
  31. chatterer/tools/upstage_document_parser.py +705 -705
  32. chatterer/tools/webpage_to_markdown.py +739 -739
  33. chatterer/tools/youtube.py +146 -146
  34. chatterer/utils/__init__.py +15 -15
  35. chatterer/utils/base64_image.py +293 -285
  36. chatterer/utils/bytesio.py +59 -59
  37. chatterer/utils/code_agent.py +237 -237
  38. chatterer/utils/imghdr.py +148 -148
  39. {chatterer-0.1.23.dist-info → chatterer-0.1.25.dist-info}/METADATA +390 -392
  40. chatterer-0.1.25.dist-info/RECORD +45 -0
  41. chatterer-0.1.25.dist-info/entry_points.txt +2 -0
  42. chatterer/examples/login_with_playwright.py +0 -156
  43. chatterer/examples/transcription_api.py +0 -112
  44. chatterer-0.1.23.dist-info/RECORD +0 -44
  45. chatterer-0.1.23.dist-info/entry_points.txt +0 -10
  46. {chatterer-0.1.23.dist-info → chatterer-0.1.25.dist-info}/WHEEL +0 -0
  47. {chatterer-0.1.23.dist-info → chatterer-0.1.25.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,143 @@
1
+ import json
2
+ import logging
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ from spargear import ArgumentSpec, BaseArguments, RunnableArguments, SubcommandSpec
7
+
8
+ from chatterer import PlayWrightBot
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
def generate_json_path() -> Path:
    """Return the resolved, absolute path of ``session_state.json`` relative to the current directory."""
    default_file = Path("session_state.json")
    return default_file.resolve()
15
+
16
+
17
class ReadArgs(RunnableArguments[None]):
    """Arguments for the 'read' subcommand."""

    URL: str
    """URL (potentially protected) to navigate to using the saved session."""
    json: ArgumentSpec[Path] = ArgumentSpec(
        ["--json", "-j"],
        default_factory=generate_json_path,
        help="Path to the session state JSON file to load.",
    )

    def run(self) -> None:
        """
        Open ``URL`` in a non-headless browser seeded with a previously saved session.

        The storage state is parsed from the JSON file into a Python object and
        passed to PlayWrightBot as ``storage_state``. The process exits with
        status 1 when the file is missing, is not valid JSON, or cannot be read.
        The browser stays open until the user presses Enter in the console.
        """
        target_url = self.URL
        state_file = self.json.unwrap()
        logger.info(f"Loading session from {state_file} and navigating to {target_url} ...")

        if not state_file.exists():
            logger.error(f"Session file not found at {state_file}")
            sys.exit(1)

        # Parse the file up front so a broken session file is reported before
        # any browser is launched. `json` here is the module, not the field above.
        logger.info(f"Reading storage state content from {state_file} ...")
        try:
            saved_state = json.loads(state_file.read_text(encoding="utf-8"))
        except json.JSONDecodeError:
            logger.error(f"Failed to decode JSON from {state_file}")
            sys.exit(1)
        except Exception as e:
            logger.error(f"Error reading file {state_file}: {e}")
            sys.exit(1)

        logger.info("Launching browser with loaded session state...")
        launch_options = {"headless": False}
        persistency_options = {"storage_state": saved_state}
        with PlayWrightBot(
            playwright_launch_options=launch_options,
            playwright_persistency_options=persistency_options,
        ) as bot:
            bot.get_page(target_url)

            # Block here so the page can be inspected before the context exits.
            logger.info("Press Enter in the console when you're done checking the protected page.")
            input(" >> Press Enter to exit: ")

        logger.info("Done! Browser is now closed.")
69
+
70
+
71
class WriteArgs(RunnableArguments[None]):
    """Arguments for the 'write' subcommand."""

    URL: str
    """URL to navigate to for manual login."""
    json: ArgumentSpec[Path] = ArgumentSpec(
        ["--json", "-j"],
        default_factory=generate_json_path,
        help="Path to save the session state JSON file.",
    )

    def run(self) -> None:
        """
        Open ``URL`` in a non-headless browser and save the session after a manual login.

        Once the user confirms with Enter in the console, the storage state of
        the running browser is written to the configured JSON path. The parent
        directory of that path is created beforehand if it does not exist.
        """
        login_url = self.URL
        state_file = self.json.unwrap()
        logger.info(f"Launching browser and navigating to {login_url} ... Please log in manually.")

        # Create the destination directory before the browser starts.
        state_file.parent.mkdir(parents=True, exist_ok=True)

        launch_options = {"headless": False}
        with PlayWrightBot(playwright_launch_options=launch_options) as bot:
            bot.get_page(login_url)

            logger.info("After completing the login in the browser, press Enter here to save the session.")
            input(" >> Press Enter when ready: ")

            # Per the original author's note, get_sync_browser() hands back the
            # browser context, which exposes storage_state().
            browser_context = bot.get_sync_browser()

            # Persist the session (cookies, localStorage) to the JSON file.
            logger.info(f"Saving storage state to {state_file} ...")
            browser_context.storage_state(path=state_file)

        logger.info("Done! Browser is now closed.")
109
+
110
+
111
class Arguments(BaseArguments):
    """
    A simple CLI tool for saving and using Playwright sessions via storage_state.
    Uses spargear for declarative argument parsing.
    """

    read: SubcommandSpec[ReadArgs] = SubcommandSpec(
        name="read",
        argument_class=ReadArgs,
        help="Use a saved session to view a protected page.",
        description="Loads session state from the specified JSON file and navigates to the URL.",
    )
    write: SubcommandSpec[WriteArgs] = SubcommandSpec(
        name="write",
        argument_class=WriteArgs,
        help="Save a new session by manually logging in.",
        description="Launches a browser to the specified URL. Log in manually, then press Enter to save session state.",
    )

    def run(self) -> None:
        """Run the subcommand that was selected, or print the parser help if there is none."""
        selected = self.last_command
        if not isinstance(selected, RunnableArguments):
            # Nothing runnable was chosen: show usage instead.
            self.get_parser().print_help()
            return
        selected.run()
136
+
137
+
138
def main() -> None:
    """Entry point: build the top-level CLI arguments and dispatch the chosen subcommand."""
    cli = Arguments()
    cli.run()


if __name__ == "__main__":
    main()
@@ -1,55 +1,56 @@
1
- import logging
2
- from pathlib import Path
3
- from typing import Optional
4
-
5
- from spargear import BaseArguments
6
-
7
- from chatterer import CodeSnippets
8
-
9
- logger = logging.getLogger(__name__)
10
-
11
-
12
- class GetCodeSnippetsArgs(BaseArguments):
13
- input: str
14
- """Path to the package or file from which to extract code snippets."""
15
- output: Optional[str] = None
16
- """Output path for the extracted code snippets. If not provided, defaults to a file with the same name as the input."""
17
- ban_file_patterns: list[str] = [".venv/*", Path(__file__).relative_to(Path.cwd()).as_posix()]
18
- """List of file patterns to ignore."""
19
- glob_patterns: list[str] = ["*.py"]
20
- """List of glob patterns to include."""
21
- case_sensitive: bool = False
22
- """Enable case-sensitive matching for glob patterns."""
23
- prevent_save_file: bool = False
24
- """Prevent saving the extracted code snippets to a file."""
25
-
26
- def run(self) -> CodeSnippets:
27
- if not self.prevent_save_file:
28
- if not self.output:
29
- output = Path(__file__).with_suffix(".txt")
30
- else:
31
- output = Path(self.output)
32
- else:
33
- output = None
34
-
35
- cs = CodeSnippets.from_path_or_pkgname(
36
- path_or_pkgname=self.input,
37
- ban_file_patterns=self.ban_file_patterns,
38
- glob_patterns=self.glob_patterns,
39
- case_sensitive=self.case_sensitive,
40
- )
41
- if output is not None:
42
- output.parent.mkdir(parents=True, exist_ok=True)
43
- output.write_text(cs.snippets_text, encoding="utf-8")
44
- logger.info(f"Extracted code snippets from `{self.input}` and saved to `{output}`.")
45
- else:
46
- logger.info(f"Extracted code snippets from `{self.input}`.")
47
- return cs
48
-
49
-
50
- def main() -> None:
51
- GetCodeSnippetsArgs().run()
52
-
53
-
54
- if __name__ == "__main__":
55
- main()
1
+ import logging
2
+ from datetime import datetime
3
+ from pathlib import Path
4
+ from typing import Optional
5
+
6
+ from spargear import RunnableArguments
7
+
8
+ from chatterer import CodeSnippets
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
def _default_ban_file_patterns() -> list[str]:
    """Build the default ignore patterns: the virtualenv plus this script itself."""
    patterns = [".venv/*"]
    try:
        # relative_to() raises ValueError when this file does not live under the
        # current working directory (e.g. when installed into site-packages and
        # run from a project folder). In that case there is no relative pattern
        # to add, so the script is simply not listed.
        patterns.append(Path(__file__).relative_to(Path.cwd()).as_posix())
    except ValueError:
        pass
    return patterns


# CLI arguments for extracting code snippets from a path or an installed package.
class Arguments(RunnableArguments[CodeSnippets]):
    PATH_OR_PACKAGE_NAME: str
    """Path to the package or file from which to extract code snippets."""
    output: Optional[str] = None
    """Output path for the extracted code snippets. If not provided, defaults to a file with the current timestamp."""
    ban_file_patterns: list[str] = _default_ban_file_patterns()
    """List of file patterns to ignore."""
    glob_patterns: list[str] = ["*.py"]
    """List of glob patterns to include."""
    case_sensitive: bool = False
    """Enable case-sensitive matching for glob patterns."""
    prevent_save_file: bool = False
    """Prevent saving the extracted code snippets to a file."""

    def run(self) -> CodeSnippets:
        """
        Extract the snippets and, unless saving is disabled, write them to a text file.

        Returns:
            The ``CodeSnippets`` object built from ``PATH_OR_PACKAGE_NAME``.
        """
        # Decide the destination first so the timestamp reflects the start of the run.
        if self.prevent_save_file:
            output = None
        elif self.output:
            output = Path(self.output)
        else:
            output = Path(datetime.now().strftime("%Y%m%d_%H%M%S") + "_snippets.txt")

        cs = CodeSnippets.from_path_or_pkgname(
            path_or_pkgname=self.PATH_OR_PACKAGE_NAME,
            ban_file_patterns=self.ban_file_patterns,
            glob_patterns=self.glob_patterns,
            case_sensitive=self.case_sensitive,
        )
        if output is not None:
            output.parent.mkdir(parents=True, exist_ok=True)
            output.write_text(cs.snippets_text, encoding="utf-8")
            logger.info(f"Extracted code snippets from `{self.PATH_OR_PACKAGE_NAME}` and saved to `{output}`.")
        else:
            logger.info(f"Extracted code snippets from `{self.PATH_OR_PACKAGE_NAME}`.")
        return cs
49
+
50
+
51
def main() -> None:
    """Entry point: parse the command line and run the snippet extraction."""
    cli = Arguments()
    cli.run()


if __name__ == "__main__":
    main()
@@ -0,0 +1,192 @@
1
+ # pyright: reportUnknownVariableType=false, reportUnknownMemberType=false, reportArgumentType=false, reportMissingTypeStubs=false
2
+
3
+ from io import BytesIO
4
+ from pathlib import Path
5
+ from typing import List, Optional
6
+
7
+ from openai import OpenAI
8
+ from pydub import AudioSegment
9
+ from spargear import RunnableArguments
10
+
11
+
12
+ # -------------------------------------------------------------------
13
+ # Helper functions for timestamp parsing & segment selection
14
+ # -------------------------------------------------------------------
15
def parse_timestamp(ts: str) -> float:
    """
    Convert a colon-separated timestamp into a number of seconds.

    Accepted shapes:
    - "SS" or "SS.sss"
    - "MM:SS" or "MM:SS.sss"
    - "HH:MM:SS" or "HH:MM:SS.sss"

    An empty field counts as zero. More than three fields raises ValueError.
    """
    # Weight of each field, counted from the right: seconds, minutes, hours.
    weights = (1.0, 60.0, 3600.0)
    total = 0.0
    for position, field in enumerate(reversed(ts.split(":"))):
        amount = float(field) if field else 0.0
        if position >= len(weights):
            raise ValueError(f"Timestamp '{ts}' is too long (use H:MM:SS at most)")
        total += amount * weights[position]
    return total
39
+
40
+
41
def get_selected_audio(audio: AudioSegment, segments_str: str) -> AudioSegment:
    """
    Cut the ranges named in ``segments_str`` out of ``audio`` and join them in order.

    ``segments_str`` is a comma-separated list such as "650-750,16:50-17:30,800-".
    A missing start means the beginning, a missing end means the end of the audio.
    Bounds are clamped to the audio length; ranges that end up empty are skipped
    with a printed warning.

    Raises:
        ValueError: If an entry contains no '-'.
        RuntimeError: If no entry yields a non-empty range.
    """
    total_s = len(audio) / 1000.0
    pieces: List[AudioSegment] = []

    for spec in segments_str.split(","):
        if "-" not in spec:
            raise ValueError(f"Invalid segment '{spec}' (must contain '-')")
        # Split on the first '-' only.
        raw_start, _, raw_end = spec.partition("-")
        begin_s = parse_timestamp(raw_start) if raw_start.strip() else 0.0
        finish_s = parse_timestamp(raw_end) if raw_end.strip() else total_s

        # Clamp both bounds into [0, total_s].
        begin_s = max(0.0, min(begin_s, total_s))
        finish_s = max(0.0, min(finish_s, total_s))
        if finish_s <= begin_s:
            print(f"[!] Warning: segment '{spec}' yields non-positive duration; skipping.")
            continue

        # The audio object is sliced in milliseconds.
        pieces.append(audio[int(begin_s * 1000) : int(finish_s * 1000)])
        print(f"[i] Selected segment {begin_s:.2f}s–{finish_s:.2f}s ({finish_s - begin_s:.2f}s)")

    if not pieces:
        raise RuntimeError("No valid segments were specified.")

    combined = pieces[0]
    for piece in pieces[1:]:
        combined += piece
    return combined
77
+
78
+
79
+ # -------------------------------------------------------------------
80
+ # Main transcription logic
81
+ # -------------------------------------------------------------------
82
# CLI arguments for transcribing an audio file, optionally restricted to time ranges.
class Arguments(RunnableArguments[None]):
    AUDIO_PATH: Path
    """The audio file to transcribe."""
    output: Optional[Path] = None
    """Path to save the transcription output."""
    model: str = "gpt-4o-transcribe"
    """The model to use for transcription."""
    api_key: Optional[str] = None
    """The API key for authentication."""
    base_url: str = "https://api.openai.com/v1"
    """The base URL for the API."""
    prompt: str = "Transcribe whole text from audio."
    """The prompt to use for transcription."""
    segments: Optional[str] = None
    """
    Comma-separated list of time ranges to include (e.g. "650-750,16:50-17:30,800-").
    Each range is start-end; start or end may be omitted.
    Supports seconds or H:MM:SS formats.
    """
    max_chunk_duration: int = 600
    """Maximum duration of each chunk in seconds."""

    def run(self) -> None:
        """
        Load the audio, optionally keep only the requested ranges, transcribe it
        chunk by chunk and write the joined text next to the input (or to ``output``).
        """
        client = OpenAI(api_key=self.api_key, base_url=self.base_url)

        # Load the whole file, then narrow it down if ranges were given.
        audio = load_audio_segment(self.AUDIO_PATH)
        if self.segments:
            audio = get_selected_audio(audio, self.segments)
            print(f"[i] Combined audio duration: {len(audio) / 1000:.1f}s (from segments)")
        else:
            print(f"[i] Audio duration: {len(audio) / 1000:.1f}s (full audio)")

        # Cut into pieces bounded by max_chunk_duration.
        chunks = split_audio(audio, self.max_chunk_duration)
        chunk_count = len(chunks)
        print(f"[i] Splitting into {chunk_count} segment(s) for transcription")

        # Transcribe the pieces one after another, keeping their order.
        transcripts: List[str] = []
        for number, chunk in enumerate(chunks, start=1):
            print(f"[i] Transcribing segment {number}/{chunk_count}...")
            text = transcribe_segment(chunk, client, self.model, self.prompt)
            transcripts.append(text)

        # Chunks are separated by a blank line in the output file.
        destination = self.output or self.AUDIO_PATH.with_suffix(".txt")
        destination.write_text("\n\n".join(transcripts), encoding="utf-8")
        print(f"[✓] Transcription saved to: {destination}")
133
+
134
+
135
def load_audio_segment(file_path: Path) -> AudioSegment:
    """
    Read ``file_path`` into an AudioSegment.

    Files whose extension is not "mp3" are decoded using the extension as the
    format name, then re-encoded to mp3 in memory and loaded again from that buffer.
    """
    suffix = file_path.suffix.lower()[1:]
    source = file_path.as_posix()
    if suffix == "mp3":
        return AudioSegment.from_file(source, format=None)

    decoded = AudioSegment.from_file(source, format=suffix)
    mp3_buffer = BytesIO()
    decoded.export(mp3_buffer, format="mp3")
    mp3_buffer.seek(0)
    return AudioSegment.from_file(mp3_buffer, format="mp3")
147
+
148
+
149
def split_audio(audio: AudioSegment, max_duration_s: int) -> List[AudioSegment]:
    """
    Split the AudioSegment into consecutive chunks no longer than max_duration_s seconds.

    Args:
        audio: Audio to split; ``len(audio)`` and slice bounds are treated as milliseconds.
        max_duration_s: Upper bound, in seconds, for each chunk. Must be at least 1.

    Returns:
        The chunks in their original order; an empty list when ``audio`` has zero length.

    Raises:
        ValueError: If ``max_duration_s`` is smaller than 1.
    """
    if max_duration_s < 1:
        # A non-positive limit used to produce a negative range step, which
        # silently yielded no chunks at all.
        raise ValueError(f"max_duration_s must be at least 1 second, got {max_duration_s}")

    # Chunks are one second shorter than the limit (presumably as headroom), but
    # never shorter than one second: a limit of 1 would otherwise give a step
    # of 0, which range() rejects.
    chunk_ms = max(max_duration_s - 1, 1) * 1000
    duration_ms = len(audio)
    return [audio[start : min(start + chunk_ms, duration_ms)] for start in range(0, duration_ms, chunk_ms)]
160
+
161
+
162
def transcribe_segment(segment: AudioSegment, client: OpenAI, model: str, prompt: str) -> str:
    """
    Send one audio chunk to the transcription endpoint and return the final text.

    The chunk is encoded to mp3 in memory and uploaded as "audio.mp3". The
    response is consumed as a stream: partial text is echoed to stdout as it
    arrives and the text of the "done" event is returned.

    Raises:
        RuntimeError: If the stream ends without a "transcript.text.done" event.
    """
    encoded = BytesIO()
    segment.export(encoded, format="mp3")
    mp3_bytes = encoded.getvalue()

    stream = client.audio.transcriptions.create(
        model=model,
        prompt=prompt,
        file=("audio.mp3", mp3_bytes),
        response_format="text",
        stream=True,
    )
    for event in stream:
        kind = event.type
        if kind == "transcript.text.delta":
            # Live progress: print the fragment without a newline.
            print(event.delta, end="", flush=True)
            continue
        if kind == "transcript.text.done":
            print()
            return event.text
    raise RuntimeError("No transcription result found.")
185
+
186
+
187
def main() -> None:
    """Entry point: parse the command line and run the transcription."""
    cli = Arguments()
    cli.run()


if __name__ == "__main__":
    main()
@@ -1,89 +1,89 @@
1
- import logging
2
- from pathlib import Path
3
- from typing import Optional
4
-
5
- from langchain_core.documents.base import Blob
6
- from spargear import ArgumentSpec, BaseArguments
7
-
8
- from chatterer import Chatterer, UpstageDocumentParseParser
9
- from chatterer.tools.upstage_document_parser import (
10
- DEFAULT_IMAGE_DIR,
11
- DOCUMENT_PARSE_BASE_URL,
12
- DOCUMENT_PARSE_DEFAULT_MODEL,
13
- OCR,
14
- Category,
15
- OutputFormat,
16
- SplitType,
17
- )
18
-
19
- logger = logging.getLogger(__name__)
20
-
21
-
22
- class UpstageParserArguments(BaseArguments):
23
- input: Path
24
- """Input file to parse. Can be a PDF, image, or other supported formats."""
25
- output: Optional[Path] = None
26
- """Output file path for the parsed content. Defaults to input file with .md suffix if not provided."""
27
- api_key: Optional[str] = None
28
- """API key for the Upstage API."""
29
- base_url: str = DOCUMENT_PARSE_BASE_URL
30
- """Base URL for the Upstage API."""
31
- model: str = DOCUMENT_PARSE_DEFAULT_MODEL
32
- """Model to use for parsing."""
33
- split: SplitType = "none"
34
- """Split type for the parsed content."""
35
- ocr: OCR = "auto"
36
- """OCR type for parsing."""
37
- output_format: OutputFormat = "markdown"
38
- """Output format for the parsed content."""
39
- coordinates: bool = False
40
- """Whether to include coordinates in the output."""
41
- base64_encoding: list[Category] = ["figure"]
42
- """Base64 encoding for specific categories in the parsed content."""
43
- image_description_instruction: str = "Describe the image in detail."
44
- """Instruction for generating image descriptions."""
45
- image_dir: str = DEFAULT_IMAGE_DIR
46
- """Directory to save images extracted from the document."""
47
- chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
48
- ["--chatterer"],
49
- default=None,
50
- help="Chatterer instance for communication.",
51
- type=Chatterer.from_provider,
52
- )
53
-
54
- def run(self) -> None:
55
- input = UpstageParserArguments.input.resolve()
56
- out = UpstageParserArguments.output or input.with_suffix(".md")
57
-
58
- parser = UpstageDocumentParseParser(
59
- api_key=UpstageParserArguments.api_key,
60
- base_url=UpstageParserArguments.base_url,
61
- model=UpstageParserArguments.model,
62
- split=UpstageParserArguments.split,
63
- ocr=UpstageParserArguments.ocr,
64
- output_format=UpstageParserArguments.output_format,
65
- coordinates=UpstageParserArguments.coordinates,
66
- base64_encoding=UpstageParserArguments.base64_encoding,
67
- image_description_instruction=UpstageParserArguments.image_description_instruction,
68
- image_dir=UpstageParserArguments.image_dir,
69
- chatterer=UpstageParserArguments.chatterer.value,
70
- )
71
- docs = parser.parse(Blob.from_path(input)) # pyright: ignore[reportUnknownMemberType]
72
-
73
- if UpstageParserArguments.image_dir:
74
- for path, image in parser.image_data.items():
75
- (path := Path(path)).parent.mkdir(parents=True, exist_ok=True)
76
- path.write_bytes(image)
77
- logger.info(f"Saved image to `{path}`")
78
-
79
- markdown: str = "\n\n".join(f"<!--- page {i} -->\n{doc.page_content}" for i, doc in enumerate(docs, 1))
80
- out.write_text(markdown, encoding="utf-8")
81
- logger.info(f"Parsed `{input}` to `{out}`")
82
-
83
-
84
- def main() -> None:
85
- UpstageParserArguments().run()
86
-
87
-
88
- if __name__ == "__main__":
89
- main()
1
+ import logging
2
+ from pathlib import Path
3
+ from typing import Optional
4
+
5
+ from langchain_core.documents.base import Blob
6
+ from spargear import ArgumentSpec, BaseArguments
7
+
8
+ from chatterer import Chatterer, UpstageDocumentParseParser
9
+ from chatterer.tools.upstage_document_parser import (
10
+ DEFAULT_IMAGE_DIR,
11
+ DOCUMENT_PARSE_BASE_URL,
12
+ DOCUMENT_PARSE_DEFAULT_MODEL,
13
+ OCR,
14
+ Category,
15
+ OutputFormat,
16
+ SplitType,
17
+ )
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
# CLI arguments for parsing a document with the Upstage document parser.
class Arguments(BaseArguments):
    INPUT_PATH: Path
    """Input file to parse. Can be a PDF, image, or other supported formats."""
    output: Optional[Path] = None
    """Output file path for the parsed content. Defaults to input file with .md suffix if not provided."""
    api_key: Optional[str] = None
    """API key for the Upstage API."""
    base_url: str = DOCUMENT_PARSE_BASE_URL
    """Base URL for the Upstage API."""
    model: str = DOCUMENT_PARSE_DEFAULT_MODEL
    """Model to use for parsing."""
    split: SplitType = "none"
    """Split type for the parsed content."""
    ocr: OCR = "auto"
    """OCR type for parsing."""
    output_format: OutputFormat = "markdown"
    """Output format for the parsed content."""
    coordinates: bool = False
    """Whether to include coordinates in the output."""
    base64_encoding: list[Category] = ["figure"]
    """Base64 encoding for specific categories in the parsed content."""
    image_description_instruction: str = "Describe the image in detail."
    """Instruction for generating image descriptions."""
    image_dir: str = DEFAULT_IMAGE_DIR
    """Directory to save images extracted from the document."""
    chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
        ["--chatterer"],
        default=None,
        help="Chatterer instance for communication.",
        type=Chatterer.from_provider,
    )

    def run(self) -> None:
        """
        Parse ``INPUT_PATH``, save any extracted images, and write the pages as one text file.

        The output goes to ``output`` when given, otherwise next to the input
        with a ``.md`` suffix. Each page is preceded by a ``<!--- page N -->`` marker.
        """
        source_path = self.INPUT_PATH.resolve()
        destination = self.output or source_path.with_suffix(".md")

        parser = UpstageDocumentParseParser(
            api_key=self.api_key,
            base_url=self.base_url,
            model=self.model,
            split=self.split,
            ocr=self.ocr,
            output_format=self.output_format,
            coordinates=self.coordinates,
            base64_encoding=self.base64_encoding,
            image_description_instruction=self.image_description_instruction,
            image_dir=self.image_dir,
            chatterer=self.chatterer.value,
        )
        docs = parser.parse(Blob.from_path(source_path))  # pyright: ignore[reportUnknownMemberType]

        # Images are only written out when an image directory is configured.
        if self.image_dir:
            for raw_path, image_bytes in parser.image_data.items():
                image_path = Path(raw_path)
                image_path.parent.mkdir(parents=True, exist_ok=True)
                image_path.write_bytes(image_bytes)
                logger.info(f"Saved image to `{image_path}`")

        # Pages are numbered from 1 and separated by a blank line.
        markdown: str = "\n\n".join(
            f"<!--- page {page_no} -->\n{doc.page_content}" for page_no, doc in enumerate(docs, 1)
        )
        destination.write_text(markdown, encoding="utf-8")
        logger.info(f"Parsed `{source_path}` to `{destination}`")
82
+
83
+
84
def main() -> None:
    """Entry point: parse the command line and run the document parser."""
    cli = Arguments()
    cli.run()


if __name__ == "__main__":
    main()