chatterer 0.1.24__py3-none-any.whl → 0.1.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. chatterer/__init__.py +87 -93
  2. chatterer/common_types/__init__.py +21 -21
  3. chatterer/common_types/io.py +19 -19
  4. chatterer/examples/__main__.py +75 -75
  5. chatterer/examples/any2md.py +85 -85
  6. chatterer/examples/pdf2md.py +338 -338
  7. chatterer/examples/pdf2txt.py +54 -54
  8. chatterer/examples/ppt.py +486 -486
  9. chatterer/examples/pw.py +143 -137
  10. chatterer/examples/snippet.py +56 -55
  11. chatterer/examples/transcribe.py +192 -112
  12. chatterer/examples/upstage.py +89 -89
  13. chatterer/examples/web2md.py +80 -66
  14. chatterer/interactive.py +354 -354
  15. chatterer/language_model.py +536 -536
  16. chatterer/messages.py +21 -21
  17. chatterer/tools/__init__.py +46 -46
  18. chatterer/tools/caption_markdown_images.py +384 -384
  19. chatterer/tools/citation_chunking/__init__.py +3 -3
  20. chatterer/tools/citation_chunking/chunks.py +53 -53
  21. chatterer/tools/citation_chunking/citation_chunker.py +118 -118
  22. chatterer/tools/citation_chunking/citations.py +285 -285
  23. chatterer/tools/citation_chunking/prompt.py +157 -157
  24. chatterer/tools/citation_chunking/reference.py +26 -26
  25. chatterer/tools/citation_chunking/utils.py +138 -138
  26. chatterer/tools/convert_pdf_to_markdown.py +645 -625
  27. chatterer/tools/convert_to_text.py +446 -446
  28. chatterer/tools/upstage_document_parser.py +705 -705
  29. chatterer/tools/webpage_to_markdown.py +739 -739
  30. chatterer/tools/youtube.py +146 -146
  31. chatterer/utils/__init__.py +15 -15
  32. chatterer/utils/base64_image.py +350 -285
  33. chatterer/utils/bytesio.py +59 -59
  34. chatterer/utils/code_agent.py +237 -237
  35. chatterer/utils/imghdr.py +145 -148
  36. {chatterer-0.1.24.dist-info → chatterer-0.1.26.dist-info}/METADATA +390 -389
  37. chatterer-0.1.26.dist-info/RECORD +42 -0
  38. chatterer/strategies/__init__.py +0 -13
  39. chatterer/strategies/atom_of_thoughts.py +0 -975
  40. chatterer/strategies/base.py +0 -14
  41. chatterer-0.1.24.dist-info/RECORD +0 -45
  42. {chatterer-0.1.24.dist-info → chatterer-0.1.26.dist-info}/WHEEL +0 -0
  43. {chatterer-0.1.24.dist-info → chatterer-0.1.26.dist-info}/entry_points.txt +0 -0
  44. {chatterer-0.1.24.dist-info → chatterer-0.1.26.dist-info}/top_level.txt +0 -0
@@ -1,112 +1,192 @@
1
- # pyright: reportUnknownVariableType=false, reportUnknownMemberType=false, reportArgumentType=false, reportMissingTypeStubs=false
2
-
3
- from io import BytesIO
4
- from pathlib import Path
5
- from typing import Optional, cast
6
-
7
- from openai import OpenAI
8
- from pydub import AudioSegment
9
- from spargear import RunnableArguments
10
-
11
- # Maximum chunk length in seconds
12
- MAX_CHUNK_DURATION = 600
13
-
14
-
15
- class Arguments(RunnableArguments[None]):
16
- AUDIO_PATH: Path
17
- """The audio file to transcribe."""
18
- output: Optional[Path] = None
19
- """Path to save the transcription output."""
20
- model: str = "gpt-4o-transcribe"
21
- """The model to use for transcription."""
22
- api_key: Optional[str] = None
23
- """The API key for authentication."""
24
- base_url: str = "https://api.openai.com/v1"
25
- """The base URL for the API."""
26
- prompt: str = "Transcribe whole text from audio."
27
- """The prompt to use for transcription."""
28
-
29
- def run(self) -> None:
30
- model = self.model
31
-
32
- client = OpenAI(api_key=self.api_key, base_url=self.base_url)
33
-
34
- audio = load_audio_segment(self.AUDIO_PATH)
35
-
36
- segments = split_audio(audio, MAX_CHUNK_DURATION)
37
- print(f"[i] Audio duration: {len(audio) / 1000:.1f}s; splitting into {len(segments)} segment(s)")
38
-
39
- transcripts: list[str] = []
40
- for idx, seg in enumerate(segments, start=1):
41
- print(f"[i] Transcribing segment {idx}/{len(segments)}...")
42
- transcripts.append(transcribe_segment(seg, client, model, self.prompt))
43
-
44
- full_transcript = "\n\n".join(transcripts)
45
- output_path: Path = self.output or self.AUDIO_PATH.with_suffix(".txt")
46
- output_path.write_text(full_transcript, encoding="utf-8")
47
- print(f"[✓] Transcription saved to: {output_path}")
48
-
49
-
50
- def load_audio_segment(file_path: Path) -> AudioSegment:
51
- """
52
- Load an audio file as an AudioSegment. Convert to mp3 format in-memory if needed.
53
- """
54
- ext = file_path.suffix.lower()[1:]
55
- audio = AudioSegment.from_file(file_path.as_posix(), format=ext if ext != "mp3" else None)
56
- if ext != "mp3":
57
- buffer = BytesIO()
58
- audio.export(buffer, format="mp3")
59
- buffer.seek(0)
60
- audio = AudioSegment.from_file(buffer, format="mp3")
61
- return audio
62
-
63
-
64
- def split_audio(audio: AudioSegment, max_duration_s: int) -> list[AudioSegment]:
65
- """
66
- Split the AudioSegment into chunks no longer than max_duration_s seconds.
67
- """
68
- chunk_length_ms = (max_duration_s - 1) * 1000
69
- duration_ms = len(audio)
70
- segments: list[AudioSegment] = []
71
- segment_idx: int = 0
72
- for start_ms in range(0, duration_ms, chunk_length_ms):
73
- end_ms = min(start_ms + chunk_length_ms, duration_ms)
74
- segment = cast(AudioSegment, audio[start_ms:end_ms])
75
- segments.append(segment)
76
- # with open(f"segment_{segment_idx}.mp3", "wb") as f:
77
- # segment.export(f, format="mp3")
78
- segment_idx += 1
79
- return segments
80
-
81
-
82
- def transcribe_segment(segment: AudioSegment, client: OpenAI, model: str, prompt: str) -> str:
83
- """
84
- Transcribe a single AudioSegment chunk and return its text.
85
- """
86
- buffer = BytesIO()
87
- segment.export(buffer, format="mp3")
88
- buffer.seek(0)
89
- mp3_bytes = buffer.read()
90
- response = client.audio.transcriptions.create(
91
- model=model,
92
- prompt=prompt,
93
- file=("audio.mp3", mp3_bytes),
94
- response_format="text",
95
- stream=True,
96
- )
97
- for res in response:
98
- if res.type == "transcript.text.delta":
99
- print(res.delta, end="", flush=True)
100
- if res.type == "transcript.text.done":
101
- print()
102
- return res.text
103
- else:
104
- raise RuntimeError("No transcription result found.")
105
-
106
-
107
- def main() -> None:
108
- Arguments().run()
109
-
110
-
111
- if __name__ == "__main__":
112
- main()
1
+ # pyright: reportUnknownVariableType=false, reportUnknownMemberType=false, reportArgumentType=false, reportMissingTypeStubs=false
2
+
3
+ from io import BytesIO
4
+ from pathlib import Path
5
+ from typing import List, Optional
6
+
7
+ from openai import OpenAI
8
+ from pydub import AudioSegment
9
+ from spargear import RunnableArguments
10
+
11
+
12
+ # -------------------------------------------------------------------
13
+ # Helper functions for timestamp parsing & segment selection
14
+ # -------------------------------------------------------------------
15
+ def parse_timestamp(ts: str) -> float:
16
+ """
17
+ Parse a timestamp string into seconds.
18
+ Supports:
19
+ - "SS" or "SS.sss"
20
+ - "MM:SS" or "MM:SS.sss"
21
+ - "HH:MM:SS" or "HH:MM:SS.sss"
22
+ """
23
+ parts = ts.split(":")
24
+ seconds = 0.0
25
+ for idx, part in enumerate(reversed(parts)):
26
+ if not part:
27
+ value = 0.0
28
+ else:
29
+ value = float(part)
30
+ if idx == 0:
31
+ seconds += value
32
+ elif idx == 1:
33
+ seconds += value * 60
34
+ elif idx == 2:
35
+ seconds += value * 3600
36
+ else:
37
+ raise ValueError(f"Timestamp '{ts}' is too long (use H:MM:SS at most)")
38
+ return seconds
39
+
40
+
41
+ def get_selected_audio(audio: AudioSegment, segments_str: str) -> AudioSegment:
42
+ """
43
+ Given full audio and a segments string (e.g. "650-750,16:50-17:30,800-"),
44
+ extract those subranges and concatenate them.
45
+ """
46
+ duration_ms = len(audio)
47
+ duration_s = duration_ms / 1000.0
48
+ subsegments: List[AudioSegment] = []
49
+
50
+ for part in segments_str.split(","):
51
+ if "-" not in part:
52
+ raise ValueError(f"Invalid segment '{part}' (must contain '-')")
53
+ start_str, end_str = part.split("-", 1)
54
+ start_s = parse_timestamp(start_str) if start_str.strip() else 0.0
55
+ end_s = parse_timestamp(end_str) if end_str.strip() else duration_s
56
+
57
+ # clamp
58
+ start_s = max(0.0, min(start_s, duration_s))
59
+ end_s = max(0.0, min(end_s, duration_s))
60
+ if end_s <= start_s:
61
+ print(f"[!] Warning: segment '{part}' yields non-positive duration; skipping.")
62
+ continue
63
+
64
+ start_ms = int(start_s * 1000)
65
+ end_ms = int(end_s * 1000)
66
+ sub = audio[start_ms:end_ms]
67
+ subsegments.append(sub)
68
+ print(f"[i] Selected segment {start_s:.2f}s–{end_s:.2f}s ({end_s - start_s:.2f}s)")
69
+
70
+ if not subsegments:
71
+ raise RuntimeError("No valid segments were specified.")
72
+ # concatenate
73
+ combined = subsegments[0]
74
+ for seg in subsegments[1:]:
75
+ combined += seg
76
+ return combined
77
+
78
+
79
+ # -------------------------------------------------------------------
80
+ # Main transcription logic
81
+ # -------------------------------------------------------------------
82
+ class Arguments(RunnableArguments[None]):
83
+ AUDIO_PATH: Path
84
+ """The audio file to transcribe."""
85
+ output: Optional[Path] = None
86
+ """Path to save the transcription output."""
87
+ model: str = "gpt-4o-transcribe"
88
+ """The model to use for transcription."""
89
+ api_key: Optional[str] = None
90
+ """The API key for authentication."""
91
+ base_url: str = "https://api.openai.com/v1"
92
+ """The base URL for the API."""
93
+ prompt: str = "Transcribe whole text from audio."
94
+ """The prompt to use for transcription."""
95
+ segments: Optional[str] = None
96
+ """
97
+ Comma-separated list of time ranges to include (e.g. "650-750,16:50-17:30,800-").
98
+ Each range is start-end; start or end may be omitted.
99
+ Supports seconds or H:MM:SS formats.
100
+ """
101
+ max_chunk_duration: int = 600
102
+ """Maximum duration of each chunk in seconds."""
103
+
104
+ def run(self) -> None:
105
+ client = OpenAI(api_key=self.api_key, base_url=self.base_url)
106
+
107
+ # 1) Load entire audio
108
+ original_audio = load_audio_segment(self.AUDIO_PATH)
109
+
110
+ # 2) If segments specified, extract & combine
111
+ if self.segments:
112
+ audio = get_selected_audio(original_audio, self.segments)
113
+ print(f"[i] Combined audio duration: {len(audio) / 1000:.1f}s (from segments)")
114
+ else:
115
+ audio = original_audio
116
+ print(f"[i] Audio duration: {len(audio) / 1000:.1f}s (full audio)")
117
+
118
+ # 3) Split into chunks
119
+ segments = split_audio(audio, self.max_chunk_duration)
120
+ print(f"[i] Splitting into {len(segments)} segment(s) for transcription")
121
+
122
+ # 4) Transcribe each chunk
123
+ transcripts: List[str] = []
124
+ for idx, seg in enumerate(segments, start=1):
125
+ print(f"[i] Transcribing segment {idx}/{len(segments)}...")
126
+ transcripts.append(transcribe_segment(seg, client, self.model, self.prompt))
127
+
128
+ # 5) Write out
129
+ full = "\n\n".join(transcripts)
130
+ out_path = self.output or self.AUDIO_PATH.with_suffix(".txt")
131
+ out_path.write_text(full, encoding="utf-8")
132
+ print(f"[✓] Transcription saved to: {out_path}")
133
+
134
+
135
+ def load_audio_segment(file_path: Path) -> AudioSegment:
136
+ """
137
+ Load an audio file as an AudioSegment. Convert to mp3 format in-memory if needed.
138
+ """
139
+ ext = file_path.suffix.lower()[1:]
140
+ audio = AudioSegment.from_file(file_path.as_posix(), format=None if ext == "mp3" else ext)
141
+ if ext != "mp3":
142
+ buffer = BytesIO()
143
+ audio.export(buffer, format="mp3")
144
+ buffer.seek(0)
145
+ audio = AudioSegment.from_file(buffer, format="mp3")
146
+ return audio
147
+
148
+
149
+ def split_audio(audio: AudioSegment, max_duration_s: int) -> List[AudioSegment]:
150
+ """
151
+ Split the AudioSegment into chunks no longer than max_duration_s seconds.
152
+ """
153
+ chunk_ms = (max_duration_s - 1) * 1000
154
+ duration_ms = len(audio)
155
+ segments: List[AudioSegment] = []
156
+ for start in range(0, duration_ms, chunk_ms):
157
+ end = min(start + chunk_ms, duration_ms)
158
+ segments.append(audio[start:end])
159
+ return segments
160
+
161
+
162
+ def transcribe_segment(segment: AudioSegment, client: OpenAI, model: str, prompt: str) -> str:
163
+ """
164
+ Transcribe a single AudioSegment chunk and return its text.
165
+ """
166
+ buffer = BytesIO()
167
+ segment.export(buffer, format="mp3")
168
+ buffer.seek(0)
169
+ mp3_bytes = buffer.read()
170
+
171
+ response = client.audio.transcriptions.create(
172
+ model=model,
173
+ prompt=prompt,
174
+ file=("audio.mp3", mp3_bytes),
175
+ response_format="text",
176
+ stream=True,
177
+ )
178
+ for res in response:
179
+ if res.type == "transcript.text.delta":
180
+ print(res.delta, end="", flush=True)
181
+ elif res.type == "transcript.text.done":
182
+ print()
183
+ return res.text
184
+ raise RuntimeError("No transcription result found.")
185
+
186
+
187
+ def main() -> None:
188
+ Arguments().run()
189
+
190
+
191
+ if __name__ == "__main__":
192
+ main()
@@ -1,89 +1,89 @@
1
- import logging
2
- from pathlib import Path
3
- from typing import Optional
4
-
5
- from langchain_core.documents.base import Blob
6
- from spargear import ArgumentSpec, BaseArguments
7
-
8
- from chatterer import Chatterer, UpstageDocumentParseParser
9
- from chatterer.tools.upstage_document_parser import (
10
- DEFAULT_IMAGE_DIR,
11
- DOCUMENT_PARSE_BASE_URL,
12
- DOCUMENT_PARSE_DEFAULT_MODEL,
13
- OCR,
14
- Category,
15
- OutputFormat,
16
- SplitType,
17
- )
18
-
19
- logger = logging.getLogger(__name__)
20
-
21
-
22
- class Arguments(BaseArguments):
23
- INPUT_PATH: Path
24
- """Input file to parse. Can be a PDF, image, or other supported formats."""
25
- output: Optional[Path] = None
26
- """Output file path for the parsed content. Defaults to input file with .md suffix if not provided."""
27
- api_key: Optional[str] = None
28
- """API key for the Upstage API."""
29
- base_url: str = DOCUMENT_PARSE_BASE_URL
30
- """Base URL for the Upstage API."""
31
- model: str = DOCUMENT_PARSE_DEFAULT_MODEL
32
- """Model to use for parsing."""
33
- split: SplitType = "none"
34
- """Split type for the parsed content."""
35
- ocr: OCR = "auto"
36
- """OCR type for parsing."""
37
- output_format: OutputFormat = "markdown"
38
- """Output format for the parsed content."""
39
- coordinates: bool = False
40
- """Whether to include coordinates in the output."""
41
- base64_encoding: list[Category] = ["figure"]
42
- """Base64 encoding for specific categories in the parsed content."""
43
- image_description_instruction: str = "Describe the image in detail."
44
- """Instruction for generating image descriptions."""
45
- image_dir: str = DEFAULT_IMAGE_DIR
46
- """Directory to save images extracted from the document."""
47
- chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
48
- ["--chatterer"],
49
- default=None,
50
- help="Chatterer instance for communication.",
51
- type=Chatterer.from_provider,
52
- )
53
-
54
- def run(self) -> None:
55
- input = self.INPUT_PATH.resolve()
56
- out = self.output or input.with_suffix(".md")
57
-
58
- parser = UpstageDocumentParseParser(
59
- api_key=self.api_key,
60
- base_url=self.base_url,
61
- model=self.model,
62
- split=self.split,
63
- ocr=self.ocr,
64
- output_format=self.output_format,
65
- coordinates=self.coordinates,
66
- base64_encoding=self.base64_encoding,
67
- image_description_instruction=self.image_description_instruction,
68
- image_dir=self.image_dir,
69
- chatterer=self.chatterer.value,
70
- )
71
- docs = parser.parse(Blob.from_path(input)) # pyright: ignore[reportUnknownMemberType]
72
-
73
- if self.image_dir:
74
- for path, image in parser.image_data.items():
75
- (path := Path(path)).parent.mkdir(parents=True, exist_ok=True)
76
- path.write_bytes(image)
77
- logger.info(f"Saved image to `{path}`")
78
-
79
- markdown: str = "\n\n".join(f"<!--- page {i} -->\n{doc.page_content}" for i, doc in enumerate(docs, 1))
80
- out.write_text(markdown, encoding="utf-8")
81
- logger.info(f"Parsed `{input}` to `{out}`")
82
-
83
-
84
- def main() -> None:
85
- Arguments().run()
86
-
87
-
88
- if __name__ == "__main__":
89
- main()
1
+ import logging
2
+ from pathlib import Path
3
+ from typing import Optional
4
+
5
+ from langchain_core.documents.base import Blob
6
+ from spargear import ArgumentSpec, BaseArguments
7
+
8
+ from chatterer import Chatterer, UpstageDocumentParseParser
9
+ from chatterer.tools.upstage_document_parser import (
10
+ DEFAULT_IMAGE_DIR,
11
+ DOCUMENT_PARSE_BASE_URL,
12
+ DOCUMENT_PARSE_DEFAULT_MODEL,
13
+ OCR,
14
+ Category,
15
+ OutputFormat,
16
+ SplitType,
17
+ )
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ class Arguments(BaseArguments):
23
+ INPUT_PATH: Path
24
+ """Input file to parse. Can be a PDF, image, or other supported formats."""
25
+ output: Optional[Path] = None
26
+ """Output file path for the parsed content. Defaults to input file with .md suffix if not provided."""
27
+ api_key: Optional[str] = None
28
+ """API key for the Upstage API."""
29
+ base_url: str = DOCUMENT_PARSE_BASE_URL
30
+ """Base URL for the Upstage API."""
31
+ model: str = DOCUMENT_PARSE_DEFAULT_MODEL
32
+ """Model to use for parsing."""
33
+ split: SplitType = "none"
34
+ """Split type for the parsed content."""
35
+ ocr: OCR = "auto"
36
+ """OCR type for parsing."""
37
+ output_format: OutputFormat = "markdown"
38
+ """Output format for the parsed content."""
39
+ coordinates: bool = False
40
+ """Whether to include coordinates in the output."""
41
+ base64_encoding: list[Category] = ["figure"]
42
+ """Base64 encoding for specific categories in the parsed content."""
43
+ image_description_instruction: str = "Describe the image in detail."
44
+ """Instruction for generating image descriptions."""
45
+ image_dir: str = DEFAULT_IMAGE_DIR
46
+ """Directory to save images extracted from the document."""
47
+ chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
48
+ ["--chatterer"],
49
+ default=None,
50
+ help="Chatterer instance for communication.",
51
+ type=Chatterer.from_provider,
52
+ )
53
+
54
+ def run(self) -> None:
55
+ input = self.INPUT_PATH.resolve()
56
+ out = self.output or input.with_suffix(".md")
57
+
58
+ parser = UpstageDocumentParseParser(
59
+ api_key=self.api_key,
60
+ base_url=self.base_url,
61
+ model=self.model,
62
+ split=self.split,
63
+ ocr=self.ocr,
64
+ output_format=self.output_format,
65
+ coordinates=self.coordinates,
66
+ base64_encoding=self.base64_encoding,
67
+ image_description_instruction=self.image_description_instruction,
68
+ image_dir=self.image_dir,
69
+ chatterer=self.chatterer.value,
70
+ )
71
+ docs = parser.parse(Blob.from_path(input)) # pyright: ignore[reportUnknownMemberType]
72
+
73
+ if self.image_dir:
74
+ for path, image in parser.image_data.items():
75
+ (path := Path(path)).parent.mkdir(parents=True, exist_ok=True)
76
+ path.write_bytes(image)
77
+ logger.info(f"Saved image to `{path}`")
78
+
79
+ markdown: str = "\n\n".join(f"<!--- page {i} -->\n{doc.page_content}" for i, doc in enumerate(docs, 1))
80
+ out.write_text(markdown, encoding="utf-8")
81
+ logger.info(f"Parsed `{input}` to `{out}`")
82
+
83
+
84
+ def main() -> None:
85
+ Arguments().run()
86
+
87
+
88
+ if __name__ == "__main__":
89
+ main()
@@ -1,66 +1,80 @@
1
- from pathlib import Path
2
- from typing import Literal
3
-
4
- from spargear import ArgumentSpec, RunnableArguments
5
-
6
- from chatterer import Chatterer, MarkdownLink, PlayWrightBot
7
-
8
-
9
- class Arguments(RunnableArguments[None]):
10
- URL: str
11
- """The URL to crawl."""
12
- output: str = Path(__file__).with_suffix(".md").as_posix()
13
- """The output file path for the markdown file."""
14
- chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
15
- ["--chatterer"],
16
- help="The Chatterer backend and model to use for filtering the markdown.",
17
- type=Chatterer.from_provider,
18
- )
19
- engine: Literal["firefox", "chromium", "webkit"] = "firefox"
20
- """The browser engine to use."""
21
-
22
- def run(self) -> None:
23
- chatterer = self.chatterer.value
24
- url: str = self.URL.strip()
25
- output: Path = Path(self.output).resolve()
26
- with PlayWrightBot(chatterer=chatterer, engine=self.engine) as bot:
27
- md = bot.url_to_md(url)
28
- output.write_text(md, encoding="utf-8")
29
- if chatterer is not None:
30
- md_llm = bot.url_to_md_with_llm(url.strip())
31
- output.write_text(md_llm, encoding="utf-8")
32
- links = MarkdownLink.from_markdown(md, referer_url=url)
33
- for link in links:
34
- if link.type == "link":
35
- print(f"- [{truncate_string(link.url)}] {truncate_string(link.inline_text)} ({truncate_string(link.inline_title)})")
36
- elif link.type == "image":
37
- print(f"- ![{truncate_string(link.url)}] ({truncate_string(link.inline_text)})")
38
-
39
- async def arun(self) -> None:
40
- chatterer = self.chatterer.value
41
- url: str = self.URL.strip()
42
- output: Path = Path(self.output).resolve()
43
- async with PlayWrightBot(chatterer=chatterer, engine=self.engine) as bot:
44
- md = await bot.aurl_to_md(url)
45
- output.write_text(md, encoding="utf-8")
46
- if chatterer is not None:
47
- md_llm = await bot.aurl_to_md_with_llm(url.strip())
48
- output.write_text(md_llm, encoding="utf-8")
49
- links = MarkdownLink.from_markdown(md, referer_url=url)
50
- for link in links:
51
- if link.type == "link":
52
- print(f"- [{truncate_string(link.url)}] {truncate_string(link.inline_text)} ({truncate_string(link.inline_title)})")
53
- elif link.type == "image":
54
- print(f"- ![{truncate_string(link.url)}] ({truncate_string(link.inline_text)})")
55
-
56
-
57
- def truncate_string(s: str) -> str:
58
- return s[:50] + "..." if len(s) > 50 else s
59
-
60
-
61
- def main() -> None:
62
- Arguments().run()
63
-
64
-
65
- if __name__ == "__main__":
66
- main()
1
+ from datetime import datetime
2
+ from pathlib import Path
3
+ from typing import Literal
4
+
5
+ from spargear import ArgumentSpec, RunnableArguments
6
+
7
+ from chatterer import Chatterer, MarkdownLink, PlayWrightBot
8
+
9
+
10
+ def ouput_path_factory() -> Path:
11
+ """Factory function to generate a default output path for the markdown file."""
12
+ return Path(datetime.now().strftime("%Y%m%d_%H%M%S") + "_web2md.md").resolve()
13
+
14
+
15
+ class Arguments(RunnableArguments[None]):
16
+ URL: str
17
+ """The URL to crawl."""
18
+ output: ArgumentSpec[Path] = ArgumentSpec(
19
+ ["--output", "-o"],
20
+ default_factory=ouput_path_factory,
21
+ help="The output file path for the markdown file.",
22
+ )
23
+ """The output file path for the markdown file."""
24
+ chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
25
+ ["--chatterer"],
26
+ help="The Chatterer backend and model to use for filtering the markdown.",
27
+ type=Chatterer.from_provider,
28
+ )
29
+ engine: Literal["firefox", "chromium", "webkit"] = "firefox"
30
+ """The browser engine to use."""
31
+
32
+ def run(self) -> None:
33
+ chatterer = self.chatterer.value
34
+ url: str = self.URL.strip()
35
+ output: Path = self.output.unwrap().resolve()
36
+ with PlayWrightBot(chatterer=chatterer, engine=self.engine) as bot:
37
+ md = bot.url_to_md(url)
38
+ output.write_text(md, encoding="utf-8")
39
+ if chatterer is not None:
40
+ md_llm = bot.url_to_md_with_llm(url.strip())
41
+ output.write_text(md_llm, encoding="utf-8")
42
+ links = MarkdownLink.from_markdown(md, referer_url=url)
43
+ for link in links:
44
+ if link.type == "link":
45
+ print(
46
+ f"- [{truncate_string(link.url)}] {truncate_string(link.inline_text)} ({truncate_string(link.inline_title)})"
47
+ )
48
+ elif link.type == "image":
49
+ print(f"- ![{truncate_string(link.url)}] ({truncate_string(link.inline_text)})")
50
+
51
+ async def arun(self) -> None:
52
+ chatterer = self.chatterer.value
53
+ url: str = self.URL.strip()
54
+ output: Path = self.output.unwrap().resolve()
55
+ async with PlayWrightBot(chatterer=chatterer, engine=self.engine) as bot:
56
+ md = await bot.aurl_to_md(url)
57
+ output.write_text(md, encoding="utf-8")
58
+ if chatterer is not None:
59
+ md_llm = await bot.aurl_to_md_with_llm(url.strip())
60
+ output.write_text(md_llm, encoding="utf-8")
61
+ links = MarkdownLink.from_markdown(md, referer_url=url)
62
+ for link in links:
63
+ if link.type == "link":
64
+ print(
65
+ f"- [{truncate_string(link.url)}] {truncate_string(link.inline_text)} ({truncate_string(link.inline_title)})"
66
+ )
67
+ elif link.type == "image":
68
+ print(f"- ![{truncate_string(link.url)}] ({truncate_string(link.inline_text)})")
69
+
70
+
71
+ def truncate_string(s: str) -> str:
72
+ return s[:50] + "..." if len(s) > 50 else s
73
+
74
+
75
+ def main() -> None:
76
+ Arguments().run()
77
+
78
+
79
+ if __name__ == "__main__":
80
+ main()