chatterer 0.1.23__py3-none-any.whl → 0.1.25__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (47)
  1. chatterer/__init__.py +97 -93
  2. chatterer/common_types/__init__.py +21 -21
  3. chatterer/common_types/io.py +19 -19
  4. chatterer/examples/__main__.py +75 -0
  5. chatterer/examples/{anything_to_markdown.py → any2md.py} +85 -85
  6. chatterer/examples/{pdf_to_markdown.py → pdf2md.py} +338 -338
  7. chatterer/examples/{pdf_to_text.py → pdf2txt.py} +54 -54
  8. chatterer/examples/{make_ppt.py → ppt.py} +486 -488
  9. chatterer/examples/pw.py +143 -0
  10. chatterer/examples/{get_code_snippets.py → snippet.py} +56 -55
  11. chatterer/examples/transcribe.py +192 -0
  12. chatterer/examples/{upstage_parser.py → upstage.py} +89 -89
  13. chatterer/examples/{webpage_to_markdown.py → web2md.py} +80 -70
  14. chatterer/interactive.py +354 -354
  15. chatterer/language_model.py +536 -536
  16. chatterer/messages.py +21 -21
  17. chatterer/strategies/__init__.py +13 -13
  18. chatterer/strategies/atom_of_thoughts.py +975 -975
  19. chatterer/strategies/base.py +14 -14
  20. chatterer/tools/__init__.py +46 -46
  21. chatterer/tools/caption_markdown_images.py +384 -384
  22. chatterer/tools/citation_chunking/__init__.py +3 -3
  23. chatterer/tools/citation_chunking/chunks.py +53 -53
  24. chatterer/tools/citation_chunking/citation_chunker.py +118 -118
  25. chatterer/tools/citation_chunking/citations.py +285 -285
  26. chatterer/tools/citation_chunking/prompt.py +157 -157
  27. chatterer/tools/citation_chunking/reference.py +26 -26
  28. chatterer/tools/citation_chunking/utils.py +138 -138
  29. chatterer/tools/convert_pdf_to_markdown.py +645 -625
  30. chatterer/tools/convert_to_text.py +446 -446
  31. chatterer/tools/upstage_document_parser.py +705 -705
  32. chatterer/tools/webpage_to_markdown.py +739 -739
  33. chatterer/tools/youtube.py +146 -146
  34. chatterer/utils/__init__.py +15 -15
  35. chatterer/utils/base64_image.py +293 -285
  36. chatterer/utils/bytesio.py +59 -59
  37. chatterer/utils/code_agent.py +237 -237
  38. chatterer/utils/imghdr.py +148 -148
  39. {chatterer-0.1.23.dist-info → chatterer-0.1.25.dist-info}/METADATA +390 -392
  40. chatterer-0.1.25.dist-info/RECORD +45 -0
  41. chatterer-0.1.25.dist-info/entry_points.txt +2 -0
  42. chatterer/examples/login_with_playwright.py +0 -156
  43. chatterer/examples/transcription_api.py +0 -112
  44. chatterer-0.1.23.dist-info/RECORD +0 -44
  45. chatterer-0.1.23.dist-info/entry_points.txt +0 -10
  46. {chatterer-0.1.23.dist-info → chatterer-0.1.25.dist-info}/WHEEL +0 -0
  47. {chatterer-0.1.23.dist-info → chatterer-0.1.25.dist-info}/top_level.txt +0 -0
@@ -1,70 +1,80 @@
1
- from pathlib import Path
2
- from typing import Literal
3
-
4
- from spargear import ArgumentSpec, BaseArguments
5
-
6
- from chatterer import Chatterer, MarkdownLink, PlayWrightBot
7
-
8
-
9
- class WebpageToMarkdownArgs(BaseArguments):
10
- url: str
11
- """The URL to crawl."""
12
- output: str = Path(__file__).with_suffix(".md").as_posix()
13
- """The output file path for the markdown file."""
14
- chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
15
- ["--chatterer"],
16
- help="The Chatterer backend and model to use for filtering the markdown.",
17
- type=Chatterer.from_provider,
18
- )
19
- engine: Literal["firefox", "chromium", "webkit"] = "firefox"
20
- """The browser engine to use."""
21
-
22
- def run(self) -> None:
23
- chatterer = self.chatterer.value
24
- url: str = self.url.strip()
25
- output: Path = Path(self.output).resolve()
26
- with PlayWrightBot(chatterer=chatterer, engine=self.engine) as bot:
27
- md = bot.url_to_md(url)
28
- output.write_text(md, encoding="utf-8")
29
- if chatterer is not None:
30
- md_llm = bot.url_to_md_with_llm(url.strip())
31
- output.write_text(md_llm, encoding="utf-8")
32
- links = MarkdownLink.from_markdown(md, referer_url=url)
33
- for link in links:
34
- if link.type == "link":
35
- print(
36
- f"- [{truncate_string(link.url)}] {truncate_string(link.inline_text)} ({truncate_string(link.inline_title)})"
37
- )
38
- elif link.type == "image":
39
- print(f"- ![{truncate_string(link.url)}] ({truncate_string(link.inline_text)})")
40
-
41
- async def arun(self) -> None:
42
- chatterer = self.chatterer.value
43
- url: str = self.url.strip()
44
- output: Path = Path(self.output).resolve()
45
- async with PlayWrightBot(chatterer=chatterer, engine=self.engine) as bot:
46
- md = await bot.aurl_to_md(url)
47
- output.write_text(md, encoding="utf-8")
48
- if chatterer is not None:
49
- md_llm = await bot.aurl_to_md_with_llm(url.strip())
50
- output.write_text(md_llm, encoding="utf-8")
51
- links = MarkdownLink.from_markdown(md, referer_url=url)
52
- for link in links:
53
- if link.type == "link":
54
- print(
55
- f"- [{truncate_string(link.url)}] {truncate_string(link.inline_text)} ({truncate_string(link.inline_title)})"
56
- )
57
- elif link.type == "image":
58
- print(f"- ![{truncate_string(link.url)}] ({truncate_string(link.inline_text)})")
59
-
60
-
61
- def truncate_string(s: str) -> str:
62
- return s[:50] + "..." if len(s) > 50 else s
63
-
64
-
65
- def main() -> None:
66
- WebpageToMarkdownArgs().run()
67
-
68
-
69
- if __name__ == "__main__":
70
- main()
1
+ from datetime import datetime
2
+ from pathlib import Path
3
+ from typing import Literal
4
+
5
+ from spargear import ArgumentSpec, RunnableArguments
6
+
7
+ from chatterer import Chatterer, MarkdownLink, PlayWrightBot
8
+
9
+
10
+ def ouput_path_factory() -> Path:
11
+ """Factory function to generate a default output path for the markdown file."""
12
+ return Path(datetime.now().strftime("%Y%m%d_%H%M%S") + "_web2md.md").resolve()
13
+
14
+
15
+ class Arguments(RunnableArguments[None]):
16
+ URL: str
17
+ """The URL to crawl."""
18
+ output: ArgumentSpec[Path] = ArgumentSpec(
19
+ ["--output", "-o"],
20
+ default_factory=ouput_path_factory,
21
+ help="The output file path for the markdown file.",
22
+ )
23
+ """The output file path for the markdown file."""
24
+ chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
25
+ ["--chatterer"],
26
+ help="The Chatterer backend and model to use for filtering the markdown.",
27
+ type=Chatterer.from_provider,
28
+ )
29
+ engine: Literal["firefox", "chromium", "webkit"] = "firefox"
30
+ """The browser engine to use."""
31
+
32
+ def run(self) -> None:
33
+ chatterer = self.chatterer.value
34
+ url: str = self.URL.strip()
35
+ output: Path = self.output.unwrap().resolve()
36
+ with PlayWrightBot(chatterer=chatterer, engine=self.engine) as bot:
37
+ md = bot.url_to_md(url)
38
+ output.write_text(md, encoding="utf-8")
39
+ if chatterer is not None:
40
+ md_llm = bot.url_to_md_with_llm(url.strip())
41
+ output.write_text(md_llm, encoding="utf-8")
42
+ links = MarkdownLink.from_markdown(md, referer_url=url)
43
+ for link in links:
44
+ if link.type == "link":
45
+ print(
46
+ f"- [{truncate_string(link.url)}] {truncate_string(link.inline_text)} ({truncate_string(link.inline_title)})"
47
+ )
48
+ elif link.type == "image":
49
+ print(f"- ![{truncate_string(link.url)}] ({truncate_string(link.inline_text)})")
50
+
51
+ async def arun(self) -> None:
52
+ chatterer = self.chatterer.value
53
+ url: str = self.URL.strip()
54
+ output: Path = self.output.unwrap().resolve()
55
+ async with PlayWrightBot(chatterer=chatterer, engine=self.engine) as bot:
56
+ md = await bot.aurl_to_md(url)
57
+ output.write_text(md, encoding="utf-8")
58
+ if chatterer is not None:
59
+ md_llm = await bot.aurl_to_md_with_llm(url.strip())
60
+ output.write_text(md_llm, encoding="utf-8")
61
+ links = MarkdownLink.from_markdown(md, referer_url=url)
62
+ for link in links:
63
+ if link.type == "link":
64
+ print(
65
+ f"- [{truncate_string(link.url)}] {truncate_string(link.inline_text)} ({truncate_string(link.inline_title)})"
66
+ )
67
+ elif link.type == "image":
68
+ print(f"- ![{truncate_string(link.url)}] ({truncate_string(link.inline_text)})")
69
+
70
+
71
+ def truncate_string(s: str) -> str:
72
+ return s[:50] + "..." if len(s) > 50 else s
73
+
74
+
75
+ def main() -> None:
76
+ Arguments().run()
77
+
78
+
79
+ if __name__ == "__main__":
80
+ main()