chatterer 0.1.25__py3-none-any.whl → 0.1.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatterer/__init__.py +87 -97
- chatterer/common_types/__init__.py +21 -21
- chatterer/common_types/io.py +19 -19
- chatterer/constants.py +5 -0
- chatterer/examples/__main__.py +75 -75
- chatterer/examples/any2md.py +83 -85
- chatterer/examples/pdf2md.py +231 -338
- chatterer/examples/pdf2txt.py +52 -54
- chatterer/examples/ppt.py +487 -486
- chatterer/examples/pw.py +141 -143
- chatterer/examples/snippet.py +54 -56
- chatterer/examples/transcribe.py +192 -192
- chatterer/examples/upstage.py +87 -89
- chatterer/examples/web2md.py +80 -80
- chatterer/interactive.py +422 -354
- chatterer/language_model.py +530 -536
- chatterer/messages.py +21 -21
- chatterer/tools/__init__.py +46 -46
- chatterer/tools/caption_markdown_images.py +388 -384
- chatterer/tools/citation_chunking/__init__.py +3 -3
- chatterer/tools/citation_chunking/chunks.py +51 -53
- chatterer/tools/citation_chunking/citation_chunker.py +117 -118
- chatterer/tools/citation_chunking/citations.py +284 -285
- chatterer/tools/citation_chunking/prompt.py +157 -157
- chatterer/tools/citation_chunking/reference.py +26 -26
- chatterer/tools/citation_chunking/utils.py +138 -138
- chatterer/tools/convert_pdf_to_markdown.py +636 -645
- chatterer/tools/convert_to_text.py +446 -446
- chatterer/tools/upstage_document_parser.py +704 -705
- chatterer/tools/webpage_to_markdown.py +739 -739
- chatterer/tools/youtube.py +146 -147
- chatterer/utils/__init__.py +15 -15
- chatterer/utils/base64_image.py +349 -293
- chatterer/utils/bytesio.py +59 -59
- chatterer/utils/code_agent.py +237 -237
- chatterer/utils/imghdr.py +145 -148
- {chatterer-0.1.25.dist-info → chatterer-0.1.27.dist-info}/METADATA +377 -390
- chatterer-0.1.27.dist-info/RECORD +43 -0
- chatterer/strategies/__init__.py +0 -13
- chatterer/strategies/atom_of_thoughts.py +0 -975
- chatterer/strategies/base.py +0 -14
- chatterer-0.1.25.dist-info/RECORD +0 -45
- {chatterer-0.1.25.dist-info → chatterer-0.1.27.dist-info}/WHEEL +0 -0
- {chatterer-0.1.25.dist-info → chatterer-0.1.27.dist-info}/entry_points.txt +0 -0
- {chatterer-0.1.25.dist-info → chatterer-0.1.27.dist-info}/top_level.txt +0 -0
chatterer/examples/web2md.py
CHANGED
@@ -1,80 +1,80 @@
|
|
1
|
-
from datetime import datetime
|
2
|
-
from pathlib import Path
|
3
|
-
from typing import Literal
|
4
|
-
|
5
|
-
from spargear import ArgumentSpec, RunnableArguments
|
6
|
-
|
7
|
-
from chatterer import Chatterer, MarkdownLink, PlayWrightBot
|
8
|
-
|
9
|
-
|
10
|
-
def ouput_path_factory() -> Path:
    """Build the default output path for the markdown file.

    Returns:
        An absolute path in the current working directory named
        ``<YYYYmmdd_HHMMSS>_web2md.md``, stamped with the current local time.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    default_name = timestamp + "_web2md.md"
    return Path(default_name).resolve()
|
13
|
-
|
14
|
-
|
15
|
-
class Arguments(RunnableArguments[None]):
    """Command-line arguments for converting a web page to a markdown file."""

    URL: str
    """The URL to crawl."""
    output: ArgumentSpec[Path] = ArgumentSpec(
        ["--output", "-o"],
        default_factory=ouput_path_factory,
        help="The output file path for the markdown file.",
    )
    """The output file path for the markdown file."""
    chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
        ["--chatterer"],
        help="The Chatterer backend and model to use for filtering the markdown.",
        type=Chatterer.from_provider,
    )
    engine: Literal["firefox", "chromium", "webkit"] = "firefox"
    """The browser engine to use."""

    def run(self) -> None:
        """Crawl ``URL`` synchronously, save the markdown, and print its links.

        The plain markdown is written to ``output`` first. When a chatterer
        was supplied, the LLM-processed markdown then overwrites that file.
        The printed links are always taken from the plain markdown.
        """
        chatterer = self.chatterer.value
        url: str = self.URL.strip()
        output: Path = self.output.unwrap().resolve()
        with PlayWrightBot(chatterer=chatterer, engine=self.engine) as bot:
            md = bot.url_to_md(url)
            output.write_text(md, encoding="utf-8")
            if chatterer is not None:
                # `url` is already stripped above; no second strip needed.
                md_llm = bot.url_to_md_with_llm(url)
                output.write_text(md_llm, encoding="utf-8")
            self._print_links(md, url)

    async def arun(self) -> None:
        """Asynchronous counterpart of :meth:`run` with the same steps."""
        chatterer = self.chatterer.value
        url: str = self.URL.strip()
        output: Path = self.output.unwrap().resolve()
        async with PlayWrightBot(chatterer=chatterer, engine=self.engine) as bot:
            md = await bot.aurl_to_md(url)
            output.write_text(md, encoding="utf-8")
            if chatterer is not None:
                # `url` is already stripped above; no second strip needed.
                md_llm = await bot.aurl_to_md_with_llm(url)
                output.write_text(md_llm, encoding="utf-8")
            self._print_links(md, url)

    def _print_links(self, md: str, url: str) -> None:
        """Print a truncated one-line summary of every link and image in *md*.

        Args:
            md: Markdown text to scan for links.
            url: Page URL passed to the link parser as ``referer_url``.
        """
        for link in MarkdownLink.from_markdown(md, referer_url=url):
            if link.type == "link":
                print(
                    f"- [{truncate_string(link.url)}] {truncate_string(link.inline_text)} ({truncate_string(link.inline_title)})"
                )
            elif link.type == "image":
                print(f"- ![{truncate_string(link.url)}] ({truncate_string(link.inline_text)})")
|
69
|
-
|
70
|
-
|
71
|
-
def truncate_string(s: str, max_length: int = 50, suffix: str = "...") -> str:
    """Shorten *s* so it fits on a single display line.

    Args:
        s: Text to shorten.
        max_length: Maximum number of characters of *s* to keep.
        suffix: Marker appended when *s* had to be cut.

    Returns:
        *s* unchanged when it has at most *max_length* characters; otherwise
        its first *max_length* characters followed by *suffix*.

    Raises:
        ValueError: If *max_length* is negative.
    """
    if max_length < 0:
        # A negative slice bound would silently cut from the end instead.
        raise ValueError("max_length must be non-negative")
    if len(s) <= max_length:
        return s
    return s[:max_length] + suffix
|
73
|
-
|
74
|
-
|
75
|
-
def main() -> None:
    """Build the ``Arguments`` object and run the synchronous conversion."""
    arguments = Arguments()
    arguments.run()
|
77
|
-
|
78
|
-
|
79
|
-
if __name__ == "__main__":
|
80
|
-
main()
|
1
|
+
from datetime import datetime
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Literal
|
4
|
+
|
5
|
+
from spargear import ArgumentSpec, RunnableArguments
|
6
|
+
|
7
|
+
from chatterer import Chatterer, MarkdownLink, PlayWrightBot
|
8
|
+
|
9
|
+
|
10
|
+
def ouput_path_factory() -> Path:
    """Build the default output path for the markdown file.

    Returns:
        An absolute path in the current working directory named
        ``<YYYYmmdd_HHMMSS>_web2md.md``, stamped with the current local time.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    default_name = timestamp + "_web2md.md"
    return Path(default_name).resolve()
|
13
|
+
|
14
|
+
|
15
|
+
class Arguments(RunnableArguments[None]):
    """Command-line arguments for converting a web page to a markdown file."""

    URL: str
    """The URL to crawl."""
    output: ArgumentSpec[Path] = ArgumentSpec(
        ["--output", "-o"],
        default_factory=ouput_path_factory,
        help="The output file path for the markdown file.",
    )
    """The output file path for the markdown file."""
    chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
        ["--chatterer"],
        help="The Chatterer backend and model to use for filtering the markdown.",
        type=Chatterer.from_provider,
    )
    engine: Literal["firefox", "chromium", "webkit"] = "firefox"
    """The browser engine to use."""

    def run(self) -> None:
        """Crawl ``URL`` synchronously, save the markdown, and print its links.

        The plain markdown is written to ``output`` first. When a chatterer
        was supplied, the LLM-processed markdown then overwrites that file.
        The printed links are always taken from the plain markdown.
        """
        chatterer = self.chatterer.value
        url: str = self.URL.strip()
        output: Path = self.output.unwrap().resolve()
        with PlayWrightBot(chatterer=chatterer, engine=self.engine) as bot:
            md = bot.url_to_md(url)
            output.write_text(md, encoding="utf-8")
            if chatterer is not None:
                # `url` is already stripped above; no second strip needed.
                md_llm = bot.url_to_md_with_llm(url)
                output.write_text(md_llm, encoding="utf-8")
            self._print_links(md, url)

    async def arun(self) -> None:
        """Asynchronous counterpart of :meth:`run` with the same steps."""
        chatterer = self.chatterer.value
        url: str = self.URL.strip()
        output: Path = self.output.unwrap().resolve()
        async with PlayWrightBot(chatterer=chatterer, engine=self.engine) as bot:
            md = await bot.aurl_to_md(url)
            output.write_text(md, encoding="utf-8")
            if chatterer is not None:
                # `url` is already stripped above; no second strip needed.
                md_llm = await bot.aurl_to_md_with_llm(url)
                output.write_text(md_llm, encoding="utf-8")
            self._print_links(md, url)

    def _print_links(self, md: str, url: str) -> None:
        """Print a truncated one-line summary of every link and image in *md*.

        Args:
            md: Markdown text to scan for links.
            url: Page URL passed to the link parser as ``referer_url``.
        """
        for link in MarkdownLink.from_markdown(md, referer_url=url):
            if link.type == "link":
                print(
                    f"- [{truncate_string(link.url)}] {truncate_string(link.inline_text)} ({truncate_string(link.inline_title)})"
                )
            elif link.type == "image":
                print(f"- ![{truncate_string(link.url)}] ({truncate_string(link.inline_text)})")
|
69
|
+
|
70
|
+
|
71
|
+
def truncate_string(s: str, max_length: int = 50, suffix: str = "...") -> str:
    """Shorten *s* so it fits on a single display line.

    Args:
        s: Text to shorten.
        max_length: Maximum number of characters of *s* to keep.
        suffix: Marker appended when *s* had to be cut.

    Returns:
        *s* unchanged when it has at most *max_length* characters; otherwise
        its first *max_length* characters followed by *suffix*.

    Raises:
        ValueError: If *max_length* is negative.
    """
    if max_length < 0:
        # A negative slice bound would silently cut from the end instead.
        raise ValueError("max_length must be non-negative")
    if len(s) <= max_length:
        return s
    return s[:max_length] + suffix
|
73
|
+
|
74
|
+
|
75
|
+
def main() -> None:
    """Build the ``Arguments`` object and run the synchronous conversion."""
    arguments = Arguments()
    arguments.run()
|
77
|
+
|
78
|
+
|
79
|
+
if __name__ == "__main__":
|
80
|
+
main()
|