chatterer 0.1.18__py3-none-any.whl → 0.1.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatterer/__init__.py +93 -93
- chatterer/common_types/__init__.py +21 -21
- chatterer/common_types/io.py +19 -19
- chatterer/examples/__init__.py +0 -0
- chatterer/examples/anything_to_markdown.py +85 -91
- chatterer/examples/get_code_snippets.py +55 -62
- chatterer/examples/login_with_playwright.py +156 -167
- chatterer/examples/make_ppt.py +488 -497
- chatterer/examples/pdf_to_markdown.py +100 -107
- chatterer/examples/pdf_to_text.py +54 -56
- chatterer/examples/transcription_api.py +112 -123
- chatterer/examples/upstage_parser.py +89 -100
- chatterer/examples/webpage_to_markdown.py +70 -79
- chatterer/interactive.py +354 -354
- chatterer/language_model.py +533 -533
- chatterer/messages.py +21 -21
- chatterer/strategies/__init__.py +13 -13
- chatterer/strategies/atom_of_thoughts.py +975 -975
- chatterer/strategies/base.py +14 -14
- chatterer/tools/__init__.py +46 -46
- chatterer/tools/caption_markdown_images.py +384 -384
- chatterer/tools/citation_chunking/__init__.py +3 -3
- chatterer/tools/citation_chunking/chunks.py +53 -53
- chatterer/tools/citation_chunking/citation_chunker.py +118 -118
- chatterer/tools/citation_chunking/citations.py +285 -285
- chatterer/tools/citation_chunking/prompt.py +157 -157
- chatterer/tools/citation_chunking/reference.py +26 -26
- chatterer/tools/citation_chunking/utils.py +138 -138
- chatterer/tools/convert_pdf_to_markdown.py +393 -302
- chatterer/tools/convert_to_text.py +446 -447
- chatterer/tools/upstage_document_parser.py +705 -705
- chatterer/tools/webpage_to_markdown.py +739 -739
- chatterer/tools/youtube.py +146 -146
- chatterer/utils/__init__.py +15 -15
- chatterer/utils/base64_image.py +285 -285
- chatterer/utils/bytesio.py +59 -59
- chatterer/utils/code_agent.py +237 -237
- chatterer/utils/imghdr.py +148 -148
- {chatterer-0.1.18.dist-info → chatterer-0.1.20.dist-info}/METADATA +392 -392
- chatterer-0.1.20.dist-info/RECORD +44 -0
- {chatterer-0.1.18.dist-info → chatterer-0.1.20.dist-info}/WHEEL +1 -1
- chatterer-0.1.20.dist-info/entry_points.txt +10 -0
- chatterer-0.1.18.dist-info/RECORD +0 -42
- {chatterer-0.1.18.dist-info → chatterer-0.1.20.dist-info}/top_level.txt +0 -0
@@ -1,100 +1,89 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
["--
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
)
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
(path := Path(path)).parent.mkdir(parents=True, exist_ok=True)
|
91
|
-
path.write_bytes(image)
|
92
|
-
logger.info(f"Saved image to `{path}`")
|
93
|
-
|
94
|
-
markdown: str = "\n\n".join(f"<!--- page {i} -->\n{doc.page_content}" for i, doc in enumerate(docs, 1))
|
95
|
-
out.write_text(markdown, encoding="utf-8")
|
96
|
-
logger.info(f"Parsed `{input}` to `{out}`")
|
97
|
-
|
98
|
-
|
99
|
-
if __name__ == "__main__":
|
100
|
-
UpstageParserArguments().run()
|
1
|
+
import logging
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
from langchain_core.documents.base import Blob
|
6
|
+
from spargear import ArgumentSpec, BaseArguments
|
7
|
+
|
8
|
+
from chatterer import Chatterer, UpstageDocumentParseParser
|
9
|
+
from chatterer.tools.upstage_document_parser import (
|
10
|
+
DEFAULT_IMAGE_DIR,
|
11
|
+
DOCUMENT_PARSE_BASE_URL,
|
12
|
+
DOCUMENT_PARSE_DEFAULT_MODEL,
|
13
|
+
OCR,
|
14
|
+
Category,
|
15
|
+
OutputFormat,
|
16
|
+
SplitType,
|
17
|
+
)
|
18
|
+
|
19
|
+
logger = logging.getLogger(__name__)
|
20
|
+
|
21
|
+
|
22
|
+
class UpstageParserArguments(BaseArguments):
    """Arguments for parsing a single document with `UpstageDocumentParseParser`.

    `run` parses `input`, optionally writes the images collected by the parser
    to disk, and writes the page contents to one markdown file.
    """

    input: Path
    """Input file to parse. Can be a PDF, image, or other supported formats."""
    output: Optional[Path] = None
    """Output file path for the parsed content. Defaults to input file with .md suffix if not provided."""
    api_key: Optional[str] = None
    """API key for the Upstage API."""
    base_url: str = DOCUMENT_PARSE_BASE_URL
    """Base URL for the Upstage API."""
    model: str = DOCUMENT_PARSE_DEFAULT_MODEL
    """Model to use for parsing."""
    split: SplitType = "none"
    """Split type for the parsed content."""
    ocr: OCR = "auto"
    """OCR type for parsing."""
    output_format: OutputFormat = "markdown"
    """Output format for the parsed content."""
    coordinates: bool = False
    """Whether to include coordinates in the output."""
    base64_encoding: list[Category] = ["figure"]
    """Base64 encoding for specific categories in the parsed content."""
    image_description_instruction: str = "Describe the image in detail."
    """Instruction for generating image descriptions."""
    image_dir: str = DEFAULT_IMAGE_DIR
    """Directory to save images extracted from the document."""
    chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
        ["--chatterer"],
        default=None,
        help="Chatterer instance for communication.",
        type=Chatterer.from_provider,
    )

    def run(self) -> None:
        """Parse `input` and write the result as markdown.

        Every page content is prefixed with a `<!--- page N -->` marker
        (1-based) and the pages are joined with blank lines. When `image_dir`
        is truthy, each entry of the parser's `image_data` mapping is written
        to its path, creating parent directories as needed.
        """
        # Read values from this instance rather than from the class object, so
        # the command does not rely on parsed values being mirrored onto the
        # class (`input` has no class-level default to fall back on).
        input_path = self.input.resolve()
        out = self.output or input_path.with_suffix(".md")

        parser = UpstageDocumentParseParser(
            api_key=self.api_key,
            base_url=self.base_url,
            model=self.model,
            split=self.split,
            ocr=self.ocr,
            output_format=self.output_format,
            coordinates=self.coordinates,
            base64_encoding=self.base64_encoding,
            image_description_instruction=self.image_description_instruction,
            image_dir=self.image_dir,
            chatterer=self.chatterer.value,
        )
        docs = parser.parse(Blob.from_path(input_path))  # pyright: ignore[reportUnknownMemberType]

        if self.image_dir:
            for raw_path, image in parser.image_data.items():
                image_path = Path(raw_path)
                image_path.parent.mkdir(parents=True, exist_ok=True)
                image_path.write_bytes(image)
                logger.info(f"Saved image to `{image_path}`")

        markdown: str = "\n\n".join(f"<!--- page {i} -->\n{doc.page_content}" for i, doc in enumerate(docs, 1))
        # Mirror the image handling above: an explicit --output may point into
        # a directory that does not exist yet.
        out.parent.mkdir(parents=True, exist_ok=True)
        out.write_text(markdown, encoding="utf-8")
        logger.info(f"Parsed `{input_path}` to `{out}`")
|
82
|
+
|
83
|
+
|
84
|
+
def main() -> None:
    """Build the argument object and execute its `run` method."""
    arguments = UpstageParserArguments()
    arguments.run()


# Allow the module to be executed directly as a script.
if __name__ == "__main__":
    main()
|
@@ -1,79 +1,70 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
print(f"- ![{truncate_string(link.url)}] ({truncate_string(link.inline_text)})")
|
72
|
-
|
73
|
-
|
74
|
-
def truncate_string(s: str) -> str:
|
75
|
-
return s[:50] + "..." if len(s) > 50 else s
|
76
|
-
|
77
|
-
|
78
|
-
if __name__ == "__main__":
|
79
|
-
WebpageToMarkdownArgs().run()
|
1
|
+
from pathlib import Path
|
2
|
+
from typing import Literal
|
3
|
+
|
4
|
+
from spargear import ArgumentSpec, BaseArguments
|
5
|
+
|
6
|
+
from chatterer import Chatterer, MarkdownLink, PlayWrightBot
|
7
|
+
|
8
|
+
|
9
|
+
class WebpageToMarkdownArgs(BaseArguments):
    """Arguments for converting a web page to markdown with `PlayWrightBot`.

    `run` and `arun` are the synchronous and asynchronous variants of the same
    command: fetch the page as markdown, write it to `output`, optionally
    overwrite it with the LLM-based conversion, then print the links found in
    the plain markdown.
    """

    url: str
    """The URL to crawl."""
    output: str = Path(__file__).with_suffix(".md").as_posix()
    """The output file path for the markdown file."""
    chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
        ["--chatterer"],
        help="The Chatterer backend and model to use for filtering the markdown.",
        type=Chatterer.from_provider,
    )
    engine: Literal["firefox", "chromium", "webkit"] = "firefox"
    """The browser engine to use."""

    def run(self) -> None:
        """Convert `url` to markdown synchronously and print its links."""
        chatterer = self.chatterer.value
        url: str = self.url.strip()
        output: Path = Path(self.output).resolve()
        with PlayWrightBot(chatterer=chatterer, engine=self.engine) as bot:
            md = bot.url_to_md(url)
            output.write_text(md, encoding="utf-8")
            if chatterer is not None:
                # The LLM-based result replaces the plain conversion on disk.
                md_llm = bot.url_to_md_with_llm(url)
                output.write_text(md_llm, encoding="utf-8")
            self._print_links(md, url)

    async def arun(self) -> None:
        """Convert `url` to markdown asynchronously and print its links."""
        chatterer = self.chatterer.value
        url: str = self.url.strip()
        output: Path = Path(self.output).resolve()
        async with PlayWrightBot(chatterer=chatterer, engine=self.engine) as bot:
            md = await bot.aurl_to_md(url)
            output.write_text(md, encoding="utf-8")
            if chatterer is not None:
                # The LLM-based result replaces the plain conversion on disk.
                md_llm = await bot.aurl_to_md_with_llm(url)
                output.write_text(md_llm, encoding="utf-8")
            self._print_links(md, url)

    def _print_links(self, md: str, url: str) -> None:
        """Print one truncated summary line per link or image found in `md`.

        Links are extracted with `MarkdownLink.from_markdown`, passing `url`
        as the referer. Entries whose type is neither "link" nor "image" are
        skipped.
        """
        for link in MarkdownLink.from_markdown(md, referer_url=url):
            if link.type == "link":
                print(
                    f"- [{truncate_string(link.url)}] {truncate_string(link.inline_text)} ({truncate_string(link.inline_title)})"
                )
            elif link.type == "image":
                print(f"- ![{truncate_string(link.url)}] ({truncate_string(link.inline_text)})")
|
59
|
+
|
60
|
+
|
61
|
+
def truncate_string(s: str, max_length: int = 50) -> str:
    """Return *s* cut to its first *max_length* characters, marked with "...".

    Strings of at most *max_length* characters are returned unchanged. Longer
    strings are shortened to their first *max_length* characters and suffixed
    with ``"..."``, so a truncated result is ``max_length + 3`` characters.

    Raises:
        ValueError: If *max_length* is negative.
    """
    if max_length < 0:
        # A negative slice bound would silently drop characters from the end.
        raise ValueError(f"max_length must be non-negative, got {max_length}")
    if len(s) <= max_length:
        return s
    return s[:max_length] + "..."
|
63
|
+
|
64
|
+
|
65
|
+
def main() -> None:
    """Build the argument object and execute its synchronous `run` method."""
    arguments = WebpageToMarkdownArgs()
    arguments.run()


# Allow the module to be executed directly as a script.
if __name__ == "__main__":
    main()
|