chatterer 0.1.18__py3-none-any.whl → 0.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. chatterer/__init__.py +93 -93
  2. chatterer/common_types/__init__.py +21 -21
  3. chatterer/common_types/io.py +19 -19
  4. chatterer/examples/__init__.py +0 -0
  5. chatterer/examples/anything_to_markdown.py +95 -91
  6. chatterer/examples/get_code_snippets.py +64 -62
  7. chatterer/examples/login_with_playwright.py +171 -167
  8. chatterer/examples/make_ppt.py +499 -497
  9. chatterer/examples/pdf_to_markdown.py +107 -107
  10. chatterer/examples/pdf_to_text.py +60 -56
  11. chatterer/examples/transcription_api.py +127 -123
  12. chatterer/examples/upstage_parser.py +95 -100
  13. chatterer/examples/webpage_to_markdown.py +79 -79
  14. chatterer/interactive.py +354 -354
  15. chatterer/language_model.py +533 -533
  16. chatterer/messages.py +21 -21
  17. chatterer/strategies/__init__.py +13 -13
  18. chatterer/strategies/atom_of_thoughts.py +975 -975
  19. chatterer/strategies/base.py +14 -14
  20. chatterer/tools/__init__.py +46 -46
  21. chatterer/tools/caption_markdown_images.py +384 -384
  22. chatterer/tools/citation_chunking/__init__.py +3 -3
  23. chatterer/tools/citation_chunking/chunks.py +53 -53
  24. chatterer/tools/citation_chunking/citation_chunker.py +118 -118
  25. chatterer/tools/citation_chunking/citations.py +285 -285
  26. chatterer/tools/citation_chunking/prompt.py +157 -157
  27. chatterer/tools/citation_chunking/reference.py +26 -26
  28. chatterer/tools/citation_chunking/utils.py +138 -138
  29. chatterer/tools/convert_pdf_to_markdown.py +302 -302
  30. chatterer/tools/convert_to_text.py +447 -447
  31. chatterer/tools/upstage_document_parser.py +705 -705
  32. chatterer/tools/webpage_to_markdown.py +739 -739
  33. chatterer/tools/youtube.py +146 -146
  34. chatterer/utils/__init__.py +15 -15
  35. chatterer/utils/base64_image.py +285 -285
  36. chatterer/utils/bytesio.py +59 -59
  37. chatterer/utils/code_agent.py +237 -237
  38. chatterer/utils/imghdr.py +148 -148
  39. {chatterer-0.1.18.dist-info → chatterer-0.1.19.dist-info}/METADATA +392 -392
  40. chatterer-0.1.19.dist-info/RECORD +44 -0
  41. {chatterer-0.1.18.dist-info → chatterer-0.1.19.dist-info}/WHEEL +1 -1
  42. chatterer-0.1.19.dist-info/entry_points.txt +10 -0
  43. chatterer-0.1.18.dist-info/RECORD +0 -42
  44. {chatterer-0.1.18.dist-info → chatterer-0.1.19.dist-info}/top_level.txt +0 -0
@@ -1,100 +1,95 @@ chatterer/examples/upstage_parser.py
1
- def resolve_import_path_and_get_logger():
2
- # ruff: noqa: E402
3
- import logging
4
- import sys
5
-
6
- if __name__ == "__main__" and "." not in sys.path:
7
- sys.path.append(".")
8
-
9
- logger = logging.getLogger(__name__)
10
- return logger
11
-
12
-
13
- logger = resolve_import_path_and_get_logger()
14
- from pathlib import Path
15
-
16
- from langchain_core.documents.base import Blob
17
- from spargear import ArgumentSpec, BaseArguments
18
-
19
- from chatterer import Chatterer, UpstageDocumentParseParser
20
- from chatterer.tools.upstage_document_parser import (
21
- DEFAULT_IMAGE_DIR,
22
- DOCUMENT_PARSE_BASE_URL,
23
- DOCUMENT_PARSE_DEFAULT_MODEL,
24
- OCR,
25
- Category,
26
- OutputFormat,
27
- SplitType,
28
- )
29
-
30
-
31
- class UpstageParserArguments(BaseArguments):
32
- in_path: ArgumentSpec[Path] = ArgumentSpec(["in-path"], help="Path to the input file.")
33
- out_path: ArgumentSpec[Path] = ArgumentSpec(["--out-path"], default=None, help="Output file path.")
34
- api_key: ArgumentSpec[str] = ArgumentSpec(["--api-key"], default=None, help="API key for the Upstage API.")
35
- base_url: ArgumentSpec[str] = ArgumentSpec(
36
- ["--base-url"], default=DOCUMENT_PARSE_BASE_URL, help="Base URL for the Upstage API."
37
- )
38
- model: ArgumentSpec[str] = ArgumentSpec(
39
- ["--model"], default=DOCUMENT_PARSE_DEFAULT_MODEL, help="Model to use for parsing."
40
- )
41
- split: ArgumentSpec[SplitType] = ArgumentSpec(["--split"], default="none", help="Split type for parsing.")
42
- ocr: ArgumentSpec[OCR] = ArgumentSpec(["--ocr"], default="auto", help="OCR type for parsing.")
43
- output_format: ArgumentSpec[OutputFormat] = ArgumentSpec(
44
- ["--output-format"], default="markdown", help="Output format."
45
- )
46
- coordinates: ArgumentSpec[bool] = ArgumentSpec(["--coordinates"], action="store_true", help="Include coordinates.")
47
- base64_encoding: ArgumentSpec[list[Category]] = ArgumentSpec(
48
- ["--base64-encoding"], default=["figure"], help="Base64 encoding for specific categories."
49
- )
50
- image_description_instruction: ArgumentSpec[str] = ArgumentSpec(
51
- ["--image-description-instruction"],
52
- default="Describe the image in detail.",
53
- help="Instruction for image description.",
54
- )
55
- image_dir: ArgumentSpec[str] = ArgumentSpec(
56
- ["--image-dir"],
57
- default=DEFAULT_IMAGE_DIR,
58
- help="Directory for image paths.",
59
- )
60
- chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
61
- ["--chatterer"],
62
- default=None,
63
- help="Chatterer instance for communication.",
64
- type=Chatterer.from_provider,
65
- )
66
-
67
- def run(self) -> None:
68
- UpstageParserArguments.load()
69
- input = UpstageParserArguments.in_path.unwrap().resolve()
70
- out = UpstageParserArguments.out_path.value or input.with_suffix(".md")
71
-
72
- parser = UpstageDocumentParseParser(
73
- api_key=UpstageParserArguments.api_key.value,
74
- base_url=UpstageParserArguments.base_url.unwrap(),
75
- model=UpstageParserArguments.model.unwrap(),
76
- split=UpstageParserArguments.split.unwrap(),
77
- ocr=UpstageParserArguments.ocr.unwrap(),
78
- output_format=UpstageParserArguments.output_format.unwrap(),
79
- coordinates=UpstageParserArguments.coordinates.unwrap(),
80
- base64_encoding=UpstageParserArguments.base64_encoding.unwrap(),
81
- image_description_instruction=UpstageParserArguments.image_description_instruction.unwrap(),
82
- image_dir=UpstageParserArguments.image_dir.value,
83
- chatterer=UpstageParserArguments.chatterer.value,
84
- )
85
-
86
- docs = parser.parse(Blob.from_path(input)) # pyright: ignore[reportUnknownMemberType]
87
-
88
- if UpstageParserArguments.image_dir.value:
89
- for path, image in parser.image_data.items():
90
- (path := Path(path)).parent.mkdir(parents=True, exist_ok=True)
91
- path.write_bytes(image)
92
- logger.info(f"Saved image to `{path}`")
93
-
94
- markdown: str = "\n\n".join(f"<!--- page {i} -->\n{doc.page_content}" for i, doc in enumerate(docs, 1))
95
- out.write_text(markdown, encoding="utf-8")
96
- logger.info(f"Parsed `{input}` to `{out}`")
97
-
98
-
99
- if __name__ == "__main__":
100
- UpstageParserArguments().run()
1
+ def resolve_import_path_and_get_logger():
2
+ # ruff: noqa: E402
3
+ import logging
4
+ import sys
5
+
6
+ if __name__ == "__main__" and "." not in sys.path:
7
+ sys.path.append(".")
8
+
9
+ logger = logging.getLogger(__name__)
10
+ return logger
11
+
12
+
13
+ logger = resolve_import_path_and_get_logger()
14
+ from pathlib import Path
15
+
16
+ from langchain_core.documents.base import Blob
17
+ from spargear import ArgumentSpec, BaseArguments
18
+
19
+ from chatterer import Chatterer, UpstageDocumentParseParser
20
+ from chatterer.tools.upstage_document_parser import (
21
+ DEFAULT_IMAGE_DIR,
22
+ DOCUMENT_PARSE_BASE_URL,
23
+ DOCUMENT_PARSE_DEFAULT_MODEL,
24
+ OCR,
25
+ Category,
26
+ OutputFormat,
27
+ SplitType,
28
+ )
29
+
30
+
31
+ class UpstageParserArguments(BaseArguments):
32
+ in_path: ArgumentSpec[Path] = ArgumentSpec(["in-path"], help="Path to the input file.")
33
+ out_path: ArgumentSpec[Path] = ArgumentSpec(["--out-path"], default=None, help="Output file path.")
34
+ api_key: ArgumentSpec[str] = ArgumentSpec(["--api-key"], default=None, help="API key for the Upstage API.")
35
+ base_url: ArgumentSpec[str] = ArgumentSpec(["--base-url"], default=DOCUMENT_PARSE_BASE_URL, help="Base URL for the Upstage API.")
36
+ model: ArgumentSpec[str] = ArgumentSpec(["--model"], default=DOCUMENT_PARSE_DEFAULT_MODEL, help="Model to use for parsing.")
37
+ split: ArgumentSpec[SplitType] = ArgumentSpec(["--split"], default="none", help="Split type for parsing.")
38
+ ocr: ArgumentSpec[OCR] = ArgumentSpec(["--ocr"], default="auto", help="OCR type for parsing.")
39
+ output_format: ArgumentSpec[OutputFormat] = ArgumentSpec(["--output-format"], default="markdown", help="Output format.")
40
+ coordinates: ArgumentSpec[bool] = ArgumentSpec(["--coordinates"], action="store_true", help="Include coordinates.")
41
+ base64_encoding: ArgumentSpec[list[Category]] = ArgumentSpec(["--base64-encoding"], default=["figure"], help="Base64 encoding for specific categories.")
42
+ image_description_instruction: ArgumentSpec[str] = ArgumentSpec(
43
+ ["--image-description-instruction"],
44
+ default="Describe the image in detail.",
45
+ help="Instruction for image description.",
46
+ )
47
+ image_dir: ArgumentSpec[str] = ArgumentSpec(
48
+ ["--image-dir"],
49
+ default=DEFAULT_IMAGE_DIR,
50
+ help="Directory for image paths.",
51
+ )
52
+ chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
53
+ ["--chatterer"],
54
+ default=None,
55
+ help="Chatterer instance for communication.",
56
+ type=Chatterer.from_provider,
57
+ )
58
+
59
+ def run(self) -> None:
60
+ input = UpstageParserArguments.in_path.unwrap().resolve()
61
+ out = UpstageParserArguments.out_path.value or input.with_suffix(".md")
62
+
63
+ parser = UpstageDocumentParseParser(
64
+ api_key=UpstageParserArguments.api_key.value,
65
+ base_url=UpstageParserArguments.base_url.unwrap(),
66
+ model=UpstageParserArguments.model.unwrap(),
67
+ split=UpstageParserArguments.split.unwrap(),
68
+ ocr=UpstageParserArguments.ocr.unwrap(),
69
+ output_format=UpstageParserArguments.output_format.unwrap(),
70
+ coordinates=UpstageParserArguments.coordinates.unwrap(),
71
+ base64_encoding=UpstageParserArguments.base64_encoding.unwrap(),
72
+ image_description_instruction=UpstageParserArguments.image_description_instruction.unwrap(),
73
+ image_dir=UpstageParserArguments.image_dir.value,
74
+ chatterer=UpstageParserArguments.chatterer.value,
75
+ )
76
+
77
+ docs = parser.parse(Blob.from_path(input)) # pyright: ignore[reportUnknownMemberType]
78
+
79
+ if UpstageParserArguments.image_dir.value:
80
+ for path, image in parser.image_data.items():
81
+ (path := Path(path)).parent.mkdir(parents=True, exist_ok=True)
82
+ path.write_bytes(image)
83
+ logger.info(f"Saved image to `{path}`")
84
+
85
+ markdown: str = "\n\n".join(f"<!--- page {i} -->\n{doc.page_content}" for i, doc in enumerate(docs, 1))
86
+ out.write_text(markdown, encoding="utf-8")
87
+ logger.info(f"Parsed `{input}` to `{out}`")
88
+
89
+
90
+ def main() -> None:
91
+ UpstageParserArguments().run()
92
+
93
+
94
+ if __name__ == "__main__":
95
+ main()
@@ -1,79 +1,79 @@ chatterer/examples/webpage_to_markdown.py
1
- def resolve_import_path_and_get_logger():
2
- # ruff: noqa: E402
3
- import logging
4
- import sys
5
-
6
- if __name__ == "__main__" and "." not in sys.path:
7
- sys.path.append(".")
8
-
9
- logger = logging.getLogger(__name__)
10
- return logger
11
-
12
-
13
- logger = resolve_import_path_and_get_logger()
14
- from pathlib import Path
15
- from typing import Literal
16
-
17
- from spargear import ArgumentSpec, BaseArguments
18
-
19
- from chatterer import Chatterer, MarkdownLink, PlayWrightBot
20
-
21
-
22
- class WebpageToMarkdownArgs(BaseArguments):
23
- url: ArgumentSpec[str] = ArgumentSpec(["url"], help="The URL to crawl.")
24
- out_path: str = Path(__file__).with_suffix(".md").as_posix()
25
- """The output file path for the markdown file."""
26
- chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
27
- ["--llm"],
28
- default=None,
29
- type=Chatterer.from_provider,
30
- help="The Chatterer backend and model to use for filtering the markdown.",
31
- )
32
- engine: Literal["firefox", "chromium", "webkit"] = "firefox"
33
- """The browser engine to use."""
34
-
35
- def run(self) -> None:
36
- chatterer = self.chatterer.value
37
- url: str = self.url.unwrap().strip()
38
- out_path: Path = Path(self.out_path).resolve()
39
- with PlayWrightBot(chatterer=chatterer, engine=self.engine) as bot:
40
- md = bot.url_to_md(url)
41
- out_path.write_text(md, encoding="utf-8")
42
- if chatterer is not None:
43
- md_llm = bot.url_to_md_with_llm(url.strip())
44
- out_path.write_text(md_llm, encoding="utf-8")
45
- links = MarkdownLink.from_markdown(md, referer_url=url)
46
- for link in links:
47
- if link.type == "link":
48
- print(
49
- f"- [{truncate_string(link.url)}] {truncate_string(link.inline_text)} ({truncate_string(link.inline_title)})"
50
- )
51
- elif link.type == "image":
52
- print(f"- ![{truncate_string(link.url)}] ({truncate_string(link.inline_text)})")
53
-
54
- async def arun(self) -> None:
55
- chatterer = self.chatterer.value
56
- url: str = self.url.unwrap().strip()
57
- out_path: Path = Path(self.out_path).resolve()
58
- async with PlayWrightBot(chatterer=chatterer, engine=self.engine) as bot:
59
- md = await bot.aurl_to_md(url)
60
- out_path.write_text(md, encoding="utf-8")
61
- if chatterer is not None:
62
- md_llm = await bot.aurl_to_md_with_llm(url.strip())
63
- out_path.write_text(md_llm, encoding="utf-8")
64
- links = MarkdownLink.from_markdown(md, referer_url=url)
65
- for link in links:
66
- if link.type == "link":
67
- print(
68
- f"- [{truncate_string(link.url)}] {truncate_string(link.inline_text)} ({truncate_string(link.inline_title)})"
69
- )
70
- elif link.type == "image":
71
- print(f"- ![{truncate_string(link.url)}] ({truncate_string(link.inline_text)})")
72
-
73
-
74
- def truncate_string(s: str) -> str:
75
- return s[:50] + "..." if len(s) > 50 else s
76
-
77
-
78
- if __name__ == "__main__":
79
- WebpageToMarkdownArgs().run()
1
+ def resolve_import_path_and_get_logger():
2
+ # ruff: noqa: E402
3
+ import logging
4
+ import sys
5
+
6
+ if __name__ == "__main__" and "." not in sys.path:
7
+ sys.path.append(".")
8
+
9
+ logger = logging.getLogger(__name__)
10
+ return logger
11
+
12
+
13
+ logger = resolve_import_path_and_get_logger()
14
+ from pathlib import Path
15
+ from typing import Literal
16
+
17
+ from spargear import ArgumentSpec, BaseArguments
18
+
19
+ from chatterer import Chatterer, MarkdownLink, PlayWrightBot
20
+
21
+
22
+ class WebpageToMarkdownArgs(BaseArguments):
23
+ url: ArgumentSpec[str] = ArgumentSpec(["url"], help="The URL to crawl.")
24
+ out_path: str = Path(__file__).with_suffix(".md").as_posix()
25
+ """The output file path for the markdown file."""
26
+ chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
27
+ ["--llm"],
28
+ default=None,
29
+ type=Chatterer.from_provider,
30
+ help="The Chatterer backend and model to use for filtering the markdown.",
31
+ )
32
+ engine: Literal["firefox", "chromium", "webkit"] = "firefox"
33
+ """The browser engine to use."""
34
+
35
+ def run(self) -> None:
36
+ chatterer = self.chatterer.value
37
+ url: str = self.url.unwrap().strip()
38
+ out_path: Path = Path(self.out_path).resolve()
39
+ with PlayWrightBot(chatterer=chatterer, engine=self.engine) as bot:
40
+ md = bot.url_to_md(url)
41
+ out_path.write_text(md, encoding="utf-8")
42
+ if chatterer is not None:
43
+ md_llm = bot.url_to_md_with_llm(url.strip())
44
+ out_path.write_text(md_llm, encoding="utf-8")
45
+ links = MarkdownLink.from_markdown(md, referer_url=url)
46
+ for link in links:
47
+ if link.type == "link":
48
+ print(f"- [{truncate_string(link.url)}] {truncate_string(link.inline_text)} ({truncate_string(link.inline_title)})")
49
+ elif link.type == "image":
50
+ print(f"- ![{truncate_string(link.url)}] ({truncate_string(link.inline_text)})")
51
+
52
+ async def arun(self) -> None:
53
+ chatterer = self.chatterer.value
54
+ url: str = self.url.unwrap().strip()
55
+ out_path: Path = Path(self.out_path).resolve()
56
+ async with PlayWrightBot(chatterer=chatterer, engine=self.engine) as bot:
57
+ md = await bot.aurl_to_md(url)
58
+ out_path.write_text(md, encoding="utf-8")
59
+ if chatterer is not None:
60
+ md_llm = await bot.aurl_to_md_with_llm(url.strip())
61
+ out_path.write_text(md_llm, encoding="utf-8")
62
+ links = MarkdownLink.from_markdown(md, referer_url=url)
63
+ for link in links:
64
+ if link.type == "link":
65
+ print(f"- [{truncate_string(link.url)}] {truncate_string(link.inline_text)} ({truncate_string(link.inline_title)})")
66
+ elif link.type == "image":
67
+ print(f"- ![{truncate_string(link.url)}] ({truncate_string(link.inline_text)})")
68
+
69
+
70
+ def truncate_string(s: str) -> str:
71
+ return s[:50] + "..." if len(s) > 50 else s
72
+
73
+
74
+ def main() -> None:
75
+ WebpageToMarkdownArgs().run()
76
+
77
+
78
+ if __name__ == "__main__":
79
+ main()