chatterer 0.1.26__py3-none-any.whl → 0.1.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. chatterer/__init__.py +87 -87
  2. chatterer/common_types/__init__.py +21 -21
  3. chatterer/common_types/io.py +19 -19
  4. chatterer/constants.py +5 -0
  5. chatterer/examples/__main__.py +75 -75
  6. chatterer/examples/any2md.py +83 -85
  7. chatterer/examples/pdf2md.py +231 -338
  8. chatterer/examples/pdf2txt.py +52 -54
  9. chatterer/examples/ppt.py +487 -486
  10. chatterer/examples/pw.py +141 -143
  11. chatterer/examples/snippet.py +54 -56
  12. chatterer/examples/transcribe.py +192 -192
  13. chatterer/examples/upstage.py +87 -89
  14. chatterer/examples/web2md.py +80 -80
  15. chatterer/interactive.py +422 -354
  16. chatterer/language_model.py +530 -536
  17. chatterer/messages.py +21 -21
  18. chatterer/tools/__init__.py +46 -46
  19. chatterer/tools/caption_markdown_images.py +388 -384
  20. chatterer/tools/citation_chunking/__init__.py +3 -3
  21. chatterer/tools/citation_chunking/chunks.py +51 -53
  22. chatterer/tools/citation_chunking/citation_chunker.py +117 -118
  23. chatterer/tools/citation_chunking/citations.py +284 -285
  24. chatterer/tools/citation_chunking/prompt.py +157 -157
  25. chatterer/tools/citation_chunking/reference.py +26 -26
  26. chatterer/tools/citation_chunking/utils.py +138 -138
  27. chatterer/tools/convert_pdf_to_markdown.py +634 -645
  28. chatterer/tools/convert_to_text.py +446 -446
  29. chatterer/tools/upstage_document_parser.py +704 -705
  30. chatterer/tools/webpage_to_markdown.py +739 -739
  31. chatterer/tools/youtube.py +146 -147
  32. chatterer/utils/__init__.py +15 -15
  33. chatterer/utils/base64_image.py +349 -350
  34. chatterer/utils/bytesio.py +59 -59
  35. chatterer/utils/code_agent.py +237 -237
  36. chatterer/utils/imghdr.py +145 -145
  37. {chatterer-0.1.26.dist-info → chatterer-0.1.28.dist-info}/METADATA +377 -390
  38. chatterer-0.1.28.dist-info/RECORD +43 -0
  39. chatterer-0.1.26.dist-info/RECORD +0 -42
  40. {chatterer-0.1.26.dist-info → chatterer-0.1.28.dist-info}/WHEEL +0 -0
  41. {chatterer-0.1.26.dist-info → chatterer-0.1.28.dist-info}/entry_points.txt +0 -0
  42. {chatterer-0.1.26.dist-info → chatterer-0.1.28.dist-info}/top_level.txt +0 -0
chatterer/__init__.py CHANGED
@@ -1,87 +1,87 @@
1
- from dotenv import load_dotenv
2
-
3
- from .interactive import interactive_shell
4
- from .language_model import Chatterer
5
- from .messages import (
6
- AIMessage,
7
- BaseMessage,
8
- BaseMessageChunk,
9
- FunctionMessage,
10
- HumanMessage,
11
- LanguageModelInput,
12
- SystemMessage,
13
- UsageMetadata,
14
- )
15
- from .tools import (
16
- CodeSnippets,
17
- MarkdownLink,
18
- PdfToMarkdown,
19
- PlayWrightBot,
20
- PlaywrightLaunchOptions,
21
- PlaywrightOptions,
22
- PlaywrightPersistencyOptions,
23
- UpstageDocumentParseParser,
24
- acaption_markdown_images,
25
- anything_to_markdown,
26
- caption_markdown_images,
27
- citation_chunker,
28
- extract_text_from_pdf,
29
- get_default_html_to_markdown_options,
30
- get_default_playwright_launch_options,
31
- get_youtube_video_details,
32
- get_youtube_video_subtitle,
33
- html_to_markdown,
34
- open_pdf,
35
- pdf_to_text,
36
- pyscripts_to_snippets,
37
- render_pdf_as_image,
38
- )
39
- from .utils import (
40
- Base64Image,
41
- CodeExecutionResult,
42
- FunctionSignature,
43
- get_default_repl_tool,
44
- insert_callables_into_global,
45
- )
46
-
47
- load_dotenv()
48
-
49
- __all__ = [
50
- "Chatterer",
51
- "html_to_markdown",
52
- "anything_to_markdown",
53
- "pdf_to_text",
54
- "get_default_html_to_markdown_options",
55
- "pyscripts_to_snippets",
56
- "citation_chunker",
57
- "BaseMessage",
58
- "HumanMessage",
59
- "SystemMessage",
60
- "AIMessage",
61
- "FunctionMessage",
62
- "Base64Image",
63
- "FunctionSignature",
64
- "CodeExecutionResult",
65
- "get_default_repl_tool",
66
- "insert_callables_into_global",
67
- "get_youtube_video_subtitle",
68
- "get_youtube_video_details",
69
- "interactive_shell",
70
- "UpstageDocumentParseParser",
71
- "BaseMessageChunk",
72
- "CodeSnippets",
73
- "LanguageModelInput",
74
- "UsageMetadata",
75
- "PlayWrightBot",
76
- "PlaywrightLaunchOptions",
77
- "PlaywrightOptions",
78
- "PlaywrightPersistencyOptions",
79
- "get_default_playwright_launch_options",
80
- "acaption_markdown_images",
81
- "caption_markdown_images",
82
- "MarkdownLink",
83
- "PdfToMarkdown",
84
- "extract_text_from_pdf",
85
- "open_pdf",
86
- "render_pdf_as_image",
87
- ]
1
+ from dotenv import load_dotenv
2
+
3
+ from .interactive import interactive_shell
4
+ from .language_model import Chatterer
5
+ from .messages import (
6
+ AIMessage,
7
+ BaseMessage,
8
+ BaseMessageChunk,
9
+ FunctionMessage,
10
+ HumanMessage,
11
+ LanguageModelInput,
12
+ SystemMessage,
13
+ UsageMetadata,
14
+ )
15
+ from .tools import (
16
+ CodeSnippets,
17
+ MarkdownLink,
18
+ PdfToMarkdown,
19
+ PlayWrightBot,
20
+ PlaywrightLaunchOptions,
21
+ PlaywrightOptions,
22
+ PlaywrightPersistencyOptions,
23
+ UpstageDocumentParseParser,
24
+ acaption_markdown_images,
25
+ anything_to_markdown,
26
+ caption_markdown_images,
27
+ citation_chunker,
28
+ extract_text_from_pdf,
29
+ get_default_html_to_markdown_options,
30
+ get_default_playwright_launch_options,
31
+ get_youtube_video_details,
32
+ get_youtube_video_subtitle,
33
+ html_to_markdown,
34
+ open_pdf,
35
+ pdf_to_text,
36
+ pyscripts_to_snippets,
37
+ render_pdf_as_image,
38
+ )
39
+ from .utils import (
40
+ Base64Image,
41
+ CodeExecutionResult,
42
+ FunctionSignature,
43
+ get_default_repl_tool,
44
+ insert_callables_into_global,
45
+ )
46
+
47
+ load_dotenv()
48
+
49
+ __all__ = [
50
+ "Chatterer",
51
+ "html_to_markdown",
52
+ "anything_to_markdown",
53
+ "pdf_to_text",
54
+ "get_default_html_to_markdown_options",
55
+ "pyscripts_to_snippets",
56
+ "citation_chunker",
57
+ "BaseMessage",
58
+ "HumanMessage",
59
+ "SystemMessage",
60
+ "AIMessage",
61
+ "FunctionMessage",
62
+ "Base64Image",
63
+ "FunctionSignature",
64
+ "CodeExecutionResult",
65
+ "get_default_repl_tool",
66
+ "insert_callables_into_global",
67
+ "get_youtube_video_subtitle",
68
+ "get_youtube_video_details",
69
+ "interactive_shell",
70
+ "UpstageDocumentParseParser",
71
+ "BaseMessageChunk",
72
+ "CodeSnippets",
73
+ "LanguageModelInput",
74
+ "UsageMetadata",
75
+ "PlayWrightBot",
76
+ "PlaywrightLaunchOptions",
77
+ "PlaywrightOptions",
78
+ "PlaywrightPersistencyOptions",
79
+ "get_default_playwright_launch_options",
80
+ "acaption_markdown_images",
81
+ "caption_markdown_images",
82
+ "MarkdownLink",
83
+ "PdfToMarkdown",
84
+ "extract_text_from_pdf",
85
+ "open_pdf",
86
+ "render_pdf_as_image",
87
+ ]
@@ -1,21 +1,21 @@
1
- from .io import (
2
- BytesReadable,
3
- BytesWritable,
4
- FileDescriptorOrPath,
5
- PathOrReadable,
6
- Readable,
7
- StringReadable,
8
- StringWritable,
9
- Writable,
10
- )
11
-
12
- __all__ = [
13
- "BytesReadable",
14
- "BytesWritable",
15
- "FileDescriptorOrPath",
16
- "PathOrReadable",
17
- "Readable",
18
- "StringReadable",
19
- "StringWritable",
20
- "Writable",
21
- ]
1
+ from .io import (
2
+ BytesReadable,
3
+ BytesWritable,
4
+ FileDescriptorOrPath,
5
+ PathOrReadable,
6
+ Readable,
7
+ StringReadable,
8
+ StringWritable,
9
+ Writable,
10
+ )
11
+
12
+ __all__ = [
13
+ "BytesReadable",
14
+ "BytesWritable",
15
+ "FileDescriptorOrPath",
16
+ "PathOrReadable",
17
+ "Readable",
18
+ "StringReadable",
19
+ "StringWritable",
20
+ "Writable",
21
+ ]
@@ -1,19 +1,19 @@
1
- import os
2
- from io import BufferedReader, BufferedWriter, BytesIO, StringIO, TextIOWrapper
3
- from typing import TypeAlias
4
-
5
- # Type aliases for callback functions and file descriptors
6
- FileDescriptorOrPath: TypeAlias = int | str | bytes | os.PathLike[str] | os.PathLike[bytes]
7
-
8
- # Type aliases for different types of IO objects
9
- BytesReadable: TypeAlias = BytesIO | BufferedReader
10
- BytesWritable: TypeAlias = BytesIO | BufferedWriter
11
- StringReadable: TypeAlias = StringIO | TextIOWrapper
12
- StringWritable: TypeAlias = StringIO | TextIOWrapper
13
-
14
- # Combined type aliases for readable and writable objects
15
- Readable: TypeAlias = BytesReadable | StringReadable
16
- Writable: TypeAlias = BytesWritable | StringWritable
17
-
18
- # Type alias for path or readable object
19
- PathOrReadable: TypeAlias = FileDescriptorOrPath | Readable
1
+ import os
2
+ from io import BufferedReader, BufferedWriter, BytesIO, StringIO, TextIOWrapper
3
+ from typing import TypeAlias
4
+
5
+ # Type aliases for callback functions and file descriptors
6
+ FileDescriptorOrPath: TypeAlias = int | str | bytes | os.PathLike[str] | os.PathLike[bytes]
7
+
8
+ # Type aliases for different types of IO objects
9
+ BytesReadable: TypeAlias = BytesIO | BufferedReader
10
+ BytesWritable: TypeAlias = BytesIO | BufferedWriter
11
+ StringReadable: TypeAlias = StringIO | TextIOWrapper
12
+ StringWritable: TypeAlias = StringIO | TextIOWrapper
13
+
14
+ # Combined type aliases for readable and writable objects
15
+ Readable: TypeAlias = BytesReadable | StringReadable
16
+ Writable: TypeAlias = BytesWritable | StringWritable
17
+
18
+ # Type alias for path or readable object
19
+ PathOrReadable: TypeAlias = FileDescriptorOrPath | Readable
chatterer/constants.py ADDED
@@ -0,0 +1,5 @@
1
+ DEFAULT_ANTHROPIC_MODEL = "claude-3-7-sonnet-20250219"
2
+ DEFAULT_GOOGLE_MODEL = "gemini-2.5-flash"
3
+ DEFAULT_OPENAI_MODEL = "gpt-5-mini"
4
+ DEFAULT_OPENROUTER_MODEL = "z-ai/glm-4.6"
5
+ DEFAULT_XAI_MODEL = "grok-4-fast-non-reasoning"
@@ -1,75 +1,75 @@
1
- from spargear import SubcommandArguments, SubcommandSpec
2
-
3
-
4
- def any2md():
5
- from .any2md import Arguments
6
-
7
- return Arguments
8
-
9
-
10
- def pdf2md():
11
- from .pdf2md import Arguments
12
-
13
- return Arguments
14
-
15
-
16
- def pdf2txt():
17
- from .pdf2txt import Arguments
18
-
19
- return Arguments
20
-
21
-
22
- def ppt():
23
- from .ppt import Arguments
24
-
25
- return Arguments
26
-
27
-
28
- def pw():
29
- from .pw import Arguments
30
-
31
- return Arguments
32
-
33
-
34
- def snippet():
35
- from .snippet import Arguments
36
-
37
- return Arguments
38
-
39
-
40
- def transcribe():
41
- from .transcribe import Arguments
42
-
43
- return Arguments
44
-
45
-
46
- def upstage():
47
- from .upstage import Arguments
48
-
49
- return Arguments
50
-
51
-
52
- def web2md():
53
- from .web2md import Arguments
54
-
55
- return Arguments
56
-
57
-
58
- class Arguments(SubcommandArguments):
59
- any2md = SubcommandSpec(name="any2md", argument_class_factory=any2md)
60
- pdf2md = SubcommandSpec(name="pdf2md", argument_class_factory=pdf2md)
61
- pdf2txt = SubcommandSpec(name="pdf2txt", argument_class_factory=pdf2txt)
62
- ppt = SubcommandSpec(name="ppt", argument_class_factory=ppt)
63
- pw = SubcommandSpec(name="pw", argument_class_factory=pw)
64
- snippet = SubcommandSpec(name="snippet", argument_class_factory=snippet)
65
- transcribe = SubcommandSpec(name="transcribe", argument_class_factory=transcribe)
66
- upstage = SubcommandSpec(name="upstage", argument_class_factory=upstage)
67
- web2md = SubcommandSpec(name="web2md", argument_class_factory=web2md)
68
-
69
-
70
- def main():
71
- Arguments().execute()
72
-
73
-
74
- if __name__ == "__main__":
75
- main()
1
+ from spargear import SubcommandArguments, SubcommandSpec
2
+
3
+
4
+ def any2md():
5
+ from .any2md import Arguments
6
+
7
+ return Arguments
8
+
9
+
10
+ def pdf2md():
11
+ from .pdf2md import Arguments
12
+
13
+ return Arguments
14
+
15
+
16
+ def pdf2txt():
17
+ from .pdf2txt import Arguments
18
+
19
+ return Arguments
20
+
21
+
22
+ def ppt():
23
+ from .ppt import Arguments
24
+
25
+ return Arguments
26
+
27
+
28
+ def pw():
29
+ from .pw import Arguments
30
+
31
+ return Arguments
32
+
33
+
34
+ def snippet():
35
+ from .snippet import Arguments
36
+
37
+ return Arguments
38
+
39
+
40
+ def transcribe():
41
+ from .transcribe import Arguments
42
+
43
+ return Arguments
44
+
45
+
46
+ def upstage():
47
+ from .upstage import Arguments
48
+
49
+ return Arguments
50
+
51
+
52
+ def web2md():
53
+ from .web2md import Arguments
54
+
55
+ return Arguments
56
+
57
+
58
+ class Arguments(SubcommandArguments):
59
+ any2md = SubcommandSpec(name="any2md", argument_class_factory=any2md)
60
+ pdf2md = SubcommandSpec(name="pdf2md", argument_class_factory=pdf2md)
61
+ pdf2txt = SubcommandSpec(name="pdf2txt", argument_class_factory=pdf2txt)
62
+ ppt = SubcommandSpec(name="ppt", argument_class_factory=ppt)
63
+ pw = SubcommandSpec(name="pw", argument_class_factory=pw)
64
+ snippet = SubcommandSpec(name="snippet", argument_class_factory=snippet)
65
+ transcribe = SubcommandSpec(name="transcribe", argument_class_factory=transcribe)
66
+ upstage = SubcommandSpec(name="upstage", argument_class_factory=upstage)
67
+ web2md = SubcommandSpec(name="web2md", argument_class_factory=web2md)
68
+
69
+
70
+ def main():
71
+ Arguments().execute()
72
+
73
+
74
+ if __name__ == "__main__":
75
+ main()
@@ -1,85 +1,83 @@
1
- import logging
2
- from pathlib import Path
3
- from typing import Optional, TypedDict
4
-
5
- import openai
6
- from spargear import RunnableArguments
7
-
8
- from chatterer import anything_to_markdown
9
-
10
- logger = logging.getLogger(__name__)
11
-
12
-
13
- class AnythingToMarkdownReturns(TypedDict):
14
- input: str
15
- output: Optional[str]
16
- out_text: str
17
-
18
-
19
- class Arguments(RunnableArguments[AnythingToMarkdownReturns]):
20
- """Command line arguments for converting various file types to markdown."""
21
-
22
- SOURCE: str
23
- """Input file to convert to markdown. Can be a file path or a URL."""
24
- output: Optional[str] = None
25
- """Output path for the converted markdown file. If not provided, the input file's suffix is replaced with .md"""
26
- model: Optional[str] = None
27
- """OpenAI Model to use for conversion"""
28
- api_key: Optional[str] = None
29
- """API key for OpenAI API"""
30
- base_url: Optional[str] = None
31
- """Base URL for OpenAI API"""
32
- style_map: Optional[str] = None
33
- """Output style map"""
34
- exiftool_path: Optional[str] = None
35
- """"Path to exiftool for metadata extraction"""
36
- docintel_endpoint: Optional[str] = None
37
- "Document Intelligence API endpoint"
38
- prevent_save_file: bool = False
39
- """Prevent saving the converted file to disk."""
40
- encoding: str = "utf-8"
41
- """Encoding for the output file."""
42
-
43
- def run(self) -> AnythingToMarkdownReturns:
44
- if not self.prevent_save_file:
45
- if not self.output:
46
- output = Path(self.SOURCE).with_suffix(".md")
47
- else:
48
- output = Path(self.output)
49
- else:
50
- output = None
51
-
52
- if self.model:
53
- llm_client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
54
- llm_model = self.model
55
- else:
56
- llm_client = None
57
- llm_model = None
58
-
59
- text: str = anything_to_markdown(
60
- self.SOURCE,
61
- llm_client=llm_client,
62
- llm_model=llm_model,
63
- style_map=self.style_map,
64
- exiftool_path=self.exiftool_path,
65
- docintel_endpoint=self.docintel_endpoint,
66
- )
67
- if output:
68
- output.parent.mkdir(parents=True, exist_ok=True)
69
- output.write_text(text, encoding=self.encoding)
70
- logger.info(f"Converted `{self.SOURCE}` to markdown and saved to `{output}`.")
71
- else:
72
- logger.info(f"Converted `{self.SOURCE}` to markdown.")
73
- return {
74
- "input": self.SOURCE,
75
- "output": str(output) if output is not None else None,
76
- "out_text": text,
77
- }
78
-
79
-
80
- def main() -> None:
81
- Arguments().run()
82
-
83
-
84
- if __name__ == "__main__":
85
- main()
1
+ from pathlib import Path
2
+ from typing import Optional, TypedDict
3
+
4
+ import openai
5
+ from loguru import logger
6
+ from spargear import RunnableArguments
7
+
8
+ from chatterer import anything_to_markdown
9
+
10
+
11
+ class AnythingToMarkdownReturns(TypedDict):
12
+ input: str
13
+ output: Optional[str]
14
+ out_text: str
15
+
16
+
17
+ class Arguments(RunnableArguments[AnythingToMarkdownReturns]):
18
+ """Command line arguments for converting various file types to markdown."""
19
+
20
+ SOURCE: str
21
+ """Input file to convert to markdown. Can be a file path or a URL."""
22
+ output: Optional[str] = None
23
+ """Output path for the converted markdown file. If not provided, the input file's suffix is replaced with .md"""
24
+ model: Optional[str] = None
25
+ """OpenAI Model to use for conversion"""
26
+ api_key: Optional[str] = None
27
+ """API key for OpenAI API"""
28
+ base_url: Optional[str] = None
29
+ """Base URL for OpenAI API"""
30
+ style_map: Optional[str] = None
31
+ """Output style map"""
32
+ exiftool_path: Optional[str] = None
33
+ """"Path to exiftool for metadata extraction"""
34
+ docintel_endpoint: Optional[str] = None
35
+ "Document Intelligence API endpoint"
36
+ prevent_save_file: bool = False
37
+ """Prevent saving the converted file to disk."""
38
+ encoding: str = "utf-8"
39
+ """Encoding for the output file."""
40
+
41
+ def run(self) -> AnythingToMarkdownReturns:
42
+ if not self.prevent_save_file:
43
+ if not self.output:
44
+ output = Path(self.SOURCE).with_suffix(".md")
45
+ else:
46
+ output = Path(self.output)
47
+ else:
48
+ output = None
49
+
50
+ if self.model:
51
+ llm_client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
52
+ llm_model = self.model
53
+ else:
54
+ llm_client = None
55
+ llm_model = None
56
+
57
+ text: str = anything_to_markdown(
58
+ self.SOURCE,
59
+ llm_client=llm_client,
60
+ llm_model=llm_model,
61
+ style_map=self.style_map,
62
+ exiftool_path=self.exiftool_path,
63
+ docintel_endpoint=self.docintel_endpoint,
64
+ )
65
+ if output:
66
+ output.parent.mkdir(parents=True, exist_ok=True)
67
+ output.write_text(text, encoding=self.encoding)
68
+ logger.info(f"Converted `{self.SOURCE}` to markdown and saved to `{output}`.")
69
+ else:
70
+ logger.info(f"Converted `{self.SOURCE}` to markdown.")
71
+ return {
72
+ "input": self.SOURCE,
73
+ "output": str(output) if output is not None else None,
74
+ "out_text": text,
75
+ }
76
+
77
+
78
+ def main() -> None:
79
+ Arguments().run()
80
+
81
+
82
+ if __name__ == "__main__":
83
+ main()