chatterer 0.1.24__py3-none-any.whl → 0.1.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. chatterer/__init__.py +87 -93
  2. chatterer/common_types/__init__.py +21 -21
  3. chatterer/common_types/io.py +19 -19
  4. chatterer/examples/__main__.py +75 -75
  5. chatterer/examples/any2md.py +85 -85
  6. chatterer/examples/pdf2md.py +338 -338
  7. chatterer/examples/pdf2txt.py +54 -54
  8. chatterer/examples/ppt.py +486 -486
  9. chatterer/examples/pw.py +143 -137
  10. chatterer/examples/snippet.py +56 -55
  11. chatterer/examples/transcribe.py +192 -112
  12. chatterer/examples/upstage.py +89 -89
  13. chatterer/examples/web2md.py +80 -66
  14. chatterer/interactive.py +354 -354
  15. chatterer/language_model.py +536 -536
  16. chatterer/messages.py +21 -21
  17. chatterer/tools/__init__.py +46 -46
  18. chatterer/tools/caption_markdown_images.py +384 -384
  19. chatterer/tools/citation_chunking/__init__.py +3 -3
  20. chatterer/tools/citation_chunking/chunks.py +53 -53
  21. chatterer/tools/citation_chunking/citation_chunker.py +118 -118
  22. chatterer/tools/citation_chunking/citations.py +285 -285
  23. chatterer/tools/citation_chunking/prompt.py +157 -157
  24. chatterer/tools/citation_chunking/reference.py +26 -26
  25. chatterer/tools/citation_chunking/utils.py +138 -138
  26. chatterer/tools/convert_pdf_to_markdown.py +645 -625
  27. chatterer/tools/convert_to_text.py +446 -446
  28. chatterer/tools/upstage_document_parser.py +705 -705
  29. chatterer/tools/webpage_to_markdown.py +739 -739
  30. chatterer/tools/youtube.py +146 -146
  31. chatterer/utils/__init__.py +15 -15
  32. chatterer/utils/base64_image.py +350 -285
  33. chatterer/utils/bytesio.py +59 -59
  34. chatterer/utils/code_agent.py +237 -237
  35. chatterer/utils/imghdr.py +145 -148
  36. {chatterer-0.1.24.dist-info → chatterer-0.1.26.dist-info}/METADATA +390 -389
  37. chatterer-0.1.26.dist-info/RECORD +42 -0
  38. chatterer/strategies/__init__.py +0 -13
  39. chatterer/strategies/atom_of_thoughts.py +0 -975
  40. chatterer/strategies/base.py +0 -14
  41. chatterer-0.1.24.dist-info/RECORD +0 -45
  42. {chatterer-0.1.24.dist-info → chatterer-0.1.26.dist-info}/WHEEL +0 -0
  43. {chatterer-0.1.24.dist-info → chatterer-0.1.26.dist-info}/entry_points.txt +0 -0
  44. {chatterer-0.1.24.dist-info → chatterer-0.1.26.dist-info}/top_level.txt +0 -0
chatterer/__init__.py CHANGED
@@ -1,93 +1,87 @@
1
- from .interactive import interactive_shell
2
- from .language_model import Chatterer
3
- from .messages import (
4
- AIMessage,
5
- BaseMessage,
6
- BaseMessageChunk,
7
- FunctionMessage,
8
- HumanMessage,
9
- LanguageModelInput,
10
- SystemMessage,
11
- UsageMetadata,
12
- )
13
- from .strategies import (
14
- AoTPipeline,
15
- AoTPrompter,
16
- AoTStrategy,
17
- BaseStrategy,
18
- )
19
- from .tools import (
20
- CodeSnippets,
21
- MarkdownLink,
22
- PdfToMarkdown,
23
- PlayWrightBot,
24
- PlaywrightLaunchOptions,
25
- PlaywrightOptions,
26
- PlaywrightPersistencyOptions,
27
- UpstageDocumentParseParser,
28
- acaption_markdown_images,
29
- anything_to_markdown,
30
- caption_markdown_images,
31
- citation_chunker,
32
- extract_text_from_pdf,
33
- get_default_html_to_markdown_options,
34
- get_default_playwright_launch_options,
35
- get_youtube_video_details,
36
- get_youtube_video_subtitle,
37
- html_to_markdown,
38
- open_pdf,
39
- pdf_to_text,
40
- pyscripts_to_snippets,
41
- render_pdf_as_image,
42
- )
43
- from .utils import (
44
- Base64Image,
45
- CodeExecutionResult,
46
- FunctionSignature,
47
- get_default_repl_tool,
48
- insert_callables_into_global,
49
- )
50
-
51
- __all__ = [
52
- "BaseStrategy",
53
- "Chatterer",
54
- "AoTStrategy",
55
- "AoTPipeline",
56
- "AoTPrompter",
57
- "html_to_markdown",
58
- "anything_to_markdown",
59
- "pdf_to_text",
60
- "get_default_html_to_markdown_options",
61
- "pyscripts_to_snippets",
62
- "citation_chunker",
63
- "BaseMessage",
64
- "HumanMessage",
65
- "SystemMessage",
66
- "AIMessage",
67
- "FunctionMessage",
68
- "Base64Image",
69
- "FunctionSignature",
70
- "CodeExecutionResult",
71
- "get_default_repl_tool",
72
- "insert_callables_into_global",
73
- "get_youtube_video_subtitle",
74
- "get_youtube_video_details",
75
- "interactive_shell",
76
- "UpstageDocumentParseParser",
77
- "BaseMessageChunk",
78
- "CodeSnippets",
79
- "LanguageModelInput",
80
- "UsageMetadata",
81
- "PlayWrightBot",
82
- "PlaywrightLaunchOptions",
83
- "PlaywrightOptions",
84
- "PlaywrightPersistencyOptions",
85
- "get_default_playwright_launch_options",
86
- "acaption_markdown_images",
87
- "caption_markdown_images",
88
- "MarkdownLink",
89
- "PdfToMarkdown",
90
- "extract_text_from_pdf",
91
- "open_pdf",
92
- "render_pdf_as_image",
93
- ]
1
+ from dotenv import load_dotenv
2
+
3
+ from .interactive import interactive_shell
4
+ from .language_model import Chatterer
5
+ from .messages import (
6
+ AIMessage,
7
+ BaseMessage,
8
+ BaseMessageChunk,
9
+ FunctionMessage,
10
+ HumanMessage,
11
+ LanguageModelInput,
12
+ SystemMessage,
13
+ UsageMetadata,
14
+ )
15
+ from .tools import (
16
+ CodeSnippets,
17
+ MarkdownLink,
18
+ PdfToMarkdown,
19
+ PlayWrightBot,
20
+ PlaywrightLaunchOptions,
21
+ PlaywrightOptions,
22
+ PlaywrightPersistencyOptions,
23
+ UpstageDocumentParseParser,
24
+ acaption_markdown_images,
25
+ anything_to_markdown,
26
+ caption_markdown_images,
27
+ citation_chunker,
28
+ extract_text_from_pdf,
29
+ get_default_html_to_markdown_options,
30
+ get_default_playwright_launch_options,
31
+ get_youtube_video_details,
32
+ get_youtube_video_subtitle,
33
+ html_to_markdown,
34
+ open_pdf,
35
+ pdf_to_text,
36
+ pyscripts_to_snippets,
37
+ render_pdf_as_image,
38
+ )
39
+ from .utils import (
40
+ Base64Image,
41
+ CodeExecutionResult,
42
+ FunctionSignature,
43
+ get_default_repl_tool,
44
+ insert_callables_into_global,
45
+ )
46
+
47
+ load_dotenv()
48
+
49
+ __all__ = [
50
+ "Chatterer",
51
+ "html_to_markdown",
52
+ "anything_to_markdown",
53
+ "pdf_to_text",
54
+ "get_default_html_to_markdown_options",
55
+ "pyscripts_to_snippets",
56
+ "citation_chunker",
57
+ "BaseMessage",
58
+ "HumanMessage",
59
+ "SystemMessage",
60
+ "AIMessage",
61
+ "FunctionMessage",
62
+ "Base64Image",
63
+ "FunctionSignature",
64
+ "CodeExecutionResult",
65
+ "get_default_repl_tool",
66
+ "insert_callables_into_global",
67
+ "get_youtube_video_subtitle",
68
+ "get_youtube_video_details",
69
+ "interactive_shell",
70
+ "UpstageDocumentParseParser",
71
+ "BaseMessageChunk",
72
+ "CodeSnippets",
73
+ "LanguageModelInput",
74
+ "UsageMetadata",
75
+ "PlayWrightBot",
76
+ "PlaywrightLaunchOptions",
77
+ "PlaywrightOptions",
78
+ "PlaywrightPersistencyOptions",
79
+ "get_default_playwright_launch_options",
80
+ "acaption_markdown_images",
81
+ "caption_markdown_images",
82
+ "MarkdownLink",
83
+ "PdfToMarkdown",
84
+ "extract_text_from_pdf",
85
+ "open_pdf",
86
+ "render_pdf_as_image",
87
+ ]
chatterer/common_types/__init__.py CHANGED
@@ -1,21 +1,21 @@
1
- from .io import (
2
- BytesReadable,
3
- BytesWritable,
4
- FileDescriptorOrPath,
5
- PathOrReadable,
6
- Readable,
7
- StringReadable,
8
- StringWritable,
9
- Writable,
10
- )
11
-
12
- __all__ = [
13
- "BytesReadable",
14
- "BytesWritable",
15
- "FileDescriptorOrPath",
16
- "PathOrReadable",
17
- "Readable",
18
- "StringReadable",
19
- "StringWritable",
20
- "Writable",
21
- ]
1
+ from .io import (
2
+ BytesReadable,
3
+ BytesWritable,
4
+ FileDescriptorOrPath,
5
+ PathOrReadable,
6
+ Readable,
7
+ StringReadable,
8
+ StringWritable,
9
+ Writable,
10
+ )
11
+
12
+ __all__ = [
13
+ "BytesReadable",
14
+ "BytesWritable",
15
+ "FileDescriptorOrPath",
16
+ "PathOrReadable",
17
+ "Readable",
18
+ "StringReadable",
19
+ "StringWritable",
20
+ "Writable",
21
+ ]
chatterer/common_types/io.py CHANGED
@@ -1,19 +1,19 @@
1
- import os
2
- from io import BufferedReader, BufferedWriter, BytesIO, StringIO, TextIOWrapper
3
- from typing import TypeAlias
4
-
5
- # Type aliases for callback functions and file descriptors
6
- FileDescriptorOrPath: TypeAlias = int | str | bytes | os.PathLike[str] | os.PathLike[bytes]
7
-
8
- # Type aliases for different types of IO objects
9
- BytesReadable: TypeAlias = BytesIO | BufferedReader
10
- BytesWritable: TypeAlias = BytesIO | BufferedWriter
11
- StringReadable: TypeAlias = StringIO | TextIOWrapper
12
- StringWritable: TypeAlias = StringIO | TextIOWrapper
13
-
14
- # Combined type aliases for readable and writable objects
15
- Readable: TypeAlias = BytesReadable | StringReadable
16
- Writable: TypeAlias = BytesWritable | StringWritable
17
-
18
- # Type alias for path or readable object
19
- PathOrReadable: TypeAlias = FileDescriptorOrPath | Readable
1
+ import os
2
+ from io import BufferedReader, BufferedWriter, BytesIO, StringIO, TextIOWrapper
3
+ from typing import TypeAlias
4
+
5
+ # Type aliases for callback functions and file descriptors
6
+ FileDescriptorOrPath: TypeAlias = int | str | bytes | os.PathLike[str] | os.PathLike[bytes]
7
+
8
+ # Type aliases for different types of IO objects
9
+ BytesReadable: TypeAlias = BytesIO | BufferedReader
10
+ BytesWritable: TypeAlias = BytesIO | BufferedWriter
11
+ StringReadable: TypeAlias = StringIO | TextIOWrapper
12
+ StringWritable: TypeAlias = StringIO | TextIOWrapper
13
+
14
+ # Combined type aliases for readable and writable objects
15
+ Readable: TypeAlias = BytesReadable | StringReadable
16
+ Writable: TypeAlias = BytesWritable | StringWritable
17
+
18
+ # Type alias for path or readable object
19
+ PathOrReadable: TypeAlias = FileDescriptorOrPath | Readable
chatterer/examples/__main__.py CHANGED
@@ -1,75 +1,75 @@
1
- from spargear import SubcommandArguments, SubcommandSpec
2
-
3
-
4
- def any2md():
5
- from .any2md import Arguments
6
-
7
- return Arguments
8
-
9
-
10
- def pdf2md():
11
- from .pdf2md import Arguments
12
-
13
- return Arguments
14
-
15
-
16
- def pdf2txt():
17
- from .pdf2txt import Arguments
18
-
19
- return Arguments
20
-
21
-
22
- def ppt():
23
- from .ppt import Arguments
24
-
25
- return Arguments
26
-
27
-
28
- def pw():
29
- from .pw import Arguments
30
-
31
- return Arguments
32
-
33
-
34
- def snippet():
35
- from .snippet import Arguments
36
-
37
- return Arguments
38
-
39
-
40
- def transcribe():
41
- from .transcribe import Arguments
42
-
43
- return Arguments
44
-
45
-
46
- def upstage():
47
- from .upstage import Arguments
48
-
49
- return Arguments
50
-
51
-
52
- def web2md():
53
- from .web2md import Arguments
54
-
55
- return Arguments
56
-
57
-
58
- class Arguments(SubcommandArguments):
59
- any2md = SubcommandSpec(name="any2md", argument_class_factory=any2md)
60
- pdf2md = SubcommandSpec(name="pdf2md", argument_class_factory=pdf2md)
61
- pdf2txt = SubcommandSpec(name="pdf2txt", argument_class_factory=pdf2txt)
62
- ppt = SubcommandSpec(name="ppt", argument_class_factory=ppt)
63
- pw = SubcommandSpec(name="pw", argument_class_factory=pw)
64
- snippet = SubcommandSpec(name="snippet", argument_class_factory=snippet)
65
- transcribe = SubcommandSpec(name="transcribe", argument_class_factory=transcribe)
66
- upstage = SubcommandSpec(name="upstage", argument_class_factory=upstage)
67
- web2md = SubcommandSpec(name="web2md", argument_class_factory=web2md)
68
-
69
-
70
- def main():
71
- Arguments().execute()
72
-
73
-
74
- if __name__ == "__main__":
75
- main()
1
+ from spargear import SubcommandArguments, SubcommandSpec
2
+
3
+
4
+ def any2md():
5
+ from .any2md import Arguments
6
+
7
+ return Arguments
8
+
9
+
10
+ def pdf2md():
11
+ from .pdf2md import Arguments
12
+
13
+ return Arguments
14
+
15
+
16
+ def pdf2txt():
17
+ from .pdf2txt import Arguments
18
+
19
+ return Arguments
20
+
21
+
22
+ def ppt():
23
+ from .ppt import Arguments
24
+
25
+ return Arguments
26
+
27
+
28
+ def pw():
29
+ from .pw import Arguments
30
+
31
+ return Arguments
32
+
33
+
34
+ def snippet():
35
+ from .snippet import Arguments
36
+
37
+ return Arguments
38
+
39
+
40
+ def transcribe():
41
+ from .transcribe import Arguments
42
+
43
+ return Arguments
44
+
45
+
46
+ def upstage():
47
+ from .upstage import Arguments
48
+
49
+ return Arguments
50
+
51
+
52
+ def web2md():
53
+ from .web2md import Arguments
54
+
55
+ return Arguments
56
+
57
+
58
+ class Arguments(SubcommandArguments):
59
+ any2md = SubcommandSpec(name="any2md", argument_class_factory=any2md)
60
+ pdf2md = SubcommandSpec(name="pdf2md", argument_class_factory=pdf2md)
61
+ pdf2txt = SubcommandSpec(name="pdf2txt", argument_class_factory=pdf2txt)
62
+ ppt = SubcommandSpec(name="ppt", argument_class_factory=ppt)
63
+ pw = SubcommandSpec(name="pw", argument_class_factory=pw)
64
+ snippet = SubcommandSpec(name="snippet", argument_class_factory=snippet)
65
+ transcribe = SubcommandSpec(name="transcribe", argument_class_factory=transcribe)
66
+ upstage = SubcommandSpec(name="upstage", argument_class_factory=upstage)
67
+ web2md = SubcommandSpec(name="web2md", argument_class_factory=web2md)
68
+
69
+
70
+ def main():
71
+ Arguments().execute()
72
+
73
+
74
+ if __name__ == "__main__":
75
+ main()
chatterer/examples/any2md.py CHANGED
@@ -1,85 +1,85 @@
1
- import logging
2
- from pathlib import Path
3
- from typing import Optional, TypedDict
4
-
5
- import openai
6
- from spargear import RunnableArguments
7
-
8
- from chatterer import anything_to_markdown
9
-
10
- logger = logging.getLogger(__name__)
11
-
12
-
13
- class AnythingToMarkdownReturns(TypedDict):
14
- input: str
15
- output: Optional[str]
16
- out_text: str
17
-
18
-
19
- class Arguments(RunnableArguments[AnythingToMarkdownReturns]):
20
- """Command line arguments for converting various file types to markdown."""
21
-
22
- SOURCE: str
23
- """Input file to convert to markdown. Can be a file path or a URL."""
24
- output: Optional[str] = None
25
- """Output path for the converted markdown file. If not provided, the input file's suffix is replaced with .md"""
26
- model: Optional[str] = None
27
- """OpenAI Model to use for conversion"""
28
- api_key: Optional[str] = None
29
- """API key for OpenAI API"""
30
- base_url: Optional[str] = None
31
- """Base URL for OpenAI API"""
32
- style_map: Optional[str] = None
33
- """Output style map"""
34
- exiftool_path: Optional[str] = None
35
- """"Path to exiftool for metadata extraction"""
36
- docintel_endpoint: Optional[str] = None
37
- "Document Intelligence API endpoint"
38
- prevent_save_file: bool = False
39
- """Prevent saving the converted file to disk."""
40
- encoding: str = "utf-8"
41
- """Encoding for the output file."""
42
-
43
- def run(self) -> AnythingToMarkdownReturns:
44
- if not self.prevent_save_file:
45
- if not self.output:
46
- output = Path(self.SOURCE).with_suffix(".md")
47
- else:
48
- output = Path(self.output)
49
- else:
50
- output = None
51
-
52
- if self.model:
53
- llm_client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
54
- llm_model = self.model
55
- else:
56
- llm_client = None
57
- llm_model = None
58
-
59
- text: str = anything_to_markdown(
60
- self.SOURCE,
61
- llm_client=llm_client,
62
- llm_model=llm_model,
63
- style_map=self.style_map,
64
- exiftool_path=self.exiftool_path,
65
- docintel_endpoint=self.docintel_endpoint,
66
- )
67
- if output:
68
- output.parent.mkdir(parents=True, exist_ok=True)
69
- output.write_text(text, encoding=self.encoding)
70
- logger.info(f"Converted `{self.SOURCE}` to markdown and saved to `{output}`.")
71
- else:
72
- logger.info(f"Converted `{self.SOURCE}` to markdown.")
73
- return {
74
- "input": self.SOURCE,
75
- "output": str(output) if output is not None else None,
76
- "out_text": text,
77
- }
78
-
79
-
80
- def main() -> None:
81
- Arguments().run()
82
-
83
-
84
- if __name__ == "__main__":
85
- main()
1
+ import logging
2
+ from pathlib import Path
3
+ from typing import Optional, TypedDict
4
+
5
+ import openai
6
+ from spargear import RunnableArguments
7
+
8
+ from chatterer import anything_to_markdown
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ class AnythingToMarkdownReturns(TypedDict):
14
+ input: str
15
+ output: Optional[str]
16
+ out_text: str
17
+
18
+
19
+ class Arguments(RunnableArguments[AnythingToMarkdownReturns]):
20
+ """Command line arguments for converting various file types to markdown."""
21
+
22
+ SOURCE: str
23
+ """Input file to convert to markdown. Can be a file path or a URL."""
24
+ output: Optional[str] = None
25
+ """Output path for the converted markdown file. If not provided, the input file's suffix is replaced with .md"""
26
+ model: Optional[str] = None
27
+ """OpenAI Model to use for conversion"""
28
+ api_key: Optional[str] = None
29
+ """API key for OpenAI API"""
30
+ base_url: Optional[str] = None
31
+ """Base URL for OpenAI API"""
32
+ style_map: Optional[str] = None
33
+ """Output style map"""
34
+ exiftool_path: Optional[str] = None
35
+ """"Path to exiftool for metadata extraction"""
36
+ docintel_endpoint: Optional[str] = None
37
+ "Document Intelligence API endpoint"
38
+ prevent_save_file: bool = False
39
+ """Prevent saving the converted file to disk."""
40
+ encoding: str = "utf-8"
41
+ """Encoding for the output file."""
42
+
43
+ def run(self) -> AnythingToMarkdownReturns:
44
+ if not self.prevent_save_file:
45
+ if not self.output:
46
+ output = Path(self.SOURCE).with_suffix(".md")
47
+ else:
48
+ output = Path(self.output)
49
+ else:
50
+ output = None
51
+
52
+ if self.model:
53
+ llm_client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
54
+ llm_model = self.model
55
+ else:
56
+ llm_client = None
57
+ llm_model = None
58
+
59
+ text: str = anything_to_markdown(
60
+ self.SOURCE,
61
+ llm_client=llm_client,
62
+ llm_model=llm_model,
63
+ style_map=self.style_map,
64
+ exiftool_path=self.exiftool_path,
65
+ docintel_endpoint=self.docintel_endpoint,
66
+ )
67
+ if output:
68
+ output.parent.mkdir(parents=True, exist_ok=True)
69
+ output.write_text(text, encoding=self.encoding)
70
+ logger.info(f"Converted `{self.SOURCE}` to markdown and saved to `{output}`.")
71
+ else:
72
+ logger.info(f"Converted `{self.SOURCE}` to markdown.")
73
+ return {
74
+ "input": self.SOURCE,
75
+ "output": str(output) if output is not None else None,
76
+ "out_text": text,
77
+ }
78
+
79
+
80
+ def main() -> None:
81
+ Arguments().run()
82
+
83
+
84
+ if __name__ == "__main__":
85
+ main()