chatterer 0.1.24__py3-none-any.whl → 0.1.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatterer/__init__.py +97 -93
- chatterer/common_types/__init__.py +21 -21
- chatterer/common_types/io.py +19 -19
- chatterer/examples/__main__.py +75 -75
- chatterer/examples/any2md.py +85 -85
- chatterer/examples/pdf2md.py +338 -338
- chatterer/examples/pdf2txt.py +54 -54
- chatterer/examples/ppt.py +486 -486
- chatterer/examples/pw.py +143 -137
- chatterer/examples/snippet.py +56 -55
- chatterer/examples/transcribe.py +192 -112
- chatterer/examples/upstage.py +89 -89
- chatterer/examples/web2md.py +80 -66
- chatterer/interactive.py +354 -354
- chatterer/language_model.py +536 -536
- chatterer/messages.py +21 -21
- chatterer/strategies/__init__.py +13 -13
- chatterer/strategies/atom_of_thoughts.py +975 -975
- chatterer/strategies/base.py +14 -14
- chatterer/tools/__init__.py +46 -46
- chatterer/tools/caption_markdown_images.py +384 -384
- chatterer/tools/citation_chunking/__init__.py +3 -3
- chatterer/tools/citation_chunking/chunks.py +53 -53
- chatterer/tools/citation_chunking/citation_chunker.py +118 -118
- chatterer/tools/citation_chunking/citations.py +285 -285
- chatterer/tools/citation_chunking/prompt.py +157 -157
- chatterer/tools/citation_chunking/reference.py +26 -26
- chatterer/tools/citation_chunking/utils.py +138 -138
- chatterer/tools/convert_pdf_to_markdown.py +645 -625
- chatterer/tools/convert_to_text.py +446 -446
- chatterer/tools/upstage_document_parser.py +705 -705
- chatterer/tools/webpage_to_markdown.py +739 -739
- chatterer/tools/youtube.py +146 -146
- chatterer/utils/__init__.py +15 -15
- chatterer/utils/base64_image.py +293 -285
- chatterer/utils/bytesio.py +59 -59
- chatterer/utils/code_agent.py +237 -237
- chatterer/utils/imghdr.py +148 -148
- {chatterer-0.1.24.dist-info → chatterer-0.1.25.dist-info}/METADATA +390 -389
- chatterer-0.1.25.dist-info/RECORD +45 -0
- chatterer-0.1.24.dist-info/RECORD +0 -45
- {chatterer-0.1.24.dist-info → chatterer-0.1.25.dist-info}/WHEEL +0 -0
- {chatterer-0.1.24.dist-info → chatterer-0.1.25.dist-info}/entry_points.txt +0 -0
- {chatterer-0.1.24.dist-info → chatterer-0.1.25.dist-info}/top_level.txt +0 -0
chatterer/__init__.py
CHANGED
@@ -1,93 +1,97 @@
|
|
1
|
-
from
|
2
|
-
|
3
|
-
from .
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
"
|
57
|
-
"
|
58
|
-
"
|
59
|
-
"
|
60
|
-
"
|
61
|
-
"
|
62
|
-
"
|
63
|
-
"
|
64
|
-
"
|
65
|
-
"
|
66
|
-
"
|
67
|
-
"
|
68
|
-
"
|
69
|
-
"
|
70
|
-
"
|
71
|
-
"
|
72
|
-
"
|
73
|
-
"
|
74
|
-
"
|
75
|
-
"
|
76
|
-
"
|
77
|
-
"
|
78
|
-
"
|
79
|
-
"
|
80
|
-
"
|
81
|
-
"
|
82
|
-
"
|
83
|
-
"
|
84
|
-
"
|
85
|
-
"
|
86
|
-
"
|
87
|
-
"
|
88
|
-
"
|
89
|
-
"
|
90
|
-
"
|
91
|
-
"
|
92
|
-
"
|
93
|
-
|
1
|
+
from dotenv import load_dotenv
|
2
|
+
|
3
|
+
from .interactive import interactive_shell
|
4
|
+
from .language_model import Chatterer
|
5
|
+
from .messages import (
|
6
|
+
AIMessage,
|
7
|
+
BaseMessage,
|
8
|
+
BaseMessageChunk,
|
9
|
+
FunctionMessage,
|
10
|
+
HumanMessage,
|
11
|
+
LanguageModelInput,
|
12
|
+
SystemMessage,
|
13
|
+
UsageMetadata,
|
14
|
+
)
|
15
|
+
from .strategies import (
|
16
|
+
AoTPipeline,
|
17
|
+
AoTPrompter,
|
18
|
+
AoTStrategy,
|
19
|
+
BaseStrategy,
|
20
|
+
)
|
21
|
+
from .tools import (
|
22
|
+
CodeSnippets,
|
23
|
+
MarkdownLink,
|
24
|
+
PdfToMarkdown,
|
25
|
+
PlayWrightBot,
|
26
|
+
PlaywrightLaunchOptions,
|
27
|
+
PlaywrightOptions,
|
28
|
+
PlaywrightPersistencyOptions,
|
29
|
+
UpstageDocumentParseParser,
|
30
|
+
acaption_markdown_images,
|
31
|
+
anything_to_markdown,
|
32
|
+
caption_markdown_images,
|
33
|
+
citation_chunker,
|
34
|
+
extract_text_from_pdf,
|
35
|
+
get_default_html_to_markdown_options,
|
36
|
+
get_default_playwright_launch_options,
|
37
|
+
get_youtube_video_details,
|
38
|
+
get_youtube_video_subtitle,
|
39
|
+
html_to_markdown,
|
40
|
+
open_pdf,
|
41
|
+
pdf_to_text,
|
42
|
+
pyscripts_to_snippets,
|
43
|
+
render_pdf_as_image,
|
44
|
+
)
|
45
|
+
from .utils import (
|
46
|
+
Base64Image,
|
47
|
+
CodeExecutionResult,
|
48
|
+
FunctionSignature,
|
49
|
+
get_default_repl_tool,
|
50
|
+
insert_callables_into_global,
|
51
|
+
)
|
52
|
+
|
53
|
+
load_dotenv()
|
54
|
+
|
55
|
+
__all__ = [
|
56
|
+
"BaseStrategy",
|
57
|
+
"Chatterer",
|
58
|
+
"AoTStrategy",
|
59
|
+
"AoTPipeline",
|
60
|
+
"AoTPrompter",
|
61
|
+
"html_to_markdown",
|
62
|
+
"anything_to_markdown",
|
63
|
+
"pdf_to_text",
|
64
|
+
"get_default_html_to_markdown_options",
|
65
|
+
"pyscripts_to_snippets",
|
66
|
+
"citation_chunker",
|
67
|
+
"BaseMessage",
|
68
|
+
"HumanMessage",
|
69
|
+
"SystemMessage",
|
70
|
+
"AIMessage",
|
71
|
+
"FunctionMessage",
|
72
|
+
"Base64Image",
|
73
|
+
"FunctionSignature",
|
74
|
+
"CodeExecutionResult",
|
75
|
+
"get_default_repl_tool",
|
76
|
+
"insert_callables_into_global",
|
77
|
+
"get_youtube_video_subtitle",
|
78
|
+
"get_youtube_video_details",
|
79
|
+
"interactive_shell",
|
80
|
+
"UpstageDocumentParseParser",
|
81
|
+
"BaseMessageChunk",
|
82
|
+
"CodeSnippets",
|
83
|
+
"LanguageModelInput",
|
84
|
+
"UsageMetadata",
|
85
|
+
"PlayWrightBot",
|
86
|
+
"PlaywrightLaunchOptions",
|
87
|
+
"PlaywrightOptions",
|
88
|
+
"PlaywrightPersistencyOptions",
|
89
|
+
"get_default_playwright_launch_options",
|
90
|
+
"acaption_markdown_images",
|
91
|
+
"caption_markdown_images",
|
92
|
+
"MarkdownLink",
|
93
|
+
"PdfToMarkdown",
|
94
|
+
"extract_text_from_pdf",
|
95
|
+
"open_pdf",
|
96
|
+
"render_pdf_as_image",
|
97
|
+
]
|
chatterer/common_types/__init__.py
CHANGED
@@ -1,21 +1,21 @@
|
|
1
|
-
from .io import (
|
2
|
-
BytesReadable,
|
3
|
-
BytesWritable,
|
4
|
-
FileDescriptorOrPath,
|
5
|
-
PathOrReadable,
|
6
|
-
Readable,
|
7
|
-
StringReadable,
|
8
|
-
StringWritable,
|
9
|
-
Writable,
|
10
|
-
)
|
11
|
-
|
12
|
-
__all__ = [
|
13
|
-
"BytesReadable",
|
14
|
-
"BytesWritable",
|
15
|
-
"FileDescriptorOrPath",
|
16
|
-
"PathOrReadable",
|
17
|
-
"Readable",
|
18
|
-
"StringReadable",
|
19
|
-
"StringWritable",
|
20
|
-
"Writable",
|
21
|
-
]
|
1
|
+
from .io import (
|
2
|
+
BytesReadable,
|
3
|
+
BytesWritable,
|
4
|
+
FileDescriptorOrPath,
|
5
|
+
PathOrReadable,
|
6
|
+
Readable,
|
7
|
+
StringReadable,
|
8
|
+
StringWritable,
|
9
|
+
Writable,
|
10
|
+
)
|
11
|
+
|
12
|
+
__all__ = [
|
13
|
+
"BytesReadable",
|
14
|
+
"BytesWritable",
|
15
|
+
"FileDescriptorOrPath",
|
16
|
+
"PathOrReadable",
|
17
|
+
"Readable",
|
18
|
+
"StringReadable",
|
19
|
+
"StringWritable",
|
20
|
+
"Writable",
|
21
|
+
]
|
chatterer/common_types/io.py
CHANGED
@@ -1,19 +1,19 @@
|
|
1
|
-
import os
|
2
|
-
from io import BufferedReader, BufferedWriter, BytesIO, StringIO, TextIOWrapper
|
3
|
-
from typing import TypeAlias
|
4
|
-
|
5
|
-
# Type aliases for callback functions and file descriptors
|
6
|
-
FileDescriptorOrPath: TypeAlias = int | str | bytes | os.PathLike[str] | os.PathLike[bytes]
|
7
|
-
|
8
|
-
# Type aliases for different types of IO objects
|
9
|
-
BytesReadable: TypeAlias = BytesIO | BufferedReader
|
10
|
-
BytesWritable: TypeAlias = BytesIO | BufferedWriter
|
11
|
-
StringReadable: TypeAlias = StringIO | TextIOWrapper
|
12
|
-
StringWritable: TypeAlias = StringIO | TextIOWrapper
|
13
|
-
|
14
|
-
# Combined type aliases for readable and writable objects
|
15
|
-
Readable: TypeAlias = BytesReadable | StringReadable
|
16
|
-
Writable: TypeAlias = BytesWritable | StringWritable
|
17
|
-
|
18
|
-
# Type alias for path or readable object
|
19
|
-
PathOrReadable: TypeAlias = FileDescriptorOrPath | Readable
|
1
|
+
import os
|
2
|
+
from io import BufferedReader, BufferedWriter, BytesIO, StringIO, TextIOWrapper
|
3
|
+
from typing import TypeAlias
|
4
|
+
|
5
|
+
# Type aliases for callback functions and file descriptors
|
6
|
+
FileDescriptorOrPath: TypeAlias = int | str | bytes | os.PathLike[str] | os.PathLike[bytes]
|
7
|
+
|
8
|
+
# Type aliases for different types of IO objects
|
9
|
+
BytesReadable: TypeAlias = BytesIO | BufferedReader
|
10
|
+
BytesWritable: TypeAlias = BytesIO | BufferedWriter
|
11
|
+
StringReadable: TypeAlias = StringIO | TextIOWrapper
|
12
|
+
StringWritable: TypeAlias = StringIO | TextIOWrapper
|
13
|
+
|
14
|
+
# Combined type aliases for readable and writable objects
|
15
|
+
Readable: TypeAlias = BytesReadable | StringReadable
|
16
|
+
Writable: TypeAlias = BytesWritable | StringWritable
|
17
|
+
|
18
|
+
# Type alias for path or readable object
|
19
|
+
PathOrReadable: TypeAlias = FileDescriptorOrPath | Readable
|
chatterer/examples/__main__.py
CHANGED
@@ -1,75 +1,75 @@
|
|
1
|
-
from spargear import SubcommandArguments, SubcommandSpec
|
2
|
-
|
3
|
-
|
4
|
-
def any2md():
    """Return the ``Arguments`` class of the sibling ``any2md`` module.

    The import happens inside the function body, so the module is loaded
    only when this factory is called.
    """
    from .any2md import Arguments as any2md_arguments

    return any2md_arguments
|
8
|
-
|
9
|
-
|
10
|
-
def pdf2md():
    """Return the ``Arguments`` class of the sibling ``pdf2md`` module.

    The import happens inside the function body, so the module is loaded
    only when this factory is called.
    """
    from .pdf2md import Arguments as pdf2md_arguments

    return pdf2md_arguments
|
14
|
-
|
15
|
-
|
16
|
-
def pdf2txt():
    """Return the ``Arguments`` class of the sibling ``pdf2txt`` module.

    The import happens inside the function body, so the module is loaded
    only when this factory is called.
    """
    from .pdf2txt import Arguments as pdf2txt_arguments

    return pdf2txt_arguments
|
20
|
-
|
21
|
-
|
22
|
-
def ppt():
    """Return the ``Arguments`` class of the sibling ``ppt`` module.

    The import happens inside the function body, so the module is loaded
    only when this factory is called.
    """
    from .ppt import Arguments as ppt_arguments

    return ppt_arguments
|
26
|
-
|
27
|
-
|
28
|
-
def pw():
    """Return the ``Arguments`` class of the sibling ``pw`` module.

    The import happens inside the function body, so the module is loaded
    only when this factory is called.
    """
    from .pw import Arguments as pw_arguments

    return pw_arguments
|
32
|
-
|
33
|
-
|
34
|
-
def snippet():
    """Return the ``Arguments`` class of the sibling ``snippet`` module.

    The import happens inside the function body, so the module is loaded
    only when this factory is called.
    """
    from .snippet import Arguments as snippet_arguments

    return snippet_arguments
|
38
|
-
|
39
|
-
|
40
|
-
def transcribe():
    """Return the ``Arguments`` class of the sibling ``transcribe`` module.

    The import happens inside the function body, so the module is loaded
    only when this factory is called.
    """
    from .transcribe import Arguments as transcribe_arguments

    return transcribe_arguments
|
44
|
-
|
45
|
-
|
46
|
-
def upstage():
    """Return the ``Arguments`` class of the sibling ``upstage`` module.

    The import happens inside the function body, so the module is loaded
    only when this factory is called.
    """
    from .upstage import Arguments as upstage_arguments

    return upstage_arguments
|
50
|
-
|
51
|
-
|
52
|
-
def web2md():
    """Return the ``Arguments`` class of the sibling ``web2md`` module.

    The import happens inside the function body, so the module is loaded
    only when this factory is called.
    """
    from .web2md import Arguments as web2md_arguments

    return web2md_arguments
|
56
|
-
|
57
|
-
|
58
|
-
class Arguments(SubcommandArguments):
    """Top-level argument class declaring one ``SubcommandSpec`` per example.

    Each spec is given the module-level factory function of the same name as
    its ``argument_class_factory``.
    """

    # On every line the right-hand side is evaluated before the class
    # attribute exists, so the bare name still refers to the module-level
    # factory function; the attribute then shadows it inside the class only.
    any2md = SubcommandSpec(argument_class_factory=any2md, name="any2md")
    pdf2md = SubcommandSpec(argument_class_factory=pdf2md, name="pdf2md")
    pdf2txt = SubcommandSpec(argument_class_factory=pdf2txt, name="pdf2txt")
    ppt = SubcommandSpec(argument_class_factory=ppt, name="ppt")
    pw = SubcommandSpec(argument_class_factory=pw, name="pw")
    snippet = SubcommandSpec(argument_class_factory=snippet, name="snippet")
    transcribe = SubcommandSpec(argument_class_factory=transcribe, name="transcribe")
    upstage = SubcommandSpec(argument_class_factory=upstage, name="upstage")
    web2md = SubcommandSpec(argument_class_factory=web2md, name="web2md")
|
68
|
-
|
69
|
-
|
70
|
-
def main():
    """Instantiate the top-level ``Arguments`` and call its ``execute()``."""
    cli = Arguments()
    cli.execute()


# Only run the CLI when this module is executed as the entry point.
if __name__ == "__main__":
    main()
|
1
|
+
from spargear import SubcommandArguments, SubcommandSpec
|
2
|
+
|
3
|
+
|
4
|
+
def any2md():
    """Return the ``Arguments`` class of the sibling ``any2md`` module.

    The import happens inside the function body, so the module is loaded
    only when this factory is called.
    """
    from .any2md import Arguments as any2md_arguments

    return any2md_arguments
|
8
|
+
|
9
|
+
|
10
|
+
def pdf2md():
    """Return the ``Arguments`` class of the sibling ``pdf2md`` module.

    The import happens inside the function body, so the module is loaded
    only when this factory is called.
    """
    from .pdf2md import Arguments as pdf2md_arguments

    return pdf2md_arguments
|
14
|
+
|
15
|
+
|
16
|
+
def pdf2txt():
    """Return the ``Arguments`` class of the sibling ``pdf2txt`` module.

    The import happens inside the function body, so the module is loaded
    only when this factory is called.
    """
    from .pdf2txt import Arguments as pdf2txt_arguments

    return pdf2txt_arguments
|
20
|
+
|
21
|
+
|
22
|
+
def ppt():
    """Return the ``Arguments`` class of the sibling ``ppt`` module.

    The import happens inside the function body, so the module is loaded
    only when this factory is called.
    """
    from .ppt import Arguments as ppt_arguments

    return ppt_arguments
|
26
|
+
|
27
|
+
|
28
|
+
def pw():
    """Return the ``Arguments`` class of the sibling ``pw`` module.

    The import happens inside the function body, so the module is loaded
    only when this factory is called.
    """
    from .pw import Arguments as pw_arguments

    return pw_arguments
|
32
|
+
|
33
|
+
|
34
|
+
def snippet():
    """Return the ``Arguments`` class of the sibling ``snippet`` module.

    The import happens inside the function body, so the module is loaded
    only when this factory is called.
    """
    from .snippet import Arguments as snippet_arguments

    return snippet_arguments
|
38
|
+
|
39
|
+
|
40
|
+
def transcribe():
    """Return the ``Arguments`` class of the sibling ``transcribe`` module.

    The import happens inside the function body, so the module is loaded
    only when this factory is called.
    """
    from .transcribe import Arguments as transcribe_arguments

    return transcribe_arguments
|
44
|
+
|
45
|
+
|
46
|
+
def upstage():
    """Return the ``Arguments`` class of the sibling ``upstage`` module.

    The import happens inside the function body, so the module is loaded
    only when this factory is called.
    """
    from .upstage import Arguments as upstage_arguments

    return upstage_arguments
|
50
|
+
|
51
|
+
|
52
|
+
def web2md():
    """Return the ``Arguments`` class of the sibling ``web2md`` module.

    The import happens inside the function body, so the module is loaded
    only when this factory is called.
    """
    from .web2md import Arguments as web2md_arguments

    return web2md_arguments
|
56
|
+
|
57
|
+
|
58
|
+
class Arguments(SubcommandArguments):
    """Top-level argument class declaring one ``SubcommandSpec`` per example.

    Each spec is given the module-level factory function of the same name as
    its ``argument_class_factory``.
    """

    # On every line the right-hand side is evaluated before the class
    # attribute exists, so the bare name still refers to the module-level
    # factory function; the attribute then shadows it inside the class only.
    any2md = SubcommandSpec(argument_class_factory=any2md, name="any2md")
    pdf2md = SubcommandSpec(argument_class_factory=pdf2md, name="pdf2md")
    pdf2txt = SubcommandSpec(argument_class_factory=pdf2txt, name="pdf2txt")
    ppt = SubcommandSpec(argument_class_factory=ppt, name="ppt")
    pw = SubcommandSpec(argument_class_factory=pw, name="pw")
    snippet = SubcommandSpec(argument_class_factory=snippet, name="snippet")
    transcribe = SubcommandSpec(argument_class_factory=transcribe, name="transcribe")
    upstage = SubcommandSpec(argument_class_factory=upstage, name="upstage")
    web2md = SubcommandSpec(argument_class_factory=web2md, name="web2md")
|
68
|
+
|
69
|
+
|
70
|
+
def main():
    """Instantiate the top-level ``Arguments`` and call its ``execute()``."""
    cli = Arguments()
    cli.execute()


# Only run the CLI when this module is executed as the entry point.
if __name__ == "__main__":
    main()
|
chatterer/examples/any2md.py
CHANGED
@@ -1,85 +1,85 @@
|
|
1
|
-
import logging
|
2
|
-
from pathlib import Path
|
3
|
-
from typing import Optional, TypedDict
|
4
|
-
|
5
|
-
import openai
|
6
|
-
from spargear import RunnableArguments
|
7
|
-
|
8
|
-
from chatterer import anything_to_markdown
|
9
|
-
|
10
|
-
logger = logging.getLogger(__name__)
|
11
|
-
|
12
|
-
|
13
|
-
class AnythingToMarkdownReturns(TypedDict):
    """Shape of the mapping returned by ``Arguments.run``."""

    # The SOURCE value that was converted.
    input: str
    # String form of the written output path, or None when nothing was saved.
    output: Optional[str]
    # The converted markdown text.
    out_text: str
|
17
|
-
|
18
|
-
|
19
|
-
class Arguments(RunnableArguments[AnythingToMarkdownReturns]):
    """Command line arguments for converting various file types to markdown."""

    SOURCE: str
    """Input file to convert to markdown. Can be a file path or a URL."""
    output: Optional[str] = None
    """Output path for the converted markdown file. If not provided, the input file's suffix is replaced with .md"""
    model: Optional[str] = None
    """OpenAI Model to use for conversion"""
    api_key: Optional[str] = None
    """API key for OpenAI API"""
    base_url: Optional[str] = None
    """Base URL for OpenAI API"""
    style_map: Optional[str] = None
    """Output style map"""
    exiftool_path: Optional[str] = None
    """Path to exiftool for metadata extraction"""
    docintel_endpoint: Optional[str] = None
    """Document Intelligence API endpoint"""
    prevent_save_file: bool = False
    """Prevent saving the converted file to disk."""
    encoding: str = "utf-8"
    """Encoding for the output file."""

    def run(self) -> AnythingToMarkdownReturns:
        """Convert ``SOURCE`` to markdown and, unless disabled, write it to disk.

        Returns:
            A mapping with ``SOURCE`` under ``"input"``, the written path as a
            string under ``"output"`` (``None`` when ``prevent_save_file`` is
            set), and the markdown text under ``"out_text"``.
        """
        # Resolve the destination: nothing when saving is disabled, the
        # explicit ``output`` when given, otherwise SOURCE with a ``.md`` suffix.
        output: Optional[Path]
        if self.prevent_save_file:
            output = None
        elif self.output:
            output = Path(self.output)
        else:
            # NOTE(review): SOURCE may be a URL; this still derives a local
            # path from it -- confirm that is the intended default.
            output = Path(self.SOURCE).with_suffix(".md")

        # An OpenAI client is built only when a model is named; ``api_key``
        # and ``base_url`` are not used otherwise.
        if self.model:
            llm_client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
            llm_model = self.model
        else:
            llm_client = None
            llm_model = None

        text: str = anything_to_markdown(
            self.SOURCE,
            llm_client=llm_client,
            llm_model=llm_model,
            style_map=self.style_map,
            exiftool_path=self.exiftool_path,
            docintel_endpoint=self.docintel_endpoint,
        )

        if output is not None:
            # Create missing parent directories before writing.
            output.parent.mkdir(parents=True, exist_ok=True)
            output.write_text(text, encoding=self.encoding)
            logger.info(f"Converted `{self.SOURCE}` to markdown and saved to `{output}`.")
        else:
            logger.info(f"Converted `{self.SOURCE}` to markdown.")

        return {
            "input": self.SOURCE,
            "output": str(output) if output is not None else None,
            "out_text": text,
        }
|
78
|
-
|
79
|
-
|
80
|
-
def main() -> None:
    """Instantiate ``Arguments`` and call ``run()``, discarding the result."""
    arguments = Arguments()
    arguments.run()


# Only run the conversion when this module is executed as the entry point.
if __name__ == "__main__":
    main()
|
1
|
+
import logging
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Optional, TypedDict
|
4
|
+
|
5
|
+
import openai
|
6
|
+
from spargear import RunnableArguments
|
7
|
+
|
8
|
+
from chatterer import anything_to_markdown
|
9
|
+
|
10
|
+
logger = logging.getLogger(__name__)
|
11
|
+
|
12
|
+
|
13
|
+
class AnythingToMarkdownReturns(TypedDict):
    """Shape of the mapping returned by ``Arguments.run``."""

    # The SOURCE value that was converted.
    input: str
    # String form of the written output path, or None when nothing was saved.
    output: Optional[str]
    # The converted markdown text.
    out_text: str
|
17
|
+
|
18
|
+
|
19
|
+
class Arguments(RunnableArguments[AnythingToMarkdownReturns]):
    """Command line arguments for converting various file types to markdown."""

    SOURCE: str
    """Input file to convert to markdown. Can be a file path or a URL."""
    output: Optional[str] = None
    """Output path for the converted markdown file. If not provided, the input file's suffix is replaced with .md"""
    model: Optional[str] = None
    """OpenAI Model to use for conversion"""
    api_key: Optional[str] = None
    """API key for OpenAI API"""
    base_url: Optional[str] = None
    """Base URL for OpenAI API"""
    style_map: Optional[str] = None
    """Output style map"""
    exiftool_path: Optional[str] = None
    """Path to exiftool for metadata extraction"""
    docintel_endpoint: Optional[str] = None
    """Document Intelligence API endpoint"""
    prevent_save_file: bool = False
    """Prevent saving the converted file to disk."""
    encoding: str = "utf-8"
    """Encoding for the output file."""

    def run(self) -> AnythingToMarkdownReturns:
        """Convert ``SOURCE`` to markdown and, unless disabled, write it to disk.

        Returns:
            A mapping with ``SOURCE`` under ``"input"``, the written path as a
            string under ``"output"`` (``None`` when ``prevent_save_file`` is
            set), and the markdown text under ``"out_text"``.
        """
        # Resolve the destination: nothing when saving is disabled, the
        # explicit ``output`` when given, otherwise SOURCE with a ``.md`` suffix.
        output: Optional[Path]
        if self.prevent_save_file:
            output = None
        elif self.output:
            output = Path(self.output)
        else:
            # NOTE(review): SOURCE may be a URL; this still derives a local
            # path from it -- confirm that is the intended default.
            output = Path(self.SOURCE).with_suffix(".md")

        # An OpenAI client is built only when a model is named; ``api_key``
        # and ``base_url`` are not used otherwise.
        if self.model:
            llm_client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
            llm_model = self.model
        else:
            llm_client = None
            llm_model = None

        text: str = anything_to_markdown(
            self.SOURCE,
            llm_client=llm_client,
            llm_model=llm_model,
            style_map=self.style_map,
            exiftool_path=self.exiftool_path,
            docintel_endpoint=self.docintel_endpoint,
        )

        if output is not None:
            # Create missing parent directories before writing.
            output.parent.mkdir(parents=True, exist_ok=True)
            output.write_text(text, encoding=self.encoding)
            logger.info(f"Converted `{self.SOURCE}` to markdown and saved to `{output}`.")
        else:
            logger.info(f"Converted `{self.SOURCE}` to markdown.")

        return {
            "input": self.SOURCE,
            "output": str(output) if output is not None else None,
            "out_text": text,
        }
|
78
|
+
|
79
|
+
|
80
|
+
def main() -> None:
    """Instantiate ``Arguments`` and call ``run()``, discarding the result."""
    arguments = Arguments()
    arguments.run()


# Only run the conversion when this module is executed as the entry point.
if __name__ == "__main__":
    main()
|