chatterer 0.1.19__tar.gz → 0.1.21__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {chatterer-0.1.19 → chatterer-0.1.21}/PKG-INFO +2 -2
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/examples/anything_to_markdown.py +21 -31
- chatterer-0.1.21/chatterer/examples/get_code_snippets.py +55 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/examples/login_with_playwright.py +68 -83
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/examples/make_ppt.py +3 -14
- chatterer-0.1.21/chatterer/examples/pdf_to_markdown.py +77 -0
- chatterer-0.1.21/chatterer/examples/pdf_to_text.py +54 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/examples/transcription_api.py +21 -36
- chatterer-0.1.21/chatterer/examples/upstage_parser.py +89 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/examples/webpage_to_markdown.py +19 -28
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/strategies/atom_of_thoughts.py +1 -1
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/tools/convert_pdf_to_markdown.py +105 -14
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/tools/convert_to_text.py +3 -4
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/tools/upstage_document_parser.py +2 -2
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/utils/code_agent.py +1 -1
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer.egg-info/PKG-INFO +2 -2
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer.egg-info/requires.txt +1 -1
- {chatterer-0.1.19 → chatterer-0.1.21}/pyproject.toml +2 -2
- chatterer-0.1.19/chatterer/examples/get_code_snippets.py +0 -64
- chatterer-0.1.19/chatterer/examples/pdf_to_markdown.py +0 -107
- chatterer-0.1.19/chatterer/examples/pdf_to_text.py +0 -60
- chatterer-0.1.19/chatterer/examples/upstage_parser.py +0 -95
- {chatterer-0.1.19 → chatterer-0.1.21}/README.md +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/__init__.py +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/common_types/__init__.py +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/common_types/io.py +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/examples/__init__.py +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/interactive.py +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/language_model.py +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/messages.py +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/py.typed +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/strategies/__init__.py +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/strategies/base.py +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/tools/__init__.py +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/tools/caption_markdown_images.py +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/tools/citation_chunking/__init__.py +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/tools/citation_chunking/chunks.py +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/tools/citation_chunking/citation_chunker.py +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/tools/citation_chunking/citations.py +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/tools/citation_chunking/prompt.py +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/tools/citation_chunking/reference.py +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/tools/citation_chunking/utils.py +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/tools/webpage_to_markdown.py +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/tools/youtube.py +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/utils/__init__.py +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/utils/base64_image.py +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/utils/bytesio.py +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer/utils/imghdr.py +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer.egg-info/SOURCES.txt +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer.egg-info/dependency_links.txt +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer.egg-info/entry_points.txt +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/chatterer.egg-info/top_level.txt +0 -0
- {chatterer-0.1.19 → chatterer-0.1.21}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: chatterer
|
3
|
-
Version: 0.1.19
|
3
|
+
Version: 0.1.21
|
4
4
|
Summary: The highest-level interface for various LLM APIs.
|
5
5
|
Requires-Python: >=3.12
|
6
6
|
Description-Content-Type: text/markdown
|
@@ -11,7 +11,7 @@ Requires-Dist: pillow>=11.1.0
|
|
11
11
|
Requires-Dist: regex>=2024.11.6
|
12
12
|
Requires-Dist: rich>=13.9.4
|
13
13
|
Requires-Dist: colorama>=0.4.6
|
14
|
-
Requires-Dist: spargear>=0.
|
14
|
+
Requires-Dist: spargear>=0.2.0
|
15
15
|
Provides-Extra: dev
|
16
16
|
Requires-Dist: neo4j-extension>=0.1.14; extra == "dev"
|
17
17
|
Requires-Dist: ipykernel>=6.29.5; extra == "dev"
|
@@ -1,36 +1,27 @@
|
|
1
|
-
|
2
|
-
# ruff: noqa: E402
|
3
|
-
import logging
|
4
|
-
import sys
|
5
|
-
|
6
|
-
if __name__ == "__main__" and "." not in sys.path:
|
7
|
-
sys.path.append(".")
|
8
|
-
|
9
|
-
logger = logging.getLogger(__name__)
|
10
|
-
return logger
|
11
|
-
|
12
|
-
|
13
|
-
logger = resolve_import_path_and_get_logger()
|
1
|
+
import logging
|
14
2
|
from pathlib import Path
|
15
3
|
from typing import Optional, TypedDict
|
16
4
|
|
17
5
|
import openai
|
18
|
-
from spargear import ArgumentSpec, BaseArguments
|
6
|
+
from spargear import BaseArguments
|
19
7
|
|
20
8
|
from chatterer import anything_to_markdown
|
21
9
|
|
10
|
+
logger = logging.getLogger(__name__)
|
11
|
+
|
22
12
|
|
23
13
|
class AnythingToMarkdownReturns(TypedDict):
|
24
|
-
|
25
|
-
|
14
|
+
input: str
|
15
|
+
output: Optional[str]
|
26
16
|
out_text: str
|
27
17
|
|
28
18
|
|
29
19
|
class AnythingToMarkdownArguments(BaseArguments):
|
30
20
|
"""Command line arguments for converting various file types to markdown."""
|
31
21
|
|
32
|
-
|
33
|
-
|
22
|
+
input: str
|
23
|
+
"""Input file to convert to markdown. Can be a file path or a URL."""
|
24
|
+
output: Optional[str] = None
|
34
25
|
"""Output path for the converted markdown file. If not provided, the input file's suffix is replaced with .md"""
|
35
26
|
model: Optional[str] = None
|
36
27
|
"""OpenAI Model to use for conversion"""
|
@@ -50,14 +41,13 @@ class AnythingToMarkdownArguments(BaseArguments):
|
|
50
41
|
"""Encoding for the output file."""
|
51
42
|
|
52
43
|
def run(self) -> AnythingToMarkdownReturns:
|
53
|
-
in_path = self.in_path.unwrap()
|
54
44
|
if not self.prevent_save_file:
|
55
|
-
if not self.
|
56
|
-
|
45
|
+
if not self.output:
|
46
|
+
output = Path(self.input).with_suffix(".md")
|
57
47
|
else:
|
58
|
-
|
48
|
+
output = Path(self.output)
|
59
49
|
else:
|
60
|
-
|
50
|
+
output = None
|
61
51
|
|
62
52
|
if self.model:
|
63
53
|
llm_client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
|
@@ -67,22 +57,22 @@ class AnythingToMarkdownArguments(BaseArguments):
|
|
67
57
|
llm_model = None
|
68
58
|
|
69
59
|
text: str = anything_to_markdown(
|
70
|
-
|
60
|
+
self.input,
|
71
61
|
llm_client=llm_client,
|
72
62
|
llm_model=llm_model,
|
73
63
|
style_map=self.style_map,
|
74
64
|
exiftool_path=self.exiftool_path,
|
75
65
|
docintel_endpoint=self.docintel_endpoint,
|
76
66
|
)
|
77
|
-
if
|
78
|
-
|
79
|
-
|
80
|
-
logger.info(f"Converted `{
|
67
|
+
if output:
|
68
|
+
output.parent.mkdir(parents=True, exist_ok=True)
|
69
|
+
output.write_text(text, encoding=self.encoding)
|
70
|
+
logger.info(f"Converted `{self.input}` to markdown and saved to `{output}`.")
|
81
71
|
else:
|
82
|
-
logger.info(f"Converted `{
|
72
|
+
logger.info(f"Converted `{self.input}` to markdown.")
|
83
73
|
return {
|
84
|
-
"
|
85
|
-
"
|
74
|
+
"input": self.input,
|
75
|
+
"output": str(output) if output is not None else None,
|
86
76
|
"out_text": text,
|
87
77
|
}
|
88
78
|
|
@@ -0,0 +1,55 @@
|
|
1
|
+
import logging
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
from spargear import BaseArguments
|
6
|
+
|
7
|
+
from chatterer import CodeSnippets
|
8
|
+
|
9
|
+
logger = logging.getLogger(__name__)
|
10
|
+
|
11
|
+
|
12
|
+
class GetCodeSnippetsArgs(BaseArguments):
|
13
|
+
input: str
|
14
|
+
"""Path to the package or file from which to extract code snippets."""
|
15
|
+
output: Optional[str] = None
|
16
|
+
"""Output path for the extracted code snippets. If not provided, defaults to a file with the same name as the input."""
|
17
|
+
ban_file_patterns: list[str] = [".venv/*", Path(__file__).relative_to(Path.cwd()).as_posix()]
|
18
|
+
"""List of file patterns to ignore."""
|
19
|
+
glob_patterns: list[str] = ["*.py"]
|
20
|
+
"""List of glob patterns to include."""
|
21
|
+
case_sensitive: bool = False
|
22
|
+
"""Enable case-sensitive matching for glob patterns."""
|
23
|
+
prevent_save_file: bool = False
|
24
|
+
"""Prevent saving the extracted code snippets to a file."""
|
25
|
+
|
26
|
+
def run(self) -> CodeSnippets:
|
27
|
+
if not self.prevent_save_file:
|
28
|
+
if not self.output:
|
29
|
+
output = Path(__file__).with_suffix(".txt")
|
30
|
+
else:
|
31
|
+
output = Path(self.output)
|
32
|
+
else:
|
33
|
+
output = None
|
34
|
+
|
35
|
+
cs = CodeSnippets.from_path_or_pkgname(
|
36
|
+
path_or_pkgname=self.input,
|
37
|
+
ban_file_patterns=self.ban_file_patterns,
|
38
|
+
glob_patterns=self.glob_patterns,
|
39
|
+
case_sensitive=self.case_sensitive,
|
40
|
+
)
|
41
|
+
if output is not None:
|
42
|
+
output.parent.mkdir(parents=True, exist_ok=True)
|
43
|
+
output.write_text(cs.snippets_text, encoding="utf-8")
|
44
|
+
logger.info(f"Extracted code snippets from `{self.input}` and saved to `{output}`.")
|
45
|
+
else:
|
46
|
+
logger.info(f"Extracted code snippets from `{self.input}`.")
|
47
|
+
return cs
|
48
|
+
|
49
|
+
|
50
|
+
def main() -> None:
|
51
|
+
GetCodeSnippetsArgs().run()
|
52
|
+
|
53
|
+
|
54
|
+
if __name__ == "__main__":
|
55
|
+
main()
|
@@ -1,17 +1,5 @@
|
|
1
|
-
def resolve_import_path_and_get_logger():
|
2
|
-
# ruff: noqa: E402
|
3
|
-
import logging
|
4
|
-
import sys
|
5
|
-
|
6
|
-
if __name__ == "__main__" and "." not in sys.path:
|
7
|
-
sys.path.append(".")
|
8
|
-
|
9
|
-
logger = logging.getLogger(__name__)
|
10
|
-
return logger
|
11
|
-
|
12
|
-
|
13
|
-
logger = resolve_import_path_and_get_logger()
|
14
1
|
import json
|
2
|
+
import logging
|
15
3
|
import sys
|
16
4
|
from pathlib import Path
|
17
5
|
|
@@ -19,76 +7,8 @@ from spargear import BaseArguments, SubcommandSpec
|
|
19
7
|
|
20
8
|
from chatterer import PlayWrightBot
|
21
9
|
|
10
|
+
logger = logging.getLogger(__name__)
|
22
11
|
|
23
|
-
def read_session(url: str, jsonpath: Path) -> None:
|
24
|
-
"""
|
25
|
-
Loads the session state from the specified JSON file, then navigates
|
26
|
-
to a protected_url that normally requires login. If the stored session
|
27
|
-
is valid, it should open without re-entering credentials.
|
28
|
-
|
29
|
-
Correction: Loads the JSON content into a dict first to satisfy type hints.
|
30
|
-
"""
|
31
|
-
logger.info(f"Loading session from {jsonpath} and navigating to {url} ...")
|
32
|
-
|
33
|
-
if not jsonpath.exists():
|
34
|
-
logger.error(f"Session file not found at {jsonpath}")
|
35
|
-
sys.exit(1)
|
36
|
-
|
37
|
-
# Load the storage state from the JSON file into a dictionary
|
38
|
-
logger.info(f"Reading storage state content from {jsonpath} ...")
|
39
|
-
try:
|
40
|
-
with open(jsonpath, "r", encoding="utf-8") as f:
|
41
|
-
# This dictionary should match the 'StorageState' type expected by Playwright/chatterer
|
42
|
-
storage_state_dict = json.load(f)
|
43
|
-
except json.JSONDecodeError:
|
44
|
-
logger.error(f"Failed to decode JSON from {jsonpath}")
|
45
|
-
sys.exit(1)
|
46
|
-
except Exception as e:
|
47
|
-
logger.error(f"Error reading file {jsonpath}: {e}")
|
48
|
-
sys.exit(1)
|
49
|
-
|
50
|
-
logger.info("Launching browser with loaded session state...")
|
51
|
-
with PlayWrightBot(
|
52
|
-
playwright_launch_options={"headless": False},
|
53
|
-
# Pass the loaded dictionary, which should match the expected 'StorageState' type
|
54
|
-
playwright_persistency_options={"storage_state": storage_state_dict},
|
55
|
-
) as bot:
|
56
|
-
bot.get_page(url)
|
57
|
-
|
58
|
-
logger.info("Press Enter in the console when you're done checking the protected page.")
|
59
|
-
input(" >> Press Enter to exit: ")
|
60
|
-
|
61
|
-
logger.info("Done! Browser is now closed.")
|
62
|
-
|
63
|
-
|
64
|
-
def write_session(url: str, jsonpath: Path) -> None:
|
65
|
-
"""
|
66
|
-
Launches a non-headless browser and navigates to the login_url.
|
67
|
-
The user can manually log in, then press Enter in the console
|
68
|
-
to store the current session state into a JSON file.
|
69
|
-
"""
|
70
|
-
logger.info(f"Launching browser and navigating to {url} ... Please log in manually.")
|
71
|
-
|
72
|
-
# Ensure jsonpath directory exists
|
73
|
-
jsonpath.parent.mkdir(parents=True, exist_ok=True)
|
74
|
-
|
75
|
-
with PlayWrightBot(playwright_launch_options={"headless": False}) as bot:
|
76
|
-
bot.get_page(url)
|
77
|
-
|
78
|
-
logger.info("After completing the login in the browser, press Enter here to save the session.")
|
79
|
-
input(" >> Press Enter when ready: ")
|
80
|
-
|
81
|
-
# get_sync_browser() returns the BrowserContext internally
|
82
|
-
context = bot.get_sync_browser()
|
83
|
-
|
84
|
-
# Save the current session (cookies, localStorage) to a JSON file
|
85
|
-
logger.info(f"Saving storage state to {jsonpath} ...")
|
86
|
-
context.storage_state(path=jsonpath) # Pass Path object directly
|
87
|
-
|
88
|
-
logger.info("Done! Browser is now closed.")
|
89
|
-
|
90
|
-
|
91
|
-
# --- Spargear Declarative CLI Definition ---
|
92
12
|
|
93
13
|
# Define the default path location relative to this script file
|
94
14
|
DEFAULT_JSON_PATH = Path(__file__).resolve().parent / "session_state.json"
|
@@ -160,7 +80,72 @@ class LoginWithPlaywrightArgs(BaseArguments):
|
|
160
80
|
sys.exit(1)
|
161
81
|
|
162
82
|
|
163
|
-
|
83
|
+
def read_session(url: str, jsonpath: Path) -> None:
|
84
|
+
"""
|
85
|
+
Loads the session state from the specified JSON file, then navigates
|
86
|
+
to a protected_url that normally requires login. If the stored session
|
87
|
+
is valid, it should open without re-entering credentials.
|
88
|
+
|
89
|
+
Correction: Loads the JSON content into a dict first to satisfy type hints.
|
90
|
+
"""
|
91
|
+
logger.info(f"Loading session from {jsonpath} and navigating to {url} ...")
|
92
|
+
|
93
|
+
if not jsonpath.exists():
|
94
|
+
logger.error(f"Session file not found at {jsonpath}")
|
95
|
+
sys.exit(1)
|
96
|
+
|
97
|
+
# Load the storage state from the JSON file into a dictionary
|
98
|
+
logger.info(f"Reading storage state content from {jsonpath} ...")
|
99
|
+
try:
|
100
|
+
with open(jsonpath, "r", encoding="utf-8") as f:
|
101
|
+
# This dictionary should match the 'StorageState' type expected by Playwright/chatterer
|
102
|
+
storage_state_dict = json.load(f)
|
103
|
+
except json.JSONDecodeError:
|
104
|
+
logger.error(f"Failed to decode JSON from {jsonpath}")
|
105
|
+
sys.exit(1)
|
106
|
+
except Exception as e:
|
107
|
+
logger.error(f"Error reading file {jsonpath}: {e}")
|
108
|
+
sys.exit(1)
|
109
|
+
|
110
|
+
logger.info("Launching browser with loaded session state...")
|
111
|
+
with PlayWrightBot(
|
112
|
+
playwright_launch_options={"headless": False},
|
113
|
+
# Pass the loaded dictionary, which should match the expected 'StorageState' type
|
114
|
+
playwright_persistency_options={"storage_state": storage_state_dict},
|
115
|
+
) as bot:
|
116
|
+
bot.get_page(url)
|
117
|
+
|
118
|
+
logger.info("Press Enter in the console when you're done checking the protected page.")
|
119
|
+
input(" >> Press Enter to exit: ")
|
120
|
+
|
121
|
+
logger.info("Done! Browser is now closed.")
|
122
|
+
|
123
|
+
|
124
|
+
def write_session(url: str, jsonpath: Path) -> None:
|
125
|
+
"""
|
126
|
+
Launches a non-headless browser and navigates to the login_url.
|
127
|
+
The user can manually log in, then press Enter in the console
|
128
|
+
to store the current session state into a JSON file.
|
129
|
+
"""
|
130
|
+
logger.info(f"Launching browser and navigating to {url} ... Please log in manually.")
|
131
|
+
|
132
|
+
# Ensure jsonpath directory exists
|
133
|
+
jsonpath.parent.mkdir(parents=True, exist_ok=True)
|
134
|
+
|
135
|
+
with PlayWrightBot(playwright_launch_options={"headless": False}) as bot:
|
136
|
+
bot.get_page(url)
|
137
|
+
|
138
|
+
logger.info("After completing the login in the browser, press Enter here to save the session.")
|
139
|
+
input(" >> Press Enter when ready: ")
|
140
|
+
|
141
|
+
# get_sync_browser() returns the BrowserContext internally
|
142
|
+
context = bot.get_sync_browser()
|
143
|
+
|
144
|
+
# Save the current session (cookies, localStorage) to a JSON file
|
145
|
+
logger.info(f"Saving storage state to {jsonpath} ...")
|
146
|
+
context.storage_state(path=jsonpath) # Pass Path object directly
|
147
|
+
|
148
|
+
logger.info("Done! Browser is now closed.")
|
164
149
|
|
165
150
|
|
166
151
|
def main() -> None:
|
@@ -1,16 +1,3 @@
|
|
1
|
-
def resolve_import_path_and_get_logger():
|
2
|
-
# ruff: noqa: E402
|
3
|
-
import logging
|
4
|
-
import sys
|
5
|
-
|
6
|
-
if __name__ == "__main__" and "." not in sys.path:
|
7
|
-
sys.path.append(".")
|
8
|
-
|
9
|
-
logger = logging.getLogger(__name__)
|
10
|
-
return logger
|
11
|
-
|
12
|
-
|
13
|
-
logger = resolve_import_path_and_get_logger()
|
14
1
|
import re
|
15
2
|
import sys
|
16
3
|
from pathlib import Path
|
@@ -192,7 +179,9 @@ class MakePptArguments(BaseArguments):
|
|
192
179
|
"""Prompt for organizing slides into a presentation script"""
|
193
180
|
|
194
181
|
# LLM Settings
|
195
|
-
provider: str =
|
182
|
+
provider: str = (
|
183
|
+
"openai:gpt-4.1" # Example: "openai:gpt-4o", "anthropic:claude-3-haiku-20240307", "google:gemini-1.5-flash"
|
184
|
+
)
|
196
185
|
"""Name of the language model to use (provider:model_name)"""
|
197
186
|
|
198
187
|
# Other settings
|
@@ -0,0 +1,77 @@
|
|
1
|
+
import logging
|
2
|
+
import sys
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Optional
|
5
|
+
|
6
|
+
from spargear import ArgumentSpec, BaseArguments
|
7
|
+
|
8
|
+
from chatterer import Chatterer, PdfToMarkdown
|
9
|
+
|
10
|
+
logger = logging.getLogger(__name__)
|
11
|
+
|
12
|
+
|
13
|
+
class PdfToMarkdownArgs(BaseArguments):
|
14
|
+
input: str
|
15
|
+
"""Input PDF file or directory containing PDF files to convert to markdown."""
|
16
|
+
output: Optional[str] = None
|
17
|
+
"""Output path. For a file, path to the output markdown file. For a directory, output directory for .md files."""
|
18
|
+
"""Chatterer instance for communication."""
|
19
|
+
page: Optional[str] = None
|
20
|
+
"""Zero-based page indices to convert (e.g., '0,2,4-8')."""
|
21
|
+
recursive: bool = False
|
22
|
+
"""If input is a directory, search for PDFs recursively."""
|
23
|
+
chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
|
24
|
+
["--chatterer"],
|
25
|
+
default_factory=lambda: Chatterer.from_provider("google:gemini-2.5-flash-preview-05-20"),
|
26
|
+
help="Chatterer instance for communication.",
|
27
|
+
type=Chatterer.from_provider,
|
28
|
+
)
|
29
|
+
|
30
|
+
def run(self) -> list[dict[str, str]]:
|
31
|
+
input = Path(self.input).resolve()
|
32
|
+
pdf_files: list[Path] = []
|
33
|
+
is_dir = False
|
34
|
+
if input.is_file():
|
35
|
+
if input.suffix.lower() != ".pdf":
|
36
|
+
sys.exit(1)
|
37
|
+
pdf_files.append(input)
|
38
|
+
elif input.is_dir():
|
39
|
+
is_dir = True
|
40
|
+
pattern = "*.pdf"
|
41
|
+
pdf_files = sorted([
|
42
|
+
f for f in (input.rglob(pattern) if self.recursive else input.glob(pattern)) if f.is_file()
|
43
|
+
])
|
44
|
+
if not pdf_files:
|
45
|
+
sys.exit(0)
|
46
|
+
else:
|
47
|
+
sys.exit(1)
|
48
|
+
if self.output:
|
49
|
+
out_base = Path(self.output).resolve()
|
50
|
+
elif is_dir:
|
51
|
+
out_base = input
|
52
|
+
else:
|
53
|
+
out_base = input.with_suffix(".md")
|
54
|
+
|
55
|
+
if is_dir:
|
56
|
+
out_base.mkdir(parents=True, exist_ok=True)
|
57
|
+
else:
|
58
|
+
out_base.parent.mkdir(parents=True, exist_ok=True)
|
59
|
+
|
60
|
+
converter = PdfToMarkdown(chatterer=self.chatterer.unwrap())
|
61
|
+
results: list[dict[str, str]] = []
|
62
|
+
for pdf in pdf_files:
|
63
|
+
output: Path = (out_base / (pdf.stem + ".md")) if is_dir else out_base
|
64
|
+
md: str = converter.convert(pdf_input=str(pdf), page_indices=self.page)
|
65
|
+
output.parent.mkdir(parents=True, exist_ok=True)
|
66
|
+
output.write_text(md, encoding="utf-8")
|
67
|
+
results.append({"input": pdf.as_posix(), "output": output.as_posix(), "result": md})
|
68
|
+
logger.info(f"Converted {len(pdf_files)} PDF(s) to markdown and saved to `{out_base}`.")
|
69
|
+
return results
|
70
|
+
|
71
|
+
|
72
|
+
def main() -> None:
|
73
|
+
PdfToMarkdownArgs().run()
|
74
|
+
|
75
|
+
|
76
|
+
if __name__ == "__main__":
|
77
|
+
main()
|
@@ -0,0 +1,54 @@
|
|
1
|
+
import logging
|
2
|
+
import sys
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Optional
|
5
|
+
|
6
|
+
from spargear import BaseArguments
|
7
|
+
|
8
|
+
from chatterer.tools.convert_to_text import pdf_to_text
|
9
|
+
|
10
|
+
logger = logging.getLogger(__name__)
|
11
|
+
|
12
|
+
|
13
|
+
class PdfToTextArgs(BaseArguments):
|
14
|
+
input: Path
|
15
|
+
"""Path to the PDF file to convert to text."""
|
16
|
+
output: Optional[Path]
|
17
|
+
"""Path to the output text file. If not provided, defaults to the input file with a .txt suffix."""
|
18
|
+
page: Optional[str] = None
|
19
|
+
"""Comma-separated list of zero-based page indices to extract from the PDF. Supports ranges, e.g., '0,2,4-8'."""
|
20
|
+
|
21
|
+
def run(self) -> None:
|
22
|
+
input = self.input.resolve()
|
23
|
+
out = self.output or input.with_suffix(".txt")
|
24
|
+
if not input.is_file():
|
25
|
+
sys.exit(1)
|
26
|
+
out.write_text(
|
27
|
+
pdf_to_text(path_or_file=input, page_indices=self.page),
|
28
|
+
encoding="utf-8",
|
29
|
+
)
|
30
|
+
logger.info(f"Extracted text from `{input}` to `{out}`")
|
31
|
+
|
32
|
+
|
33
|
+
def parse_page_indices(pages_str: str) -> list[int]:
|
34
|
+
indices: set[int] = set()
|
35
|
+
for part in pages_str.split(","):
|
36
|
+
part = part.strip()
|
37
|
+
if "-" in part:
|
38
|
+
start_str, end_str = part.split("-", 1)
|
39
|
+
start = int(start_str)
|
40
|
+
end = int(end_str)
|
41
|
+
if start > end:
|
42
|
+
raise ValueError
|
43
|
+
indices.update(range(start, end + 1))
|
44
|
+
else:
|
45
|
+
indices.add(int(part))
|
46
|
+
return sorted(indices)
|
47
|
+
|
48
|
+
|
49
|
+
def main() -> None:
|
50
|
+
PdfToTextArgs().run()
|
51
|
+
|
52
|
+
|
53
|
+
if __name__ == "__main__":
|
54
|
+
main()
|
@@ -2,51 +2,36 @@
|
|
2
2
|
|
3
3
|
from io import BytesIO
|
4
4
|
from pathlib import Path
|
5
|
-
from typing import cast
|
5
|
+
from typing import Optional, cast
|
6
6
|
|
7
7
|
from openai import OpenAI
|
8
8
|
from pydub import AudioSegment
|
9
|
-
from spargear import ArgumentSpec, BaseArguments
|
9
|
+
from spargear import BaseArguments
|
10
10
|
|
11
11
|
# Maximum chunk length in seconds
|
12
12
|
MAX_CHUNK_DURATION = 600
|
13
13
|
|
14
14
|
|
15
15
|
class TranscriptionApiArguments(BaseArguments):
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
["--model"],
|
29
|
-
default="gpt-4o-transcribe",
|
30
|
-
help="The model to use for transcription.",
|
31
|
-
)
|
32
|
-
api_key: ArgumentSpec[str] = ArgumentSpec(
|
33
|
-
["--api-key"],
|
34
|
-
default=None,
|
35
|
-
help="The API key for authentication.",
|
36
|
-
)
|
37
|
-
base_url: ArgumentSpec[str] = ArgumentSpec(
|
38
|
-
["--base-url"],
|
39
|
-
default="https://api.openai.com/v1",
|
40
|
-
help="The base URL for the API.",
|
41
|
-
)
|
16
|
+
input: Path
|
17
|
+
"""The audio file to transcribe."""
|
18
|
+
output: Optional[Path] = None
|
19
|
+
"""Path to save the transcription output."""
|
20
|
+
model: str = "gpt-4o-transcribe"
|
21
|
+
"""The model to use for transcription."""
|
22
|
+
api_key: Optional[str] = None
|
23
|
+
"""The API key for authentication."""
|
24
|
+
base_url: str = "https://api.openai.com/v1"
|
25
|
+
"""The base URL for the API."""
|
26
|
+
prompt: str = "Transcribe whole text from audio."
|
27
|
+
"""The prompt to use for transcription."""
|
42
28
|
|
43
29
|
def run(self) -> None:
|
44
|
-
|
45
|
-
model = self.model.unwrap()
|
30
|
+
model = self.model
|
46
31
|
|
47
|
-
client = OpenAI(api_key=self.api_key
|
32
|
+
client = OpenAI(api_key=self.api_key, base_url=self.base_url)
|
48
33
|
|
49
|
-
audio = load_audio_segment(
|
34
|
+
audio = load_audio_segment(self.input)
|
50
35
|
|
51
36
|
segments = split_audio(audio, MAX_CHUNK_DURATION)
|
52
37
|
print(f"[i] Audio duration: {len(audio) / 1000:.1f}s; splitting into {len(segments)} segment(s)")
|
@@ -54,10 +39,10 @@ class TranscriptionApiArguments(BaseArguments):
|
|
54
39
|
transcripts: list[str] = []
|
55
40
|
for idx, seg in enumerate(segments, start=1):
|
56
41
|
print(f"[i] Transcribing segment {idx}/{len(segments)}...")
|
57
|
-
transcripts.append(transcribe_segment(seg, client, model))
|
42
|
+
transcripts.append(transcribe_segment(seg, client, model, self.prompt))
|
58
43
|
|
59
44
|
full_transcript = "\n\n".join(transcripts)
|
60
|
-
output_path: Path = self.
|
45
|
+
output_path: Path = self.output or self.input.with_suffix(".txt")
|
61
46
|
output_path.write_text(full_transcript, encoding="utf-8")
|
62
47
|
print(f"[✓] Transcription saved to: {output_path}")
|
63
48
|
|
@@ -94,7 +79,7 @@ def split_audio(audio: AudioSegment, max_duration_s: int) -> list[AudioSegment]:
|
|
94
79
|
return segments
|
95
80
|
|
96
81
|
|
97
|
-
def transcribe_segment(segment: AudioSegment, client: OpenAI, model: str) -> str:
|
82
|
+
def transcribe_segment(segment: AudioSegment, client: OpenAI, model: str, prompt: str) -> str:
|
98
83
|
"""
|
99
84
|
Transcribe a single AudioSegment chunk and return its text.
|
100
85
|
"""
|
@@ -104,7 +89,7 @@ def transcribe_segment(segment: AudioSegment, client: OpenAI, model: str) -> str
|
|
104
89
|
mp3_bytes = buffer.read()
|
105
90
|
response = client.audio.transcriptions.create(
|
106
91
|
model=model,
|
107
|
-
prompt=
|
92
|
+
prompt=prompt,
|
108
93
|
file=("audio.mp3", mp3_bytes),
|
109
94
|
response_format="text",
|
110
95
|
stream=True,
|