chatterer 0.1.19__py3-none-any.whl → 0.1.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatterer/examples/anything_to_markdown.py +21 -31
- chatterer/examples/get_code_snippets.py +18 -27
- chatterer/examples/login_with_playwright.py +68 -83
- chatterer/examples/make_ppt.py +3 -14
- chatterer/examples/pdf_to_markdown.py +29 -59
- chatterer/examples/pdf_to_text.py +14 -20
- chatterer/examples/transcription_api.py +21 -36
- chatterer/examples/upstage_parser.py +41 -47
- chatterer/examples/webpage_to_markdown.py +19 -28
- chatterer/strategies/atom_of_thoughts.py +1 -1
- chatterer/tools/convert_pdf_to_markdown.py +105 -14
- chatterer/tools/convert_to_text.py +3 -4
- chatterer/tools/upstage_document_parser.py +2 -2
- chatterer/utils/code_agent.py +1 -1
- {chatterer-0.1.19.dist-info → chatterer-0.1.21.dist-info}/METADATA +2 -2
- {chatterer-0.1.19.dist-info → chatterer-0.1.21.dist-info}/RECORD +19 -19
- {chatterer-0.1.19.dist-info → chatterer-0.1.21.dist-info}/WHEEL +1 -1
- {chatterer-0.1.19.dist-info → chatterer-0.1.21.dist-info}/entry_points.txt +0 -0
- {chatterer-0.1.19.dist-info → chatterer-0.1.21.dist-info}/top_level.txt +0 -0
chatterer/examples/anything_to_markdown.py
CHANGED
@@ -1,36 +1,27 @@
-def resolve_import_path_and_get_logger():
-    # ruff: noqa: E402
-    import logging
-    import sys
-
-    if __name__ == "__main__" and "." not in sys.path:
-        sys.path.append(".")
-
-    logger = logging.getLogger(__name__)
-    return logger
-
-
-logger = resolve_import_path_and_get_logger()
+import logging
 from pathlib import Path
 from typing import Optional, TypedDict

 import openai
-from spargear import
+from spargear import BaseArguments

 from chatterer import anything_to_markdown

+logger = logging.getLogger(__name__)
+

 class AnythingToMarkdownReturns(TypedDict):
-
-
+    input: str
+    output: Optional[str]
     out_text: str


 class AnythingToMarkdownArguments(BaseArguments):
     """Command line arguments for converting various file types to markdown."""

-
-
+    input: str
+    """Input file to convert to markdown. Can be a file path or a URL."""
+    output: Optional[str] = None
     """Output path for the converted markdown file. If not provided, the input file's suffix is replaced with .md"""
     model: Optional[str] = None
     """OpenAI Model to use for conversion"""
@@ -50,14 +41,13 @@ class AnythingToMarkdownArguments(BaseArguments):
     """Encoding for the output file."""

     def run(self) -> AnythingToMarkdownReturns:
-        in_path = self.in_path.unwrap()
         if not self.prevent_save_file:
-            if not self.
-
+            if not self.output:
+                output = Path(self.input).with_suffix(".md")
             else:
-
+                output = Path(self.output)
         else:
-
+            output = None

         if self.model:
             llm_client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
@@ -67,22 +57,22 @@ class AnythingToMarkdownArguments(BaseArguments):
             llm_model = None

         text: str = anything_to_markdown(
-
+            self.input,
             llm_client=llm_client,
             llm_model=llm_model,
             style_map=self.style_map,
             exiftool_path=self.exiftool_path,
             docintel_endpoint=self.docintel_endpoint,
         )
-        if
-
-
-        logger.info(f"Converted `{
+        if output:
+            output.parent.mkdir(parents=True, exist_ok=True)
+            output.write_text(text, encoding=self.encoding)
+            logger.info(f"Converted `{self.input}` to markdown and saved to `{output}`.")
         else:
-            logger.info(f"Converted `{
+            logger.info(f"Converted `{self.input}` to markdown.")
         return {
-            "
-            "
+            "input": self.input,
+            "output": str(output) if output is not None else None,
             "out_text": text,
         }

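Aside from moving the logger bootstrap to a plain `logging.getLogger(__name__)`, the example scripts in this release replace `ArgumentSpec`-wrapped fields with plain typed class attributes on `BaseArguments`. A minimal sketch of the new declarative style, assuming spargear >= 0.2.0 (the bound this release pins in METADATA) turns annotated attributes into CLI options the way the diff implies; `SketchArgs` is a hypothetical class for illustration:

import logging
from typing import Optional

from spargear import BaseArguments

logger = logging.getLogger(__name__)


class SketchArgs(BaseArguments):  # hypothetical, for illustration only
    input: str
    """Required value, mirroring the new `input` field above."""
    output: Optional[str] = None
    """Optional value with a default, mirroring the new `output` field."""


# Fields are read directly as plain attributes (no `.unwrap()` calls), which is
# exactly how the updated run() methods access self.input and self.output.
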
chatterer/examples/get_code_snippets.py
CHANGED
@@ -1,27 +1,19 @@
-def resolve_import_path_and_get_logger():
-    # ruff: noqa: E402
-    import logging
-    import sys
-
-    if __name__ == "__main__" and "." not in sys.path:
-        sys.path.append(".")
-
-    logger = logging.getLogger(__name__)
-    return logger
-
-
-logger = resolve_import_path_and_get_logger()
+import logging
 from pathlib import Path
 from typing import Optional

-from spargear import
+from spargear import BaseArguments

 from chatterer import CodeSnippets

+logger = logging.getLogger(__name__)
+

 class GetCodeSnippetsArgs(BaseArguments):
-
-
+    input: str
+    """Path to the package or file from which to extract code snippets."""
+    output: Optional[str] = None
+    """Output path for the extracted code snippets. If not provided, defaults to a file with the same name as the input."""
     ban_file_patterns: list[str] = [".venv/*", Path(__file__).relative_to(Path.cwd()).as_posix()]
     """List of file patterns to ignore."""
     glob_patterns: list[str] = ["*.py"]
@@ -32,27 +24,26 @@ class GetCodeSnippetsArgs(BaseArguments):
     """Prevent saving the extracted code snippets to a file."""

     def run(self) -> CodeSnippets:
-        path_or_pkgname = self.path_or_pkgname.unwrap()
         if not self.prevent_save_file:
-            if not self.
-
+            if not self.output:
+                output = Path(__file__).with_suffix(".txt")
             else:
-
+                output = Path(self.output)
         else:
-
+            output = None

         cs = CodeSnippets.from_path_or_pkgname(
-            path_or_pkgname=
+            path_or_pkgname=self.input,
             ban_file_patterns=self.ban_file_patterns,
             glob_patterns=self.glob_patterns,
             case_sensitive=self.case_sensitive,
         )
-        if
-
-
-        logger.info(f"Extracted code snippets from `{
+        if output is not None:
+            output.parent.mkdir(parents=True, exist_ok=True)
+            output.write_text(cs.snippets_text, encoding="utf-8")
+            logger.info(f"Extracted code snippets from `{self.input}` and saved to `{output}`.")
         else:
-            logger.info(f"Extracted code snippets from `{
+            logger.info(f"Extracted code snippets from `{self.input}`.")
         return cs


chatterer/examples/login_with_playwright.py
CHANGED
@@ -1,17 +1,5 @@
-def resolve_import_path_and_get_logger():
-    # ruff: noqa: E402
-    import logging
-    import sys
-
-    if __name__ == "__main__" and "." not in sys.path:
-        sys.path.append(".")
-
-    logger = logging.getLogger(__name__)
-    return logger
-
-
-logger = resolve_import_path_and_get_logger()
 import json
+import logging
 import sys
 from pathlib import Path

@@ -19,76 +7,8 @@ from spargear import BaseArguments, SubcommandSpec

 from chatterer import PlayWrightBot

+logger = logging.getLogger(__name__)

-def read_session(url: str, jsonpath: Path) -> None:
-    """
-    Loads the session state from the specified JSON file, then navigates
-    to a protected_url that normally requires login. If the stored session
-    is valid, it should open without re-entering credentials.
-
-    Correction: Loads the JSON content into a dict first to satisfy type hints.
-    """
-    logger.info(f"Loading session from {jsonpath} and navigating to {url} ...")
-
-    if not jsonpath.exists():
-        logger.error(f"Session file not found at {jsonpath}")
-        sys.exit(1)
-
-    # Load the storage state from the JSON file into a dictionary
-    logger.info(f"Reading storage state content from {jsonpath} ...")
-    try:
-        with open(jsonpath, "r", encoding="utf-8") as f:
-            # This dictionary should match the 'StorageState' type expected by Playwright/chatterer
-            storage_state_dict = json.load(f)
-    except json.JSONDecodeError:
-        logger.error(f"Failed to decode JSON from {jsonpath}")
-        sys.exit(1)
-    except Exception as e:
-        logger.error(f"Error reading file {jsonpath}: {e}")
-        sys.exit(1)
-
-    logger.info("Launching browser with loaded session state...")
-    with PlayWrightBot(
-        playwright_launch_options={"headless": False},
-        # Pass the loaded dictionary, which should match the expected 'StorageState' type
-        playwright_persistency_options={"storage_state": storage_state_dict},
-    ) as bot:
-        bot.get_page(url)
-
-        logger.info("Press Enter in the console when you're done checking the protected page.")
-        input(" >> Press Enter to exit: ")
-
-    logger.info("Done! Browser is now closed.")
-
-
-def write_session(url: str, jsonpath: Path) -> None:
-    """
-    Launches a non-headless browser and navigates to the login_url.
-    The user can manually log in, then press Enter in the console
-    to store the current session state into a JSON file.
-    """
-    logger.info(f"Launching browser and navigating to {url} ... Please log in manually.")
-
-    # Ensure jsonpath directory exists
-    jsonpath.parent.mkdir(parents=True, exist_ok=True)
-
-    with PlayWrightBot(playwright_launch_options={"headless": False}) as bot:
-        bot.get_page(url)
-
-        logger.info("After completing the login in the browser, press Enter here to save the session.")
-        input(" >> Press Enter when ready: ")
-
-        # get_sync_browser() returns the BrowserContext internally
-        context = bot.get_sync_browser()
-
-        # Save the current session (cookies, localStorage) to a JSON file
-        logger.info(f"Saving storage state to {jsonpath} ...")
-        context.storage_state(path=jsonpath)  # Pass Path object directly
-
-    logger.info("Done! Browser is now closed.")
-
-
-# --- Spargear Declarative CLI Definition ---

 # Define the default path location relative to this script file
 DEFAULT_JSON_PATH = Path(__file__).resolve().parent / "session_state.json"
@@ -160,7 +80,72 @@ class LoginWithPlaywrightArgs(BaseArguments):
         sys.exit(1)


-
+def read_session(url: str, jsonpath: Path) -> None:
+    """
+    Loads the session state from the specified JSON file, then navigates
+    to a protected_url that normally requires login. If the stored session
+    is valid, it should open without re-entering credentials.
+
+    Correction: Loads the JSON content into a dict first to satisfy type hints.
+    """
+    logger.info(f"Loading session from {jsonpath} and navigating to {url} ...")
+
+    if not jsonpath.exists():
+        logger.error(f"Session file not found at {jsonpath}")
+        sys.exit(1)
+
+    # Load the storage state from the JSON file into a dictionary
+    logger.info(f"Reading storage state content from {jsonpath} ...")
+    try:
+        with open(jsonpath, "r", encoding="utf-8") as f:
+            # This dictionary should match the 'StorageState' type expected by Playwright/chatterer
+            storage_state_dict = json.load(f)
+    except json.JSONDecodeError:
+        logger.error(f"Failed to decode JSON from {jsonpath}")
+        sys.exit(1)
+    except Exception as e:
+        logger.error(f"Error reading file {jsonpath}: {e}")
+        sys.exit(1)
+
+    logger.info("Launching browser with loaded session state...")
+    with PlayWrightBot(
+        playwright_launch_options={"headless": False},
+        # Pass the loaded dictionary, which should match the expected 'StorageState' type
+        playwright_persistency_options={"storage_state": storage_state_dict},
+    ) as bot:
+        bot.get_page(url)
+
+        logger.info("Press Enter in the console when you're done checking the protected page.")
+        input(" >> Press Enter to exit: ")
+
+    logger.info("Done! Browser is now closed.")
+
+
+def write_session(url: str, jsonpath: Path) -> None:
+    """
+    Launches a non-headless browser and navigates to the login_url.
+    The user can manually log in, then press Enter in the console
+    to store the current session state into a JSON file.
+    """
+    logger.info(f"Launching browser and navigating to {url} ... Please log in manually.")
+
+    # Ensure jsonpath directory exists
+    jsonpath.parent.mkdir(parents=True, exist_ok=True)
+
+    with PlayWrightBot(playwright_launch_options={"headless": False}) as bot:
+        bot.get_page(url)
+
+        logger.info("After completing the login in the browser, press Enter here to save the session.")
+        input(" >> Press Enter when ready: ")
+
+        # get_sync_browser() returns the BrowserContext internally
+        context = bot.get_sync_browser()
+
+        # Save the current session (cookies, localStorage) to a JSON file
+        logger.info(f"Saving storage state to {jsonpath} ...")
+        context.storage_state(path=jsonpath)  # Pass Path object directly
+
+    logger.info("Done! Browser is now closed.")


 def main() -> None:

chatterer/examples/make_ppt.py
CHANGED
@@ -1,16 +1,3 @@
-def resolve_import_path_and_get_logger():
-    # ruff: noqa: E402
-    import logging
-    import sys
-
-    if __name__ == "__main__" and "." not in sys.path:
-        sys.path.append(".")
-
-    logger = logging.getLogger(__name__)
-    return logger
-
-
-logger = resolve_import_path_and_get_logger()
 import re
 import sys
 from pathlib import Path
@@ -192,7 +179,9 @@ class MakePptArguments(BaseArguments):
     """Prompt for organizing slides into a presentation script"""

     # LLM Settings
-    provider: str =
+    provider: str = (
+        "openai:gpt-4.1"  # Example: "openai:gpt-4o", "anthropic:claude-3-haiku-20240307", "google:gemini-1.5-flash"
+    )
     """Name of the language model to use (provider:model_name)"""

     # Other settings

chatterer/examples/pdf_to_markdown.py
CHANGED
@@ -1,16 +1,4 @@
-def resolve_import_path_and_get_logger():
-    # ruff: noqa: E402
-    import logging
-    import sys
-
-    if __name__ == "__main__" and "." not in sys.path:
-        sys.path.append(".")
-
-    logger = logging.getLogger(__name__)
-    return logger
-
-
-logger = resolve_import_path_and_get_logger()
+import logging
 import sys
 from pathlib import Path
 from typing import Optional
@@ -19,46 +7,50 @@ from spargear import ArgumentSpec, BaseArguments

 from chatterer import Chatterer, PdfToMarkdown

+logger = logging.getLogger(__name__)
+

 class PdfToMarkdownArgs(BaseArguments):
-
-
+    input: str
+    """Input PDF file or directory containing PDF files to convert to markdown."""
+    output: Optional[str] = None
     """Output path. For a file, path to the output markdown file. For a directory, output directory for .md files."""
+    """Chatterer instance for communication."""
+    page: Optional[str] = None
+    """Zero-based page indices to convert (e.g., '0,2,4-8')."""
+    recursive: bool = False
+    """If input is a directory, search for PDFs recursively."""
     chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
         ["--chatterer"],
-
+        default_factory=lambda: Chatterer.from_provider("google:gemini-2.5-flash-preview-05-20"),
         help="Chatterer instance for communication.",
         type=Chatterer.from_provider,
-        required=True,
     )
-    pages: Optional[str] = None
-    """Page indices to convert (e.g., '1,3,5-9')."""
-    recursive: bool = False
-    """If input is a directory, search for PDFs recursively."""

     def run(self) -> list[dict[str, str]]:
-
-        page_indices = parse_page_indices(self.pages) if self.pages else None
+        input = Path(self.input).resolve()
         pdf_files: list[Path] = []
         is_dir = False
-        if
-            if
+        if input.is_file():
+            if input.suffix.lower() != ".pdf":
                 sys.exit(1)
-            pdf_files.append(
-        elif
+            pdf_files.append(input)
+        elif input.is_dir():
             is_dir = True
             pattern = "*.pdf"
-            pdf_files = sorted([
+            pdf_files = sorted([
+                f for f in (input.rglob(pattern) if self.recursive else input.glob(pattern)) if f.is_file()
+            ])
             if not pdf_files:
                 sys.exit(0)
         else:
             sys.exit(1)
-        if self.
-            out_base = Path(self.
+        if self.output:
+            out_base = Path(self.output).resolve()
         elif is_dir:
-            out_base =
+            out_base = input
         else:
-            out_base =
+            out_base = input.with_suffix(".md")

         if is_dir:
             out_base.mkdir(parents=True, exist_ok=True)
@@ -68,37 +60,15 @@ class PdfToMarkdownArgs(BaseArguments):
         converter = PdfToMarkdown(chatterer=self.chatterer.unwrap())
         results: list[dict[str, str]] = []
         for pdf in pdf_files:
-
-            md = converter.convert(str(pdf), page_indices)
-
-
-            results.append({"input": pdf.as_posix(), "output":
+            output: Path = (out_base / (pdf.stem + ".md")) if is_dir else out_base
+            md: str = converter.convert(pdf_input=str(pdf), page_indices=self.page)
+            output.parent.mkdir(parents=True, exist_ok=True)
+            output.write_text(md, encoding="utf-8")
+            results.append({"input": pdf.as_posix(), "output": output.as_posix(), "result": md})
         logger.info(f"Converted {len(pdf_files)} PDF(s) to markdown and saved to `{out_base}`.")
         return results


-def parse_page_indices(pages_str: str) -> list[int] | None:
-    if not pages_str:
-        return None
-    indices: set[int] = set()
-    for part in pages_str.split(","):
-        part = part.strip()
-        if not part:
-            continue
-        if "-" in part:
-            start_str, end_str = part.split("-", 1)
-            start = int(start_str.strip())
-            end = int(end_str.strip())
-            if start > end:
-                raise ValueError
-            indices.update(range(start, end + 1))
-        else:
-            indices.add(int(part))
-    if not indices:
-        raise ValueError
-    return sorted(indices)
-
-
 def main() -> None:
     PdfToMarkdownArgs().run()

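With this release the `--chatterer` option is optional: a `default_factory` builds a client from a provider string when the flag is omitted, and page-string parsing moved out of the script (the removed `parse_page_indices`) into the library. A short sketch of the two provider paths, reusing only provider strings that appear elsewhere in this diff:

from chatterer import Chatterer

# Default when --chatterer is omitted, per the new default_factory:
default_chatterer = Chatterer.from_provider("google:gemini-2.5-flash-preview-05-20")

# Explicit override, e.g. `--chatterer openai:gpt-4.1` (string format used in make_ppt.py):
override_chatterer = Chatterer.from_provider("openai:gpt-4.1")
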
chatterer/examples/pdf_to_text.py
CHANGED
@@ -1,36 +1,30 @@
-def resolve_import_path_and_get_logger():
-    # ruff: noqa: E402
-    import logging
-    import sys
-
-    if __name__ == "__main__" and "." not in sys.path:
-        sys.path.append(".")
-
-    logger = logging.getLogger(__name__)
-    return logger
-
-
-logger = resolve_import_path_and_get_logger()
+import logging
 import sys
 from pathlib import Path
+from typing import Optional

-from spargear import
+from spargear import BaseArguments

 from chatterer.tools.convert_to_text import pdf_to_text

+logger = logging.getLogger(__name__)
+

 class PdfToTextArgs(BaseArguments):
-
-
-
+    input: Path
+    """Path to the PDF file to convert to text."""
+    output: Optional[Path]
+    """Path to the output text file. If not provided, defaults to the input file with a .txt suffix."""
+    page: Optional[str] = None
+    """Comma-separated list of zero-based page indices to extract from the PDF. Supports ranges, e.g., '0,2,4-8'."""

     def run(self) -> None:
-        input = self.
-        out = self.
+        input = self.input.resolve()
+        out = self.output or input.with_suffix(".txt")
         if not input.is_file():
             sys.exit(1)
         out.write_text(
-            pdf_to_text(input,
+            pdf_to_text(path_or_file=input, page_indices=self.page),
             encoding="utf-8",
         )
         logger.info(f"Extracted text from `{input}` to `{out}`")

chatterer/examples/transcription_api.py
CHANGED
@@ -2,51 +2,36 @@

 from io import BytesIO
 from pathlib import Path
-from typing import cast
+from typing import Optional, cast

 from openai import OpenAI
 from pydub import AudioSegment
-from spargear import
+from spargear import BaseArguments

 # Maximum chunk length in seconds
 MAX_CHUNK_DURATION = 600


 class TranscriptionApiArguments(BaseArguments):
-
-
-
-
-
-
-
-
-
-
-
-
-        ["--model"],
-        default="gpt-4o-transcribe",
-        help="The model to use for transcription.",
-    )
-    api_key: ArgumentSpec[str] = ArgumentSpec(
-        ["--api-key"],
-        default=None,
-        help="The API key for authentication.",
-    )
-    base_url: ArgumentSpec[str] = ArgumentSpec(
-        ["--base-url"],
-        default="https://api.openai.com/v1",
-        help="The base URL for the API.",
-    )
+    input: Path
+    """The audio file to transcribe."""
+    output: Optional[Path] = None
+    """Path to save the transcription output."""
+    model: str = "gpt-4o-transcribe"
+    """The model to use for transcription."""
+    api_key: Optional[str] = None
+    """The API key for authentication."""
+    base_url: str = "https://api.openai.com/v1"
+    """The base URL for the API."""
+    prompt: str = "Transcribe whole text from audio."
+    """The prompt to use for transcription."""

     def run(self) -> None:
-
-        model = self.model.unwrap()
+        model = self.model

-        client = OpenAI(api_key=self.api_key
+        client = OpenAI(api_key=self.api_key, base_url=self.base_url)

-        audio = load_audio_segment(
+        audio = load_audio_segment(self.input)

         segments = split_audio(audio, MAX_CHUNK_DURATION)
         print(f"[i] Audio duration: {len(audio) / 1000:.1f}s; splitting into {len(segments)} segment(s)")
@@ -54,10 +39,10 @@ class TranscriptionApiArguments(BaseArguments):
         transcripts: list[str] = []
         for idx, seg in enumerate(segments, start=1):
             print(f"[i] Transcribing segment {idx}/{len(segments)}...")
-            transcripts.append(transcribe_segment(seg, client, model))
+            transcripts.append(transcribe_segment(seg, client, model, self.prompt))

         full_transcript = "\n\n".join(transcripts)
-        output_path: Path = self.
+        output_path: Path = self.output or self.input.with_suffix(".txt")
         output_path.write_text(full_transcript, encoding="utf-8")
         print(f"[✓] Transcription saved to: {output_path}")

@@ -94,7 +79,7 @@ def split_audio(audio: AudioSegment, max_duration_s: int) -> list[AudioSegment]:
     return segments


-def transcribe_segment(segment: AudioSegment, client: OpenAI, model: str) -> str:
+def transcribe_segment(segment: AudioSegment, client: OpenAI, model: str, prompt: str) -> str:
     """
     Transcribe a single AudioSegment chunk and return its text.
     """
@@ -104,7 +89,7 @@ def transcribe_segment(segment: AudioSegment, client: OpenAI, model: str) -> str
     mp3_bytes = buffer.read()
     response = client.audio.transcriptions.create(
         model=model,
-        prompt=
+        prompt=prompt,
         file=("audio.mp3", mp3_bytes),
         response_format="text",
         stream=True,

chatterer/examples/upstage_parser.py
CHANGED
@@ -1,17 +1,6 @@
-def resolve_import_path_and_get_logger():
-    # ruff: noqa: E402
-    import logging
-    import sys
-
-    if __name__ == "__main__" and "." not in sys.path:
-        sys.path.append(".")
-
-    logger = logging.getLogger(__name__)
-    return logger
-
-
-logger = resolve_import_path_and_get_logger()
+import logging
 from pathlib import Path
+from typing import Optional

 from langchain_core.documents.base import Blob
 from spargear import ArgumentSpec, BaseArguments
@@ -27,28 +16,34 @@ from chatterer.tools.upstage_document_parser import (
     SplitType,
 )

+logger = logging.getLogger(__name__)
+

 class UpstageParserArguments(BaseArguments):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    input: Path
+    """Input file to parse. Can be a PDF, image, or other supported formats."""
+    output: Optional[Path] = None
+    """Output file path for the parsed content. Defaults to input file with .md suffix if not provided."""
+    api_key: Optional[str] = None
+    """API key for the Upstage API."""
+    base_url: str = DOCUMENT_PARSE_BASE_URL
+    """Base URL for the Upstage API."""
+    model: str = DOCUMENT_PARSE_DEFAULT_MODEL
+    """Model to use for parsing."""
+    split: SplitType = "none"
+    """Split type for the parsed content."""
+    ocr: OCR = "auto"
+    """OCR type for parsing."""
+    output_format: OutputFormat = "markdown"
+    """Output format for the parsed content."""
+    coordinates: bool = False
+    """Whether to include coordinates in the output."""
+    base64_encoding: list[Category] = ["figure"]
+    """Base64 encoding for specific categories in the parsed content."""
+    image_description_instruction: str = "Describe the image in detail."
+    """Instruction for generating image descriptions."""
+    image_dir: str = DEFAULT_IMAGE_DIR
+    """Directory to save images extracted from the document."""
     chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
         ["--chatterer"],
         default=None,
@@ -57,26 +52,25 @@ class UpstageParserArguments(BaseArguments):
     )

     def run(self) -> None:
-        input = UpstageParserArguments.
-        out = UpstageParserArguments.
+        input = UpstageParserArguments.input.resolve()
+        out = UpstageParserArguments.output or input.with_suffix(".md")

         parser = UpstageDocumentParseParser(
-            api_key=UpstageParserArguments.api_key
-            base_url=UpstageParserArguments.base_url
-            model=UpstageParserArguments.model
-            split=UpstageParserArguments.split
-            ocr=UpstageParserArguments.ocr
-            output_format=UpstageParserArguments.output_format
-            coordinates=UpstageParserArguments.coordinates
-            base64_encoding=UpstageParserArguments.base64_encoding
-            image_description_instruction=UpstageParserArguments.image_description_instruction
-            image_dir=UpstageParserArguments.image_dir
+            api_key=UpstageParserArguments.api_key,
+            base_url=UpstageParserArguments.base_url,
+            model=UpstageParserArguments.model,
+            split=UpstageParserArguments.split,
+            ocr=UpstageParserArguments.ocr,
+            output_format=UpstageParserArguments.output_format,
+            coordinates=UpstageParserArguments.coordinates,
+            base64_encoding=UpstageParserArguments.base64_encoding,
+            image_description_instruction=UpstageParserArguments.image_description_instruction,
+            image_dir=UpstageParserArguments.image_dir,
             chatterer=UpstageParserArguments.chatterer.value,
         )
-
         docs = parser.parse(Blob.from_path(input))  # pyright: ignore[reportUnknownMemberType]

-        if UpstageParserArguments.image_dir
+        if UpstageParserArguments.image_dir:
             for path, image in parser.image_data.items():
                 (path := Path(path)).parent.mkdir(parents=True, exist_ok=True)
                 path.write_bytes(image)

chatterer/examples/webpage_to_markdown.py
CHANGED
@@ -1,16 +1,3 @@
-def resolve_import_path_and_get_logger():
-    # ruff: noqa: E402
-    import logging
-    import sys
-
-    if __name__ == "__main__" and "." not in sys.path:
-        sys.path.append(".")
-
-    logger = logging.getLogger(__name__)
-    return logger
-
-
-logger = resolve_import_path_and_get_logger()
 from pathlib import Path
 from typing import Literal

@@ -20,49 +7,53 @@ from chatterer import Chatterer, MarkdownLink, PlayWrightBot


 class WebpageToMarkdownArgs(BaseArguments):
-    url:
-
+    url: str
+    """The URL to crawl."""
+    output: str = Path(__file__).with_suffix(".md").as_posix()
     """The output file path for the markdown file."""
     chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
-        ["--
-        default=None,
-        type=Chatterer.from_provider,
+        ["--chatterer"],
         help="The Chatterer backend and model to use for filtering the markdown.",
+        type=Chatterer.from_provider,
     )
     engine: Literal["firefox", "chromium", "webkit"] = "firefox"
     """The browser engine to use."""

     def run(self) -> None:
         chatterer = self.chatterer.value
-        url: str = self.url.
-
+        url: str = self.url.strip()
+        output: Path = Path(self.output).resolve()
         with PlayWrightBot(chatterer=chatterer, engine=self.engine) as bot:
             md = bot.url_to_md(url)
-
+            output.write_text(md, encoding="utf-8")
             if chatterer is not None:
                 md_llm = bot.url_to_md_with_llm(url.strip())
-
+                output.write_text(md_llm, encoding="utf-8")
             links = MarkdownLink.from_markdown(md, referer_url=url)
             for link in links:
                 if link.type == "link":
-                    print(
+                    print(
+                        f"- [{truncate_string(link.url)}] {truncate_string(link.inline_text)} ({truncate_string(link.inline_title)})"
+                    )
                 elif link.type == "image":
                     print(f"- ![{truncate_string(link.url)}] ({truncate_string(link.inline_text)})")

     async def arun(self) -> None:
         chatterer = self.chatterer.value
-        url: str = self.url.
-
+        url: str = self.url.strip()
+        output: Path = Path(self.output).resolve()
         async with PlayWrightBot(chatterer=chatterer, engine=self.engine) as bot:
             md = await bot.aurl_to_md(url)
-
+            output.write_text(md, encoding="utf-8")
             if chatterer is not None:
                 md_llm = await bot.aurl_to_md_with_llm(url.strip())
-
+                output.write_text(md_llm, encoding="utf-8")
             links = MarkdownLink.from_markdown(md, referer_url=url)
             for link in links:
                 if link.type == "link":
-                    print(
+                    print(
+                        f"- [{truncate_string(link.url)}] {truncate_string(link.inline_text)} ({truncate_string(link.inline_title)})"
+                    )
                 elif link.type == "image":
                     print(f"- ![{truncate_string(link.url)}] ({truncate_string(link.inline_text)})")

chatterer/strategies/atom_of_thoughts.py
CHANGED
@@ -379,7 +379,7 @@ class AoTPipeline:
     chatterer: Chatterer
    max_depth: int = 2
    max_retries: int = 2
-    steps_history: list[StepRecord] = field(default_factory=list)
+    steps_history: list[StepRecord] = field(default_factory=list[StepRecord])
    prompter: AoTPrompter = field(default_factory=AoTPrompter)

    # 4.1) Utility for calling the LLM with Pydantic parsing

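The only change here is the default factory: a subscripted generic alias such as `list[StepRecord]` is callable at runtime and returns an empty list exactly like bare `list`, but it carries the element type, so a strict checker (pyright, which this codebase targets per its inline ignores) infers `list[StepRecord]` instead of a list of unknowns. The same pattern is applied to `Field(default_factory=list[Coordinate])` in upstage_document_parser.py below. A self-contained illustration:

from dataclasses import dataclass, field


@dataclass
class Example:
    # list[int] is a callable generic alias; calling it returns [] at runtime,
    # identical to list(), while giving type checkers the element type.
    values: list[int] = field(default_factory=list[int])


assert list[int]() == []
assert Example().values == []
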
chatterer/tools/convert_pdf_to_markdown.py
CHANGED
@@ -4,7 +4,8 @@ import logging
 import re
 from contextlib import contextmanager
 from dataclasses import dataclass
-from
+from types import EllipsisType
+from typing import TYPE_CHECKING, Callable, Iterable, List, Literal, Optional

 from ..language_model import Chatterer, HumanMessage
 from ..utils.base64_image import Base64Image
@@ -17,6 +18,7 @@ if TYPE_CHECKING:
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 logger = logging.getLogger(__name__)
 MARKDOWN_PATTERN: re.Pattern[str] = re.compile(r"```(?:markdown\s*\n)?(.*?)```", re.DOTALL)
+PageIndexType = Iterable[int | tuple[int | EllipsisType, int | EllipsisType]] | int | str


 @dataclass
@@ -107,8 +109,8 @@ class PdfToMarkdown:

     def convert(
         self,
-        pdf_input:
-        page_indices: Optional[
+        pdf_input: "Document | PathOrReadable",
+        page_indices: Optional[PageIndexType] = None,
         progress_callback: Optional[Callable[[int, int], None]] = None,
     ) -> str:
         """
@@ -123,7 +125,9 @@ class PdfToMarkdown:
             A single string containing the concatenated Markdown output for the processed pages.
         """
         with open_pdf(pdf_input) as doc:
-            target_page_indices = list(
+            target_page_indices = list(
+                _get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True)
+            )
             total_pages_to_process = len(target_page_indices)
             if total_pages_to_process == 0:
                 logger.warning("No pages selected for processing.")
@@ -232,7 +236,7 @@ def render_pdf_as_image(

     images_bytes: dict[int, bytes] = {}
     matrix = Matrix(zoom, zoom)  # Control output resolution
-    for page_idx in _get_page_indices(page_indices, len(doc)):
+    for page_idx in _get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True):
         img_bytes = bytes(
             get_pixmap(
                 page=doc[page_idx],
@@ -243,10 +247,7 @@ def render_pdf_as_image(
     return images_bytes


-def extract_text_from_pdf(
-    doc: "Document",
-    page_indices: Iterable[int] | int | None = None,
-) -> dict[int, str]:
+def extract_text_from_pdf(doc: "Document", page_indices: Optional[PageIndexType] = None) -> dict[int, str]:
     """Convert a PDF file to plain text.

     Extracts text from each page of a PDF file and formats it with page markers.
@@ -261,7 +262,11 @@ def extract_text_from_pdf(
     """
     return {
         page_idx: doc[page_idx].get_textpage().extractText().strip()  # pyright: ignore[reportUnknownMemberType]
-        for page_idx in _get_page_indices(
+        for page_idx in _get_page_indices(
+            page_indices=page_indices,
+            max_doc_pages=len(doc),
+            is_input_zero_based=True,
+        )
     }


@@ -292,11 +297,97 @@ def open_pdf(pdf_input: PathOrReadable | Document):
         doc.close()


-def _get_page_indices(
+def _get_page_indices(
+    page_indices: Optional[PageIndexType], max_doc_pages: int, is_input_zero_based: bool
+) -> list[int]:
     """Helper function to handle page indices for PDF conversion."""
+
+    def _to_zero_based_int(idx: int) -> int:
+        """Convert a 1-based index to a 0-based index if necessary."""
+        if is_input_zero_based:
+            return idx
+        else:
+            if idx < 1 or idx > max_doc_pages:
+                raise ValueError(f"Index {idx} is out of bounds for document with {max_doc_pages} pages (1-based).")
+            return idx - 1
+
     if page_indices is None:
-        return range(max_doc_pages)
+        return list(range(max_doc_pages))  # Convert all pages
     elif isinstance(page_indices, int):
-
+        # Handle single integer input for page index
+        return [_to_zero_based_int(page_indices)]
+    elif isinstance(page_indices, str):
+        # Handle string input for page indices
+        return _interpret_index_string(
+            index_str=page_indices, max_doc_pages=max_doc_pages, is_input_zero_based=is_input_zero_based
+        )
     else:
-
+        # Handle iterable input for page indices
+        indices: set[int] = set()
+        for idx in page_indices:
+            if isinstance(idx, int):
+                indices.add(_to_zero_based_int(idx))
+            else:
+                start, end = idx
+                if isinstance(start, EllipsisType):
+                    start = 0
+                else:
+                    start = _to_zero_based_int(start)
+
+                if isinstance(end, EllipsisType):
+                    end = max_doc_pages - 1
+                else:
+                    end = _to_zero_based_int(end)
+
+                if start > end:
+                    raise ValueError(
+                        f"Invalid range: {start} - {end}. Start index must be less than or equal to end index."
+                    )
+                indices.update(range(start, end + 1))
+
+        return sorted(indices)  # Return sorted list of indices
+
+
+def _interpret_index_string(index_str: str, max_doc_pages: int, is_input_zero_based: bool) -> list[int]:
+    """Interpret a string of comma-separated indices and ranges."""
+
+    def _to_zero_based_int(idx_str: str) -> int:
+        i = int(idx_str)
+        if is_input_zero_based:
+            if i < 0 or i >= max_doc_pages:
+                raise ValueError(f"Index {i} is out of bounds for document with {max_doc_pages} pages.")
+            return i
+        else:
+            if i < 1 or i > max_doc_pages:
+                raise ValueError(f"Index {i} is out of bounds for document with {max_doc_pages} pages (1-based).")
+            return i - 1  # Convert to zero-based index
+
+    indices: set[int] = set()
+    for part in index_str.split(","):
+        part: str = part.strip()
+        count_dash: int = part.count("-")
+        if count_dash == 0:
+            indices.add(_to_zero_based_int(part))
+        elif count_dash == 1:
+            idx_dash: int = part.index("-")
+            start = part[:idx_dash].strip()
+            end = part[idx_dash + 1 :].strip()
+            if not start:
+                start = _to_zero_based_int("0")  # Default to 0 if no start index is provided
+            else:
+                start = _to_zero_based_int(start)
+
+            if not end:
+                end = _to_zero_based_int(str(max_doc_pages - 1))  # Default to last page if no end index is provided
+            else:
+                end = _to_zero_based_int(end)
+
+            if start > end:
+                raise ValueError(
+                    f"Invalid range: {start} - {end}. Start index must be less than or equal to end index."
+                )
+            indices.update(range(start, end + 1))
+        else:
+            raise ValueError(f"Invalid page index format: '{part}'. Expected format is '1,2,3' or '1-3'.")
+
+    return sorted(indices)  # Return sorted list of indices, ensuring no duplicates
chatterer/tools/convert_to_text.py
CHANGED
@@ -8,7 +8,6 @@ from pathlib import Path
 from typing import (
     TYPE_CHECKING,
     Callable,
-    Iterable,
     NamedTuple,
     NotRequired,
     Optional,
@@ -20,7 +19,7 @@ from typing import (

 from ..common_types.io import PathOrReadable
 from ..utils.bytesio import read_bytes_stream
-from .convert_pdf_to_markdown import extract_text_from_pdf
+from .convert_pdf_to_markdown import PageIndexType, extract_text_from_pdf

 if TYPE_CHECKING:
     from bs4 import Tag
@@ -222,7 +221,7 @@ def html_to_markdown(html: str, options: Optional[HtmlToMarkdownOptions]) -> str
     return str(markdownify(html, **(options or {})))  # pyright: ignore[reportUnknownArgumentType]


-def pdf_to_text(path_or_file: PathOrReadable, page_indices:
+def pdf_to_text(path_or_file: PathOrReadable, page_indices: Optional[PageIndexType] = None) -> str:
     """
     Convert a PDF file to plain text.

@@ -248,7 +247,7 @@ def pdf_to_text(path_or_file: PathOrReadable, page_indices: Iterable[int] | int
     with Document(stream=stream.read()) as doc:
         return "\n".join(
             f"<!-- Page {page_no} -->\n{text}\n"
-            for page_no, text in extract_text_from_pdf(doc, page_indices).items()
+            for page_no, text in extract_text_from_pdf(doc=doc, page_indices=page_indices).items()
         )

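Because `pdf_to_text` now shares `PageIndexType` with the markdown converter, the same page-selection strings work here. A brief usage sketch; the file name is hypothetical, while the page-marker format comes from the function body above:

from chatterer.tools.convert_to_text import pdf_to_text

text = pdf_to_text("sample.pdf", page_indices="0,2,4-8")
# Each selected page is prefixed with a marker such as "<!-- Page 0 -->"
print(text[:200])
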
chatterer/tools/upstage_document_parser.py
CHANGED
@@ -67,7 +67,7 @@ class Coordinate(BaseModel):
 class Element(BaseModel):
     category: Category
     content: Content
-    coordinates: list[Coordinate] = Field(default_factory=list)
+    coordinates: list[Coordinate] = Field(default_factory=list[Coordinate])
     base64_encoding: str = ""
     id: int
     page: int
@@ -701,5 +701,5 @@ def _get_metadata_from_document(doc: Document) -> dict[object, object]:
     Helper function to extract metadata from a Document object.
     This is a placeholder and should be adjusted based on actual metadata structure.
     """
-    metadata: dict[object, object] = doc.metadata  # pyright: ignore[reportUnknownMemberType]
+    metadata: dict[object, object] = doc.metadata  # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
     return metadata

chatterer/utils/code_agent.py
CHANGED
@@ -185,7 +185,7 @@ def insert_callables_into_global(
         repl_tool.globals = {}  # Or handle appropriately

     # Safely update globals
-    current_globals: dict[object, object] = repl_tool.globals  # pyright: ignore[reportUnknownMemberType]
+    current_globals: dict[object, object] = repl_tool.globals  # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
     for fsig in function_signatures:
         current_globals[fsig.name] = fsig.callable
     # No need to reassign if globals is mutable (dict)

{chatterer-0.1.19.dist-info → chatterer-0.1.21.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: chatterer
-Version: 0.1.19
+Version: 0.1.21
 Summary: The highest-level interface for various LLM APIs.
 Requires-Python: >=3.12
 Description-Content-Type: text/markdown
@@ -11,7 +11,7 @@ Requires-Dist: pillow>=11.1.0
 Requires-Dist: regex>=2024.11.6
 Requires-Dist: rich>=13.9.4
 Requires-Dist: colorama>=0.4.6
-Requires-Dist: spargear>=0.
+Requires-Dist: spargear>=0.2.0
 Provides-Extra: dev
 Requires-Dist: neo4j-extension>=0.1.14; extra == "dev"
 Requires-Dist: ipykernel>=6.29.5; extra == "dev"

{chatterer-0.1.19.dist-info → chatterer-0.1.21.dist-info}/RECORD
CHANGED
@@ -6,23 +6,23 @@ chatterer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 chatterer/common_types/__init__.py,sha256=jfS6m5UANSvGjzQ_nzYDpryn5uZqNb06-4xCsQ2C_lw,376
 chatterer/common_types/io.py,sha256=fetiyi1suZ3NF2mj5k5KDLJLGKS1n4J-5UmH7JN36g8,817
 chatterer/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-chatterer/examples/anything_to_markdown.py,sha256=
-chatterer/examples/get_code_snippets.py,sha256=
-chatterer/examples/login_with_playwright.py,sha256=
-chatterer/examples/make_ppt.py,sha256=
-chatterer/examples/pdf_to_markdown.py,sha256=
-chatterer/examples/pdf_to_text.py,sha256=
-chatterer/examples/transcription_api.py,sha256=
-chatterer/examples/upstage_parser.py,sha256=
-chatterer/examples/webpage_to_markdown.py,sha256=
+chatterer/examples/anything_to_markdown.py,sha256=4O9ze7AIHcwEzvVmm5JMMKo_rVSFwhPL8MVHtfMLJ5Y,2734
+chatterer/examples/get_code_snippets.py,sha256=pz05JjhKaWAknVKlk1ftEEzpSG4-sqD9oa_gyIQoCAs,1911
+chatterer/examples/login_with_playwright.py,sha256=EhvJLaH5TD7bmDi12uP8YLd0fRhdjR-oyIkBHLi1Jjs,5988
+chatterer/examples/make_ppt.py,sha256=vsT_iL_jS2ami5VYrReLMQcD576FfZUH7913F7_As0A,23278
+chatterer/examples/pdf_to_markdown.py,sha256=ZeGRO5CZxGQxJpScK0iB1lTzUkfSiXtuqoeKEQL1ICA,2787
+chatterer/examples/pdf_to_text.py,sha256=DznTyhu1REv8Wp4RimQWVgEU5j0_BmlwjfJYJvx3dbI,1590
+chatterer/examples/transcription_api.py,sha256=WUs12qHH4616eLMQDHOiyVGxaXstTpgeE47djYyli6c,3897
+chatterer/examples/upstage_parser.py,sha256=TrfeSIiF0xklhFCknop22TIOVibI4CJ_UKj5-lD8c8E,3487
+chatterer/examples/webpage_to_markdown.py,sha256=DnZfQ-trXBiOiszA2tMlgadgKH-ObTi6l4gGloT-cQw,2846
 chatterer/strategies/__init__.py,sha256=SdOggbmHpw4f7Njwy-T8q64e91OLOUp1k0a0ozZd4qI,221
-chatterer/strategies/atom_of_thoughts.py,sha256=
+chatterer/strategies/atom_of_thoughts.py,sha256=pUhqt47YlzBIVNRh0UebeBwuJ0J94Ge6yZgXxrsiDPE,40884
 chatterer/strategies/base.py,sha256=b2gMPqodp97OP1dkHfj0UqixjdjVhmTw_V5qJ7i2S6g,427
 chatterer/tools/__init__.py,sha256=m3PRK9H5vOhk-2gG9W2eg8CYBlEn-K9-eaulOu91bgo,1474
 chatterer/tools/caption_markdown_images.py,sha256=r4QajHYuL4mdyYQXP1vQcNmqKN8lxBf5y0VKELXILOI,15392
-chatterer/tools/convert_pdf_to_markdown.py,sha256=
-chatterer/tools/convert_to_text.py,sha256=
-chatterer/tools/upstage_document_parser.py,sha256=
+chatterer/tools/convert_pdf_to_markdown.py,sha256=Q5ln-_av2eor0A2LkQG7-IgyQKJ79wwrSOvv5Jncfso,18901
+chatterer/tools/convert_to_text.py,sha256=WHQ0Xj4Ri_jYbFjzTx3mjmvJ9U8bAv4wGaKEVC88Nlk,15457
+chatterer/tools/upstage_document_parser.py,sha256=CXslVYAHDK8EV8jtUAUWzf8rxU4qilSnW8_dhAxHOE8,33142
 chatterer/tools/webpage_to_markdown.py,sha256=ADH4sqM6iquJR7HU6umMQ5qO7EvcbNutuchXDpAcxAo,31961
 chatterer/tools/youtube.py,sha256=Hl2MMXJwwZ-i6_YAq0zh0rN4LHpYOb1Rt88P1gMjlLE,6081
 chatterer/tools/citation_chunking/__init__.py,sha256=gG7Fnkkp28UpcWMbfMY_4gqzZSZ8QzlhalHBoeoq7K0,82
@@ -35,10 +35,10 @@ chatterer/tools/citation_chunking/utils.py,sha256=M4pH2-UIE1VLzQLXDqjEe4L3Xcy0e0
 chatterer/utils/__init__.py,sha256=2v-lB2dqHgBlGcyaKKHc_hcyeH_AVoOddpr0STF7YAw,341
 chatterer/utils/base64_image.py,sha256=m_qAT3ERBiq8D-H4H9Z7rLfL31_BiPmV_m4uQ5XRLs0,11124
 chatterer/utils/bytesio.py,sha256=3MC2atOOFKo5YxuReo_y_t8Wem9p2Y1ahC5M2lGclwI,2618
-chatterer/utils/code_agent.py,sha256=
+chatterer/utils/code_agent.py,sha256=7ka_WRI4TQmZ5H46mjY3hI6RO_pxw6pg3LAxjgW4AbM,10495
 chatterer/utils/imghdr.py,sha256=6JhJMXD4MZ0dQolT2VM87YrRYm3hPf3RTEWnP4lYRVc,3842
-chatterer-0.1.
-chatterer-0.1.
-chatterer-0.1.
-chatterer-0.1.
-chatterer-0.1.
+chatterer-0.1.21.dist-info/METADATA,sha256=j3QGPYik-jm75MHIfAvbvUbv-EaxvlVKdEIc7_dMUjk,11826
+chatterer-0.1.21.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
+chatterer-0.1.21.dist-info/entry_points.txt,sha256=KhxL2dctnZalnDSmPoB5dZBBa9hZpJETW3C5xkoRaW4,554
+chatterer-0.1.21.dist-info/top_level.txt,sha256=7nSQKP0bHxPRc7HyzdbKsJdkvPgYD0214o6slRizv9s,10
+chatterer-0.1.21.dist-info/RECORD,,

{chatterer-0.1.19.dist-info → chatterer-0.1.21.dist-info}/entry_points.txt
File without changes

{chatterer-0.1.19.dist-info → chatterer-0.1.21.dist-info}/top_level.txt
File without changes