chatterer 0.1.13__py3-none-any.whl → 0.1.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatterer/__init__.py +36 -5
- chatterer/interactive.py +692 -0
- chatterer/language_model.py +217 -261
- chatterer/messages.py +13 -1
- chatterer/tools/__init__.py +26 -15
- chatterer/tools/{webpage_to_markdown/utils.py → caption_markdown_images.py} +158 -108
- chatterer/tools/convert_pdf_to_markdown.py +302 -0
- chatterer/tools/convert_to_text.py +45 -16
- chatterer/tools/upstage_document_parser.py +481 -214
- chatterer/tools/{webpage_to_markdown/playwright_bot.py → webpage_to_markdown.py} +197 -107
- chatterer/tools/youtube.py +2 -1
- chatterer/utils/__init__.py +1 -1
- chatterer/utils/{image.py → base64_image.py} +56 -62
- chatterer/utils/code_agent.py +137 -38
- chatterer/utils/imghdr.py +148 -0
- chatterer-0.1.16.dist-info/METADATA +392 -0
- chatterer-0.1.16.dist-info/RECORD +33 -0
- {chatterer-0.1.13.dist-info → chatterer-0.1.16.dist-info}/WHEEL +1 -1
- chatterer/tools/webpage_to_markdown/__init__.py +0 -4
- chatterer-0.1.13.dist-info/METADATA +0 -171
- chatterer-0.1.13.dist-info/RECORD +0 -31
- {chatterer-0.1.13.dist-info → chatterer-0.1.16.dist-info}/top_level.txt +0 -0
@@ -8,6 +8,7 @@ from pathlib import Path
|
|
8
8
|
from typing import (
|
9
9
|
TYPE_CHECKING,
|
10
10
|
Callable,
|
11
|
+
Iterable,
|
11
12
|
NamedTuple,
|
12
13
|
NotRequired,
|
13
14
|
Optional,
|
@@ -19,6 +20,7 @@ from typing import (
|
|
19
20
|
|
20
21
|
from ..common_types.io import PathOrReadable
|
21
22
|
from ..utils.bytesio import read_bytes_stream
|
23
|
+
from .convert_pdf_to_markdown import extract_text_from_pdf
|
22
24
|
|
23
25
|
if TYPE_CHECKING:
|
24
26
|
from bs4 import Tag
|
@@ -121,7 +123,13 @@ class CodeSnippets(NamedTuple):
|
|
121
123
|
base_dir: Path
|
122
124
|
|
123
125
|
@classmethod
|
124
|
-
def from_path_or_pkgname(
|
126
|
+
def from_path_or_pkgname(
|
127
|
+
cls,
|
128
|
+
path_or_pkgname: str,
|
129
|
+
glob_patterns: str | list[str] = "*.py",
|
130
|
+
case_sensitive: bool = False,
|
131
|
+
ban_file_patterns: Optional[list[str]] = None,
|
132
|
+
) -> Self:
|
125
133
|
"""
|
126
134
|
Creates a CodeSnippets instance from a file path or package name.
|
127
135
|
|
@@ -132,7 +140,12 @@ class CodeSnippets(NamedTuple):
|
|
132
140
|
Returns:
|
133
141
|
A new CodeSnippets instance with extracted code snippets.
|
134
142
|
"""
|
135
|
-
paths: list[Path] =
|
143
|
+
paths: list[Path] = _get_filepaths(
|
144
|
+
path_or_pkgname=path_or_pkgname,
|
145
|
+
glob_patterns=glob_patterns,
|
146
|
+
case_sensitive=case_sensitive,
|
147
|
+
ban_fn_patterns=ban_file_patterns,
|
148
|
+
)
|
136
149
|
snippets_text: str = "".join(_get_a_snippet(p) for p in paths)
|
137
150
|
return cls(
|
138
151
|
paths=paths,
|
@@ -209,7 +222,7 @@ def html_to_markdown(html: str, options: Optional[HtmlToMarkdownOptions]) -> str
|
|
209
222
|
return str(markdownify(html, **(options or {}))) # pyright: ignore[reportUnknownArgumentType]
|
210
223
|
|
211
224
|
|
212
|
-
def pdf_to_text(path_or_file: PathOrReadable) -> str:
|
225
|
+
def pdf_to_text(path_or_file: PathOrReadable, page_indices: Iterable[int] | int | None = None) -> str:
|
213
226
|
"""
|
214
227
|
Convert a PDF file to plain text.
|
215
228
|
|
@@ -217,6 +230,9 @@ def pdf_to_text(path_or_file: PathOrReadable) -> str:
|
|
217
230
|
|
218
231
|
Args:
|
219
232
|
path_or_file: Path to a PDF file or a readable object containing PDF data.
|
233
|
+
page_indices: Optional page index or iterable of page indices to extract. If None, all pages are extracted.
|
234
|
+
If an integer is provided, it extracts that specific page.
|
235
|
+
If a list is provided, it extracts the specified pages.
|
220
236
|
|
221
237
|
Returns:
|
222
238
|
str: Extracted text with page markers.
|
@@ -229,16 +245,11 @@ def pdf_to_text(path_or_file: PathOrReadable) -> str:
|
|
229
245
|
with read_bytes_stream(path_or_file) as stream:
|
230
246
|
if stream is None:
|
231
247
|
raise FileNotFoundError(path_or_file)
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
(
|
236
|
-
page.get_textpage().extractText() # pyright: ignore[reportUnknownMemberType]
|
237
|
-
for page in Document(stream=stream.read())
|
238
|
-
),
|
239
|
-
1,
|
248
|
+
with Document(stream=stream.read()) as doc:
|
249
|
+
return "\n".join(
|
250
|
+
f"<!-- Page {page_no} -->\n{text}\n"
|
251
|
+
for page_no, text in extract_text_from_pdf(doc, page_indices).items()
|
240
252
|
)
|
241
|
-
)
|
242
253
|
|
243
254
|
|
244
255
|
def anything_to_markdown(
|
@@ -386,16 +397,23 @@ def _get_base_dir(target_files: Sequence[Path]) -> Path:
|
|
386
397
|
return Path(os.path.commonpath(target_files))
|
387
398
|
|
388
399
|
|
389
|
-
def
|
400
|
+
def _get_filepaths(
|
401
|
+
path_or_pkgname: str,
|
402
|
+
glob_patterns: str | list[str] = "*.py",
|
403
|
+
case_sensitive: bool = False,
|
404
|
+
ban_fn_patterns: Optional[list[str]] = None,
|
405
|
+
) -> list[Path]:
|
390
406
|
"""
|
391
|
-
Gets paths to
|
407
|
+
Gets paths to files from a directory, file, or Python package name.
|
392
408
|
|
393
|
-
If path_or_pkgname is a directory, finds all
|
409
|
+
If path_or_pkgname is a directory, recursively finds all files matching `glob_patterns`.
|
394
410
|
If it's a file, returns just that file.
|
395
411
|
If it's a package name, imports the package and finds all .py files in its directory.
|
396
412
|
|
397
413
|
Args:
|
398
414
|
path_or_pkgname: Path to directory/file or package name.
|
415
|
+
glob_patterns: Glob pattern, or list of glob patterns, used to match files.
|
416
|
+
case_sensitive: Whether to match files case-sensitively.
|
399
417
|
ban_fn_patterns: Optional list of patterns to exclude files.
|
400
418
|
|
401
419
|
Returns:
|
@@ -404,7 +422,18 @@ def _get_pyscript_paths(path_or_pkgname: str, ban_fn_patterns: Optional[list[str
|
|
404
422
|
path = Path(path_or_pkgname)
|
405
423
|
pypaths: list[Path]
|
406
424
|
if path.is_dir():
|
407
|
-
|
425
|
+
glob_patterns = glob_patterns if isinstance(glob_patterns, (tuple, list)) else [glob_patterns]
|
426
|
+
pypaths = []
|
427
|
+
for pattern in glob_patterns:
|
428
|
+
if "**" in pattern:
|
429
|
+
regex = _pattern_to_regex(pattern)
|
430
|
+
pypaths.extend(
|
431
|
+
p for p in path.rglob("**/*", case_sensitive=case_sensitive) if regex.match(p.as_posix())
|
432
|
+
)
|
433
|
+
else:
|
434
|
+
pypaths += list(path.rglob(pattern, case_sensitive=case_sensitive))
|
435
|
+
|
436
|
+
# pypaths = list(path.rglob(glob_pattern, case_sensitive=case_sensitive))
|
408
437
|
elif path.is_file():
|
409
438
|
pypaths = [path]
|
410
439
|
else:
|