chatterer 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatterer/__init__.py +41 -4
- chatterer/common_types/__init__.py +21 -0
- chatterer/common_types/io.py +19 -0
- chatterer/interactive.py +353 -0
- chatterer/language_model.py +129 -252
- chatterer/messages.py +13 -1
- chatterer/tools/__init__.py +27 -9
- chatterer/tools/{webpage_to_markdown/utils.py → caption_markdown_images.py} +158 -108
- chatterer/tools/convert_pdf_to_markdown.py +302 -0
- chatterer/tools/convert_to_text.py +49 -65
- chatterer/tools/upstage_document_parser.py +705 -0
- chatterer/tools/{webpage_to_markdown/playwright_bot.py → webpage_to_markdown.py} +197 -107
- chatterer/tools/youtube.py +2 -1
- chatterer/utils/__init__.py +4 -1
- chatterer/utils/{image.py → base64_image.py} +56 -62
- chatterer/utils/bytesio.py +59 -0
- chatterer/utils/cli.py +476 -0
- chatterer/utils/code_agent.py +137 -38
- chatterer/utils/imghdr.py +148 -0
- chatterer-0.1.14.dist-info/METADATA +387 -0
- chatterer-0.1.14.dist-info/RECORD +34 -0
- chatterer/tools/webpage_to_markdown/__init__.py +0 -4
- chatterer-0.1.12.dist-info/METADATA +0 -170
- chatterer-0.1.12.dist-info/RECORD +0 -27
- {chatterer-0.1.12.dist-info → chatterer-0.1.14.dist-info}/WHEEL +0 -0
- {chatterer-0.1.12.dist-info → chatterer-0.1.14.dist-info}/top_level.txt +0 -0
@@ -3,14 +3,12 @@ import importlib
|
|
3
3
|
import os
|
4
4
|
import re
|
5
5
|
import site
|
6
|
-
from contextlib import contextmanager, suppress
|
7
6
|
from fnmatch import fnmatch
|
8
|
-
from io import BufferedReader, BufferedWriter, BytesIO, StringIO, TextIOWrapper
|
9
7
|
from pathlib import Path
|
10
8
|
from typing import (
|
11
9
|
TYPE_CHECKING,
|
12
10
|
Callable,
|
13
|
-
|
11
|
+
Iterable,
|
14
12
|
NamedTuple,
|
15
13
|
NotRequired,
|
16
14
|
Optional,
|
@@ -20,6 +18,10 @@ from typing import (
|
|
20
18
|
TypedDict,
|
21
19
|
)
|
22
20
|
|
21
|
+
from ..common_types.io import PathOrReadable
|
22
|
+
from ..utils.bytesio import read_bytes_stream
|
23
|
+
from .convert_pdf_to_markdown import extract_text_from_pdf
|
24
|
+
|
23
25
|
if TYPE_CHECKING:
|
24
26
|
from bs4 import Tag
|
25
27
|
from openai import OpenAI
|
@@ -38,20 +40,6 @@ type FileTree = dict[str, Optional[FileTree]]
|
|
38
40
|
|
39
41
|
# Type aliases for callback functions and file descriptors
|
40
42
|
CodeLanguageCallback: TypeAlias = Callable[["Tag"], Optional[str]]
|
41
|
-
FileDescriptorOrPath: TypeAlias = int | str | bytes | os.PathLike[str] | os.PathLike[bytes]
|
42
|
-
|
43
|
-
# Type aliases for different types of IO objects
|
44
|
-
BytesReadable: TypeAlias = BytesIO | BufferedReader
|
45
|
-
BytesWritable: TypeAlias = BytesIO | BufferedWriter
|
46
|
-
StringReadable: TypeAlias = StringIO | TextIOWrapper
|
47
|
-
StringWritable: TypeAlias = StringIO | TextIOWrapper
|
48
|
-
|
49
|
-
# Combined type aliases for readable and writable objects
|
50
|
-
Readable: TypeAlias = BytesReadable | StringReadable
|
51
|
-
Writable: TypeAlias = BytesWritable | StringWritable
|
52
|
-
|
53
|
-
# Type alias for path or readable object
|
54
|
-
PathOrReadable: TypeAlias = FileDescriptorOrPath | Readable
|
55
43
|
|
56
44
|
|
57
45
|
class HtmlToMarkdownOptions(TypedDict):
|
@@ -135,7 +123,13 @@ class CodeSnippets(NamedTuple):
|
|
135
123
|
base_dir: Path
|
136
124
|
|
137
125
|
@classmethod
|
138
|
-
def from_path_or_pkgname(
|
126
|
+
def from_path_or_pkgname(
|
127
|
+
cls,
|
128
|
+
path_or_pkgname: str,
|
129
|
+
glob_patterns: str | list[str] = "*.py",
|
130
|
+
case_sensitive: bool = False,
|
131
|
+
ban_file_patterns: Optional[list[str]] = None,
|
132
|
+
) -> Self:
|
139
133
|
"""
|
140
134
|
Creates a CodeSnippets instance from a file path or package name.
|
141
135
|
|
@@ -146,7 +140,12 @@ class CodeSnippets(NamedTuple):
|
|
146
140
|
Returns:
|
147
141
|
A new CodeSnippets instance with extracted code snippets.
|
148
142
|
"""
|
149
|
-
paths: list[Path] =
|
143
|
+
paths: list[Path] = _get_filepaths(
|
144
|
+
path_or_pkgname=path_or_pkgname,
|
145
|
+
glob_patterns=glob_patterns,
|
146
|
+
case_sensitive=case_sensitive,
|
147
|
+
ban_fn_patterns=ban_file_patterns,
|
148
|
+
)
|
150
149
|
snippets_text: str = "".join(_get_a_snippet(p) for p in paths)
|
151
150
|
return cls(
|
152
151
|
paths=paths,
|
@@ -223,7 +222,7 @@ def html_to_markdown(html: str, options: Optional[HtmlToMarkdownOptions]) -> str
|
|
223
222
|
return str(markdownify(html, **(options or {}))) # pyright: ignore[reportUnknownArgumentType]
|
224
223
|
|
225
224
|
|
226
|
-
def pdf_to_text(path_or_file: PathOrReadable) -> str:
|
225
|
+
def pdf_to_text(path_or_file: PathOrReadable, page_indices: Iterable[int] | int | None = None) -> str:
|
227
226
|
"""
|
228
227
|
Convert a PDF file to plain text.
|
229
228
|
|
@@ -231,6 +230,9 @@ def pdf_to_text(path_or_file: PathOrReadable) -> str:
|
|
231
230
|
|
232
231
|
Args:
|
233
232
|
path_or_file: Path to a PDF file or a readable object containing PDF data.
|
233
|
+
page_indices: Optional list of page indices to extract. If None, all pages are extracted.
|
234
|
+
If an integer is provided, it extracts that specific page.
|
235
|
+
If a list is provided, it extracts the specified pages.
|
234
236
|
|
235
237
|
Returns:
|
236
238
|
str: Extracted text with page markers.
|
@@ -240,19 +242,14 @@ def pdf_to_text(path_or_file: PathOrReadable) -> str:
|
|
240
242
|
"""
|
241
243
|
from pymupdf import Document # pyright: ignore[reportMissingTypeStubs]
|
242
244
|
|
243
|
-
with
|
245
|
+
with read_bytes_stream(path_or_file) as stream:
|
244
246
|
if stream is None:
|
245
247
|
raise FileNotFoundError(path_or_file)
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
(
|
250
|
-
page.get_textpage().extractText() # pyright: ignore[reportUnknownMemberType]
|
251
|
-
for page in Document(stream=stream.read())
|
252
|
-
),
|
253
|
-
1,
|
248
|
+
with Document(stream=stream.read()) as doc:
|
249
|
+
return "\n".join(
|
250
|
+
f"<!-- Page {page_no} -->\n{text}\n"
|
251
|
+
for page_no, text in extract_text_from_pdf(doc, page_indices).items()
|
254
252
|
)
|
255
|
-
)
|
256
253
|
|
257
254
|
|
258
255
|
def anything_to_markdown(
|
@@ -400,16 +397,23 @@ def _get_base_dir(target_files: Sequence[Path]) -> Path:
|
|
400
397
|
return Path(os.path.commonpath(target_files))
|
401
398
|
|
402
399
|
|
403
|
-
def
|
400
|
+
def _get_filepaths(
|
401
|
+
path_or_pkgname: str,
|
402
|
+
glob_patterns: str | list[str] = "*.py",
|
403
|
+
case_sensitive: bool = False,
|
404
|
+
ban_fn_patterns: Optional[list[str]] = None,
|
405
|
+
) -> list[Path]:
|
404
406
|
"""
|
405
|
-
Gets paths to
|
407
|
+
Gets paths to files from a directory, file, or Python package name.
|
406
408
|
|
407
|
-
If path_or_pkgname is a directory, finds all
|
409
|
+
If path_or_pkgname is a directory, finds all `glob_pattern` matching files recursively.
|
408
410
|
If it's a file, returns just that file.
|
409
411
|
If it's a package name, imports the package and finds all .py files in its directory.
|
410
412
|
|
411
413
|
Args:
|
412
414
|
path_or_pkgname: Path to directory/file or package name.
|
415
|
+
glob_pattern: Pattern to match files.
|
416
|
+
case_sensitive: Whether to match files case-sensitively.
|
413
417
|
ban_fn_patterns: Optional list of patterns to exclude files.
|
414
418
|
|
415
419
|
Returns:
|
@@ -418,7 +422,18 @@ def _get_pyscript_paths(path_or_pkgname: str, ban_fn_patterns: Optional[list[str
|
|
418
422
|
path = Path(path_or_pkgname)
|
419
423
|
pypaths: list[Path]
|
420
424
|
if path.is_dir():
|
421
|
-
|
425
|
+
glob_patterns = glob_patterns if isinstance(glob_patterns, (tuple, list)) else [glob_patterns]
|
426
|
+
pypaths = []
|
427
|
+
for pattern in glob_patterns:
|
428
|
+
if "**" in pattern:
|
429
|
+
regex = _pattern_to_regex(pattern)
|
430
|
+
pypaths.extend(
|
431
|
+
p for p in path.rglob("**/*", case_sensitive=case_sensitive) if regex.match(p.as_posix())
|
432
|
+
)
|
433
|
+
else:
|
434
|
+
pypaths += list(path.rglob(pattern, case_sensitive=case_sensitive))
|
435
|
+
|
436
|
+
# pypaths = list(path.rglob(glob_pattern, case_sensitive=case_sensitive))
|
422
437
|
elif path.is_file():
|
423
438
|
pypaths = [path]
|
424
439
|
else:
|
@@ -430,34 +445,3 @@ def _get_pyscript_paths(path_or_pkgname: str, ban_fn_patterns: Optional[list[str
|
|
430
445
|
if p.is_file()
|
431
446
|
]
|
432
447
|
return [p for p in pypaths if not ban_fn_patterns or not _is_banned(p, ban_fn_patterns)]
|
433
|
-
|
434
|
-
|
435
|
-
@contextmanager
|
436
|
-
def _open_stream(
|
437
|
-
path_or_file: PathOrReadable,
|
438
|
-
) -> Iterator[Optional[BytesReadable]]:
|
439
|
-
"""
|
440
|
-
Context manager for opening a file or using an existing stream.
|
441
|
-
|
442
|
-
Handles different types of input (file paths, byte streams, string streams)
|
443
|
-
and yields a BytesReadable object that can be used to read binary data.
|
444
|
-
|
445
|
-
Args:
|
446
|
-
path_or_file: File path or readable object.
|
447
|
-
|
448
|
-
Yields:
|
449
|
-
Optional[BytesReadable]: A readable binary stream or None if opening fails.
|
450
|
-
"""
|
451
|
-
stream: Optional[BytesReadable] = None
|
452
|
-
try:
|
453
|
-
with suppress(BaseException):
|
454
|
-
if isinstance(path_or_file, BytesReadable):
|
455
|
-
stream = path_or_file
|
456
|
-
elif isinstance(path_or_file, StringReadable):
|
457
|
-
stream = BytesIO(path_or_file.read().encode("utf-8"))
|
458
|
-
else:
|
459
|
-
stream = open(path_or_file, "rb")
|
460
|
-
yield stream
|
461
|
-
finally:
|
462
|
-
if stream is not None:
|
463
|
-
stream.close()
|