PyPI - chatterer - Versions diffs - 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl - Mend

chatterer 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

chatterer/__init__.py +41 -4
chatterer/common_types/__init__.py +21 -0
chatterer/common_types/io.py +19 -0
chatterer/interactive.py +353 -0
chatterer/language_model.py +129 -252
chatterer/messages.py +13 -1
chatterer/tools/__init__.py +27 -9
chatterer/tools/{webpage_to_markdown/utils.py → caption_markdown_images.py} +158 -108
chatterer/tools/convert_pdf_to_markdown.py +302 -0
chatterer/tools/convert_to_text.py +49 -65
chatterer/tools/upstage_document_parser.py +705 -0
chatterer/tools/{webpage_to_markdown/playwright_bot.py → webpage_to_markdown.py} +197 -107
chatterer/tools/youtube.py +2 -1
chatterer/utils/__init__.py +4 -1
chatterer/utils/{image.py → base64_image.py} +56 -62
chatterer/utils/bytesio.py +59 -0
chatterer/utils/cli.py +476 -0
chatterer/utils/code_agent.py +137 -38
chatterer/utils/imghdr.py +148 -0
chatterer-0.1.14.dist-info/METADATA +387 -0
chatterer-0.1.14.dist-info/RECORD +34 -0
chatterer/tools/webpage_to_markdown/__init__.py +0 -4
chatterer-0.1.12.dist-info/METADATA +0 -170
chatterer-0.1.12.dist-info/RECORD +0 -27
{chatterer-0.1.12.dist-info → chatterer-0.1.14.dist-info}/WHEEL +0 -0
{chatterer-0.1.12.dist-info → chatterer-0.1.14.dist-info}/top_level.txt +0 -0

chatterer/tools/convert_to_text.py CHANGED Viewed

@@ -3,14 +3,12 @@ import importlib
 import os
 import re
 import site
-from contextlib import contextmanager, suppress
 from fnmatch import fnmatch
-from io import BufferedReader, BufferedWriter, BytesIO, StringIO, TextIOWrapper
 from pathlib import Path
 from typing import (
     TYPE_CHECKING,
     Callable,
-    Iterator,
+    Iterable,
     NamedTuple,
     NotRequired,
     Optional,
@@ -20,6 +18,10 @@ from typing import (
     TypedDict,
 )
+from ..common_types.io import PathOrReadable
+from ..utils.bytesio import read_bytes_stream
+from .convert_pdf_to_markdown import extract_text_from_pdf
 if TYPE_CHECKING:
     from bs4 import Tag
     from openai import OpenAI
@@ -38,20 +40,6 @@ type FileTree = dict[str, Optional[FileTree]]
 # Type aliases for callback functions and file descriptors
 CodeLanguageCallback: TypeAlias = Callable[["Tag"], Optional[str]]
-FileDescriptorOrPath: TypeAlias = int | str | bytes | os.PathLike[str] | os.PathLike[bytes]
-# Type aliases for different types of IO objects
-BytesReadable: TypeAlias = BytesIO | BufferedReader
-BytesWritable: TypeAlias = BytesIO | BufferedWriter
-StringReadable: TypeAlias = StringIO | TextIOWrapper
-StringWritable: TypeAlias = StringIO | TextIOWrapper
-# Combined type aliases for readable and writable objects
-Readable: TypeAlias = BytesReadable | StringReadable
-Writable: TypeAlias = BytesWritable | StringWritable
-# Type alias for path or readable object
-PathOrReadable: TypeAlias = FileDescriptorOrPath | Readable
 class HtmlToMarkdownOptions(TypedDict):
@@ -135,7 +123,13 @@ class CodeSnippets(NamedTuple):
     base_dir: Path
     @classmethod
-    def from_path_or_pkgname(cls, path_or_pkgname: str, ban_file_patterns: Optional[list[str]] = None) -> Self:
+    def from_path_or_pkgname(
+        cls,
+        path_or_pkgname: str,
+        glob_patterns: str | list[str] = "*.py",
+        case_sensitive: bool = False,
+        ban_file_patterns: Optional[list[str]] = None,
+    ) -> Self:
         """
         Creates a CodeSnippets instance from a file path or package name.
@@ -146,7 +140,12 @@ class CodeSnippets(NamedTuple):
         Returns:
             A new CodeSnippets instance with extracted code snippets.
         """
-        paths: list[Path] = _get_pyscript_paths(path_or_pkgname=path_or_pkgname, ban_fn_patterns=ban_file_patterns)
+        paths: list[Path] = _get_filepaths(
+            path_or_pkgname=path_or_pkgname,
+            glob_patterns=glob_patterns,
+            case_sensitive=case_sensitive,
+            ban_fn_patterns=ban_file_patterns,
+        )
         snippets_text: str = "".join(_get_a_snippet(p) for p in paths)
         return cls(
             paths=paths,
@@ -223,7 +222,7 @@ def html_to_markdown(html: str, options: Optional[HtmlToMarkdownOptions]) -> str
     return str(markdownify(html, **(options or {})))  # pyright: ignore[reportUnknownArgumentType]
-def pdf_to_text(path_or_file: PathOrReadable) -> str:
+def pdf_to_text(path_or_file: PathOrReadable, page_indices: Iterable[int] | int | None = None) -> str:
     """
     Convert a PDF file to plain text.
@@ -231,6 +230,9 @@ def pdf_to_text(path_or_file: PathOrReadable) -> str:
     Args:
         path_or_file: Path to a PDF file or a readable object containing PDF data.
+        page_indices: Optional list of page indices to extract. If None, all pages are extracted.
+            If an integer is provided, it extracts that specific page.
+            If a list is provided, it extracts the specified pages.
     Returns:
         str: Extracted text with page markers.
@@ -240,19 +242,14 @@ def pdf_to_text(path_or_file: PathOrReadable) -> str:
     """
     from pymupdf import Document  # pyright: ignore[reportMissingTypeStubs]
-    with _open_stream(path_or_file) as stream:
+    with read_bytes_stream(path_or_file) as stream:
         if stream is None:
             raise FileNotFoundError(path_or_file)
-        return "\n".join(
-            f"<!-- Page {page_no} -->\n{text.strip()}\n"
-            for page_no, text in enumerate(
-                (
-                    page.get_textpage().extractText()  # pyright: ignore[reportUnknownMemberType]
-                    for page in Document(stream=stream.read())
-                ),
-                1,
+        with Document(stream=stream.read()) as doc:
+            return "\n".join(
+                f"<!-- Page {page_no} -->\n{text}\n"
+                for page_no, text in extract_text_from_pdf(doc, page_indices).items()
             )
-        )
 def anything_to_markdown(
@@ -400,16 +397,23 @@ def _get_base_dir(target_files: Sequence[Path]) -> Path:
     return Path(os.path.commonpath(target_files))
-def _get_pyscript_paths(path_or_pkgname: str, ban_fn_patterns: Optional[list[str]] = None) -> list[Path]:
+def _get_filepaths(
+    path_or_pkgname: str,
+    glob_patterns: str | list[str] = "*.py",
+    case_sensitive: bool = False,
+    ban_fn_patterns: Optional[list[str]] = None,
+) -> list[Path]:
     """
-    Gets paths to Python script files from a directory, file, or package name.
+    Gets paths to files from a directory, file, or Python package name.
-    If path_or_pkgname is a directory, finds all .py files recursively.
+    If path_or_pkgname is a directory, finds all `glob_pattern` matching files recursively.
     If it's a file, returns just that file.
     If it's a package name, imports the package and finds all .py files in its directory.
     Args:
         path_or_pkgname: Path to directory/file or package name.
+        glob_pattern: Pattern to match files.
+        case_sensitive: Whether to match files case-sensitively.
         ban_fn_patterns: Optional list of patterns to exclude files.
     Returns:
@@ -418,7 +422,18 @@ def _get_pyscript_paths(path_or_pkgname: str, ban_fn_patterns: Optional[list[str
     path = Path(path_or_pkgname)
     pypaths: list[Path]
     if path.is_dir():
-        pypaths = list(path.rglob("*.py", case_sensitive=False))
+        glob_patterns = glob_patterns if isinstance(glob_patterns, (tuple, list)) else [glob_patterns]
+        pypaths = []
+        for pattern in glob_patterns:
+            if "**" in pattern:
+                regex = _pattern_to_regex(pattern)
+                pypaths.extend(
+                    p for p in path.rglob("**/*", case_sensitive=case_sensitive) if regex.match(p.as_posix())
+                )
+            else:
+                pypaths += list(path.rglob(pattern, case_sensitive=case_sensitive))
+        # pypaths = list(path.rglob(glob_pattern, case_sensitive=case_sensitive))
     elif path.is_file():
         pypaths = [path]
     else:
@@ -430,34 +445,3 @@ def _get_pyscript_paths(path_or_pkgname: str, ban_fn_patterns: Optional[list[str
             if p.is_file()
         ]
     return [p for p in pypaths if not ban_fn_patterns or not _is_banned(p, ban_fn_patterns)]
-@contextmanager
-def _open_stream(
-    path_or_file: PathOrReadable,
-) -> Iterator[Optional[BytesReadable]]:
-    """
-    Context manager for opening a file or using an existing stream.
-    Handles different types of input (file paths, byte streams, string streams)
-    and yields a BytesReadable object that can be used to read binary data.
-    Args:
-        path_or_file: File path or readable object.
-    Yields:
-        Optional[BytesReadable]: A readable binary stream or None if opening fails.
-    """
-    stream: Optional[BytesReadable] = None
-    try:
-        with suppress(BaseException):
-            if isinstance(path_or_file, BytesReadable):
-                stream = path_or_file
-            elif isinstance(path_or_file, StringReadable):
-                stream = BytesIO(path_or_file.read().encode("utf-8"))
-            else:
-                stream = open(path_or_file, "rb")
-        yield stream
-    finally:
-        if stream is not None:
-            stream.close()

chatterer 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

chatterer 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl