PyPI - chatterer - Versions diffs - 0.1.13__py3-none-any.whl → 0.1.16__py3-none-any.whl - Mend

chatterer 0.1.13__py3-none-any.whl → 0.1.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

chatterer/__init__.py +36 -5
chatterer/interactive.py +692 -0
chatterer/language_model.py +217 -261
chatterer/messages.py +13 -1
chatterer/tools/__init__.py +26 -15
chatterer/tools/{webpage_to_markdown/utils.py → caption_markdown_images.py} +158 -108
chatterer/tools/convert_pdf_to_markdown.py +302 -0
chatterer/tools/convert_to_text.py +45 -16
chatterer/tools/upstage_document_parser.py +481 -214
chatterer/tools/{webpage_to_markdown/playwright_bot.py → webpage_to_markdown.py} +197 -107
chatterer/tools/youtube.py +2 -1
chatterer/utils/__init__.py +1 -1
chatterer/utils/{image.py → base64_image.py} +56 -62
chatterer/utils/code_agent.py +137 -38
chatterer/utils/imghdr.py +148 -0
chatterer-0.1.16.dist-info/METADATA +392 -0
chatterer-0.1.16.dist-info/RECORD +33 -0
{chatterer-0.1.13.dist-info → chatterer-0.1.16.dist-info}/WHEEL +1 -1
chatterer/tools/webpage_to_markdown/__init__.py +0 -4
chatterer-0.1.13.dist-info/METADATA +0 -171
chatterer-0.1.13.dist-info/RECORD +0 -31
{chatterer-0.1.13.dist-info → chatterer-0.1.16.dist-info}/top_level.txt +0 -0

chatterer/tools/convert_to_text.py CHANGED Viewed

@@ -8,6 +8,7 @@ from pathlib import Path
 from typing import (
     TYPE_CHECKING,
     Callable,
+    Iterable,
     NamedTuple,
     NotRequired,
     Optional,
@@ -19,6 +20,7 @@ from typing import (
 from ..common_types.io import PathOrReadable
 from ..utils.bytesio import read_bytes_stream
+from .convert_pdf_to_markdown import extract_text_from_pdf
 if TYPE_CHECKING:
     from bs4 import Tag
@@ -121,7 +123,13 @@ class CodeSnippets(NamedTuple):
     base_dir: Path
     @classmethod
-    def from_path_or_pkgname(cls, path_or_pkgname: str, ban_file_patterns: Optional[list[str]] = None) -> Self:
+    def from_path_or_pkgname(
+        cls,
+        path_or_pkgname: str,
+        glob_patterns: str | list[str] = "*.py",
+        case_sensitive: bool = False,
+        ban_file_patterns: Optional[list[str]] = None,
+    ) -> Self:
         """
         Creates a CodeSnippets instance from a file path or package name.
@@ -132,7 +140,12 @@ class CodeSnippets(NamedTuple):
         Returns:
             A new CodeSnippets instance with extracted code snippets.
         """
-        paths: list[Path] = _get_pyscript_paths(path_or_pkgname=path_or_pkgname, ban_fn_patterns=ban_file_patterns)
+        paths: list[Path] = _get_filepaths(
+            path_or_pkgname=path_or_pkgname,
+            glob_patterns=glob_patterns,
+            case_sensitive=case_sensitive,
+            ban_fn_patterns=ban_file_patterns,
+        )
         snippets_text: str = "".join(_get_a_snippet(p) for p in paths)
         return cls(
             paths=paths,
@@ -209,7 +222,7 @@ def html_to_markdown(html: str, options: Optional[HtmlToMarkdownOptions]) -> str
     return str(markdownify(html, **(options or {})))  # pyright: ignore[reportUnknownArgumentType]
-def pdf_to_text(path_or_file: PathOrReadable) -> str:
+def pdf_to_text(path_or_file: PathOrReadable, page_indices: Iterable[int] | int | None = None) -> str:
     """
     Convert a PDF file to plain text.
@@ -217,6 +230,9 @@ def pdf_to_text(path_or_file: PathOrReadable) -> str:
     Args:
         path_or_file: Path to a PDF file or a readable object containing PDF data.
+        page_indices: Optional list of page indices to extract. If None, all pages are extracted.
+            If an integer is provided, it extracts that specific page.
+            If a list is provided, it extracts the specified pages.
     Returns:
         str: Extracted text with page markers.
@@ -229,16 +245,11 @@ def pdf_to_text(path_or_file: PathOrReadable) -> str:
     with read_bytes_stream(path_or_file) as stream:
         if stream is None:
             raise FileNotFoundError(path_or_file)
-        return "\n".join(
-            f"<!-- Page {page_no} -->\n{text.strip()}\n"
-            for page_no, text in enumerate(
-                (
-                    page.get_textpage().extractText()  # pyright: ignore[reportUnknownMemberType]
-                    for page in Document(stream=stream.read())
-                ),
-                1,
+        with Document(stream=stream.read()) as doc:
+            return "\n".join(
+                f"<!-- Page {page_no} -->\n{text}\n"
+                for page_no, text in extract_text_from_pdf(doc, page_indices).items()
             )
-        )
 def anything_to_markdown(
@@ -386,16 +397,23 @@ def _get_base_dir(target_files: Sequence[Path]) -> Path:
     return Path(os.path.commonpath(target_files))
-def _get_pyscript_paths(path_or_pkgname: str, ban_fn_patterns: Optional[list[str]] = None) -> list[Path]:
+def _get_filepaths(
+    path_or_pkgname: str,
+    glob_patterns: str | list[str] = "*.py",
+    case_sensitive: bool = False,
+    ban_fn_patterns: Optional[list[str]] = None,
+) -> list[Path]:
     """
-    Gets paths to Python script files from a directory, file, or package name.
+    Gets paths to files from a directory, file, or Python package name.
-    If path_or_pkgname is a directory, finds all .py files recursively.
+    If path_or_pkgname is a directory, finds all `glob_pattern` matching files recursively.
     If it's a file, returns just that file.
     If it's a package name, imports the package and finds all .py files in its directory.
     Args:
         path_or_pkgname: Path to directory/file or package name.
+        glob_pattern: Pattern to match files.
+        case_sensitive: Whether to match files case-sensitively.
         ban_fn_patterns: Optional list of patterns to exclude files.
     Returns:
@@ -404,7 +422,18 @@ def _get_pyscript_paths(path_or_pkgname: str, ban_fn_patterns: Optional[list[str
     path = Path(path_or_pkgname)
     pypaths: list[Path]
     if path.is_dir():
-        pypaths = list(path.rglob("*.py", case_sensitive=False))
+        glob_patterns = glob_patterns if isinstance(glob_patterns, (tuple, list)) else [glob_patterns]
+        pypaths = []
+        for pattern in glob_patterns:
+            if "**" in pattern:
+                regex = _pattern_to_regex(pattern)
+                pypaths.extend(
+                    p for p in path.rglob("**/*", case_sensitive=case_sensitive) if regex.match(p.as_posix())
+                )
+            else:
+                pypaths += list(path.rglob(pattern, case_sensitive=case_sensitive))
+        # pypaths = list(path.rglob(glob_pattern, case_sensitive=case_sensitive))
     elif path.is_file():
         pypaths = [path]
     else:

chatterer 0.1.13__py3-none-any.whl → 0.1.16__py3-none-any.whl

chatterer 0.1.13__py3-none-any.whl → 0.1.16__py3-none-any.whl