chatterer-0.1.26-py3-none-any.whl → chatterer-0.1.28-py3-none-any.whl

This diff compares publicly released versions of the package as they appear in their public registry and is provided for informational purposes only.
Files changed (42)
  1. chatterer/__init__.py +87 -87
  2. chatterer/common_types/__init__.py +21 -21
  3. chatterer/common_types/io.py +19 -19
  4. chatterer/constants.py +5 -0
  5. chatterer/examples/__main__.py +75 -75
  6. chatterer/examples/any2md.py +83 -85
  7. chatterer/examples/pdf2md.py +231 -338
  8. chatterer/examples/pdf2txt.py +52 -54
  9. chatterer/examples/ppt.py +487 -486
  10. chatterer/examples/pw.py +141 -143
  11. chatterer/examples/snippet.py +54 -56
  12. chatterer/examples/transcribe.py +192 -192
  13. chatterer/examples/upstage.py +87 -89
  14. chatterer/examples/web2md.py +80 -80
  15. chatterer/interactive.py +422 -354
  16. chatterer/language_model.py +530 -536
  17. chatterer/messages.py +21 -21
  18. chatterer/tools/__init__.py +46 -46
  19. chatterer/tools/caption_markdown_images.py +388 -384
  20. chatterer/tools/citation_chunking/__init__.py +3 -3
  21. chatterer/tools/citation_chunking/chunks.py +51 -53
  22. chatterer/tools/citation_chunking/citation_chunker.py +117 -118
  23. chatterer/tools/citation_chunking/citations.py +284 -285
  24. chatterer/tools/citation_chunking/prompt.py +157 -157
  25. chatterer/tools/citation_chunking/reference.py +26 -26
  26. chatterer/tools/citation_chunking/utils.py +138 -138
  27. chatterer/tools/convert_pdf_to_markdown.py +634 -645
  28. chatterer/tools/convert_to_text.py +446 -446
  29. chatterer/tools/upstage_document_parser.py +704 -705
  30. chatterer/tools/webpage_to_markdown.py +739 -739
  31. chatterer/tools/youtube.py +146 -147
  32. chatterer/utils/__init__.py +15 -15
  33. chatterer/utils/base64_image.py +349 -350
  34. chatterer/utils/bytesio.py +59 -59
  35. chatterer/utils/code_agent.py +237 -237
  36. chatterer/utils/imghdr.py +145 -145
  37. {chatterer-0.1.26.dist-info → chatterer-0.1.28.dist-info}/METADATA +377 -390
  38. chatterer-0.1.28.dist-info/RECORD +43 -0
  39. chatterer-0.1.26.dist-info/RECORD +0 -42
  40. {chatterer-0.1.26.dist-info → chatterer-0.1.28.dist-info}/WHEEL +0 -0
  41. {chatterer-0.1.26.dist-info → chatterer-0.1.28.dist-info}/entry_points.txt +0 -0
  42. {chatterer-0.1.26.dist-info → chatterer-0.1.28.dist-info}/top_level.txt +0 -0
chatterer/tools/youtube.py
@@ -1,147 +1,146 @@
-import json
-import unicodedata
-import urllib.parse
-from dataclasses import dataclass
-from typing import Any, Optional, Self, cast
-
-import requests
-
-
-def get_youtube_video_details(
-    query: str,
-) -> list[dict[str, Optional[str]]]:
-    """Search for video metadata on YouTube using the given query. Returns a list of dictionaries containing `video_id`, `title`, `channel`, `duration`, `views`, `publish_time`, and `long_desc`."""
-    return [
-        {
-            "video_id": video_id,
-            "title": video.title,
-            "channel": video.channel,
-            "duration": video.duration,
-            "views": video.views,
-            "publish_time": video.publish_time,
-            "long_desc": video.long_desc,
-        }
-        for video in YoutubeSearchResult.from_query(base_url="https://youtube.com", query=query, max_results=10)
-        if (video_id := _get_video_id(video.url_suffix))
-    ]
-
-
-def get_youtube_video_subtitle(video_id: str) -> str:
-    """Get the transcript of a YouTube video using the given video ID."""
-
-    from youtube_transcript_api import YouTubeTranscriptApi  # pyright: ignore[reportPrivateImportUsage]
-
-    get_transcript = YouTubeTranscriptApi.get_transcript  # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
-    list_transcripts = YouTubeTranscriptApi.list_transcripts  # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
-
-    result: str = ""
-    buffer_timestamp: str = "0s"
-    buffer_texts: list[str] = []
-    for entry in get_transcript(video_id, languages=(next(iter(list_transcripts(video_id))).language_code,)):  # pyright: ignore[reportUnknownVariableType]
-        entry = cast(dict[object, object], entry)
-        text: str = str(entry.get("text", "")).strip().replace("\n", " ")
-        if not text:
-            continue
-        if len(buffer_texts) >= 10 or _is_special_char(text) or (buffer_texts and _is_special_char(buffer_texts[-1])):
-            result += f"[{buffer_timestamp}] {'. '.join(buffer_texts)}\n"
-            start = entry.get("start", 0)
-            if start:
-                buffer_timestamp = f"{start:.0f}s"
-            buffer_texts = [text]
-        else:
-            buffer_texts.append(text)
-
-    if buffer_texts:
-        result += f"[{buffer_timestamp}] {' '.join(buffer_texts)}"
-    return result
-
-
-def _get_video_id(suffix: str) -> str:
-    urllib_parse_result = urllib.parse.urlparse(suffix)
-    if urllib_parse_result.path.startswith("/shorts/"):
-        # Fore shorts (/shorts/...) the video ID is in the path
-        parts = urllib_parse_result.path.split("/")
-        if len(parts) < 3:
-            print(f"Failed to get video ID from {suffix}")
-            return ""
-        return parts[2]
-
-    query: str = urllib.parse.urlparse(suffix).query
-    query_strings = urllib.parse.parse_qs(query)
-    if "v" not in query_strings:
-        print(f"Failed to get video ID from {suffix}")
-        return ""
-    return next(iter(query_strings["v"]), "")
-
-
-def _is_special_char(text: str) -> bool:
-    if not text:
-        return False
-    return not unicodedata.category(text[0]).startswith("L")
-
-
-@dataclass
-class YoutubeSearchResult:
-    url_suffix: str
-    id: Optional[str]
-    thumbnails: list[str]
-    title: Optional[str]
-    long_desc: Optional[str]
-    channel: Optional[str]
-    duration: Optional[str]
-    views: Optional[str]
-    publish_time: Optional[str]
-
-    @classmethod
-    def from_query(cls, base_url: str, query: str, max_results: int) -> list[Self]:
-        url: str = f"{base_url}/results?search_query={urllib.parse.quote_plus(query)}"
-        response: str = requests.get(url).text
-        while "ytInitialData" not in response:
-            response = requests.get(url).text
-        results: list[Self] = cls.parse_html(response)
-        return results[:max_results]
-
-    @classmethod
-    def parse_html(cls, html: str) -> list[Self]:
-        results: list[Self] = []
-        start: int = html.index("ytInitialData") + len("ytInitialData") + 3
-        end: int = html.index("};", start) + 1
-        data: Any = json.loads(html[start:end])
-        for contents in data["contents"]["twoColumnSearchResultsRenderer"]["primaryContents"]["sectionListRenderer"][
-            "contents"
-        ]:
-            for video in contents["itemSectionRenderer"]["contents"]:
-                if "videoRenderer" in video.keys():
-                    video_data = video.get("videoRenderer", {})
-                    suffix = (
-                        video_data.get("navigationEndpoint", {})
-                        .get("commandMetadata", {})
-                        .get("webCommandMetadata", {})
-                        .get("url", None)
-                    )
-                    if not suffix:
-                        continue
-                    res = cls(
-                        id=video_data.get("videoId", None),
-                        thumbnails=[
-                            thumb.get("url", None) for thumb in video_data.get("thumbnail", {}).get("thumbnails", [{}])
-                        ],
-                        title=video_data.get("title", {}).get("runs", [[{}]])[0].get("text", None),
-                        long_desc=video_data.get("descriptionSnippet", {}).get("runs", [{}])[0].get("text", None),
-                        channel=video_data.get("longBylineText", {}).get("runs", [[{}]])[0].get("text", None),
-                        duration=video_data.get("lengthText", {}).get("simpleText", 0),
-                        views=video_data.get("viewCountText", {}).get("simpleText", 0),
-                        publish_time=video_data.get("publishedTimeText", {}).get("simpleText", 0),
-                        url_suffix=suffix,
-                    )
-                    results.append(res)
-
-            if results:
-                break
-        return results
-
-
-if __name__ == "__main__":
-    print(get_youtube_video_details("BTS"))
-    # print(get_youtube_transcript("y7jrpS8GHxs"))
-
+import json
+import unicodedata
+import urllib.parse
+from dataclasses import dataclass
+from typing import Any, Optional, Self, cast
+
+import requests
+
+
+def get_youtube_video_details(
+    query: str,
+) -> list[dict[str, Optional[str]]]:
+    """Search for video metadata on YouTube using the given query. Returns a list of dictionaries containing `video_id`, `title`, `channel`, `duration`, `views`, `publish_time`, and `long_desc`."""
+    return [
+        {
+            "video_id": video_id,
+            "title": video.title,
+            "channel": video.channel,
+            "duration": video.duration,
+            "views": video.views,
+            "publish_time": video.publish_time,
+            "long_desc": video.long_desc,
+        }
+        for video in YoutubeSearchResult.from_query(base_url="https://youtube.com", query=query, max_results=10)
+        if (video_id := _get_video_id(video.url_suffix))
+    ]
+
+
+def get_youtube_video_subtitle(video_id: str) -> str:
+    """Get the transcript of a YouTube video using the given video ID."""
+
+    from youtube_transcript_api import YouTubeTranscriptApi  # pyright: ignore[reportPrivateImportUsage]
+
+    get_transcript = YouTubeTranscriptApi.get_transcript  # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
+    list_transcripts = YouTubeTranscriptApi.list_transcripts  # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
+
+    result: str = ""
+    buffer_timestamp: str = "0s"
+    buffer_texts: list[str] = []
+    for entry in get_transcript(video_id, languages=(next(iter(list_transcripts(video_id))).language_code,)):  # pyright: ignore[reportUnknownVariableType]
+        entry = cast(dict[object, object], entry)
+        text: str = str(entry.get("text", "")).strip().replace("\n", " ")
+        if not text:
+            continue
+        if len(buffer_texts) >= 10 or _is_special_char(text) or (buffer_texts and _is_special_char(buffer_texts[-1])):
+            result += f"[{buffer_timestamp}] {'. '.join(buffer_texts)}\n"
+            start = entry.get("start", 0)
+            if start:
+                buffer_timestamp = f"{start:.0f}s"
+            buffer_texts = [text]
+        else:
+            buffer_texts.append(text)
+
+    if buffer_texts:
+        result += f"[{buffer_timestamp}] {' '.join(buffer_texts)}"
+    return result
+
+
+def _get_video_id(suffix: str) -> str:
+    urllib_parse_result = urllib.parse.urlparse(suffix)
+    if urllib_parse_result.path.startswith("/shorts/"):
+        # Fore shorts (/shorts/...) the video ID is in the path
+        parts = urllib_parse_result.path.split("/")
+        if len(parts) < 3:
+            print(f"Failed to get video ID from {suffix}")
+            return ""
+        return parts[2]
+
+    query: str = urllib.parse.urlparse(suffix).query
+    query_strings = urllib.parse.parse_qs(query)
+    if "v" not in query_strings:
+        print(f"Failed to get video ID from {suffix}")
+        return ""
+    return next(iter(query_strings["v"]), "")
+
+
+def _is_special_char(text: str) -> bool:
+    if not text:
+        return False
+    return not unicodedata.category(text[0]).startswith("L")
+
+
+@dataclass
+class YoutubeSearchResult:
+    url_suffix: str
+    id: Optional[str]
+    thumbnails: list[str]
+    title: Optional[str]
+    long_desc: Optional[str]
+    channel: Optional[str]
+    duration: Optional[str]
+    views: Optional[str]
+    publish_time: Optional[str]
+
+    @classmethod
+    def from_query(cls, base_url: str, query: str, max_results: int) -> list[Self]:
+        url: str = f"{base_url}/results?search_query={urllib.parse.quote_plus(query)}"
+        response: str = requests.get(url).text
+        while "ytInitialData" not in response:
+            response = requests.get(url).text
+        results: list[Self] = cls.parse_html(response)
+        return results[:max_results]
+
+    @classmethod
+    def parse_html(cls, html: str) -> list[Self]:
+        results: list[Self] = []
+        start: int = html.index("ytInitialData") + len("ytInitialData") + 3
+        end: int = html.index("};", start) + 1
+        data: Any = json.loads(html[start:end])
+        for contents in data["contents"]["twoColumnSearchResultsRenderer"]["primaryContents"]["sectionListRenderer"][
+            "contents"
+        ]:
+            for video in contents["itemSectionRenderer"]["contents"]:
+                if "videoRenderer" in video.keys():
+                    video_data = video.get("videoRenderer", {})
+                    suffix = (
+                        video_data.get("navigationEndpoint", {})
+                        .get("commandMetadata", {})
+                        .get("webCommandMetadata", {})
+                        .get("url", None)
+                    )
+                    if not suffix:
+                        continue
+                    res = cls(
+                        id=video_data.get("videoId", None),
+                        thumbnails=[
+                            thumb.get("url", None) for thumb in video_data.get("thumbnail", {}).get("thumbnails", [{}])
+                        ],
+                        title=video_data.get("title", {}).get("runs", [[{}]])[0].get("text", None),
+                        long_desc=video_data.get("descriptionSnippet", {}).get("runs", [{}])[0].get("text", None),
+                        channel=video_data.get("longBylineText", {}).get("runs", [[{}]])[0].get("text", None),
+                        duration=video_data.get("lengthText", {}).get("simpleText", 0),
+                        views=video_data.get("viewCountText", {}).get("simpleText", 0),
+                        publish_time=video_data.get("publishedTimeText", {}).get("simpleText", 0),
+                        url_suffix=suffix,
+                    )
+                    results.append(res)
+
+            if results:
+                break
+        return results
+
+
+if __name__ == "__main__":
+    print(get_youtube_video_details("BTS"))
+    # print(get_youtube_transcript("y7jrpS8GHxs"))
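The youtube.py hunk above defines two public helpers, `get_youtube_video_details` and `get_youtube_video_subtitle`. A minimal usage sketch, not part of the published diff; it assumes chatterer 0.1.28 and the youtube-transcript-api package are installed and imports directly from the module path shown in the diff:

# Illustrative sketch, not part of the diff. Assumes chatterer>=0.1.28
# and youtube-transcript-api are installed.
from chatterer.tools.youtube import get_youtube_video_details, get_youtube_video_subtitle

# Search YouTube and print basic metadata for each result.
for video in get_youtube_video_details("python typing"):
    print(video["video_id"], "-", video["title"])

# Fetch the timestamped, bucketed transcript for one video (example ID).
print(get_youtube_video_subtitle("dQw4w9WgXcQ")[:500])
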
chatterer/utils/__init__.py
@@ -1,15 +1,15 @@
-from .base64_image import Base64Image
-from .code_agent import (
-    CodeExecutionResult,
-    FunctionSignature,
-    get_default_repl_tool,
-    insert_callables_into_global,
-)
-
-__all__ = [
-    "Base64Image",
-    "FunctionSignature",
-    "CodeExecutionResult",
-    "get_default_repl_tool",
-    "insert_callables_into_global",
-]
+from .base64_image import Base64Image
+from .code_agent import (
+    CodeExecutionResult,
+    FunctionSignature,
+    get_default_repl_tool,
+    insert_callables_into_global,
+)
+
+__all__ = [
+    "Base64Image",
+    "FunctionSignature",
+    "CodeExecutionResult",
+    "get_default_repl_tool",
+    "insert_callables_into_global",
+]
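The utils/__init__.py hunk re-exports five names from the package's utility modules. A minimal import sketch, illustrative only and not part of the diff; it assumes chatterer is installed:

# Illustrative sketch, not part of the diff: the public names re-exported by chatterer.utils.
from chatterer.utils import (
    Base64Image,
    CodeExecutionResult,
    FunctionSignature,
    get_default_repl_tool,
    insert_callables_into_global,
)

print(Base64Image, CodeExecutionResult, FunctionSignature, get_default_repl_tool, insert_callables_into_global)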