chatterer 0.1.13__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. chatterer/__init__.py +97 -62
  2. chatterer/common_types/__init__.py +21 -21
  3. chatterer/common_types/io.py +19 -19
  4. chatterer/interactive.py +353 -0
  5. chatterer/language_model.py +454 -577
  6. chatterer/messages.py +21 -9
  7. chatterer/strategies/__init__.py +13 -13
  8. chatterer/strategies/atom_of_thoughts.py +975 -975
  9. chatterer/strategies/base.py +14 -14
  10. chatterer/tools/__init__.py +46 -35
  11. chatterer/tools/{webpage_to_markdown/utils.py → caption_markdown_images.py} +384 -334
  12. chatterer/tools/citation_chunking/__init__.py +3 -3
  13. chatterer/tools/citation_chunking/chunks.py +53 -53
  14. chatterer/tools/citation_chunking/citation_chunker.py +118 -118
  15. chatterer/tools/citation_chunking/citations.py +285 -285
  16. chatterer/tools/citation_chunking/prompt.py +157 -157
  17. chatterer/tools/citation_chunking/reference.py +26 -26
  18. chatterer/tools/citation_chunking/utils.py +138 -138
  19. chatterer/tools/convert_pdf_to_markdown.py +302 -0
  20. chatterer/tools/convert_to_text.py +447 -418
  21. chatterer/tools/upstage_document_parser.py +705 -438
  22. chatterer/tools/{webpage_to_markdown/playwright_bot.py → webpage_to_markdown.py} +739 -649
  23. chatterer/tools/youtube.py +147 -146
  24. chatterer/utils/__init__.py +18 -15
  25. chatterer/utils/{image.py → base64_image.py} +285 -291
  26. chatterer/utils/bytesio.py +59 -59
  27. chatterer/utils/cli.py +476 -0
  28. chatterer/utils/code_agent.py +237 -138
  29. chatterer/utils/imghdr.py +148 -0
  30. chatterer-0.1.14.dist-info/METADATA +387 -0
  31. chatterer-0.1.14.dist-info/RECORD +34 -0
  32. chatterer/tools/webpage_to_markdown/__init__.py +0 -4
  33. chatterer-0.1.13.dist-info/METADATA +0 -171
  34. chatterer-0.1.13.dist-info/RECORD +0 -31
  35. {chatterer-0.1.13.dist-info → chatterer-0.1.14.dist-info}/WHEEL +0 -0
  36. {chatterer-0.1.13.dist-info → chatterer-0.1.14.dist-info}/top_level.txt +0 -0
@@ -1,146 +1,147 @@
1
- import json
2
- import unicodedata
3
- import urllib.parse
4
- from dataclasses import dataclass
5
- from typing import Any, Optional, Self, cast
6
-
7
- import requests
8
-
9
-
10
- def get_youtube_video_details(
11
- query: str,
12
- ) -> list[dict[str, Optional[str]]]:
13
- """Search for video metadata on YouTube using the given query. Returns a list of dictionaries containing `video_id`, `title`, `channel`, `duration`, `views`, `publish_time`, and `long_desc`."""
14
- return [
15
- {
16
- "video_id": video_id,
17
- "title": video.title,
18
- "channel": video.channel,
19
- "duration": video.duration,
20
- "views": video.views,
21
- "publish_time": video.publish_time,
22
- "long_desc": video.long_desc,
23
- }
24
- for video in YoutubeSearchResult.from_query(base_url="https://youtube.com", query=query, max_results=10)
25
- if (video_id := _get_video_id(video.url_suffix))
26
- ]
27
-
28
-
29
- def get_youtube_video_subtitle(video_id: str) -> str:
30
- """Get the transcript of a YouTube video using the given video ID."""
31
-
32
- from youtube_transcript_api._api import YouTubeTranscriptApi
33
-
34
- get_transcript = YouTubeTranscriptApi.get_transcript # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
35
- list_transcripts = YouTubeTranscriptApi.list_transcripts # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
36
-
37
- result: str = ""
38
- buffer_timestamp: str = "0s"
39
- buffer_texts: list[str] = []
40
- for entry in get_transcript(video_id, languages=(next(iter(list_transcripts(video_id))).language_code,)): # pyright: ignore[reportUnknownVariableType]
41
- entry = cast(dict[object, object], entry)
42
- text: str = str(entry.get("text", "")).strip().replace("\n", " ")
43
- if not text:
44
- continue
45
- if len(buffer_texts) >= 10 or _is_special_char(text) or (buffer_texts and _is_special_char(buffer_texts[-1])):
46
- result += f"[{buffer_timestamp}] {'. '.join(buffer_texts)}\n"
47
- start = entry.get("start", 0)
48
- if start:
49
- buffer_timestamp = f"{start:.0f}s"
50
- buffer_texts = [text]
51
- else:
52
- buffer_texts.append(text)
53
-
54
- if buffer_texts:
55
- result += f"[{buffer_timestamp}] {' '.join(buffer_texts)}"
56
- return result
57
-
58
-
59
- def _get_video_id(suffix: str) -> str:
60
- urllib_parse_result = urllib.parse.urlparse(suffix)
61
- if urllib_parse_result.path.startswith("/shorts/"):
62
- # Fore shorts (/shorts/...) the video ID is in the path
63
- parts = urllib_parse_result.path.split("/")
64
- if len(parts) < 3:
65
- print(f"Failed to get video ID from {suffix}")
66
- return ""
67
- return parts[2]
68
-
69
- query: str = urllib.parse.urlparse(suffix).query
70
- query_strings = urllib.parse.parse_qs(query)
71
- if "v" not in query_strings:
72
- print(f"Failed to get video ID from {suffix}")
73
- return ""
74
- return next(iter(query_strings["v"]), "")
75
-
76
-
77
- def _is_special_char(text: str) -> bool:
78
- if not text:
79
- return False
80
- return not unicodedata.category(text[0]).startswith("L")
81
-
82
-
83
- @dataclass
84
- class YoutubeSearchResult:
85
- url_suffix: str
86
- id: Optional[str]
87
- thumbnails: list[str]
88
- title: Optional[str]
89
- long_desc: Optional[str]
90
- channel: Optional[str]
91
- duration: Optional[str]
92
- views: Optional[str]
93
- publish_time: Optional[str]
94
-
95
- @classmethod
96
- def from_query(cls, base_url: str, query: str, max_results: int) -> list[Self]:
97
- url: str = f"{base_url}/results?search_query={urllib.parse.quote_plus(query)}"
98
- response: str = requests.get(url).text
99
- while "ytInitialData" not in response:
100
- response = requests.get(url).text
101
- results: list[Self] = cls.parse_html(response)
102
- return results[:max_results]
103
-
104
- @classmethod
105
- def parse_html(cls, html: str) -> list[Self]:
106
- results: list[Self] = []
107
- start: int = html.index("ytInitialData") + len("ytInitialData") + 3
108
- end: int = html.index("};", start) + 1
109
- data: Any = json.loads(html[start:end])
110
- for contents in data["contents"]["twoColumnSearchResultsRenderer"]["primaryContents"]["sectionListRenderer"][
111
- "contents"
112
- ]:
113
- for video in contents["itemSectionRenderer"]["contents"]:
114
- if "videoRenderer" in video.keys():
115
- video_data = video.get("videoRenderer", {})
116
- suffix = (
117
- video_data.get("navigationEndpoint", {})
118
- .get("commandMetadata", {})
119
- .get("webCommandMetadata", {})
120
- .get("url", None)
121
- )
122
- if not suffix:
123
- continue
124
- res = cls(
125
- id=video_data.get("videoId", None),
126
- thumbnails=[
127
- thumb.get("url", None) for thumb in video_data.get("thumbnail", {}).get("thumbnails", [{}])
128
- ],
129
- title=video_data.get("title", {}).get("runs", [[{}]])[0].get("text", None),
130
- long_desc=video_data.get("descriptionSnippet", {}).get("runs", [{}])[0].get("text", None),
131
- channel=video_data.get("longBylineText", {}).get("runs", [[{}]])[0].get("text", None),
132
- duration=video_data.get("lengthText", {}).get("simpleText", 0),
133
- views=video_data.get("viewCountText", {}).get("simpleText", 0),
134
- publish_time=video_data.get("publishedTimeText", {}).get("simpleText", 0),
135
- url_suffix=suffix,
136
- )
137
- results.append(res)
138
-
139
- if results:
140
- break
141
- return results
142
-
143
-
144
- if __name__ == "__main__":
145
- print(get_youtube_video_details("BTS"))
146
- # print(get_youtube_transcript("y7jrpS8GHxs"))
1
+ import json
2
+ import unicodedata
3
+ import urllib.parse
4
+ from dataclasses import dataclass
5
+ from typing import Any, Optional, Self, cast
6
+
7
+ import requests
8
+
9
+
10
def get_youtube_video_details(
    query: str,
    base_url: str = "https://youtube.com",
    max_results: int = 10,
) -> list[dict[str, Optional[str]]]:
    """Search for video metadata on YouTube using the given query.

    Args:
        query: Free-text search query.
        base_url: Root URL of the YouTube frontend to scrape. Defaults to the
            public site; parameterized so a mirror or proxy can be used.
        max_results: Maximum number of search results to return.

    Returns:
        A list of dictionaries containing `video_id`, `title`, `channel`,
        `duration`, `views`, `publish_time`, and `long_desc`. Results whose
        video ID cannot be extracted from the URL suffix are skipped.
    """
    return [
        {
            "video_id": video_id,
            "title": video.title,
            "channel": video.channel,
            "duration": video.duration,
            "views": video.views,
            "publish_time": video.publish_time,
            "long_desc": video.long_desc,
        }
        for video in YoutubeSearchResult.from_query(base_url=base_url, query=query, max_results=max_results)
        # Walrus filter: only keep results that yield a non-empty video ID.
        if (video_id := _get_video_id(video.url_suffix))
    ]
27
+
28
+
29
def get_youtube_video_subtitle(video_id: str) -> str:
    """Get the transcript of a YouTube video using the given video ID.

    Transcript entries are buffered and flushed into timestamped lines of the
    form ``[<seconds>s] <text>``. A flush happens when the buffer holds 10
    entries, or when the current or previous entry begins with a non-letter
    character (e.g. sound markers such as "[Music]").

    Args:
        video_id: The YouTube video ID.

    Returns:
        The transcript as newline-separated, timestamp-prefixed text.
    """
    # Imported lazily so the module imports cleanly without youtube_transcript_api.
    from youtube_transcript_api import YouTubeTranscriptApi  # pyright: ignore[reportPrivateImportUsage]

    get_transcript = YouTubeTranscriptApi.get_transcript  # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
    list_transcripts = YouTubeTranscriptApi.list_transcripts  # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]

    result: str = ""
    buffer_timestamp: str = "0s"
    buffer_texts: list[str] = []
    # Fetch in the first language listed for this video.
    for entry in get_transcript(video_id, languages=(next(iter(list_transcripts(video_id))).language_code,)):  # pyright: ignore[reportUnknownVariableType]
        entry = cast(dict[object, object], entry)
        text: str = str(entry.get("text", "")).strip().replace("\n", " ")
        if not text:
            continue
        if len(buffer_texts) >= 10 or _is_special_char(text) or (buffer_texts and _is_special_char(buffer_texts[-1])):
            # Fixed: only write a line when the buffer is non-empty; the
            # original flushed unconditionally and could emit a bare "[0s] "
            # line when a special-char entry arrived with an empty buffer.
            if buffer_texts:
                result += f"[{buffer_timestamp}] {'. '.join(buffer_texts)}\n"
            start = entry.get("start", 0)
            if start:
                # Timestamp of the entry that starts the new buffer.
                buffer_timestamp = f"{start:.0f}s"
            buffer_texts = [text]
        else:
            buffer_texts.append(text)

    if buffer_texts:
        # NOTE(review): the final flush joins with " " while mid-stream
        # flushes join with ". " — preserved as-is; confirm if intentional.
        result += f"[{buffer_timestamp}] {' '.join(buffer_texts)}"
    return result
57
+
58
+
59
+ def _get_video_id(suffix: str) -> str:
60
+ urllib_parse_result = urllib.parse.urlparse(suffix)
61
+ if urllib_parse_result.path.startswith("/shorts/"):
62
+ # Fore shorts (/shorts/...) the video ID is in the path
63
+ parts = urllib_parse_result.path.split("/")
64
+ if len(parts) < 3:
65
+ print(f"Failed to get video ID from {suffix}")
66
+ return ""
67
+ return parts[2]
68
+
69
+ query: str = urllib.parse.urlparse(suffix).query
70
+ query_strings = urllib.parse.parse_qs(query)
71
+ if "v" not in query_strings:
72
+ print(f"Failed to get video ID from {suffix}")
73
+ return ""
74
+ return next(iter(query_strings["v"]), "")
75
+
76
+
77
+ def _is_special_char(text: str) -> bool:
78
+ if not text:
79
+ return False
80
+ return not unicodedata.category(text[0]).startswith("L")
81
+
82
+
83
+ @dataclass
84
+ class YoutubeSearchResult:
85
+ url_suffix: str
86
+ id: Optional[str]
87
+ thumbnails: list[str]
88
+ title: Optional[str]
89
+ long_desc: Optional[str]
90
+ channel: Optional[str]
91
+ duration: Optional[str]
92
+ views: Optional[str]
93
+ publish_time: Optional[str]
94
+
95
+ @classmethod
96
+ def from_query(cls, base_url: str, query: str, max_results: int) -> list[Self]:
97
+ url: str = f"{base_url}/results?search_query={urllib.parse.quote_plus(query)}"
98
+ response: str = requests.get(url).text
99
+ while "ytInitialData" not in response:
100
+ response = requests.get(url).text
101
+ results: list[Self] = cls.parse_html(response)
102
+ return results[:max_results]
103
+
104
+ @classmethod
105
+ def parse_html(cls, html: str) -> list[Self]:
106
+ results: list[Self] = []
107
+ start: int = html.index("ytInitialData") + len("ytInitialData") + 3
108
+ end: int = html.index("};", start) + 1
109
+ data: Any = json.loads(html[start:end])
110
+ for contents in data["contents"]["twoColumnSearchResultsRenderer"]["primaryContents"]["sectionListRenderer"][
111
+ "contents"
112
+ ]:
113
+ for video in contents["itemSectionRenderer"]["contents"]:
114
+ if "videoRenderer" in video.keys():
115
+ video_data = video.get("videoRenderer", {})
116
+ suffix = (
117
+ video_data.get("navigationEndpoint", {})
118
+ .get("commandMetadata", {})
119
+ .get("webCommandMetadata", {})
120
+ .get("url", None)
121
+ )
122
+ if not suffix:
123
+ continue
124
+ res = cls(
125
+ id=video_data.get("videoId", None),
126
+ thumbnails=[
127
+ thumb.get("url", None) for thumb in video_data.get("thumbnail", {}).get("thumbnails", [{}])
128
+ ],
129
+ title=video_data.get("title", {}).get("runs", [[{}]])[0].get("text", None),
130
+ long_desc=video_data.get("descriptionSnippet", {}).get("runs", [{}])[0].get("text", None),
131
+ channel=video_data.get("longBylineText", {}).get("runs", [[{}]])[0].get("text", None),
132
+ duration=video_data.get("lengthText", {}).get("simpleText", 0),
133
+ views=video_data.get("viewCountText", {}).get("simpleText", 0),
134
+ publish_time=video_data.get("publishedTimeText", {}).get("simpleText", 0),
135
+ url_suffix=suffix,
136
+ )
137
+ results.append(res)
138
+
139
+ if results:
140
+ break
141
+ return results
142
+
143
+
144
if __name__ == "__main__":
    # Manual smoke test: run a search and dump the raw result list.
    details = get_youtube_video_details("BTS")
    print(details)
    # print(get_youtube_transcript("y7jrpS8GHxs"))
147
+
@@ -1,15 +1,18 @@
1
- from .code_agent import (
2
- CodeExecutionResult,
3
- FunctionSignature,
4
- get_default_repl_tool,
5
- insert_callables_into_global,
6
- )
7
- from .image import Base64Image
8
-
9
- __all__ = [
10
- "Base64Image",
11
- "FunctionSignature",
12
- "CodeExecutionResult",
13
- "get_default_repl_tool",
14
- "insert_callables_into_global",
15
- ]
1
+ from .base64_image import Base64Image
2
+ from .cli import ArgumentSpec, BaseArguments
3
+ from .code_agent import (
4
+ CodeExecutionResult,
5
+ FunctionSignature,
6
+ get_default_repl_tool,
7
+ insert_callables_into_global,
8
+ )
9
+
10
+ __all__ = [
11
+ "Base64Image",
12
+ "FunctionSignature",
13
+ "CodeExecutionResult",
14
+ "get_default_repl_tool",
15
+ "insert_callables_into_global",
16
+ "BaseArguments",
17
+ "ArgumentSpec",
18
+ ]