I4-0-Client-Utils 20.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,170 @@
1
+ """
2
+ # I4.0 chatbot tools
3
+
4
+ These tools are provided to make it easier to do certain things with I4.0.
5
+ """
6
+
7
+ from typing import Any
8
+ from . import internet
9
+
10
def GetDefaultTools() -> list[dict[str, Any]]:
    """
    Build the list of default tool definitions offered to the chatbot.

    Each entry is a function-tool description: a name, a short description
    and a JSON-schema-like "parameters" object. A fresh list is built on
    every call, so callers may mutate the result freely.

    Returns:
        The definitions of the `scrape_websites` and `search_text` tools.
    """
    return [
        # Search tools
        {
            "type": "function",
            "function": {
                "name": "scrape_websites",
                "description": (
                    "Scrapes websites for information"
                ),
                "parameters": {
                    "type": "object",
                    "properties": {
                        "urls": {
                            "type": "array",
                            "items": {"type": "string"},
                            "description": "URLs to scrape"
                        }
                    },
                    "required": ["urls"]
                }
            }
        },
        {
            "type": "function",
            "function": {
                "name": "search_text",
                "description": (
                    "Searches the internet for information using keywords"
                ),
                "parameters": {
                    "type": "object",
                    "properties": {
                        "keywords": {
                            "type": "string",
                            "description": "Keywords to search on the internet, space separated, allows search operators"
                        },
                        "backend": {
                            "type": "string",
                            # Kept in sync with the backends accepted by
                            # internet.SearchText (which also allows "yahoo").
                            "description": (
                                "Backend search engine to use. The available backends are: "
                                "bing, brave, duckduckgo, google, grokipedia, mojeek, yandex, yahoo, wikipedia, auto"
                            ),
                            "default": "auto"
                        }
                    },
                    "required": ["keywords"]
                }
            }
        }
    ]
61
+
62
def ToolExists(ToolName: str) -> bool:
    """Return whether `ToolName` is the name of one of the default tools."""
    return any(tool["function"]["name"] == ToolName for tool in GetDefaultTools())
64
+
65
def _ScrapeToMarkdown(URL: str, Multimodal: str, Media: list[dict[str, str]]) -> str:
    """
    Scrape one URL and render the result as a markdown section.

    Media elements whose "type" passes the `in Multimodal` membership test
    are appended to `Media` in place. Any scraping failure is reported
    inside the returned text instead of being raised.

    Returns:
        The section text, always terminated by a blank line.
    """
    section = f"## {URL}\n\n"

    try:
        scrapeData = internet.Scrape_Auto(URL)

        if (scrapeData["type"] == "reddit subreddit"):
            posts = scrapeData["posts"]

            # enumerate() numbers posts by position; list.index() would repeat
            # a number whenever two scraped posts compare equal.
            for number, post in enumerate(posts, start = 1):
                Media.extend(m for m in post["content_media"] if (m["type"] in Multimodal))
                section += f"### Post {number}\n\nTitle: {post['title']}\n\nContent:\n```markdown\n{post['content_text']}\n```\n\n"

            if (len(posts) == 0):
                # Terminate with a blank line so the next heading is not glued to this text.
                section += "No posts available.\n\n"

            return section

        section += f"Title: {scrapeData['title']}\n\nContent:\n```markdown\n{scrapeData['content_text']}\n```"
        Media.extend(m for m in scrapeData["content_media"] if (m["type"] in Multimodal))
    except (Exception, internet.ScrapeGuidelinesError) as ex:
        # ScrapeGuidelinesError is listed explicitly because it may not derive
        # from Exception; a robots.txt refusal must only affect this URL.
        section += f"Could not scrape website. Error type {type(ex)}, details: {ex}"

    return section + "\n\n"

def _BuildToolResult(Text: str, Media: list[dict[str, str]], MaxLength: int | None) -> list[dict[str, str]]:
    """Strip and optionally truncate `Text`, then return the media followed by one text element."""
    text = Text.strip()

    # Limits below 100 are ignored; otherwise the text is cut to MaxLength - 1 characters.
    if (MaxLength is not None and MaxLength >= 100):
        text = text[:MaxLength - 1]

    return Media + [{"type": "text", "text": text}]

def ExecuteTool(ToolName: str, ToolArgs: dict[str, Any], MaxLength: int | None = None, Multimodal: str = "") -> list[dict[str, str]] | None:
    """
    Execute one of the default tools and return its result as message content.

    Args:
        ToolName: Name of the tool; must be one of the default tools.
        ToolArgs: Arguments for the tool. `scrape_websites` requires "urls"
            (a list); `search_text` requires "keywords" (a string) and
            accepts an optional "backend" (defaults to "auto").
        MaxLength: When not None and at least 100, the resulting text is
            truncated to `MaxLength - 1` characters.
        Multimodal: Media types to keep. A scraped media element is kept when
            its "type" passes `in Multimodal`; with the default empty string
            no media is kept.

    Returns:
        The kept media elements followed by a single
        `{"type": "text", "text": ...}` element, or None when the tool exists
        but has no implementation here.

    Raises:
        ValueError: If `ToolName` is not a default tool.
        RuntimeError: If a required argument is missing or has the wrong type.
    """
    if (not ToolExists(ToolName)):
        raise ValueError("Tool does not exist in the default tool list.")

    inputText = "# Results from all the websites\n\n"
    inputMedia = []

    if (ToolName == "scrape_websites"):
        if ("urls" not in ToolArgs or not isinstance(ToolArgs["urls"], list)):
            raise RuntimeError("Tool parsing error: required parameter does not exist or is an invalid type of data.")

        for url in ToolArgs["urls"]:
            inputText += _ScrapeToMarkdown(url, Multimodal, inputMedia)
    elif (ToolName == "search_text"):
        if ("keywords" not in ToolArgs or not isinstance(ToolArgs["keywords"], str)):
            raise RuntimeError("Tool parsing error: required parameter does not exist or is an invalid type of data.")

        try:
            websites = internet.SearchText(ToolArgs["keywords"], Backend = ToolArgs.get("backend", "auto"))
        except Exception:
            # Best effort: a failed search is reported as "no results".
            # Only Exception is caught so KeyboardInterrupt/SystemExit still propagate.
            websites = []

        for url in websites:
            inputText += _ScrapeToMarkdown(url, Multimodal, inputMedia)

        if (len(websites) == 0):
            inputText += "No results found."
    else:
        return None

    return _BuildToolResult(inputText, inputMedia, MaxLength)
@@ -0,0 +1,10 @@
1
+ from html2text import HTML2Text
2
+
3
def HTML_To_Markdown(Content: str) -> str:
    """
    Convert an HTML string to Markdown with html2text.

    The converter is created with `bodywidth = 0` and `single_line_break`
    enabled, and is closed after the conversion.
    """
    converter = HTML2Text(bodywidth = 0)
    converter.single_line_break = True

    markdown = converter.handle(Content)
    converter.close()

    return markdown
@@ -0,0 +1,285 @@
1
+ from bs4 import BeautifulSoup
2
+ from typing import Any, Literal
3
+ from urllib.parse import urlparse
4
+ from urllib.robotparser import RobotFileParser
5
+ from ddgs.ddgs import DDGS
6
+ from . import format_conversion
7
+ import base64
8
+ import requests
9
+ import re
10
+
11
+ __DDGS__: DDGS = DDGS()
12
+ SCRAPE_HEADERS: dict[str, Any] = {
13
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
14
+ }
15
+ FollowScrapeGuidelines: bool = True
16
+
17
class ScrapeGuidelinesError(Exception):
    """
    Raised when a website's robots.txt does not allow scraping the URL.

    Derives from Exception (not BaseException) so that ordinary
    `except Exception` handlers around scraping code can handle it;
    BaseException is meant for interpreter-level exits such as
    KeyboardInterrupt and SystemExit.
    """

    def __init__(self) -> None:
        super().__init__("Scrapping not allowed here. Please disable the scrapping guidelines or scrape another website if you see this error often.")
20
+
21
def __get_requests_response__(URL: str, Timeout: float | None = 30.0) -> requests.Response:
    """
    Perform a GET request with the scraping headers and return the response.

    Args:
        URL: Address to request.
        Timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests` may block indefinitely on an unresponsive
            server; pass None to restore that behaviour.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within `Timeout`.
    """
    response = requests.get(URL, headers = SCRAPE_HEADERS, timeout = Timeout)
    response.raise_for_status()

    return response
26
+
27
def DownloadContent(URL: str, ReturnAsBase64: bool = False, ReturnAsString: bool = False, Timeout: float | None = 30.0) -> bytes | str:
    """
    Download the raw content behind a URL.

    Args:
        URL: Address to download.
        ReturnAsBase64: Encode the downloaded bytes as base64.
        ReturnAsString: Decode the (possibly base64-encoded) bytes as UTF-8
            and return a string. Without `ReturnAsBase64` this raises
            UnicodeDecodeError for content that is not valid UTF-8.
        Timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests` may block indefinitely; pass None to restore
            that behaviour.

    Returns:
        The content as bytes, or as a string when `ReturnAsString` is set.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within `Timeout`.
    """
    response = requests.get(URL, timeout = Timeout)
    response.raise_for_status()

    content = response.content

    if (ReturnAsBase64):
        content = base64.b64encode(content)

    if (ReturnAsString):
        content = content.decode("utf-8")

    return content
40
+
41
def SearchText(
    Keywords: str,
    Region: str = "auto",
    UseSafeSearch: bool = True,
    MaxResults: int = 5,
    Backend: Literal[
        "bing", "brave", "duckduckgo", "google", "grokipedia",
        "mojeek", "yandex", "yahoo", "wikipedia", "auto"
    ] = "auto"
) -> list[str]:
    """
    Run a text search through the shared DDGS client and return the result URLs.

    Args:
        Keywords: Search query.
        Region: Region passed through to the search client.
        UseSafeSearch: Use "moderate" safe search when True, "off" otherwise.
        MaxResults: Maximum number of results requested.
        Backend: Search backend passed through to the search client.

    Returns:
        The "href" of every result, in the order returned by the client.
    """
    safeSearch = "moderate" if (UseSafeSearch) else "off"
    found = __DDGS__.text(
        query = Keywords,
        region = Region,
        safesearch = safeSearch,
        max_results = MaxResults,
        backend = Backend
    )
    links = []

    for entry in found:
        links.append(entry["href"])

    return links
59
+
60
def GetBaseURL(URL: str) -> str:
    """
    Return `URL` without its query string and fragment.

    Everything from the first "#" or "?" onwards is removed; a URL that
    contains neither is returned unchanged.
    """
    base = URL

    # Cutting at "#" first and then at "?" leaves exactly the text before
    # whichever of the two characters appears first.
    for separator in ("#", "?"):
        base = base.partition(separator)[0]

    return base
82
+
83
+ def GetURLInfo(URL: str) -> dict[str, str]:
84
+ if ("://" in URL):
85
+ protocol = URL[:URL.find("://")]
86
+ website = URL[URL.find("://") + 3:]
87
+ else:
88
+ protocol = "http"
89
+ website = URL
90
+
91
+ if ("/" in website):
92
+ website = website[:website.find("/")]
93
+
94
+ if (website.count(".") == 1):
95
+ subdomain = None
96
+ elif (website.count(".") >= 2):
97
+ subdomain = ".".join(website.split(".")[:-2])
98
+ website = ".".join(website.split(".")[-2:])
99
+
100
+ return {
101
+ "protocol": protocol,
102
+ "website": website,
103
+ "subdomain": subdomain
104
+ }
105
+
106
def Scrape_Base(URL: str) -> BeautifulSoup:
    """
    Download a page and parse it with BeautifulSoup's "html.parser".

    When `FollowScrapeGuidelines` is enabled, the site's robots.txt is read
    first and the URL is checked against the rules for the "*" user agent.

    Raises:
        ScrapeGuidelinesError: If robots.txt does not allow fetching `URL`.
    """
    if (FollowScrapeGuidelines):
        parts = urlparse(URL)
        robots = RobotFileParser(f"{parts.scheme}://{parts.netloc}/robots.txt")
        robots.read()

        allowed = robots.can_fetch("*", URL)

        if (not allowed):
            raise ScrapeGuidelinesError()

    page = __get_requests_response__(URL)
    return BeautifulSoup(page.text, "html.parser")
121
+
122
def Scrape_Wikipedia(URL: str) -> dict[str, str | list[dict[str, str]]]:
    """
    Scrape a Wikipedia article.

    The title is read from the `h1.mw-first-heading` element and the text
    from the non-empty `<p>` elements inside `div.mw-parser-output`, each
    converted to Markdown.

    Returns:
        A dict with "title", "content_text" (paragraphs joined by blank
        lines) and an empty "content_media" list.
    """
    page = Scrape_Base(URL)
    heading = page.find("h1", {"class": "mw-first-heading"})
    title = heading.get_text().strip()
    body = page.find("div", {"class": "mw-parser-output"})
    content = [
        format_conversion.HTML_To_Markdown(str(node)).strip()
        for node in body.find_all("p")
        if (node.get_text().strip())
    ]

    # TODO: Scrape images too
    return {"title": title, "content_text": "\n\n".join(content), "content_media": []}
133
+
134
def Scrape_Reddit_Post(URL: str) -> dict[str, str | list[dict[str, str]]]:
    """
    Scrape the title, text and media of a Reddit post page.

    Missing title or text fall back to "No title" / "No text content".
    Media is taken from the gallery carousel when the page has one,
    otherwise from the single post image or video player. Every media file
    found is downloaded and stored as a base64 string.

    Returns:
        A dict with "title", "content_text" and "content_media". Each media
        element is `{"type": <kind>, <kind>: <base64 data>}` where `<kind>`
        is "image" or "video".
    """
    soup = Scrape_Base(URL)
    title = soup.find("h1", {"slot": "title"})
    contentTxt = soup.find("div", {"property": "schema:articleBody"})
    media = []

    title = "No title" if (title is None) else title.get_text().strip()

    if (contentTxt is None):
        contentTxt = "No text content"
    else:
        contentTxt = format_conversion.HTML_To_Markdown(str(contentTxt)).strip()

    gallery = soup.find("gallery-carousel")

    if (gallery is None):
        mediaData = soup.find("img", {"id": "post-image"})
        mediaType = "image"

        if (mediaData is None):
            mediaData = soup.find("shreddit-player")

            if (mediaData is not None):
                mediaType = "video" if (mediaData.get("post-type") == "video") else "gif"

        if (mediaData is not None):
            if (mediaType == "video"):
                # When handling videos, only low-quality previews can be retrieved
                # It's also buggy sometimes
                mediaURL = mediaData.get("preview")
            else:
                mediaURL = mediaData.get("src")

                if (mediaType == "gif"):
                    mediaType = "video" # Reddit converts GIF to MP4

            if (mediaURL is None):
                mediaURL = mediaData.get("data-lazy-src")

            if (mediaURL is not None):
                media.append({
                    "type": mediaType,
                    mediaType: DownloadContent(URL = mediaURL, ReturnAsBase64 = True, ReturnAsString = True)
                })
    else:
        for item in gallery.find_all("li"):
            # Gallery items are expected to be images; skip any item that
            # lacks the expected markup instead of failing the whole post.
            mediaContainer = item.find("figure", {"class": "items-center"})
            mediaData = None if (mediaContainer is None) else mediaContainer.find("img")

            if (mediaData is None):
                continue

            mediaURL = mediaData.get("src")

            if (mediaURL is None):
                mediaURL = mediaData.get("data-lazy-src")

            if (mediaURL is None):
                continue

            media.append({
                "type": "image",
                "image": DownloadContent(URL = mediaURL, ReturnAsBase64 = True, ReturnAsString = True)
            })

    return {"title": title, "content_text": contentTxt, "content_media": media}
199
+
200
def Scrape_Reddit_Subreddit(
    URL: str,
    IsName: bool = False,
    ScrapePosts: bool = False,
    PostsLimit: int | None = None
) -> list[str | dict[str, str | list[dict[str, str]]]]:
    """
    List (and optionally scrape) the posts of a subreddit's "hot" listing.

    Args:
        URL: Subreddit URL containing "/r/<name>", or the bare subreddit
            name when `IsName` is True.
        IsName: Treat `URL` as the subreddit name.
        ScrapePosts: Scrape every post with `Scrape_Reddit_Post` instead of
            returning its URL.
        PostsLimit: Maximum number of posts to return; None for no limit.

    Returns:
        The "url" field of each listed post, or the scraped posts when
        `ScrapePosts` is True.

    Raises:
        ValueError: If `IsName` is False and `URL` contains no "/r/<name>".
    """
    if (IsName):
        name = URL
    else:
        match = re.search(r"/r/([^/]+)", URL)

        if (match is None):
            raise ValueError(f"Could not find a subreddit name ('/r/<name>') in URL: {URL}")

        name = match.group(1)

    response = __get_requests_response__(f"https://reddit.com/r/{name}/hot.json")
    data = response.json()
    posts = []

    for post in data["data"]["children"]:
        if (PostsLimit is not None and len(posts) >= PostsLimit):
            break

        postData = post["data"]

        if (not ScrapePosts):
            posts.append(postData["url"])
            continue

        # For link posts "url" is the external link target, not the Reddit
        # page; the post itself lives at its permalink, which is the page
        # Scrape_Reddit_Post knows how to parse.
        permalink = postData.get("permalink")
        postUrl = f"https://reddit.com{permalink}" if (permalink) else postData["url"]
        posts.append(Scrape_Reddit_Post(postUrl))

    return posts
224
+
225
def Scrape_Wikidot(URL: str) -> dict[str, str | list[dict[str, str]]]:
    """
    Scrape a Wikidot page.

    The title is read from `div#page-title` and the text from the non-empty
    headings and paragraphs inside `div#page-content`, each converted to
    Markdown.

    Returns:
        A dict with "title", "content_text" (blocks joined by blank lines)
        and an empty "content_media" list.
    """
    page = Scrape_Base(URL)
    heading = page.find("div", {"id": "page-title"})
    title = heading.get_text().strip()
    body = page.find("div", {"id": "page-content"})
    content = [
        format_conversion.HTML_To_Markdown(str(node)).strip()
        for node in body.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p"])
        if (node.get_text().strip())
    ]

    # TODO: Scrape images too
    return {"title": title, "content_text": "\n\n".join(content), "content_media": []}
236
+
237
def Scrape_Fandom(URL: str) -> dict[str, str | list[dict[str, str]]]:
    """
    Scrape a Fandom wiki page.

    The title is read from `h1.page-header__title` and the text from the
    non-empty headings and paragraphs inside `div.mw-content-ltr`, each
    converted to Markdown.

    Returns:
        A dict with "title", "content_text" (blocks joined by blank lines)
        and an empty "content_media" list.
    """
    page = Scrape_Base(URL)
    heading = page.find("h1", {"class": "page-header__title"})
    title = heading.get_text().strip()
    body = page.find("div", {"class": "mw-content-ltr"})
    content = [
        format_conversion.HTML_To_Markdown(str(node)).strip()
        for node in body.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p"])
        if (node.get_text().strip())
    ]

    # TODO: Scrape images too
    return {"title": title, "content_text": "\n\n".join(content), "content_media": []}
248
+
249
def Scrape_Grokipedia(URL: str) -> dict[str, str | list[dict[str, str]]]:
    """
    Scrape a Grokipedia article.

    The references section (`div#references`) is removed when present. The
    title is the article's first `<h1>` and the text comes from the
    non-empty heading, paragraph and span elements with the "block" class,
    each converted to Markdown.

    Returns:
        A dict with "title", "content_text" (blocks joined by blank lines)
        and an empty "content_media" list.
    """
    soup = Scrape_Base(URL)
    article = soup.find("article")
    references = article.find("div", {"id": "references"})

    # Not every article has a references section; only drop it when it exists.
    if (references is not None):
        references.decompose()

    title = article.find("h1").get_text().strip()
    paragraphs = article.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "span"], {"class": "block"})
    content = [
        format_conversion.HTML_To_Markdown(str(p)).strip()
        for p in paragraphs
        if (p.get_text().strip())
    ]

    # TODO: Scrape images too
    return {"title": title, "content_text": "\n\n".join(content), "content_media": []}
263
+
264
def Scrape_Auto(URL: str, RedditSubredditPosts: int | None = None) -> dict[str, str | list[dict[str, str]]]:
    """
    Scrape a URL with the scraper that matches its website.

    The website is detected from the URL without its query and fragment.
    Reddit, Wikipedia, Wikidot, Fandom and Grokipedia have dedicated
    scrapers; any other site falls back to converting its headings,
    paragraphs and spans to Markdown.

    Args:
        URL: Page to scrape.
        RedditSubredditPosts: Maximum number of posts to scrape when the URL
            is a subreddit; None for no limit.

    Returns:
        The scraper's result plus a "type" key. Subreddits return
        `{"posts": [...], "type": "reddit subreddit"}`; every other type
        returns "title", "content_text", "content_media" and "type".
    """
    urlInfo = GetURLInfo(GetBaseURL(URL))

    if (urlInfo["website"] == "reddit.com"):
        if ("/comments/" in URL):
            # Scrape Reddit post
            return Scrape_Reddit_Post(URL) | {"type": "reddit post"}
        else:
            # Scrape Reddit subreddit
            return {"posts": Scrape_Reddit_Subreddit(URL, False, True, RedditSubredditPosts), "type": "reddit subreddit"}
    elif (urlInfo["website"] == "wikipedia.org"):
        return Scrape_Wikipedia(URL) | {"type": "wikipedia"}
    elif (urlInfo["website"] == "wikidot.com"):
        return Scrape_Wikidot(URL) | {"type": "wikidot"}
    elif (urlInfo["website"] == "fandom.com"):
        return Scrape_Fandom(URL) | {"type": "fandom"}
    elif (urlInfo["website"] == "grokipedia.com"):
        return Scrape_Grokipedia(URL) | {"type": "grokipedia"}
    else:
        elements = Scrape_Base(URL).find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "span"])

        # Join the elements' HTML explicitly: str() of the result list would
        # add the list's own "[", ", " and "]" to the converted text.
        websiteContent = "\n".join(str(element) for element in elements)
        websiteContent = format_conversion.HTML_To_Markdown(websiteContent)
        return {"title": "No title detected", "content_text": websiteContent, "content_media": [], "type": "unknown"}
@@ -0,0 +1,17 @@
1
+ Metadata-Version: 2.4
2
+ Name: I4_0-Client-Utils
3
+ Version: 20.0.0
4
+ Summary: Client-side utilities for I4.0.
5
+ Author: TAO71-AI
6
+ License: TAO71 I4.0 License (version 2)
7
+ Project-URL: Homepage, https://github.com/TAO71-AI/I4.0-NEW
8
+ Project-URL: Source, https://github.com/TAO71-AI/I4.0-NEW/tree/master/Client/Utils
9
+ Project-URL: License, https://github.com/TAO71-AI/I4.0-NEW/blob/master/LICENSE.md
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: License :: Other/Proprietary License
12
+ Classifier: Operating System :: OS Independent
13
+ Requires-Python: >=3.11
14
+ Requires-Dist: html2text
15
+ Requires-Dist: requests
16
+ Requires-Dist: beautifulsoup4
17
+ Requires-Dist: ddgs
@@ -0,0 +1,10 @@
1
+ pyproject.toml
2
+ I40ClientUtils/__init__.py
3
+ I40ClientUtils/chatbot_tools.py
4
+ I40ClientUtils/format_conversion.py
5
+ I40ClientUtils/internet.py
6
+ I4_0_Client_Utils.egg-info/PKG-INFO
7
+ I4_0_Client_Utils.egg-info/SOURCES.txt
8
+ I4_0_Client_Utils.egg-info/dependency_links.txt
9
+ I4_0_Client_Utils.egg-info/requires.txt
10
+ I4_0_Client_Utils.egg-info/top_level.txt
@@ -0,0 +1,4 @@
1
+ html2text
2
+ requests
3
+ beautifulsoup4
4
+ ddgs
@@ -0,0 +1,17 @@
1
+ Metadata-Version: 2.4
2
+ Name: I4_0-Client-Utils
3
+ Version: 20.0.0
4
+ Summary: Client-side utilities for I4.0.
5
+ Author: TAO71-AI
6
+ License: TAO71 I4.0 License (version 2)
7
+ Project-URL: Homepage, https://github.com/TAO71-AI/I4.0-NEW
8
+ Project-URL: Source, https://github.com/TAO71-AI/I4.0-NEW/tree/master/Client/Utils
9
+ Project-URL: License, https://github.com/TAO71-AI/I4.0-NEW/blob/master/LICENSE.md
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: License :: Other/Proprietary License
12
+ Classifier: Operating System :: OS Independent
13
+ Requires-Python: >=3.11
14
+ Requires-Dist: html2text
15
+ Requires-Dist: requests
16
+ Requires-Dist: beautifulsoup4
17
+ Requires-Dist: ddgs
@@ -0,0 +1,30 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "I4_0-Client-Utils"
7
+ version = "20.0.0"
8
+ description = "Client-side utilities for I4.0."
9
+ authors = [{name = "TAO71-AI"}]
10
+ license = {text = "TAO71 I4.0 License (version 2)"}
11
+ requires-python = ">=3.11"
12
+ dependencies = [
13
+ "html2text",
14
+ "requests",
15
+ "beautifulsoup4",
16
+ "ddgs"
17
+ ]
18
+ classifiers = [
19
+ "Programming Language :: Python :: 3",
20
+ "License :: Other/Proprietary License",
21
+ "Operating System :: OS Independent"
22
+ ]
23
+
24
+ [project.urls]
25
+ Homepage = "https://github.com/TAO71-AI/I4.0-NEW"
26
+ Source = "https://github.com/TAO71-AI/I4.0-NEW/tree/master/Client/Utils"
27
+ License = "https://github.com/TAO71-AI/I4.0-NEW/blob/master/LICENSE.md"
28
+
29
+ [tool.setuptools.packages.find]
30
+ include = ["I40ClientUtils", "I40ClientUtils.*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+