I4-0-Client-Utils 20.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- i4_0_client_utils-20.0.0/I40ClientUtils/__init__.py +0 -0
- i4_0_client_utils-20.0.0/I40ClientUtils/chatbot_tools.py +170 -0
- i4_0_client_utils-20.0.0/I40ClientUtils/format_conversion.py +10 -0
- i4_0_client_utils-20.0.0/I40ClientUtils/internet.py +285 -0
- i4_0_client_utils-20.0.0/I4_0_Client_Utils.egg-info/PKG-INFO +17 -0
- i4_0_client_utils-20.0.0/I4_0_Client_Utils.egg-info/SOURCES.txt +10 -0
- i4_0_client_utils-20.0.0/I4_0_Client_Utils.egg-info/dependency_links.txt +1 -0
- i4_0_client_utils-20.0.0/I4_0_Client_Utils.egg-info/requires.txt +4 -0
- i4_0_client_utils-20.0.0/I4_0_Client_Utils.egg-info/top_level.txt +1 -0
- i4_0_client_utils-20.0.0/PKG-INFO +17 -0
- i4_0_client_utils-20.0.0/pyproject.toml +30 -0
- i4_0_client_utils-20.0.0/setup.cfg +4 -0
|
File without changes
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""
|
|
2
|
+
# I4.0 chatbot tools
|
|
3
|
+
|
|
4
|
+
These tools are provided to make it easier to do certain things with I4.0.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import Any
|
|
8
|
+
from . import internet
|
|
9
|
+
|
|
10
|
+
def GetDefaultTools() -> list[dict[str, Any]]:
    """
    Build the definitions of the tools that are offered by default.

    Every entry is a dictionary with a "type" of "function" and a "function"
    entry holding the tool name, its description and a JSON-schema-like
    description of its parameters. A new list is built on every call.

    Returns:
        The `scrape_websites` definition followed by the `search_text` one.
    """
    # Names listed in the description of the `backend` parameter
    backendNames = [
        "bing", "brave", "duckduckgo", "google", "grokipedia",
        "mojeek", "yandex", "wikipedia", "auto"
    ]

    # Search tools
    scrapeWebsitesTool: dict[str, Any] = {
        "type": "function",
        "function": {
            "name": "scrape_websites",
            "description": "Scrapes websites for information",
            "parameters": {
                "type": "object",
                "properties": {
                    "urls": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "URLs to scrape"
                    }
                },
                "required": ["urls"]
            }
        }
    }
    searchTextTool: dict[str, Any] = {
        "type": "function",
        "function": {
            "name": "search_text",
            "description": "Searches the internet for information using keywords",
            "parameters": {
                "type": "object",
                "properties": {
                    "keywords": {
                        "type": "string",
                        "description": "Keywords to search on the internet, space separated, allows search operators"
                    },
                    "backend": {
                        "type": "string",
                        "description": "Backend search engine to use. The available backends are: " + ", ".join(backendNames),
                        "default": "auto"
                    }
                },
                "required": ["keywords"]
            }
        }
    }

    return [scrapeWebsitesTool, searchTextTool]
|
|
61
|
+
|
|
62
|
+
def ToolExists(ToolName: str) -> bool:
    """Tell whether `ToolName` is the name of one of the default tools."""
    for tool in GetDefaultTools():
        if (tool["function"]["name"] == ToolName):
            return True

    return False
|
|
64
|
+
|
|
65
|
+
def _FilterMedia(MediaElements: list[dict[str, Any]], Multimodal: str) -> list[dict[str, Any]]:
    """Keep only the media elements whose "type" value is contained in `Multimodal`."""
    return [element for element in MediaElements if (element["type"] in Multimodal)]

def _ScrapeURLsAsMarkdown(URLs: list[Any], Multimodal: str) -> tuple[str, list[dict[str, Any]]]:
    """Scrape every URL and return the concatenated markdown sections plus the accepted media."""
    sections = []
    media = []

    for url in URLs:
        section = f"## {url}\n\n"

        try:
            scrapeData = internet.Scrape_Auto(url)

            if (scrapeData["type"] == "reddit subreddit"):
                # enumerate() numbers the posts in order, even when two posts compare equal
                for postNumber, post in enumerate(scrapeData["posts"], start = 1):
                    media.extend(_FilterMedia(post["content_media"], Multimodal))
                    section += f"### Post {postNumber}\n\nTitle: {post['title']}\n\nContent:\n```markdown\n{post['content_text']}\n```\n\n"

                if (len(scrapeData["posts"]) == 0):
                    section += "No posts available."
            else:
                section += f"Title: {scrapeData['title']}\n\nContent:\n```markdown\n{scrapeData['content_text']}\n```"
                media.extend(_FilterMedia(scrapeData["content_media"], Multimodal))
        except (Exception, internet.ScrapeGuidelinesError) as ex:
            # ScrapeGuidelinesError is listed explicitly so that it is reported per
            # website even if it does not derive from Exception.
            section += f"Could not scrape website. Error type {type(ex)}, details: {ex}"

        # Always separate the sections, whatever branch was taken above
        sections.append(section + "\n\n")

    return "".join(sections), media

def ExecuteTool(ToolName: str, ToolArgs: dict[str, Any], MaxLength: int | None = None, Multimodal: str = "") -> list[dict[str, Any]] | None:
    """
    Execute one of the default tools and return its result as a list of content items.

    Args:
        ToolName: Name of the tool, either "scrape_websites" or "search_text".
        ToolArgs: Arguments of the tool. "scrape_websites" requires "urls" (a list),
            "search_text" requires "keywords" (a string) and accepts "backend".
        MaxLength: When it is not None and is at least 100, the text result is cut
            to `MaxLength - 1` characters.
        Multimodal: Media types to keep. A scraped media element is kept when its
            "type" value is contained in this string.

    Returns:
        The kept media elements followed by one `{"type": "text", "text": ...}` item.
        A failure to scrape a website is reported inside the text, it is not raised.

    Raises:
        ValueError: If the tool is not in the default tool list.
        RuntimeError: If a required argument is missing or has an invalid type.
    """
    if (not ToolExists(ToolName)):
        raise ValueError("Tool does not exist in the default tool list.")

    if (ToolName == "scrape_websites"):
        if ("urls" not in ToolArgs or not isinstance(ToolArgs["urls"], list)):
            raise RuntimeError("Tool parsing error: required parameter does not exist or is an invalid type of data.")

        scrapedText, inputMedia = _ScrapeURLsAsMarkdown(ToolArgs["urls"], Multimodal)
    elif (ToolName == "search_text"):
        if ("keywords" not in ToolArgs or not isinstance(ToolArgs["keywords"], str)):
            raise RuntimeError("Tool parsing error: required parameter does not exist or is an invalid type of data.")

        backend = ToolArgs.get("backend", "auto")

        if (not isinstance(backend, str) or len(backend) == 0):
            # The backend is optional, fall back to the default instead of failing the search
            backend = "auto"

        try:
            websites = internet.SearchText(ToolArgs["keywords"], Backend = backend)
        except Exception:
            # Best effort: a failed search is reported as "No results found."
            websites = []

        scrapedText, inputMedia = _ScrapeURLsAsMarkdown(websites, Multimodal)

        if (len(websites) == 0):
            scrapedText += "No results found."
    else:
        # A tool that is listed as a default tool but has no implementation here
        return None

    inputText = ("# Results from all the websites\n\n" + scrapedText).strip()

    if (MaxLength is not None and MaxLength >= 100):
        inputText = inputText[:MaxLength - 1]

    return inputMedia + [{"type": "text", "text": inputText}]
|
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
from bs4 import BeautifulSoup
|
|
2
|
+
from typing import Any, Literal
|
|
3
|
+
from urllib.parse import urlparse
|
|
4
|
+
from urllib.robotparser import RobotFileParser
|
|
5
|
+
from ddgs.ddgs import DDGS
|
|
6
|
+
from . import format_conversion
|
|
7
|
+
import base64
|
|
8
|
+
import requests
|
|
9
|
+
import re
|
|
10
|
+
|
|
11
|
+
# Search client shared by every call to SearchText
__DDGS__: DDGS = DDGS()

# HTTP headers sent by __get_requests_response__
SCRAPE_HEADERS: dict[str, Any] = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
}

# When True, Scrape_Base checks robots.txt before it downloads a page
FollowScrapeGuidelines: bool = True
|
|
16
|
+
|
|
17
|
+
class ScrapeGuidelinesError(Exception):
    """
    Raised when robots.txt does not allow fetching the requested URL.

    It derives from Exception (not BaseException) so that ordinary
    `except Exception` handlers can report it like any other scrape failure.
    """

    def __init__(self) -> None:
        super().__init__("Scrapping not allowed here. Please disable the scrapping guidelines or scrape another website if you see this error often.")
|
|
20
|
+
|
|
21
|
+
def __get_requests_response__(URL: str, Timeout: float | None = 30.0) -> requests.Response:
    """
    Send a GET request to `URL` with the scrape headers.

    Args:
        URL: Address to request.
        Timeout: Seconds to wait for the server before giving up. Without a
            timeout a server that never answers would block the caller forever.
            Use None to wait without limit.

    Returns:
        The response of the server.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        requests.Timeout: If the server does not answer within `Timeout` seconds.
    """
    response = requests.get(URL, headers = SCRAPE_HEADERS, timeout = Timeout)
    response.raise_for_status()

    return response
|
|
26
|
+
|
|
27
|
+
def DownloadContent(URL: str, ReturnAsBase64: bool = False, ReturnAsString: bool = False, Timeout: float | None = 30.0) -> bytes | str:
    """
    Download the content found at `URL`.

    Args:
        URL: Address of the content.
        ReturnAsBase64: Encode the downloaded bytes as base64.
        ReturnAsString: Decode the result (base64 or raw bytes) as UTF-8 text.
            Raw bytes that are not valid UTF-8 make the decoding fail.
        Timeout: Seconds to wait for the server before giving up, so that a
            server that never answers cannot block the caller forever.
            Use None to wait without limit.

    Returns:
        The content as bytes, or as a string when `ReturnAsString` is set.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        requests.Timeout: If the server does not answer within `Timeout` seconds.
    """
    response = requests.get(URL, timeout = Timeout)
    response.raise_for_status()

    content = response.content

    if (ReturnAsBase64):
        content = base64.b64encode(content)

    if (ReturnAsString):
        content = content.decode("utf-8")

    return content
|
|
40
|
+
|
|
41
|
+
def SearchText(
    Keywords: str,
    Region: str = "auto",
    UseSafeSearch: bool = True,
    MaxResults: int = 5,
    Backend: Literal[
        "bing", "brave", "duckduckgo", "google", "grokipedia",
        "mojeek", "yandex", "yahoo", "wikipedia", "auto"
    ] = "auto"
) -> list[str]:
    """
    Search the internet for text results and return the result links.

    Args:
        Keywords: Query to search.
        Region: Region passed to the search client.
        UseSafeSearch: Use the "moderate" safe search mode when True, "off" otherwise.
        MaxResults: Maximum number of results requested.
        Backend: Search backend passed to the search client.

    Returns:
        The "href" value of every result, in the order given by the search client.
    """
    if (UseSafeSearch):
        safeSearchMode = "moderate"
    else:
        safeSearchMode = "off"

    foundResults = __DDGS__.text(
        query = Keywords,
        region = Region,
        safesearch = safeSearchMode,
        max_results = MaxResults,
        backend = Backend
    )
    links = []

    for result in foundResults:
        links.append(result["href"])

    return links
|
|
59
|
+
|
|
60
|
+
def GetBaseURL(URL: str) -> str:
    """
    Remove the fragment and the query string from a URL.

    Everything from the first "#" is dropped, then everything from the first "?"
    of what is left. The scheme, the host and the path are kept as they are.
    """
    withoutFragment = URL.partition("#")[0]
    withoutQuery = withoutFragment.partition("?")[0]

    return withoutQuery
|
|
82
|
+
|
|
83
|
+
def GetURLInfo(URL: str) -> dict[str, str]:
|
|
84
|
+
if ("://" in URL):
|
|
85
|
+
protocol = URL[:URL.find("://")]
|
|
86
|
+
website = URL[URL.find("://") + 3:]
|
|
87
|
+
else:
|
|
88
|
+
protocol = "http"
|
|
89
|
+
website = URL
|
|
90
|
+
|
|
91
|
+
if ("/" in website):
|
|
92
|
+
website = website[:website.find("/")]
|
|
93
|
+
|
|
94
|
+
if (website.count(".") == 1):
|
|
95
|
+
subdomain = None
|
|
96
|
+
elif (website.count(".") >= 2):
|
|
97
|
+
subdomain = ".".join(website.split(".")[:-2])
|
|
98
|
+
website = ".".join(website.split(".")[-2:])
|
|
99
|
+
|
|
100
|
+
return {
|
|
101
|
+
"protocol": protocol,
|
|
102
|
+
"website": website,
|
|
103
|
+
"subdomain": subdomain
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
def Scrape_Base(URL: str) -> BeautifulSoup:
    """
    Download a page and parse it with the "html.parser" parser.

    When `FollowScrapeGuidelines` is set, the robots.txt found at the root of
    the website is read first.

    Raises:
        ScrapeGuidelinesError: If robots.txt does not allow "*" to fetch `URL`.
    """
    if (FollowScrapeGuidelines):
        parsedURL = urlparse(URL)
        siteRoot = f"{parsedURL.scheme}://{parsedURL.netloc}/"

        robots = RobotFileParser(siteRoot + "robots.txt")
        robots.read()
        isAllowed = robots.can_fetch("*", URL)

        if (not isAllowed):
            raise ScrapeGuidelinesError()

    pageText = __get_requests_response__(URL).text

    return BeautifulSoup(pageText, "html.parser")
|
|
121
|
+
|
|
122
|
+
def Scrape_Wikipedia(URL: str) -> dict[str, str | list[dict[str, str]]]:
    """
    Scrape the title and the paragraphs of a Wikipedia article.

    Returns:
        A dictionary with "title", "content_text" (the non-empty paragraphs
        converted to markdown and separated by blank lines) and an empty
        "content_media" list.
    """
    soup = Scrape_Base(URL)
    title = soup.find("h1", {"class": "mw-first-heading"}).get_text().strip()
    articleBody = soup.find("div", {"class": "mw-parser-output"})

    # Paragraphs without visible text are left out
    content = [
        format_conversion.HTML_To_Markdown(str(paragraph)).strip()
        for paragraph in articleBody.find_all("p")
        if (paragraph.get_text().strip())
    ]

    return {"title": title, "content_text": "\n\n".join(content), "content_media": []}  # TODO: Scrape images too
|
|
133
|
+
|
|
134
|
+
def Scrape_Reddit_Post(URL: str) -> dict[str, str | list[dict[str, str]]]:
    """
    Scrape the title, the text and the media of a Reddit post.

    Args:
        URL: Address of the page of the post.

    Returns:
        A dictionary with "title" ("No title" when it is not found),
        "content_text" (markdown, "No text content" when it is not found) and
        "content_media", a list of `{"type": <type>, <type>: <base64 string>}`
        items where <type> is "image" or "video".
    """
    soup = Scrape_Base(URL)
    titleTag = soup.find("h1", {"slot": "title"})
    bodyTag = soup.find("div", {"property": "schema:articleBody"})
    media = []

    if (titleTag is None):
        title = "No title"
    else:
        title = titleTag.get_text().strip()

    if (bodyTag is None):
        contentTxt = "No text content"
    else:
        contentTxt = format_conversion.HTML_To_Markdown(str(bodyTag)).strip()

    gallery = soup.find("gallery-carousel")

    if (gallery is None):
        # Single media post: an image, or a player holding a video or a GIF
        mediaData = soup.find("img", {"id": "post-image"})
        mediaType = "image"

        if (mediaData is None):
            mediaData = soup.find("shreddit-player")
            mediaType = None if (mediaData is None) else "video" if (mediaData.get("post-type") == "video") else "gif"

        if (mediaData is not None):
            if (mediaType == "video"):
                # For videos only a low-quality preview can be obtained,
                # and it is sometimes unreliable
                mediaURL = mediaData.get("preview")
            else:
                mediaURL = mediaData.get("src")

                if (mediaType == "gif"):
                    mediaType = "video"  # Reddit converts GIF to MP4

            if (mediaURL is None):
                mediaURL = mediaData.get("data-lazy-src")

            if (mediaURL is not None):
                media.append({
                    "type": mediaType,
                    mediaType: DownloadContent(URL = mediaURL, ReturnAsBase64 = True, ReturnAsString = True)
                })
    else:
        for item in gallery.find_all("li"):
            mediaContainer = item.find("figure", {"class": "items-center"})

            # Gallery items are expected to be images; skip any item that has
            # no figure or no image instead of failing the whole scrape
            mediaData = None if (mediaContainer is None) else mediaContainer.find("img")

            if (mediaData is None):
                continue

            mediaURL = mediaData.get("src")

            if (mediaURL is None):
                mediaURL = mediaData.get("data-lazy-src")

            if (mediaURL is None):
                continue

            media.append({
                "type": "image",
                "image": DownloadContent(URL = mediaURL, ReturnAsBase64 = True, ReturnAsString = True)
            })

    return {"title": title, "content_text": contentTxt, "content_media": media}
|
|
199
|
+
|
|
200
|
+
def Scrape_Reddit_Subreddit(
|
|
201
|
+
URL: str,
|
|
202
|
+
IsName: bool = False,
|
|
203
|
+
ScrapePosts: bool = False,
|
|
204
|
+
PostsLimit: int | None = None
|
|
205
|
+
) -> list[str | dict[str, str | list[dict[str, str]]]]:
|
|
206
|
+
if (IsName):
|
|
207
|
+
url = f"https://reddit.com/r/{URL}/hot.json"
|
|
208
|
+
else:
|
|
209
|
+
url = re.search(r"/r/([^/]+)", URL).group(1)
|
|
210
|
+
url = f"https://reddit.com/r/{url}/hot.json"
|
|
211
|
+
|
|
212
|
+
response = __get_requests_response__(url)
|
|
213
|
+
data = response.json()
|
|
214
|
+
posts = []
|
|
215
|
+
|
|
216
|
+
for post in data["data"]["children"]:
|
|
217
|
+
if (PostsLimit is not None and len(posts) >= PostsLimit):
|
|
218
|
+
break
|
|
219
|
+
|
|
220
|
+
postUrl = post["data"]["url"]
|
|
221
|
+
posts.append(Scrape_Reddit_Post(postUrl) if (ScrapePosts) else postUrl)
|
|
222
|
+
|
|
223
|
+
return posts
|
|
224
|
+
|
|
225
|
+
def Scrape_Wikidot(URL: str) -> dict[str, str | list[dict[str, str]]]:
    """
    Scrape the title, the headings and the paragraphs of a Wikidot page.

    Returns:
        A dictionary with "title", "content_text" (the non-empty headings and
        paragraphs converted to markdown and separated by blank lines) and an
        empty "content_media" list.
    """
    soup = Scrape_Base(URL)
    title = soup.find("div", {"id": "page-title"}).get_text().strip()
    pageContent = soup.find("div", {"id": "page-content"})

    # Elements without visible text are left out
    content = [
        format_conversion.HTML_To_Markdown(str(element)).strip()
        for element in pageContent.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p"])
        if (element.get_text().strip())
    ]

    return {"title": title, "content_text": "\n\n".join(content), "content_media": []}  # TODO: Scrape images too
|
|
236
|
+
|
|
237
|
+
def Scrape_Fandom(URL: str) -> dict[str, str | list[dict[str, str]]]:
    """
    Scrape the title, the headings and the paragraphs of a Fandom page.

    Returns:
        A dictionary with "title", "content_text" (the non-empty headings and
        paragraphs converted to markdown and separated by blank lines) and an
        empty "content_media" list.
    """
    soup = Scrape_Base(URL)
    title = soup.find("h1", {"class": "page-header__title"}).get_text().strip()
    pageContent = soup.find("div", {"class": "mw-content-ltr"})

    # Elements without visible text are left out
    content = [
        format_conversion.HTML_To_Markdown(str(element)).strip()
        for element in pageContent.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p"])
        if (element.get_text().strip())
    ]

    return {"title": title, "content_text": "\n\n".join(content), "content_media": []}  # TODO: Scrape images too
|
|
248
|
+
|
|
249
|
+
def Scrape_Grokipedia(URL: str) -> dict[str, str | list[dict[str, str]]]:
    """
    Scrape the title and the text blocks of a Grokipedia article.

    The references block of the article, when there is one, is removed before
    the text is collected.

    Returns:
        A dictionary with "title", "content_text" (the non-empty elements with
        the "block" class converted to markdown and separated by blank lines)
        and an empty "content_media" list.
    """
    soup = Scrape_Base(URL)
    article = soup.find("article")
    references = article.find("div", {"id": "references"})

    # Not every article has a references block
    if (references is not None):
        references.decompose()

    title = article.find("h1").get_text().strip()
    paragraphs = article.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "span"], {"class": "block"})
    content = []

    for p in paragraphs:
        if (p.get_text().strip()):
            content.append(format_conversion.HTML_To_Markdown(str(p)).strip())

    return {"title": title, "content_text": "\n\n".join(content), "content_media": []}  # TODO: Scrape images too
|
|
263
|
+
|
|
264
|
+
def Scrape_Auto(URL: str, RedditSubredditPosts: int | None = None) -> dict[str, Any]:
    """
    Scrape a URL with the scraper that matches its website.

    Args:
        URL: Address to scrape.
        RedditSubredditPosts: Maximum number of posts to scrape when the URL is
            a subreddit, None for no limit.

    Returns:
        For a subreddit, `{"posts": [...], "type": "reddit subreddit"}`.
        For anything else, a dictionary with "title", "content_text",
        "content_media" and "type", where "type" is "reddit post", "wikipedia",
        "wikidot", "fandom", "grokipedia" or "unknown".
    """
    website = GetURLInfo(GetBaseURL(URL))["website"]

    if (website == "reddit.com"):
        if ("/comments/" in URL):
            # Scrape Reddit post
            return Scrape_Reddit_Post(URL) | {"type": "reddit post"}

        # Scrape Reddit subreddit
        return {"posts": Scrape_Reddit_Subreddit(URL, False, True, RedditSubredditPosts), "type": "reddit subreddit"}
    elif (website == "wikipedia.org"):
        return Scrape_Wikipedia(URL) | {"type": "wikipedia"}
    elif (website == "wikidot.com"):
        return Scrape_Wikidot(URL) | {"type": "wikidot"}
    elif (website == "fandom.com"):
        return Scrape_Fandom(URL) | {"type": "fandom"}
    elif (website == "grokipedia.com"):
        return Scrape_Grokipedia(URL) | {"type": "grokipedia"}

    # Unknown website: convert every heading, paragraph and span.
    # The elements are joined one by one because the string form of the whole
    # result list is a list representation ("[<h1>...</h1>, <p>...</p>]"), whose
    # brackets and commas would end up in the converted text.
    elements = Scrape_Base(URL).find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "span"])
    websiteContent = format_conversion.HTML_To_Markdown("\n".join(str(element) for element in elements))

    return {"title": "No title detected", "content_text": websiteContent, "content_media": [], "type": "unknown"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: I4_0-Client-Utils
|
|
3
|
+
Version: 20.0.0
|
|
4
|
+
Summary: Client-side utilities for I4.0.
|
|
5
|
+
Author: TAO71-AI
|
|
6
|
+
License: TAO71 I4.0 License (version 2)
|
|
7
|
+
Project-URL: Homepage, https://github.com/TAO71-AI/I4.0-NEW
|
|
8
|
+
Project-URL: Source, https://github.com/TAO71-AI/I4.0-NEW/tree/master/Client/Utils
|
|
9
|
+
Project-URL: License, https://github.com/TAO71-AI/I4.0-NEW/blob/master/LICENSE.md
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: Other/Proprietary License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Requires-Python: >=3.11
|
|
14
|
+
Requires-Dist: html2text
|
|
15
|
+
Requires-Dist: requests
|
|
16
|
+
Requires-Dist: beautifulsoup4
|
|
17
|
+
Requires-Dist: ddgs
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
pyproject.toml
|
|
2
|
+
I40ClientUtils/__init__.py
|
|
3
|
+
I40ClientUtils/chatbot_tools.py
|
|
4
|
+
I40ClientUtils/format_conversion.py
|
|
5
|
+
I40ClientUtils/internet.py
|
|
6
|
+
I4_0_Client_Utils.egg-info/PKG-INFO
|
|
7
|
+
I4_0_Client_Utils.egg-info/SOURCES.txt
|
|
8
|
+
I4_0_Client_Utils.egg-info/dependency_links.txt
|
|
9
|
+
I4_0_Client_Utils.egg-info/requires.txt
|
|
10
|
+
I4_0_Client_Utils.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
I40ClientUtils
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: I4_0-Client-Utils
|
|
3
|
+
Version: 20.0.0
|
|
4
|
+
Summary: Client-side utilities for I4.0.
|
|
5
|
+
Author: TAO71-AI
|
|
6
|
+
License: TAO71 I4.0 License (version 2)
|
|
7
|
+
Project-URL: Homepage, https://github.com/TAO71-AI/I4.0-NEW
|
|
8
|
+
Project-URL: Source, https://github.com/TAO71-AI/I4.0-NEW/tree/master/Client/Utils
|
|
9
|
+
Project-URL: License, https://github.com/TAO71-AI/I4.0-NEW/blob/master/LICENSE.md
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: Other/Proprietary License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Requires-Python: >=3.11
|
|
14
|
+
Requires-Dist: html2text
|
|
15
|
+
Requires-Dist: requests
|
|
16
|
+
Requires-Dist: beautifulsoup4
|
|
17
|
+
Requires-Dist: ddgs
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "I4_0-Client-Utils"
|
|
7
|
+
version = "20.0.0"
|
|
8
|
+
description = "Client-side utilities for I4.0."
|
|
9
|
+
authors = [{name = "TAO71-AI"}]
|
|
10
|
+
license = {text = "TAO71 I4.0 License (version 2)"}
|
|
11
|
+
requires-python = ">=3.11"
|
|
12
|
+
dependencies = [
|
|
13
|
+
"html2text",
|
|
14
|
+
"requests",
|
|
15
|
+
"beautifulsoup4",
|
|
16
|
+
"ddgs"
|
|
17
|
+
]
|
|
18
|
+
classifiers = [
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"License :: Other/Proprietary License",
|
|
21
|
+
"Operating System :: OS Independent"
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
[project.urls]
|
|
25
|
+
Homepage = "https://github.com/TAO71-AI/I4.0-NEW"
|
|
26
|
+
Source = "https://github.com/TAO71-AI/I4.0-NEW/tree/master/Client/Utils"
|
|
27
|
+
License = "https://github.com/TAO71-AI/I4.0-NEW/blob/master/LICENSE.md"
|
|
28
|
+
|
|
29
|
+
[tool.setuptools.packages.find]
|
|
30
|
+
include = ["I40ClientUtils", "I40ClientUtils.*"]
|