webscout 8.2.8__py3-none-any.whl → 8.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184)
  1. webscout/AIauto.py +32 -14
  2. webscout/AIbase.py +96 -37
  3. webscout/AIutel.py +491 -87
  4. webscout/Bard.py +441 -323
  5. webscout/Extra/GitToolkit/__init__.py +10 -10
  6. webscout/Extra/YTToolkit/ytapi/video.py +232 -232
  7. webscout/Litlogger/README.md +10 -0
  8. webscout/Litlogger/__init__.py +7 -59
  9. webscout/Litlogger/formats.py +4 -0
  10. webscout/Litlogger/handlers.py +103 -0
  11. webscout/Litlogger/levels.py +13 -0
  12. webscout/Litlogger/logger.py +92 -0
  13. webscout/Provider/AISEARCH/Perplexity.py +332 -358
  14. webscout/Provider/AISEARCH/felo_search.py +9 -35
  15. webscout/Provider/AISEARCH/genspark_search.py +30 -56
  16. webscout/Provider/AISEARCH/hika_search.py +4 -16
  17. webscout/Provider/AISEARCH/iask_search.py +410 -436
  18. webscout/Provider/AISEARCH/monica_search.py +4 -30
  19. webscout/Provider/AISEARCH/scira_search.py +6 -32
  20. webscout/Provider/AISEARCH/webpilotai_search.py +38 -64
  21. webscout/Provider/Blackboxai.py +153 -35
  22. webscout/Provider/Deepinfra.py +339 -339
  23. webscout/Provider/ExaChat.py +358 -358
  24. webscout/Provider/Gemini.py +169 -169
  25. webscout/Provider/GithubChat.py +1 -2
  26. webscout/Provider/Glider.py +3 -3
  27. webscout/Provider/HeckAI.py +171 -81
  28. webscout/Provider/OPENAI/BLACKBOXAI.py +766 -735
  29. webscout/Provider/OPENAI/Cloudflare.py +7 -7
  30. webscout/Provider/OPENAI/FreeGemini.py +6 -5
  31. webscout/Provider/OPENAI/NEMOTRON.py +8 -20
  32. webscout/Provider/OPENAI/Qwen3.py +283 -0
  33. webscout/Provider/OPENAI/README.md +952 -1253
  34. webscout/Provider/OPENAI/TwoAI.py +357 -0
  35. webscout/Provider/OPENAI/__init__.py +5 -1
  36. webscout/Provider/OPENAI/ai4chat.py +40 -40
  37. webscout/Provider/OPENAI/api.py +808 -649
  38. webscout/Provider/OPENAI/c4ai.py +3 -3
  39. webscout/Provider/OPENAI/chatgpt.py +555 -555
  40. webscout/Provider/OPENAI/chatgptclone.py +493 -487
  41. webscout/Provider/OPENAI/chatsandbox.py +4 -3
  42. webscout/Provider/OPENAI/copilot.py +242 -0
  43. webscout/Provider/OPENAI/deepinfra.py +5 -2
  44. webscout/Provider/OPENAI/e2b.py +63 -5
  45. webscout/Provider/OPENAI/exaai.py +416 -410
  46. webscout/Provider/OPENAI/exachat.py +444 -443
  47. webscout/Provider/OPENAI/freeaichat.py +2 -2
  48. webscout/Provider/OPENAI/glider.py +5 -2
  49. webscout/Provider/OPENAI/groq.py +5 -2
  50. webscout/Provider/OPENAI/heckai.py +308 -307
  51. webscout/Provider/OPENAI/mcpcore.py +8 -2
  52. webscout/Provider/OPENAI/multichat.py +4 -4
  53. webscout/Provider/OPENAI/netwrck.py +6 -5
  54. webscout/Provider/OPENAI/oivscode.py +287 -0
  55. webscout/Provider/OPENAI/opkfc.py +496 -496
  56. webscout/Provider/OPENAI/pydantic_imports.py +172 -0
  57. webscout/Provider/OPENAI/scirachat.py +15 -9
  58. webscout/Provider/OPENAI/sonus.py +304 -303
  59. webscout/Provider/OPENAI/standardinput.py +433 -433
  60. webscout/Provider/OPENAI/textpollinations.py +4 -4
  61. webscout/Provider/OPENAI/toolbaz.py +413 -413
  62. webscout/Provider/OPENAI/typefully.py +3 -3
  63. webscout/Provider/OPENAI/typegpt.py +11 -5
  64. webscout/Provider/OPENAI/uncovrAI.py +463 -462
  65. webscout/Provider/OPENAI/utils.py +90 -79
  66. webscout/Provider/OPENAI/venice.py +431 -425
  67. webscout/Provider/OPENAI/wisecat.py +387 -381
  68. webscout/Provider/OPENAI/writecream.py +3 -3
  69. webscout/Provider/OPENAI/x0gpt.py +365 -378
  70. webscout/Provider/OPENAI/yep.py +39 -13
  71. webscout/Provider/TTI/README.md +55 -101
  72. webscout/Provider/TTI/__init__.py +4 -9
  73. webscout/Provider/TTI/aiarta.py +365 -0
  74. webscout/Provider/TTI/artbit.py +0 -0
  75. webscout/Provider/TTI/base.py +64 -0
  76. webscout/Provider/TTI/fastflux.py +200 -0
  77. webscout/Provider/TTI/magicstudio.py +201 -0
  78. webscout/Provider/TTI/piclumen.py +203 -0
  79. webscout/Provider/TTI/pixelmuse.py +225 -0
  80. webscout/Provider/TTI/pollinations.py +221 -0
  81. webscout/Provider/TTI/utils.py +11 -0
  82. webscout/Provider/TTS/__init__.py +2 -1
  83. webscout/Provider/TTS/base.py +159 -159
  84. webscout/Provider/TTS/openai_fm.py +129 -0
  85. webscout/Provider/TextPollinationsAI.py +308 -308
  86. webscout/Provider/TwoAI.py +239 -44
  87. webscout/Provider/UNFINISHED/Youchat.py +330 -330
  88. webscout/Provider/UNFINISHED/puterjs.py +635 -0
  89. webscout/Provider/UNFINISHED/test_lmarena.py +119 -119
  90. webscout/Provider/Writecream.py +246 -246
  91. webscout/Provider/__init__.py +2 -0
  92. webscout/Provider/ai4chat.py +33 -8
  93. webscout/Provider/koala.py +169 -169
  94. webscout/Provider/oivscode.py +309 -0
  95. webscout/Provider/samurai.py +3 -2
  96. webscout/Provider/typegpt.py +3 -3
  97. webscout/Provider/uncovr.py +368 -368
  98. webscout/client.py +70 -0
  99. webscout/litprinter/__init__.py +58 -58
  100. webscout/optimizers.py +419 -419
  101. webscout/scout/README.md +3 -1
  102. webscout/scout/core/crawler.py +134 -64
  103. webscout/scout/core/scout.py +148 -109
  104. webscout/scout/element.py +106 -88
  105. webscout/swiftcli/Readme.md +323 -323
  106. webscout/swiftcli/plugins/manager.py +9 -2
  107. webscout/version.py +1 -1
  108. webscout/zeroart/__init__.py +134 -134
  109. webscout/zeroart/effects.py +100 -100
  110. webscout/zeroart/fonts.py +1238 -1238
  111. {webscout-8.2.8.dist-info → webscout-8.2.9.dist-info}/METADATA +159 -35
  112. {webscout-8.2.8.dist-info → webscout-8.2.9.dist-info}/RECORD +116 -161
  113. {webscout-8.2.8.dist-info → webscout-8.2.9.dist-info}/WHEEL +1 -1
  114. {webscout-8.2.8.dist-info → webscout-8.2.9.dist-info}/entry_points.txt +1 -0
  115. webscout/Litlogger/Readme.md +0 -175
  116. webscout/Litlogger/core/__init__.py +0 -6
  117. webscout/Litlogger/core/level.py +0 -23
  118. webscout/Litlogger/core/logger.py +0 -165
  119. webscout/Litlogger/handlers/__init__.py +0 -12
  120. webscout/Litlogger/handlers/console.py +0 -33
  121. webscout/Litlogger/handlers/file.py +0 -143
  122. webscout/Litlogger/handlers/network.py +0 -173
  123. webscout/Litlogger/styles/__init__.py +0 -7
  124. webscout/Litlogger/styles/colors.py +0 -249
  125. webscout/Litlogger/styles/formats.py +0 -458
  126. webscout/Litlogger/styles/text.py +0 -87
  127. webscout/Litlogger/utils/__init__.py +0 -6
  128. webscout/Litlogger/utils/detectors.py +0 -153
  129. webscout/Litlogger/utils/formatters.py +0 -200
  130. webscout/Provider/TTI/AiForce/README.md +0 -159
  131. webscout/Provider/TTI/AiForce/__init__.py +0 -22
  132. webscout/Provider/TTI/AiForce/async_aiforce.py +0 -224
  133. webscout/Provider/TTI/AiForce/sync_aiforce.py +0 -245
  134. webscout/Provider/TTI/FreeAIPlayground/README.md +0 -99
  135. webscout/Provider/TTI/FreeAIPlayground/__init__.py +0 -9
  136. webscout/Provider/TTI/FreeAIPlayground/async_freeaiplayground.py +0 -181
  137. webscout/Provider/TTI/FreeAIPlayground/sync_freeaiplayground.py +0 -180
  138. webscout/Provider/TTI/ImgSys/README.md +0 -174
  139. webscout/Provider/TTI/ImgSys/__init__.py +0 -23
  140. webscout/Provider/TTI/ImgSys/async_imgsys.py +0 -202
  141. webscout/Provider/TTI/ImgSys/sync_imgsys.py +0 -195
  142. webscout/Provider/TTI/MagicStudio/README.md +0 -101
  143. webscout/Provider/TTI/MagicStudio/__init__.py +0 -2
  144. webscout/Provider/TTI/MagicStudio/async_magicstudio.py +0 -111
  145. webscout/Provider/TTI/MagicStudio/sync_magicstudio.py +0 -109
  146. webscout/Provider/TTI/Nexra/README.md +0 -155
  147. webscout/Provider/TTI/Nexra/__init__.py +0 -22
  148. webscout/Provider/TTI/Nexra/async_nexra.py +0 -286
  149. webscout/Provider/TTI/Nexra/sync_nexra.py +0 -258
  150. webscout/Provider/TTI/PollinationsAI/README.md +0 -146
  151. webscout/Provider/TTI/PollinationsAI/__init__.py +0 -23
  152. webscout/Provider/TTI/PollinationsAI/async_pollinations.py +0 -311
  153. webscout/Provider/TTI/PollinationsAI/sync_pollinations.py +0 -265
  154. webscout/Provider/TTI/aiarta/README.md +0 -134
  155. webscout/Provider/TTI/aiarta/__init__.py +0 -2
  156. webscout/Provider/TTI/aiarta/async_aiarta.py +0 -482
  157. webscout/Provider/TTI/aiarta/sync_aiarta.py +0 -440
  158. webscout/Provider/TTI/artbit/README.md +0 -100
  159. webscout/Provider/TTI/artbit/__init__.py +0 -22
  160. webscout/Provider/TTI/artbit/async_artbit.py +0 -155
  161. webscout/Provider/TTI/artbit/sync_artbit.py +0 -148
  162. webscout/Provider/TTI/fastflux/README.md +0 -129
  163. webscout/Provider/TTI/fastflux/__init__.py +0 -22
  164. webscout/Provider/TTI/fastflux/async_fastflux.py +0 -261
  165. webscout/Provider/TTI/fastflux/sync_fastflux.py +0 -252
  166. webscout/Provider/TTI/huggingface/README.md +0 -114
  167. webscout/Provider/TTI/huggingface/__init__.py +0 -22
  168. webscout/Provider/TTI/huggingface/async_huggingface.py +0 -199
  169. webscout/Provider/TTI/huggingface/sync_huggingface.py +0 -195
  170. webscout/Provider/TTI/piclumen/README.md +0 -161
  171. webscout/Provider/TTI/piclumen/__init__.py +0 -23
  172. webscout/Provider/TTI/piclumen/async_piclumen.py +0 -268
  173. webscout/Provider/TTI/piclumen/sync_piclumen.py +0 -233
  174. webscout/Provider/TTI/pixelmuse/README.md +0 -79
  175. webscout/Provider/TTI/pixelmuse/__init__.py +0 -4
  176. webscout/Provider/TTI/pixelmuse/async_pixelmuse.py +0 -249
  177. webscout/Provider/TTI/pixelmuse/sync_pixelmuse.py +0 -182
  178. webscout/Provider/TTI/talkai/README.md +0 -139
  179. webscout/Provider/TTI/talkai/__init__.py +0 -4
  180. webscout/Provider/TTI/talkai/async_talkai.py +0 -229
  181. webscout/Provider/TTI/talkai/sync_talkai.py +0 -207
  182. webscout/Provider/UNFINISHED/oivscode.py +0 -351
  183. {webscout-8.2.8.dist-info → webscout-8.2.9.dist-info}/licenses/LICENSE.md +0 -0
  184. {webscout-8.2.8.dist-info → webscout-8.2.9.dist-info}/top_level.txt +0 -0
webscout/scout/README.md CHANGED
@@ -148,6 +148,7 @@ Scout provides powerful tools for navigating and manipulating HTML/XML documents
 - **Tree Traversal**: Navigate parent-child relationships and sibling elements
 - **Content Extraction**: Extract text, attributes, and structured data
 - **Document Manipulation**: Modify, replace, or remove elements
+- **Dynamic Building**: Easily append or insert new nodes
 
 ```python
 # CSS selector support
@@ -159,6 +160,7 @@ results = scout.find_all('a', attrs={'class': 'external', 'rel': 'nofollow'})
 # Tree traversal
 parent = element.find_parent('div')
 siblings = element.find_next_siblings('p')
+prev_sibling = element.find_previous_sibling('p')
 ```
 
 ### 🧠 Intelligent Analysis
@@ -363,7 +365,7 @@ For detailed API documentation, please refer to the [documentation](https://gith
 
 ## 🔧 Dependencies
 
-- `requests`: HTTP library for making web requests
+- `curl_cffi`: HTTP library used for web requests
 - `lxml`: XML and HTML processing library (optional, recommended)
 - `html5lib`: Standards-compliant HTML parser (optional)
 - `markdownify`: HTML to Markdown conversion
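The README hunks above document two additions to the traversal API. A minimal sketch of how the documented calls compose: only the method names (`find_all`, `find_parent`, `find_next_siblings`, `find_previous_sibling`) and the `features="lxml"` constructor argument come from this diff; the import path and sample HTML are assumptions for illustration.

```python
# Illustrative sketch only: the import path and HTML are assumptions;
# the method names come from the README diff above.
from webscout.scout import Scout

html = "<div><p>first</p><p>second</p><p>third</p></div>"
scout = Scout(html, features="lxml")

second = scout.find_all("p")[1]
parent = second.find_parent("div")            # enclosing <div>
following = second.find_next_siblings("p")    # [<p>third</p>]
previous = second.find_previous_sibling("p")  # <p>first</p>, newly documented in 8.2.9
```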
webscout/scout/core/crawler.py CHANGED
@@ -4,19 +4,26 @@ Scout Crawler Module
 
 import concurrent.futures
 import urllib.parse
-from typing import Union, List, Dict
-import requests
+import time
+import hashlib
+import re
+from urllib import robotparser
+from datetime import datetime
+from typing import Dict, List, Optional, Union
+from webscout.litagent import LitAgent
+from curl_cffi.requests import Session
 
 from .scout import Scout
 
+
 class ScoutCrawler:
     """
     Advanced web crawling utility for Scout library.
     """
-    def __init__(self, base_url: str, max_pages: int = 50, tags_to_remove: List[str] = None):
+    def __init__(self, base_url: str, max_pages: int = 50, tags_to_remove: List[str] = None, session: Optional[Session] = None, delay: float = 0.5, obey_robots: bool = True, allowed_domains: Optional[List[str]] = None):
         """
         Initialize the web crawler.
 
         Args:
             base_url (str): Starting URL to crawl
             max_pages (int, optional): Maximum number of pages to crawl
@@ -24,117 +31,180 @@ class ScoutCrawler:
         """
         self.base_url = base_url
         self.max_pages = max_pages
-        self.tags_to_remove = tags_to_remove if tags_to_remove is not None else ["script", "style", "header", "footer", "nav", "aside", "form", "button"]
+        self.tags_to_remove = tags_to_remove if tags_to_remove is not None else [
+            "script",
+            "style",
+            "header",
+            "footer",
+            "nav",
+            "aside",
+            "form",
+            "button",
+        ]
         self.visited_urls = set()
         self.crawled_pages = []
-
+        self.session = session or Session()
+        self.agent = LitAgent()
+        # Use all headers and generate fingerprint
+        self.session.headers = self.agent.generate_fingerprint()
+        self.session.headers.setdefault("User-Agent", self.agent.chrome())
+        self.delay = delay
+        self.obey_robots = obey_robots
+        self.allowed_domains = allowed_domains or [urllib.parse.urlparse(base_url).netloc]
+        self.last_request_time = 0
+        self.url_hashes = set()
+        if obey_robots:
+            self.robots = robotparser.RobotFileParser()
+            robots_url = urllib.parse.urljoin(base_url, '/robots.txt')
+            try:
+                self.robots.set_url(robots_url)
+                self.robots.read()
+            except Exception:
+                self.robots = None
+        else:
+            self.robots = None
+
+    def _normalize_url(self, url: str) -> str:
+        url = url.split('#')[0]
+        url = re.sub(r'\?.*$', '', url)  # Remove query params
+        return url.rstrip('/')
+
     def _is_valid_url(self, url: str) -> bool:
         """
         Check if a URL is valid and within the same domain.
 
         Args:
             url (str): URL to validate
 
         Returns:
             bool: Whether the URL is valid
         """
         try:
             parsed_base = urllib.parse.urlparse(self.base_url)
             parsed_url = urllib.parse.urlparse(url)
-
-            return (
-                parsed_url.scheme in ['http', 'https'] and
-                parsed_base.netloc == parsed_url.netloc and
-                len(self.visited_urls) < self.max_pages
-            )
+            if parsed_url.scheme not in ["http", "https"]:
+                return False
+            if parsed_url.netloc not in self.allowed_domains:
+                return False
+            if self.obey_robots and self.robots:
+                return self.robots.can_fetch("*", url)
+            return True
         except Exception:
             return False
 
+    def _is_duplicate(self, url: str) -> bool:
+        norm = self._normalize_url(url)
+        url_hash = hashlib.md5(norm.encode()).hexdigest()
+        if url_hash in self.url_hashes:
+            return True
+        self.url_hashes.add(url_hash)
+        return False
+
+    def _extract_main_text(self, soup):
+        # Try to extract main content (simple heuristic)
+        main = soup.find('main')
+        if main:
+            return main.get_text(separator=" ", strip=True)
+        article = soup.find('article')
+        if article:
+            return article.get_text(separator=" ", strip=True)
+        # fallback to body
+        body = soup.find('body')
+        if body:
+            return body.get_text(separator=" ", strip=True)
+        return soup.get_text(separator=" ", strip=True)
+
     def _crawl_page(self, url: str, depth: int = 0) -> Dict[str, Union[str, List[str]]]:
         """
         Crawl a single page and extract information.
 
         Args:
             url (str): URL to crawl
             depth (int, optional): Current crawl depth
 
         Returns:
             Dict[str, Union[str, List[str]]]: Crawled page information
         """
-        if url in self.visited_urls:
+        if url in self.visited_urls or self._is_duplicate(url):
             return {}
-
+        # Throttle requests
+        now = time.time()
+        if self.last_request_time:
+            elapsed = now - self.last_request_time
+            if elapsed < self.delay:
+                time.sleep(self.delay - elapsed)
+        self.last_request_time = time.time()
         try:
-            response = requests.get(url, timeout=10)
+            response = self.session.get(url, timeout=10)
             response.raise_for_status()
-
-            scout = Scout(response.content, features='lxml')
-
-            title_result = scout.find('title')
-            title = title_result[0].get_text() if title_result else ''
-
-            visible_text = scout._soup.get_text(strip=True)
-
-            for tag in scout._soup(self.tags_to_remove):
-                tag.extract()
-
+            if not response.headers.get('Content-Type', '').startswith('text/html'):
+                return {}
+            scout = Scout(response.content, features="lxml")
+            title_result = scout.find("title")
+            title = title_result[0].get_text() if title_result else ""
+            for tag_name in self.tags_to_remove:
+                for tag in scout._soup.find_all(tag_name):
+                    tag.extract()
+            visible_text = self._extract_main_text(scout._soup)
             page_info = {
                 'url': url,
                 'title': title,
                 'links': [
                     urllib.parse.urljoin(url, link.get('href'))
                     for link in scout.find_all('a', href=True)
                     if self._is_valid_url(urllib.parse.urljoin(url, link.get('href')))
                 ],
                 'text': visible_text,
-                'depth': depth
+                'depth': depth,
+                'timestamp': datetime.utcnow().isoformat(),
+                'headers': dict(response.headers),
             }
-
             self.visited_urls.add(url)
             self.crawled_pages.append(page_info)
-
             return page_info
         except Exception as e:
             print(f"Error crawling {url}: {e}")
             return {}
 
-    def crawl(self) -> List[Dict[str, Union[str, List[str]]]]:
+    def crawl(self):
         """
-        Start web crawling from base URL.
-
-        Returns:
-            List[Dict[str, Union[str, List[str]]]]: List of crawled pages
+        Start web crawling from base URL and yield each crawled page in real time.
+
+        Yields:
+            Dict[str, Union[str, List[str]]]: Crawled page information
         """
         with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
             futures = {executor.submit(self._crawl_page, self.base_url, 0)}
-
+            submitted_links: set[str] = set()
+
             while futures:
-                done, futures = concurrent.futures.wait(
+                if len(self.visited_urls) >= self.max_pages:
+                    break
+                done, not_done = concurrent.futures.wait(
                     futures, return_when=concurrent.futures.FIRST_COMPLETED
                 )
-
+                futures = not_done
+
                 for future in done:
                     page_info = future.result()
-
+
+                    if page_info:
+                        yield page_info
+
                     if len(self.visited_urls) >= self.max_pages:
-                        break
-
-                    submitted_links = set() # New set to track submitted links
-                    for link in page_info.get('links', []):
+                        return
+
+                    for link in page_info.get("links", []):
                         if (
-                            len(self.visited_urls) < self.max_pages and
-                            link not in self.visited_urls
+                            len(self.visited_urls) < self.max_pages
+                            and link not in self.visited_urls
+                            and link not in submitted_links
                         ):
-                            if link not in submitted_links: # Check against submitted links
-                                submitted_links.add(link) # Add to submitted links
-                                futures.add(
-                                    executor.submit(
-                                        self._crawl_page,
-                                        link,
-                                        page_info.get('depth', 0) + 1
-                                    )
-                                )
-                                if len(self.visited_urls) >= self.max_pages:
-                                    break
-
-        return self.crawled_pages
+                            submitted_links.add(link)
+                            futures.add(
+                                executor.submit(
+                                    self._crawl_page,
+                                    link,
+                                    page_info.get("depth", 0) + 1,
+                                )
+                            )
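Taken together, the rewritten `crawl()` is now a generator: pages are yielded as their futures complete instead of being collected into a list and returned at the end. A minimal usage sketch, based only on the names visible in this diff; the import path is an assumption from the package layout.

```python
# Hypothetical usage sketch. Constructor arguments and the yielded dict
# keys are taken from the diff above; the import path is assumed.
from webscout.scout.core.crawler import ScoutCrawler

crawler = ScoutCrawler(
    "https://example.com",
    max_pages=10,
    delay=1.0,         # minimum gap between requests, enforced in _crawl_page
    obey_robots=True,  # consult /robots.txt via urllib.robotparser
)

# Pages stream out as soon as each worker finishes.
for page in crawler.crawl():
    print(page["depth"], page["url"], page["title"])

# The instance still accumulates every result in crawler.crawled_pages.
```

Because `crawl()` is now a generator, nothing is fetched until iteration begins, and callers that previously wrote `pages = crawler.crawl()` receive a generator object rather than a list.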