webscout 8.2.8 → 8.3 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of webscout might be problematic.
- webscout/AIauto.py +34 -16
- webscout/AIbase.py +96 -37
- webscout/AIutel.py +491 -87
- webscout/Bard.py +441 -323
- webscout/Extra/GitToolkit/__init__.py +10 -10
- webscout/Extra/YTToolkit/ytapi/video.py +232 -232
- webscout/Litlogger/README.md +10 -0
- webscout/Litlogger/__init__.py +7 -59
- webscout/Litlogger/formats.py +4 -0
- webscout/Litlogger/handlers.py +103 -0
- webscout/Litlogger/levels.py +13 -0
- webscout/Litlogger/logger.py +92 -0
- webscout/Provider/AISEARCH/Perplexity.py +332 -358
- webscout/Provider/AISEARCH/felo_search.py +9 -35
- webscout/Provider/AISEARCH/genspark_search.py +30 -56
- webscout/Provider/AISEARCH/hika_search.py +4 -16
- webscout/Provider/AISEARCH/iask_search.py +410 -436
- webscout/Provider/AISEARCH/monica_search.py +4 -30
- webscout/Provider/AISEARCH/scira_search.py +6 -32
- webscout/Provider/AISEARCH/webpilotai_search.py +38 -64
- webscout/Provider/Blackboxai.py +155 -35
- webscout/Provider/ChatSandbox.py +2 -1
- webscout/Provider/Deepinfra.py +339 -339
- webscout/Provider/ExaChat.py +358 -358
- webscout/Provider/Gemini.py +169 -169
- webscout/Provider/GithubChat.py +1 -2
- webscout/Provider/Glider.py +3 -3
- webscout/Provider/HeckAI.py +172 -82
- webscout/Provider/LambdaChat.py +1 -0
- webscout/Provider/MCPCore.py +7 -3
- webscout/Provider/OPENAI/BLACKBOXAI.py +421 -139
- webscout/Provider/OPENAI/Cloudflare.py +38 -21
- webscout/Provider/OPENAI/FalconH1.py +457 -0
- webscout/Provider/OPENAI/FreeGemini.py +35 -18
- webscout/Provider/OPENAI/NEMOTRON.py +34 -34
- webscout/Provider/OPENAI/PI.py +427 -0
- webscout/Provider/OPENAI/Qwen3.py +304 -0
- webscout/Provider/OPENAI/README.md +952 -1253
- webscout/Provider/OPENAI/TwoAI.py +374 -0
- webscout/Provider/OPENAI/__init__.py +7 -1
- webscout/Provider/OPENAI/ai4chat.py +73 -63
- webscout/Provider/OPENAI/api.py +869 -644
- webscout/Provider/OPENAI/base.py +2 -0
- webscout/Provider/OPENAI/c4ai.py +34 -13
- webscout/Provider/OPENAI/chatgpt.py +575 -556
- webscout/Provider/OPENAI/chatgptclone.py +512 -487
- webscout/Provider/OPENAI/chatsandbox.py +11 -6
- webscout/Provider/OPENAI/copilot.py +258 -0
- webscout/Provider/OPENAI/deepinfra.py +327 -318
- webscout/Provider/OPENAI/e2b.py +140 -104
- webscout/Provider/OPENAI/exaai.py +420 -411
- webscout/Provider/OPENAI/exachat.py +448 -443
- webscout/Provider/OPENAI/flowith.py +7 -3
- webscout/Provider/OPENAI/freeaichat.py +12 -8
- webscout/Provider/OPENAI/glider.py +15 -8
- webscout/Provider/OPENAI/groq.py +5 -2
- webscout/Provider/OPENAI/heckai.py +311 -307
- webscout/Provider/OPENAI/llmchatco.py +9 -7
- webscout/Provider/OPENAI/mcpcore.py +18 -9
- webscout/Provider/OPENAI/multichat.py +7 -5
- webscout/Provider/OPENAI/netwrck.py +16 -11
- webscout/Provider/OPENAI/oivscode.py +290 -0
- webscout/Provider/OPENAI/opkfc.py +507 -496
- webscout/Provider/OPENAI/pydantic_imports.py +172 -0
- webscout/Provider/OPENAI/scirachat.py +29 -17
- webscout/Provider/OPENAI/sonus.py +308 -303
- webscout/Provider/OPENAI/standardinput.py +442 -433
- webscout/Provider/OPENAI/textpollinations.py +18 -11
- webscout/Provider/OPENAI/toolbaz.py +419 -413
- webscout/Provider/OPENAI/typefully.py +17 -10
- webscout/Provider/OPENAI/typegpt.py +21 -11
- webscout/Provider/OPENAI/uncovrAI.py +477 -462
- webscout/Provider/OPENAI/utils.py +90 -79
- webscout/Provider/OPENAI/venice.py +435 -425
- webscout/Provider/OPENAI/wisecat.py +387 -381
- webscout/Provider/OPENAI/writecream.py +166 -163
- webscout/Provider/OPENAI/x0gpt.py +26 -37
- webscout/Provider/OPENAI/yep.py +384 -356
- webscout/Provider/PI.py +2 -1
- webscout/Provider/TTI/README.md +55 -101
- webscout/Provider/TTI/__init__.py +4 -9
- webscout/Provider/TTI/aiarta.py +365 -0
- webscout/Provider/TTI/artbit.py +0 -0
- webscout/Provider/TTI/base.py +64 -0
- webscout/Provider/TTI/fastflux.py +200 -0
- webscout/Provider/TTI/magicstudio.py +201 -0
- webscout/Provider/TTI/piclumen.py +203 -0
- webscout/Provider/TTI/pixelmuse.py +225 -0
- webscout/Provider/TTI/pollinations.py +221 -0
- webscout/Provider/TTI/utils.py +11 -0
- webscout/Provider/TTS/__init__.py +2 -1
- webscout/Provider/TTS/base.py +159 -159
- webscout/Provider/TTS/openai_fm.py +129 -0
- webscout/Provider/TextPollinationsAI.py +308 -308
- webscout/Provider/TwoAI.py +239 -44
- webscout/Provider/UNFINISHED/Youchat.py +330 -330
- webscout/Provider/UNFINISHED/puterjs.py +635 -0
- webscout/Provider/UNFINISHED/test_lmarena.py +119 -119
- webscout/Provider/Writecream.py +246 -246
- webscout/Provider/__init__.py +2 -2
- webscout/Provider/ai4chat.py +33 -8
- webscout/Provider/granite.py +41 -6
- webscout/Provider/koala.py +169 -169
- webscout/Provider/oivscode.py +309 -0
- webscout/Provider/samurai.py +3 -2
- webscout/Provider/scnet.py +1 -0
- webscout/Provider/typegpt.py +3 -3
- webscout/Provider/uncovr.py +368 -368
- webscout/client.py +70 -0
- webscout/litprinter/__init__.py +58 -58
- webscout/optimizers.py +419 -419
- webscout/scout/README.md +3 -1
- webscout/scout/core/crawler.py +134 -64
- webscout/scout/core/scout.py +148 -109
- webscout/scout/element.py +106 -88
- webscout/swiftcli/Readme.md +323 -323
- webscout/swiftcli/plugins/manager.py +9 -2
- webscout/version.py +1 -1
- webscout/zeroart/__init__.py +134 -134
- webscout/zeroart/effects.py +100 -100
- webscout/zeroart/fonts.py +1238 -1238
- {webscout-8.2.8.dist-info → webscout-8.3.dist-info}/METADATA +160 -35
- webscout-8.3.dist-info/RECORD +290 -0
- {webscout-8.2.8.dist-info → webscout-8.3.dist-info}/WHEEL +1 -1
- {webscout-8.2.8.dist-info → webscout-8.3.dist-info}/entry_points.txt +1 -0
- webscout/Litlogger/Readme.md +0 -175
- webscout/Litlogger/core/__init__.py +0 -6
- webscout/Litlogger/core/level.py +0 -23
- webscout/Litlogger/core/logger.py +0 -165
- webscout/Litlogger/handlers/__init__.py +0 -12
- webscout/Litlogger/handlers/console.py +0 -33
- webscout/Litlogger/handlers/file.py +0 -143
- webscout/Litlogger/handlers/network.py +0 -173
- webscout/Litlogger/styles/__init__.py +0 -7
- webscout/Litlogger/styles/colors.py +0 -249
- webscout/Litlogger/styles/formats.py +0 -458
- webscout/Litlogger/styles/text.py +0 -87
- webscout/Litlogger/utils/__init__.py +0 -6
- webscout/Litlogger/utils/detectors.py +0 -153
- webscout/Litlogger/utils/formatters.py +0 -200
- webscout/Provider/ChatGPTGratis.py +0 -194
- webscout/Provider/TTI/AiForce/README.md +0 -159
- webscout/Provider/TTI/AiForce/__init__.py +0 -22
- webscout/Provider/TTI/AiForce/async_aiforce.py +0 -224
- webscout/Provider/TTI/AiForce/sync_aiforce.py +0 -245
- webscout/Provider/TTI/FreeAIPlayground/README.md +0 -99
- webscout/Provider/TTI/FreeAIPlayground/__init__.py +0 -9
- webscout/Provider/TTI/FreeAIPlayground/async_freeaiplayground.py +0 -181
- webscout/Provider/TTI/FreeAIPlayground/sync_freeaiplayground.py +0 -180
- webscout/Provider/TTI/ImgSys/README.md +0 -174
- webscout/Provider/TTI/ImgSys/__init__.py +0 -23
- webscout/Provider/TTI/ImgSys/async_imgsys.py +0 -202
- webscout/Provider/TTI/ImgSys/sync_imgsys.py +0 -195
- webscout/Provider/TTI/MagicStudio/README.md +0 -101
- webscout/Provider/TTI/MagicStudio/__init__.py +0 -2
- webscout/Provider/TTI/MagicStudio/async_magicstudio.py +0 -111
- webscout/Provider/TTI/MagicStudio/sync_magicstudio.py +0 -109
- webscout/Provider/TTI/Nexra/README.md +0 -155
- webscout/Provider/TTI/Nexra/__init__.py +0 -22
- webscout/Provider/TTI/Nexra/async_nexra.py +0 -286
- webscout/Provider/TTI/Nexra/sync_nexra.py +0 -258
- webscout/Provider/TTI/PollinationsAI/README.md +0 -146
- webscout/Provider/TTI/PollinationsAI/__init__.py +0 -23
- webscout/Provider/TTI/PollinationsAI/async_pollinations.py +0 -311
- webscout/Provider/TTI/PollinationsAI/sync_pollinations.py +0 -265
- webscout/Provider/TTI/aiarta/README.md +0 -134
- webscout/Provider/TTI/aiarta/__init__.py +0 -2
- webscout/Provider/TTI/aiarta/async_aiarta.py +0 -482
- webscout/Provider/TTI/aiarta/sync_aiarta.py +0 -440
- webscout/Provider/TTI/artbit/README.md +0 -100
- webscout/Provider/TTI/artbit/__init__.py +0 -22
- webscout/Provider/TTI/artbit/async_artbit.py +0 -155
- webscout/Provider/TTI/artbit/sync_artbit.py +0 -148
- webscout/Provider/TTI/fastflux/README.md +0 -129
- webscout/Provider/TTI/fastflux/__init__.py +0 -22
- webscout/Provider/TTI/fastflux/async_fastflux.py +0 -261
- webscout/Provider/TTI/fastflux/sync_fastflux.py +0 -252
- webscout/Provider/TTI/huggingface/README.md +0 -114
- webscout/Provider/TTI/huggingface/__init__.py +0 -22
- webscout/Provider/TTI/huggingface/async_huggingface.py +0 -199
- webscout/Provider/TTI/huggingface/sync_huggingface.py +0 -195
- webscout/Provider/TTI/piclumen/README.md +0 -161
- webscout/Provider/TTI/piclumen/__init__.py +0 -23
- webscout/Provider/TTI/piclumen/async_piclumen.py +0 -268
- webscout/Provider/TTI/piclumen/sync_piclumen.py +0 -233
- webscout/Provider/TTI/pixelmuse/README.md +0 -79
- webscout/Provider/TTI/pixelmuse/__init__.py +0 -4
- webscout/Provider/TTI/pixelmuse/async_pixelmuse.py +0 -249
- webscout/Provider/TTI/pixelmuse/sync_pixelmuse.py +0 -182
- webscout/Provider/TTI/talkai/README.md +0 -139
- webscout/Provider/TTI/talkai/__init__.py +0 -4
- webscout/Provider/TTI/talkai/async_talkai.py +0 -229
- webscout/Provider/TTI/talkai/sync_talkai.py +0 -207
- webscout/Provider/UNFINISHED/oivscode.py +0 -351
- webscout-8.2.8.dist-info/RECORD +0 -334
- {webscout-8.2.8.dist-info → webscout-8.3.dist-info}/licenses/LICENSE.md +0 -0
- {webscout-8.2.8.dist-info → webscout-8.3.dist-info}/top_level.txt +0 -0
webscout/scout/README.md CHANGED

@@ -148,6 +148,7 @@ Scout provides powerful tools for navigating and manipulating HTML/XML documents
 - **Tree Traversal**: Navigate parent-child relationships and sibling elements
 - **Content Extraction**: Extract text, attributes, and structured data
 - **Document Manipulation**: Modify, replace, or remove elements
+- **Dynamic Building**: Easily append or insert new nodes
 
 ```python
 # CSS selector support
@@ -159,6 +160,7 @@ results = scout.find_all('a', attrs={'class': 'external', 'rel': 'nofollow'})
 # Tree traversal
 parent = element.find_parent('div')
 siblings = element.find_next_siblings('p')
+prev_sibling = element.find_previous_sibling('p')
 ```
 
 ### 🧠 Intelligent Analysis
@@ -363,7 +365,7 @@ For detailed API documentation, please refer to the [documentation](https://gith
 
 ## 🔧 Dependencies
 
-- `
+- `curl_cffi`: HTTP library used for web requests
 - `lxml`: XML and HTML processing library (optional, recommended)
 - `html5lib`: Standards-compliant HTML parser (optional)
 - `markdownify`: HTML to Markdown conversion
webscout/scout/core/crawler.py CHANGED

@@ -4,19 +4,26 @@ Scout Crawler Module
 
 import concurrent.futures
 import urllib.parse
-
-import
+import time
+import hashlib
+import re
+from urllib import robotparser
+from datetime import datetime
+from typing import Dict, List, Optional, Union
+from webscout.litagent import LitAgent
+from curl_cffi.requests import Session
 
 from .scout import Scout
 
+
 class ScoutCrawler:
     """
     Advanced web crawling utility for Scout library.
     """
-    def __init__(self, base_url: str, max_pages: int = 50, tags_to_remove: List[str] = None):
+    def __init__(self, base_url: str, max_pages: int = 50, tags_to_remove: List[str] = None, session: Optional[Session] = None, delay: float = 0.5, obey_robots: bool = True, allowed_domains: Optional[List[str]] = None):
         """
         Initialize the web crawler.
-
+
         Args:
             base_url (str): Starting URL to crawl
             max_pages (int, optional): Maximum number of pages to crawl
@@ -24,117 +31,180 @@ class ScoutCrawler:
         """
         self.base_url = base_url
         self.max_pages = max_pages
-        self.tags_to_remove = tags_to_remove if tags_to_remove is not None else [
+        self.tags_to_remove = tags_to_remove if tags_to_remove is not None else [
+            "script",
+            "style",
+            "header",
+            "footer",
+            "nav",
+            "aside",
+            "form",
+            "button",
+        ]
         self.visited_urls = set()
         self.crawled_pages = []
-
+        self.session = session or Session()
+        self.agent = LitAgent()
+        # Use all headers and generate fingerprint
+        self.session.headers = self.agent.generate_fingerprint()
+        self.session.headers.setdefault("User-Agent", self.agent.chrome())
+        self.delay = delay
+        self.obey_robots = obey_robots
+        self.allowed_domains = allowed_domains or [urllib.parse.urlparse(base_url).netloc]
+        self.last_request_time = 0
+        self.url_hashes = set()
+        if obey_robots:
+            self.robots = robotparser.RobotFileParser()
+            robots_url = urllib.parse.urljoin(base_url, '/robots.txt')
+            try:
+                self.robots.set_url(robots_url)
+                self.robots.read()
+            except Exception:
+                self.robots = None
+        else:
+            self.robots = None
+
+    def _normalize_url(self, url: str) -> str:
+        url = url.split('#')[0]
+        url = re.sub(r'\?.*$', '', url)  # Remove query params
+        return url.rstrip('/')
+
     def _is_valid_url(self, url: str) -> bool:
         """
         Check if a URL is valid and within the same domain.
-
+
         Args:
             url (str): URL to validate
-
+
         Returns:
             bool: Whether the URL is valid
         """
         try:
             parsed_base = urllib.parse.urlparse(self.base_url)
             parsed_url = urllib.parse.urlparse(url)
-
-
-
-
-
-
+            if parsed_url.scheme not in ["http", "https"]:
+                return False
+            if parsed_url.netloc not in self.allowed_domains:
+                return False
+            if self.obey_robots and self.robots:
+                return self.robots.can_fetch("*", url)
+            return True
         except Exception:
             return False
-
+
+    def _is_duplicate(self, url: str) -> bool:
+        norm = self._normalize_url(url)
+        url_hash = hashlib.md5(norm.encode()).hexdigest()
+        if url_hash in self.url_hashes:
+            return True
+        self.url_hashes.add(url_hash)
+        return False
+
+    def _extract_main_text(self, soup):
+        # Try to extract main content (simple heuristic)
+        main = soup.find('main')
+        if main:
+            return main.get_text(separator=" ", strip=True)
+        article = soup.find('article')
+        if article:
+            return article.get_text(separator=" ", strip=True)
+        # fallback to body
+        body = soup.find('body')
+        if body:
+            return body.get_text(separator=" ", strip=True)
+        return soup.get_text(separator=" ", strip=True)
+
     def _crawl_page(self, url: str, depth: int = 0) -> Dict[str, Union[str, List[str]]]:
         """
         Crawl a single page and extract information.
-
+
         Args:
             url (str): URL to crawl
             depth (int, optional): Current crawl depth
-
+
         Returns:
             Dict[str, Union[str, List[str]]]: Crawled page information
         """
-        if url in self.visited_urls:
+        if url in self.visited_urls or self._is_duplicate(url):
             return {}
-
+        # Throttle requests
+        now = time.time()
+        if self.last_request_time:
+            elapsed = now - self.last_request_time
+            if elapsed < self.delay:
+                time.sleep(self.delay - elapsed)
+        self.last_request_time = time.time()
         try:
-            response =
+            response = self.session.get(url, timeout=10)
             response.raise_for_status()
-
-
-
-            title_result = scout.find(
-            title = title_result[0].get_text() if title_result else
-
-
-
-
-                    tag.extract()
-
+            if not response.headers.get('Content-Type', '').startswith('text/html'):
+                return {}
+            scout = Scout(response.content, features="lxml")
+            title_result = scout.find("title")
+            title = title_result[0].get_text() if title_result else ""
+            for tag_name in self.tags_to_remove:
+                for tag in scout._soup.find_all(tag_name):
+                    tag.extract()
+            visible_text = self._extract_main_text(scout._soup)
             page_info = {
                 'url': url,
                 'title': title,
                 'links': [
-                    urllib.parse.urljoin(url, link.get('href'))
-                    for link in scout.find_all('a', href=True)
+                    urllib.parse.urljoin(url, link.get('href'))
+                    for link in scout.find_all('a', href=True)
                     if self._is_valid_url(urllib.parse.urljoin(url, link.get('href')))
                 ],
                 'text': visible_text,
-                'depth': depth
+                'depth': depth,
+                'timestamp': datetime.utcnow().isoformat(),
+                'headers': dict(response.headers),
             }
-
             self.visited_urls.add(url)
             self.crawled_pages.append(page_info)
-
             return page_info
         except Exception as e:
             print(f"Error crawling {url}: {e}")
             return {}
-
-    def crawl(self)
+
+    def crawl(self):
         """
-        Start web crawling from base URL.
-
-
-
+        Start web crawling from base URL and yield each crawled page in real time.
+
+        Yields:
+            Dict[str, Union[str, List[str]]]: Crawled page information
         """
         with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
             futures = {executor.submit(self._crawl_page, self.base_url, 0)}
-
+            submitted_links: set[str] = set()
+
             while futures:
-
+                if len(self.visited_urls) >= self.max_pages:
+                    break
+                done, not_done = concurrent.futures.wait(
                     futures, return_when=concurrent.futures.FIRST_COMPLETED
                 )
-
+                futures = not_done
+
                 for future in done:
                     page_info = future.result()
-
+
+                    if page_info:
+                        yield page_info
+
                     if len(self.visited_urls) >= self.max_pages:
-
-
-
-                    for link in page_info.get('links', []):
+                        return
+
+                    for link in page_info.get("links", []):
                         if (
-                            len(self.visited_urls) < self.max_pages
-                            link not in self.visited_urls
+                            len(self.visited_urls) < self.max_pages
+                            and link not in self.visited_urls
+                            and link not in submitted_links
                         ):
-
-
-
-
-
-
-                            page_info.get('depth', 0) + 1
-                            )
+                            submitted_links.add(link)
+                            futures.add(
+                                executor.submit(
+                                    self._crawl_page,
+                                    link,
+                                    page_info.get("depth", 0) + 1,
                                 )
-
-                break
-
-        return self.crawled_pages
+                            )