webscout-8.2.7-py3-none-any.whl → webscout-8.2.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- webscout/AIauto.py +33 -15
- webscout/AIbase.py +96 -37
- webscout/AIutel.py +703 -250
- webscout/Bard.py +441 -323
- webscout/Extra/Act.md +309 -0
- webscout/Extra/GitToolkit/__init__.py +10 -0
- webscout/Extra/GitToolkit/gitapi/README.md +110 -0
- webscout/Extra/GitToolkit/gitapi/__init__.py +12 -0
- webscout/Extra/GitToolkit/gitapi/repository.py +195 -0
- webscout/Extra/GitToolkit/gitapi/user.py +96 -0
- webscout/Extra/GitToolkit/gitapi/utils.py +62 -0
- webscout/Extra/YTToolkit/README.md +375 -0
- webscout/Extra/YTToolkit/YTdownloader.py +957 -0
- webscout/Extra/YTToolkit/__init__.py +3 -0
- webscout/Extra/YTToolkit/transcriber.py +476 -0
- webscout/Extra/YTToolkit/ytapi/README.md +44 -0
- webscout/Extra/YTToolkit/ytapi/__init__.py +6 -0
- webscout/Extra/YTToolkit/ytapi/channel.py +307 -0
- webscout/Extra/YTToolkit/ytapi/errors.py +13 -0
- webscout/Extra/YTToolkit/ytapi/extras.py +118 -0
- webscout/Extra/YTToolkit/ytapi/https.py +88 -0
- webscout/Extra/YTToolkit/ytapi/patterns.py +61 -0
- webscout/Extra/YTToolkit/ytapi/playlist.py +59 -0
- webscout/Extra/YTToolkit/ytapi/pool.py +8 -0
- webscout/Extra/YTToolkit/ytapi/query.py +40 -0
- webscout/Extra/YTToolkit/ytapi/stream.py +63 -0
- webscout/Extra/YTToolkit/ytapi/utils.py +62 -0
- webscout/Extra/YTToolkit/ytapi/video.py +232 -0
- webscout/Extra/__init__.py +7 -0
- webscout/Extra/autocoder/__init__.py +9 -0
- webscout/Extra/autocoder/autocoder.py +1105 -0
- webscout/Extra/autocoder/autocoder_utiles.py +332 -0
- webscout/Extra/gguf.md +430 -0
- webscout/Extra/gguf.py +684 -0
- webscout/Extra/tempmail/README.md +488 -0
- webscout/Extra/tempmail/__init__.py +28 -0
- webscout/Extra/tempmail/async_utils.py +141 -0
- webscout/Extra/tempmail/base.py +161 -0
- webscout/Extra/tempmail/cli.py +187 -0
- webscout/Extra/tempmail/emailnator.py +84 -0
- webscout/Extra/tempmail/mail_tm.py +361 -0
- webscout/Extra/tempmail/temp_mail_io.py +292 -0
- webscout/Extra/weather.md +281 -0
- webscout/Extra/weather.py +194 -0
- webscout/Extra/weather_ascii.py +76 -0
- webscout/Litlogger/README.md +10 -0
- webscout/Litlogger/__init__.py +15 -0
- webscout/Litlogger/formats.py +4 -0
- webscout/Litlogger/handlers.py +103 -0
- webscout/Litlogger/levels.py +13 -0
- webscout/Litlogger/logger.py +92 -0
- webscout/Provider/AI21.py +177 -0
- webscout/Provider/AISEARCH/DeepFind.py +254 -0
- webscout/Provider/AISEARCH/Perplexity.py +333 -0
- webscout/Provider/AISEARCH/README.md +279 -0
- webscout/Provider/AISEARCH/__init__.py +9 -0
- webscout/Provider/AISEARCH/felo_search.py +202 -0
- webscout/Provider/AISEARCH/genspark_search.py +324 -0
- webscout/Provider/AISEARCH/hika_search.py +186 -0
- webscout/Provider/AISEARCH/iask_search.py +410 -0
- webscout/Provider/AISEARCH/monica_search.py +220 -0
- webscout/Provider/AISEARCH/scira_search.py +298 -0
- webscout/Provider/AISEARCH/webpilotai_search.py +255 -0
- webscout/Provider/Aitopia.py +316 -0
- webscout/Provider/AllenAI.py +440 -0
- webscout/Provider/Andi.py +228 -0
- webscout/Provider/Blackboxai.py +791 -0
- webscout/Provider/ChatGPTClone.py +237 -0
- webscout/Provider/ChatGPTGratis.py +194 -0
- webscout/Provider/ChatSandbox.py +342 -0
- webscout/Provider/Cloudflare.py +324 -0
- webscout/Provider/Cohere.py +208 -0
- webscout/Provider/Deepinfra.py +340 -0
- webscout/Provider/ExaAI.py +261 -0
- webscout/Provider/ExaChat.py +358 -0
- webscout/Provider/Flowith.py +217 -0
- webscout/Provider/FreeGemini.py +250 -0
- webscout/Provider/Gemini.py +169 -0
- webscout/Provider/GithubChat.py +369 -0
- webscout/Provider/GizAI.py +295 -0
- webscout/Provider/Glider.py +225 -0
- webscout/Provider/Groq.py +801 -0
- webscout/Provider/HF_space/__init__.py +0 -0
- webscout/Provider/HF_space/qwen_qwen2.py +206 -0
- webscout/Provider/HeckAI.py +375 -0
- webscout/Provider/HuggingFaceChat.py +469 -0
- webscout/Provider/Hunyuan.py +283 -0
- webscout/Provider/Jadve.py +291 -0
- webscout/Provider/Koboldai.py +384 -0
- webscout/Provider/LambdaChat.py +411 -0
- webscout/Provider/Llama3.py +259 -0
- webscout/Provider/MCPCore.py +315 -0
- webscout/Provider/Marcus.py +198 -0
- webscout/Provider/Nemotron.py +218 -0
- webscout/Provider/Netwrck.py +270 -0
- webscout/Provider/OLLAMA.py +396 -0
- webscout/Provider/OPENAI/BLACKBOXAI.py +766 -0
- webscout/Provider/OPENAI/Cloudflare.py +378 -0
- webscout/Provider/OPENAI/FreeGemini.py +283 -0
- webscout/Provider/OPENAI/NEMOTRON.py +232 -0
- webscout/Provider/OPENAI/Qwen3.py +283 -0
- webscout/Provider/OPENAI/README.md +952 -0
- webscout/Provider/OPENAI/TwoAI.py +357 -0
- webscout/Provider/OPENAI/__init__.py +40 -0
- webscout/Provider/OPENAI/ai4chat.py +293 -0
- webscout/Provider/OPENAI/api.py +969 -0
- webscout/Provider/OPENAI/base.py +249 -0
- webscout/Provider/OPENAI/c4ai.py +373 -0
- webscout/Provider/OPENAI/chatgpt.py +556 -0
- webscout/Provider/OPENAI/chatgptclone.py +494 -0
- webscout/Provider/OPENAI/chatsandbox.py +173 -0
- webscout/Provider/OPENAI/copilot.py +242 -0
- webscout/Provider/OPENAI/deepinfra.py +322 -0
- webscout/Provider/OPENAI/e2b.py +1414 -0
- webscout/Provider/OPENAI/exaai.py +417 -0
- webscout/Provider/OPENAI/exachat.py +444 -0
- webscout/Provider/OPENAI/flowith.py +162 -0
- webscout/Provider/OPENAI/freeaichat.py +359 -0
- webscout/Provider/OPENAI/glider.py +326 -0
- webscout/Provider/OPENAI/groq.py +364 -0
- webscout/Provider/OPENAI/heckai.py +308 -0
- webscout/Provider/OPENAI/llmchatco.py +335 -0
- webscout/Provider/OPENAI/mcpcore.py +389 -0
- webscout/Provider/OPENAI/multichat.py +376 -0
- webscout/Provider/OPENAI/netwrck.py +357 -0
- webscout/Provider/OPENAI/oivscode.py +287 -0
- webscout/Provider/OPENAI/opkfc.py +496 -0
- webscout/Provider/OPENAI/pydantic_imports.py +172 -0
- webscout/Provider/OPENAI/scirachat.py +477 -0
- webscout/Provider/OPENAI/sonus.py +304 -0
- webscout/Provider/OPENAI/standardinput.py +433 -0
- webscout/Provider/OPENAI/textpollinations.py +339 -0
- webscout/Provider/OPENAI/toolbaz.py +413 -0
- webscout/Provider/OPENAI/typefully.py +355 -0
- webscout/Provider/OPENAI/typegpt.py +364 -0
- webscout/Provider/OPENAI/uncovrAI.py +463 -0
- webscout/Provider/OPENAI/utils.py +318 -0
- webscout/Provider/OPENAI/venice.py +431 -0
- webscout/Provider/OPENAI/wisecat.py +387 -0
- webscout/Provider/OPENAI/writecream.py +163 -0
- webscout/Provider/OPENAI/x0gpt.py +365 -0
- webscout/Provider/OPENAI/yep.py +382 -0
- webscout/Provider/OpenGPT.py +209 -0
- webscout/Provider/Openai.py +496 -0
- webscout/Provider/PI.py +429 -0
- webscout/Provider/Perplexitylabs.py +415 -0
- webscout/Provider/QwenLM.py +254 -0
- webscout/Provider/Reka.py +214 -0
- webscout/Provider/StandardInput.py +290 -0
- webscout/Provider/TTI/README.md +82 -0
- webscout/Provider/TTI/__init__.py +7 -0
- webscout/Provider/TTI/aiarta.py +365 -0
- webscout/Provider/TTI/artbit.py +0 -0
- webscout/Provider/TTI/base.py +64 -0
- webscout/Provider/TTI/fastflux.py +200 -0
- webscout/Provider/TTI/magicstudio.py +201 -0
- webscout/Provider/TTI/piclumen.py +203 -0
- webscout/Provider/TTI/pixelmuse.py +225 -0
- webscout/Provider/TTI/pollinations.py +221 -0
- webscout/Provider/TTI/utils.py +11 -0
- webscout/Provider/TTS/README.md +192 -0
- webscout/Provider/TTS/__init__.py +10 -0
- webscout/Provider/TTS/base.py +159 -0
- webscout/Provider/TTS/deepgram.py +156 -0
- webscout/Provider/TTS/elevenlabs.py +111 -0
- webscout/Provider/TTS/gesserit.py +128 -0
- webscout/Provider/TTS/murfai.py +113 -0
- webscout/Provider/TTS/openai_fm.py +129 -0
- webscout/Provider/TTS/parler.py +111 -0
- webscout/Provider/TTS/speechma.py +580 -0
- webscout/Provider/TTS/sthir.py +94 -0
- webscout/Provider/TTS/streamElements.py +333 -0
- webscout/Provider/TTS/utils.py +280 -0
- webscout/Provider/TeachAnything.py +229 -0
- webscout/Provider/TextPollinationsAI.py +308 -0
- webscout/Provider/TwoAI.py +475 -0
- webscout/Provider/TypliAI.py +305 -0
- webscout/Provider/UNFINISHED/ChatHub.py +209 -0
- webscout/Provider/UNFINISHED/Youchat.py +330 -0
- webscout/Provider/UNFINISHED/liner_api_request.py +263 -0
- webscout/Provider/UNFINISHED/puterjs.py +635 -0
- webscout/Provider/UNFINISHED/test_lmarena.py +119 -0
- webscout/Provider/Venice.py +258 -0
- webscout/Provider/VercelAI.py +253 -0
- webscout/Provider/WiseCat.py +233 -0
- webscout/Provider/WrDoChat.py +370 -0
- webscout/Provider/Writecream.py +246 -0
- webscout/Provider/WritingMate.py +269 -0
- webscout/Provider/__init__.py +174 -0
- webscout/Provider/ai4chat.py +174 -0
- webscout/Provider/akashgpt.py +335 -0
- webscout/Provider/asksteve.py +220 -0
- webscout/Provider/cerebras.py +290 -0
- webscout/Provider/chatglm.py +215 -0
- webscout/Provider/cleeai.py +213 -0
- webscout/Provider/copilot.py +425 -0
- webscout/Provider/elmo.py +283 -0
- webscout/Provider/freeaichat.py +285 -0
- webscout/Provider/geminiapi.py +208 -0
- webscout/Provider/granite.py +235 -0
- webscout/Provider/hermes.py +266 -0
- webscout/Provider/julius.py +223 -0
- webscout/Provider/koala.py +170 -0
- webscout/Provider/learnfastai.py +325 -0
- webscout/Provider/llama3mitril.py +215 -0
- webscout/Provider/llmchat.py +258 -0
- webscout/Provider/llmchatco.py +306 -0
- webscout/Provider/lmarena.py +198 -0
- webscout/Provider/meta.py +801 -0
- webscout/Provider/multichat.py +364 -0
- webscout/Provider/oivscode.py +309 -0
- webscout/Provider/samurai.py +224 -0
- webscout/Provider/scira_chat.py +299 -0
- webscout/Provider/scnet.py +243 -0
- webscout/Provider/searchchat.py +292 -0
- webscout/Provider/sonus.py +258 -0
- webscout/Provider/talkai.py +194 -0
- webscout/Provider/toolbaz.py +353 -0
- webscout/Provider/turboseek.py +266 -0
- webscout/Provider/typefully.py +202 -0
- webscout/Provider/typegpt.py +289 -0
- webscout/Provider/uncovr.py +368 -0
- webscout/Provider/x0gpt.py +299 -0
- webscout/Provider/yep.py +389 -0
- webscout/__init__.py +4 -2
- webscout/cli.py +3 -28
- webscout/client.py +70 -0
- webscout/conversation.py +35 -35
- webscout/litagent/Readme.md +276 -0
- webscout/litagent/__init__.py +29 -0
- webscout/litagent/agent.py +455 -0
- webscout/litagent/constants.py +60 -0
- webscout/litprinter/__init__.py +59 -0
- webscout/optimizers.py +419 -419
- webscout/scout/README.md +404 -0
- webscout/scout/__init__.py +8 -0
- webscout/scout/core/__init__.py +7 -0
- webscout/scout/core/crawler.py +210 -0
- webscout/scout/core/scout.py +607 -0
- webscout/scout/core/search_result.py +96 -0
- webscout/scout/core/text_analyzer.py +63 -0
- webscout/scout/core/text_utils.py +277 -0
- webscout/scout/core/web_analyzer.py +52 -0
- webscout/scout/element.py +478 -0
- webscout/scout/parsers/__init__.py +69 -0
- webscout/scout/parsers/html5lib_parser.py +172 -0
- webscout/scout/parsers/html_parser.py +236 -0
- webscout/scout/parsers/lxml_parser.py +178 -0
- webscout/scout/utils.py +37 -0
- webscout/swiftcli/Readme.md +323 -0
- webscout/swiftcli/__init__.py +95 -0
- webscout/swiftcli/core/__init__.py +7 -0
- webscout/swiftcli/core/cli.py +297 -0
- webscout/swiftcli/core/context.py +104 -0
- webscout/swiftcli/core/group.py +241 -0
- webscout/swiftcli/decorators/__init__.py +28 -0
- webscout/swiftcli/decorators/command.py +221 -0
- webscout/swiftcli/decorators/options.py +220 -0
- webscout/swiftcli/decorators/output.py +252 -0
- webscout/swiftcli/exceptions.py +21 -0
- webscout/swiftcli/plugins/__init__.py +9 -0
- webscout/swiftcli/plugins/base.py +135 -0
- webscout/swiftcli/plugins/manager.py +269 -0
- webscout/swiftcli/utils/__init__.py +59 -0
- webscout/swiftcli/utils/formatting.py +252 -0
- webscout/swiftcli/utils/parsing.py +267 -0
- webscout/version.py +1 -1
- webscout/webscout_search.py +2 -182
- webscout/webscout_search_async.py +1 -179
- webscout/zeroart/README.md +89 -0
- webscout/zeroart/__init__.py +135 -0
- webscout/zeroart/base.py +66 -0
- webscout/zeroart/effects.py +101 -0
- webscout/zeroart/fonts.py +1239 -0
- {webscout-8.2.7.dist-info → webscout-8.2.9.dist-info}/METADATA +262 -83
- webscout-8.2.9.dist-info/RECORD +289 -0
- {webscout-8.2.7.dist-info → webscout-8.2.9.dist-info}/WHEEL +1 -1
- {webscout-8.2.7.dist-info → webscout-8.2.9.dist-info}/entry_points.txt +1 -0
- webscout-8.2.7.dist-info/RECORD +0 -26
- {webscout-8.2.7.dist-info → webscout-8.2.9.dist-info}/licenses/LICENSE.md +0 -0
- {webscout-8.2.7.dist-info → webscout-8.2.9.dist-info}/top_level.txt +0 -0
webscout/scout/README.md
ADDED
@@ -0,0 +1,404 @@

# 🕵️ Scout: Next-Gen Web Parsing Library

<div align="center">

[](https://www.python.org/)
[](https://opensource.org/licenses/MIT)
[](https://github.com/OE-LUCIFER/Webscout)
[](https://github.com/OE-LUCIFER/Webscout/wiki)
[](https://github.com/OE-LUCIFER/Webscout/pulls)

</div>

## 📋 Overview

Scout is a powerful, flexible, and performant HTML parsing library that makes web scraping a breeze! It provides intelligent HTML/XML parsing with advanced features like web crawling, text analysis, semantic extraction, and Markdown conversion. Scout goes beyond traditional parsing libraries with its intuitive API and comprehensive feature set.

<details open>
<summary><b>Why Choose Scout?</b></summary>

- **Powerful Parsing**: Multiple parser backends with intelligent markup handling
- **Advanced Analysis**: Built-in text and web content analysis tools
- **Concurrent Crawling**: Efficient multi-threaded web crawling
- **Flexible API**: Intuitive interface similar to BeautifulSoup but with enhanced capabilities
- **Format Conversion**: Convert HTML to JSON, Markdown, and more

</details>

## 📑 Table of Contents

- [Installation](#-installation)
- [Quick Start](#-quick-start)
- [Features](#-features)
- [Advanced Usage](#-advanced-usage)
- [API Reference](#-api-reference)
- [Dependencies](#-dependencies)
- [Supported Python Versions](#-supported-python-versions)
- [Contributing](#-contributing)
- [License](#-license)

## 📦 Installation

```bash
pip install webscout
```

Or install the latest version from GitHub:

```bash
pip install git+https://github.com/OE-LUCIFER/Webscout.git
```

## 🚀 Quick Start

### Basic Parsing

```python
from webscout.scout import Scout

# Parse HTML content
html_content = """
<html>
<body>
<h1>Hello, Scout!</h1>
<div class="content">
<p>Web parsing made easy.</p>
<a href="https://example.com">Link</a>
</div>
</body>
</html>
"""

scout = Scout(html_content)

# Find elements
title = scout.find('h1')
links = scout.find_all('a')

# Extract text
print(title[0].get_text())  # Output: Hello, Scout!
print(links.attrs('href'))  # Output: ['https://example.com']
```

### Web Crawling

```python
from webscout.scout import ScoutCrawler

# Crawl a website with default settings
crawler = ScoutCrawler('https://example.com')  # Default: max_pages=50

# Or customize the crawler
crawler = ScoutCrawler(
    'https://example.com',                     # base_url
    max_pages=100,                             # maximum pages to crawl
    tags_to_remove=['script', 'style', 'nav']  # tags to remove from content
)

# Start crawling
crawled_pages = crawler.crawl()

for page in crawled_pages:
    print(f"URL: {page['url']}")
    print(f"Title: {page['title']}")
    print(f"Links found: {len(page['links'])}")
    print(f"Crawl depth: {page['depth']}")
```

### Text Analysis

```python
from webscout.scout import Scout

# Parse a webpage
html = """<div><h1>Climate Change</h1><p>Email us at info@example.com or call 555-123-4567.</p>
<p>Visit https://climate-action.org for more information.</p></div>"""
scout = Scout(html)

# Analyze text and extract entities
analysis = scout.analyze_text()
print(f"Word frequencies: {analysis['word_count']}")
print(f"Entities found: {analysis['entities']}")
```

## ✨ Features

### 🔍 Multiple Parser Support

Scout supports multiple HTML/XML parsers, allowing you to choose the best tool for your specific needs:

| Parser | Description | Best For |
|--------|-------------|----------|
| `html.parser` | Python's built-in parser | General-purpose parsing, no dependencies |
| `lxml` | Fast C-based parser | Performance-critical applications |
| `html5lib` | Highly compliant HTML5 parser | Handling malformed HTML |
| `lxml-xml` | XML parser | XML document parsing |

```python
# Choose your parser
scout = Scout(html_content, features='lxml')      # For speed
scout = Scout(html_content, features='html5lib')  # For compliance
```
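
When speed matters but `lxml` may not be installed, parser selection can be made defensive. A minimal sketch, not taken from the README itself; it assumes only the `features` values listed in the table above:

```python
from webscout.scout import Scout

# Prefer the fast C-based parser when available, otherwise fall back
# to Python's built-in parser.
try:
    import lxml  # noqa: F401  (optional dependency)
    features = 'lxml'
except ImportError:
    features = 'html.parser'

scout = Scout("<p>hello</p>", features=features)
```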

### 🌐 Advanced Parsing Capabilities

Scout provides powerful tools for navigating and manipulating HTML/XML documents:

- **Element Selection**: Find elements by tag name, attributes, CSS selectors, and more
- **Tree Traversal**: Navigate parent-child relationships and sibling elements
- **Content Extraction**: Extract text, attributes, and structured data
- **Document Manipulation**: Modify, replace, or remove elements
- **Dynamic Building**: Easily append or insert new nodes

```python
# CSS selector support
elements = scout.select('div.content > p')

# Advanced find with attribute matching
results = scout.find_all('a', attrs={'class': 'external', 'rel': 'nofollow'})

# Tree traversal
parent = element.find_parent('div')
siblings = element.find_next_siblings('p')
prev_sibling = element.find_previous_sibling('p')
```

### 🧠 Intelligent Analysis

Scout includes built-in analysis tools for extracting insights from web content:

#### Text Analysis

```python
# Extract and analyze text
text = scout.get_text()
word_counts = scout.text_analyzer.count_words(text)
entities = scout.text_analyzer.extract_entities(text)
```

#### Web Structure Analysis

```python
# Analyze page structure
structure = scout.analyze_page_structure()
print(f"Most common tags: {structure['tag_distribution']}")
print(f"Page depth: {max(structure['depth_analysis'].keys())}")
```

#### Semantic Information Extraction

```python
# Extract semantic information
semantics = scout.extract_semantic_info()
print(f"Headings: {semantics['headings']}")
print(f"Lists: {len(semantics['lists']['ul']) + len(semantics['lists']['ol'])}")
print(f"Tables: {semantics['tables']['count']}")
```

### 🕸️ Web Crawling

Scout includes a powerful concurrent web crawler for fetching and analyzing multiple pages:

```python
from webscout.scout import ScoutCrawler

# Create a crawler with default settings
crawler = ScoutCrawler('https://example.com')  # Default: max_pages=50

# Or customize the crawler with specific options
crawler = ScoutCrawler(
    'https://example.com',                     # base_url
    max_pages=100,                             # maximum pages to crawl
    tags_to_remove=['script', 'style', 'nav']  # tags to remove from content
)

# Start crawling
pages = crawler.crawl()

# Process results
for page in pages:
    print(f"URL: {page['url']}")
    print(f"Title: {page['title']}")
    print(f"Links: {len(page['links'])}")
    print(f"Depth: {page['depth']}")
```

The crawler automatically:
- Stays within the same domain as the base URL
- Uses concurrent requests for faster crawling
- Removes unwanted tags (like scripts and styles) for cleaner text extraction
- Tracks crawl depth for each page
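
Judging from the `ScoutCrawler` source shipped in this release (`webscout/scout/core/crawler.py`, included later in this diff), the constructor also accepts throttling, robots.txt, and domain allow-list options, and `crawl()` yields pages as they are fetched. A minimal sketch using those parameters:

```python
from webscout.scout import ScoutCrawler

crawler = ScoutCrawler(
    'https://example.com',
    max_pages=25,
    delay=1.0,                        # minimum seconds between requests
    obey_robots=True,                 # skip URLs disallowed by robots.txt
    allowed_domains=['example.com'],  # hosts the crawler may follow
)

# crawl() is implemented as a generator, so pages can be processed lazily.
for page in crawler.crawl():
    print(page['depth'], page['url'])
```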

### 📄 Format Conversion

Scout can convert HTML to various formats:

```python
# Convert to JSON
json_data = scout.to_json(indent=2)

# Convert to Markdown
markdown = scout.to_markdown(heading_style='ATX')

# Pretty-print HTML
pretty_html = scout.prettify()
```

## 🔬 Advanced Usage

### Working with Search Results

Scout's search methods return a `ScoutSearchResult` object with powerful methods for processing results:

```python
from webscout.scout import Scout

scout = Scout(html_content)

# Find all paragraphs
paragraphs = scout.find_all('p')

# Extract all text from results
all_text = paragraphs.texts(separator='\n')

# Extract specific attributes
hrefs = paragraphs.attrs('href')

# Filter results with a predicate function
important = paragraphs.filter(lambda p: 'important' in p.get('class', []))

# Transform results
word_counts = paragraphs.map(lambda p: len(p.get_text().split()))

# Analyze text in results
analysis = paragraphs.analyze_text()
```

### URL Handling and Analysis

```python
from webscout.scout import Scout

scout = Scout(html_content)

# Parse and analyze URLs
links = scout.extract_links(base_url='https://example.com')
for link in links:
    url_components = scout.url_parse(link['href'])
    print(f"Domain: {url_components['netloc']}")
    print(f"Path: {url_components['path']}")
```

### Metadata Extraction

```python
from webscout.scout import Scout

scout = Scout(html_content)

# Extract metadata
metadata = scout.extract_metadata()
print(f"Title: {metadata['title']}")
print(f"Description: {metadata['description']}")
print(f"Open Graph: {metadata['og_metadata']}")
print(f"Twitter Card: {metadata['twitter_metadata']}")
```

### Content Hashing and Caching

```python
from webscout.scout import Scout

scout = Scout(html_content)

# Generate content hash
content_hash = scout.hash_content(method='sha256')

# Use caching for expensive operations
if not scout.cache('parsed_data'):
    data = scout.extract_semantic_info()
    scout.cache('parsed_data', data)

cached_data = scout.cache('parsed_data')
```

## 📚 API Reference

### Core Classes

| Class | Description |
|-------|-------------|
| `Scout` | Main class for HTML parsing and traversal |
| `ScoutCrawler` | Web crawler for fetching and parsing multiple pages |
| `ScoutTextAnalyzer` | Text analysis utilities |
| `ScoutWebAnalyzer` | Web page analysis utilities |
| `ScoutSearchResult` | Enhanced search results with filtering and analysis |
| `Tag` | Represents an HTML/XML tag |
| `NavigableString` | Represents text within an HTML/XML document |

### Key Methods

#### Scout Class

- `__init__(markup, features='html.parser', from_encoding=None)`: Initialize with HTML content
- `find(name, attrs={}, recursive=True, text=None)`: Find first matching element
- `find_all(name, attrs={}, recursive=True, text=None, limit=None)`: Find all matching elements
- `select(selector)`: Find elements using CSS selector
- `get_text(separator=' ', strip=False)`: Extract text from document
- `analyze_text()`: Perform text analysis
- `analyze_page_structure()`: Analyze document structure
- `extract_semantic_info()`: Extract semantic information
- `extract_links(base_url=None)`: Extract all links
- `extract_metadata()`: Extract metadata from document
- `to_json(indent=2)`: Convert to JSON
- `to_markdown(heading_style='ATX')`: Convert to Markdown
- `prettify(formatter='minimal')`: Pretty-print HTML

#### ScoutCrawler Class

- `__init__(base_url, max_pages=50, tags_to_remove=None)`: Initialize the crawler
- `crawl()`: Start crawling from the base URL
- `_crawl_page(url, depth=0)`: Crawl a single page (internal method)
- `_is_valid_url(url)`: Check if a URL is valid (internal method)
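
For reference, the link filter behind `_is_valid_url` in this release (see `webscout/scout/core/crawler.py` below) accepts only `http`/`https` URLs whose host is in `allowed_domains`, and consults robots.txt when `obey_robots` is set. A standalone sketch of that check:

```python
import urllib.parse

# Illustrative only: mirrors the scheme and domain checks performed by
# ScoutCrawler._is_valid_url in this release (robots.txt handling omitted).
def is_allowed(url: str, allowed_domains: list) -> bool:
    parsed = urllib.parse.urlparse(url)
    return parsed.scheme in ("http", "https") and parsed.netloc in allowed_domains

print(is_allowed("https://example.com/docs", ["example.com"]))  # True
print(is_allowed("ftp://example.com/file", ["example.com"]))    # False
```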

For detailed API documentation, please refer to the [documentation](https://github.com/OE-LUCIFER/Webscout/wiki).

## 🔧 Dependencies

- `curl_cffi`: HTTP library used for web requests
- `lxml`: XML and HTML processing library (optional, recommended)
- `html5lib`: Standards-compliant HTML parser (optional)
- `markdownify`: HTML to Markdown conversion
- `concurrent.futures`: Asynchronous execution (standard library)

## 🌈 Supported Python Versions

- Python 3.8+

## 🤝 Contributing

Contributions are welcome! Here's how you can contribute:

1. Fork the repository
2. Create a feature branch (`git checkout -b feature/amazing-feature`)
3. Commit your changes (`git commit -m 'Add some amazing feature'`)
4. Push to the branch (`git push origin feature/amazing-feature`)
5. Open a Pull Request

Please make sure to update tests as appropriate.

## 📄 License

This project is licensed under the MIT License - see the LICENSE file for details.

---

<div align="center">
  <p>Made with ❤️ by the Webscout team</p>
  <p>
    <a href="https://github.com/OE-LUCIFER/Webscout">GitHub</a> •
    <a href="https://github.com/OE-LUCIFER/Webscout/wiki">Documentation</a> •
    <a href="https://github.com/OE-LUCIFER/Webscout/issues">Report Bug</a> •
    <a href="https://github.com/OE-LUCIFER/Webscout/issues">Request Feature</a>
  </p>
</div>

webscout/scout/__init__.py
ADDED

@@ -0,0 +1,8 @@

"""
Scout: A powerful, zero-dependency web scraping library
"""

from .core import Scout, ScoutCrawler, ScoutTextAnalyzer, ScoutWebAnalyzer, ScoutSearchResult
from .element import Tag, NavigableString

__all__ = ['Scout', 'ScoutCrawler', 'Tag', 'NavigableString','ScoutTextAnalyzer', 'ScoutWebAnalyzer', 'ScoutSearchResult']

webscout/scout/core/__init__.py
ADDED

@@ -0,0 +1,7 @@

from .text_analyzer import ScoutTextAnalyzer
from .web_analyzer import ScoutWebAnalyzer
from .search_result import ScoutSearchResult
from .crawler import ScoutCrawler
from .scout import Scout

__all__ = ['ScoutTextAnalyzer', 'ScoutWebAnalyzer', 'ScoutSearchResult', 'ScoutCrawler', 'Scout']

webscout/scout/core/crawler.py
ADDED

@@ -0,0 +1,210 @@

"""
|
|
2
|
+
Scout Crawler Module
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import concurrent.futures
|
|
6
|
+
import urllib.parse
|
|
7
|
+
import time
|
|
8
|
+
import hashlib
|
|
9
|
+
import re
|
|
10
|
+
from urllib import robotparser
|
|
11
|
+
from datetime import datetime
|
|
12
|
+
from typing import Dict, List, Optional, Union
|
|
13
|
+
from webscout.litagent import LitAgent
|
|
14
|
+
from curl_cffi.requests import Session
|
|
15
|
+
|
|
16
|
+
from .scout import Scout
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ScoutCrawler:
|
|
20
|
+
"""
|
|
21
|
+
Advanced web crawling utility for Scout library.
|
|
22
|
+
"""
|
|
23
|
+
def __init__(self, base_url: str, max_pages: int = 50, tags_to_remove: List[str] = None, session: Optional[Session] = None, delay: float = 0.5, obey_robots: bool = True, allowed_domains: Optional[List[str]] = None):
|
|
24
|
+
"""
|
|
25
|
+
Initialize the web crawler.
|
|
26
|
+
|
|
27
|
+
Args:
|
|
28
|
+
base_url (str): Starting URL to crawl
|
|
29
|
+
max_pages (int, optional): Maximum number of pages to crawl
|
|
30
|
+
tags_to_remove (List[str], optional): List of tags to remove
|
|
31
|
+
"""
|
|
32
|
+
self.base_url = base_url
|
|
33
|
+
self.max_pages = max_pages
|
|
34
|
+
self.tags_to_remove = tags_to_remove if tags_to_remove is not None else [
|
|
35
|
+
"script",
|
|
36
|
+
"style",
|
|
37
|
+
"header",
|
|
38
|
+
"footer",
|
|
39
|
+
"nav",
|
|
40
|
+
"aside",
|
|
41
|
+
"form",
|
|
42
|
+
"button",
|
|
43
|
+
]
|
|
44
|
+
self.visited_urls = set()
|
|
45
|
+
self.crawled_pages = []
|
|
46
|
+
self.session = session or Session()
|
|
47
|
+
self.agent = LitAgent()
|
|
48
|
+
# Use all headers and generate fingerprint
|
|
49
|
+
self.session.headers = self.agent.generate_fingerprint()
|
|
50
|
+
self.session.headers.setdefault("User-Agent", self.agent.chrome())
|
|
51
|
+
self.delay = delay
|
|
52
|
+
self.obey_robots = obey_robots
|
|
53
|
+
self.allowed_domains = allowed_domains or [urllib.parse.urlparse(base_url).netloc]
|
|
54
|
+
self.last_request_time = 0
|
|
55
|
+
self.url_hashes = set()
|
|
56
|
+
if obey_robots:
|
|
57
|
+
self.robots = robotparser.RobotFileParser()
|
|
58
|
+
robots_url = urllib.parse.urljoin(base_url, '/robots.txt')
|
|
59
|
+
try:
|
|
60
|
+
self.robots.set_url(robots_url)
|
|
61
|
+
self.robots.read()
|
|
62
|
+
except Exception:
|
|
63
|
+
self.robots = None
|
|
64
|
+
else:
|
|
65
|
+
self.robots = None
|
|
66
|
+
|
|
67
|
+
def _normalize_url(self, url: str) -> str:
|
|
68
|
+
url = url.split('#')[0]
|
|
69
|
+
url = re.sub(r'\?.*$', '', url) # Remove query params
|
|
70
|
+
return url.rstrip('/')
|
|
71
|
+
|
|
72
|
+
def _is_valid_url(self, url: str) -> bool:
|
|
73
|
+
"""
|
|
74
|
+
Check if a URL is valid and within the same domain.
|
|
75
|
+
|
|
76
|
+
Args:
|
|
77
|
+
url (str): URL to validate
|
|
78
|
+
|
|
79
|
+
Returns:
|
|
80
|
+
bool: Whether the URL is valid
|
|
81
|
+
"""
|
|
82
|
+
try:
|
|
83
|
+
parsed_base = urllib.parse.urlparse(self.base_url)
|
|
84
|
+
parsed_url = urllib.parse.urlparse(url)
|
|
85
|
+
if parsed_url.scheme not in ["http", "https"]:
|
|
86
|
+
return False
|
|
87
|
+
if parsed_url.netloc not in self.allowed_domains:
|
|
88
|
+
return False
|
|
89
|
+
if self.obey_robots and self.robots:
|
|
90
|
+
return self.robots.can_fetch("*", url)
|
|
91
|
+
return True
|
|
92
|
+
except Exception:
|
|
93
|
+
return False
|
|
94
|
+
|
|
95
|
+
def _is_duplicate(self, url: str) -> bool:
|
|
96
|
+
norm = self._normalize_url(url)
|
|
97
|
+
url_hash = hashlib.md5(norm.encode()).hexdigest()
|
|
98
|
+
if url_hash in self.url_hashes:
|
|
99
|
+
return True
|
|
100
|
+
self.url_hashes.add(url_hash)
|
|
101
|
+
return False
|
|
102
|
+
|
|
103
|
+
def _extract_main_text(self, soup):
|
|
104
|
+
# Try to extract main content (simple heuristic)
|
|
105
|
+
main = soup.find('main')
|
|
106
|
+
if main:
|
|
107
|
+
return main.get_text(separator=" ", strip=True)
|
|
108
|
+
article = soup.find('article')
|
|
109
|
+
if article:
|
|
110
|
+
return article.get_text(separator=" ", strip=True)
|
|
111
|
+
# fallback to body
|
|
112
|
+
body = soup.find('body')
|
|
113
|
+
if body:
|
|
114
|
+
return body.get_text(separator=" ", strip=True)
|
|
115
|
+
return soup.get_text(separator=" ", strip=True)
|
|
116
|
+
|
|
117
|
+
def _crawl_page(self, url: str, depth: int = 0) -> Dict[str, Union[str, List[str]]]:
|
|
118
|
+
"""
|
|
119
|
+
Crawl a single page and extract information.
|
|
120
|
+
|
|
121
|
+
Args:
|
|
122
|
+
url (str): URL to crawl
|
|
123
|
+
depth (int, optional): Current crawl depth
|
|
124
|
+
|
|
125
|
+
Returns:
|
|
126
|
+
Dict[str, Union[str, List[str]]]: Crawled page information
|
|
127
|
+
"""
|
|
128
|
+
if url in self.visited_urls or self._is_duplicate(url):
|
|
129
|
+
return {}
|
|
130
|
+
# Throttle requests
|
|
131
|
+
now = time.time()
|
|
132
|
+
if self.last_request_time:
|
|
133
|
+
elapsed = now - self.last_request_time
|
|
134
|
+
if elapsed < self.delay:
|
|
135
|
+
time.sleep(self.delay - elapsed)
|
|
136
|
+
self.last_request_time = time.time()
|
|
137
|
+
try:
|
|
138
|
+
response = self.session.get(url, timeout=10)
|
|
139
|
+
response.raise_for_status()
|
|
140
|
+
if not response.headers.get('Content-Type', '').startswith('text/html'):
|
|
141
|
+
return {}
|
|
142
|
+
scout = Scout(response.content, features="lxml")
|
|
143
|
+
title_result = scout.find("title")
|
|
144
|
+
title = title_result[0].get_text() if title_result else ""
|
|
145
|
+
for tag_name in self.tags_to_remove:
|
|
146
|
+
for tag in scout._soup.find_all(tag_name):
|
|
147
|
+
tag.extract()
|
|
148
|
+
visible_text = self._extract_main_text(scout._soup)
|
|
149
|
+
page_info = {
|
|
150
|
+
'url': url,
|
|
151
|
+
'title': title,
|
|
152
|
+
'links': [
|
|
153
|
+
urllib.parse.urljoin(url, link.get('href'))
|
|
154
|
+
for link in scout.find_all('a', href=True)
|
|
155
|
+
if self._is_valid_url(urllib.parse.urljoin(url, link.get('href')))
|
|
156
|
+
],
|
|
157
|
+
'text': visible_text,
|
|
158
|
+
'depth': depth,
|
|
159
|
+
'timestamp': datetime.utcnow().isoformat(),
|
|
160
|
+
'headers': dict(response.headers),
|
|
161
|
+
}
|
|
162
|
+
self.visited_urls.add(url)
|
|
163
|
+
self.crawled_pages.append(page_info)
|
|
164
|
+
return page_info
|
|
165
|
+
except Exception as e:
|
|
166
|
+
print(f"Error crawling {url}: {e}")
|
|
167
|
+
return {}
|
|
168
|
+
|
|
169
|
+
def crawl(self):
|
|
170
|
+
"""
|
|
171
|
+
Start web crawling from base URL and yield each crawled page in real time.
|
|
172
|
+
|
|
173
|
+
Yields:
|
|
174
|
+
Dict[str, Union[str, List[str]]]: Crawled page information
|
|
175
|
+
"""
|
|
176
|
+
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
|
|
177
|
+
futures = {executor.submit(self._crawl_page, self.base_url, 0)}
|
|
178
|
+
submitted_links: set[str] = set()
|
|
179
|
+
|
|
180
|
+
while futures:
|
|
181
|
+
if len(self.visited_urls) >= self.max_pages:
|
|
182
|
+
break
|
|
183
|
+
done, not_done = concurrent.futures.wait(
|
|
184
|
+
futures, return_when=concurrent.futures.FIRST_COMPLETED
|
|
185
|
+
)
|
|
186
|
+
futures = not_done
|
|
187
|
+
|
|
188
|
+
for future in done:
|
|
189
|
+
page_info = future.result()
|
|
190
|
+
|
|
191
|
+
if page_info:
|
|
192
|
+
yield page_info
|
|
193
|
+
|
|
194
|
+
if len(self.visited_urls) >= self.max_pages:
|
|
195
|
+
return
|
|
196
|
+
|
|
197
|
+
for link in page_info.get("links", []):
|
|
198
|
+
if (
|
|
199
|
+
len(self.visited_urls) < self.max_pages
|
|
200
|
+
and link not in self.visited_urls
|
|
201
|
+
and link not in submitted_links
|
|
202
|
+
):
|
|
203
|
+
submitted_links.add(link)
|
|
204
|
+
futures.add(
|
|
205
|
+
executor.submit(
|
|
206
|
+
self._crawl_page,
|
|
207
|
+
link,
|
|
208
|
+
page_info.get("depth", 0) + 1,
|
|
209
|
+
)
|
|
210
|
+
)
|
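
Each page yielded by `crawl()` above is a plain dict built in `_crawl_page` (keys: `url`, `title`, `links`, `text`, `depth`, `timestamp`, `headers`). A minimal consumption sketch, assuming the `webscout.scout` package layout shown in the file list:

```python
import json
from webscout.scout import ScoutCrawler

crawler = ScoutCrawler("https://example.com", max_pages=5)

# Stream results to JSON Lines as they arrive from the generator.
with open("crawl.jsonl", "w", encoding="utf-8") as fh:
    for page in crawler.crawl():
        fh.write(json.dumps(page) + "\n")
```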