crawlemoon 1.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlemoon-1.1.5/LICENSE +23 -0
- crawlemoon-1.1.5/PKG-INFO +197 -0
- crawlemoon-1.1.5/README.md +128 -0
- crawlemoon-1.1.5/crawlemoon.egg-info/PKG-INFO +197 -0
- crawlemoon-1.1.5/crawlemoon.egg-info/SOURCES.txt +94 -0
- crawlemoon-1.1.5/crawlemoon.egg-info/dependency_links.txt +1 -0
- crawlemoon-1.1.5/crawlemoon.egg-info/entry_points.txt +4 -0
- crawlemoon-1.1.5/crawlemoon.egg-info/requires.txt +51 -0
- crawlemoon-1.1.5/crawlemoon.egg-info/top_level.txt +1 -0
- crawlemoon-1.1.5/pyproject.toml +111 -0
- crawlemoon-1.1.5/setup.cfg +4 -0
- crawlemoon-1.1.5/setup.py +8 -0
- crawlemoon-1.1.5/src/__init__.py +47 -0
- crawlemoon-1.1.5/src/cli/__init__.py +9 -0
- crawlemoon-1.1.5/src/cli/main.py +334 -0
- crawlemoon-1.1.5/src/core/__init__.py +9 -0
- crawlemoon-1.1.5/src/core/browser/__init__.py +9 -0
- crawlemoon-1.1.5/src/core/browser/cdp.py +102 -0
- crawlemoon-1.1.5/src/core/browser/pool.py +269 -0
- crawlemoon-1.1.5/src/core/browser/proxy_pool.py +322 -0
- crawlemoon-1.1.5/src/core/browser/stealth.py +341 -0
- crawlemoon-1.1.5/src/core/browser/xray.py +845 -0
- crawlemoon-1.1.5/src/core/cache/__init__.py +9 -0
- crawlemoon-1.1.5/src/core/cache/manager.py +357 -0
- crawlemoon-1.1.5/src/core/http/__init__.py +2 -0
- crawlemoon-1.1.5/src/core/http/stealth_client.py +259 -0
- crawlemoon-1.1.5/src/core/logging.py +129 -0
- crawlemoon-1.1.5/src/core/rate_limiter.py +332 -0
- crawlemoon-1.1.5/src/core/recording_storage.py +375 -0
- crawlemoon-1.1.5/src/core/session/__init__.py +9 -0
- crawlemoon-1.1.5/src/core/session/manager.py +451 -0
- crawlemoon-1.1.5/src/crawlers/__init__.py +9 -0
- crawlemoon-1.1.5/src/exceptions.py +202 -0
- crawlemoon-1.1.5/src/intelligence/__init__.py +9 -0
- crawlemoon-1.1.5/src/intelligence/extraction/__init__.py +2 -0
- crawlemoon-1.1.5/src/intelligence/extraction/content.py +307 -0
- crawlemoon-1.1.5/src/intelligence/extraction/smart.py +534 -0
- crawlemoon-1.1.5/src/intelligence/generator/__init__.py +9 -0
- crawlemoon-1.1.5/src/intelligence/generator/crawler_gen.py +400 -0
- crawlemoon-1.1.5/src/intelligence/js/__init__.py +9 -0
- crawlemoon-1.1.5/src/intelligence/js/analyzer.py +305 -0
- crawlemoon-1.1.5/src/intelligence/js/deobfuscator.py +154 -0
- crawlemoon-1.1.5/src/intelligence/js/dynamic.py +112 -0
- crawlemoon-1.1.5/src/intelligence/network/__init__.py +9 -0
- crawlemoon-1.1.5/src/intelligence/network/analyzer.py +297 -0
- crawlemoon-1.1.5/src/intelligence/network/api_discovery.py +581 -0
- crawlemoon-1.1.5/src/intelligence/network/graphql.py +49 -0
- crawlemoon-1.1.5/src/intelligence/network/interceptor.py +250 -0
- crawlemoon-1.1.5/src/intelligence/network/sitemap.py +229 -0
- crawlemoon-1.1.5/src/intelligence/network/websocket.py +69 -0
- crawlemoon-1.1.5/src/intelligence/recorder/__init__.py +9 -0
- crawlemoon-1.1.5/src/intelligence/recorder/session.py +388 -0
- crawlemoon-1.1.5/src/intelligence/recorder/state_machine.py +133 -0
- crawlemoon-1.1.5/src/intelligence/security/__init__.py +9 -0
- crawlemoon-1.1.5/src/intelligence/security/auth.py +114 -0
- crawlemoon-1.1.5/src/intelligence/security/bot_detection.py +341 -0
- crawlemoon-1.1.5/src/intelligence/security/captcha_solver.py +534 -0
- crawlemoon-1.1.5/src/intelligence/security/technology_detector.py +210 -0
- crawlemoon-1.1.5/src/mcp/__init__.py +9 -0
- crawlemoon-1.1.5/src/mcp/config.py +151 -0
- crawlemoon-1.1.5/src/mcp/schemas.py +454 -0
- crawlemoon-1.1.5/src/mcp/server.py +4603 -0
- crawlemoon-1.1.5/src/mcp/tools/__init__.py +9 -0
- crawlemoon-1.1.5/src/mcp/utils.py +297 -0
- crawlemoon-1.1.5/src/sites/__init__.py +10 -0
- crawlemoon-1.1.5/tests/test_api_discovery.py +124 -0
- crawlemoon-1.1.5/tests/test_auth_analyzer.py +109 -0
- crawlemoon-1.1.5/tests/test_bot_detection.py +307 -0
- crawlemoon-1.1.5/tests/test_browser_pool.py +191 -0
- crawlemoon-1.1.5/tests/test_cache_manager.py +189 -0
- crawlemoon-1.1.5/tests/test_captcha_solver.py +348 -0
- crawlemoon-1.1.5/tests/test_cdp_client.py +301 -0
- crawlemoon-1.1.5/tests/test_content_extractor.py +351 -0
- crawlemoon-1.1.5/tests/test_crawl_and_extract.py +249 -0
- crawlemoon-1.1.5/tests/test_crawler_generator.py +199 -0
- crawlemoon-1.1.5/tests/test_improvements.py +668 -0
- crawlemoon-1.1.5/tests/test_js_analyzer.py +182 -0
- crawlemoon-1.1.5/tests/test_js_deobfuscator.py +332 -0
- crawlemoon-1.1.5/tests/test_js_security.py +56 -0
- crawlemoon-1.1.5/tests/test_mcp_config.py +301 -0
- crawlemoon-1.1.5/tests/test_mcp_server.py +328 -0
- crawlemoon-1.1.5/tests/test_mcp_utils.py +87 -0
- crawlemoon-1.1.5/tests/test_network_interceptor.py +172 -0
- crawlemoon-1.1.5/tests/test_proxy_pool.py +405 -0
- crawlemoon-1.1.5/tests/test_rate_limiter.py +305 -0
- crawlemoon-1.1.5/tests/test_recording_storage.py +146 -0
- crawlemoon-1.1.5/tests/test_request_analyzer.py +74 -0
- crawlemoon-1.1.5/tests/test_session_manager.py +335 -0
- crawlemoon-1.1.5/tests/test_session_recorder.py +176 -0
- crawlemoon-1.1.5/tests/test_sitemap_analyzer.py +415 -0
- crawlemoon-1.1.5/tests/test_smart_extractor.py +336 -0
- crawlemoon-1.1.5/tests/test_stealth.py +96 -0
- crawlemoon-1.1.5/tests/test_stealth_client.py +318 -0
- crawlemoon-1.1.5/tests/test_technology_detector.py +377 -0
- crawlemoon-1.1.5/tests/test_xray.py +254 -0
- crawlemoon-1.1.5/tests/test_xray_advanced.py +215 -0
crawlemoon-1.1.5/LICENSE
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 emad.dev
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
|
23
|
+
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: crawlemoon
|
|
3
|
+
Version: 1.1.5
|
|
4
|
+
Summary: Advanced Web Crawling Platform with Deep Analysis and MCP Server
|
|
5
|
+
Author-email: "emad.dev" <contact@emad.dev>
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: crawling,scraping,automation,mcp,web-analysis,playwright,api-discovery
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Browsers
|
|
16
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
17
|
+
Classifier: Topic :: System :: Networking
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: playwright>=1.40.0
|
|
22
|
+
Requires-Dist: mcp>=1.0.0
|
|
23
|
+
Requires-Dist: pydantic>=2.0.0
|
|
24
|
+
Requires-Dist: pydantic-settings>=2.0.0
|
|
25
|
+
Requires-Dist: aiohttp>=3.9.0
|
|
26
|
+
Requires-Dist: httpx>=0.25.0
|
|
27
|
+
Requires-Dist: websockets>=12.0
|
|
28
|
+
Requires-Dist: pyyaml>=6.0
|
|
29
|
+
Requires-Dist: python-json-logger>=2.0.0
|
|
30
|
+
Requires-Dist: tenacity>=8.2.0
|
|
31
|
+
Requires-Dist: cachetools>=5.3.0
|
|
32
|
+
Requires-Dist: graphql-core>=3.2.0
|
|
33
|
+
Requires-Dist: esprima>=4.0.0
|
|
34
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
35
|
+
Requires-Dist: lxml>=5.0.0
|
|
36
|
+
Requires-Dist: cryptography>=41.0.0
|
|
37
|
+
Requires-Dist: curl-cffi>=0.6.0
|
|
38
|
+
Requires-Dist: fake-useragent>=1.4.0
|
|
39
|
+
Requires-Dist: trafilatura>=1.6.0
|
|
40
|
+
Requires-Dist: selectolax>=0.3.0
|
|
41
|
+
Requires-Dist: markdownify>=0.11.0
|
|
42
|
+
Requires-Dist: instructor>=1.0.0
|
|
43
|
+
Requires-Dist: openai>=1.0.0
|
|
44
|
+
Requires-Dist: python-Wappalyzer>=0.3.0
|
|
45
|
+
Requires-Dist: ftfy>=6.1.0
|
|
46
|
+
Requires-Dist: dateparser>=1.2.0
|
|
47
|
+
Requires-Dist: url-normalize>=1.4.0
|
|
48
|
+
Requires-Dist: tldextract>=5.0.0
|
|
49
|
+
Provides-Extra: captcha
|
|
50
|
+
Requires-Dist: python-anticaptcha>=1.0.0; extra == "captcha"
|
|
51
|
+
Requires-Dist: capsolver>=1.0.0; extra == "captcha"
|
|
52
|
+
Provides-Extra: ocr
|
|
53
|
+
Requires-Dist: pytesseract>=0.3.10; extra == "ocr"
|
|
54
|
+
Requires-Dist: Pillow>=10.0.0; extra == "ocr"
|
|
55
|
+
Provides-Extra: dev
|
|
56
|
+
Requires-Dist: pytest>=7.4.0; extra == "dev"
|
|
57
|
+
Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
|
|
58
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
59
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
60
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
61
|
+
Requires-Dist: mypy>=1.7.0; extra == "dev"
|
|
62
|
+
Requires-Dist: pip-audit>=2.7.0; extra == "dev"
|
|
63
|
+
Requires-Dist: pre-commit>=3.6.0; extra == "dev"
|
|
64
|
+
Requires-Dist: build>=1.0.0; extra == "dev"
|
|
65
|
+
Requires-Dist: twine>=4.0.0; extra == "dev"
|
|
66
|
+
Provides-Extra: all
|
|
67
|
+
Requires-Dist: crawlemoon[captcha,dev,ocr]; extra == "all"
|
|
68
|
+
Dynamic: license-file
|
|
69
|
+
|
|
70
|
+
# Crawlemoon MCP Server
|
|
71
|
+
|
|
72
|
+
<p align="center">
|
|
73
|
+
<img src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/hero.png" alt="Crawlemoon MCP Server — free, AI-native web crawling for the agent era" width="100%"/>
|
|
74
|
+
</p>
|
|
75
|
+
|
|
76
|
+
<p align="left">
|
|
77
|
+
<img alt="python 3.10+ · pypi 1.1.5 · MIT · MCP-native · code style black" src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/badges.png" height="22"/>
|
|
78
|
+
</p>
|
|
79
|
+
|
|
80
|
+
A **free, open-source MCP server** that gives any agent (Claude Code, Cursor, Windsurf, …) **55 production-grade tools** for the full web-crawling stack: deep analysis, stealth, API discovery, session recording → runnable crawler, smart extraction. No proprietary API. No per-request fee.
|
|
81
|
+
|
|
82
|
+
<p align="center">
|
|
83
|
+
<img src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/features.png" alt="Crawlemoon capabilities — deep analysis, stealth, record→crawler, smart extraction" width="100%"/>
|
|
84
|
+
</p>
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## Quick start
|
|
89
|
+
|
|
90
|
+
<p align="center">
|
|
91
|
+
<img src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/install.png" alt="Three install paths — uvx, pipx, pip" width="100%"/>
|
|
92
|
+
</p>
|
|
93
|
+
|
|
94
|
+
The recommended path needs no install — `uvx` runs straight from PyPI:
|
|
95
|
+
|
|
96
|
+
```json
|
|
97
|
+
{
|
|
98
|
+
"mcpServers": {
|
|
99
|
+
"crawlemoon": {
|
|
100
|
+
"command": "uvx",
|
|
101
|
+
"args": ["crawlemoon-mcp-server"]
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
> Requires [`uv`](https://docs.astral.sh/uv/getting-started/installation/). Install once: `curl -LsSf https://astral.sh/uv/install.sh | sh`. Or use `pipx run crawlemoon-mcp-server` / `pip install crawlemoon-mcp-server` instead.
|
|
108
|
+
|
|
109
|
+
**Where to put that JSON:** Cursor → Settings → MCP. Claude Code → `~/.config/claude/mcp_settings.json`. Windsurf → Settings → MCP Servers.
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## How it works
|
|
114
|
+
|
|
115
|
+
<p align="center">
|
|
116
|
+
<img src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/architecture.png" alt="Agent → Crawlemoon → Browser/HTTP/Proxy → target web" width="100%"/>
|
|
117
|
+
</p>
|
|
118
|
+
|
|
119
|
+
Your agent talks to Crawlemoon over the Model Context Protocol. Crawlemoon owns a hardened browser pool, an HTTP stack with TLS fingerprinting, and a rotating proxy pool. While it fetches pages, it captures network traffic, reads scripts, and introspects schemas — so the agent gets clean structured data, not raw HTML.
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## What's in the box
|
|
124
|
+
|
|
125
|
+
A short list — see the source for the full set of 55 tools.
|
|
126
|
+
|
|
127
|
+
| Group | Tools |
|
|
128
|
+
|---|---|
|
|
129
|
+
| **Deep analysis** | `deep_analyze`, `discover_apis`, `introspect_graphql`, `analyze_websocket`, `analyze_auth`, `detect_protection`, `detect_technology` |
|
|
130
|
+
| **Stealth** | `stealth_request`, `configure_proxies`, `configure_rate_limit`, `add_proxy`, `test_proxy` |
|
|
131
|
+
| **Record → crawler** | `record_session`, `stop_recording`, `export_recording`, `generate_crawler` |
|
|
132
|
+
| **Extraction** | `smart_extract`, `extract_article`, `extract_tables`, `extract_links`, `extract_forms`, `extract_metadata`, `convert_to_markdown` |
|
|
133
|
+
| **Page interaction** | `take_screenshot`, `fill_form`, `wait_and_extract`, `compare_pages`, `measure_performance`, `check_accessibility`, `get_dom_tree` |
|
|
134
|
+
| **Sessions & cache** | `save_session`, `load_session`, `get_cookies`, `get_storage`, `clear_cache`, `get_cache_stats` |
|
|
135
|
+
| **Advanced (opt-in)** | `execute_js`, `execute_cdp`, `deobfuscate_js`, `extract_from_js`, `solve_captcha` |
|
|
136
|
+
|
|
137
|
+
---
|
|
138
|
+
|
|
139
|
+
## Smart extraction — bring any LLM, including free ones
|
|
140
|
+
|
|
141
|
+
`smart_extract` works **without any API key** using pattern matching. Plug in any OpenAI-compatible endpoint for higher accuracy — including FREE tiers:
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
# OpenRouter (free models exist)
|
|
145
|
+
CRAWLEMOON_LLM_PROVIDER=openrouter
|
|
146
|
+
CRAWLEMOON_LLM_API_KEY=sk-or-v1-xxx
|
|
147
|
+
CRAWLEMOON_LLM_MODEL=meta-llama/llama-3.2-3b-instruct:free
|
|
148
|
+
|
|
149
|
+
# Groq (free, very fast)
|
|
150
|
+
CRAWLEMOON_LLM_PROVIDER=groq
|
|
151
|
+
CRAWLEMOON_LLM_API_KEY=gsk_xxx
|
|
152
|
+
|
|
153
|
+
# Local Ollama (no key needed)
|
|
154
|
+
CRAWLEMOON_LLM_PROVIDER=ollama
|
|
155
|
+
CRAWLEMOON_LLM_MODEL=llama3.2
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
Together, DeepSeek, Mistral, Fireworks, and standard OpenAI also work via `CRAWLEMOON_LLM_BASE_URL`.
|
|
159
|
+
|
|
160
|
+
---
|
|
161
|
+
|
|
162
|
+
## Configuration
|
|
163
|
+
|
|
164
|
+
| Variable | Default | Notes |
|
|
165
|
+
|---|---|---|
|
|
166
|
+
| `CRAWLEMOON_HEADLESS` | `true` | Run browser without UI |
|
|
167
|
+
| `CRAWLEMOON_BROWSER` | `chromium` | `chromium` / `firefox` / `webkit` |
|
|
168
|
+
| `CRAWLEMOON_POOL_SIZE` | `5` | Max concurrent browsers |
|
|
169
|
+
| `CRAWLEMOON_NAV_TIMEOUT` | `30.0` | Page-load timeout (s) |
|
|
170
|
+
| `CRAWLEMOON_API_KEY` | _unset_ | If set, every tool call must include matching `_api_key` |
|
|
171
|
+
| `CRAWLEMOON_ALLOW_DANGEROUS_JS` | `false` | Required for `execute_js` / `execute_cdp` / `deobfuscate_js` |
|
|
172
|
+
| `CRAWLEMOON_JS_MAX_LENGTH` | `50000` | Length cap for JS payloads |
|
|
173
|
+
| `CRAWLEMOON_JS_EXEC_TIMEOUT` | `10.0` | Per-script timeout (s) |
|
|
174
|
+
|
|
175
|
+
---
|
|
176
|
+
|
|
177
|
+
## Security
|
|
178
|
+
|
|
179
|
+
`execute_js`, `execute_cdp`, and `deobfuscate_js` are **disabled by default** — they execute or operate on arbitrary code in a real browser. Enable on trusted networks with `CRAWLEMOON_ALLOW_DANGEROUS_JS=true`. Even then, payloads are length-capped, time-bounded, and a denylist rejects `eval`, `new Function`, dynamic `import()`, `document.write`, `importScripts`, and `WebAssembly.{compile,instantiate}`. Set `CRAWLEMOON_API_KEY` so MCP clients must present a matching `_api_key`.
|
|
180
|
+
|
|
181
|
+
These are mitigations, not a sandbox: do not expose this server to untrusted clients.
|
|
182
|
+
|
|
183
|
+
---
|
|
184
|
+
|
|
185
|
+
## Develop
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
git clone https://github.com/razavioo/crawlemoon-mcp-server.git
|
|
189
|
+
cd crawlemoon-mcp-server
|
|
190
|
+
make dev-install # editable install + dev/captcha/ocr extras + pre-commit
|
|
191
|
+
make test # pytest
|
|
192
|
+
make lint # ruff + mypy
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
PRs welcome. Particularly interested in: distributed mode (Redis queue), result sinks (Postgres / S3), Prometheus metrics. Licensed under the [MIT License](LICENSE).
|
|
196
|
+
|
|
197
|
+
<p align="center"><sub>Made by <a href="https://emad.dev">emad.dev</a></sub></p>
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
# Crawlemoon MCP Server
|
|
2
|
+
|
|
3
|
+
<p align="center">
|
|
4
|
+
<img src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/hero.png" alt="Crawlemoon MCP Server — free, AI-native web crawling for the agent era" width="100%"/>
|
|
5
|
+
</p>
|
|
6
|
+
|
|
7
|
+
<p align="left">
|
|
8
|
+
<img alt="python 3.10+ · pypi 1.1.5 · MIT · MCP-native · code style black" src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/badges.png" height="22"/>
|
|
9
|
+
</p>
|
|
10
|
+
|
|
11
|
+
A **free, open-source MCP server** that gives any agent (Claude Code, Cursor, Windsurf, …) **55 production-grade tools** for the full web-crawling stack: deep analysis, stealth, API discovery, session recording → runnable crawler, smart extraction. No proprietary API. No per-request fee.
|
|
12
|
+
|
|
13
|
+
<p align="center">
|
|
14
|
+
<img src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/features.png" alt="Crawlemoon capabilities — deep analysis, stealth, record→crawler, smart extraction" width="100%"/>
|
|
15
|
+
</p>
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## Quick start
|
|
20
|
+
|
|
21
|
+
<p align="center">
|
|
22
|
+
<img src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/install.png" alt="Three install paths — uvx, pipx, pip" width="100%"/>
|
|
23
|
+
</p>
|
|
24
|
+
|
|
25
|
+
The recommended path needs no install — `uvx` runs straight from PyPI:
|
|
26
|
+
|
|
27
|
+
```json
|
|
28
|
+
{
|
|
29
|
+
"mcpServers": {
|
|
30
|
+
"crawlemoon": {
|
|
31
|
+
"command": "uvx",
|
|
32
|
+
"args": ["crawlemoon-mcp-server"]
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
> Requires [`uv`](https://docs.astral.sh/uv/getting-started/installation/). Install once: `curl -LsSf https://astral.sh/uv/install.sh | sh`. Or use `pipx run crawlemoon-mcp-server` / `pip install crawlemoon-mcp-server` instead.
|
|
39
|
+
|
|
40
|
+
**Where to put that JSON:** Cursor → Settings → MCP. Claude Code → `~/.config/claude/mcp_settings.json`. Windsurf → Settings → MCP Servers.
|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
## How it works
|
|
45
|
+
|
|
46
|
+
<p align="center">
|
|
47
|
+
<img src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/architecture.png" alt="Agent → Crawlemoon → Browser/HTTP/Proxy → target web" width="100%"/>
|
|
48
|
+
</p>
|
|
49
|
+
|
|
50
|
+
Your agent talks to Crawlemoon over the Model Context Protocol. Crawlemoon owns a hardened browser pool, an HTTP stack with TLS fingerprinting, and a rotating proxy pool. While it fetches pages, it captures network traffic, reads scripts, and introspects schemas — so the agent gets clean structured data, not raw HTML.
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## What's in the box
|
|
55
|
+
|
|
56
|
+
A short list — see the source for the full set of 55 tools.
|
|
57
|
+
|
|
58
|
+
| Group | Tools |
|
|
59
|
+
|---|---|
|
|
60
|
+
| **Deep analysis** | `deep_analyze`, `discover_apis`, `introspect_graphql`, `analyze_websocket`, `analyze_auth`, `detect_protection`, `detect_technology` |
|
|
61
|
+
| **Stealth** | `stealth_request`, `configure_proxies`, `configure_rate_limit`, `add_proxy`, `test_proxy` |
|
|
62
|
+
| **Record → crawler** | `record_session`, `stop_recording`, `export_recording`, `generate_crawler` |
|
|
63
|
+
| **Extraction** | `smart_extract`, `extract_article`, `extract_tables`, `extract_links`, `extract_forms`, `extract_metadata`, `convert_to_markdown` |
|
|
64
|
+
| **Page interaction** | `take_screenshot`, `fill_form`, `wait_and_extract`, `compare_pages`, `measure_performance`, `check_accessibility`, `get_dom_tree` |
|
|
65
|
+
| **Sessions & cache** | `save_session`, `load_session`, `get_cookies`, `get_storage`, `clear_cache`, `get_cache_stats` |
|
|
66
|
+
| **Advanced (opt-in)** | `execute_js`, `execute_cdp`, `deobfuscate_js`, `extract_from_js`, `solve_captcha` |
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
## Smart extraction — bring any LLM, including free ones
|
|
71
|
+
|
|
72
|
+
`smart_extract` works **without any API key** using pattern matching. Plug in any OpenAI-compatible endpoint for higher accuracy — including FREE tiers:
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
# OpenRouter (free models exist)
|
|
76
|
+
CRAWLEMOON_LLM_PROVIDER=openrouter
|
|
77
|
+
CRAWLEMOON_LLM_API_KEY=sk-or-v1-xxx
|
|
78
|
+
CRAWLEMOON_LLM_MODEL=meta-llama/llama-3.2-3b-instruct:free
|
|
79
|
+
|
|
80
|
+
# Groq (free, very fast)
|
|
81
|
+
CRAWLEMOON_LLM_PROVIDER=groq
|
|
82
|
+
CRAWLEMOON_LLM_API_KEY=gsk_xxx
|
|
83
|
+
|
|
84
|
+
# Local Ollama (no key needed)
|
|
85
|
+
CRAWLEMOON_LLM_PROVIDER=ollama
|
|
86
|
+
CRAWLEMOON_LLM_MODEL=llama3.2
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Together, DeepSeek, Mistral, Fireworks, and standard OpenAI also work via `CRAWLEMOON_LLM_BASE_URL`.
|
|
90
|
+
|
|
91
|
+
---
|
|
92
|
+
|
|
93
|
+
## Configuration
|
|
94
|
+
|
|
95
|
+
| Variable | Default | Notes |
|
|
96
|
+
|---|---|---|
|
|
97
|
+
| `CRAWLEMOON_HEADLESS` | `true` | Run browser without UI |
|
|
98
|
+
| `CRAWLEMOON_BROWSER` | `chromium` | `chromium` / `firefox` / `webkit` |
|
|
99
|
+
| `CRAWLEMOON_POOL_SIZE` | `5` | Max concurrent browsers |
|
|
100
|
+
| `CRAWLEMOON_NAV_TIMEOUT` | `30.0` | Page-load timeout (s) |
|
|
101
|
+
| `CRAWLEMOON_API_KEY` | _unset_ | If set, every tool call must include matching `_api_key` |
|
|
102
|
+
| `CRAWLEMOON_ALLOW_DANGEROUS_JS` | `false` | Required for `execute_js` / `execute_cdp` / `deobfuscate_js` |
|
|
103
|
+
| `CRAWLEMOON_JS_MAX_LENGTH` | `50000` | Length cap for JS payloads |
|
|
104
|
+
| `CRAWLEMOON_JS_EXEC_TIMEOUT` | `10.0` | Per-script timeout (s) |
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## Security
|
|
109
|
+
|
|
110
|
+
`execute_js`, `execute_cdp`, and `deobfuscate_js` are **disabled by default** — they execute or operate on arbitrary code in a real browser. Enable on trusted networks with `CRAWLEMOON_ALLOW_DANGEROUS_JS=true`. Even then, payloads are length-capped, time-bounded, and a denylist rejects `eval`, `new Function`, dynamic `import()`, `document.write`, `importScripts`, and `WebAssembly.{compile,instantiate}`. Set `CRAWLEMOON_API_KEY` so MCP clients must present a matching `_api_key`.
|
|
111
|
+
|
|
112
|
+
These are mitigations, not a sandbox: do not expose this server to untrusted clients.
|
|
113
|
+
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
## Develop
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
git clone https://github.com/razavioo/crawlemoon-mcp-server.git
|
|
120
|
+
cd crawlemoon-mcp-server
|
|
121
|
+
make dev-install # editable install + dev/captcha/ocr extras + pre-commit
|
|
122
|
+
make test # pytest
|
|
123
|
+
make lint # ruff + mypy
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
PRs welcome. Particularly interested in: distributed mode (Redis queue), result sinks (Postgres / S3), Prometheus metrics. Licensed under the [MIT License](LICENSE).
|
|
127
|
+
|
|
128
|
+
<p align="center"><sub>Made by <a href="https://emad.dev">emad.dev</a></sub></p>
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: crawlemoon
|
|
3
|
+
Version: 1.1.5
|
|
4
|
+
Summary: Advanced Web Crawling Platform with Deep Analysis and MCP Server
|
|
5
|
+
Author-email: "emad.dev" <contact@emad.dev>
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: crawling,scraping,automation,mcp,web-analysis,playwright,api-discovery
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Browsers
|
|
16
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
17
|
+
Classifier: Topic :: System :: Networking
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: playwright>=1.40.0
|
|
22
|
+
Requires-Dist: mcp>=1.0.0
|
|
23
|
+
Requires-Dist: pydantic>=2.0.0
|
|
24
|
+
Requires-Dist: pydantic-settings>=2.0.0
|
|
25
|
+
Requires-Dist: aiohttp>=3.9.0
|
|
26
|
+
Requires-Dist: httpx>=0.25.0
|
|
27
|
+
Requires-Dist: websockets>=12.0
|
|
28
|
+
Requires-Dist: pyyaml>=6.0
|
|
29
|
+
Requires-Dist: python-json-logger>=2.0.0
|
|
30
|
+
Requires-Dist: tenacity>=8.2.0
|
|
31
|
+
Requires-Dist: cachetools>=5.3.0
|
|
32
|
+
Requires-Dist: graphql-core>=3.2.0
|
|
33
|
+
Requires-Dist: esprima>=4.0.0
|
|
34
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
35
|
+
Requires-Dist: lxml>=5.0.0
|
|
36
|
+
Requires-Dist: cryptography>=41.0.0
|
|
37
|
+
Requires-Dist: curl-cffi>=0.6.0
|
|
38
|
+
Requires-Dist: fake-useragent>=1.4.0
|
|
39
|
+
Requires-Dist: trafilatura>=1.6.0
|
|
40
|
+
Requires-Dist: selectolax>=0.3.0
|
|
41
|
+
Requires-Dist: markdownify>=0.11.0
|
|
42
|
+
Requires-Dist: instructor>=1.0.0
|
|
43
|
+
Requires-Dist: openai>=1.0.0
|
|
44
|
+
Requires-Dist: python-Wappalyzer>=0.3.0
|
|
45
|
+
Requires-Dist: ftfy>=6.1.0
|
|
46
|
+
Requires-Dist: dateparser>=1.2.0
|
|
47
|
+
Requires-Dist: url-normalize>=1.4.0
|
|
48
|
+
Requires-Dist: tldextract>=5.0.0
|
|
49
|
+
Provides-Extra: captcha
|
|
50
|
+
Requires-Dist: python-anticaptcha>=1.0.0; extra == "captcha"
|
|
51
|
+
Requires-Dist: capsolver>=1.0.0; extra == "captcha"
|
|
52
|
+
Provides-Extra: ocr
|
|
53
|
+
Requires-Dist: pytesseract>=0.3.10; extra == "ocr"
|
|
54
|
+
Requires-Dist: Pillow>=10.0.0; extra == "ocr"
|
|
55
|
+
Provides-Extra: dev
|
|
56
|
+
Requires-Dist: pytest>=7.4.0; extra == "dev"
|
|
57
|
+
Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
|
|
58
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
59
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
60
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
61
|
+
Requires-Dist: mypy>=1.7.0; extra == "dev"
|
|
62
|
+
Requires-Dist: pip-audit>=2.7.0; extra == "dev"
|
|
63
|
+
Requires-Dist: pre-commit>=3.6.0; extra == "dev"
|
|
64
|
+
Requires-Dist: build>=1.0.0; extra == "dev"
|
|
65
|
+
Requires-Dist: twine>=4.0.0; extra == "dev"
|
|
66
|
+
Provides-Extra: all
|
|
67
|
+
Requires-Dist: crawlemoon[captcha,dev,ocr]; extra == "all"
|
|
68
|
+
Dynamic: license-file
|
|
69
|
+
|
|
70
|
+
# Crawlemoon MCP Server
|
|
71
|
+
|
|
72
|
+
<p align="center">
|
|
73
|
+
<img src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/hero.png" alt="Crawlemoon MCP Server — free, AI-native web crawling for the agent era" width="100%"/>
|
|
74
|
+
</p>
|
|
75
|
+
|
|
76
|
+
<p align="left">
|
|
77
|
+
<img alt="python 3.10+ · pypi 1.1.5 · MIT · MCP-native · code style black" src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/badges.png" height="22"/>
|
|
78
|
+
</p>
|
|
79
|
+
|
|
80
|
+
A **free, open-source MCP server** that gives any agent (Claude Code, Cursor, Windsurf, …) **55 production-grade tools** for the full web-crawling stack: deep analysis, stealth, API discovery, session recording → runnable crawler, smart extraction. No proprietary API. No per-request fee.
|
|
81
|
+
|
|
82
|
+
<p align="center">
|
|
83
|
+
<img src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/features.png" alt="Crawlemoon capabilities — deep analysis, stealth, record→crawler, smart extraction" width="100%"/>
|
|
84
|
+
</p>
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## Quick start
|
|
89
|
+
|
|
90
|
+
<p align="center">
|
|
91
|
+
<img src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/install.png" alt="Three install paths — uvx, pipx, pip" width="100%"/>
|
|
92
|
+
</p>
|
|
93
|
+
|
|
94
|
+
The recommended path needs no install — `uvx` runs straight from PyPI:
|
|
95
|
+
|
|
96
|
+
```json
|
|
97
|
+
{
|
|
98
|
+
"mcpServers": {
|
|
99
|
+
"crawlemoon": {
|
|
100
|
+
"command": "uvx",
|
|
101
|
+
"args": ["crawlemoon-mcp-server"]
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
> Requires [`uv`](https://docs.astral.sh/uv/getting-started/installation/). Install once: `curl -LsSf https://astral.sh/uv/install.sh | sh`. Or use `pipx run crawlemoon-mcp-server` / `pip install crawlemoon-mcp-server` instead.
|
|
108
|
+
|
|
109
|
+
**Where to put that JSON:** Cursor → Settings → MCP. Claude Code → `~/.config/claude/mcp_settings.json`. Windsurf → Settings → MCP Servers.
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## How it works
|
|
114
|
+
|
|
115
|
+
<p align="center">
|
|
116
|
+
<img src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/architecture.png" alt="Agent → Crawlemoon → Browser/HTTP/Proxy → target web" width="100%"/>
|
|
117
|
+
</p>
|
|
118
|
+
|
|
119
|
+
Your agent talks to Crawlemoon over the Model Context Protocol. Crawlemoon owns a hardened browser pool, an HTTP stack with TLS fingerprinting, and a rotating proxy pool. While it fetches pages, it captures network traffic, reads scripts, and introspects schemas — so the agent gets clean structured data, not raw HTML.
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## What's in the box
|
|
124
|
+
|
|
125
|
+
A short list — see the source for the full set of 55 tools.
|
|
126
|
+
|
|
127
|
+
| Group | Tools |
|
|
128
|
+
|---|---|
|
|
129
|
+
| **Deep analysis** | `deep_analyze`, `discover_apis`, `introspect_graphql`, `analyze_websocket`, `analyze_auth`, `detect_protection`, `detect_technology` |
|
|
130
|
+
| **Stealth** | `stealth_request`, `configure_proxies`, `configure_rate_limit`, `add_proxy`, `test_proxy` |
|
|
131
|
+
| **Record → crawler** | `record_session`, `stop_recording`, `export_recording`, `generate_crawler` |
|
|
132
|
+
| **Extraction** | `smart_extract`, `extract_article`, `extract_tables`, `extract_links`, `extract_forms`, `extract_metadata`, `convert_to_markdown` |
|
|
133
|
+
| **Page interaction** | `take_screenshot`, `fill_form`, `wait_and_extract`, `compare_pages`, `measure_performance`, `check_accessibility`, `get_dom_tree` |
|
|
134
|
+
| **Sessions & cache** | `save_session`, `load_session`, `get_cookies`, `get_storage`, `clear_cache`, `get_cache_stats` |
|
|
135
|
+
| **Advanced (opt-in)** | `execute_js`, `execute_cdp`, `deobfuscate_js`, `extract_from_js`, `solve_captcha` |
|
|
136
|
+
|
|
137
|
+
---
|
|
138
|
+
|
|
139
|
+
## Smart extraction — bring any LLM, including free ones
|
|
140
|
+
|
|
141
|
+
`smart_extract` works **without any API key** using pattern matching. Plug in any OpenAI-compatible endpoint for higher accuracy — including FREE tiers:
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
# OpenRouter (free models exist)
|
|
145
|
+
CRAWLEMOON_LLM_PROVIDER=openrouter
|
|
146
|
+
CRAWLEMOON_LLM_API_KEY=sk-or-v1-xxx
|
|
147
|
+
CRAWLEMOON_LLM_MODEL=meta-llama/llama-3.2-3b-instruct:free
|
|
148
|
+
|
|
149
|
+
# Groq (free, very fast)
|
|
150
|
+
CRAWLEMOON_LLM_PROVIDER=groq
|
|
151
|
+
CRAWLEMOON_LLM_API_KEY=gsk_xxx
|
|
152
|
+
|
|
153
|
+
# Local Ollama (no key needed)
|
|
154
|
+
CRAWLEMOON_LLM_PROVIDER=ollama
|
|
155
|
+
CRAWLEMOON_LLM_MODEL=llama3.2
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
Together, DeepSeek, Mistral, Fireworks, and standard OpenAI also work via `CRAWLEMOON_LLM_BASE_URL`.
|
|
159
|
+
|
|
160
|
+
---
|
|
161
|
+
|
|
162
|
+
## Configuration
|
|
163
|
+
|
|
164
|
+
| Variable | Default | Notes |
|
|
165
|
+
|---|---|---|
|
|
166
|
+
| `CRAWLEMOON_HEADLESS` | `true` | Run browser without UI |
|
|
167
|
+
| `CRAWLEMOON_BROWSER` | `chromium` | `chromium` / `firefox` / `webkit` |
|
|
168
|
+
| `CRAWLEMOON_POOL_SIZE` | `5` | Max concurrent browsers |
|
|
169
|
+
| `CRAWLEMOON_NAV_TIMEOUT` | `30.0` | Page-load timeout (s) |
|
|
170
|
+
| `CRAWLEMOON_API_KEY` | _unset_ | If set, every tool call must include matching `_api_key` |
|
|
171
|
+
| `CRAWLEMOON_ALLOW_DANGEROUS_JS` | `false` | Required for `execute_js` / `execute_cdp` / `deobfuscate_js` |
|
|
172
|
+
| `CRAWLEMOON_JS_MAX_LENGTH` | `50000` | Length cap for JS payloads |
|
|
173
|
+
| `CRAWLEMOON_JS_EXEC_TIMEOUT` | `10.0` | Per-script timeout (s) |
|
|
174
|
+
|
|
175
|
+
---
|
|
176
|
+
|
|
177
|
+
## Security
|
|
178
|
+
|
|
179
|
+
`execute_js`, `execute_cdp`, and `deobfuscate_js` are **disabled by default** — they execute or operate on arbitrary code in a real browser. Enable on trusted networks with `CRAWLEMOON_ALLOW_DANGEROUS_JS=true`. Even then, payloads are length-capped, time-bounded, and a denylist rejects `eval`, `new Function`, dynamic `import()`, `document.write`, `importScripts`, and `WebAssembly.{compile,instantiate}`. Set `CRAWLEMOON_API_KEY` so MCP clients must present a matching `_api_key`.
|
|
180
|
+
|
|
181
|
+
These are mitigations, not a sandbox: do not expose this server to untrusted clients.
|
|
182
|
+
|
|
183
|
+
---
|
|
184
|
+
|
|
185
|
+
## Develop
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
git clone https://github.com/razavioo/crawlemoon-mcp-server.git
|
|
189
|
+
cd crawlemoon-mcp-server
|
|
190
|
+
make dev-install # editable install + dev/captcha/ocr extras + pre-commit
|
|
191
|
+
make test # pytest
|
|
192
|
+
make lint # ruff + mypy
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
PRs welcome. Particularly interested in: distributed mode (Redis queue), result sinks (Postgres / S3), Prometheus metrics. Licensed under the [MIT License](LICENSE).
|
|
196
|
+
|
|
197
|
+
<p align="center"><sub>Made by <a href="https://emad.dev">emad.dev</a></sub></p>
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
setup.py
|
|
5
|
+
crawlemoon.egg-info/PKG-INFO
|
|
6
|
+
crawlemoon.egg-info/SOURCES.txt
|
|
7
|
+
crawlemoon.egg-info/dependency_links.txt
|
|
8
|
+
crawlemoon.egg-info/entry_points.txt
|
|
9
|
+
crawlemoon.egg-info/requires.txt
|
|
10
|
+
crawlemoon.egg-info/top_level.txt
|
|
11
|
+
src/__init__.py
|
|
12
|
+
src/exceptions.py
|
|
13
|
+
src/cli/__init__.py
|
|
14
|
+
src/cli/main.py
|
|
15
|
+
src/core/__init__.py
|
|
16
|
+
src/core/logging.py
|
|
17
|
+
src/core/rate_limiter.py
|
|
18
|
+
src/core/recording_storage.py
|
|
19
|
+
src/core/browser/__init__.py
|
|
20
|
+
src/core/browser/cdp.py
|
|
21
|
+
src/core/browser/pool.py
|
|
22
|
+
src/core/browser/proxy_pool.py
|
|
23
|
+
src/core/browser/stealth.py
|
|
24
|
+
src/core/browser/xray.py
|
|
25
|
+
src/core/cache/__init__.py
|
|
26
|
+
src/core/cache/manager.py
|
|
27
|
+
src/core/http/__init__.py
|
|
28
|
+
src/core/http/stealth_client.py
|
|
29
|
+
src/core/session/__init__.py
|
|
30
|
+
src/core/session/manager.py
|
|
31
|
+
src/crawlers/__init__.py
|
|
32
|
+
src/intelligence/__init__.py
|
|
33
|
+
src/intelligence/extraction/__init__.py
|
|
34
|
+
src/intelligence/extraction/content.py
|
|
35
|
+
src/intelligence/extraction/smart.py
|
|
36
|
+
src/intelligence/generator/__init__.py
|
|
37
|
+
src/intelligence/generator/crawler_gen.py
|
|
38
|
+
src/intelligence/js/__init__.py
|
|
39
|
+
src/intelligence/js/analyzer.py
|
|
40
|
+
src/intelligence/js/deobfuscator.py
|
|
41
|
+
src/intelligence/js/dynamic.py
|
|
42
|
+
src/intelligence/network/__init__.py
|
|
43
|
+
src/intelligence/network/analyzer.py
|
|
44
|
+
src/intelligence/network/api_discovery.py
|
|
45
|
+
src/intelligence/network/graphql.py
|
|
46
|
+
src/intelligence/network/interceptor.py
|
|
47
|
+
src/intelligence/network/sitemap.py
|
|
48
|
+
src/intelligence/network/websocket.py
|
|
49
|
+
src/intelligence/recorder/__init__.py
|
|
50
|
+
src/intelligence/recorder/session.py
|
|
51
|
+
src/intelligence/recorder/state_machine.py
|
|
52
|
+
src/intelligence/security/__init__.py
|
|
53
|
+
src/intelligence/security/auth.py
|
|
54
|
+
src/intelligence/security/bot_detection.py
|
|
55
|
+
src/intelligence/security/captcha_solver.py
|
|
56
|
+
src/intelligence/security/technology_detector.py
|
|
57
|
+
src/mcp/__init__.py
|
|
58
|
+
src/mcp/config.py
|
|
59
|
+
src/mcp/schemas.py
|
|
60
|
+
src/mcp/server.py
|
|
61
|
+
src/mcp/utils.py
|
|
62
|
+
src/mcp/tools/__init__.py
|
|
63
|
+
src/sites/__init__.py
|
|
64
|
+
tests/test_api_discovery.py
|
|
65
|
+
tests/test_auth_analyzer.py
|
|
66
|
+
tests/test_bot_detection.py
|
|
67
|
+
tests/test_browser_pool.py
|
|
68
|
+
tests/test_cache_manager.py
|
|
69
|
+
tests/test_captcha_solver.py
|
|
70
|
+
tests/test_cdp_client.py
|
|
71
|
+
tests/test_content_extractor.py
|
|
72
|
+
tests/test_crawl_and_extract.py
|
|
73
|
+
tests/test_crawler_generator.py
|
|
74
|
+
tests/test_improvements.py
|
|
75
|
+
tests/test_js_analyzer.py
|
|
76
|
+
tests/test_js_deobfuscator.py
|
|
77
|
+
tests/test_js_security.py
|
|
78
|
+
tests/test_mcp_config.py
|
|
79
|
+
tests/test_mcp_server.py
|
|
80
|
+
tests/test_mcp_utils.py
|
|
81
|
+
tests/test_network_interceptor.py
|
|
82
|
+
tests/test_proxy_pool.py
|
|
83
|
+
tests/test_rate_limiter.py
|
|
84
|
+
tests/test_recording_storage.py
|
|
85
|
+
tests/test_request_analyzer.py
|
|
86
|
+
tests/test_session_manager.py
|
|
87
|
+
tests/test_session_recorder.py
|
|
88
|
+
tests/test_sitemap_analyzer.py
|
|
89
|
+
tests/test_smart_extractor.py
|
|
90
|
+
tests/test_stealth.py
|
|
91
|
+
tests/test_stealth_client.py
|
|
92
|
+
tests/test_technology_detector.py
|
|
93
|
+
tests/test_xray.py
|
|
94
|
+
tests/test_xray_advanced.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|