crawlemoon 1.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. crawlemoon-1.1.5/LICENSE +23 -0
  2. crawlemoon-1.1.5/PKG-INFO +197 -0
  3. crawlemoon-1.1.5/README.md +128 -0
  4. crawlemoon-1.1.5/crawlemoon.egg-info/PKG-INFO +197 -0
  5. crawlemoon-1.1.5/crawlemoon.egg-info/SOURCES.txt +94 -0
  6. crawlemoon-1.1.5/crawlemoon.egg-info/dependency_links.txt +1 -0
  7. crawlemoon-1.1.5/crawlemoon.egg-info/entry_points.txt +4 -0
  8. crawlemoon-1.1.5/crawlemoon.egg-info/requires.txt +51 -0
  9. crawlemoon-1.1.5/crawlemoon.egg-info/top_level.txt +1 -0
  10. crawlemoon-1.1.5/pyproject.toml +111 -0
  11. crawlemoon-1.1.5/setup.cfg +4 -0
  12. crawlemoon-1.1.5/setup.py +8 -0
  13. crawlemoon-1.1.5/src/__init__.py +47 -0
  14. crawlemoon-1.1.5/src/cli/__init__.py +9 -0
  15. crawlemoon-1.1.5/src/cli/main.py +334 -0
  16. crawlemoon-1.1.5/src/core/__init__.py +9 -0
  17. crawlemoon-1.1.5/src/core/browser/__init__.py +9 -0
  18. crawlemoon-1.1.5/src/core/browser/cdp.py +102 -0
  19. crawlemoon-1.1.5/src/core/browser/pool.py +269 -0
  20. crawlemoon-1.1.5/src/core/browser/proxy_pool.py +322 -0
  21. crawlemoon-1.1.5/src/core/browser/stealth.py +341 -0
  22. crawlemoon-1.1.5/src/core/browser/xray.py +845 -0
  23. crawlemoon-1.1.5/src/core/cache/__init__.py +9 -0
  24. crawlemoon-1.1.5/src/core/cache/manager.py +357 -0
  25. crawlemoon-1.1.5/src/core/http/__init__.py +2 -0
  26. crawlemoon-1.1.5/src/core/http/stealth_client.py +259 -0
  27. crawlemoon-1.1.5/src/core/logging.py +129 -0
  28. crawlemoon-1.1.5/src/core/rate_limiter.py +332 -0
  29. crawlemoon-1.1.5/src/core/recording_storage.py +375 -0
  30. crawlemoon-1.1.5/src/core/session/__init__.py +9 -0
  31. crawlemoon-1.1.5/src/core/session/manager.py +451 -0
  32. crawlemoon-1.1.5/src/crawlers/__init__.py +9 -0
  33. crawlemoon-1.1.5/src/exceptions.py +202 -0
  34. crawlemoon-1.1.5/src/intelligence/__init__.py +9 -0
  35. crawlemoon-1.1.5/src/intelligence/extraction/__init__.py +2 -0
  36. crawlemoon-1.1.5/src/intelligence/extraction/content.py +307 -0
  37. crawlemoon-1.1.5/src/intelligence/extraction/smart.py +534 -0
  38. crawlemoon-1.1.5/src/intelligence/generator/__init__.py +9 -0
  39. crawlemoon-1.1.5/src/intelligence/generator/crawler_gen.py +400 -0
  40. crawlemoon-1.1.5/src/intelligence/js/__init__.py +9 -0
  41. crawlemoon-1.1.5/src/intelligence/js/analyzer.py +305 -0
  42. crawlemoon-1.1.5/src/intelligence/js/deobfuscator.py +154 -0
  43. crawlemoon-1.1.5/src/intelligence/js/dynamic.py +112 -0
  44. crawlemoon-1.1.5/src/intelligence/network/__init__.py +9 -0
  45. crawlemoon-1.1.5/src/intelligence/network/analyzer.py +297 -0
  46. crawlemoon-1.1.5/src/intelligence/network/api_discovery.py +581 -0
  47. crawlemoon-1.1.5/src/intelligence/network/graphql.py +49 -0
  48. crawlemoon-1.1.5/src/intelligence/network/interceptor.py +250 -0
  49. crawlemoon-1.1.5/src/intelligence/network/sitemap.py +229 -0
  50. crawlemoon-1.1.5/src/intelligence/network/websocket.py +69 -0
  51. crawlemoon-1.1.5/src/intelligence/recorder/__init__.py +9 -0
  52. crawlemoon-1.1.5/src/intelligence/recorder/session.py +388 -0
  53. crawlemoon-1.1.5/src/intelligence/recorder/state_machine.py +133 -0
  54. crawlemoon-1.1.5/src/intelligence/security/__init__.py +9 -0
  55. crawlemoon-1.1.5/src/intelligence/security/auth.py +114 -0
  56. crawlemoon-1.1.5/src/intelligence/security/bot_detection.py +341 -0
  57. crawlemoon-1.1.5/src/intelligence/security/captcha_solver.py +534 -0
  58. crawlemoon-1.1.5/src/intelligence/security/technology_detector.py +210 -0
  59. crawlemoon-1.1.5/src/mcp/__init__.py +9 -0
  60. crawlemoon-1.1.5/src/mcp/config.py +151 -0
  61. crawlemoon-1.1.5/src/mcp/schemas.py +454 -0
  62. crawlemoon-1.1.5/src/mcp/server.py +4603 -0
  63. crawlemoon-1.1.5/src/mcp/tools/__init__.py +9 -0
  64. crawlemoon-1.1.5/src/mcp/utils.py +297 -0
  65. crawlemoon-1.1.5/src/sites/__init__.py +10 -0
  66. crawlemoon-1.1.5/tests/test_api_discovery.py +124 -0
  67. crawlemoon-1.1.5/tests/test_auth_analyzer.py +109 -0
  68. crawlemoon-1.1.5/tests/test_bot_detection.py +307 -0
  69. crawlemoon-1.1.5/tests/test_browser_pool.py +191 -0
  70. crawlemoon-1.1.5/tests/test_cache_manager.py +189 -0
  71. crawlemoon-1.1.5/tests/test_captcha_solver.py +348 -0
  72. crawlemoon-1.1.5/tests/test_cdp_client.py +301 -0
  73. crawlemoon-1.1.5/tests/test_content_extractor.py +351 -0
  74. crawlemoon-1.1.5/tests/test_crawl_and_extract.py +249 -0
  75. crawlemoon-1.1.5/tests/test_crawler_generator.py +199 -0
  76. crawlemoon-1.1.5/tests/test_improvements.py +668 -0
  77. crawlemoon-1.1.5/tests/test_js_analyzer.py +182 -0
  78. crawlemoon-1.1.5/tests/test_js_deobfuscator.py +332 -0
  79. crawlemoon-1.1.5/tests/test_js_security.py +56 -0
  80. crawlemoon-1.1.5/tests/test_mcp_config.py +301 -0
  81. crawlemoon-1.1.5/tests/test_mcp_server.py +328 -0
  82. crawlemoon-1.1.5/tests/test_mcp_utils.py +87 -0
  83. crawlemoon-1.1.5/tests/test_network_interceptor.py +172 -0
  84. crawlemoon-1.1.5/tests/test_proxy_pool.py +405 -0
  85. crawlemoon-1.1.5/tests/test_rate_limiter.py +305 -0
  86. crawlemoon-1.1.5/tests/test_recording_storage.py +146 -0
  87. crawlemoon-1.1.5/tests/test_request_analyzer.py +74 -0
  88. crawlemoon-1.1.5/tests/test_session_manager.py +335 -0
  89. crawlemoon-1.1.5/tests/test_session_recorder.py +176 -0
  90. crawlemoon-1.1.5/tests/test_sitemap_analyzer.py +415 -0
  91. crawlemoon-1.1.5/tests/test_smart_extractor.py +336 -0
  92. crawlemoon-1.1.5/tests/test_stealth.py +96 -0
  93. crawlemoon-1.1.5/tests/test_stealth_client.py +318 -0
  94. crawlemoon-1.1.5/tests/test_technology_detector.py +377 -0
  95. crawlemoon-1.1.5/tests/test_xray.py +254 -0
  96. crawlemoon-1.1.5/tests/test_xray_advanced.py +215 -0
@@ -0,0 +1,23 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 emad.dev
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
23
+
@@ -0,0 +1,197 @@
1
+ Metadata-Version: 2.4
2
+ Name: crawlemoon
3
+ Version: 1.1.5
4
+ Summary: Advanced Web Crawling Platform with Deep Analysis and MCP Server
5
+ Author-email: "emad.dev" <contact@emad.dev>
6
+ License: MIT
7
+ Keywords: crawling,scraping,automation,mcp,web-analysis,playwright,api-discovery
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Topic :: Internet :: WWW/HTTP :: Browsers
16
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
17
+ Classifier: Topic :: System :: Networking
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: playwright>=1.40.0
22
+ Requires-Dist: mcp>=1.0.0
23
+ Requires-Dist: pydantic>=2.0.0
24
+ Requires-Dist: pydantic-settings>=2.0.0
25
+ Requires-Dist: aiohttp>=3.9.0
26
+ Requires-Dist: httpx>=0.25.0
27
+ Requires-Dist: websockets>=12.0
28
+ Requires-Dist: pyyaml>=6.0
29
+ Requires-Dist: python-json-logger>=2.0.0
30
+ Requires-Dist: tenacity>=8.2.0
31
+ Requires-Dist: cachetools>=5.3.0
32
+ Requires-Dist: graphql-core>=3.2.0
33
+ Requires-Dist: esprima>=4.0.0
34
+ Requires-Dist: beautifulsoup4>=4.12.0
35
+ Requires-Dist: lxml>=5.0.0
36
+ Requires-Dist: cryptography>=41.0.0
37
+ Requires-Dist: curl-cffi>=0.6.0
38
+ Requires-Dist: fake-useragent>=1.4.0
39
+ Requires-Dist: trafilatura>=1.6.0
40
+ Requires-Dist: selectolax>=0.3.0
41
+ Requires-Dist: markdownify>=0.11.0
42
+ Requires-Dist: instructor>=1.0.0
43
+ Requires-Dist: openai>=1.0.0
44
+ Requires-Dist: python-Wappalyzer>=0.3.0
45
+ Requires-Dist: ftfy>=6.1.0
46
+ Requires-Dist: dateparser>=1.2.0
47
+ Requires-Dist: url-normalize>=1.4.0
48
+ Requires-Dist: tldextract>=5.0.0
49
+ Provides-Extra: captcha
50
+ Requires-Dist: python-anticaptcha>=1.0.0; extra == "captcha"
51
+ Requires-Dist: capsolver>=1.0.0; extra == "captcha"
52
+ Provides-Extra: ocr
53
+ Requires-Dist: pytesseract>=0.3.10; extra == "ocr"
54
+ Requires-Dist: Pillow>=10.0.0; extra == "ocr"
55
+ Provides-Extra: dev
56
+ Requires-Dist: pytest>=7.4.0; extra == "dev"
57
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
58
+ Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
59
+ Requires-Dist: black>=23.0.0; extra == "dev"
60
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
61
+ Requires-Dist: mypy>=1.7.0; extra == "dev"
62
+ Requires-Dist: pip-audit>=2.7.0; extra == "dev"
63
+ Requires-Dist: pre-commit>=3.6.0; extra == "dev"
64
+ Requires-Dist: build>=1.0.0; extra == "dev"
65
+ Requires-Dist: twine>=4.0.0; extra == "dev"
66
+ Provides-Extra: all
67
+ Requires-Dist: crawlemoon[captcha,dev,ocr]; extra == "all"
68
+ Dynamic: license-file
69
+
70
+ # Crawlemoon MCP Server
71
+
72
+ <p align="center">
73
+ <img src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/hero.png" alt="Crawlemoon MCP Server — free, AI-native web crawling for the agent era" width="100%"/>
74
+ </p>
75
+
76
+ <p align="left">
77
+ <img alt="python 3.10+ · pypi 1.1.5 · MIT · MCP-native · code style black" src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/badges.png" height="22"/>
78
+ </p>
79
+
80
+ A **free, open-source MCP server** that gives any agent (Claude Code, Cursor, Windsurf, …) **55 production-grade tools** for the full web-crawling stack: deep analysis, stealth, API discovery, session recording → runnable crawler, smart extraction. No proprietary API. No per-request fee.
81
+
82
+ <p align="center">
83
+ <img src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/features.png" alt="Crawlemoon capabilities — deep analysis, stealth, record→crawler, smart extraction" width="100%"/>
84
+ </p>
85
+
86
+ ---
87
+
88
+ ## Quick start
89
+
90
+ <p align="center">
91
+ <img src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/install.png" alt="Three install paths — uvx, pipx, pip" width="100%"/>
92
+ </p>
93
+
94
+ The recommended path needs no install — `uvx` runs straight from PyPI:
95
+
96
+ ```json
97
+ {
98
+ "mcpServers": {
99
+ "crawlemoon": {
100
+ "command": "uvx",
101
+ "args": ["crawlemoon-mcp-server"]
102
+ }
103
+ }
104
+ }
105
+ ```
106
+
107
+ > Requires [`uv`](https://docs.astral.sh/uv/getting-started/installation/). Install once: `curl -LsSf https://astral.sh/uv/install.sh | sh`. Or use `pipx run crawlemoon-mcp-server` / `pip install crawlemoon-mcp-server` instead.
108
+
109
+ **Where to put that JSON:** Cursor → Settings → MCP. Claude Code → `~/.config/claude/mcp_settings.json`. Windsurf → Settings → MCP Servers.
110
+
111
+ ---
112
+
113
+ ## How it works
114
+
115
+ <p align="center">
116
+ <img src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/architecture.png" alt="Agent → Crawlemoon → Browser/HTTP/Proxy → target web" width="100%"/>
117
+ </p>
118
+
119
+ Your agent talks to Crawlemoon over the Model Context Protocol. Crawlemoon owns a hardened browser pool, an HTTP stack with TLS fingerprinting, and a rotating proxy pool. While it fetches pages, it captures network traffic, reads scripts, and introspects schemas — so the agent gets clean structured data, not raw HTML.
120
+
121
+ ---
122
+
123
+ ## What's in the box
124
+
125
+ A short list — see the source for the full set of 55 tools.
126
+
127
+ | Group | Tools |
128
+ |---|---|
129
+ | **Deep analysis** | `deep_analyze`, `discover_apis`, `introspect_graphql`, `analyze_websocket`, `analyze_auth`, `detect_protection`, `detect_technology` |
130
+ | **Stealth** | `stealth_request`, `configure_proxies`, `configure_rate_limit`, `add_proxy`, `test_proxy` |
131
+ | **Record → crawler** | `record_session`, `stop_recording`, `export_recording`, `generate_crawler` |
132
+ | **Extraction** | `smart_extract`, `extract_article`, `extract_tables`, `extract_links`, `extract_forms`, `extract_metadata`, `convert_to_markdown` |
133
+ | **Page interaction** | `take_screenshot`, `fill_form`, `wait_and_extract`, `compare_pages`, `measure_performance`, `check_accessibility`, `get_dom_tree` |
134
+ | **Sessions & cache** | `save_session`, `load_session`, `get_cookies`, `get_storage`, `clear_cache`, `get_cache_stats` |
135
+ | **Advanced (opt-in)** | `execute_js`, `execute_cdp`, `deobfuscate_js`, `extract_from_js`, `solve_captcha` |
136
+
137
+ ---
138
+
139
+ ## Smart extraction — bring any LLM, including free ones
140
+
141
+ `smart_extract` works **without any API key** using pattern matching. Plug in any OpenAI-compatible endpoint for higher accuracy — including FREE tiers:
142
+
143
+ ```bash
144
+ # OpenRouter (free models exist)
145
+ CRAWLEMOON_LLM_PROVIDER=openrouter
146
+ CRAWLEMOON_LLM_API_KEY=sk-or-v1-xxx
147
+ CRAWLEMOON_LLM_MODEL=meta-llama/llama-3.2-3b-instruct:free
148
+
149
+ # Groq (free, very fast)
150
+ CRAWLEMOON_LLM_PROVIDER=groq
151
+ CRAWLEMOON_LLM_API_KEY=gsk_xxx
152
+
153
+ # Local Ollama (no key needed)
154
+ CRAWLEMOON_LLM_PROVIDER=ollama
155
+ CRAWLEMOON_LLM_MODEL=llama3.2
156
+ ```
157
+
158
+ Together, DeepSeek, Mistral, Fireworks, and standard OpenAI also work via `CRAWLEMOON_LLM_BASE_URL`.
159
+
160
+ ---
161
+
162
+ ## Configuration
163
+
164
+ | Variable | Default | Notes |
165
+ |---|---|---|
166
+ | `CRAWLEMOON_HEADLESS` | `true` | Run browser without UI |
167
+ | `CRAWLEMOON_BROWSER` | `chromium` | `chromium` / `firefox` / `webkit` |
168
+ | `CRAWLEMOON_POOL_SIZE` | `5` | Max concurrent browsers |
169
+ | `CRAWLEMOON_NAV_TIMEOUT` | `30.0` | Page-load timeout (s) |
170
+ | `CRAWLEMOON_API_KEY` | _unset_ | If set, every tool call must include matching `_api_key` |
171
+ | `CRAWLEMOON_ALLOW_DANGEROUS_JS` | `false` | Required for `execute_js` / `execute_cdp` / `deobfuscate_js` |
172
+ | `CRAWLEMOON_JS_MAX_LENGTH` | `50000` | Length cap for JS payloads |
173
+ | `CRAWLEMOON_JS_EXEC_TIMEOUT` | `10.0` | Per-script timeout (s) |
174
+
175
+ ---
176
+
177
+ ## Security
178
+
179
+ `execute_js`, `execute_cdp`, and `deobfuscate_js` are **disabled by default** — they execute or operate on arbitrary code in a real browser. Enable on trusted networks with `CRAWLEMOON_ALLOW_DANGEROUS_JS=true`. Even then, payloads are length-capped, time-bounded, and a denylist rejects `eval`, `new Function`, dynamic `import()`, `document.write`, `importScripts`, and `WebAssembly.{compile,instantiate}`. Set `CRAWLEMOON_API_KEY` so MCP clients must present a matching `_api_key`.
180
+
181
+ These are mitigations, not a sandbox: do not expose this server to untrusted clients.
182
+
183
+ ---
184
+
185
+ ## Develop
186
+
187
+ ```bash
188
+ git clone https://github.com/razavioo/crawlemoon-mcp-server.git
189
+ cd crawlemoon-mcp-server
190
+ make dev-install # editable install + dev/captcha/ocr extras + pre-commit
191
+ make test # pytest
192
+ make lint # ruff + mypy
193
+ ```
194
+
195
+ PRs welcome. We are particularly interested in: distributed mode (Redis queue), result sinks (Postgres / S3), Prometheus metrics. Licensed under the [MIT License](LICENSE).
196
+
197
+ <p align="center"><sub>Made by <a href="https://emad.dev">emad.dev</a></sub></p>
@@ -0,0 +1,128 @@
1
+ # Crawlemoon MCP Server
2
+
3
+ <p align="center">
4
+ <img src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/hero.png" alt="Crawlemoon MCP Server — free, AI-native web crawling for the agent era" width="100%"/>
5
+ </p>
6
+
7
+ <p align="left">
8
+ <img alt="python 3.10+ · pypi 1.1.5 · MIT · MCP-native · code style black" src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/badges.png" height="22"/>
9
+ </p>
10
+
11
+ A **free, open-source MCP server** that gives any agent (Claude Code, Cursor, Windsurf, …) **55 production-grade tools** for the full web-crawling stack: deep analysis, stealth, API discovery, session recording → runnable crawler, smart extraction. No proprietary API. No per-request fee.
12
+
13
+ <p align="center">
14
+ <img src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/features.png" alt="Crawlemoon capabilities — deep analysis, stealth, record→crawler, smart extraction" width="100%"/>
15
+ </p>
16
+
17
+ ---
18
+
19
+ ## Quick start
20
+
21
+ <p align="center">
22
+ <img src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/install.png" alt="Three install paths — uvx, pipx, pip" width="100%"/>
23
+ </p>
24
+
25
+ The recommended path needs no install — `uvx` runs straight from PyPI:
26
+
27
+ ```json
28
+ {
29
+ "mcpServers": {
30
+ "crawlemoon": {
31
+ "command": "uvx",
32
+ "args": ["crawlemoon-mcp-server"]
33
+ }
34
+ }
35
+ }
36
+ ```
37
+
38
+ > Requires [`uv`](https://docs.astral.sh/uv/getting-started/installation/). Install once: `curl -LsSf https://astral.sh/uv/install.sh | sh`. Or use `pipx run crawlemoon-mcp-server` / `pip install crawlemoon-mcp-server` instead.
39
+
40
+ **Where to put that JSON:** Cursor → Settings → MCP. Claude Code → `~/.config/claude/mcp_settings.json`. Windsurf → Settings → MCP Servers.
41
+
42
+ ---
43
+
44
+ ## How it works
45
+
46
+ <p align="center">
47
+ <img src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/architecture.png" alt="Agent → Crawlemoon → Browser/HTTP/Proxy → target web" width="100%"/>
48
+ </p>
49
+
50
+ Your agent talks to Crawlemoon over the Model Context Protocol. Crawlemoon owns a hardened browser pool, an HTTP stack with TLS fingerprinting, and a rotating proxy pool. While it fetches pages, it captures network traffic, reads scripts, and introspects schemas — so the agent gets clean structured data, not raw HTML.
51
+
52
+ ---
53
+
54
+ ## What's in the box
55
+
56
+ A short list — see the source for the full set of 55 tools.
57
+
58
+ | Group | Tools |
59
+ |---|---|
60
+ | **Deep analysis** | `deep_analyze`, `discover_apis`, `introspect_graphql`, `analyze_websocket`, `analyze_auth`, `detect_protection`, `detect_technology` |
61
+ | **Stealth** | `stealth_request`, `configure_proxies`, `configure_rate_limit`, `add_proxy`, `test_proxy` |
62
+ | **Record → crawler** | `record_session`, `stop_recording`, `export_recording`, `generate_crawler` |
63
+ | **Extraction** | `smart_extract`, `extract_article`, `extract_tables`, `extract_links`, `extract_forms`, `extract_metadata`, `convert_to_markdown` |
64
+ | **Page interaction** | `take_screenshot`, `fill_form`, `wait_and_extract`, `compare_pages`, `measure_performance`, `check_accessibility`, `get_dom_tree` |
65
+ | **Sessions & cache** | `save_session`, `load_session`, `get_cookies`, `get_storage`, `clear_cache`, `get_cache_stats` |
66
+ | **Advanced (opt-in)** | `execute_js`, `execute_cdp`, `deobfuscate_js`, `extract_from_js`, `solve_captcha` |
67
+
68
+ ---
69
+
70
+ ## Smart extraction — bring any LLM, including free ones
71
+
72
+ `smart_extract` works **without any API key** using pattern matching. Plug in any OpenAI-compatible endpoint for higher accuracy — including FREE tiers:
73
+
74
+ ```bash
75
+ # OpenRouter (free models exist)
76
+ CRAWLEMOON_LLM_PROVIDER=openrouter
77
+ CRAWLEMOON_LLM_API_KEY=sk-or-v1-xxx
78
+ CRAWLEMOON_LLM_MODEL=meta-llama/llama-3.2-3b-instruct:free
79
+
80
+ # Groq (free, very fast)
81
+ CRAWLEMOON_LLM_PROVIDER=groq
82
+ CRAWLEMOON_LLM_API_KEY=gsk_xxx
83
+
84
+ # Local Ollama (no key needed)
85
+ CRAWLEMOON_LLM_PROVIDER=ollama
86
+ CRAWLEMOON_LLM_MODEL=llama3.2
87
+ ```
88
+
89
+ Together, DeepSeek, Mistral, Fireworks, and standard OpenAI also work via `CRAWLEMOON_LLM_BASE_URL`.
90
+
91
+ ---
92
+
93
+ ## Configuration
94
+
95
+ | Variable | Default | Notes |
96
+ |---|---|---|
97
+ | `CRAWLEMOON_HEADLESS` | `true` | Run browser without UI |
98
+ | `CRAWLEMOON_BROWSER` | `chromium` | `chromium` / `firefox` / `webkit` |
99
+ | `CRAWLEMOON_POOL_SIZE` | `5` | Max concurrent browsers |
100
+ | `CRAWLEMOON_NAV_TIMEOUT` | `30.0` | Page-load timeout (s) |
101
+ | `CRAWLEMOON_API_KEY` | _unset_ | If set, every tool call must include matching `_api_key` |
102
+ | `CRAWLEMOON_ALLOW_DANGEROUS_JS` | `false` | Required for `execute_js` / `execute_cdp` / `deobfuscate_js` |
103
+ | `CRAWLEMOON_JS_MAX_LENGTH` | `50000` | Length cap for JS payloads |
104
+ | `CRAWLEMOON_JS_EXEC_TIMEOUT` | `10.0` | Per-script timeout (s) |
105
+
106
+ ---
107
+
108
+ ## Security
109
+
110
+ `execute_js`, `execute_cdp`, and `deobfuscate_js` are **disabled by default** — they execute or operate on arbitrary code in a real browser. Enable on trusted networks with `CRAWLEMOON_ALLOW_DANGEROUS_JS=true`. Even then, payloads are length-capped, time-bounded, and a denylist rejects `eval`, `new Function`, dynamic `import()`, `document.write`, `importScripts`, and `WebAssembly.{compile,instantiate}`. Set `CRAWLEMOON_API_KEY` so MCP clients must present a matching `_api_key`.
111
+
112
+ These are mitigations, not a sandbox: do not expose this server to untrusted clients.
113
+
114
+ ---
115
+
116
+ ## Develop
117
+
118
+ ```bash
119
+ git clone https://github.com/razavioo/crawlemoon-mcp-server.git
120
+ cd crawlemoon-mcp-server
121
+ make dev-install # editable install + dev/captcha/ocr extras + pre-commit
122
+ make test # pytest
123
+ make lint # ruff + mypy
124
+ ```
125
+
126
+ PRs welcome. We are particularly interested in: distributed mode (Redis queue), result sinks (Postgres / S3), Prometheus metrics. Licensed under the [MIT License](LICENSE).
127
+
128
+ <p align="center"><sub>Made by <a href="https://emad.dev">emad.dev</a></sub></p>
@@ -0,0 +1,197 @@
1
+ Metadata-Version: 2.4
2
+ Name: crawlemoon
3
+ Version: 1.1.5
4
+ Summary: Advanced Web Crawling Platform with Deep Analysis and MCP Server
5
+ Author-email: "emad.dev" <contact@emad.dev>
6
+ License: MIT
7
+ Keywords: crawling,scraping,automation,mcp,web-analysis,playwright,api-discovery
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Topic :: Internet :: WWW/HTTP :: Browsers
16
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
17
+ Classifier: Topic :: System :: Networking
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: playwright>=1.40.0
22
+ Requires-Dist: mcp>=1.0.0
23
+ Requires-Dist: pydantic>=2.0.0
24
+ Requires-Dist: pydantic-settings>=2.0.0
25
+ Requires-Dist: aiohttp>=3.9.0
26
+ Requires-Dist: httpx>=0.25.0
27
+ Requires-Dist: websockets>=12.0
28
+ Requires-Dist: pyyaml>=6.0
29
+ Requires-Dist: python-json-logger>=2.0.0
30
+ Requires-Dist: tenacity>=8.2.0
31
+ Requires-Dist: cachetools>=5.3.0
32
+ Requires-Dist: graphql-core>=3.2.0
33
+ Requires-Dist: esprima>=4.0.0
34
+ Requires-Dist: beautifulsoup4>=4.12.0
35
+ Requires-Dist: lxml>=5.0.0
36
+ Requires-Dist: cryptography>=41.0.0
37
+ Requires-Dist: curl-cffi>=0.6.0
38
+ Requires-Dist: fake-useragent>=1.4.0
39
+ Requires-Dist: trafilatura>=1.6.0
40
+ Requires-Dist: selectolax>=0.3.0
41
+ Requires-Dist: markdownify>=0.11.0
42
+ Requires-Dist: instructor>=1.0.0
43
+ Requires-Dist: openai>=1.0.0
44
+ Requires-Dist: python-Wappalyzer>=0.3.0
45
+ Requires-Dist: ftfy>=6.1.0
46
+ Requires-Dist: dateparser>=1.2.0
47
+ Requires-Dist: url-normalize>=1.4.0
48
+ Requires-Dist: tldextract>=5.0.0
49
+ Provides-Extra: captcha
50
+ Requires-Dist: python-anticaptcha>=1.0.0; extra == "captcha"
51
+ Requires-Dist: capsolver>=1.0.0; extra == "captcha"
52
+ Provides-Extra: ocr
53
+ Requires-Dist: pytesseract>=0.3.10; extra == "ocr"
54
+ Requires-Dist: Pillow>=10.0.0; extra == "ocr"
55
+ Provides-Extra: dev
56
+ Requires-Dist: pytest>=7.4.0; extra == "dev"
57
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
58
+ Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
59
+ Requires-Dist: black>=23.0.0; extra == "dev"
60
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
61
+ Requires-Dist: mypy>=1.7.0; extra == "dev"
62
+ Requires-Dist: pip-audit>=2.7.0; extra == "dev"
63
+ Requires-Dist: pre-commit>=3.6.0; extra == "dev"
64
+ Requires-Dist: build>=1.0.0; extra == "dev"
65
+ Requires-Dist: twine>=4.0.0; extra == "dev"
66
+ Provides-Extra: all
67
+ Requires-Dist: crawlemoon[captcha,dev,ocr]; extra == "all"
68
+ Dynamic: license-file
69
+
70
+ # Crawlemoon MCP Server
71
+
72
+ <p align="center">
73
+ <img src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/hero.png" alt="Crawlemoon MCP Server — free, AI-native web crawling for the agent era" width="100%"/>
74
+ </p>
75
+
76
+ <p align="left">
77
+ <img alt="python 3.10+ · pypi 1.1.5 · MIT · MCP-native · code style black" src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/badges.png" height="22"/>
78
+ </p>
79
+
80
+ A **free, open-source MCP server** that gives any agent (Claude Code, Cursor, Windsurf, …) **55 production-grade tools** for the full web-crawling stack: deep analysis, stealth, API discovery, session recording → runnable crawler, smart extraction. No proprietary API. No per-request fee.
81
+
82
+ <p align="center">
83
+ <img src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/features.png" alt="Crawlemoon capabilities — deep analysis, stealth, record→crawler, smart extraction" width="100%"/>
84
+ </p>
85
+
86
+ ---
87
+
88
+ ## Quick start
89
+
90
+ <p align="center">
91
+ <img src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/install.png" alt="Three install paths — uvx, pipx, pip" width="100%"/>
92
+ </p>
93
+
94
+ The recommended path needs no install — `uvx` runs straight from PyPI:
95
+
96
+ ```json
97
+ {
98
+ "mcpServers": {
99
+ "crawlemoon": {
100
+ "command": "uvx",
101
+ "args": ["crawlemoon-mcp-server"]
102
+ }
103
+ }
104
+ }
105
+ ```
106
+
107
+ > Requires [`uv`](https://docs.astral.sh/uv/getting-started/installation/). Install once: `curl -LsSf https://astral.sh/uv/install.sh | sh`. Or use `pipx run crawlemoon-mcp-server` / `pip install crawlemoon-mcp-server` instead.
108
+
109
+ **Where to put that JSON:** Cursor → Settings → MCP. Claude Code → `~/.config/claude/mcp_settings.json`. Windsurf → Settings → MCP Servers.
110
+
111
+ ---
112
+
113
+ ## How it works
114
+
115
+ <p align="center">
116
+ <img src="https://raw.githubusercontent.com/razavioo/crawlemoon-mcp-server/main/assets/architecture.png" alt="Agent → Crawlemoon → Browser/HTTP/Proxy → target web" width="100%"/>
117
+ </p>
118
+
119
+ Your agent talks to Crawlemoon over the Model Context Protocol. Crawlemoon owns a hardened browser pool, an HTTP stack with TLS fingerprinting, and a rotating proxy pool. While it fetches pages, it captures network traffic, reads scripts, and introspects schemas — so the agent gets clean structured data, not raw HTML.
120
+
121
+ ---
122
+
123
+ ## What's in the box
124
+
125
+ A short list — see the source for the full set of 55 tools.
126
+
127
+ | Group | Tools |
128
+ |---|---|
129
+ | **Deep analysis** | `deep_analyze`, `discover_apis`, `introspect_graphql`, `analyze_websocket`, `analyze_auth`, `detect_protection`, `detect_technology` |
130
+ | **Stealth** | `stealth_request`, `configure_proxies`, `configure_rate_limit`, `add_proxy`, `test_proxy` |
131
+ | **Record → crawler** | `record_session`, `stop_recording`, `export_recording`, `generate_crawler` |
132
+ | **Extraction** | `smart_extract`, `extract_article`, `extract_tables`, `extract_links`, `extract_forms`, `extract_metadata`, `convert_to_markdown` |
133
+ | **Page interaction** | `take_screenshot`, `fill_form`, `wait_and_extract`, `compare_pages`, `measure_performance`, `check_accessibility`, `get_dom_tree` |
134
+ | **Sessions & cache** | `save_session`, `load_session`, `get_cookies`, `get_storage`, `clear_cache`, `get_cache_stats` |
135
+ | **Advanced (opt-in)** | `execute_js`, `execute_cdp`, `deobfuscate_js`, `extract_from_js`, `solve_captcha` |
136
+
137
+ ---
138
+
139
+ ## Smart extraction — bring any LLM, including free ones
140
+
141
+ `smart_extract` works **without any API key** using pattern matching. Plug in any OpenAI-compatible endpoint for higher accuracy — including FREE tiers:
142
+
143
+ ```bash
144
+ # OpenRouter (free models exist)
145
+ CRAWLEMOON_LLM_PROVIDER=openrouter
146
+ CRAWLEMOON_LLM_API_KEY=sk-or-v1-xxx
147
+ CRAWLEMOON_LLM_MODEL=meta-llama/llama-3.2-3b-instruct:free
148
+
149
+ # Groq (free, very fast)
150
+ CRAWLEMOON_LLM_PROVIDER=groq
151
+ CRAWLEMOON_LLM_API_KEY=gsk_xxx
152
+
153
+ # Local Ollama (no key needed)
154
+ CRAWLEMOON_LLM_PROVIDER=ollama
155
+ CRAWLEMOON_LLM_MODEL=llama3.2
156
+ ```
157
+
158
+ Together, DeepSeek, Mistral, Fireworks, and standard OpenAI also work via `CRAWLEMOON_LLM_BASE_URL`.
159
+
160
+ ---
161
+
162
+ ## Configuration
163
+
164
+ | Variable | Default | Notes |
165
+ |---|---|---|
166
+ | `CRAWLEMOON_HEADLESS` | `true` | Run browser without UI |
167
+ | `CRAWLEMOON_BROWSER` | `chromium` | `chromium` / `firefox` / `webkit` |
168
+ | `CRAWLEMOON_POOL_SIZE` | `5` | Max concurrent browsers |
169
+ | `CRAWLEMOON_NAV_TIMEOUT` | `30.0` | Page-load timeout (s) |
170
+ | `CRAWLEMOON_API_KEY` | _unset_ | If set, every tool call must include matching `_api_key` |
171
+ | `CRAWLEMOON_ALLOW_DANGEROUS_JS` | `false` | Required for `execute_js` / `execute_cdp` / `deobfuscate_js` |
172
+ | `CRAWLEMOON_JS_MAX_LENGTH` | `50000` | Length cap for JS payloads |
173
+ | `CRAWLEMOON_JS_EXEC_TIMEOUT` | `10.0` | Per-script timeout (s) |
174
+
175
+ ---
176
+
177
+ ## Security
178
+
179
+ `execute_js`, `execute_cdp`, and `deobfuscate_js` are **disabled by default** — they execute or operate on arbitrary code in a real browser. Enable on trusted networks with `CRAWLEMOON_ALLOW_DANGEROUS_JS=true`. Even then, payloads are length-capped, time-bounded, and a denylist rejects `eval`, `new Function`, dynamic `import()`, `document.write`, `importScripts`, and `WebAssembly.{compile,instantiate}`. Set `CRAWLEMOON_API_KEY` so MCP clients must present a matching `_api_key`.
180
+
181
+ These are mitigations, not a sandbox: do not expose this server to untrusted clients.
182
+
183
+ ---
184
+
185
+ ## Develop
186
+
187
+ ```bash
188
+ git clone https://github.com/razavioo/crawlemoon-mcp-server.git
189
+ cd crawlemoon-mcp-server
190
+ make dev-install # editable install + dev/captcha/ocr extras + pre-commit
191
+ make test # pytest
192
+ make lint # ruff + mypy
193
+ ```
194
+
195
+ PRs welcome. We are particularly interested in: distributed mode (Redis queue), result sinks (Postgres / S3), Prometheus metrics. Licensed under the [MIT License](LICENSE).
196
+
197
+ <p align="center"><sub>Made by <a href="https://emad.dev">emad.dev</a></sub></p>
@@ -0,0 +1,94 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ setup.py
5
+ crawlemoon.egg-info/PKG-INFO
6
+ crawlemoon.egg-info/SOURCES.txt
7
+ crawlemoon.egg-info/dependency_links.txt
8
+ crawlemoon.egg-info/entry_points.txt
9
+ crawlemoon.egg-info/requires.txt
10
+ crawlemoon.egg-info/top_level.txt
11
+ src/__init__.py
12
+ src/exceptions.py
13
+ src/cli/__init__.py
14
+ src/cli/main.py
15
+ src/core/__init__.py
16
+ src/core/logging.py
17
+ src/core/rate_limiter.py
18
+ src/core/recording_storage.py
19
+ src/core/browser/__init__.py
20
+ src/core/browser/cdp.py
21
+ src/core/browser/pool.py
22
+ src/core/browser/proxy_pool.py
23
+ src/core/browser/stealth.py
24
+ src/core/browser/xray.py
25
+ src/core/cache/__init__.py
26
+ src/core/cache/manager.py
27
+ src/core/http/__init__.py
28
+ src/core/http/stealth_client.py
29
+ src/core/session/__init__.py
30
+ src/core/session/manager.py
31
+ src/crawlers/__init__.py
32
+ src/intelligence/__init__.py
33
+ src/intelligence/extraction/__init__.py
34
+ src/intelligence/extraction/content.py
35
+ src/intelligence/extraction/smart.py
36
+ src/intelligence/generator/__init__.py
37
+ src/intelligence/generator/crawler_gen.py
38
+ src/intelligence/js/__init__.py
39
+ src/intelligence/js/analyzer.py
40
+ src/intelligence/js/deobfuscator.py
41
+ src/intelligence/js/dynamic.py
42
+ src/intelligence/network/__init__.py
43
+ src/intelligence/network/analyzer.py
44
+ src/intelligence/network/api_discovery.py
45
+ src/intelligence/network/graphql.py
46
+ src/intelligence/network/interceptor.py
47
+ src/intelligence/network/sitemap.py
48
+ src/intelligence/network/websocket.py
49
+ src/intelligence/recorder/__init__.py
50
+ src/intelligence/recorder/session.py
51
+ src/intelligence/recorder/state_machine.py
52
+ src/intelligence/security/__init__.py
53
+ src/intelligence/security/auth.py
54
+ src/intelligence/security/bot_detection.py
55
+ src/intelligence/security/captcha_solver.py
56
+ src/intelligence/security/technology_detector.py
57
+ src/mcp/__init__.py
58
+ src/mcp/config.py
59
+ src/mcp/schemas.py
60
+ src/mcp/server.py
61
+ src/mcp/utils.py
62
+ src/mcp/tools/__init__.py
63
+ src/sites/__init__.py
64
+ tests/test_api_discovery.py
65
+ tests/test_auth_analyzer.py
66
+ tests/test_bot_detection.py
67
+ tests/test_browser_pool.py
68
+ tests/test_cache_manager.py
69
+ tests/test_captcha_solver.py
70
+ tests/test_cdp_client.py
71
+ tests/test_content_extractor.py
72
+ tests/test_crawl_and_extract.py
73
+ tests/test_crawler_generator.py
74
+ tests/test_improvements.py
75
+ tests/test_js_analyzer.py
76
+ tests/test_js_deobfuscator.py
77
+ tests/test_js_security.py
78
+ tests/test_mcp_config.py
79
+ tests/test_mcp_server.py
80
+ tests/test_mcp_utils.py
81
+ tests/test_network_interceptor.py
82
+ tests/test_proxy_pool.py
83
+ tests/test_rate_limiter.py
84
+ tests/test_recording_storage.py
85
+ tests/test_request_analyzer.py
86
+ tests/test_session_manager.py
87
+ tests/test_session_recorder.py
88
+ tests/test_sitemap_analyzer.py
89
+ tests/test_smart_extractor.py
90
+ tests/test_stealth.py
91
+ tests/test_stealth_client.py
92
+ tests/test_technology_detector.py
93
+ tests/test_xray.py
94
+ tests/test_xray_advanced.py
@@ -0,0 +1,4 @@
1
+ [console_scripts]
2
+ crawl = src.cli.main:main
3
+ crawlemoon = src.mcp.server:main_sync
4
+ crawlemoon-mcp-server = src.mcp.server:main_sync