agentic-stealth-browser 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentic_stealth_browser-0.8.0.dist-info/METADATA +195 -0
- agentic_stealth_browser-0.8.0.dist-info/RECORD +53 -0
- agentic_stealth_browser-0.8.0.dist-info/WHEEL +5 -0
- agentic_stealth_browser-0.8.0.dist-info/entry_points.txt +2 -0
- agentic_stealth_browser-0.8.0.dist-info/licenses/LICENSE +21 -0
- agentic_stealth_browser-0.8.0.dist-info/top_level.txt +11 -0
- ai/__init__.py +0 -0
- ai/ai_hooks.py +126 -0
- audit/__init__.py +0 -0
- audit/logger.py +449 -0
- behavior/__init__.py +0 -0
- behavior/adaptive_tuner.py +175 -0
- behavior/human_behavior.py +825 -0
- behavior/orchestration.py +141 -0
- behavior/persona_rotator.py +380 -0
- core/__init__.py +0 -0
- core/account_health.py +309 -0
- core/account_warming.py +315 -0
- core/agent_browser.py +1676 -0
- core/connection_pool.py +115 -0
- core/error_messages.py +191 -0
- core/py.typed +0 -0
- core/session_checkpoint.py +354 -0
- core/types.py +169 -0
- linkedin/__init__.py +5 -0
- linkedin/actions.py +154 -0
- production/Dockerfile +83 -0
- production/__init__.py +7 -0
- production/cli.py +240 -0
- production/docker-compose.yml +45 -0
- production/docker-healthcheck.py +14 -0
- production/metrics.py +140 -0
- production/otel_export.py +210 -0
- production/rate_limiter.py +291 -0
- proxy/__init__.py +0 -0
- proxy/proxy_manager.py +369 -0
- recovery/__init__.py +0 -0
- recovery/anti_block_orchestrator.py +791 -0
- recovery/detectors.py +263 -0
- recovery/explain_blocked.py +98 -0
- scraping/__init__.py +0 -0
- scraping/scraper.py +88 -0
- sessions/__init__.py +0 -0
- sessions/cookie_manager.py +823 -0
- sessions/session_manager.py +283 -0
- stealth/__init__.py +0 -0
- stealth/advanced_stealth.py +487 -0
- stealth/cache.py +160 -0
- stealth/headers.py +42 -0
- stealth/presets.py +279 -0
- stealth/profiles.py +226 -0
- stealth/tls_fingerprint.py +264 -0
- stealth/tls_ja3_ja4.py +157 -0
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: agentic-stealth-browser
|
|
3
|
+
Version: 0.8.0
|
|
4
|
+
Summary: Production-grade, human-mimicking browser automation framework for autonomous agents. Survives modern anti-bot systems.
|
|
5
|
+
Author-email: Shane W <shanewas@users.noreply.github.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/shanewas/agentic-stealth-browser
|
|
8
|
+
Project-URL: Repository, https://github.com/shanewas/agentic-stealth-browser
|
|
9
|
+
Project-URL: Issues, https://github.com/shanewas/agentic-stealth-browser/issues
|
|
10
|
+
Project-URL: Documentation, https://github.com/shanewas/agentic-stealth-browser/tree/master/docs
|
|
11
|
+
Project-URL: Changelog, https://github.com/shanewas/agentic-stealth-browser/blob/master/CHANGELOG.md
|
|
12
|
+
Keywords: stealth,browser,automation,playwright,anti-bot,scraping,agent
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Browsers
|
|
22
|
+
Classifier: Topic :: Software Development :: Testing
|
|
23
|
+
Classifier: Framework :: AsyncIO
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Requires-Dist: playwright
|
|
28
|
+
Requires-Dist: aiohttp
|
|
29
|
+
Requires-Dist: cryptography
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: pytest; extra == "dev"
|
|
32
|
+
Requires-Dist: pytest-asyncio; extra == "dev"
|
|
33
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
34
|
+
Dynamic: license-file
|
|
35
|
+
|
|
36
|
+
# Agentic Stealth Browser
|
|
37
|
+
|
|
38
|
+
[CI](https://github.com/shanewas/agentic-stealth-browser/actions/workflows/ci.yml)
|
|
39
|
+
[License: MIT](LICENSE)
|
|
40
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
41
|
+
[Tests](tests/)
|
|
42
|
+
|
|
43
|
+
A Python framework that makes browser automation look human. Built for autonomous agents that need to navigate sites such as LinkedIn and Amazon, as well as websites protected by Cloudflare and other anti-bot systems.
|
|
44
|
+
|
|
45
|
+
## Why This Exists
|
|
46
|
+
|
|
47
|
+
Standard browser automation (`page.goto()`, `page.click()`) gets detected instantly. This framework solves that by combining:
|
|
48
|
+
|
|
49
|
+
- **TLS fingerprint spoofing** — matches real browser TLS handshakes
|
|
50
|
+
- **Human behavior simulation** — natural mouse, typing, scrolling with realistic imperfections
|
|
51
|
+
- **Automatic recovery** — detects blocks (CAPTCHAs, rate limits) and recovers without crashing
|
|
52
|
+
- **Account lifecycle management** — warming, health scoring, cooling off
|
|
53
|
+
|
|
54
|
+
## Installation
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install agentic-stealth-browser
|
|
58
|
+
playwright install --with-deps chromium
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Quick Start
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
from core.agent_browser import AgentBrowser
|
|
65
|
+
import asyncio
|
|
66
|
+
|
|
67
|
+
async def main():
|
|
68
|
+
browser = AgentBrowser(session_name="demo")
|
|
69
|
+
await browser.launch(headless=True)
|
|
70
|
+
|
|
71
|
+
# This handles stealth, human behavior, and recovery automatically
|
|
72
|
+
await browser.safe_goto("https://example.com")
|
|
73
|
+
|
|
74
|
+
# Add human-like actions
|
|
75
|
+
await browser.human.scroll_naturally(400)
|
|
76
|
+
await browser.human.think(1500, 2800)
|
|
77
|
+
|
|
78
|
+
await browser.close()
|
|
79
|
+
|
|
80
|
+
asyncio.run(main())
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Real-World Example
|
|
84
|
+
|
|
85
|
+
For protected sites, load real cookies and use a platform preset:
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
browser = AgentBrowser(session_name="linkedin")
|
|
89
|
+
await browser.launch(preset="linkedin_2026")
|
|
90
|
+
await browser.load_cookies_from_file("cookies.json")
|
|
91
|
+
await browser.warm_up_before_work(intensity="heavy")
|
|
92
|
+
await browser.safe_goto("https://www.linkedin.com/feed/", platform="linkedin")
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
The flow: **cookies → warm-up → navigate → recover if blocked → act human**.
|
|
96
|
+
|
|
97
|
+
## How It Works
|
|
98
|
+
|
|
99
|
+
```
|
|
100
|
+
AgentBrowser
|
|
101
|
+
├── Stealth → TLS profiles, canvas/WebGL spoofing, WebRTC isolation
|
|
102
|
+
├── Behavior → Bézier mouse, natural typing, distraction simulation
|
|
103
|
+
├── Recovery → Detects blocks → rotates proxy/session → retries
|
|
104
|
+
├── Accounts → Health scoring, 14-day warming, session checkpointing
|
|
105
|
+
└── Proxy → Residential proxy with rotation and health tracking
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Key Features
|
|
109
|
+
|
|
110
|
+
| Feature | What It Does |
|
|
111
|
+
|---|---|
|
|
112
|
+
| **TLS Fingerprinting** | Region-specific profiles (US, Japan, EU, Korea) with JA3/JA4 support |
|
|
113
|
+
| **Human Behavior** | Mouse with wobble, typing with mistakes, variable scrolling, fatigue |
|
|
114
|
+
| **Auto Recovery** | Detects CAPTCHAs, rate limits, blocks — recovers automatically |
|
|
115
|
+
| **Account Warming** | 14-day gradual ramp-up so new accounts don't get flagged |
|
|
116
|
+
| **Session Checkpoints** | Export/import browser state for cross-host migration |
|
|
117
|
+
| **Platform Presets** | Pre-configured profiles for LinkedIn, Amazon, Cloudflare |
|
|
118
|
+
| **MCP Server** | Integration with AI agents via Model Context Protocol |
|
|
119
|
+
|
|
120
|
+
## Configuration
|
|
121
|
+
|
|
122
|
+
### Environment Variables
|
|
123
|
+
|
|
124
|
+
| Variable | Description | Default |
|
|
125
|
+
|---|---|---|
|
|
126
|
+
| `STEALTH_REGION` | TLS fingerprint region | `japan` |
|
|
127
|
+
| `STEALTH_HEADLESS` | Run browser headless | `true` |
|
|
128
|
+
| `STEALTH_PROXY` | Use residential proxy | `false` |
|
|
129
|
+
|
|
130
|
+
### Platform Presets
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
await browser.launch(preset="linkedin_2026") # LinkedIn
|
|
134
|
+
await browser.launch(preset="amazon_2026") # Amazon
|
|
135
|
+
await browser.launch(preset="cloudflare") # Cloudflare-protected sites
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## Project Structure
|
|
139
|
+
|
|
140
|
+
```
|
|
141
|
+
agentic-stealth-browser/
|
|
142
|
+
├── core/ # AgentBrowser main class
|
|
143
|
+
├── stealth/ # TLS fingerprinting, script injection, caching
|
|
144
|
+
├── behavior/ # Human-like mouse, typing, scrolling, personas
|
|
145
|
+
├── recovery/ # Block detection, anti-block orchestrator
|
|
146
|
+
├── proxy/ # Proxy management and rotation
|
|
147
|
+
├── sessions/ # Session and cookie management
|
|
148
|
+
├── audit/ # Structured logging and audit trails
|
|
149
|
+
├── ai/ # AI hooks and content analysis
|
|
150
|
+
├── production/ # CLI, Docker, rate limiting, metrics
|
|
151
|
+
├── linkedin/ # LinkedIn-specific actions
|
|
152
|
+
├── scraping/ # Safe page scraping utilities
|
|
153
|
+
├── docs/ # Architecture Decision Records and guides
|
|
154
|
+
└── tests/ # 493 tests across 23 files
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
## Documentation
|
|
158
|
+
|
|
159
|
+
- [Architecture Decision Records](docs/adr/)
|
|
160
|
+
- [Visual Debugging Guide](docs/VISUAL_DEBUGGING.md)
|
|
161
|
+
- [Stealth Limitations](docs/STEALTH_LIMITATIONS.md)
|
|
162
|
+
- [Threat Model](docs/THREAT_MODEL.md)
|
|
163
|
+
- [Common Pitfalls](docs/COMMON_PITFALLS.md)
|
|
164
|
+
- [Rate Limiting & Backoff](docs/RATE_LIMITING_BACKOFF.md)
|
|
165
|
+
- [Cookie & Session Resilience](docs/COOKIE_SESSION_RESILIENCE.md)
|
|
166
|
+
|
|
167
|
+
## Security
|
|
168
|
+
|
|
169
|
+
See [SECURITY.md](SECURITY.md) for vulnerability reporting and best practices.
|
|
170
|
+
|
|
171
|
+
## Responsible Use
|
|
172
|
+
|
|
173
|
+
This framework is designed for legitimate automation use cases such as:
|
|
174
|
+
|
|
175
|
+
- Testing your own applications and infrastructure
|
|
176
|
+
- Automating workflows on platforms that permit automation
|
|
177
|
+
- Research and security analysis
|
|
178
|
+
- Accessibility testing
|
|
179
|
+
|
|
180
|
+
**Important:** Many websites (including LinkedIn, Amazon, and others) prohibit automated access in their Terms of Service. Always:
|
|
181
|
+
|
|
182
|
+
1. Review the target site's Terms of Service and robots.txt
|
|
183
|
+
2. Obtain proper authorization before automating access
|
|
184
|
+
3. Respect rate limits and avoid causing harm to services
|
|
185
|
+
4. Use this tool responsibly and legally
|
|
186
|
+
|
|
187
|
+
This project is provided as-is under the MIT License. Users are responsible for complying with applicable laws and terms of service.
|
|
188
|
+
|
|
189
|
+
## Contributing
|
|
190
|
+
|
|
191
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|
|
192
|
+
|
|
193
|
+
## License
|
|
194
|
+
|
|
195
|
+
MIT License. See [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
agentic_stealth_browser-0.8.0.dist-info/licenses/LICENSE,sha256=OHalejGTK-x9OKISC5qIzC5WG5UdVYms8MvPYuITTwA,1063
|
|
2
|
+
ai/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
ai/ai_hooks.py,sha256=aKm64VpU6Xyh4BXtsi1zjDWQsv3qDcabjuuyHha1yYw,5449
|
|
4
|
+
audit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
+
audit/logger.py,sha256=APFZ9Q4e8uVhfHyslV_yAgxfTJB1L9zW4EIEuKHTF5g,18065
|
|
6
|
+
behavior/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
behavior/adaptive_tuner.py,sha256=P4SpoRiV0fxKLI6zOWkJKncsPy0wFpE38Ge00bdTyWg,6466
|
|
8
|
+
behavior/human_behavior.py,sha256=aYhD9h_a94n8jTB2Lmtf1oHcPtobyDh3BxXBgvfBATk,39843
|
|
9
|
+
behavior/orchestration.py,sha256=UgCYf-AG5D4QgaF_gEQcgQh_IDFss8D9DrvevpIJD28,5183
|
|
10
|
+
behavior/persona_rotator.py,sha256=5kObwFkKirKfEEBkH5q-EyFXgZ4EN1Ew0A5DIg7sa1k,13703
|
|
11
|
+
core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
+
core/account_health.py,sha256=7maJ5S8GLw4eIZsIONrqMSkfRPZ1aNERWNlqT4HMHTs,11210
|
|
13
|
+
core/account_warming.py,sha256=qyiIoaGGnD4E1JOvAaZV-bLzOoxrsnaxG9dk6qgFHPc,11029
|
|
14
|
+
core/agent_browser.py,sha256=L2i1AJzWEbw5bOVtaWKWxitjAspTw3vW_VHIPdOHL4U,82777
|
|
15
|
+
core/connection_pool.py,sha256=DFwDXZAIX0s3_uZW98GMX6YpDlmv26-wyMIlaUYmyIs,4314
|
|
16
|
+
core/error_messages.py,sha256=lC6bLUXAn-kLrOunujYOaayWgqqUTlIEL_P9LEK51us,7537
|
|
17
|
+
core/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
|
+
core/session_checkpoint.py,sha256=wDA4elYk1mBgPMUodjyaMsFBPi7U8CBcCpRHS88twLY,13004
|
|
19
|
+
core/types.py,sha256=EDDJL31oMv8tM9sO7jGdn3Oakk2uX4MD7yj70-DUPNI,4755
|
|
20
|
+
linkedin/__init__.py,sha256=TdLucVbGu1c_f5xidQw7urwTO2FctpNTKxbmH5PaU0M,149
|
|
21
|
+
linkedin/actions.py,sha256=uhpSzXKNgHt0395pHuUzB9NF4VuNOfBrBniihpD3q9g,5733
|
|
22
|
+
production/Dockerfile,sha256=uIFGtcykrtnqiukPgAnSzNHkMVOrfHhxhhkseMGTl0I,2214
|
|
23
|
+
production/__init__.py,sha256=yVLj-2coZXgbDFrTQbDXg4xEaMUY8YnL6z_RybvCNek,302
|
|
24
|
+
production/cli.py,sha256=z-PsEtgqL0e7yMluMAgQ9EN16UBuqwIuDSesoKUoSAg,9564
|
|
25
|
+
production/docker-compose.yml,sha256=nV5qEJHLU38oOOaN6JC5lJDRN_kGn_OO9SIIzYWteaQ,1368
|
|
26
|
+
production/docker-healthcheck.py,sha256=TJhCh1aYlyPfM6_qcBSknYWX0-5-4KNCHVSmaS5gInc,485
|
|
27
|
+
production/metrics.py,sha256=XgRRVMveElkmkpR1joM-1o8GMTuMi-PjycQt4Qq_QIw,6163
|
|
28
|
+
production/otel_export.py,sha256=ZK1s4Hx-8sicqceu9bI1tizwXjysrU4qC_ydfD0Z5Zs,7108
|
|
29
|
+
production/rate_limiter.py,sha256=4te3x6AjzicpiHFuqPYiYPUUV5aeEdT7zbHuAw1ixWQ,12773
|
|
30
|
+
proxy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
31
|
+
proxy/proxy_manager.py,sha256=vGugVpSXw7hUvjatlMKtD5kkXMidCoPhSYK4mO83g8s,14620
|
|
32
|
+
recovery/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
33
|
+
recovery/anti_block_orchestrator.py,sha256=DvM6GrI1RaWWcS4BflZYc7mPYkvLhBUaTXWsncUGtbU,37034
|
|
34
|
+
recovery/detectors.py,sha256=AMs7dTQ3CEXvWWC29YmYXc-lPbNMqO2lWjO3AuJTpWQ,9598
|
|
35
|
+
recovery/explain_blocked.py,sha256=9CvZEva-u4HWL9cnVWRDcs8eHO_eSKqsZf7NUH48ruo,5171
|
|
36
|
+
scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
37
|
+
scraping/scraper.py,sha256=wddQTNlgG7K2gZ5cusGeqWGPYo23z4RV3j38oDMRuOo,3947
|
|
38
|
+
sessions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
39
|
+
sessions/cookie_manager.py,sha256=BQxTXRRuWLsnVN0tAFZ0zNeBtUi_BIUPtqAaqm54LWs,37921
|
|
40
|
+
sessions/session_manager.py,sha256=Bo-uDOtVu_FW4eJn_VPUKSXTdlr-y7gHnFvy6WYlhTs,11429
|
|
41
|
+
stealth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
42
|
+
stealth/advanced_stealth.py,sha256=Mq-j8t4K4O5G7IcS_SnDRfBHkYnGLiuXJbM8FUW2FYs,26328
|
|
43
|
+
stealth/cache.py,sha256=a4zfwoEPkPQozR-I6I7UCqVko6jekEXjRi1q-uDhrLI,4979
|
|
44
|
+
stealth/headers.py,sha256=LN-s0FllcCPgdrGzazYduD-YPrq3riXjIUG2zAIC2tw,1914
|
|
45
|
+
stealth/presets.py,sha256=1epbVDWY5S1l4uja9qWr5goHEXY02oy44cmpSGq956o,10553
|
|
46
|
+
stealth/profiles.py,sha256=FGPLc8aQ6qKY7HofSCCIxMO_Vk_D99jvcIUio_LD7Nw,8015
|
|
47
|
+
stealth/tls_fingerprint.py,sha256=VGmUoJOEveL5HnZGBVzrqR5n2qAt6tVGQ5GknKMMPEM,11243
|
|
48
|
+
stealth/tls_ja3_ja4.py,sha256=uRkvUw2Wy1gnMeYnwwwGwd9thx2VRiHw50XGPq8TMn4,6199
|
|
49
|
+
agentic_stealth_browser-0.8.0.dist-info/METADATA,sha256=IcM7nyeg8VBytoSnGGy3E_w7ofutzqf87-hdY0OjdWk,7643
|
|
50
|
+
agentic_stealth_browser-0.8.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
51
|
+
agentic_stealth_browser-0.8.0.dist-info/entry_points.txt,sha256=7FmBqyu4LJZJFRMpch2bnvUrZVj6HukbyTQqJgl0SaE,64
|
|
52
|
+
agentic_stealth_browser-0.8.0.dist-info/top_level.txt,sha256=z-dkqaR0uDPpoCvXYzkjXU9oO_AW8wtzV_bWhDm17iA,84
|
|
53
|
+
agentic_stealth_browser-0.8.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Shane W
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
ai/__init__.py
ADDED
|
File without changes
|
ai/ai_hooks.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""
|
|
2
|
+
AI Integration Hooks for Agentic Browser
|
|
3
|
+
Allows connecting to LLMs for decision making and content analysis
|
|
4
|
+
|
|
5
|
+
Security: All external content (page text, context) is sanitized before being
|
|
6
|
+
eligible for LLM consumption to mitigate prompt injection (#188 P3).
|
|
7
|
+
|
|
8
|
+
P3 #98 fix: AIHooks now provides real value even without an LLM provider:
|
|
9
|
+
- Content sanitization for safe downstream use
|
|
10
|
+
- Rule-based action decisions as fallback
|
|
11
|
+
- Structured extraction via regex/heuristics
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from typing import Any, Dict, Optional, List
|
|
15
|
+
import re
|
|
16
|
+
import json
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class AIHooks:
    """
    AI integration hooks with injection protection and rule-based fallbacks.

    When provider="none" (the default), ``enabled`` is False and the hooks
    rely on local heuristics only:
    - Content sanitization for safe downstream use
    - Rule-based action decisions
    - Heuristic structured extraction (links, emails, phones, prices)

    When any other provider is given, ``enabled`` is True.
    NOTE(review): no provider client is implemented in this class -- the
    enabled code paths only sanitize their input and return placeholder
    values. ``api_key``, ``model`` and ``_client`` are stored but never read
    here.
    """

    # Common injection / jailbreak patterns observed in web content & attacks
    _INJECTION_PATTERNS = [
        r"\b(ignore|disregard|forget)\s+(all\s+)?(previous|above|prior|earlier|system|instructions?)\b",
        r"\b(you are now|act as|role.?play|pretend you are|from now on you are)\b",
        r"\b(system prompt|override|jailbreak|do anything|no restrictions|unfiltered)\b",
        r"^\s*(system:|assistant:|user:)\s*",
        r"\b(output only the|respond with only|print exactly|your new instructions)\b",
    ]
    # MULTILINE is required so the "^...(system:|assistant:|user:)" pattern
    # matches a role marker at the start of *any* line, not only at the very
    # beginning of the content.
    _INJECTION_RE = re.compile(
        "|".join(_INJECTION_PATTERNS), re.IGNORECASE | re.MULTILINE
    )
    # Zero-width / invisible characters that can be used to split a keyword
    # ("ig<ZWSP>nore") so that it slips past the patterns above.
    _ZERO_WIDTH_RE = re.compile(r"[\u200b\u200c\u200d\u2060\ufeff]")
    _REDACTION_MARKER = "[REDACTED: potential instruction override]"

    _URL_RE = re.compile(r'https?://[^\s<>"\')\]]+')
    _EMAIL_RE = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
    _PHONE_RE = re.compile(r'\+?[\d\s\-\(\)]{10,}')
    _PRICE_RE = re.compile(r'\$[\d,]+\.?\d*')
    # The phone pattern also matches runs of whitespace/punctuation; require
    # a minimum number of actual digits before accepting a candidate.
    _MIN_PHONE_DIGITS = 7

    def __init__(self, provider: str = "none", api_key: Optional[str] = None, model: Optional[str] = None):
        """Store the provider configuration; AI is enabled unless provider is "none"."""
        self.provider = provider
        self.api_key = api_key
        self.model = model
        self.enabled = provider != "none"
        self._client = None

    def sanitize_for_llm(self, content: str, max_len: int = 12000) -> str:
        """Strip or neuter likely prompt-injection content from untrusted page/LLM input.

        Steps, in order:
        1. Truncate the input to ``max_len`` characters.
        2. Remove zero-width characters. This happens *before* pattern
           matching so that invisible characters cannot be used to break up
           a keyword and evade redaction.
        3. Replace every match of the injection patterns with a
           ``[REDACTED: ...]`` marker.

        The result can be longer than ``max_len`` because the marker may be
        longer than the text it replaces.

        Returns "" for empty or non-string input.
        This is a practical P3 baseline; real deployments should layer more (e.g. LLM guardrails).
        """
        if not content or not isinstance(content, str):
            return ""
        safe = content[:max_len]
        safe = self._ZERO_WIDTH_RE.sub("", safe)
        return self._INJECTION_RE.sub(self._REDACTION_MARKER, safe)

    def extract_links(self, text: str) -> List[str]:
        """P3 #98: Extract http(s) URLs from text without LLM.

        Returns an empty list for non-string input.
        """
        if not isinstance(text, str):
            return []
        return self._URL_RE.findall(text)

    def extract_emails(self, text: str) -> List[str]:
        """P3 #98: Extract email addresses from text without LLM.

        Returns an empty list for non-string input.
        """
        if not isinstance(text, str):
            return []
        return self._EMAIL_RE.findall(text)

    def _extract_phones(self, text: str) -> List[str]:
        """Return phone-like substrings of ``text`` that contain enough digits.

        Candidates are stripped of surrounding whitespace; matches made up
        mostly of whitespace or punctuation are dropped.
        """
        phones = []
        for candidate in self._PHONE_RE.findall(text):
            digit_count = sum(ch.isdigit() for ch in candidate)
            if digit_count >= self._MIN_PHONE_DIGITS:
                phones.append(candidate.strip())
        return phones

    async def analyze_page(self, page_content: str, task: str) -> str:
        """Analyze page content (with injection protection).

        With AI disabled, returns a plain-text heuristic summary (word count,
        number of links and emails, sanitized length, and the task). With AI
        enabled, returns a placeholder string; no provider call is made here.
        """
        safe_content = self.sanitize_for_llm(page_content)

        if not self.enabled:
            links = self.extract_links(safe_content)
            emails = self.extract_emails(safe_content)
            word_count = len(safe_content.split())
            return (
                "Heuristic analysis (AI disabled):\n"
                f"  Word count: {word_count}\n"
                f"  Links found: {len(links)}\n"
                f"  Emails found: {len(emails)}\n"
                f"  Content sanitized: {len(safe_content)} chars\n"
                f"  Task: {task}\n"
                "  Enable AI provider for deeper analysis."
            )

        return f"[AI Analysis Placeholder] Task: {task} (content sanitized: {len(safe_content)} chars kept)"

    async def decide_next_action(self, context: Dict) -> str:
        """Decide the next browsing action from a context mapping.

        ``context`` may be None or empty; missing keys fall back to defaults.
        Keys read in the rule-based (AI disabled) path:
        - "status": 403, 429 or 503 -> "backoff_and_retry"
        - "url": contains "login" or "signin" (case-insensitive) -> "wait_for_user"
        - "has_content": falsy or missing -> "scroll"
        Otherwise returns "extract_and_continue".

        With AI enabled, string values of the context are sanitized and the
        method returns "scroll"; no provider call is made here.
        """
        ctx = context or {}

        if not self.enabled:
            # Tolerate a missing or None url instead of failing on .lower().
            url = str(ctx.get("url") or "").lower()
            status = ctx.get("status", 200)
            if status in (403, 429, 503):
                return "backoff_and_retry"
            if "login" in url or "signin" in url:
                return "wait_for_user"
            if not ctx.get("has_content", False):
                return "scroll"
            return "extract_and_continue"

        # Sanitized view of the context (injection protection on text
        # fields). It is not consumed yet: this branch always returns
        # "scroll".
        safe_ctx = {
            key: self.sanitize_for_llm(value) if isinstance(value, str) else value
            for key, value in ctx.items()
        }
        return "scroll"

    async def extract_structured_data(self, text: str, schema: Dict) -> Dict:
        """Extract structured data from text (sanitized input).

        ``schema`` is accepted for interface compatibility but is not used by
        the code in this class.

        With AI disabled, returns
        ``{"status": "heuristic_extraction", "data": {...}, "sanitized_len": n}``
        where ``data`` holds "links", "emails", "phones" and "prices" lists
        found by regular expressions.

        With AI enabled, returns an empty ``data`` dict.
        NOTE(review): the enabled branch reports status "ai_disabled", which
        looks inverted; it is left unchanged because callers may compare
        against that value -- confirm before changing.
        """
        safe_text = self.sanitize_for_llm(text)

        if not self.enabled:
            result = {
                "links": self.extract_links(safe_text),
                "emails": self.extract_emails(safe_text),
                "phones": self._extract_phones(safe_text),
                "prices": self._PRICE_RE.findall(safe_text),
            }
            return {"status": "heuristic_extraction", "data": result, "sanitized_len": len(safe_text)}

        return {"status": "ai_disabled", "data": {}, "sanitized_len": len(safe_text)}
|
audit/__init__.py
ADDED
|
File without changes
|