agentic-stealth-browser 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. agentic_stealth_browser-0.8.0.dist-info/METADATA +195 -0
  2. agentic_stealth_browser-0.8.0.dist-info/RECORD +53 -0
  3. agentic_stealth_browser-0.8.0.dist-info/WHEEL +5 -0
  4. agentic_stealth_browser-0.8.0.dist-info/entry_points.txt +2 -0
  5. agentic_stealth_browser-0.8.0.dist-info/licenses/LICENSE +21 -0
  6. agentic_stealth_browser-0.8.0.dist-info/top_level.txt +11 -0
  7. ai/__init__.py +0 -0
  8. ai/ai_hooks.py +126 -0
  9. audit/__init__.py +0 -0
  10. audit/logger.py +449 -0
  11. behavior/__init__.py +0 -0
  12. behavior/adaptive_tuner.py +175 -0
  13. behavior/human_behavior.py +825 -0
  14. behavior/orchestration.py +141 -0
  15. behavior/persona_rotator.py +380 -0
  16. core/__init__.py +0 -0
  17. core/account_health.py +309 -0
  18. core/account_warming.py +315 -0
  19. core/agent_browser.py +1676 -0
  20. core/connection_pool.py +115 -0
  21. core/error_messages.py +191 -0
  22. core/py.typed +0 -0
  23. core/session_checkpoint.py +354 -0
  24. core/types.py +169 -0
  25. linkedin/__init__.py +5 -0
  26. linkedin/actions.py +154 -0
  27. production/Dockerfile +83 -0
  28. production/__init__.py +7 -0
  29. production/cli.py +240 -0
  30. production/docker-compose.yml +45 -0
  31. production/docker-healthcheck.py +14 -0
  32. production/metrics.py +140 -0
  33. production/otel_export.py +210 -0
  34. production/rate_limiter.py +291 -0
  35. proxy/__init__.py +0 -0
  36. proxy/proxy_manager.py +369 -0
  37. recovery/__init__.py +0 -0
  38. recovery/anti_block_orchestrator.py +791 -0
  39. recovery/detectors.py +263 -0
  40. recovery/explain_blocked.py +98 -0
  41. scraping/__init__.py +0 -0
  42. scraping/scraper.py +88 -0
  43. sessions/__init__.py +0 -0
  44. sessions/cookie_manager.py +823 -0
  45. sessions/session_manager.py +283 -0
  46. stealth/__init__.py +0 -0
  47. stealth/advanced_stealth.py +487 -0
  48. stealth/cache.py +160 -0
  49. stealth/headers.py +42 -0
  50. stealth/presets.py +279 -0
  51. stealth/profiles.py +226 -0
  52. stealth/tls_fingerprint.py +264 -0
  53. stealth/tls_ja3_ja4.py +157 -0
@@ -0,0 +1,195 @@
1
+ Metadata-Version: 2.4
2
+ Name: agentic-stealth-browser
3
+ Version: 0.8.0
4
+ Summary: Production-grade, human-mimicking browser automation framework for autonomous agents. Survives modern anti-bot systems.
5
+ Author-email: Shane W <shanewas@users.noreply.github.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/shanewas/agentic-stealth-browser
8
+ Project-URL: Repository, https://github.com/shanewas/agentic-stealth-browser
9
+ Project-URL: Issues, https://github.com/shanewas/agentic-stealth-browser/issues
10
+ Project-URL: Documentation, https://github.com/shanewas/agentic-stealth-browser/tree/master/docs
11
+ Project-URL: Changelog, https://github.com/shanewas/agentic-stealth-browser/blob/master/CHANGELOG.md
12
+ Keywords: stealth,browser,automation,playwright,anti-bot,scraping,agent
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Internet :: WWW/HTTP :: Browsers
22
+ Classifier: Topic :: Software Development :: Testing
23
+ Classifier: Framework :: AsyncIO
24
+ Requires-Python: >=3.10
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Requires-Dist: playwright
28
+ Requires-Dist: aiohttp
29
+ Requires-Dist: cryptography
30
+ Provides-Extra: dev
31
+ Requires-Dist: pytest; extra == "dev"
32
+ Requires-Dist: pytest-asyncio; extra == "dev"
33
+ Requires-Dist: pytest-cov; extra == "dev"
34
+ Dynamic: license-file
35
+
36
+ # Agentic Stealth Browser
37
+
38
+ [![CI](https://github.com/shanewas/agentic-stealth-browser/actions/workflows/ci.yml/badge.svg)](https://github.com/shanewas/agentic-stealth-browser/actions/workflows/ci.yml)
39
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
40
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
41
+ [![Tests](https://img.shields.io/badge/tests-493%20passing-brightgreen)](tests/)
42
+
43
+ A Python framework that makes browser automation look human. Built for autonomous agents that need to navigate websites protected by anti-bot systems, including Cloudflare-protected sites, LinkedIn, and Amazon.
44
+
45
+ ## Why This Exists
46
+
47
+ Standard browser automation (`page.goto()`, `page.click()`) gets detected instantly. This framework solves that by combining:
48
+
49
+ - **TLS fingerprint spoofing** — matches real browser TLS handshakes
50
+ - **Human behavior simulation** — natural mouse, typing, scrolling with realistic imperfections
51
+ - **Automatic recovery** — detects blocks (CAPTCHAs, rate limits) and recovers without crashing
52
+ - **Account lifecycle management** — warming, health scoring, cooling off
53
+
54
+ ## Installation
55
+
56
+ ```bash
57
+ pip install agentic-stealth-browser
58
+ playwright install --with-deps chromium
59
+ ```
60
+
61
+ ## Quick Start
62
+
63
+ ```python
64
+ from core.agent_browser import AgentBrowser
65
+ import asyncio
66
+
67
+ async def main():
68
+ browser = AgentBrowser(session_name="demo")
69
+ await browser.launch(headless=True)
70
+
71
+ # This handles stealth, human behavior, and recovery automatically
72
+ await browser.safe_goto("https://example.com")
73
+
74
+ # Add human-like actions
75
+ await browser.human.scroll_naturally(400)
76
+ await browser.human.think(1500, 2800)
77
+
78
+ await browser.close()
79
+
80
+ asyncio.run(main())
81
+ ```
82
+
83
+ ## Real-World Example
84
+
85
+ For protected sites, load real cookies and use a platform preset:
86
+
87
+ ```python
88
+ browser = AgentBrowser(session_name="linkedin")
89
+ await browser.launch(preset="linkedin_2026")
90
+ await browser.load_cookies_from_file("cookies.json")
91
+ await browser.warm_up_before_work(intensity="heavy")
92
+ await browser.safe_goto("https://www.linkedin.com/feed/", platform="linkedin")
93
+ ```
94
+
95
+ The flow: **cookies → warm-up → navigate → recover if blocked → act human**.
96
+
97
+ ## How It Works
98
+
99
+ ```
100
+ AgentBrowser
101
+ ├── Stealth → TLS profiles, canvas/WebGL spoofing, WebRTC isolation
102
+ ├── Behavior → Bézier mouse, natural typing, distraction simulation
103
+ ├── Recovery → Detects blocks → rotates proxy/session → retries
104
+ ├── Accounts → Health scoring, 14-day warming, session checkpointing
105
+ └── Proxy → Residential proxy with rotation and health tracking
106
+ ```
107
+
108
+ ## Key Features
109
+
110
+ | Feature | What It Does |
111
+ |---|---|
112
+ | **TLS Fingerprinting** | Region-specific profiles (US, Japan, EU, Korea) with JA3/JA4 support |
113
+ | **Human Behavior** | Mouse with wobble, typing with mistakes, variable scrolling, fatigue |
114
+ | **Auto Recovery** | Detects CAPTCHAs, rate limits, blocks — recovers automatically |
115
+ | **Account Warming** | 14-day gradual ramp-up so new accounts don't get flagged |
116
+ | **Session Checkpoints** | Export/import browser state for cross-host migration |
117
+ | **Platform Presets** | Pre-configured profiles for LinkedIn, Amazon, Cloudflare |
118
+ | **MCP Server** | Integration with AI agents via Model Context Protocol |
119
+
120
+ ## Configuration
121
+
122
+ ### Environment Variables
123
+
124
+ | Variable | Description | Default |
125
+ |---|---|---|
126
+ | `STEALTH_REGION` | TLS fingerprint region | `japan` |
127
+ | `STEALTH_HEADLESS` | Run browser headless | `true` |
128
+ | `STEALTH_PROXY` | Use residential proxy | `false` |
129
+
130
+ ### Platform Presets
131
+
132
+ ```python
133
+ await browser.launch(preset="linkedin_2026") # LinkedIn
134
+ await browser.launch(preset="amazon_2026") # Amazon
135
+ await browser.launch(preset="cloudflare") # Cloudflare-protected sites
136
+ ```
137
+
138
+ ## Project Structure
139
+
140
+ ```
141
+ agentic-stealth-browser/
142
+ ├── core/ # AgentBrowser main class
143
+ ├── stealth/ # TLS fingerprinting, script injection, caching
144
+ ├── behavior/ # Human-like mouse, typing, scrolling, personas
145
+ ├── recovery/ # Block detection, anti-block orchestrator
146
+ ├── proxy/ # Proxy management and rotation
147
+ ├── sessions/ # Session and cookie management
148
+ ├── audit/ # Structured logging and audit trails
149
+ ├── ai/ # AI hooks and content analysis
150
+ ├── production/ # CLI, Docker, rate limiting, metrics
151
+ ├── linkedin/ # LinkedIn-specific actions
152
+ ├── scraping/ # Safe page scraping utilities
153
+ ├── docs/ # Architecture Decision Records and guides
154
+ └── tests/ # 493 tests across 23 files
155
+ ```
156
+
157
+ ## Documentation
158
+
159
+ - [Architecture Decision Records](docs/adr/)
160
+ - [Visual Debugging Guide](docs/VISUAL_DEBUGGING.md)
161
+ - [Stealth Limitations](docs/STEALTH_LIMITATIONS.md)
162
+ - [Threat Model](docs/THREAT_MODEL.md)
163
+ - [Common Pitfalls](docs/COMMON_PITFALLS.md)
164
+ - [Rate Limiting & Backoff](docs/RATE_LIMITING_BACKOFF.md)
165
+ - [Cookie & Session Resilience](docs/COOKIE_SESSION_RESILIENCE.md)
166
+
167
+ ## Security
168
+
169
+ See [SECURITY.md](SECURITY.md) for vulnerability reporting and best practices.
170
+
171
+ ## Responsible Use
172
+
173
+ This framework is designed for legitimate automation use cases such as:
174
+
175
+ - Testing your own applications and infrastructure
176
+ - Automating workflows on platforms that permit automation
177
+ - Research and security analysis
178
+ - Accessibility testing
179
+
180
+ **Important:** Many websites (including LinkedIn, Amazon, and others) prohibit automated access in their Terms of Service. Always:
181
+
182
+ 1. Review the target site's Terms of Service and robots.txt
183
+ 2. Obtain proper authorization before automating access
184
+ 3. Respect rate limits and avoid causing harm to services
185
+ 4. Use this tool responsibly and legally
186
+
187
+ This project is provided as-is under the MIT License. Users are responsible for complying with applicable laws and terms of service.
188
+
189
+ ## Contributing
190
+
191
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
192
+
193
+ ## License
194
+
195
+ MIT License. See [LICENSE](LICENSE) for details.
@@ -0,0 +1,53 @@
1
+ agentic_stealth_browser-0.8.0.dist-info/licenses/LICENSE,sha256=OHalejGTK-x9OKISC5qIzC5WG5UdVYms8MvPYuITTwA,1063
2
+ ai/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ ai/ai_hooks.py,sha256=aKm64VpU6Xyh4BXtsi1zjDWQsv3qDcabjuuyHha1yYw,5449
4
+ audit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ audit/logger.py,sha256=APFZ9Q4e8uVhfHyslV_yAgxfTJB1L9zW4EIEuKHTF5g,18065
6
+ behavior/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ behavior/adaptive_tuner.py,sha256=P4SpoRiV0fxKLI6zOWkJKncsPy0wFpE38Ge00bdTyWg,6466
8
+ behavior/human_behavior.py,sha256=aYhD9h_a94n8jTB2Lmtf1oHcPtobyDh3BxXBgvfBATk,39843
9
+ behavior/orchestration.py,sha256=UgCYf-AG5D4QgaF_gEQcgQh_IDFss8D9DrvevpIJD28,5183
10
+ behavior/persona_rotator.py,sha256=5kObwFkKirKfEEBkH5q-EyFXgZ4EN1Ew0A5DIg7sa1k,13703
11
+ core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
+ core/account_health.py,sha256=7maJ5S8GLw4eIZsIONrqMSkfRPZ1aNERWNlqT4HMHTs,11210
13
+ core/account_warming.py,sha256=qyiIoaGGnD4E1JOvAaZV-bLzOoxrsnaxG9dk6qgFHPc,11029
14
+ core/agent_browser.py,sha256=L2i1AJzWEbw5bOVtaWKWxitjAspTw3vW_VHIPdOHL4U,82777
15
+ core/connection_pool.py,sha256=DFwDXZAIX0s3_uZW98GMX6YpDlmv26-wyMIlaUYmyIs,4314
16
+ core/error_messages.py,sha256=lC6bLUXAn-kLrOunujYOaayWgqqUTlIEL_P9LEK51us,7537
17
+ core/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
+ core/session_checkpoint.py,sha256=wDA4elYk1mBgPMUodjyaMsFBPi7U8CBcCpRHS88twLY,13004
19
+ core/types.py,sha256=EDDJL31oMv8tM9sO7jGdn3Oakk2uX4MD7yj70-DUPNI,4755
20
+ linkedin/__init__.py,sha256=TdLucVbGu1c_f5xidQw7urwTO2FctpNTKxbmH5PaU0M,149
21
+ linkedin/actions.py,sha256=uhpSzXKNgHt0395pHuUzB9NF4VuNOfBrBniihpD3q9g,5733
22
+ production/Dockerfile,sha256=uIFGtcykrtnqiukPgAnSzNHkMVOrfHhxhhkseMGTl0I,2214
23
+ production/__init__.py,sha256=yVLj-2coZXgbDFrTQbDXg4xEaMUY8YnL6z_RybvCNek,302
24
+ production/cli.py,sha256=z-PsEtgqL0e7yMluMAgQ9EN16UBuqwIuDSesoKUoSAg,9564
25
+ production/docker-compose.yml,sha256=nV5qEJHLU38oOOaN6JC5lJDRN_kGn_OO9SIIzYWteaQ,1368
26
+ production/docker-healthcheck.py,sha256=TJhCh1aYlyPfM6_qcBSknYWX0-5-4KNCHVSmaS5gInc,485
27
+ production/metrics.py,sha256=XgRRVMveElkmkpR1joM-1o8GMTuMi-PjycQt4Qq_QIw,6163
28
+ production/otel_export.py,sha256=ZK1s4Hx-8sicqceu9bI1tizwXjysrU4qC_ydfD0Z5Zs,7108
29
+ production/rate_limiter.py,sha256=4te3x6AjzicpiHFuqPYiYPUUV5aeEdT7zbHuAw1ixWQ,12773
30
+ proxy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
+ proxy/proxy_manager.py,sha256=vGugVpSXw7hUvjatlMKtD5kkXMidCoPhSYK4mO83g8s,14620
32
+ recovery/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
+ recovery/anti_block_orchestrator.py,sha256=DvM6GrI1RaWWcS4BflZYc7mPYkvLhBUaTXWsncUGtbU,37034
34
+ recovery/detectors.py,sha256=AMs7dTQ3CEXvWWC29YmYXc-lPbNMqO2lWjO3AuJTpWQ,9598
35
+ recovery/explain_blocked.py,sha256=9CvZEva-u4HWL9cnVWRDcs8eHO_eSKqsZf7NUH48ruo,5171
36
+ scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
+ scraping/scraper.py,sha256=wddQTNlgG7K2gZ5cusGeqWGPYo23z4RV3j38oDMRuOo,3947
38
+ sessions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
39
+ sessions/cookie_manager.py,sha256=BQxTXRRuWLsnVN0tAFZ0zNeBtUi_BIUPtqAaqm54LWs,37921
40
+ sessions/session_manager.py,sha256=Bo-uDOtVu_FW4eJn_VPUKSXTdlr-y7gHnFvy6WYlhTs,11429
41
+ stealth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
42
+ stealth/advanced_stealth.py,sha256=Mq-j8t4K4O5G7IcS_SnDRfBHkYnGLiuXJbM8FUW2FYs,26328
43
+ stealth/cache.py,sha256=a4zfwoEPkPQozR-I6I7UCqVko6jekEXjRi1q-uDhrLI,4979
44
+ stealth/headers.py,sha256=LN-s0FllcCPgdrGzazYduD-YPrq3riXjIUG2zAIC2tw,1914
45
+ stealth/presets.py,sha256=1epbVDWY5S1l4uja9qWr5goHEXY02oy44cmpSGq956o,10553
46
+ stealth/profiles.py,sha256=FGPLc8aQ6qKY7HofSCCIxMO_Vk_D99jvcIUio_LD7Nw,8015
47
+ stealth/tls_fingerprint.py,sha256=VGmUoJOEveL5HnZGBVzrqR5n2qAt6tVGQ5GknKMMPEM,11243
48
+ stealth/tls_ja3_ja4.py,sha256=uRkvUw2Wy1gnMeYnwwwGwd9thx2VRiHw50XGPq8TMn4,6199
49
+ agentic_stealth_browser-0.8.0.dist-info/METADATA,sha256=IcM7nyeg8VBytoSnGGy3E_w7ofutzqf87-hdY0OjdWk,7643
50
+ agentic_stealth_browser-0.8.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
51
+ agentic_stealth_browser-0.8.0.dist-info/entry_points.txt,sha256=7FmBqyu4LJZJFRMpch2bnvUrZVj6HukbyTQqJgl0SaE,64
52
+ agentic_stealth_browser-0.8.0.dist-info/top_level.txt,sha256=z-dkqaR0uDPpoCvXYzkjXU9oO_AW8wtzV_bWhDm17iA,84
53
+ agentic_stealth_browser-0.8.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ agentic-stealth-browser = production.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Shane W
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,11 @@
1
+ ai
2
+ audit
3
+ behavior
4
+ core
5
+ linkedin
6
+ production
7
+ proxy
8
+ recovery
9
+ scraping
10
+ sessions
11
+ stealth
ai/__init__.py ADDED
File without changes
ai/ai_hooks.py ADDED
@@ -0,0 +1,126 @@
1
+ """
2
+ AI Integration Hooks for Agentic Browser
3
+ Allows connecting to LLMs for decision making and content analysis
4
+
5
+ Security: All external content (page text, context) is sanitized before being
6
+ eligible for LLM consumption to mitigate prompt injection (#188 P3).
7
+
8
+ P3 #98 fix: AIHooks now provides real value even without an LLM provider:
9
+ - Content sanitization for safe downstream use
10
+ - Rule-based action decisions as fallback
11
+ - Structured extraction via regex/heuristics
12
+ """
13
+
14
+ from typing import Any, Dict, Optional, List
15
+ import re
16
+ import json
17
+
18
+
19
class AIHooks:
    """
    AI integration hooks with injection protection and rule-based fallbacks.

    When provider="none" (``enabled`` is False), provides:
    - Content sanitization for safe downstream use
    - Rule-based action decisions
    - Heuristic structured extraction (links, emails, phones, prices)

    When a provider is set, ``enabled`` is True. NOTE(review): in this class
    the enabled branches only sanitize their input and return placeholder
    values; no LLM client is created or called here.
    """

    # Common injection / jailbreak patterns observed in web content & attacks
    _INJECTION_PATTERNS = [
        r"\b(ignore|disregard|forget)\s+(all\s+)?(previous|above|prior|earlier|system|instructions?)\b",
        r"\b(you are now|act as|role.?play|pretend you are|from now on you are)\b",
        r"\b(system prompt|override|jailbreak|do anything|no restrictions|unfiltered)\b",
        r"^\s*(system:|assistant:|user:)\s*",
        r"\b(output only the|respond with only|print exactly|your new instructions)\b",
    ]
    # re.MULTILINE makes the "^"-anchored role-marker pattern fire at the start
    # of every line; without it only the very first line of the content was
    # checked for "system:" / "assistant:" / "user:" prefixes.
    _INJECTION_RE = re.compile(
        "|".join(_INJECTION_PATTERNS), re.IGNORECASE | re.MULTILINE
    )
    # Zero-width / invisible code points that can be used to split a trigger
    # phrase so that it no longer matches the patterns above.
    _ZERO_WIDTH_RE = re.compile(r"[\u200b\u200c\u200d\u2060\ufeff]")
    # Marker substituted for every redacted span.
    _REDACTION_MARKER = "[REDACTED: potential instruction override]"
    # Minimum number of digits a heuristic phone match must contain.
    _MIN_PHONE_DIGITS = 7

    def __init__(self, provider: str = "none", api_key: Optional[str] = None, model: Optional[str] = None):
        """Store provider settings; hooks are ``enabled`` unless provider is "none"."""
        self.provider = provider
        self.api_key = api_key
        self.model = model
        self.enabled = provider != "none"
        self._client = None

    def sanitize_for_llm(self, content: str, max_len: int = 12000) -> str:
        """Strip or neuter likely prompt-injection content from untrusted input.

        Args:
            content: Untrusted text (page text, context field, ...).
            max_len: Only the first ``max_len`` characters are considered.

        Returns:
            The truncated text with zero-width characters removed and every
            pattern match replaced by a redaction marker. Returns "" for
            empty or non-string input.

        This is a practical baseline; real deployments should layer more
        (e.g. LLM guardrails).
        """
        if not content or not isinstance(content, str):
            return ""
        safe = content[:max_len]
        # Remove invisible characters BEFORE pattern matching: otherwise a
        # phrase such as "ig<ZWSP>nore previous ..." slips past the regex and
        # is reassembled into a live instruction once the ZWSP is stripped.
        safe = self._ZERO_WIDTH_RE.sub("", safe)
        return self._INJECTION_RE.sub(self._REDACTION_MARKER, safe)

    def extract_links(self, text: str) -> List[str]:
        """Extract http(s) URLs from text without an LLM."""
        url_pattern = r'https?://[^\s<>"\')\]]+'
        return re.findall(url_pattern, text)

    def extract_emails(self, text: str) -> List[str]:
        """Extract email addresses from text without an LLM."""
        email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
        return re.findall(email_pattern, text)

    async def analyze_page(self, page_content: str, task: str) -> str:
        """Summarize page content (input is always sanitized first).

        With hooks disabled, returns a heuristic report (word, link and email
        counts). With hooks enabled, returns a placeholder string.
        """
        safe_content = self.sanitize_for_llm(page_content)
        if self.enabled:
            return f"[AI Analysis Placeholder] Task: {task} (content sanitized: {len(safe_content)} chars kept)"

        links = self.extract_links(safe_content)
        emails = self.extract_emails(safe_content)
        word_count = len(safe_content.split())
        return (
            f"Heuristic analysis (AI disabled):\n"
            f" Word count: {word_count}\n"
            f" Links found: {len(links)}\n"
            f" Emails found: {len(emails)}\n"
            f" Content sanitized: {len(safe_content)} chars\n"
            f" Task: {task}\n"
            f" Enable AI provider for deeper analysis."
        )

    async def decide_next_action(self, context: Dict) -> str:
        """Decide the next browsing action from a context mapping.

        With hooks disabled the decision is rule based, reading the keys
        ``status``, ``url`` and ``has_content``:
        - status 403/429/503      -> "backoff_and_retry"
        - login/signin in the URL -> "wait_for_user"
        - no content yet          -> "scroll"
        - otherwise               -> "extract_and_continue"

        With hooks enabled, string fields are sanitized and the placeholder
        action "scroll" is returned.

        A ``None`` or empty context is treated as an empty mapping in both
        modes.
        """
        ctx = context or {}
        if not self.enabled:
            # A missing or None URL must not crash the rule-based fallback.
            url = str(ctx.get("url") or "").lower()
            status = ctx.get("status", 200)
            if status in (403, 429, 503):
                return "backoff_and_retry"
            if "login" in url or "signin" in url:
                return "wait_for_user"
            if not ctx.get("has_content", False):
                return "scroll"
            return "extract_and_continue"

        # Sanitized view of the context: this is what a provider call would
        # have to consume. No provider call is implemented in this class, so
        # the value is currently unused.
        safe_ctx = {
            key: self.sanitize_for_llm(value) if isinstance(value, str) else value
            for key, value in ctx.items()
        }
        return "scroll"

    async def extract_structured_data(self, text: str, schema: Dict) -> Dict:
        """Extract structured data from sanitized text.

        With hooks disabled, returns regex-based ``links``, ``emails``,
        ``phones`` and ``prices`` under ``data`` with status
        "heuristic_extraction". ``schema`` is not consulted by the heuristics.

        NOTE(review): with hooks enabled this returns status "ai_disabled"
        and empty data; the label looks inverted but is kept because callers
        may depend on it.
        """
        safe_text = self.sanitize_for_llm(text)
        if self.enabled:
            return {"status": "ai_disabled", "data": {}, "sanitized_len": len(safe_text)}

        # The phone pattern also matches long runs of whitespace/brackets, so
        # keep only candidates that actually contain enough digits.
        phone_candidates = re.findall(r'\+?[\d\s\-\(\)]{10,}', safe_text)
        phones = [
            candidate.strip()
            for candidate in phone_candidates
            if sum(ch.isdigit() for ch in candidate) >= self._MIN_PHONE_DIGITS
        ]
        result = {
            "links": self.extract_links(safe_text),
            "emails": self.extract_emails(safe_text),
            "phones": phones,
            "prices": re.findall(r'\$[\d,]+\.?\d*', safe_text),
        }
        return {"status": "heuristic_extraction", "data": result, "sanitized_len": len(safe_text)}
audit/__init__.py ADDED
File without changes