aiptx-2.0.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of aiptx might be problematic.
- aipt_v2/__init__.py +110 -0
- aipt_v2/__main__.py +24 -0
- aipt_v2/agents/AIPTxAgent/__init__.py +10 -0
- aipt_v2/agents/AIPTxAgent/aiptx_agent.py +211 -0
- aipt_v2/agents/__init__.py +24 -0
- aipt_v2/agents/base.py +520 -0
- aipt_v2/agents/ptt.py +406 -0
- aipt_v2/agents/state.py +168 -0
- aipt_v2/app.py +960 -0
- aipt_v2/browser/__init__.py +31 -0
- aipt_v2/browser/automation.py +458 -0
- aipt_v2/browser/crawler.py +453 -0
- aipt_v2/cli.py +321 -0
- aipt_v2/compliance/__init__.py +71 -0
- aipt_v2/compliance/compliance_report.py +449 -0
- aipt_v2/compliance/framework_mapper.py +424 -0
- aipt_v2/compliance/nist_mapping.py +345 -0
- aipt_v2/compliance/owasp_mapping.py +330 -0
- aipt_v2/compliance/pci_mapping.py +297 -0
- aipt_v2/config.py +288 -0
- aipt_v2/core/__init__.py +43 -0
- aipt_v2/core/agent.py +630 -0
- aipt_v2/core/llm.py +395 -0
- aipt_v2/core/memory.py +305 -0
- aipt_v2/core/ptt.py +329 -0
- aipt_v2/database/__init__.py +14 -0
- aipt_v2/database/models.py +232 -0
- aipt_v2/database/repository.py +384 -0
- aipt_v2/docker/__init__.py +23 -0
- aipt_v2/docker/builder.py +260 -0
- aipt_v2/docker/manager.py +222 -0
- aipt_v2/docker/sandbox.py +371 -0
- aipt_v2/evasion/__init__.py +58 -0
- aipt_v2/evasion/request_obfuscator.py +272 -0
- aipt_v2/evasion/tls_fingerprint.py +285 -0
- aipt_v2/evasion/ua_rotator.py +301 -0
- aipt_v2/evasion/waf_bypass.py +439 -0
- aipt_v2/execution/__init__.py +23 -0
- aipt_v2/execution/executor.py +302 -0
- aipt_v2/execution/parser.py +544 -0
- aipt_v2/execution/terminal.py +337 -0
- aipt_v2/health.py +437 -0
- aipt_v2/intelligence/__init__.py +85 -0
- aipt_v2/intelligence/auth.py +520 -0
- aipt_v2/intelligence/chaining.py +775 -0
- aipt_v2/intelligence/cve_aipt.py +334 -0
- aipt_v2/intelligence/cve_info.py +1111 -0
- aipt_v2/intelligence/rag.py +239 -0
- aipt_v2/intelligence/scope.py +442 -0
- aipt_v2/intelligence/searchers/__init__.py +5 -0
- aipt_v2/intelligence/searchers/exploitdb_searcher.py +523 -0
- aipt_v2/intelligence/searchers/github_searcher.py +467 -0
- aipt_v2/intelligence/searchers/google_searcher.py +281 -0
- aipt_v2/intelligence/tools.json +443 -0
- aipt_v2/intelligence/triage.py +670 -0
- aipt_v2/interface/__init__.py +5 -0
- aipt_v2/interface/cli.py +230 -0
- aipt_v2/interface/main.py +501 -0
- aipt_v2/interface/tui.py +1276 -0
- aipt_v2/interface/utils.py +583 -0
- aipt_v2/llm/__init__.py +39 -0
- aipt_v2/llm/config.py +26 -0
- aipt_v2/llm/llm.py +514 -0
- aipt_v2/llm/memory.py +214 -0
- aipt_v2/llm/request_queue.py +89 -0
- aipt_v2/llm/utils.py +89 -0
- aipt_v2/models/__init__.py +15 -0
- aipt_v2/models/findings.py +295 -0
- aipt_v2/models/phase_result.py +224 -0
- aipt_v2/models/scan_config.py +207 -0
- aipt_v2/monitoring/grafana/dashboards/aipt-dashboard.json +355 -0
- aipt_v2/monitoring/grafana/dashboards/default.yml +17 -0
- aipt_v2/monitoring/grafana/datasources/prometheus.yml +17 -0
- aipt_v2/monitoring/prometheus.yml +60 -0
- aipt_v2/orchestration/__init__.py +52 -0
- aipt_v2/orchestration/pipeline.py +398 -0
- aipt_v2/orchestration/progress.py +300 -0
- aipt_v2/orchestration/scheduler.py +296 -0
- aipt_v2/orchestrator.py +2284 -0
- aipt_v2/payloads/__init__.py +27 -0
- aipt_v2/payloads/cmdi.py +150 -0
- aipt_v2/payloads/sqli.py +263 -0
- aipt_v2/payloads/ssrf.py +204 -0
- aipt_v2/payloads/templates.py +222 -0
- aipt_v2/payloads/traversal.py +166 -0
- aipt_v2/payloads/xss.py +204 -0
- aipt_v2/prompts/__init__.py +60 -0
- aipt_v2/proxy/__init__.py +29 -0
- aipt_v2/proxy/history.py +352 -0
- aipt_v2/proxy/interceptor.py +452 -0
- aipt_v2/recon/__init__.py +44 -0
- aipt_v2/recon/dns.py +241 -0
- aipt_v2/recon/osint.py +367 -0
- aipt_v2/recon/subdomain.py +372 -0
- aipt_v2/recon/tech_detect.py +311 -0
- aipt_v2/reports/__init__.py +17 -0
- aipt_v2/reports/generator.py +313 -0
- aipt_v2/reports/html_report.py +378 -0
- aipt_v2/runtime/__init__.py +44 -0
- aipt_v2/runtime/base.py +30 -0
- aipt_v2/runtime/docker.py +401 -0
- aipt_v2/runtime/local.py +346 -0
- aipt_v2/runtime/tool_server.py +205 -0
- aipt_v2/scanners/__init__.py +28 -0
- aipt_v2/scanners/base.py +273 -0
- aipt_v2/scanners/nikto.py +244 -0
- aipt_v2/scanners/nmap.py +402 -0
- aipt_v2/scanners/nuclei.py +273 -0
- aipt_v2/scanners/web.py +454 -0
- aipt_v2/scripts/security_audit.py +366 -0
- aipt_v2/telemetry/__init__.py +7 -0
- aipt_v2/telemetry/tracer.py +347 -0
- aipt_v2/terminal/__init__.py +28 -0
- aipt_v2/terminal/executor.py +400 -0
- aipt_v2/terminal/sandbox.py +350 -0
- aipt_v2/tools/__init__.py +44 -0
- aipt_v2/tools/active_directory/__init__.py +78 -0
- aipt_v2/tools/active_directory/ad_config.py +238 -0
- aipt_v2/tools/active_directory/bloodhound_wrapper.py +447 -0
- aipt_v2/tools/active_directory/kerberos_attacks.py +430 -0
- aipt_v2/tools/active_directory/ldap_enum.py +533 -0
- aipt_v2/tools/active_directory/smb_attacks.py +505 -0
- aipt_v2/tools/agents_graph/__init__.py +19 -0
- aipt_v2/tools/agents_graph/agents_graph_actions.py +69 -0
- aipt_v2/tools/api_security/__init__.py +76 -0
- aipt_v2/tools/api_security/api_discovery.py +608 -0
- aipt_v2/tools/api_security/graphql_scanner.py +622 -0
- aipt_v2/tools/api_security/jwt_analyzer.py +577 -0
- aipt_v2/tools/api_security/openapi_fuzzer.py +761 -0
- aipt_v2/tools/browser/__init__.py +5 -0
- aipt_v2/tools/browser/browser_actions.py +238 -0
- aipt_v2/tools/browser/browser_instance.py +535 -0
- aipt_v2/tools/browser/tab_manager.py +344 -0
- aipt_v2/tools/cloud/__init__.py +70 -0
- aipt_v2/tools/cloud/cloud_config.py +273 -0
- aipt_v2/tools/cloud/cloud_scanner.py +639 -0
- aipt_v2/tools/cloud/prowler_tool.py +571 -0
- aipt_v2/tools/cloud/scoutsuite_tool.py +359 -0
- aipt_v2/tools/executor.py +307 -0
- aipt_v2/tools/parser.py +408 -0
- aipt_v2/tools/proxy/__init__.py +5 -0
- aipt_v2/tools/proxy/proxy_actions.py +103 -0
- aipt_v2/tools/proxy/proxy_manager.py +789 -0
- aipt_v2/tools/registry.py +196 -0
- aipt_v2/tools/scanners/__init__.py +343 -0
- aipt_v2/tools/scanners/acunetix_tool.py +712 -0
- aipt_v2/tools/scanners/burp_tool.py +631 -0
- aipt_v2/tools/scanners/config.py +156 -0
- aipt_v2/tools/scanners/nessus_tool.py +588 -0
- aipt_v2/tools/scanners/zap_tool.py +612 -0
- aipt_v2/tools/terminal/__init__.py +5 -0
- aipt_v2/tools/terminal/terminal_actions.py +37 -0
- aipt_v2/tools/terminal/terminal_manager.py +153 -0
- aipt_v2/tools/terminal/terminal_session.py +449 -0
- aipt_v2/tools/tool_processing.py +108 -0
- aipt_v2/utils/__init__.py +17 -0
- aipt_v2/utils/logging.py +201 -0
- aipt_v2/utils/model_manager.py +187 -0
- aipt_v2/utils/searchers/__init__.py +269 -0
- aiptx-2.0.2.dist-info/METADATA +324 -0
- aiptx-2.0.2.dist-info/RECORD +165 -0
- aiptx-2.0.2.dist-info/WHEEL +5 -0
- aiptx-2.0.2.dist-info/entry_points.txt +7 -0
- aiptx-2.0.2.dist-info/licenses/LICENSE +21 -0
- aiptx-2.0.2.dist-info/top_level.txt +1 -0
aipt_v2/browser/crawler.py
@@ -0,0 +1,453 @@
"""
AIPT Web Crawler

Intelligent web crawling for security assessment.
"""
from __future__ import annotations

import asyncio
import logging
import re
from collections import deque
from dataclasses import dataclass, field
from datetime import datetime
from typing import Callable, Optional, Set
from urllib.parse import urljoin, urlparse

import httpx

logger = logging.getLogger(__name__)


@dataclass
class CrawlConfig:
    """Web crawler configuration"""
    max_depth: int = 3
    max_pages: int = 100
    max_concurrent: int = 5
    timeout: float = 30.0
    delay_between_requests: float = 0.5

    # Scope
    stay_in_scope: bool = True
    allowed_domains: list[str] = field(default_factory=list)
    excluded_patterns: list[str] = field(default_factory=lambda: [
        r"\.(jpg|jpeg|png|gif|svg|ico|css|js|woff|woff2|ttf|eot)$",
        r"/logout",
        r"/signout",
        r"#",
    ])

    # Authentication
    cookies: dict[str, str] = field(default_factory=dict)
    headers: dict[str, str] = field(default_factory=dict)

    # User agent
    user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"

    # Content
    follow_redirects: bool = True
    parse_forms: bool = True
    parse_scripts: bool = True


@dataclass
class CrawledPage:
    """Information about a crawled page"""
    url: str
    status_code: int = 0
    content_type: str = ""
    title: str = ""
    forms: list[dict] = field(default_factory=list)
    links: list[str] = field(default_factory=list)
    scripts: list[str] = field(default_factory=list)
    parameters: list[dict] = field(default_factory=list)  # GET/POST params found
    depth: int = 0
    parent_url: str = ""
    crawl_time: float = 0.0
    error: Optional[str] = None


@dataclass
class CrawlResult:
    """Complete crawl results"""
    target: str
    pages: list[CrawledPage] = field(default_factory=list)
    total_urls_found: int = 0
    total_forms_found: int = 0
    total_parameters_found: int = 0
    unique_domains: set = field(default_factory=set)
    start_time: Optional[datetime] = None
    end_time: Optional[datetime] = None
    duration_seconds: float = 0.0

    def get_all_urls(self) -> list[str]:
        """Get all discovered URLs"""
        return [p.url for p in self.pages]

    def get_all_forms(self) -> list[dict]:
        """Get all discovered forms"""
        forms = []
        for page in self.pages:
            for form in page.forms:
                forms.append({"page": page.url, **form})
        return forms

    def get_all_parameters(self) -> list[dict]:
        """Get all discovered parameters"""
        params = []
        for page in self.pages:
            for param in page.parameters:
                params.append({"page": page.url, **param})
        return params

    def to_dict(self) -> dict:
        return {
            "target": self.target,
            "pages_crawled": len(self.pages),
            "total_urls_found": self.total_urls_found,
            "total_forms_found": self.total_forms_found,
            "total_parameters_found": self.total_parameters_found,
            "unique_domains": list(self.unique_domains),
            "duration_seconds": self.duration_seconds,
        }


class WebCrawler:
    """
    Web crawler for security assessment.

    Features:
    - Breadth-first crawling
    - Concurrent requests
    - Form/parameter discovery
    - Scope enforcement
    - Rate limiting

    Example:
        crawler = WebCrawler(CrawlConfig(max_depth=3))
        result = await crawler.crawl("https://target.com")

        # Get all forms for testing
        for form in result.get_all_forms():
            print(f"Form at {form['page']}: {form['action']}")
    """

    def __init__(self, config: Optional[CrawlConfig] = None):
        self.config = config or CrawlConfig()
        self._visited: Set[str] = set()
        self._queue: deque = deque()
        self._results: list[CrawledPage] = []
        self._semaphore: Optional[asyncio.Semaphore] = None
        self._client: Optional[httpx.AsyncClient] = None
        self._callback: Optional[Callable[[CrawledPage], None]] = None

    async def crawl(
        self,
        start_url: str,
        callback: Optional[Callable[[CrawledPage], None]] = None,
    ) -> CrawlResult:
        """
        Start crawling from URL.

        Args:
            start_url: Starting URL
            callback: Optional callback for each crawled page

        Returns:
            CrawlResult with all discoveries
        """
        self._callback = callback
        self._visited.clear()
        self._results.clear()

        result = CrawlResult(target=start_url)
        result.start_time = datetime.utcnow()

        # Parse start URL for domain
        parsed = urlparse(start_url)
        base_domain = parsed.netloc

        if not self.config.allowed_domains:
            self.config.allowed_domains = [base_domain]

        # Initialize client
        headers = {"User-Agent": self.config.user_agent}
        headers.update(self.config.headers)

        self._client = httpx.AsyncClient(
            timeout=self.config.timeout,
            follow_redirects=self.config.follow_redirects,
            headers=headers,
            cookies=self.config.cookies,
            verify=False,  # For testing sites with self-signed certs
        )

        self._semaphore = asyncio.Semaphore(self.config.max_concurrent)

        # Add start URL to queue
        self._queue.append((start_url, 0, ""))

        try:
            # Process queue
            while self._queue and len(self._results) < self.config.max_pages:
                # Get batch of URLs
                batch = []
                while self._queue and len(batch) < self.config.max_concurrent:
                    url, depth, parent = self._queue.popleft()
                    normalized = self._normalize_url(url)

                    if normalized not in self._visited and depth <= self.config.max_depth:
                        self._visited.add(normalized)
                        batch.append((url, depth, parent))

                if not batch:
                    break

                # Crawl batch concurrently
                tasks = [
                    self._crawl_page(url, depth, parent)
                    for url, depth, parent in batch
                ]
                await asyncio.gather(*tasks)

                # Rate limiting
                if self.config.delay_between_requests > 0:
                    await asyncio.sleep(self.config.delay_between_requests)

        finally:
            await self._client.aclose()

        # Compile results
        result.pages = self._results
        result.end_time = datetime.utcnow()
        result.duration_seconds = (result.end_time - result.start_time).total_seconds()

        for page in self._results:
            result.total_urls_found += len(page.links)
            result.total_forms_found += len(page.forms)
            result.total_parameters_found += len(page.parameters)
            result.unique_domains.add(urlparse(page.url).netloc)

        logger.info(
            f"Crawl complete: {len(result.pages)} pages, "
            f"{result.total_forms_found} forms, "
            f"{result.total_parameters_found} parameters"
        )

        return result

    async def _crawl_page(self, url: str, depth: int, parent: str) -> None:
        """Crawl a single page"""
        async with self._semaphore:
            page = CrawledPage(url=url, depth=depth, parent_url=parent)
            start_time = datetime.utcnow()

            try:
                response = await self._client.get(url)
                page.status_code = response.status_code
                page.content_type = response.headers.get("content-type", "")

                # Only parse HTML
                if "text/html" not in page.content_type:
                    self._results.append(page)
                    return

                content = response.text

                # Extract title
                title_match = re.search(r"<title[^>]*>([^<]+)</title>", content, re.IGNORECASE)
                if title_match:
                    page.title = title_match.group(1).strip()

                # Extract links
                page.links = self._extract_links(content, url)

                # Extract forms
                if self.config.parse_forms:
                    page.forms = self._extract_forms(content, url)
                    page.parameters.extend(self._extract_form_params(page.forms))

                # Extract scripts
                if self.config.parse_scripts:
                    page.scripts = self._extract_scripts(content, url)

                # Extract URL parameters
                page.parameters.extend(self._extract_url_params(url))

                # Add new links to queue
                for link in page.links:
                    if self._should_crawl(link):
                        self._queue.append((link, depth + 1, url))

            except httpx.TimeoutException:
                page.error = "Timeout"
            except httpx.RequestError as e:
                page.error = str(e)
            except Exception as e:
                page.error = f"Error: {str(e)}"
            finally:
                page.crawl_time = (datetime.utcnow() - start_time).total_seconds()

            self._results.append(page)

            if self._callback:
                self._callback(page)

            logger.debug(f"Crawled: {url} (depth={depth}, status={page.status_code})")

    def _extract_links(self, html: str, base_url: str) -> list[str]:
        """Extract links from HTML"""
        links = []
        # href links
        href_pattern = r'href=["\']([^"\']+)["\']'
        for match in re.finditer(href_pattern, html, re.IGNORECASE):
            href = match.group(1)
            if not href.startswith(("javascript:", "mailto:", "tel:", "#")):
                full_url = urljoin(base_url, href)
                if full_url not in links:
                    links.append(full_url)

        # src links (for images/scripts that might reveal paths)
        src_pattern = r'src=["\']([^"\']+)["\']'
        for match in re.finditer(src_pattern, html, re.IGNORECASE):
            src = match.group(1)
            if not src.startswith("data:"):
                full_url = urljoin(base_url, src)
                if full_url not in links:
                    links.append(full_url)

        return links

    def _extract_forms(self, html: str, base_url: str) -> list[dict]:
        """Extract forms from HTML"""
        forms = []
        form_pattern = r'<form[^>]*>(.*?)</form>'

        for form_match in re.finditer(form_pattern, html, re.IGNORECASE | re.DOTALL):
            form_html = form_match.group(0)

            # Extract form attributes
            action_match = re.search(r'action=["\']([^"\']*)["\']', form_html, re.IGNORECASE)
            method_match = re.search(r'method=["\']([^"\']*)["\']', form_html, re.IGNORECASE)

            action = action_match.group(1) if action_match else ""
            method = method_match.group(1).upper() if method_match else "GET"

            # Extract inputs
            inputs = []
            input_pattern = r'<input[^>]*>'
            for input_match in re.finditer(input_pattern, form_html, re.IGNORECASE):
                input_tag = input_match.group(0)

                name_match = re.search(r'name=["\']([^"\']*)["\']', input_tag, re.IGNORECASE)
                type_match = re.search(r'type=["\']([^"\']*)["\']', input_tag, re.IGNORECASE)
                value_match = re.search(r'value=["\']([^"\']*)["\']', input_tag, re.IGNORECASE)

                inputs.append({
                    "name": name_match.group(1) if name_match else "",
                    "type": type_match.group(1) if type_match else "text",
                    "value": value_match.group(1) if value_match else "",
                })

            # Extract textareas
            textarea_pattern = r'<textarea[^>]*name=["\']([^"\']*)["\'][^>]*>'
            for ta_match in re.finditer(textarea_pattern, form_html, re.IGNORECASE):
                inputs.append({
                    "name": ta_match.group(1),
                    "type": "textarea",
                    "value": "",
                })

            # Extract selects
            select_pattern = r'<select[^>]*name=["\']([^"\']*)["\'][^>]*>'
            for sel_match in re.finditer(select_pattern, form_html, re.IGNORECASE):
                inputs.append({
                    "name": sel_match.group(1),
                    "type": "select",
                    "value": "",
                })

            forms.append({
                "action": urljoin(base_url, action) if action else base_url,
                "method": method,
                "inputs": inputs,
            })

        return forms

    def _extract_scripts(self, html: str, base_url: str) -> list[str]:
        """Extract script URLs"""
        scripts = []
        pattern = r'<script[^>]*src=["\']([^"\']+)["\'][^>]*>'

        for match in re.finditer(pattern, html, re.IGNORECASE):
            src = match.group(1)
            full_url = urljoin(base_url, src)
            scripts.append(full_url)

        return scripts

    def _extract_form_params(self, forms: list[dict]) -> list[dict]:
        """Extract parameters from forms"""
        params = []
        for form in forms:
            for inp in form.get("inputs", []):
                if inp.get("name"):
                    params.append({
                        "name": inp["name"],
                        "type": inp["type"],
                        "method": form["method"],
                        "location": form["action"],
                    })
        return params

    def _extract_url_params(self, url: str) -> list[dict]:
        """Extract GET parameters from URL"""
        params = []
        parsed = urlparse(url)
        if parsed.query:
            for pair in parsed.query.split("&"):
                if "=" in pair:
                    name, _ = pair.split("=", 1)
                    params.append({
                        "name": name,
                        "type": "url",
                        "method": "GET",
                        "location": url,
                    })
        return params

    def _should_crawl(self, url: str) -> bool:
        """Check if URL should be crawled"""
        # Check exclusion patterns
        for pattern in self.config.excluded_patterns:
            if re.search(pattern, url, re.IGNORECASE):
                return False

        # Check scope
        if self.config.stay_in_scope:
            parsed = urlparse(url)
            domain = parsed.netloc

            in_scope = False
            for allowed in self.config.allowed_domains:
                if domain == allowed or domain.endswith("." + allowed):
                    in_scope = True
                    break

            if not in_scope:
                return False

        # Check if already visited
        normalized = self._normalize_url(url)
        if normalized in self._visited:
            return False

        return True

    def _normalize_url(self, url: str) -> str:
        """Normalize URL for comparison"""
        parsed = urlparse(url)
        # Remove fragment and normalize
        return f"{parsed.scheme}://{parsed.netloc}{parsed.path}".rstrip("/").lower()
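For orientation, below is a minimal invocation sketch for the crawler shown in this diff. It is not part of the package contents; it assumes the module is importable as aipt_v2.browser.crawler (matching the file path in the listing above), and the target URL, depth, and page limits are placeholder values.

# Minimal usage sketch for the WebCrawler in this diff (not part of the package).
# Assumes the import path aipt_v2.browser.crawler; target and limits are placeholders.
import asyncio

from aipt_v2.browser.crawler import CrawlConfig, CrawledPage, WebCrawler


def on_page(page: CrawledPage) -> None:
    # Per-page callback invoked by the crawler as each page completes.
    print(f"{page.status_code} {page.url} (depth={page.depth})")


async def main() -> None:
    config = CrawlConfig(max_depth=2, max_pages=25, delay_between_requests=1.0)
    crawler = WebCrawler(config)

    result = await crawler.crawl("https://example.com", callback=on_page)

    # Summary counters and discovered forms, as exposed by CrawlResult.
    print(result.to_dict())
    for form in result.get_all_forms():
        print(f"Form at {form['page']}: {form['method']} {form['action']}")


if __name__ == "__main__":
    asyncio.run(main())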