aiptx-2.0.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of aiptx has been flagged as potentially problematic.

Files changed (165)
  1. aipt_v2/__init__.py +110 -0
  2. aipt_v2/__main__.py +24 -0
  3. aipt_v2/agents/AIPTxAgent/__init__.py +10 -0
  4. aipt_v2/agents/AIPTxAgent/aiptx_agent.py +211 -0
  5. aipt_v2/agents/__init__.py +24 -0
  6. aipt_v2/agents/base.py +520 -0
  7. aipt_v2/agents/ptt.py +406 -0
  8. aipt_v2/agents/state.py +168 -0
  9. aipt_v2/app.py +960 -0
  10. aipt_v2/browser/__init__.py +31 -0
  11. aipt_v2/browser/automation.py +458 -0
  12. aipt_v2/browser/crawler.py +453 -0
  13. aipt_v2/cli.py +321 -0
  14. aipt_v2/compliance/__init__.py +71 -0
  15. aipt_v2/compliance/compliance_report.py +449 -0
  16. aipt_v2/compliance/framework_mapper.py +424 -0
  17. aipt_v2/compliance/nist_mapping.py +345 -0
  18. aipt_v2/compliance/owasp_mapping.py +330 -0
  19. aipt_v2/compliance/pci_mapping.py +297 -0
  20. aipt_v2/config.py +288 -0
  21. aipt_v2/core/__init__.py +43 -0
  22. aipt_v2/core/agent.py +630 -0
  23. aipt_v2/core/llm.py +395 -0
  24. aipt_v2/core/memory.py +305 -0
  25. aipt_v2/core/ptt.py +329 -0
  26. aipt_v2/database/__init__.py +14 -0
  27. aipt_v2/database/models.py +232 -0
  28. aipt_v2/database/repository.py +384 -0
  29. aipt_v2/docker/__init__.py +23 -0
  30. aipt_v2/docker/builder.py +260 -0
  31. aipt_v2/docker/manager.py +222 -0
  32. aipt_v2/docker/sandbox.py +371 -0
  33. aipt_v2/evasion/__init__.py +58 -0
  34. aipt_v2/evasion/request_obfuscator.py +272 -0
  35. aipt_v2/evasion/tls_fingerprint.py +285 -0
  36. aipt_v2/evasion/ua_rotator.py +301 -0
  37. aipt_v2/evasion/waf_bypass.py +439 -0
  38. aipt_v2/execution/__init__.py +23 -0
  39. aipt_v2/execution/executor.py +302 -0
  40. aipt_v2/execution/parser.py +544 -0
  41. aipt_v2/execution/terminal.py +337 -0
  42. aipt_v2/health.py +437 -0
  43. aipt_v2/intelligence/__init__.py +85 -0
  44. aipt_v2/intelligence/auth.py +520 -0
  45. aipt_v2/intelligence/chaining.py +775 -0
  46. aipt_v2/intelligence/cve_aipt.py +334 -0
  47. aipt_v2/intelligence/cve_info.py +1111 -0
  48. aipt_v2/intelligence/rag.py +239 -0
  49. aipt_v2/intelligence/scope.py +442 -0
  50. aipt_v2/intelligence/searchers/__init__.py +5 -0
  51. aipt_v2/intelligence/searchers/exploitdb_searcher.py +523 -0
  52. aipt_v2/intelligence/searchers/github_searcher.py +467 -0
  53. aipt_v2/intelligence/searchers/google_searcher.py +281 -0
  54. aipt_v2/intelligence/tools.json +443 -0
  55. aipt_v2/intelligence/triage.py +670 -0
  56. aipt_v2/interface/__init__.py +5 -0
  57. aipt_v2/interface/cli.py +230 -0
  58. aipt_v2/interface/main.py +501 -0
  59. aipt_v2/interface/tui.py +1276 -0
  60. aipt_v2/interface/utils.py +583 -0
  61. aipt_v2/llm/__init__.py +39 -0
  62. aipt_v2/llm/config.py +26 -0
  63. aipt_v2/llm/llm.py +514 -0
  64. aipt_v2/llm/memory.py +214 -0
  65. aipt_v2/llm/request_queue.py +89 -0
  66. aipt_v2/llm/utils.py +89 -0
  67. aipt_v2/models/__init__.py +15 -0
  68. aipt_v2/models/findings.py +295 -0
  69. aipt_v2/models/phase_result.py +224 -0
  70. aipt_v2/models/scan_config.py +207 -0
  71. aipt_v2/monitoring/grafana/dashboards/aipt-dashboard.json +355 -0
  72. aipt_v2/monitoring/grafana/dashboards/default.yml +17 -0
  73. aipt_v2/monitoring/grafana/datasources/prometheus.yml +17 -0
  74. aipt_v2/monitoring/prometheus.yml +60 -0
  75. aipt_v2/orchestration/__init__.py +52 -0
  76. aipt_v2/orchestration/pipeline.py +398 -0
  77. aipt_v2/orchestration/progress.py +300 -0
  78. aipt_v2/orchestration/scheduler.py +296 -0
  79. aipt_v2/orchestrator.py +2284 -0
  80. aipt_v2/payloads/__init__.py +27 -0
  81. aipt_v2/payloads/cmdi.py +150 -0
  82. aipt_v2/payloads/sqli.py +263 -0
  83. aipt_v2/payloads/ssrf.py +204 -0
  84. aipt_v2/payloads/templates.py +222 -0
  85. aipt_v2/payloads/traversal.py +166 -0
  86. aipt_v2/payloads/xss.py +204 -0
  87. aipt_v2/prompts/__init__.py +60 -0
  88. aipt_v2/proxy/__init__.py +29 -0
  89. aipt_v2/proxy/history.py +352 -0
  90. aipt_v2/proxy/interceptor.py +452 -0
  91. aipt_v2/recon/__init__.py +44 -0
  92. aipt_v2/recon/dns.py +241 -0
  93. aipt_v2/recon/osint.py +367 -0
  94. aipt_v2/recon/subdomain.py +372 -0
  95. aipt_v2/recon/tech_detect.py +311 -0
  96. aipt_v2/reports/__init__.py +17 -0
  97. aipt_v2/reports/generator.py +313 -0
  98. aipt_v2/reports/html_report.py +378 -0
  99. aipt_v2/runtime/__init__.py +44 -0
  100. aipt_v2/runtime/base.py +30 -0
  101. aipt_v2/runtime/docker.py +401 -0
  102. aipt_v2/runtime/local.py +346 -0
  103. aipt_v2/runtime/tool_server.py +205 -0
  104. aipt_v2/scanners/__init__.py +28 -0
  105. aipt_v2/scanners/base.py +273 -0
  106. aipt_v2/scanners/nikto.py +244 -0
  107. aipt_v2/scanners/nmap.py +402 -0
  108. aipt_v2/scanners/nuclei.py +273 -0
  109. aipt_v2/scanners/web.py +454 -0
  110. aipt_v2/scripts/security_audit.py +366 -0
  111. aipt_v2/telemetry/__init__.py +7 -0
  112. aipt_v2/telemetry/tracer.py +347 -0
  113. aipt_v2/terminal/__init__.py +28 -0
  114. aipt_v2/terminal/executor.py +400 -0
  115. aipt_v2/terminal/sandbox.py +350 -0
  116. aipt_v2/tools/__init__.py +44 -0
  117. aipt_v2/tools/active_directory/__init__.py +78 -0
  118. aipt_v2/tools/active_directory/ad_config.py +238 -0
  119. aipt_v2/tools/active_directory/bloodhound_wrapper.py +447 -0
  120. aipt_v2/tools/active_directory/kerberos_attacks.py +430 -0
  121. aipt_v2/tools/active_directory/ldap_enum.py +533 -0
  122. aipt_v2/tools/active_directory/smb_attacks.py +505 -0
  123. aipt_v2/tools/agents_graph/__init__.py +19 -0
  124. aipt_v2/tools/agents_graph/agents_graph_actions.py +69 -0
  125. aipt_v2/tools/api_security/__init__.py +76 -0
  126. aipt_v2/tools/api_security/api_discovery.py +608 -0
  127. aipt_v2/tools/api_security/graphql_scanner.py +622 -0
  128. aipt_v2/tools/api_security/jwt_analyzer.py +577 -0
  129. aipt_v2/tools/api_security/openapi_fuzzer.py +761 -0
  130. aipt_v2/tools/browser/__init__.py +5 -0
  131. aipt_v2/tools/browser/browser_actions.py +238 -0
  132. aipt_v2/tools/browser/browser_instance.py +535 -0
  133. aipt_v2/tools/browser/tab_manager.py +344 -0
  134. aipt_v2/tools/cloud/__init__.py +70 -0
  135. aipt_v2/tools/cloud/cloud_config.py +273 -0
  136. aipt_v2/tools/cloud/cloud_scanner.py +639 -0
  137. aipt_v2/tools/cloud/prowler_tool.py +571 -0
  138. aipt_v2/tools/cloud/scoutsuite_tool.py +359 -0
  139. aipt_v2/tools/executor.py +307 -0
  140. aipt_v2/tools/parser.py +408 -0
  141. aipt_v2/tools/proxy/__init__.py +5 -0
  142. aipt_v2/tools/proxy/proxy_actions.py +103 -0
  143. aipt_v2/tools/proxy/proxy_manager.py +789 -0
  144. aipt_v2/tools/registry.py +196 -0
  145. aipt_v2/tools/scanners/__init__.py +343 -0
  146. aipt_v2/tools/scanners/acunetix_tool.py +712 -0
  147. aipt_v2/tools/scanners/burp_tool.py +631 -0
  148. aipt_v2/tools/scanners/config.py +156 -0
  149. aipt_v2/tools/scanners/nessus_tool.py +588 -0
  150. aipt_v2/tools/scanners/zap_tool.py +612 -0
  151. aipt_v2/tools/terminal/__init__.py +5 -0
  152. aipt_v2/tools/terminal/terminal_actions.py +37 -0
  153. aipt_v2/tools/terminal/terminal_manager.py +153 -0
  154. aipt_v2/tools/terminal/terminal_session.py +449 -0
  155. aipt_v2/tools/tool_processing.py +108 -0
  156. aipt_v2/utils/__init__.py +17 -0
  157. aipt_v2/utils/logging.py +201 -0
  158. aipt_v2/utils/model_manager.py +187 -0
  159. aipt_v2/utils/searchers/__init__.py +269 -0
  160. aiptx-2.0.2.dist-info/METADATA +324 -0
  161. aiptx-2.0.2.dist-info/RECORD +165 -0
  162. aiptx-2.0.2.dist-info/WHEEL +5 -0
  163. aiptx-2.0.2.dist-info/entry_points.txt +7 -0
  164. aiptx-2.0.2.dist-info/licenses/LICENSE +21 -0
  165. aiptx-2.0.2.dist-info/top_level.txt +1 -0
aipt_v2/browser/crawler.py (new file)
@@ -0,0 +1,453 @@
+"""
+AIPT Web Crawler
+
+Intelligent web crawling for security assessment.
+"""
+from __future__ import annotations
+
+import asyncio
+import logging
+import re
+from collections import deque
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Callable, Optional, Set
+from urllib.parse import urljoin, urlparse
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class CrawlConfig:
+    """Web crawler configuration"""
+    max_depth: int = 3
+    max_pages: int = 100
+    max_concurrent: int = 5
+    timeout: float = 30.0
+    delay_between_requests: float = 0.5
+
+    # Scope
+    stay_in_scope: bool = True
+    allowed_domains: list[str] = field(default_factory=list)
+    excluded_patterns: list[str] = field(default_factory=lambda: [
+        r"\.(jpg|jpeg|png|gif|svg|ico|css|js|woff|woff2|ttf|eot)$",
+        r"/logout",
+        r"/signout",
+        r"#",
+    ])
+
+    # Authentication
+    cookies: dict[str, str] = field(default_factory=dict)
+    headers: dict[str, str] = field(default_factory=dict)
+
+    # User agent
+    user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+
+    # Content
+    follow_redirects: bool = True
+    parse_forms: bool = True
+    parse_scripts: bool = True
+
+
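For orientation, a minimal sketch of how these fields are typically filled in; the host, cookie, and header values below are invented for the example and are not part of the package:

    from aipt_v2.browser.crawler import CrawlConfig

    # Shallow, rate-limited crawl of a single authorized host
    config = CrawlConfig(
        max_depth=2,
        max_pages=50,
        delay_between_requests=1.0,
        allowed_domains=["app.example.com"],  # with stay_in_scope=True (the default), the crawl
                                              # stays on this host and its subdomains
        cookies={"session": "redacted"},      # passed straight to the shared httpx.AsyncClient
        headers={"X-Assessment-Id": "demo"},  # merged on top of the default User-Agent header
    )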
+@dataclass
+class CrawledPage:
+    """Information about a crawled page"""
+    url: str
+    status_code: int = 0  # default of 0 so pages that fail before a response can still be recorded
+    content_type: str = ""
+    title: str = ""
+    forms: list[dict] = field(default_factory=list)
+    links: list[str] = field(default_factory=list)
+    scripts: list[str] = field(default_factory=list)
+    parameters: list[dict] = field(default_factory=list)  # GET/POST params found
+    depth: int = 0
+    parent_url: str = ""
+    crawl_time: float = 0.0
+    error: Optional[str] = None
+
+
+@dataclass
+class CrawlResult:
+    """Complete crawl results"""
+    target: str
+    pages: list[CrawledPage] = field(default_factory=list)
+    total_urls_found: int = 0
+    total_forms_found: int = 0
+    total_parameters_found: int = 0
+    unique_domains: set = field(default_factory=set)
+    start_time: Optional[datetime] = None
+    end_time: Optional[datetime] = None
+    duration_seconds: float = 0.0
+
+    def get_all_urls(self) -> list[str]:
+        """Get all discovered URLs"""
+        return [p.url for p in self.pages]
+
+    def get_all_forms(self) -> list[dict]:
+        """Get all discovered forms"""
+        forms = []
+        for page in self.pages:
+            for form in page.forms:
+                forms.append({"page": page.url, **form})
+        return forms
+
+    def get_all_parameters(self) -> list[dict]:
+        """Get all discovered parameters"""
+        params = []
+        for page in self.pages:
+            for param in page.parameters:
+                params.append({"page": page.url, **param})
+        return params
+
+    def to_dict(self) -> dict:
+        return {
+            "target": self.target,
+            "pages_crawled": len(self.pages),
+            "total_urls_found": self.total_urls_found,
+            "total_forms_found": self.total_forms_found,
+            "total_parameters_found": self.total_parameters_found,
+            "unique_domains": list(self.unique_domains),
+            "duration_seconds": self.duration_seconds,
+        }
+
+
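The helpers above are the main way downstream code consumes a crawl; a short sketch, assuming `result` is a CrawlResult returned by WebCrawler.crawl (defined next):

    import json

    # Summary counters as a JSON-serializable dict
    print(json.dumps(result.to_dict(), indent=2))

    # Flattened views: every entry carries the page it was discovered on
    for form in result.get_all_forms():
        print(form["page"], form["method"], form["action"])

    for param in result.get_all_parameters():
        print(param["name"], param["method"], param["location"])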
+class WebCrawler:
+    """
+    Web crawler for security assessment.
+
+    Features:
+    - Breadth-first crawling
+    - Concurrent requests
+    - Form/parameter discovery
+    - Scope enforcement
+    - Rate limiting
+
+    Example:
+        crawler = WebCrawler(CrawlConfig(max_depth=3))
+        result = await crawler.crawl("https://target.com")
+
+        # Get all forms for testing
+        for form in result.get_all_forms():
+            print(f"Form at {form['page']}: {form['action']}")
+    """
+
+    def __init__(self, config: Optional[CrawlConfig] = None):
+        self.config = config or CrawlConfig()
+        self._visited: Set[str] = set()
+        self._queue: deque = deque()
+        self._results: list[CrawledPage] = []
+        self._semaphore: Optional[asyncio.Semaphore] = None
+        self._client: Optional[httpx.AsyncClient] = None
+        self._callback: Optional[Callable[[CrawledPage], None]] = None
+
+    async def crawl(
+        self,
+        start_url: str,
+        callback: Optional[Callable[[CrawledPage], None]] = None,
+    ) -> CrawlResult:
+        """
+        Start crawling from URL.
+
+        Args:
+            start_url: Starting URL
+            callback: Optional callback for each crawled page
+
+        Returns:
+            CrawlResult with all discoveries
+        """
+        self._callback = callback
+        self._visited.clear()
+        self._results.clear()
+
+        result = CrawlResult(target=start_url)
+        result.start_time = datetime.utcnow()
+
+        # Parse start URL for domain
+        parsed = urlparse(start_url)
+        base_domain = parsed.netloc
+
+        if not self.config.allowed_domains:
+            self.config.allowed_domains = [base_domain]
+
+        # Initialize client
+        headers = {"User-Agent": self.config.user_agent}
+        headers.update(self.config.headers)
+
+        self._client = httpx.AsyncClient(
+            timeout=self.config.timeout,
+            follow_redirects=self.config.follow_redirects,
+            headers=headers,
+            cookies=self.config.cookies,
+            verify=False,  # For testing sites with self-signed certs
+        )
+
+        self._semaphore = asyncio.Semaphore(self.config.max_concurrent)
+
+        # Add start URL to queue
+        self._queue.append((start_url, 0, ""))
+
+        try:
+            # Process queue
+            while self._queue and len(self._results) < self.config.max_pages:
+                # Get batch of URLs
+                batch = []
+                while self._queue and len(batch) < self.config.max_concurrent:
+                    url, depth, parent = self._queue.popleft()
+                    normalized = self._normalize_url(url)
+
+                    if normalized not in self._visited and depth <= self.config.max_depth:
+                        self._visited.add(normalized)
+                        batch.append((url, depth, parent))
+
+                if not batch:
+                    break
+
+                # Crawl batch concurrently
+                tasks = [
+                    self._crawl_page(url, depth, parent)
+                    for url, depth, parent in batch
+                ]
+                await asyncio.gather(*tasks)
+
+                # Rate limiting
+                if self.config.delay_between_requests > 0:
+                    await asyncio.sleep(self.config.delay_between_requests)
+
+        finally:
+            await self._client.aclose()
+
+        # Compile results
+        result.pages = self._results
+        result.end_time = datetime.utcnow()
+        result.duration_seconds = (result.end_time - result.start_time).total_seconds()
+
+        for page in self._results:
+            result.total_urls_found += len(page.links)
+            result.total_forms_found += len(page.forms)
+            result.total_parameters_found += len(page.parameters)
+            result.unique_domains.add(urlparse(page.url).netloc)
+
+        logger.info(
+            f"Crawl complete: {len(result.pages)} pages, "
+            f"{result.total_forms_found} forms, "
+            f"{result.total_parameters_found} parameters"
+        )
+
+        return result
+
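The `callback` argument fires once per crawled page, which makes it a natural place for progress reporting; a runnable sketch, assuming an authorized target (the staging host below is a placeholder):

    import asyncio

    def on_page(page: CrawledPage) -> None:
        # Invoked synchronously from _crawl_page after each page finishes
        status = page.error or page.status_code
        print(f"[{status}] depth={page.depth} {page.url}")

    async def main() -> None:
        crawler = WebCrawler(CrawlConfig(max_depth=2, max_pages=25))
        result = await crawler.crawl("https://staging.example.com", callback=on_page)
        print(result.to_dict())

    asyncio.run(main())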
+    async def _crawl_page(self, url: str, depth: int, parent: str) -> None:
+        """Crawl a single page"""
+        async with self._semaphore:
+            page = CrawledPage(url=url, depth=depth, parent_url=parent)
+            start_time = datetime.utcnow()
+
+            try:
+                response = await self._client.get(url)
+                page.status_code = response.status_code
+                page.content_type = response.headers.get("content-type", "")
+
+                # Only parse HTML
+                if "text/html" not in page.content_type:
+                    self._results.append(page)
+                    return
+
+                content = response.text
+
+                # Extract title
+                title_match = re.search(r"<title[^>]*>([^<]+)</title>", content, re.IGNORECASE)
+                if title_match:
+                    page.title = title_match.group(1).strip()
+
+                # Extract links
+                page.links = self._extract_links(content, url)
+
+                # Extract forms
+                if self.config.parse_forms:
+                    page.forms = self._extract_forms(content, url)
+                    page.parameters.extend(self._extract_form_params(page.forms))
+
+                # Extract scripts
+                if self.config.parse_scripts:
+                    page.scripts = self._extract_scripts(content, url)
+
+                # Extract URL parameters
+                page.parameters.extend(self._extract_url_params(url))
+
+                # Add new links to queue
+                for link in page.links:
+                    if self._should_crawl(link):
+                        self._queue.append((link, depth + 1, url))
+
+            except httpx.TimeoutException:
+                page.error = "Timeout"
+            except httpx.RequestError as e:
+                page.error = str(e)
+            except Exception as e:
+                page.error = f"Error: {str(e)}"
+            finally:
+                page.crawl_time = (datetime.utcnow() - start_time).total_seconds()
+
+            self._results.append(page)
+
+            if self._callback:
+                self._callback(page)
+
+            logger.debug(f"Crawled: {url} (depth={depth}, status={page.status_code})")
+
+    def _extract_links(self, html: str, base_url: str) -> list[str]:
+        """Extract links from HTML"""
+        links = []
+        # href links
+        href_pattern = r'href=["\']([^"\']+)["\']'
+        for match in re.finditer(href_pattern, html, re.IGNORECASE):
+            href = match.group(1)
+            if not href.startswith(("javascript:", "mailto:", "tel:", "#")):
+                full_url = urljoin(base_url, href)
+                if full_url not in links:
+                    links.append(full_url)
+
+        # src links (for images/scripts that might reveal paths)
+        src_pattern = r'src=["\']([^"\']+)["\']'
+        for match in re.finditer(src_pattern, html, re.IGNORECASE):
+            src = match.group(1)
+            if not src.startswith("data:"):
+                full_url = urljoin(base_url, src)
+                if full_url not in links:
+                    links.append(full_url)
+
+        return links
+
+    def _extract_forms(self, html: str, base_url: str) -> list[dict]:
+        """Extract forms from HTML"""
+        forms = []
+        form_pattern = r'<form[^>]*>(.*?)</form>'
+
+        for form_match in re.finditer(form_pattern, html, re.IGNORECASE | re.DOTALL):
+            form_html = form_match.group(0)
+
+            # Extract form attributes
+            action_match = re.search(r'action=["\']([^"\']*)["\']', form_html, re.IGNORECASE)
+            method_match = re.search(r'method=["\']([^"\']*)["\']', form_html, re.IGNORECASE)
+
+            action = action_match.group(1) if action_match else ""
+            method = method_match.group(1).upper() if method_match else "GET"
+
+            # Extract inputs
+            inputs = []
+            input_pattern = r'<input[^>]*>'
+            for input_match in re.finditer(input_pattern, form_html, re.IGNORECASE):
+                input_tag = input_match.group(0)
+
+                name_match = re.search(r'name=["\']([^"\']*)["\']', input_tag, re.IGNORECASE)
+                type_match = re.search(r'type=["\']([^"\']*)["\']', input_tag, re.IGNORECASE)
+                value_match = re.search(r'value=["\']([^"\']*)["\']', input_tag, re.IGNORECASE)
+
+                inputs.append({
+                    "name": name_match.group(1) if name_match else "",
+                    "type": type_match.group(1) if type_match else "text",
+                    "value": value_match.group(1) if value_match else "",
+                })
+
+            # Extract textareas
+            textarea_pattern = r'<textarea[^>]*name=["\']([^"\']*)["\'][^>]*>'
+            for ta_match in re.finditer(textarea_pattern, form_html, re.IGNORECASE):
+                inputs.append({
+                    "name": ta_match.group(1),
+                    "type": "textarea",
+                    "value": "",
+                })
+
+            # Extract selects
+            select_pattern = r'<select[^>]*name=["\']([^"\']*)["\'][^>]*>'
+            for sel_match in re.finditer(select_pattern, form_html, re.IGNORECASE):
+                inputs.append({
+                    "name": sel_match.group(1),
+                    "type": "select",
+                    "value": "",
+                })
+
+            forms.append({
+                "action": urljoin(base_url, action) if action else base_url,
+                "method": method,
+                "inputs": inputs,
+            })
+
+        return forms
+
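For reference, this is roughly what the regex-based extraction above returns for a simple login form (the HTML snippet is invented for the example):

    html = """
    <form action="/login" method="post">
        <input type="text" name="username">
        <input type="password" name="password">
        <input type="submit" value="Sign in">
    </form>
    """
    forms = WebCrawler()._extract_forms(html, "https://app.example.com/")
    # -> [{"action": "https://app.example.com/login",
    #      "method": "POST",
    #      "inputs": [{"name": "username", "type": "text", "value": ""},
    #                 {"name": "password", "type": "password", "value": ""},
    #                 {"name": "", "type": "submit", "value": "Sign in"}]}]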
+    def _extract_scripts(self, html: str, base_url: str) -> list[str]:
+        """Extract script URLs"""
+        scripts = []
+        pattern = r'<script[^>]*src=["\']([^"\']+)["\'][^>]*>'
+
+        for match in re.finditer(pattern, html, re.IGNORECASE):
+            src = match.group(1)
+            full_url = urljoin(base_url, src)
+            scripts.append(full_url)
+
+        return scripts
+
+    def _extract_form_params(self, forms: list[dict]) -> list[dict]:
+        """Extract parameters from forms"""
+        params = []
+        for form in forms:
+            for inp in form.get("inputs", []):
+                if inp.get("name"):
+                    params.append({
+                        "name": inp["name"],
+                        "type": inp["type"],
+                        "method": form["method"],
+                        "location": form["action"],
+                    })
+        return params
+
+    def _extract_url_params(self, url: str) -> list[dict]:
+        """Extract GET parameters from URL"""
+        params = []
+        parsed = urlparse(url)
+        if parsed.query:
+            for pair in parsed.query.split("&"):
+                if "=" in pair:
+                    name, _ = pair.split("=", 1)
+                    params.append({
+                        "name": name,
+                        "type": "url",
+                        "method": "GET",
+                        "location": url,
+                    })
+        return params
+
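And the query-string side of parameter discovery, for a hypothetical URL:

    params = WebCrawler()._extract_url_params("https://app.example.com/search?q=test&page=2")
    # -> [{"name": "q",    "type": "url", "method": "GET", "location": "https://app.example.com/search?q=test&page=2"},
    #     {"name": "page", "type": "url", "method": "GET", "location": "https://app.example.com/search?q=test&page=2"}]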
+    def _should_crawl(self, url: str) -> bool:
+        """Check if URL should be crawled"""
+        # Check exclusion patterns
+        for pattern in self.config.excluded_patterns:
+            if re.search(pattern, url, re.IGNORECASE):
+                return False
+
+        # Check scope
+        if self.config.stay_in_scope:
+            parsed = urlparse(url)
+            domain = parsed.netloc
+
+            in_scope = False
+            for allowed in self.config.allowed_domains:
+                if domain == allowed or domain.endswith("." + allowed):
+                    in_scope = True
+                    break
+
+            if not in_scope:
+                return False
+
+        # Check if already visited
+        normalized = self._normalize_url(url)
+        if normalized in self._visited:
+            return False
+
+        return True
+
+    def _normalize_url(self, url: str) -> str:
+        """Normalize URL for comparison"""
+        parsed = urlparse(url)
+        # Remove fragment and normalize
+        return f"{parsed.scheme}://{parsed.netloc}{parsed.path}".rstrip("/").lower()