moriarty-project 0.1.22__py3-none-any.whl → 0.1.24__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
- moriarty/__init__.py +1 -1
- moriarty/cli/app.py +4 -3
- moriarty/cli/domain_cmd.py +5 -1
- moriarty/modules/directory_fuzzer.py +25 -5
- moriarty/modules/web_crawler.py +448 -91
- {moriarty_project-0.1.22.dist-info → moriarty_project-0.1.24.dist-info}/METADATA +3 -3
- {moriarty_project-0.1.22.dist-info → moriarty_project-0.1.24.dist-info}/RECORD +9 -27
- moriarty/modules/wifippler/__init__.py +0 -92
- moriarty/modules/wifippler/cli/__init__.py +0 -8
- moriarty/modules/wifippler/cli/commands.py +0 -123
- moriarty/modules/wifippler/core/__init__.py +0 -94
- moriarty/modules/wifippler/core/attacks/__init__.py +0 -146
- moriarty/modules/wifippler/core/attacks/deauth.py +0 -262
- moriarty/modules/wifippler/core/attacks/handshake.py +0 -402
- moriarty/modules/wifippler/core/attacks/pmkid.py +0 -424
- moriarty/modules/wifippler/core/attacks/wep.py +0 -467
- moriarty/modules/wifippler/core/attacks/wpa.py +0 -446
- moriarty/modules/wifippler/core/attacks/wps.py +0 -474
- moriarty/modules/wifippler/core/models/__init__.py +0 -10
- moriarty/modules/wifippler/core/models/network.py +0 -240
- moriarty/modules/wifippler/core/scanner.py +0 -903
- moriarty/modules/wifippler/core/utils/__init__.py +0 -624
- moriarty/modules/wifippler/core/utils/exec.py +0 -182
- moriarty/modules/wifippler/core/utils/network.py +0 -262
- moriarty/modules/wifippler/core/utils/system.py +0 -153
- {moriarty_project-0.1.22.dist-info → moriarty_project-0.1.24.dist-info}/WHEEL +0 -0
- {moriarty_project-0.1.22.dist-info → moriarty_project-0.1.24.dist-info}/entry_points.txt +0 -0
moriarty/modules/web_crawler.py
CHANGED
@@ -1,10 +1,14 @@
-"""Crawler HTTP
+"""Crawler HTTP avançado para enumeração de rotas e formulários com suporte a redirecionamentos e evasão de bloqueios."""
 from __future__ import annotations

 import asyncio
 import random
+import time
+import ssl
+import certifi
 from dataclasses import dataclass, field
-from typing import Dict, List, Optional, Set, TYPE_CHECKING
+from typing import Dict, List, Optional, Set, Tuple, Any, TYPE_CHECKING
+from urllib.parse import urlparse, urljoin

 import httpx
 from selectolax.parser import HTMLParser
@@ -15,149 +19,502 @@ if TYPE_CHECKING: # pragma: no cover - apenas para type hints

 logger = structlog.get_logger(__name__)

+# Headers realistas de navegador
+DEFAULT_HEADERS = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+    "Accept-Language": "pt-BR,pt;q=0.8,en-US;q=0.5,en;q=0.3",
+    "Accept-Encoding": "gzip, deflate, br",
+    "Connection": "keep-alive",
+    "Upgrade-Insecure-Requests": "1",
+    "Sec-Fetch-Dest": "document",
+    "Sec-Fetch-Mode": "navigate",
+    "Sec-Fetch-Site": "none",
+    "Sec-Fetch-User": "?1",
+    "Cache-Control": "max-age=0",
+}
+
+# Lista de user-agents para rotação
+USER_AGENTS = [
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.59",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+]
+
+# Lista de referrers para rotação
+REFERRERS = [
+    "https://www.google.com/",
+    "https://www.bing.com/",
+    "https://www.yahoo.com/",
+    "https://duckduckgo.com/",
+    ""
+]

 @dataclass
 class CrawlPage:
+    """Representa uma página web rastreada."""
     url: str
     status: int
     title: Optional[str] = None
-    forms: List[Dict[str,
+    forms: List[Dict[str, Any]] = field(default_factory=list)
     links: List[str] = field(default_factory=list)
+    redirect_chain: List[Tuple[str, int]] = field(default_factory=list)
+    error: Optional[str] = None


 class WebCrawler:
-    """Crawler
+    """Crawler avançado com suporte a redirecionamentos e evasão de bloqueios."""

     def __init__(
         self,
         base_url: str,
         max_pages: int = 100,
         max_depth: int = 2,
-        concurrency: int =
+        concurrency: int = 5,  # Reduzido para evitar sobrecarga
         follow_subdomains: bool = False,
-        user_agent: str =
+        user_agent: Optional[str] = None,
         stealth: Optional["StealthMode"] = None,
+        request_delay: Tuple[float, float] = (1.0, 3.0),  # Atraso aleatório entre requisições (min, max)
+        timeout: float = 30.0,  # Timeout para requisições
+        verify_ssl: bool = True,  # Verificar certificados SSL
+        max_redirects: int = 5,  # Número máximo de redirecionamentos
+        respect_robots: bool = True,  # Respeitar robots.txt
     ):
         self.base_url = base_url.rstrip("/")
         self.max_pages = max_pages
         self.max_depth = max_depth
         self.concurrency = concurrency
         self.follow_subdomains = follow_subdomains
+
+        # Configurações de requisição
+        self.request_delay = request_delay
+        self.timeout = timeout
+        self.max_redirects = max_redirects
+        self.verify_ssl = verify_ssl
+        self.respect_robots = respect_robots
+
+        # Configurações de stealth
+        self.stealth = stealth
+        self.user_agent = user_agent or random.choice(USER_AGENTS)
+        self.session_cookies: Dict[str, str] = {}
+        self.last_request_time: float = 0
+
+        # Configurações de domínio
+        self.parsed_base_url = self._parse_url(base_url)
+        self.base_domain = self._get_base_domain(self.parsed_base_url.hostname or '')
+        self.allowed_domains = {self.base_domain}
+        if follow_subdomains:
+            self.allowed_domains.add(f".{self.base_domain}")
+
+        # Estado do crawler
         self.visited: Set[str] = set()
         self.results: Dict[str, CrawlPage] = {}
-        self.
-
-
-
-
-        # Reduz concorrência para modos stealth altos
-        effective_concurrency = max(2, min(concurrency, int(concurrency / (self.stealth.level or 1))))
-
-        self.sem = asyncio.Semaphore(effective_concurrency)
-        self.session = httpx.AsyncClient(timeout=10.0, follow_redirects=True)
-
-        parsed = httpx.URL(self.base_url)
-        self._host = parsed.host
-        self._scheme = parsed.scheme
-
-    async def close(self) -> None:
-        await self.session.aclose()
+        self.robots_txt: Optional[Dict[str, Any]] = None
+
+        # Configuração do cliente HTTP
+        self.session: Optional[httpx.AsyncClient] = None
+        self.sem: Optional[asyncio.Semaphore] = None

+    async def _init_session(self) -> None:
+        """Inicializa a sessão HTTP com configurações de segurança e performance."""
+        # Configuração SSL
+        ssl_context = ssl.create_default_context(cafile=certifi.where())
+        if not self.verify_ssl:
+            ssl_context.check_hostname = False
+            ssl_context.verify_mode = ssl.CERT_NONE
+
+        # Configuração do transporte HTTP
+        limits = httpx.Limits(
+            max_keepalive_connections=10,
+            max_connections=20,
+            keepalive_expiry=60.0
+        )
+
+        # Configuração do cliente HTTP
+        self.session = httpx.AsyncClient(
+            timeout=self.timeout,
+            follow_redirects=True,
+            max_redirects=self.max_redirects,
+            http_versions=["HTTP/1.1", "HTTP/2"],
+            limits=limits,
+            verify=ssl_context if self.verify_ssl else False,
+            headers=DEFAULT_HEADERS.copy(),
+            cookies=self.session_cookies
+        )
+
+        # Atualiza o user-agent
+        if self.user_agent:
+            self.session.headers["User-Agent"] = self.user_agent
+
+        # Adiciona headers adicionais de stealth
+        self.session.headers.update({
+            "Accept-Language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7",
+            "Accept-Encoding": "gzip, deflate, br",
+            "DNT": "1",
+            "Upgrade-Insecure-Requests": "1"
+        })
+
+        # Configura o semáforo para limitar concorrência
+        self.sem = asyncio.Semaphore(self.concurrency)
+
+        # Se necessário, verifica o robots.txt
+        if self.respect_robots:
+            await self._check_robots_txt()
+
+    async def _check_robots_txt(self) -> None:
+        """Verifica o arquivo robots.txt e atualiza as regras de acesso."""
+        if not self.session:
+            return
+
+        robots_url = f"{self.parsed_base_url.scheme}://{self.parsed_base_url.netloc}/robots.txt"
+        try:
+            response = await self.session.get(robots_url)
+            if response.status_code == 200:
+                # Aqui você pode implementar um parser de robots.txt mais sofisticado
+                self.robots_txt = {"content": response.text}
+                logger.info("robots_txt_found", url=robots_url)
+        except Exception as e:
+            logger.warning("robots_txt_error", url=robots_url, error=str(e))
+
+    async def _random_delay(self) -> None:
+        """Aguarda um tempo aleatório entre requisições para evitar bloqueios."""
+        if self.request_delay:
+            min_delay, max_delay = self.request_delay
+            delay = random.uniform(min_delay, max_delay)
+            elapsed = time.time() - self.last_request_time
+            if elapsed < delay:
+                await asyncio.sleep(delay - elapsed)
+            self.last_request_time = time.time()
+
     async def crawl(self) -> Dict[str, CrawlPage]:
+        """Inicia o processo de rastreamento do site.
+
+        Returns:
+            Dict[str, CrawlPage]: Dicionário com as páginas encontradas, onde a chave é a URL.
+        """
+        # Inicializa a sessão HTTP
+        if not self.session:
+            await self._init_session()
+
+        # Inicializa a fila de URLs a serem processadas
         queue: asyncio.Queue = asyncio.Queue()
-
+        initial_url = f"{self.parsed_base_url.scheme}://{self.parsed_base_url.netloc}"
+        await queue.put((initial_url, 0))

-
+        # Função worker para processar URLs em paralelo
+        async def worker() -> None:
             while True:
                 try:
                     url, depth = queue.get_nowait()
                 except asyncio.QueueEmpty:
                     break
+
+                # Verifica os limites de páginas e profundidade
                 if len(self.results) >= self.max_pages or depth > self.max_depth:
                     continue
+
+                # Evita processar a mesma URL múltiplas vezes
                 if url in self.visited:
                     continue
-
+
+                # Aguarda um tempo aleatório entre requisições
+                await self._random_delay()
+
+                # Processa a URL
                 await self._fetch(url, depth, queue)
+
+                # Atualiza o contador de páginas processadas
+                queue.task_done()

+        # Inicia os workers
         workers = [asyncio.create_task(worker()) for _ in range(self.concurrency)]
         await asyncio.gather(*workers)
         return self.results

+    def _parse_url(self, url: str) -> httpx.URL:
+        """Parseia uma URL e retorna um objeto URL do httpx."""
+        try:
+            return httpx.URL(url)
+        except Exception as e:
+            logger.error("url_parse_error", url=url, error=str(e))
+            raise ValueError(f"URL inválida: {url}") from e
+
+    def _get_base_domain(self, hostname: str) -> str:
+        """Extrai o domínio base de um hostname."""
+        if not hostname:
+            return ""
+        parts = hostname.split(".")
+        if len(parts) > 2:
+            return ".".join(parts[-2:])
+        return hostname
+
+    def _is_same_domain(self, url: str) -> bool:
+        """Verifica se uma URL pertence ao mesmo domínio do alvo."""
+        try:
+            parsed = self._parse_url(url)
+            if not parsed.host:
+                return False
+
+            # Verifica se o domínio é o mesmo ou um subdomínio
+            if self.follow_subdomains:
+                return parsed.host.endswith(self.base_domain) or f".{parsed.host}".endswith(f".{self.base_domain}")
+            return parsed.host == self.parsed_base_url.host
+        except Exception:
+            return False
+
+    def _normalize_url(self, url: str, base_url: Optional[str] = None) -> str:
+        """Normaliza uma URL, resolvendo URLs relativas e removendo fragmentos."""
+        try:
+            if not url:
+                return ""
+
+            # Remove fragmentos e espaços em branco
+            url = url.split("#")[0].strip()
+            if not url:
+                return ""
+
+            # Se for uma URL relativa, resolve em relação à base_url
+            if base_url and not url.startswith(('http://', 'https://')):
+                base = self._parse_url(base_url)
+                url = str(base.join(url))
+
+            # Parseia a URL para normalização
+            parsed = self._parse_url(url)
+
+            # Remove parâmetros de rastreamento comuns
+            if parsed.query:
+                query_params = []
+                for param in parsed.query.decode().split('&'):
+                    if '=' in param and any(t in param.lower() for t in ['utm_', 'ref=', 'source=', 'fbclid=', 'gclid=']):
+                        continue
+                    query_params.append(param)
+
+                # Reconstrói a URL sem os parâmetros de rastreamento
+                if query_params:
+                    parsed = parsed.copy_with(query='&'.join(query_params))
+                else:
+                    parsed = parsed.copy_with(query=None)
+
+            # Remove barras finais desnecessárias
+            path = parsed.path.decode()
+            if path.endswith('/'):
+                path = path.rstrip('/') or '/'
+                parsed = parsed.copy_with(path=path)
+
+            return str(parsed)
+
+        except Exception as e:
+            logger.warning("url_normalize_error", url=url, error=str(e))
+            return url
+
+    def _build_headers(self, referer: Optional[str] = None) -> Dict[str, str]:
+        """Constrói os headers para a requisição HTTP."""
+        headers = DEFAULT_HEADERS.copy()
+
+        # Rotaciona o User-Agent
+        headers["User-Agent"] = random.choice(USER_AGENTS)
+
+        # Adiciona o referer se fornecido
+        if referer:
+            headers["Referer"] = referer
+        else:
+            headers["Referer"] = random.choice(REFERRERS)
+
+        return headers
+
+    async def _stealth_delay(self) -> None:
+        """Aplica um atraso aleatório para evitar detecção."""
+        if self.stealth and hasattr(self.stealth, 'get_delay'):
+            delay = self.stealth.get_delay()
+            if delay > 0:
+                await asyncio.sleep(delay)
+
     async def _fetch(self, url: str, depth: int, queue: asyncio.Queue) -> None:
-
+        """
+        Faz o fetch de uma URL e processa os links encontrados.
+
+        Args:
+            url: URL a ser acessada
+            depth: Profundidade atual do rastreamento
+            queue: Fila de URLs para processamento
+        """
+        if not self.session:
+            logger.error("session_not_initialized")
+            return
+
+        # Marca a URL como visitada
+        self.visited.add(url)
+
+        try:
+            # Aplica atraso de stealth, se necessário
+            await self._stealth_delay()
+
+            # Prepara os headers para a requisição
+            headers = self._build_headers()
+
+            # Tenta fazer a requisição com tratamento de erros
             try:
-                await self.
-
-
-
+                response = await self.session.get(
+                    url,
+                    headers=headers,
+                    follow_redirects=True,
+                    timeout=self.timeout
+                )
+
+                # Registra o tempo da última requisição
+                self.last_request_time = time.time()
+
+            except httpx.HTTPStatusError as e:
+                logger.warning("http_status_error", url=url, status_code=e.response.status_code)
+                self.results[url] = CrawlPage(
+                    url=url,
+                    status=e.response.status_code,
+                    error=f"HTTP Error: {e.response.status_code}"
+                )
                 return
-
-
+
+            except httpx.RequestError as e:
+                logger.warning("request_error", url=url, error=str(e))
+                self.results[url] = CrawlPage(
+                    url=url,
+                    status=0,
+                    error=f"Request Error: {str(e)}"
+                )
+                return
+
+            except Exception as e:
+                logger.error("unexpected_error", url=url, error=str(e))
+                self.results[url] = CrawlPage(
+                    url=url,
+                    status=0,
+                    error=f"Unexpected Error: {str(e)}"
+                )
+                return
+
+            # Processa a resposta
+            await self._process_response(url, response, depth, queue)
+
+        except Exception as e:
+            logger.error("fetch_error", url=url, error=str(e))
+            self.results[url] = CrawlPage(
+                url=url,
+                status=0,
+                error=f"Processing Error: {str(e)}"
+            )
+
+    async def _process_response(self, url: str, response: httpx.Response, depth: int, queue: asyncio.Queue) -> None:
+        """
+        Processa a resposta HTTP e extrai links para continuar o rastreamento.
+
+        Args:
+            url: URL que foi acessada
+            response: Resposta HTTP
+            depth: Profundidade atual do rastreamento
+            queue: Fila de URLs para processamento
+        """
+        # Cria o objeto da página com os dados básicos
+        page = CrawlPage(
+            url=url,
+            status=response.status_code,
+            redirect_chain=[(str(r.url), r.status_code) for r in response.history]
+        )
+
+        # Se não for uma resposta de sucesso ou não for HTML, retorna
         if response.status_code >= 400 or not response.headers.get("content-type", "").startswith("text"):
             self.results[url] = page
             return
-
-
-
-
-
-
+
+        try:
+            # Parseia o HTML
+            parser = HTMLParser(response.text)
+
+            # Extrai o título da página
+            title = parser.css_first("title")
+            if title and hasattr(title, 'text') and callable(title.text):
+                page.title = title.text(strip=True)
+
+            # Extrai os links da página
+            await self._extract_links(parser, url, depth, queue)
+
+            # Extrai os formulários da página
+            self._extract_forms(parser, page)
+
+            # Adiciona a página aos resultados
+            self.results[url] = page
+
+        except Exception as e:
+            logger.error("process_response_error", url=url, error=str(e))
+            page.error = f"Error processing response: {str(e)}"
+            self.results[url] = page
+
+    async def _extract_links(self, parser: HTMLParser, base_url: str, depth: int, queue: asyncio.Queue) -> None:
+        """Extrai links do HTML e os adiciona à fila de processamento."""
+        for link in parser.css("a[href]"):
+            try:
+                href = link.attributes.get("href", "").strip()
+                if not href or href.startswith("#") or href.startswith("javascript:"):
+                    continue
+
+                # Normaliza a URL
+                url = self._normalize_url(href, base_url)
+                if not url:
+                    continue
+
+                # Verifica se a URL pertence ao mesmo domínio
+                if not self._is_same_domain(url):
+                    continue
+
+                # Adiciona à fila se ainda não foi visitada
+                if url not in self.visited and url not in self.results:
+                    queue.put_nowait((url, depth + 1))
+
+            except Exception as e:
+                logger.warning("link_extraction_error", href=href, error=str(e))
+
+    def _extract_forms(self, parser: HTMLParser, page: CrawlPage) -> None:
+        """Extrai formulários do HTML."""
         for form in parser.css("form"):
-
-
-
-
-
-
-            "
-
-
-
+            try:
+                form_data = {"method": form.attributes.get("method", "GET").upper()}
+
+                # Obtém a ação do formulário
+                action = form.attributes.get("action", "").strip()
+                if action:
+                    form_data["action"] = self._normalize_url(action, page.url)
+                else:
+                    form_data["action"] = page.url
+
+                # Extrai os campos do formulário
+                form_data["fields"] = []
+                for field in form.css("input, textarea, select"):
+                    field_data = {
+                        "name": field.attributes.get("name", ""),
+                        "type": field.attributes.get("type", "text"),
+                        "value": field.attributes.get("value", ""),
+                        "required": "required" in field.attributes
+                    }
+                    form_data["fields"].append(field_data)
+
+                page.forms.append(form_data)
+
+            except Exception as e:
+                logger.warning("form_extraction_error", error=str(e))
+
+    async def close(self) -> None:
+        """Fecha a sessão HTTP."""
+        if self.session:
+            await self.session.aclose()
+            self.session = None

-
-
-
-            href = anchor.attributes.get("href")
-            if not href:
-                continue
-            href = href.strip()
-            if href.startswith("javascript:") or href.startswith("mailto:"):
-                continue
-            absolute = httpx.URL(href, base=httpx.URL(url)).human_repr()
-            if not self._should_follow(absolute):
-                continue
-            links.add(absolute)
-            if absolute not in self.visited and len(self.results) < self.max_pages:
-                await queue.put((absolute, depth + 1))
-        page.links = sorted(links)
-        self.results[url] = page
-
-    def _should_follow(self, url: str) -> bool:
-        parsed = httpx.URL(url)
-        if parsed.scheme not in {"http", "https"}:
-            return False
-        if not self.follow_subdomains and parsed.host != self._host:
-            return False
-        if not parsed.host.endswith(self._host):
-            return False
-        return True
-
-    def _build_headers(self) -> Dict[str, str]:
-        headers: Dict[str, str] = {"User-Agent": self.user_agent, "Accept": "*/*"}
-        if self.stealth:
-            stealth_headers = self.stealth.get_random_headers()
-            headers.update(stealth_headers)
-            headers.setdefault("User-Agent", stealth_headers.get("User-Agent", self.user_agent))
-        return headers
+    async def __aenter__(self):
+        await self._init_session()
+        return self

-    async def
-
-            return
-        config = getattr(self.stealth, "config", None)
-        if not config or not getattr(config, "timing_randomization", False):
-            return
-        await asyncio.sleep(random.uniform(0.05, 0.2) * max(1, self.stealth.level))
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self.close()


+# Para compatibilidade com código existente
 __all__ = ["WebCrawler", "CrawlPage"]
{moriarty_project-0.1.22.dist-info → moriarty_project-0.1.24.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: moriarty-project
-Version: 0.1.22
+Version: 0.1.24
 Summary: Client-side OSINT toolkit with forensic-grade evidence handling.
 Project-URL: Homepage, https://github.com/DonatoReis/moriarty
 Project-URL: Documentation, https://github.com/DonatoReis/moriarty#readme
@@ -98,7 +98,7 @@ Description-Content-Type: text/markdown
 <!-- Badges -->
 <p align="center">
   <a href="https://pypi.org/project/moriarty-project/">
-    <img src="https://img.shields.io/badge/version-0.1.
+    <img src="https://img.shields.io/badge/version-0.1.24-blue" alt="Version 0.1.24">
   </a>
   <a href="https://www.python.org/downloads/">
     <img src="https://img.shields.io/pypi/pyversions/moriarty-project?color=blue" alt="Python Versions">
@@ -152,7 +152,7 @@ Description-Content-Type: text/markdown
 pipx install moriarty-project

 # OU para instalar uma versão específica
-# pipx install moriarty-project==0.1.
+# pipx install moriarty-project==0.1.24

 # Verificar a instalação
 moriarty --help
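
As a quick post-install check (a sketch, not taken from the package docs), the installed release can also be read from Python's packaging metadata; the expected value assumes the 0.1.24 wheel is what ended up installed.

```python
from importlib.metadata import version

# Prints the installed release of the wheel, e.g. "0.1.24" after upgrading.
print(version("moriarty-project"))
```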