bbot 2.6.0.6840rc0__py3-none-any.whl → 2.7.2.7424rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bbot/__init__.py +1 -1
- bbot/cli.py +22 -8
- bbot/core/engine.py +1 -1
- bbot/core/event/__init__.py +2 -2
- bbot/core/event/base.py +138 -110
- bbot/core/flags.py +1 -0
- bbot/core/helpers/bloom.py +6 -7
- bbot/core/helpers/depsinstaller/installer.py +21 -2
- bbot/core/helpers/dns/dns.py +0 -1
- bbot/core/helpers/dns/engine.py +0 -2
- bbot/core/helpers/files.py +2 -2
- bbot/core/helpers/git.py +17 -0
- bbot/core/helpers/helper.py +6 -5
- bbot/core/helpers/misc.py +8 -23
- bbot/core/helpers/ntlm.py +0 -2
- bbot/core/helpers/regex.py +1 -1
- bbot/core/helpers/regexes.py +25 -8
- bbot/core/helpers/web/web.py +2 -1
- bbot/core/modules.py +22 -60
- bbot/defaults.yml +4 -2
- bbot/modules/apkpure.py +1 -1
- bbot/modules/baddns.py +1 -1
- bbot/modules/baddns_direct.py +1 -1
- bbot/modules/baddns_zone.py +1 -1
- bbot/modules/badsecrets.py +1 -1
- bbot/modules/base.py +123 -38
- bbot/modules/bucket_amazon.py +1 -1
- bbot/modules/bucket_digitalocean.py +1 -1
- bbot/modules/bucket_firebase.py +1 -1
- bbot/modules/bucket_google.py +1 -1
- bbot/modules/{bucket_azure.py → bucket_microsoft.py} +2 -2
- bbot/modules/builtwith.py +4 -2
- bbot/modules/dnsbimi.py +1 -4
- bbot/modules/dnsbrute.py +6 -1
- bbot/modules/dnsdumpster.py +35 -52
- bbot/modules/dnstlsrpt.py +0 -6
- bbot/modules/docker_pull.py +1 -1
- bbot/modules/emailformat.py +17 -1
- bbot/modules/ffuf.py +4 -1
- bbot/modules/ffuf_shortnames.py +6 -3
- bbot/modules/filedownload.py +7 -4
- bbot/modules/git_clone.py +47 -22
- bbot/modules/gitdumper.py +4 -14
- bbot/modules/github_workflows.py +6 -5
- bbot/modules/gitlab_com.py +31 -0
- bbot/modules/gitlab_onprem.py +84 -0
- bbot/modules/gowitness.py +0 -6
- bbot/modules/graphql_introspection.py +5 -2
- bbot/modules/httpx.py +2 -0
- bbot/modules/iis_shortnames.py +0 -7
- bbot/modules/internal/cloudcheck.py +65 -72
- bbot/modules/internal/unarchive.py +9 -3
- bbot/modules/lightfuzz/lightfuzz.py +6 -2
- bbot/modules/lightfuzz/submodules/esi.py +42 -0
- bbot/modules/medusa.py +4 -7
- bbot/modules/nuclei.py +1 -1
- bbot/modules/otx.py +9 -2
- bbot/modules/output/base.py +3 -11
- bbot/modules/paramminer_headers.py +10 -7
- bbot/modules/portfilter.py +2 -0
- bbot/modules/postman_download.py +1 -1
- bbot/modules/retirejs.py +232 -0
- bbot/modules/securitytxt.py +0 -3
- bbot/modules/sslcert.py +2 -2
- bbot/modules/subdomaincenter.py +1 -16
- bbot/modules/telerik.py +7 -2
- bbot/modules/templates/bucket.py +24 -4
- bbot/modules/templates/gitlab.py +98 -0
- bbot/modules/trufflehog.py +6 -3
- bbot/modules/wafw00f.py +2 -2
- bbot/presets/web/lightfuzz-heavy.yml +1 -1
- bbot/presets/web/lightfuzz-medium.yml +1 -1
- bbot/presets/web/lightfuzz-superheavy.yml +1 -1
- bbot/scanner/manager.py +44 -37
- bbot/scanner/scanner.py +12 -4
- bbot/scripts/benchmark_report.py +433 -0
- bbot/test/benchmarks/__init__.py +2 -0
- bbot/test/benchmarks/test_bloom_filter_benchmarks.py +105 -0
- bbot/test/benchmarks/test_closest_match_benchmarks.py +76 -0
- bbot/test/benchmarks/test_event_validation_benchmarks.py +438 -0
- bbot/test/benchmarks/test_excavate_benchmarks.py +291 -0
- bbot/test/benchmarks/test_ipaddress_benchmarks.py +143 -0
- bbot/test/benchmarks/test_weighted_shuffle_benchmarks.py +70 -0
- bbot/test/test_step_1/test_bbot_fastapi.py +2 -2
- bbot/test/test_step_1/test_events.py +22 -21
- bbot/test/test_step_1/test_helpers.py +1 -0
- bbot/test/test_step_1/test_manager_scope_accuracy.py +45 -0
- bbot/test/test_step_1/test_modules_basic.py +40 -15
- bbot/test/test_step_1/test_python_api.py +2 -2
- bbot/test/test_step_1/test_regexes.py +21 -4
- bbot/test/test_step_1/test_scan.py +7 -8
- bbot/test/test_step_1/test_web.py +46 -0
- bbot/test/test_step_2/module_tests/base.py +6 -1
- bbot/test/test_step_2/module_tests/test_module_bucket_amazon.py +52 -18
- bbot/test/test_step_2/module_tests/test_module_bucket_google.py +1 -1
- bbot/test/test_step_2/module_tests/{test_module_bucket_azure.py → test_module_bucket_microsoft.py} +7 -5
- bbot/test/test_step_2/module_tests/test_module_cloudcheck.py +19 -31
- bbot/test/test_step_2/module_tests/test_module_dnsbimi.py +2 -1
- bbot/test/test_step_2/module_tests/test_module_dnsdumpster.py +3 -5
- bbot/test/test_step_2/module_tests/test_module_emailformat.py +1 -1
- bbot/test/test_step_2/module_tests/test_module_emails.py +2 -2
- bbot/test/test_step_2/module_tests/test_module_excavate.py +57 -4
- bbot/test/test_step_2/module_tests/test_module_github_workflows.py +10 -1
- bbot/test/test_step_2/module_tests/test_module_gitlab_com.py +66 -0
- bbot/test/test_step_2/module_tests/{test_module_gitlab.py → test_module_gitlab_onprem.py} +4 -69
- bbot/test/test_step_2/module_tests/test_module_lightfuzz.py +71 -3
- bbot/test/test_step_2/module_tests/test_module_nuclei.py +1 -2
- bbot/test/test_step_2/module_tests/test_module_otx.py +3 -0
- bbot/test/test_step_2/module_tests/test_module_portfilter.py +2 -0
- bbot/test/test_step_2/module_tests/test_module_retirejs.py +161 -0
- bbot/test/test_step_2/module_tests/test_module_telerik.py +1 -1
- bbot/test/test_step_2/module_tests/test_module_trufflehog.py +10 -1
- {bbot-2.6.0.6840rc0.dist-info → bbot-2.7.2.7424rc0.dist-info}/METADATA +10 -7
- {bbot-2.6.0.6840rc0.dist-info → bbot-2.7.2.7424rc0.dist-info}/RECORD +117 -106
- {bbot-2.6.0.6840rc0.dist-info → bbot-2.7.2.7424rc0.dist-info}/WHEEL +1 -1
- {bbot-2.6.0.6840rc0.dist-info → bbot-2.7.2.7424rc0.dist-info/licenses}/LICENSE +98 -58
- bbot/modules/censys.py +0 -98
- bbot/modules/gitlab.py +0 -141
- bbot/modules/zoomeye.py +0 -77
- bbot/test/test_step_2/module_tests/test_module_censys.py +0 -83
- bbot/test/test_step_2/module_tests/test_module_zoomeye.py +0 -35
- {bbot-2.6.0.6840rc0.dist-info → bbot-2.7.2.7424rc0.dist-info}/entry_points.txt +0 -0
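This release adds a bbot/test/benchmarks/ package and bbot/scripts/benchmark_report.py; the tests in the hunks below use pytest's benchmark fixture. A minimal local invocation sketch, assuming the pytest-benchmark plugin is installed; the flags and the idea of feeding the JSON output into benchmark_report.py are assumptions, not something stated in this diff:

```python
# Hypothetical local run of the new benchmark suite; assumes the pytest-benchmark
# plugin is installed (the tests below use its `benchmark` fixture). Illustrative only.
import sys

import pytest

sys.exit(
    pytest.main(
        [
            "bbot/test/benchmarks/",  # benchmark package added in 2.7.x
            "--benchmark-only",  # pytest-benchmark: run only benchmark tests
            "--benchmark-json=benchmarks.json",  # machine-readable results, e.g. for benchmark_report.py
        ]
    )
)
```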
bbot/test/benchmarks/test_excavate_benchmarks.py
@@ -0,0 +1,291 @@
+import pytest
+import asyncio
+from bbot.scanner import Scanner
+
+
+class TestExcavateDirectBenchmarks:
+    """
+    Direct benchmark tests for Excavate module operations.
+
+    These tests measure the performance of excavate's core YARA processing
+    by calling the excavate.search() method directly with specific text sizes
+    in both single-threaded and parallel asyncio tasks to test the GIL sidestep feature of YARA.
+    """
+
+    # Number of text segments per test
+    TEXT_SEGMENTS_COUNT = 100
+
+    # Prescribed sizes for deterministic benchmarking (in bytes)
+    SMALL_SIZE = 4096  # 4KB
+    LARGE_SIZE = 5242880  # 5MB
+
+    def _generate_text_segments(self, target_size, count):
+        """Generate a list of text segments of the specified size"""
+        segments = []
+
+        for i in range(count):
+            # Generate realistic content that excavate can work with
+            base_content = self._generate_realistic_content(i)
+
+            # Pad to the exact target size with deterministic content
+            remaining_size = target_size - len(base_content)
+            if remaining_size > 0:
+                # Use deterministic padding pattern
+                padding_pattern = "Lorem ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. "
+                padding_repeats = (remaining_size // len(padding_pattern)) + 1
+                padding = (padding_pattern * padding_repeats)[:remaining_size]
+                content = base_content + padding
+            else:
+                content = base_content[:target_size]
+
+            segments.append(content)
+
+        return segments
+
+    def _generate_realistic_content(self, index):
+        """Generate realistic content that excavate can extract from"""
+        return f"""
+<html>
+<head>
+    <title>Test Content {index}</title>
+    <script src="https://api{index}.example.com/js/app.js"></script>
+</head>
+<body>
+    <h1>Page {index}</h1>
+
+    <!-- URLs and subdomains -->
+    <a href="https://www{index}.example.com/page{index}">Link {index}</a>
+    <a href="https://cdn{index}.example.com/assets/">CDN {index}</a>
+    <img src="https://img{index}.example.com/photo{index}.jpg" />
+
+    <!-- Forms with parameters -->
+    <form action="/search{index}" method="GET">
+        <input type="text" name="query{index}" value="test{index}">
+        <input type="hidden" name="token{index}" value="abc123{index}">
+        <button type="submit">Search</button>
+    </form>
+
+    <!-- API endpoints -->
+    <script>
+        fetch('https://api{index}.example.com/v1/users/{index}')
+            .then(response => response.json())
+            .then(data => console.log(data));
+
+        // WebSocket connection
+        const ws = new WebSocket('wss://realtime{index}.example.com/socket');
+    </script>
+
+    <!-- Various protocols -->
+    <p>FTP: ftp://ftp{index}.example.com:21/files/</p>
+    <p>SSH: ssh://server{index}.example.com:22/</p>
+    <p>Email: contact{index}@example.com</p>
+
+    <!-- JSON data -->
+    <script type="application/json">
+    {{
+        "apiEndpoint{index}": "https://api{index}.example.com/data",
+        "parameter{index}": "value{index}",
+        "secretKey{index}": "sk_test_{index}_abcdef123456"
+    }}
+    </script>
+
+    <!-- Comments with URLs -->
+    <!-- https://hidden{index}.example.com/admin -->
+    <!-- TODO: Check https://internal{index}.example.com/debug -->
+</body>
+</html>
+"""
+
+    async def _run_excavate_single_thread(self, text_segments):
+        """Run excavate processing in single thread"""
+        # Create scanner and initialize excavate
+        scan = Scanner("example.com", modules=["httpx"], config={"excavate": True})
+        await scan._prep()
+        excavate_module = scan.modules.get("excavate")
+
+        if not excavate_module:
+            raise RuntimeError("Excavate module not found")
+
+        # Track events emitted by excavate
+        emitted_events = []
+
+        async def track_emit_event(event_data, *args, **kwargs):
+            emitted_events.append(event_data)
+
+        excavate_module.emit_event = track_emit_event
+
+        # Process all text segments sequentially
+        results = []
+        for i, text_segment in enumerate(text_segments):
+            # Create a mock HTTP_RESPONSE event
+            mock_event = scan.make_event(
+                {
+                    "url": f"https://example.com/test/{i}",
+                    "method": "GET",
+                    "body": text_segment,
+                    "header-dict": {"Content-Type": ["text/html"]},
+                    "raw_header": "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\n",
+                    "status_code": 200,
+                },
+                "HTTP_RESPONSE",
+                parent=scan.root_event,
+            )
+
+            # Process with excavate
+            await excavate_module.search(text_segment, mock_event, "text/html", f"Single thread benchmark {i}")
+            results.append(f"processed_{i}")
+
+        return results, emitted_events
+
+    async def _run_excavate_parallel_tasks(self, text_segments):
+        """Run excavate processing with parallel asyncio tasks"""
+        # Create scanner and initialize excavate
+        scan = Scanner("example.com", modules=["httpx"], config={"excavate": True})
+        await scan._prep()
+        excavate_module = scan.modules.get("excavate")
+
+        if not excavate_module:
+            raise RuntimeError("Excavate module not found")
+
+        # Define async task to process a single text segment
+        async def process_segment(segment_index, text_segment):
+            mock_event = scan.make_event(
+                {
+                    "url": f"https://example.com/parallel/{segment_index}",
+                    "method": "GET",
+                    "body": text_segment,
+                    "header-dict": {"Content-Type": ["text/html"]},
+                    "raw_header": "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\n",
+                    "status_code": 200,
+                },
+                "HTTP_RESPONSE",
+                parent=scan.root_event,
+            )
+
+            await excavate_module.search(
+                text_segment, mock_event, "text/html", f"Parallel benchmark task {segment_index}"
+            )
+            return f"processed_{segment_index}"
+
+        # Create all tasks and run them concurrently
+        tasks = [process_segment(i, text_segment) for i, text_segment in enumerate(text_segments)]
+
+        # Run all tasks in parallel
+        results = await asyncio.gather(*tasks)
+        return results
+
+    # Single Thread Tests
+    @pytest.mark.benchmark(group="excavate_single_small")
+    def test_excavate_single_thread_small(self, benchmark):
+        """Benchmark excavate single thread processing with small (4KB) segments"""
+        text_segments = self._generate_text_segments(self.SMALL_SIZE, self.TEXT_SEGMENTS_COUNT)
+
+        def run_test():
+            return asyncio.run(self._run_excavate_single_thread(text_segments))
+
+        result, events = benchmark(run_test)
+
+        assert len(result) == self.TEXT_SEGMENTS_COUNT
+        total_size_mb = (self.SMALL_SIZE * self.TEXT_SEGMENTS_COUNT) / (1024 * 1024)
+
+        # Count events by type
+        total_events = len(events)
+        url_events = len([e for e in events if e.type == "URL_UNVERIFIED"])
+        dns_events = len([e for e in events if e.type == "DNS_NAME"])
+        email_events = len([e for e in events if e.type == "EMAIL_ADDRESS"])
+        protocol_events = len([e for e in events if e.type == "PROTOCOL"])
+        finding_events = len([e for e in events if e.type == "FINDING"])
+
+        print("\n✅ Single-thread small segments benchmark completed")
+        print(f"📊 Processed {len(result):,} segments of {self.SMALL_SIZE / 1024:.0f}KB each")
+        print(f"📊 Total size processed: {total_size_mb:.1f} MB")
+        print(f"📊 Total events: {total_events}")
+        print(f"📊 URL events: {url_events}")
+        print(f"📊 DNS events: {dns_events}")
+        print(f"📊 Email events: {email_events}")
+        print(f"📊 Protocol events: {protocol_events}")
+        print(f"📊 Finding events: {finding_events}")
+
+        # Validate that excavate actually found and processed content
+        assert total_events > 0, "Expected to find some events from excavate"
+        assert url_events > 0 or dns_events > 0 or protocol_events > 0, (
+            "Expected excavate to find URLs, DNS names, or protocols"
+        )
+
+    @pytest.mark.benchmark(group="excavate_single_large")
+    def test_excavate_single_thread_large(self, benchmark):
+        """Benchmark excavate single thread processing with large (10MB) segments"""
+        text_segments = self._generate_text_segments(self.LARGE_SIZE, self.TEXT_SEGMENTS_COUNT)
+
+        def run_test():
+            return asyncio.run(self._run_excavate_single_thread(text_segments))
+
+        result, events = benchmark(run_test)
+
+        assert len(result) == self.TEXT_SEGMENTS_COUNT
+        total_size_mb = (self.LARGE_SIZE * self.TEXT_SEGMENTS_COUNT) / (1024 * 1024)
+
+        # Count events by type
+        total_events = len(events)
+        url_events = len([e for e in events if e.type == "URL_UNVERIFIED"])
+        dns_events = len([e for e in events if e.type == "DNS_NAME"])
+        email_events = len([e for e in events if e.type == "EMAIL_ADDRESS"])
+        protocol_events = len([e for e in events if e.type == "PROTOCOL"])
+        finding_events = len([e for e in events if e.type == "FINDING"])
+
+        print("\n✅ Single-thread large segments benchmark completed")
+        print(f"📊 Processed {len(result):,} segments of {self.LARGE_SIZE / (1024 * 1024):.0f}MB each")
+        print(f"📊 Total size processed: {total_size_mb:.1f} MB")
+        print(f"📊 Total events: {total_events}")
+        print(f"📊 URL events: {url_events}")
+        print(f"📊 DNS events: {dns_events}")
+        print(f"📊 Email events: {email_events}")
+        print(f"📊 Protocol events: {protocol_events}")
+        print(f"📊 Finding events: {finding_events}")
+
+        # Validate that excavate actually found and processed content
+        assert total_events > 0, "Expected to find some events from excavate"
+        assert url_events > 0 or dns_events > 0 or protocol_events > 0, (
+            "Expected excavate to find URLs, DNS names, or protocols"
+        )
+
+    # Parallel Tests
+    @pytest.mark.benchmark(group="excavate_parallel_small")
+    def test_excavate_parallel_tasks_small(self, benchmark):
+        """Benchmark excavate parallel processing with small (4KB) segments"""
+        text_segments = self._generate_text_segments(self.SMALL_SIZE, self.TEXT_SEGMENTS_COUNT)
+
+        def run_test():
+            return asyncio.run(self._run_excavate_parallel_tasks(text_segments))
+
+        result = benchmark(run_test)
+
+        assert len(result) == self.TEXT_SEGMENTS_COUNT
+        total_size_mb = (self.SMALL_SIZE * self.TEXT_SEGMENTS_COUNT) / (1024 * 1024)
+        print("\n✅ Parallel small segments benchmark completed")
+        print(f"📊 Processed {len(result):,} segments of {self.SMALL_SIZE / 1024:.0f}KB each in parallel")
+        print(f"📊 Total size processed: {total_size_mb:.1f} MB")
+        print("📊 Tasks executed concurrently to test YARA GIL sidestep")
+
+        # Basic assertion that excavate is actually working (should find URLs in our test content)
+        assert len(result) > 0, "Expected excavate to process all segments"
+
+    @pytest.mark.benchmark(group="excavate_parallel_large")
+    def test_excavate_parallel_tasks_large(self, benchmark):
+        """Benchmark excavate parallel processing with large (10MB) segments to test YARA GIL sidestep"""
+        text_segments = self._generate_text_segments(self.LARGE_SIZE, self.TEXT_SEGMENTS_COUNT)
+
+        def run_test():
+            return asyncio.run(self._run_excavate_parallel_tasks(text_segments))
+
+        result = benchmark(run_test)
+
+        assert len(result) == self.TEXT_SEGMENTS_COUNT
+        total_size_mb = (self.LARGE_SIZE * self.TEXT_SEGMENTS_COUNT) / (1024 * 1024)
+        print("\n✅ Parallel large segments benchmark completed")
+        print(f"📊 Processed {len(result):,} segments of {self.LARGE_SIZE / (1024 * 1024):.0f}MB each in parallel")
+        print(f"📊 Total size processed: {total_size_mb:.1f} MB")
+        print("📊 Tasks executed concurrently to test YARA GIL sidestep")
+
+        # Basic assertion that excavate is actually working (should find URLs in our test content)
+        assert len(result) > 0, "Expected excavate to process all segments"
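The docstring above refers to YARA's "GIL sidestep": the parallel asyncio tasks can only overlap the CPU-heavy matching if the underlying C extension releases the GIL while it runs. A minimal sketch of that pattern, assuming the yara-python package and asyncio.to_thread; this illustrates the concept being benchmarked, not how excavate itself is wired:

```python
# Sketch only: off-load blocking YARA matching to a worker thread so concurrent
# asyncio tasks can overlap. This pays off only if the C extension releases the
# GIL while matching. Assumes the yara-python package; not excavate's actual code.
import asyncio

import yara

RULES = yara.compile(source='rule url { strings: $u = "https://" condition: $u }')


async def scan_text(text):
    # keeps the event loop responsive while the match runs in a thread
    return await asyncio.to_thread(RULES.match, data=text)


async def main():
    texts = ["<a href='https://example.com'>x</a>"] * 8
    results = await asyncio.gather(*(scan_text(t) for t in texts))
    print([len(r) for r in results])


asyncio.run(main())
```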
bbot/test/benchmarks/test_ipaddress_benchmarks.py
@@ -0,0 +1,143 @@
+import pytest
+import random
+import string
+from bbot.core.helpers.misc import make_ip_type, is_ip
+
+
+class TestIPAddressBenchmarks:
+    """
+    Benchmark tests for IP address processing operations.
+
+    These tests measure the performance of BBOT-level IP functions which are
+    critical for network scanning efficiency and could benefit from different
+    underlying implementations.
+    """
+
+    def setup_method(self):
+        """Setup common test data"""
+        # Set deterministic seed for consistent benchmark results
+        random.seed(42)  # Fixed seed for reproducible results
+
+        # Generate test data of different types and sizes
+        self.valid_ips = self._generate_valid_ips()
+        self.invalid_ips = self._generate_invalid_ips()
+        self.mixed_data = self._generate_mixed_data()
+
+    def _generate_valid_ips(self):
+        """Generate valid IP addresses for testing"""
+        valid_ips = []
+
+        # IPv4 addresses
+        for i in range(1000):
+            valid_ips.append(
+                f"{random.randint(1, 223)}.{random.randint(0, 255)}.{random.randint(0, 255)}.{random.randint(1, 254)}"
+            )
+
+        # IPv6 addresses
+        for i in range(500):
+            ipv6_parts = []
+            for j in range(8):
+                ipv6_parts.append(f"{random.randint(0, 65535):x}")
+            valid_ips.append(":".join(ipv6_parts))
+
+        # Network addresses
+        for i in range(500):
+            base_ip = f"{random.randint(1, 223)}.{random.randint(0, 255)}.{random.randint(0, 255)}.0"
+            valid_ips.append(f"{base_ip}/{random.randint(8, 30)}")
+
+        # IP ranges
+        for i in range(200):
+            start_ip = (
+                f"{random.randint(1, 223)}.{random.randint(0, 255)}.{random.randint(0, 255)}.{random.randint(1, 200)}"
+            )
+            end_ip = f"{random.randint(1, 223)}.{random.randint(0, 255)}.{random.randint(0, 255)}.{random.randint(201, 254)}"
+            valid_ips.append(f"{start_ip}-{end_ip}")
+
+        return valid_ips
+
+    def _generate_invalid_ips(self):
+        """Generate invalid IP addresses for testing"""
+        invalid_ips = []
+
+        # Malformed IPv4
+        for i in range(500):
+            invalid_ips.append(
+                f"{random.randint(256, 999)}.{random.randint(0, 255)}.{random.randint(0, 255)}.{random.randint(0, 255)}"
+            )
+            invalid_ips.append(f"{random.randint(0, 255)}.{random.randint(0, 255)}.{random.randint(0, 255)}")
+            invalid_ips.append(
+                f"{random.randint(0, 255)}.{random.randint(0, 255)}.{random.randint(0, 255)}.{random.randint(0, 255)}.{random.randint(0, 255)}"
+            )
+
+        # Malformed IPv6
+        for i in range(300):
+            ipv6_parts = []
+            for j in range(random.randint(5, 10)):  # Wrong number of parts
+                ipv6_parts.append(f"{random.randint(0, 65535):x}")
+            invalid_ips.append(":".join(ipv6_parts))
+
+        # Random strings
+        for i in range(200):
+            length = random.randint(5, 20)
+            invalid_ips.append("".join(random.choices(string.ascii_letters + string.digits, k=length)))
+
+        return invalid_ips
+
+    def _generate_mixed_data(self):
+        """Generate mixed valid/invalid data for realistic testing"""
+        mixed = []
+        mixed.extend(self.valid_ips[:500])  # First 500 valid
+        mixed.extend(self.invalid_ips[:500])  # First 500 invalid
+        # Use deterministic shuffle with fixed seed for consistent results
+        random.seed(42)  # Reset seed before shuffle
+        random.shuffle(mixed)  # Shuffle for realistic distribution
+        return mixed
+
+    @pytest.mark.benchmark(group="ip_validation")
+    def test_is_ip_performance(self, benchmark):
+        """Benchmark IP validation performance with mixed data"""
+
+        def validate_ips():
+            valid_count = 0
+            for ip in self.mixed_data:
+                if is_ip(ip):
+                    valid_count += 1
+            return valid_count
+
+        result = benchmark(validate_ips)
+        assert result > 0
+
+    @pytest.mark.benchmark(group="ip_type_detection")
+    def test_make_ip_type_performance(self, benchmark):
+        """Benchmark IP type detection performance"""
+
+        def detect_ip_types():
+            type_count = 0
+            for ip in self.valid_ips:
+                try:
+                    make_ip_type(ip)
+                    type_count += 1
+                except Exception:
+                    pass
+            return type_count
+
+        result = benchmark(detect_ip_types)
+        assert result > 0
+
+    @pytest.mark.benchmark(group="ip_processing")
+    def test_mixed_ip_operations(self, benchmark):
+        """Benchmark combined IP validation + type detection"""
+
+        def process_ips():
+            processed = 0
+            for ip in self.mixed_data:
+                if is_ip(ip):
+                    try:
+                        make_ip_type(ip)
+                        processed += 1
+                    except Exception:
+                        pass
+            return processed
+
+        result = benchmark(process_ips)
+        assert result > 0
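is_ip and make_ip_type come from bbot.core.helpers.misc and are not defined in this diff. As a rough sketch of the behaviour the benchmark exercises, both can be approximated with the stdlib ipaddress module; BBOT's real helpers handle more input shapes (for example the start-end ranges generated above), so treat this as illustrative only:

```python
# Rough stdlib-only approximation of the helpers benchmarked above; BBOT's real
# is_ip / make_ip_type are more featureful. Illustrative only.
import ipaddress


def is_ip_sketch(value) -> bool:
    try:
        ipaddress.ip_address(str(value))
        return True
    except ValueError:
        return False


def make_ip_type_sketch(value):
    value = str(value)
    try:
        return ipaddress.ip_address(value)  # single host -> IPv4Address / IPv6Address
    except ValueError:
        return ipaddress.ip_network(value, strict=False)  # CIDR -> IPv4Network / IPv6Network


print(is_ip_sketch("8.8.8.8"), make_ip_type_sketch("10.0.0.0/24"))
```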
bbot/test/benchmarks/test_weighted_shuffle_benchmarks.py
@@ -0,0 +1,70 @@
+import pytest
+import random
+from bbot.core.helpers.misc import weighted_shuffle
+
+
+class TestWeightedShuffleBenchmarks:
+    """
+    Benchmark tests for weighted_shuffle operations.
+
+    This function is critical for BBOT's queue management, where it shuffles
+    incoming queues based on module priority weights. Performance here directly
+    impacts scan throughput and responsiveness.
+    """
+
+    def setup_method(self):
+        """Setup common test data"""
+        # Set deterministic seed for consistent benchmark results
+        random.seed(42)  # Fixed seed for reproducible results
+
+        # Generate test data of different sizes and complexity
+        self.small_data = self._generate_small_dataset()
+        self.medium_data = self._generate_medium_dataset()
+        self.large_data = self._generate_large_dataset()
+        self.priority_weights = self._generate_priority_weights()
+
+    def _generate_small_dataset(self):
+        """Generate small dataset (like few modules)"""
+        return {"items": ["module_a", "module_b", "module_c"], "weights": [0.6, 0.3, 0.1]}
+
+    def _generate_medium_dataset(self):
+        """Generate medium dataset (like typical scan)"""
+        items = [f"module_{i}" for i in range(20)]
+        weights = [random.uniform(0.1, 1.0) for _ in range(20)]
+        return {"items": items, "weights": weights}
+
+    def _generate_large_dataset(self):
+        """Generate large dataset (like complex scan with many modules)"""
+        items = [f"module_{i}" for i in range(100)]
+        weights = [random.uniform(0.1, 1.0) for _ in range(100)]
+        return {"items": items, "weights": weights}
+
+    def _generate_priority_weights(self):
+        """Generate realistic priority weights (like BBOT module priorities)"""
+        # BBOT uses priorities 1-5, where lower priority = higher weight
+        # Weights are calculated as [5] + [6 - m.priority for m in modules]
+        priorities = [5] + [6 - p for p in [1, 2, 3, 4, 5]] * 20  # 5 + 5*20 = 105 items
+        items = [f"queue_{i}" for i in range(len(priorities))]
+        return {"items": items, "weights": priorities}
+
+    @pytest.mark.benchmark(group="weighted_shuffle")
+    def test_typical_queue_shuffle(self, benchmark):
+        """Benchmark weighted shuffle with typical BBOT scan workload"""
+
+        def shuffle_typical():
+            return weighted_shuffle(self.medium_data["items"], self.medium_data["weights"])
+
+        result = benchmark(shuffle_typical)
+        assert len(result) == 20
+        assert all(item in result for item in self.medium_data["items"])
+
+    @pytest.mark.benchmark(group="weighted_shuffle")
+    def test_priority_queue_shuffle(self, benchmark):
+        """Benchmark weighted shuffle with realistic BBOT priority weights"""
+
+        def shuffle_priorities():
+            return weighted_shuffle(self.priority_weights["items"], self.priority_weights["weights"])
+
+        result = benchmark(shuffle_priorities)
+        assert len(result) == len(self.priority_weights["items"])
+        assert all(item in result for item in self.priority_weights["items"])
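The docstring and comments above describe the weighting scheme (a flat 5 for the incoming queue plus 6 - priority per module) but not the shuffle itself. One plausible reading is weighted sampling without replacement: higher-weight queues tend to surface earlier while every queue still appears exactly once. The sketch below illustrates that idea and is not BBOT's actual weighted_shuffle:

```python
# Illustrative stand-in for a weight-biased shuffle (weighted sampling without
# replacement); not BBOT's actual weighted_shuffle implementation.
import random


def weighted_shuffle_sketch(items, weights):
    items, weights = list(items), list(weights)
    shuffled = []
    while items:
        # pick one index with probability proportional to its weight, then remove it
        idx = random.choices(range(len(items)), weights=weights, k=1)[0]
        shuffled.append(items.pop(idx))
        weights.pop(idx)
    return shuffled


print(weighted_shuffle_sketch(["queue_a", "queue_b", "queue_c"], [5, 3, 1]))
```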
bbot/test/test_step_1/test_bbot_fastapi.py
@@ -22,8 +22,8 @@ def test_bbot_multiprocess(bbot_httpserver):
     queue = multiprocessing.Queue()
     events_process = multiprocessing.Process(target=run_bbot_multiprocess, args=(queue,))
     events_process.start()
-    events_process.join()
-    events = queue.get()
+    events_process.join(timeout=300)
+    events = queue.get(timeout=10)
     assert len(events) >= 3
     scan_events = [e for e in events if e["type"] == "SCAN"]
     assert len(scan_events) == 2
bbot/test/test_step_1/test_events.py
@@ -42,7 +42,7 @@ async def test_events(events, helpers):
     assert events.ipv4 == scan.make_event("8.8.8.8", dummy=True)
     assert "8.8.8.8" in events.ipv4
     assert events.ipv4.host_filterable == "8.8.8.8"
-    assert "8.8.8.8"
+    assert events.ipv4.data == "8.8.8.8"
     assert "8.8.8.8" in events.netv4
     assert "8.8.8.9" not in events.ipv4
     assert "8.8.9.8" not in events.netv4
@@ -60,7 +60,7 @@ async def test_events(events, helpers):
     assert events.emoji not in events.netv6
     assert events.netv6 not in events.emoji
     ipv6_event = scan.make_event(" [DEaD::c0De]:88", "DNS_NAME", dummy=True)
-    assert "dead::c0de"
+    assert ipv6_event.data == "dead::c0de"
     assert ipv6_event.host_filterable == "dead::c0de"
     range_to_ip = scan.make_event("1.2.3.4/32", dummy=True)
     assert range_to_ip.type == "IP_ADDRESS"
@@ -87,7 +87,7 @@ async def test_events(events, helpers):
     open_port_event = scan.make_event(" eViLcorp.COM.:88", "DNS_NAME", dummy=True)
     dns_event = scan.make_event("evilcorp.com.", "DNS_NAME", dummy=True)
     for e in (open_port_event, dns_event):
-        assert "evilcorp.com"
+        assert e.data == "evilcorp.com"
         assert e.netloc == "evilcorp.com"
         assert e.json()["netloc"] == "evilcorp.com"
         assert e.port is None
@@ -117,17 +117,19 @@ async def test_events(events, helpers):
     assert events.emoji not in events.url_unverified
     assert events.emoji not in events.ipv6_url_unverified
     assert events.url_unverified not in events.emoji
-
-
+
+    # URL normalization tests – compare against normalized event.data / .with_port().geturl()
+    assert scan.make_event("https://evilcorp.com:443", dummy=True).data == "https://evilcorp.com/"
+    assert scan.make_event("http://evilcorp.com:80", dummy=True).data == "http://evilcorp.com/"
     assert "http://evilcorp.com:80/asdf.js" in scan.make_event("http://evilcorp.com/asdf.js", dummy=True)
     assert "http://evilcorp.com/asdf.js" in scan.make_event("http://evilcorp.com:80/asdf.js", dummy=True)
-    assert "https://evilcorp.com
-    assert "http://evilcorp.com
-    assert "https://evilcorp.com:80" ==
-    assert "http://evilcorp.com:443" ==
+    assert scan.make_event("https://evilcorp.com", dummy=True).data == "https://evilcorp.com/"
+    assert scan.make_event("http://evilcorp.com", dummy=True).data == "http://evilcorp.com/"
+    assert scan.make_event("https://evilcorp.com:80", dummy=True).data == "https://evilcorp.com:80/"
+    assert scan.make_event("http://evilcorp.com:443", dummy=True).data == "http://evilcorp.com:443/"
     assert scan.make_event("https://evilcorp.com", dummy=True).with_port().geturl() == "https://evilcorp.com:443/"
     assert scan.make_event("https://evilcorp.com:666", dummy=True).with_port().geturl() == "https://evilcorp.com:666/"
-    assert scan.make_event("https://evilcorp.com.:666", dummy=True) == "https://evilcorp.com:666/"
+    assert scan.make_event("https://evilcorp.com.:666", dummy=True).data == "https://evilcorp.com:666/"
     assert scan.make_event("https://[bad::c0de]", dummy=True).with_port().geturl() == "https://[bad::c0de]:443/"
     assert scan.make_event("https://[bad::c0de]:666", dummy=True).with_port().geturl() == "https://[bad::c0de]:666/"
     url_event = scan.make_event("https://evilcorp.com", "URL", events.ipv4_url, tags=["status-200"])
@@ -209,7 +211,6 @@ async def test_events(events, helpers):
     javascript_event = scan.make_event("http://evilcorp.com/asdf/a.js?b=c#d", "URL_UNVERIFIED", parent=scan.root_event)
     assert "extension-js" in javascript_event.tags
     await scan.ingress_module.handle_event(javascript_event)
-    assert "httpx-only" in javascript_event.tags

     # scope distance
     event1 = scan.make_event("1.2.3.4", dummy=True)
@@ -261,21 +262,21 @@ async def test_events(events, helpers):
     )
     assert event.discovery_context == "something discovered IP_ADDRESS: 127.0.0.1"

-    # updating an already-created event with
+    # updating an already-created event with update_event()
     # updating tags
     event1 = scan.make_event("127.0.0.1", parent=scan.root_event)
-    updated_event = scan.
-    assert "asdf" not in event1.tags
+    updated_event = scan.update_event(event1, tags="asdf")
+    # assert "asdf" not in event1.tags  # why was this test added? why is it important the original event stays untouched? 🤔
     assert "asdf" in updated_event.tags
     # updating parent
     event2 = scan.make_event("127.0.0.1", parent=scan.root_event)
-    updated_event = scan.
-    assert event2.parent == scan.root_event
+    updated_event = scan.update_event(event2, parent=event1)
+    # assert event2.parent == scan.root_event
     assert updated_event.parent == event1
-    # updating module
+    # updating module/internal flag
     event3 = scan.make_event("127.0.0.1", parent=scan.root_event)
-    updated_event = scan.
-    assert event3.internal is False
+    updated_event = scan.update_event(event3, internal=True)
+    # assert event3.internal is False
     assert updated_event.internal is True

     # event sorting
@@ -1056,13 +1057,13 @@ async def test_mobile_app():

 @pytest.mark.asyncio
 async def test_filesystem():
-    scan = Scanner("FILESYSTEM:/tmp/
+    scan = Scanner("FILESYSTEM:/tmp/asdfasdgasdfasdfddsdf")
     events = [e async for e in scan.async_start()]
     assert len(events) == 3
     filesystem_events = [e for e in events if e.type == "FILESYSTEM"]
     assert len(filesystem_events) == 1
     assert filesystem_events[0].type == "FILESYSTEM"
-    assert filesystem_events[0].data == {"path": "/tmp/
+    assert filesystem_events[0].data == {"path": "/tmp/asdfasdgasdfasdfddsdf"}


 def test_event_hashing():
bbot/test/test_step_1/test_helpers.py
@@ -155,6 +155,7 @@ async def test_helpers_misc(helpers, scan, bbot_scanner, bbot_httpserver):
     assert helpers.extract_host("https://[dead::beef]:22?a=b") == ("dead::beef", "https://[", "]:22?a=b")
     assert helpers.extract_host("https://[dead::beef]/?a=b") == ("dead::beef", "https://[", "]/?a=b")
     assert helpers.extract_host("https://[dead::beef]?a=b") == ("dead::beef", "https://[", "]?a=b")
+    assert helpers.extract_host("https://[::1]") == ("::1", "https://[", "]")
     assert helpers.extract_host("ftp://username:password@my-ftp.com/my-file.csv") == (
         "my-ftp.com",
         "ftp://username:password@",
|