bbot 2.5.0__py3-none-any.whl → 2.7.2.7424rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144)
  1. bbot/__init__.py +1 -1
  2. bbot/cli.py +22 -8
  3. bbot/core/engine.py +1 -1
  4. bbot/core/event/__init__.py +2 -2
  5. bbot/core/event/base.py +138 -110
  6. bbot/core/flags.py +1 -0
  7. bbot/core/helpers/bloom.py +6 -7
  8. bbot/core/helpers/command.py +5 -2
  9. bbot/core/helpers/depsinstaller/installer.py +78 -7
  10. bbot/core/helpers/dns/dns.py +0 -1
  11. bbot/core/helpers/dns/engine.py +0 -2
  12. bbot/core/helpers/files.py +2 -2
  13. bbot/core/helpers/git.py +17 -0
  14. bbot/core/helpers/helper.py +6 -5
  15. bbot/core/helpers/misc.py +15 -28
  16. bbot/core/helpers/names_generator.py +5 -0
  17. bbot/core/helpers/ntlm.py +0 -2
  18. bbot/core/helpers/regex.py +1 -1
  19. bbot/core/helpers/regexes.py +25 -8
  20. bbot/core/helpers/web/engine.py +1 -1
  21. bbot/core/helpers/web/web.py +2 -1
  22. bbot/core/modules.py +22 -60
  23. bbot/core/shared_deps.py +38 -0
  24. bbot/defaults.yml +4 -2
  25. bbot/modules/apkpure.py +2 -2
  26. bbot/modules/aspnet_bin_exposure.py +80 -0
  27. bbot/modules/baddns.py +1 -1
  28. bbot/modules/baddns_direct.py +1 -1
  29. bbot/modules/baddns_zone.py +1 -1
  30. bbot/modules/badsecrets.py +1 -1
  31. bbot/modules/base.py +129 -40
  32. bbot/modules/bucket_amazon.py +1 -1
  33. bbot/modules/bucket_digitalocean.py +1 -1
  34. bbot/modules/bucket_firebase.py +1 -1
  35. bbot/modules/bucket_google.py +1 -1
  36. bbot/modules/{bucket_azure.py → bucket_microsoft.py} +2 -2
  37. bbot/modules/builtwith.py +4 -2
  38. bbot/modules/c99.py +1 -1
  39. bbot/modules/dnsbimi.py +1 -4
  40. bbot/modules/dnsbrute.py +6 -1
  41. bbot/modules/dnscommonsrv.py +1 -0
  42. bbot/modules/dnsdumpster.py +35 -52
  43. bbot/modules/dnstlsrpt.py +0 -6
  44. bbot/modules/docker_pull.py +2 -2
  45. bbot/modules/emailformat.py +17 -1
  46. bbot/modules/ffuf.py +4 -1
  47. bbot/modules/ffuf_shortnames.py +6 -3
  48. bbot/modules/filedownload.py +8 -5
  49. bbot/modules/fullhunt.py +1 -1
  50. bbot/modules/git_clone.py +47 -22
  51. bbot/modules/gitdumper.py +5 -15
  52. bbot/modules/github_workflows.py +6 -5
  53. bbot/modules/gitlab_com.py +31 -0
  54. bbot/modules/gitlab_onprem.py +84 -0
  55. bbot/modules/gowitness.py +60 -30
  56. bbot/modules/graphql_introspection.py +145 -0
  57. bbot/modules/httpx.py +2 -0
  58. bbot/modules/hunt.py +10 -3
  59. bbot/modules/iis_shortnames.py +16 -7
  60. bbot/modules/internal/cloudcheck.py +65 -72
  61. bbot/modules/internal/unarchive.py +9 -3
  62. bbot/modules/lightfuzz/lightfuzz.py +6 -2
  63. bbot/modules/lightfuzz/submodules/esi.py +42 -0
  64. bbot/modules/{deadly/medusa.py → medusa.py} +4 -7
  65. bbot/modules/nuclei.py +2 -2
  66. bbot/modules/otx.py +9 -2
  67. bbot/modules/output/base.py +3 -11
  68. bbot/modules/paramminer_headers.py +10 -7
  69. bbot/modules/passivetotal.py +1 -1
  70. bbot/modules/portfilter.py +2 -0
  71. bbot/modules/portscan.py +1 -1
  72. bbot/modules/postman_download.py +2 -2
  73. bbot/modules/retirejs.py +232 -0
  74. bbot/modules/securitytxt.py +0 -3
  75. bbot/modules/sslcert.py +2 -2
  76. bbot/modules/subdomaincenter.py +1 -16
  77. bbot/modules/telerik.py +7 -2
  78. bbot/modules/templates/bucket.py +24 -4
  79. bbot/modules/templates/gitlab.py +98 -0
  80. bbot/modules/trufflehog.py +7 -4
  81. bbot/modules/wafw00f.py +2 -2
  82. bbot/presets/web/dotnet-audit.yml +1 -0
  83. bbot/presets/web/lightfuzz-heavy.yml +1 -1
  84. bbot/presets/web/lightfuzz-medium.yml +1 -1
  85. bbot/presets/web/lightfuzz-superheavy.yml +1 -1
  86. bbot/scanner/manager.py +44 -37
  87. bbot/scanner/scanner.py +17 -4
  88. bbot/scripts/benchmark_report.py +433 -0
  89. bbot/test/benchmarks/__init__.py +2 -0
  90. bbot/test/benchmarks/test_bloom_filter_benchmarks.py +105 -0
  91. bbot/test/benchmarks/test_closest_match_benchmarks.py +76 -0
  92. bbot/test/benchmarks/test_event_validation_benchmarks.py +438 -0
  93. bbot/test/benchmarks/test_excavate_benchmarks.py +291 -0
  94. bbot/test/benchmarks/test_ipaddress_benchmarks.py +143 -0
  95. bbot/test/benchmarks/test_weighted_shuffle_benchmarks.py +70 -0
  96. bbot/test/conftest.py +1 -1
  97. bbot/test/test_step_1/test_bbot_fastapi.py +2 -2
  98. bbot/test/test_step_1/test_events.py +22 -21
  99. bbot/test/test_step_1/test_helpers.py +20 -0
  100. bbot/test/test_step_1/test_manager_scope_accuracy.py +45 -0
  101. bbot/test/test_step_1/test_modules_basic.py +40 -15
  102. bbot/test/test_step_1/test_python_api.py +2 -2
  103. bbot/test/test_step_1/test_regexes.py +21 -4
  104. bbot/test/test_step_1/test_scan.py +7 -8
  105. bbot/test/test_step_1/test_web.py +46 -0
  106. bbot/test/test_step_2/module_tests/base.py +6 -1
  107. bbot/test/test_step_2/module_tests/test_module_aspnet_bin_exposure.py +73 -0
  108. bbot/test/test_step_2/module_tests/test_module_bucket_amazon.py +52 -18
  109. bbot/test/test_step_2/module_tests/test_module_bucket_google.py +1 -1
  110. bbot/test/test_step_2/module_tests/{test_module_bucket_azure.py → test_module_bucket_microsoft.py} +7 -5
  111. bbot/test/test_step_2/module_tests/test_module_cloudcheck.py +19 -31
  112. bbot/test/test_step_2/module_tests/test_module_dnsbimi.py +2 -1
  113. bbot/test/test_step_2/module_tests/test_module_dnsdumpster.py +3 -5
  114. bbot/test/test_step_2/module_tests/test_module_emailformat.py +1 -1
  115. bbot/test/test_step_2/module_tests/test_module_emails.py +2 -2
  116. bbot/test/test_step_2/module_tests/test_module_excavate.py +64 -5
  117. bbot/test/test_step_2/module_tests/test_module_extractous.py +13 -1
  118. bbot/test/test_step_2/module_tests/test_module_github_workflows.py +10 -1
  119. bbot/test/test_step_2/module_tests/test_module_gitlab_com.py +66 -0
  120. bbot/test/test_step_2/module_tests/{test_module_gitlab.py → test_module_gitlab_onprem.py} +4 -69
  121. bbot/test/test_step_2/module_tests/test_module_gowitness.py +5 -5
  122. bbot/test/test_step_2/module_tests/test_module_graphql_introspection.py +34 -0
  123. bbot/test/test_step_2/module_tests/test_module_iis_shortnames.py +46 -1
  124. bbot/test/test_step_2/module_tests/test_module_jadx.py +9 -0
  125. bbot/test/test_step_2/module_tests/test_module_lightfuzz.py +71 -3
  126. bbot/test/test_step_2/module_tests/test_module_nuclei.py +8 -6
  127. bbot/test/test_step_2/module_tests/test_module_otx.py +3 -0
  128. bbot/test/test_step_2/module_tests/test_module_portfilter.py +2 -0
  129. bbot/test/test_step_2/module_tests/test_module_retirejs.py +161 -0
  130. bbot/test/test_step_2/module_tests/test_module_telerik.py +1 -1
  131. bbot/test/test_step_2/module_tests/test_module_trufflehog.py +10 -1
  132. bbot/test/test_step_2/module_tests/test_module_unarchive.py +9 -0
  133. {bbot-2.5.0.dist-info → bbot-2.7.2.7424rc0.dist-info}/METADATA +12 -9
  134. {bbot-2.5.0.dist-info → bbot-2.7.2.7424rc0.dist-info}/RECORD +137 -124
  135. {bbot-2.5.0.dist-info → bbot-2.7.2.7424rc0.dist-info}/WHEEL +1 -1
  136. {bbot-2.5.0.dist-info → bbot-2.7.2.7424rc0.dist-info/licenses}/LICENSE +98 -58
  137. bbot/modules/binaryedge.py +0 -42
  138. bbot/modules/censys.py +0 -98
  139. bbot/modules/gitlab.py +0 -141
  140. bbot/modules/zoomeye.py +0 -77
  141. bbot/test/test_step_2/module_tests/test_module_binaryedge.py +0 -33
  142. bbot/test/test_step_2/module_tests/test_module_censys.py +0 -83
  143. bbot/test/test_step_2/module_tests/test_module_zoomeye.py +0 -35
  144. {bbot-2.5.0.dist-info → bbot-2.7.2.7424rc0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,105 @@
1
+ import pytest
2
+ import string
3
+ import random
4
+ from bbot.scanner import Scanner
5
+
6
+
7
+ class TestBloomFilterBenchmarks:
8
+ """
9
+ Benchmark tests for Bloom Filter operations.
10
+
11
+ These tests measure the performance of bloom filter operations which are
12
+ critical for DNS brute-forcing efficiency in BBOT.
13
+ """
14
+
15
+ def setup_method(self):
16
+ """Setup common test data"""
17
+ self.scan = Scanner()
18
+
19
+ # Generate test data of different sizes
20
+ self.items_small = self._generate_random_strings(1000) # 1K items
21
+ self.items_medium = self._generate_random_strings(10000) # 10K items
22
+
23
+ def _generate_random_strings(self, n, length=10):
24
+ """Generate a list of n random strings."""
25
+ # Slightly longer strings for testing performance difference
26
+ length = length + 2 # Make strings 2 chars longer
27
+ return ["".join(random.choices(string.ascii_letters + string.digits, k=length)) for _ in range(n)]
28
+
29
+ @pytest.mark.benchmark(group="bloom_filter_operations")
30
+ def test_bloom_filter_dns_mutation_tracking_performance(self, benchmark):
31
+ """Benchmark comprehensive bloom filter operations (add, check, mixed) for DNS brute-forcing"""
32
+
33
+ def comprehensive_bloom_operations():
34
+ bloom_filter = self.scan.helpers.bloom_filter(size=8000000) # 8M bits
35
+
36
+ # Phase 1: Add operations (simulating storing tried DNS mutations)
37
+ for item in self.items_small:
38
+ bloom_filter.add(item)
39
+
40
+ # Phase 2: Check operations (simulating lookup of existing mutations)
41
+ found_count = 0
42
+ for item in self.items_small:
43
+ if item in bloom_filter:
44
+ found_count += 1
45
+
46
+ # Phase 3: Mixed operations (realistic DNS brute-force simulation)
47
+ # Add new items while checking existing ones
48
+ for i, item in enumerate(self.items_medium[:500]): # Smaller subset for mixed ops
49
+ bloom_filter.add(item)
50
+ # Every few additions, check some existing items
51
+ if i % 10 == 0:
52
+ for check_item in self.items_small[i : i + 5]:
53
+ if check_item in bloom_filter:
54
+ found_count += 1
55
+
56
+ return {
57
+ "items_added": len(self.items_small) + 500,
58
+ "items_checked": found_count,
59
+ "bloom_size": bloom_filter.size,
60
+ }
61
+
62
+ result = benchmark(comprehensive_bloom_operations)
63
+ assert result["items_added"] > 1000
64
+ assert result["items_checked"] > 0
65
+
66
+ @pytest.mark.benchmark(group="bloom_filter_scalability")
67
+ def test_bloom_filter_large_scale_dns_brute_force(self, benchmark):
68
+ """Benchmark bloom filter performance with large-scale DNS brute-force simulation"""
69
+
70
+ def large_scale_simulation():
71
+ bloom_filter = self.scan.helpers.bloom_filter(size=8000000) # 8M bits
72
+
73
+ # Simulate a large DNS brute-force session
74
+ mutations_tried = 0
75
+ duplicate_attempts = 0
76
+
77
+ # Add all medium dataset (simulating 10K DNS mutations)
78
+ for item in self.items_medium:
79
+ bloom_filter.add(item)
80
+ mutations_tried += 1
81
+
82
+ # Simulate checking for duplicates during brute-force
83
+ for item in self.items_medium[:2000]: # Check subset for duplicates
84
+ if item in bloom_filter:
85
+ duplicate_attempts += 1
86
+
87
+ # Simulate adding more mutations with duplicate checking
88
+ for item in self.items_small:
89
+ if item not in bloom_filter: # Only add if not already tried
90
+ bloom_filter.add(item)
91
+ mutations_tried += 1
92
+ else:
93
+ duplicate_attempts += 1
94
+
95
+ return {
96
+ "total_mutations_tried": mutations_tried,
97
+ "duplicates_avoided": duplicate_attempts,
98
+ "efficiency_ratio": mutations_tried / (mutations_tried + duplicate_attempts)
99
+ if duplicate_attempts > 0
100
+ else 1.0,
101
+ }
102
+
103
+ result = benchmark(large_scale_simulation)
104
+ assert result["total_mutations_tried"] > 10000
105
+ assert result["efficiency_ratio"] > 0
@@ -0,0 +1,76 @@
1
+ import pytest
2
+ import random
3
+ from bbot.core.helpers.misc import closest_match
4
+
5
+
6
+ class TestClosestMatchBenchmarks:
7
+ """
8
+ Benchmark tests for closest_match operations.
9
+
10
+ This function is critical for BBOT's DNS brute forcing, where it finds the best
11
+ matching parent event among thousands of choices. Performance here directly impacts
12
+ scan throughput and DNS mutation efficiency.
13
+ """
14
+
15
+ def setup_method(self):
16
+ """Setup common test data"""
17
+ # Set deterministic seed for consistent benchmark results
18
+ random.seed(42) # Fixed seed for reproducible results
19
+
20
+ # Generate test data for benchmarks
21
+ self.large_closest_match_choices = self._generate_large_closest_match_choices()
22
+ self.realistic_closest_match_choices = self._generate_realistic_closest_match_choices()
23
+
24
+ def _generate_large_closest_match_choices(self):
25
+ """Generate large closest match dataset (stress test with many parent events)"""
26
+ choices = []
27
+ for i in range(10000):
28
+ # Generate realistic domain names with more variety
29
+ tld = random.choice(["com", "net", "org", "io", "co", "dev"])
30
+ domain = f"subdomain{i}.example{i % 100}.{tld}"
31
+ choices.append(domain)
32
+ return choices
33
+
34
+ def _generate_realistic_closest_match_choices(self):
35
+ """Generate realistic closest match parent event choices (like actual BBOT usage)"""
36
+ choices = []
37
+
38
+ # Common TLDs
39
+ tlds = ["com", "net", "org", "io", "co", "dev", "test", "local"]
40
+
41
+ # Generate parent domains with realistic patterns
42
+ for i in range(5000):
43
+ # Base domain patterns
44
+ if i % 10 == 0:
45
+ # Simple domains
46
+ domain = f"example{i}.{random.choice(tlds)}"
47
+ elif i % 5 == 0:
48
+ # Multi-level domains
49
+ domain = f"sub{i}.example{i}.{random.choice(tlds)}"
50
+ else:
51
+ # Complex domains
52
+ domain = f"level1{i}.level2{i}.example{i}.{random.choice(tlds)}"
53
+
54
+ choices.append(domain)
55
+
56
+ return choices
57
+
58
+ @pytest.mark.benchmark(group="closest_match")
59
+ def test_large_closest_match_lookup(self, benchmark):
60
+ """Benchmark closest_match with large closest match workload (many parent events)"""
61
+
62
+ def find_large_closest_match():
63
+ return closest_match("subdomain5678.example50.com", self.large_closest_match_choices)
64
+
65
+ result = benchmark.pedantic(find_large_closest_match, iterations=50, rounds=10)
66
+ assert result is not None
67
+
68
+ @pytest.mark.benchmark(group="closest_match")
69
+ def test_realistic_closest_match_workload(self, benchmark):
70
+ """Benchmark closest_match with realistic BBOT closest match parent event choices"""
71
+
72
+ def find_realistic_closest_match():
73
+ return closest_match("subdomain123.example5.com", self.realistic_closest_match_choices)
74
+
75
+ result = benchmark.pedantic(find_realistic_closest_match, iterations=50, rounds=10)
76
+ assert result is not None
@@ -0,0 +1,438 @@
1
+ import pytest
2
+ import random
3
+ import string
4
+ from bbot.scanner import Scanner
5
+ from bbot.core.event.base import make_event
6
+
7
+
8
+ class TestEventValidationBenchmarks:
9
+ def setup_method(self):
10
+ """Setup minimal scanner configuration for benchmarking event validation"""
11
+ # Set deterministic random seed for reproducible benchmarks
12
+ random.seed(42)
13
+
14
+ # Create a minimal scanner with no modules to isolate event validation performance
15
+ self.scanner_config = {
16
+ "modules": None, # No modules to avoid overhead
17
+ "output_modules": None, # No output modules
18
+ "dns": {"disable": True}, # Disable DNS to avoid network calls
19
+ "web": {"http_timeout": 1}, # Minimal timeouts
20
+ }
21
+
22
+ def _generate_diverse_targets(self, count=1000):
23
+ """Generate a diverse set of targets that will trigger different event type auto-detection"""
24
+ # Use deterministic random state for reproducible target generation
25
+ rng = random.Random(42)
26
+ targets = []
27
+
28
+ # DNS Names (various formats)
29
+ subdomains = ["www", "api", "mail", "ftp", "admin", "test", "dev", "staging", "blog"]
30
+ tlds = ["com", "org", "net", "io", "co.uk", "de", "fr", "jp"]
31
+
32
+ for _ in range(count // 10):
33
+ # Standard domains
34
+ targets.append(
35
+ f"{rng.choice(subdomains)}.{rng.choice(['example', 'test', 'evilcorp'])}.{rng.choice(tlds)}"
36
+ )
37
+ # Bare domains
38
+ targets.append(f"{rng.choice(['example', 'test', 'company'])}.{rng.choice(tlds)}")
39
+
40
+ # IP Addresses (IPv4 and IPv6)
41
+ for _ in range(count // 15):
42
+ # IPv4
43
+ targets.append(f"{rng.randint(1, 254)}.{rng.randint(1, 254)}.{rng.randint(1, 254)}.{rng.randint(1, 254)}")
44
+ # IPv6
45
+ targets.append(f"2001:db8::{rng.randint(1, 9999):x}:{rng.randint(1, 9999):x}")
46
+
47
+ # IP Ranges
48
+ for _ in range(count // 20):
49
+ targets.append(f"192.168.{rng.randint(1, 254)}.0/24")
50
+ targets.append(f"10.0.{rng.randint(1, 254)}.0/24")
51
+
52
+ # URLs (only supported schemes: http, https)
53
+ url_schemes = ["http", "https"] # Only schemes supported by BBOT auto-detection
54
+ url_paths = ["", "/", "/admin", "/api/v1", "/login.php", "/index.html"]
55
+ for _ in range(count // 8):
56
+ scheme = rng.choice(url_schemes)
57
+ domain = f"{rng.choice(subdomains)}.example.{rng.choice(tlds)}"
58
+ path = rng.choice(url_paths)
59
+ port = rng.choice(["", ":8080", ":443", ":80", ":8443"])
60
+ targets.append(f"{scheme}://{domain}{port}{path}")
61
+
62
+ # Open Ports
63
+ ports = [80, 443, 22, 21, 25, 53, 110, 143, 993, 995, 8080, 8443, 3389]
64
+ for _ in range(count // 12):
65
+ domain = f"example.{rng.choice(tlds)}"
66
+ port = rng.choice(ports)
67
+ targets.append(f"{domain}:{port}")
68
+ # IPv4 with port
69
+ ip = f"{rng.randint(1, 254)}.{rng.randint(1, 254)}.{rng.randint(1, 254)}.{rng.randint(1, 254)}"
70
+ targets.append(f"{ip}:{port}")
71
+
72
+ # Email Addresses
73
+ email_domains = ["example.com", "test.org", "company.net"]
74
+ email_users = ["admin", "test", "info", "contact", "support", "sales"]
75
+ for _ in range(count // 15):
76
+ user = rng.choice(email_users)
77
+ domain = rng.choice(email_domains)
78
+ targets.append(f"{user}@{domain}")
79
+ # Plus addressing
80
+ targets.append(f"{user}+{rng.randint(1, 999)}@{domain}")
81
+
82
+ # Mixed/Edge cases that should trigger auto-detection logic
83
+ edge_cases = [
84
+ # Localhost variants
85
+ "localhost",
86
+ "127.0.0.1",
87
+ "::1",
88
+ # Punycode domains
89
+ "xn--e1afmkfd.xn--p1ai",
90
+ "xn--fiqs8s.xn--0zwm56d",
91
+ # Long domains (shortened to avoid issues)
92
+ "very-long-subdomain-name-for-testing.test.com",
93
+ # IP with ports
94
+ "192.168.1.1",
95
+ "10.0.0.1:80",
96
+ # URLs with parameters
97
+ "https://example.com/search?q=test&limit=10",
98
+ "http://api.example.com:8080/v1/users?format=json",
99
+ # More standard domains for better compatibility
100
+ "api.test.com",
101
+ "mail.example.org",
102
+ "secure.company.net",
103
+ ]
104
+ targets.extend(edge_cases)
105
+
106
+ # Fill remainder with random variations
107
+ remaining = count - len(targets)
108
+ if remaining > 0:
109
+ for _ in range(remaining):
110
+ choice = rng.randint(1, 4)
111
+ if choice == 1:
112
+ # Random domain
113
+ targets.append(f"{''.join(rng.choices(string.ascii_lowercase, k=8))}.com")
114
+ elif choice == 2:
115
+ # Random IP
116
+ targets.append(
117
+ f"{rng.randint(1, 254)}.{rng.randint(1, 254)}.{rng.randint(1, 254)}.{rng.randint(1, 254)}"
118
+ )
119
+ elif choice == 3:
120
+ # Random URL
121
+ targets.append(f"https://{''.join(rng.choices(string.ascii_lowercase, k=8))}.com/path")
122
+ else:
123
+ # Random email
124
+ targets.append(f"{''.join(rng.choices(string.ascii_lowercase, k=8))}@example.com")
125
+
126
+ # Ensure we have exactly the requested count by removing duplicates and filling as needed
127
+ unique_targets = list(set(targets))
128
+
129
+ # If we have too few unique targets, generate more
130
+ while len(unique_targets) < count:
131
+ additional_target = f"filler{len(unique_targets)}.example.com"
132
+ if additional_target not in unique_targets:
133
+ unique_targets.append(additional_target)
134
+
135
+ # Return exactly the requested number of unique targets
136
+ return unique_targets[:count]
137
+
138
+ def _generate_diverse_event_data(self, count=1000):
139
+ """Generate diverse event data that will trigger different auto-detection paths in make_event"""
140
+ # Use deterministic random state for reproducible data generation
141
+ rng = random.Random(42)
142
+ event_data = []
143
+
144
+ # DNS Names (various formats)
145
+ subdomains = ["www", "api", "mail", "ftp", "admin", "test", "dev", "staging", "blog"]
146
+ tlds = ["com", "org", "net", "io", "co.uk", "de", "fr", "jp"]
147
+
148
+ for _ in range(count // 10):
149
+ # Standard domains
150
+ event_data.append(
151
+ f"{rng.choice(subdomains)}.{rng.choice(['example', 'test', 'evilcorp'])}.{rng.choice(tlds)}"
152
+ )
153
+ # Bare domains
154
+ event_data.append(f"{rng.choice(['example', 'test', 'company'])}.{rng.choice(tlds)}")
155
+
156
+ # IP Addresses (IPv4 and IPv6)
157
+ for _ in range(count // 15):
158
+ # IPv4
159
+ event_data.append(
160
+ f"{rng.randint(1, 254)}.{rng.randint(1, 254)}.{rng.randint(1, 254)}.{rng.randint(1, 254)}"
161
+ )
162
+ # IPv6
163
+ event_data.append(f"2001:db8::{rng.randint(1, 9999):x}:{rng.randint(1, 9999):x}")
164
+
165
+ # IP Ranges
166
+ for _ in range(count // 20):
167
+ event_data.append(f"192.168.{rng.randint(1, 254)}.0/24")
168
+ event_data.append(f"10.0.{rng.randint(1, 254)}.0/24")
169
+
170
+ # URLs (HTTP/HTTPS)
171
+ url_schemes = ["http", "https"]
172
+ url_paths = ["", "/", "/admin", "/api/v1", "/login.php", "/index.html"]
173
+ for _ in range(count // 8):
174
+ scheme = rng.choice(url_schemes)
175
+ domain = f"{rng.choice(subdomains)}.example.{rng.choice(tlds)}"
176
+ path = rng.choice(url_paths)
177
+ port = rng.choice(["", ":8080", ":443", ":80", ":8443"])
178
+ event_data.append(f"{scheme}://{domain}{port}{path}")
179
+
180
+ # Open Ports
181
+ ports = [80, 443, 22, 21, 25, 53, 110, 143, 993, 995, 8080, 8443, 3389]
182
+ for _ in range(count // 12):
183
+ domain = f"example.{rng.choice(tlds)}"
184
+ port = rng.choice(ports)
185
+ event_data.append(f"{domain}:{port}")
186
+ # IPv4 with port
187
+ ip = f"{rng.randint(1, 254)}.{rng.randint(1, 254)}.{rng.randint(1, 254)}.{rng.randint(1, 254)}"
188
+ event_data.append(f"{ip}:{port}")
189
+
190
+ # Email Addresses
191
+ email_domains = ["example.com", "test.org", "company.net"]
192
+ email_users = ["admin", "test", "info", "contact", "support", "sales"]
193
+ for _ in range(count // 15):
194
+ user = rng.choice(email_users)
195
+ domain = rng.choice(email_domains)
196
+ event_data.append(f"{user}@{domain}")
197
+ # Plus addressing
198
+ event_data.append(f"{user}+{rng.randint(1, 999)}@{domain}")
199
+
200
+ # Mixed/Edge cases that test auto-detection logic
201
+ edge_cases = [
202
+ # Localhost variants
203
+ "localhost",
204
+ "127.0.0.1",
205
+ "::1",
206
+ # Punycode domains
207
+ "xn--e1afmkfd.xn--p1ai",
208
+ "xn--fiqs8s.xn--0zwm56d",
209
+ # Long domains
210
+ "very-long-subdomain-name-for-testing.test.com",
211
+ # IP with ports
212
+ "192.168.1.1",
213
+ "10.0.0.1:80",
214
+ # URLs with parameters
215
+ "https://example.com/search?q=test&limit=10",
216
+ "http://api.example.com:8080/v1/users?format=json",
217
+ # Standard domains for better compatibility
218
+ "api.test.com",
219
+ "mail.example.org",
220
+ "secure.company.net",
221
+ ]
222
+ event_data.extend(edge_cases)
223
+
224
+ # Fill remainder with random variations
225
+ remaining = count - len(event_data)
226
+ if remaining > 0:
227
+ for _ in range(remaining):
228
+ choice = rng.randint(1, 4)
229
+ if choice == 1:
230
+ # Random domain
231
+ event_data.append(f"{''.join(rng.choices(string.ascii_lowercase, k=8))}.com")
232
+ elif choice == 2:
233
+ # Random IP
234
+ event_data.append(
235
+ f"{rng.randint(1, 254)}.{rng.randint(1, 254)}.{rng.randint(1, 254)}.{rng.randint(1, 254)}"
236
+ )
237
+ elif choice == 3:
238
+ # Random URL
239
+ event_data.append(f"https://{''.join(rng.choices(string.ascii_lowercase, k=8))}.com/path")
240
+ else:
241
+ # Random email
242
+ event_data.append(f"{''.join(rng.choices(string.ascii_lowercase, k=8))}@example.com")
243
+
244
+ # Ensure we have exactly the requested count by removing duplicates and filling as needed
245
+ unique_data = list(set(event_data))
246
+
247
+ # If we have too few unique entries, generate more
248
+ while len(unique_data) < count:
249
+ additional_data = f"filler{len(unique_data)}.example.com"
250
+ if additional_data not in unique_data:
251
+ unique_data.append(additional_data)
252
+
253
+ # Return exactly the requested number of unique data items
254
+ return unique_data[:count]
255
+
256
+ @pytest.mark.benchmark(group="event_validation_scan_startup_small")
257
+ def test_event_validation_full_scan_startup_small_batch(self, benchmark):
258
+ """Benchmark full scan startup event validation with small batch (100 targets) for quick iteration"""
259
+ targets = self._generate_diverse_targets(100)
260
+
261
+ def validate_event_batch():
262
+ scan = Scanner(*targets, config=self.scanner_config)
263
+ # Count successful event creations and types detected
264
+ event_counts = {}
265
+ total_events = 0
266
+
267
+ for event_seed in scan.target.seeds:
268
+ event_type = event_seed.type
269
+ event_counts[event_type] = event_counts.get(event_type, 0) + 1
270
+ total_events += 1
271
+
272
+ return {
273
+ "total_events_processed": total_events,
274
+ "unique_event_types": len(event_counts),
275
+ "event_type_breakdown": event_counts,
276
+ "targets_input": len(targets),
277
+ }
278
+
279
+ result = benchmark(validate_event_batch)
280
+ assert result["total_events_processed"] == result["targets_input"] # Should process ALL targets
281
+ assert result["unique_event_types"] >= 3 # Should detect at least DNS_NAME, IP_ADDRESS, URL
282
+
283
+ @pytest.mark.benchmark(group="event_validation_scan_startup_large")
284
+ def test_event_validation_full_scan_startup_large_batch(self, benchmark):
285
+ """Benchmark full scan startup event validation with large batch (1000 targets) for comprehensive testing"""
286
+ targets = self._generate_diverse_targets(1000)
287
+
288
+ def validate_large_batch():
289
+ scan = Scanner(*targets, config=self.scanner_config)
290
+
291
+ # Comprehensive analysis of validation pipeline performance
292
+ validation_metrics = {
293
+ "targets_input": len(targets),
294
+ "events_created": 0,
295
+ "validation_errors": 0,
296
+ "auto_detection_success": 0,
297
+ "type_distribution": {},
298
+ "processing_efficiency": 0.0,
299
+ }
300
+
301
+ try:
302
+ for event_seed in scan.target.seeds:
303
+ validation_metrics["events_created"] += 1
304
+ event_type = event_seed.type
305
+
306
+ if event_type not in validation_metrics["type_distribution"]:
307
+ validation_metrics["type_distribution"][event_type] = 0
308
+ validation_metrics["type_distribution"][event_type] += 1
309
+
310
+ # If we got a valid event type, auto-detection succeeded
311
+ if event_type and event_type != "UNKNOWN":
312
+ validation_metrics["auto_detection_success"] += 1
313
+
314
+ except Exception:
315
+ validation_metrics["validation_errors"] += 1
316
+
317
+ # Calculate efficiency ratio
318
+ if validation_metrics["targets_input"] > 0:
319
+ validation_metrics["processing_efficiency"] = (
320
+ validation_metrics["events_created"] / validation_metrics["targets_input"]
321
+ )
322
+
323
+ return validation_metrics
324
+
325
+ result = benchmark(validate_large_batch)
326
+ assert result["events_created"] == result["targets_input"] # Should process ALL targets successfully
327
+ assert result["processing_efficiency"] == 1.0 # 100% success rate
328
+ assert len(result["type_distribution"]) >= 5 # Should detect multiple event types
329
+
330
+ @pytest.mark.benchmark(group="make_event_small")
331
+ def test_make_event_autodetection_small(self, benchmark):
332
+ """Benchmark make_event with auto-detection for small batch (100 items)"""
333
+ event_data = self._generate_diverse_event_data(100)
334
+
335
+ def create_events_with_autodetection():
336
+ events_created = []
337
+ type_distribution = {}
338
+ validation_errors = 0
339
+
340
+ for data in event_data:
341
+ try:
342
+ # Test auto-detection by not providing event_type
343
+ event = make_event(data, dummy=True)
344
+ events_created.append(event)
345
+
346
+ event_type = event.type
347
+ type_distribution[event_type] = type_distribution.get(event_type, 0) + 1
348
+
349
+ except Exception:
350
+ validation_errors += 1
351
+
352
+ return {
353
+ "events_created": len(events_created),
354
+ "type_distribution": type_distribution,
355
+ "validation_errors": validation_errors,
356
+ "autodetection_success_rate": len(events_created) / len(event_data) if event_data else 0,
357
+ }
358
+
359
+ result = benchmark.pedantic(create_events_with_autodetection, iterations=50, rounds=10)
360
+ assert result["events_created"] == len(event_data) # Should create events for all data
361
+ assert result["validation_errors"] == 0 # Should have no validation errors
362
+ assert len(result["type_distribution"]) >= 3 # Should detect multiple event types
363
+ assert result["autodetection_success_rate"] == 1.0 # 100% success rate
364
+
365
+ @pytest.mark.benchmark(group="make_event_large")
366
+ def test_make_event_autodetection_large(self, benchmark):
367
+ """Benchmark make_event with auto-detection for large batch (1000 items)"""
368
+ event_data = self._generate_diverse_event_data(1000)
369
+
370
+ def create_large_event_batch():
371
+ performance_metrics = {
372
+ "total_processed": len(event_data),
373
+ "events_created": 0,
374
+ "autodetection_failures": 0,
375
+ "type_distribution": {},
376
+ "processing_efficiency": 0.0,
377
+ }
378
+
379
+ for data in event_data:
380
+ try:
381
+ # Use dummy=True for performance (no scan/parent validation)
382
+ event = make_event(data, dummy=True)
383
+ performance_metrics["events_created"] += 1
384
+
385
+ event_type = event.type
386
+ if event_type not in performance_metrics["type_distribution"]:
387
+ performance_metrics["type_distribution"][event_type] = 0
388
+ performance_metrics["type_distribution"][event_type] += 1
389
+
390
+ except Exception:
391
+ performance_metrics["autodetection_failures"] += 1
392
+
393
+ # Calculate efficiency ratio
394
+ performance_metrics["processing_efficiency"] = (
395
+ performance_metrics["events_created"] / performance_metrics["total_processed"]
396
+ )
397
+
398
+ return performance_metrics
399
+
400
+ result = benchmark.pedantic(create_large_event_batch, iterations=50, rounds=10)
401
+ assert result["events_created"] == result["total_processed"] # Should process all successfully
402
+ assert result["autodetection_failures"] == 0 # Should have no failures
403
+ assert result["processing_efficiency"] == 1.0 # 100% efficiency
404
+ assert len(result["type_distribution"]) >= 5 # Should detect multiple event types
405
+
406
+ @pytest.mark.benchmark(group="make_event_explicit_types")
407
+ def test_make_event_explicit_types(self, benchmark):
408
+ """Benchmark make_event when event types are explicitly provided (no auto-detection)"""
409
+ # Create data with explicit type mappings to bypass auto-detection
410
+ test_cases = [
411
+ ("example.com", "DNS_NAME"),
412
+ ("192.168.1.1", "IP_ADDRESS"),
413
+ ("https://example.com", "URL"),
414
+ ("admin@example.com", "EMAIL_ADDRESS"),
415
+ ("example.com:80", "OPEN_TCP_PORT"),
416
+ ] * 20 # 100 total cases
417
+
418
+ def create_events_explicit_types():
419
+ events_created = []
420
+ type_distribution = {}
421
+
422
+ for data, event_type in test_cases:
423
+ # Explicitly provide event_type to skip auto-detection
424
+ event = make_event(data, event_type=event_type, dummy=True)
425
+ events_created.append(event)
426
+
427
+ type_distribution[event_type] = type_distribution.get(event_type, 0) + 1
428
+
429
+ return {
430
+ "events_created": len(events_created),
431
+ "type_distribution": type_distribution,
432
+ "bypass_autodetection": True,
433
+ }
434
+
435
+ result = benchmark.pedantic(create_events_explicit_types, iterations=50, rounds=10)
436
+ assert result["events_created"] == len(test_cases) # Should create all events
437
+ assert result["bypass_autodetection"] # Confirms we bypassed auto-detection
438
+ assert len(result["type_distribution"]) == 5 # Should have exactly 5 types