oscura-0.7.0-py3-none-any.whl → oscura-0.10.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oscura/__init__.py +19 -19
- oscura/analyzers/__init__.py +2 -0
- oscura/analyzers/digital/extraction.py +2 -3
- oscura/analyzers/digital/quality.py +1 -1
- oscura/analyzers/digital/timing.py +1 -1
- oscura/analyzers/eye/__init__.py +5 -1
- oscura/analyzers/eye/generation.py +501 -0
- oscura/analyzers/jitter/__init__.py +6 -6
- oscura/analyzers/jitter/timing.py +419 -0
- oscura/analyzers/patterns/__init__.py +94 -0
- oscura/analyzers/patterns/reverse_engineering.py +991 -0
- oscura/analyzers/power/__init__.py +35 -12
- oscura/analyzers/power/basic.py +3 -3
- oscura/analyzers/power/soa.py +1 -1
- oscura/analyzers/power/switching.py +3 -3
- oscura/analyzers/signal_classification.py +529 -0
- oscura/analyzers/signal_integrity/sparams.py +3 -3
- oscura/analyzers/statistics/__init__.py +4 -0
- oscura/analyzers/statistics/basic.py +152 -0
- oscura/analyzers/statistics/correlation.py +47 -6
- oscura/analyzers/validation.py +1 -1
- oscura/analyzers/waveform/__init__.py +2 -0
- oscura/analyzers/waveform/measurements.py +329 -163
- oscura/analyzers/waveform/measurements_with_uncertainty.py +91 -35
- oscura/analyzers/waveform/spectral.py +498 -54
- oscura/api/dsl/commands.py +15 -6
- oscura/api/server/templates/base.html +137 -146
- oscura/api/server/templates/export.html +84 -110
- oscura/api/server/templates/home.html +248 -267
- oscura/api/server/templates/protocols.html +44 -48
- oscura/api/server/templates/reports.html +27 -35
- oscura/api/server/templates/session_detail.html +68 -78
- oscura/api/server/templates/sessions.html +62 -72
- oscura/api/server/templates/waveforms.html +54 -64
- oscura/automotive/__init__.py +1 -1
- oscura/automotive/can/session.py +1 -1
- oscura/automotive/dbc/generator.py +638 -23
- oscura/automotive/dtc/data.json +102 -17
- oscura/automotive/uds/decoder.py +99 -6
- oscura/cli/analyze.py +8 -2
- oscura/cli/batch.py +36 -5
- oscura/cli/characterize.py +18 -4
- oscura/cli/export.py +47 -5
- oscura/cli/main.py +2 -0
- oscura/cli/onboarding/wizard.py +10 -6
- oscura/cli/pipeline.py +585 -0
- oscura/cli/visualize.py +6 -4
- oscura/convenience.py +400 -32
- oscura/core/config/loader.py +0 -1
- oscura/core/measurement_result.py +286 -0
- oscura/core/progress.py +1 -1
- oscura/core/schemas/device_mapping.json +8 -2
- oscura/core/schemas/packet_format.json +24 -4
- oscura/core/schemas/protocol_definition.json +12 -2
- oscura/core/types.py +300 -199
- oscura/correlation/multi_protocol.py +1 -1
- oscura/export/legacy/__init__.py +11 -0
- oscura/export/legacy/wav.py +75 -0
- oscura/exporters/__init__.py +19 -0
- oscura/exporters/wireshark.py +809 -0
- oscura/hardware/acquisition/file.py +5 -19
- oscura/hardware/acquisition/saleae.py +10 -10
- oscura/hardware/acquisition/socketcan.py +4 -6
- oscura/hardware/acquisition/synthetic.py +1 -5
- oscura/hardware/acquisition/visa.py +6 -6
- oscura/hardware/security/side_channel_detector.py +5 -508
- oscura/inference/message_format.py +686 -1
- oscura/jupyter/display.py +2 -2
- oscura/jupyter/magic.py +3 -3
- oscura/loaders/__init__.py +17 -12
- oscura/loaders/binary.py +1 -1
- oscura/loaders/chipwhisperer.py +1 -2
- oscura/loaders/configurable.py +1 -1
- oscura/loaders/csv_loader.py +2 -2
- oscura/loaders/hdf5_loader.py +1 -1
- oscura/loaders/lazy.py +6 -1
- oscura/loaders/mmap_loader.py +0 -1
- oscura/loaders/numpy_loader.py +8 -7
- oscura/loaders/preprocessing.py +3 -5
- oscura/loaders/rigol.py +21 -7
- oscura/loaders/sigrok.py +2 -5
- oscura/loaders/tdms.py +3 -2
- oscura/loaders/tektronix.py +38 -32
- oscura/loaders/tss.py +20 -27
- oscura/loaders/vcd.py +13 -8
- oscura/loaders/wav.py +1 -6
- oscura/pipeline/__init__.py +76 -0
- oscura/pipeline/handlers/__init__.py +165 -0
- oscura/pipeline/handlers/analyzers.py +1045 -0
- oscura/pipeline/handlers/decoders.py +899 -0
- oscura/pipeline/handlers/exporters.py +1103 -0
- oscura/pipeline/handlers/filters.py +891 -0
- oscura/pipeline/handlers/loaders.py +640 -0
- oscura/pipeline/handlers/transforms.py +768 -0
- oscura/reporting/__init__.py +88 -1
- oscura/reporting/automation.py +348 -0
- oscura/reporting/citations.py +374 -0
- oscura/reporting/core.py +54 -0
- oscura/reporting/formatting/__init__.py +11 -0
- oscura/reporting/formatting/measurements.py +320 -0
- oscura/reporting/html.py +57 -0
- oscura/reporting/interpretation.py +431 -0
- oscura/reporting/summary.py +329 -0
- oscura/reporting/templates/enhanced/protocol_re.html +504 -503
- oscura/reporting/visualization.py +542 -0
- oscura/side_channel/__init__.py +38 -57
- oscura/utils/builders/signal_builder.py +5 -5
- oscura/utils/comparison/compare.py +7 -9
- oscura/utils/comparison/golden.py +1 -1
- oscura/utils/filtering/convenience.py +2 -2
- oscura/utils/math/arithmetic.py +38 -62
- oscura/utils/math/interpolation.py +20 -20
- oscura/utils/pipeline/__init__.py +4 -17
- oscura/utils/progressive.py +1 -4
- oscura/utils/triggering/edge.py +1 -1
- oscura/utils/triggering/pattern.py +2 -2
- oscura/utils/triggering/pulse.py +2 -2
- oscura/utils/triggering/window.py +3 -3
- oscura/validation/hil_testing.py +11 -11
- oscura/visualization/__init__.py +47 -284
- oscura/visualization/batch.py +160 -0
- oscura/visualization/plot.py +542 -53
- oscura/visualization/styles.py +184 -318
- oscura/workflows/__init__.py +2 -0
- oscura/workflows/batch/advanced.py +1 -1
- oscura/workflows/batch/aggregate.py +7 -8
- oscura/workflows/complete_re.py +251 -23
- oscura/workflows/digital.py +27 -4
- oscura/workflows/multi_trace.py +136 -17
- oscura/workflows/waveform.py +788 -0
- {oscura-0.7.0.dist-info → oscura-0.10.0.dist-info}/METADATA +59 -79
- {oscura-0.7.0.dist-info → oscura-0.10.0.dist-info}/RECORD +135 -149
- oscura/side_channel/dpa.py +0 -1025
- oscura/utils/optimization/__init__.py +0 -19
- oscura/utils/optimization/parallel.py +0 -443
- oscura/utils/optimization/search.py +0 -532
- oscura/utils/pipeline/base.py +0 -338
- oscura/utils/pipeline/composition.py +0 -248
- oscura/utils/pipeline/parallel.py +0 -449
- oscura/utils/pipeline/pipeline.py +0 -375
- oscura/utils/search/__init__.py +0 -16
- oscura/utils/search/anomaly.py +0 -424
- oscura/utils/search/context.py +0 -294
- oscura/utils/search/pattern.py +0 -288
- oscura/utils/storage/__init__.py +0 -61
- oscura/utils/storage/database.py +0 -1166
- oscura/visualization/accessibility.py +0 -526
- oscura/visualization/annotations.py +0 -371
- oscura/visualization/axis_scaling.py +0 -305
- oscura/visualization/colors.py +0 -451
- oscura/visualization/digital.py +0 -436
- oscura/visualization/eye.py +0 -571
- oscura/visualization/histogram.py +0 -281
- oscura/visualization/interactive.py +0 -1035
- oscura/visualization/jitter.py +0 -1042
- oscura/visualization/keyboard.py +0 -394
- oscura/visualization/layout.py +0 -400
- oscura/visualization/optimization.py +0 -1079
- oscura/visualization/palettes.py +0 -446
- oscura/visualization/power.py +0 -508
- oscura/visualization/power_extended.py +0 -955
- oscura/visualization/presets.py +0 -469
- oscura/visualization/protocols.py +0 -1246
- oscura/visualization/render.py +0 -223
- oscura/visualization/rendering.py +0 -444
- oscura/visualization/reverse_engineering.py +0 -838
- oscura/visualization/signal_integrity.py +0 -989
- oscura/visualization/specialized.py +0 -643
- oscura/visualization/spectral.py +0 -1226
- oscura/visualization/thumbnails.py +0 -340
- oscura/visualization/time_axis.py +0 -351
- oscura/visualization/waveform.py +0 -454
- {oscura-0.7.0.dist-info → oscura-0.10.0.dist-info}/WHEEL +0 -0
- {oscura-0.7.0.dist-info → oscura-0.10.0.dist-info}/entry_points.txt +0 -0
- {oscura-0.7.0.dist-info → oscura-0.10.0.dist-info}/licenses/LICENSE +0 -0
oscura/analyzers/patterns/reverse_engineering.py
@@ -0,0 +1,991 @@
"""Comprehensive reverse engineering toolkit for binary data and protocols.

This module provides a complete toolkit for reverse engineering unknown binary
protocols and data formats by integrating pattern analysis, entropy analysis,
field inference, and data classification.

Key capabilities:
- Pattern discovery and motif extraction
- N-gram frequency analysis for fingerprinting
- Signature and delimiter discovery
- Binary regex and multi-pattern search
- Fuzzy/approximate matching
- Anomaly detection
- Entropy analysis and crypto detection
- Data type classification (encrypted, compressed, structured)
- Field boundary inference
- Delimiter and length prefix detection
- Checksum field detection

Example workflow:
    >>> from oscura.analyzers.patterns.reverse_engineering import ReverseEngineer
    >>> re_tool = ReverseEngineer()
    >>>
    >>> # Analyze unknown binary data
    >>> analysis = re_tool.analyze_binary(unknown_data)
    >>> print(f"Data type: {analysis.data_type}")
    >>> print(f"Entropy: {analysis.entropy:.2f} bits/byte")
    >>> print(f"Detected signatures: {analysis.signatures}")
    >>>
    >>> # Infer protocol structure
    >>> messages = [msg1, msg2, msg3, ...]
    >>> structure = re_tool.infer_protocol_structure(messages)
    >>> for field in structure.fields:
    ...     print(f"Field at offset {field.offset}: {field.field_type}")

Author: Oscura Development Team
"""

from __future__ import annotations

import logging
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any

import numpy as np

# Import from existing modules
from oscura.analyzers.entropy import CryptoDetector, EntropyResult
from oscura.analyzers.patterns.discovery import (
    CandidateSignature,
    SignatureDiscovery,
)
from oscura.analyzers.patterns.matching import (
    FuzzyMatcher,
)
from oscura.analyzers.patterns.periodic import detect_period
from oscura.analyzers.patterns.sequences import (
    find_repeating_sequences,
)
from oscura.analyzers.statistical.ngrams import (
    NGramAnalyzer,
)

if TYPE_CHECKING:
    from numpy.typing import NDArray

logger = logging.getLogger(__name__)


@dataclass
class FieldDescriptor:
    """Descriptor for an inferred protocol field.

    Attributes:
        offset: Byte offset from start of message.
        length: Field length in bytes.
        field_type: Inferred type (fixed, variable, length, checksum, payload, etc.).
        entropy: Average entropy of field across messages.
        is_constant: True if field has same value across all messages.
        constant_value: Value if is_constant is True.
        description: Human-readable description.
    """

    offset: int
    length: int
    field_type: str
    entropy: float
    is_constant: bool = False
    constant_value: bytes | None = None
    description: str = ""


@dataclass
class ProtocolStructure:
    """Inferred protocol message structure.

    Attributes:
        message_length: Fixed message length, or -1 for variable.
        fields: List of inferred fields.
        delimiter: Detected delimiter bytes, if any.
        length_prefix_offset: Offset of length prefix field, if detected.
        checksum_offset: Offset of checksum field, if detected.
        payload_offset: Offset of variable payload, if detected.
        confidence: Overall confidence in structure inference (0.0-1.0).
    """

    message_length: int
    fields: list[FieldDescriptor] = field(default_factory=list)
    delimiter: bytes | None = None
    length_prefix_offset: int | None = None
    checksum_offset: int | None = None
    payload_offset: int | None = None
    confidence: float = 0.0


@dataclass
class BinaryAnalysisResult:
    """Complete analysis result for binary data.

    Attributes:
        data_type: Classified type (encrypted, compressed, structured, mixed).
        entropy: Shannon entropy in bits/byte.
        entropy_result: Detailed entropy analysis.
        signatures: Discovered candidate signatures.
        repeating_patterns: Detected repeating sequences.
        ngram_profile: N-gram frequency distribution.
        anomalies: Detected anomaly positions.
        periodic_patterns: Detected periodic patterns.
        confidence: Overall analysis confidence (0.0-1.0).
    """

    data_type: str
    entropy: float
    entropy_result: EntropyResult
    signatures: list[CandidateSignature]
    repeating_patterns: list[dict[str, Any]]
    ngram_profile: dict[bytes, int]
    anomalies: list[int]
    periodic_patterns: list[dict[str, Any]]
    confidence: float


class ReverseEngineer:
    """Comprehensive reverse engineering toolkit for binary data and protocols.

    Integrates multiple analysis techniques to help reverse engineer unknown
    binary formats and protocols:

    - Pattern analysis: motifs, repeating sequences, signatures
    - Entropy analysis: crypto detection, compression detection
    - N-gram analysis: fingerprinting, frequency analysis
    - Field inference: automatic field boundary detection
    - Structure learning: protocol message structure inference

    Example:
        >>> re_tool = ReverseEngineer()
        >>>
        >>> # Quick binary analysis
        >>> result = re_tool.analyze_binary(data)
        >>> print(result.data_type)
        'encrypted'
        >>>
        >>> # Protocol structure inference
        >>> messages = [capture1, capture2, capture3]
        >>> structure = re_tool.infer_protocol_structure(messages)
        >>> for field in structure.fields:
        ...     print(f"{field.field_type} at offset {field.offset}")
    """

    def __init__(
        self,
        min_signature_length: int = 4,
        max_signature_length: int = 16,
        ngram_size: int = 2,
    ):
        """Initialize reverse engineering toolkit.

        Args:
            min_signature_length: Minimum signature length for discovery.
            max_signature_length: Maximum signature length for discovery.
            ngram_size: Default n-gram size for frequency analysis.
        """
        self.crypto_detector = CryptoDetector()
        self.signature_discovery = SignatureDiscovery(
            min_length=min_signature_length,
            max_length=max_signature_length,
        )
        self.ngram_analyzer = NGramAnalyzer(n=ngram_size)
        self.fuzzy_matcher = FuzzyMatcher(max_edit_distance=2)

    def analyze_binary(
        self,
        data: bytes,
        detect_anomalies: bool = True,
        detect_signatures: bool = True,
    ) -> BinaryAnalysisResult:
        """Perform comprehensive analysis of binary data.

        Combines multiple analysis techniques to characterize binary data:
        - Entropy analysis and crypto/compression detection
        - Signature and header discovery
        - Repeating pattern detection
        - N-gram frequency profiling
        - Anomaly detection
        - Periodic pattern detection

        Args:
            data: Binary data to analyze.
            detect_anomalies: Whether to run anomaly detection.
            detect_signatures: Whether to run signature discovery.

        Returns:
            BinaryAnalysisResult with comprehensive analysis.

        Raises:
            ValueError: If data is empty.

        Example:
            >>> data = open('unknown.bin', 'rb').read()
            >>> result = re_tool.analyze_binary(data)
            >>> if result.entropy > 7.5:
            ...     print("Likely encrypted")
            >>> for sig in result.signatures:
            ...     print(f"Signature: {sig.pattern.hex()}")
        """
        if not data:
            raise ValueError("Cannot analyze empty data")

        logger.info(f"Starting comprehensive analysis of {len(data)} bytes")

        # 1. Entropy analysis
        entropy_result = self.crypto_detector.analyze_entropy(data)
        entropy_val = entropy_result.shannon_entropy

        # 2. Classify data type
        if entropy_result.encryption_likelihood > 0.7:
            data_type = "encrypted"
        elif entropy_result.compression_likelihood > 0.7:
            data_type = "compressed"
        elif entropy_val < 3.0:
            data_type = "structured"
        else:
            data_type = "mixed"

        # 3. Signature discovery (skip for encrypted/compressed)
        signatures = []
        if detect_signatures and data_type in ["structured", "mixed"]:
            try:
                signatures = self.signature_discovery.discover_signatures(data)
            except Exception as e:
                logger.warning(f"Signature discovery failed: {e}")

        # 4. Repeating pattern detection
        repeating = []
        try:
            sequences = find_repeating_sequences(data, min_length=4, min_count=3)
            repeating = [
                {
                    "pattern": seq.pattern.hex(),
                    "length": seq.length,
                    "count": seq.count,
                    "frequency": seq.frequency,
                }
                for seq in sequences[:10]  # Top 10
            ]
        except Exception as e:
            logger.warning(f"Repeating pattern detection failed: {e}")

        # 5. N-gram profiling
        ngram_profile = {}
        try:
            ngram_profile = self.ngram_analyzer.analyze(data)
        except Exception as e:
            logger.warning(f"N-gram analysis failed: {e}")

        # 6. Anomaly detection (simple z-score based)
        anomalies = []
        if detect_anomalies:
            try:
                byte_array = np.frombuffer(data, dtype=np.uint8)
                # Simple z-score anomaly detection
                mean = np.mean(byte_array)
                std = np.std(byte_array)
                if std > 0:
                    z_scores = np.abs((byte_array - mean) / std)
                    anomalies = np.where(z_scores > 3.0)[0].tolist()
            except Exception as e:
                logger.warning(f"Anomaly detection failed: {e}")

        # 7. Periodic pattern detection
        periodic = []
        try:
            byte_data_uint8 = np.frombuffer(data, dtype=np.uint8)
            byte_data_float: NDArray[np.float64] = byte_data_uint8.astype(np.float64)
            period_result = detect_period(byte_data_float)
            if period_result is not None:
                periodic.append(
                    {
                        "period": period_result.period,
                        "confidence": period_result.confidence,
                        "method": period_result.method,
                    }
                )
        except Exception as e:
            logger.warning(f"Period detection failed: {e}")

        # Calculate overall confidence
        confidence = self._calculate_analysis_confidence(
            entropy_result, signatures, repeating, anomalies
        )

        logger.info(
            f"Analysis complete: type={data_type}, "
            f"entropy={entropy_val:.2f}, "
            f"signatures={len(signatures)}, "
            f"confidence={confidence:.2f}"
        )

        return BinaryAnalysisResult(
            data_type=data_type,
            entropy=entropy_val,
            entropy_result=entropy_result,
            signatures=signatures,
            repeating_patterns=repeating,
            ngram_profile=ngram_profile,
            anomalies=anomalies,
            periodic_patterns=periodic,
            confidence=confidence,
        )

    def infer_protocol_structure(
        self,
        messages: list[bytes],
        min_field_size: int = 1,
    ) -> ProtocolStructure:
        """Infer protocol message structure from multiple captures.

        Analyzes a collection of protocol messages to automatically infer:
        - Fixed vs variable length messages
        - Field boundaries and types
        - Header/delimiter bytes
        - Length prefix fields
        - Checksum/CRC fields
        - Encrypted payload regions

        Args:
            messages: List of captured protocol messages.
            min_field_size: Minimum field size to detect.

        Returns:
            ProtocolStructure with inferred fields and metadata.

        Raises:
            ValueError: If messages list is empty.

        Example:
            >>> # Capture multiple protocol messages
            >>> messages = [msg1, msg2, msg3, ...]
            >>>
            >>> # Infer structure
            >>> structure = re_tool.infer_protocol_structure(messages)
            >>>
            >>> # Print discovered fields
            >>> for field in structure.fields:
            ...     print(f"{field.field_type}: offset={field.offset}, "
            ...           f"length={field.length}, entropy={field.entropy:.2f}")
        """
        if not messages:
            raise ValueError("Cannot infer structure from empty message list")

        logger.info(f"Inferring protocol structure from {len(messages)} messages")

        # 1. Determine message length (fixed or variable)
        lengths = [len(msg) for msg in messages]
        is_fixed_length = len(set(lengths)) == 1
        msg_length = lengths[0] if is_fixed_length else -1

        # 2. Detect delimiter (for variable-length protocols)
        delimiter = None
        if not is_fixed_length:
            delimiter = self._detect_delimiter(messages)

        # 3. Group by length for field inference
        if is_fixed_length:
            groups = {msg_length: messages}
        else:
            groups = {}
            for msg in messages:
                groups.setdefault(len(msg), []).append(msg)

        # 4. Infer fields for each length group
        all_fields = []
        for msg_group in groups.values():
            fields = self._infer_fields(msg_group, min_field_size)
            all_fields.extend(fields)

        # 5. Detect special field types
        length_prefix_offset = self._detect_length_prefix(messages) if not is_fixed_length else None
        checksum_offset = self._detect_checksum_field(messages)

        # 6. Detect encrypted payload regions
        payload_offset = None
        crypto_fields = self.crypto_detector.detect_crypto_fields(messages, min_field_size=8)
        if crypto_fields:
            # Mark crypto fields
            for cf in crypto_fields:
                payload_offset = cf["offset"]
                all_fields.append(
                    FieldDescriptor(
                        offset=cf["offset"],
                        length=cf["length"],
                        field_type="encrypted_payload",
                        entropy=cf["entropy"],
                        is_constant=False,
                        description="High entropy region (likely encrypted)",
                    )
                )

        # 7. Calculate confidence
        confidence = self._calculate_structure_confidence(
            all_fields, is_fixed_length, delimiter is not None
        )

        logger.info(
            f"Structure inference complete: "
            f"fields={len(all_fields)}, "
            f"fixed_length={is_fixed_length}, "
            f"confidence={confidence:.2f}"
        )

        return ProtocolStructure(
            message_length=msg_length,
            fields=all_fields,
            delimiter=delimiter,
            length_prefix_offset=length_prefix_offset,
            checksum_offset=checksum_offset,
            payload_offset=payload_offset,
            confidence=confidence,
        )

    def detect_delimiter(self, messages: list[bytes]) -> bytes | None:
        """Detect message delimiter bytes.

        Finds byte sequences that consistently appear at message boundaries
        across multiple messages. The search tries short suffixes first, so
        the shortest common suffix is returned.

        Args:
            messages: List of messages to analyze.

        Returns:
            Delimiter bytes if found, None otherwise.

        Example:
            >>> messages = [b'data1' + b'\\n', b'payload2' + b'\\n']
            >>> delim = re_tool.detect_delimiter(messages)
            >>> print(delim)
            b'\\n'
        """
        return self._detect_delimiter(messages)

    def infer_fields(self, messages: list[bytes], min_field_size: int = 1) -> list[FieldDescriptor]:
        """Infer field boundaries from message samples.

        Analyzes byte-level entropy and variance across messages to detect
        field boundaries. Fields with constant values, high entropy, or
        distinct variance patterns are identified.

        Args:
            messages: List of messages (must all be same length).
            min_field_size: Minimum field size in bytes.

        Returns:
            List of FieldDescriptor objects.

        Raises:
            ValueError: If messages have different lengths.

        Example:
            >>> messages = [msg1, msg2, msg3]  # Same length
            >>> fields = re_tool.infer_fields(messages)
            >>> for field in fields:
            ...     print(f"Field: {field.field_type} at {field.offset}")
        """
        if not messages:
            return []

        # Validate all messages same length
        msg_len = len(messages[0])
        if not all(len(msg) == msg_len for msg in messages):
            raise ValueError("All messages must have same length for field inference")

        return self._infer_fields(messages, min_field_size)

    def detect_length_prefix(self, messages: list[bytes]) -> int | None:
        """Detect length prefix field in variable-length protocol.

        Searches for a field at the beginning of messages that encodes
        the message length. Common encodings: 1-byte, 2-byte LE/BE, varint.

        Args:
            messages: List of variable-length messages.

        Returns:
            Offset of length prefix field, or None if not detected.

        Example:
            >>> # Prefix encodes total message length, including itself
            >>> messages = [b'\\x06hello', b'\\x08goodbye', b'\\x04hey']
            >>> offset = re_tool.detect_length_prefix(messages)
            >>> print(offset)
            0
        """
        return self._detect_length_prefix(messages)

    def detect_checksum_field(self, messages: list[bytes]) -> int | None:
        """Detect checksum/CRC field in protocol messages.

        Attempts to identify fields that contain checksums by testing
        common checksum algorithms (CRC8, CRC16, CRC32, simple sum).

        Args:
            messages: List of messages to analyze.

        Returns:
            Offset of checksum field, or None if not detected.

        Example:
            >>> # Messages with CRC at end
            >>> messages = [msg1, msg2, msg3]
            >>> offset = re_tool.detect_checksum_field(messages)
            >>> if offset:
            ...     print(f"Checksum at offset {offset}")
        """
        return self._detect_checksum_field(messages)

    def classify_data_type(self, data: bytes) -> str:
        """Classify binary data type (encrypted, compressed, structured, mixed).

        Uses entropy analysis and statistical tests to classify data.

        Args:
            data: Binary data to classify.

        Returns:
            Data type string: 'encrypted', 'compressed', 'structured', 'mixed',
            or 'empty' for empty input.

        Example:
            >>> encrypted = os.urandom(256)
            >>> print(re_tool.classify_data_type(encrypted))
            'encrypted'
        """
        if not data:
            return "empty"

        result = self.crypto_detector.analyze_entropy(data)

        if result.encryption_likelihood > 0.7:
            return "encrypted"
        elif result.compression_likelihood > 0.7:
            return "compressed"
        elif result.shannon_entropy < 3.0:
            return "structured"
        else:
            return "mixed"

    # =========================================================================
    # Internal Helper Methods
    # =========================================================================

    def _detect_delimiter(self, messages: list[bytes]) -> bytes | None:
        """Detect delimiter by finding common endings."""
        if len(messages) < 2:
            return None

        # Look for common suffixes (last 1-4 bytes)
        for delim_len in range(1, 5):
            candidates: dict[bytes, int] = {}
            for msg in messages:
                if len(msg) >= delim_len:
                    suffix = msg[-delim_len:]
                    candidates[suffix] = candidates.get(suffix, 0) + 1

            # Check if any suffix appears in >80% of messages
            for suffix, count in candidates.items():
                if count / len(messages) > 0.8:
                    return suffix

        return None

    def _infer_fields(self, messages: list[bytes], min_field_size: int) -> list[FieldDescriptor]:
        """Infer field boundaries using entropy and variance analysis."""
        if not messages:
            return []

        msg_len = len(messages[0])
        if msg_len < min_field_size:
            return []

        # Compute positional entropy and variance
        position_entropy = np.zeros(msg_len)
        position_variance = np.zeros(msg_len)

        for pos in range(msg_len):
            values = [msg[pos] for msg in messages]
            position_entropy[pos] = self._shannon_entropy_bytes(bytes(values))
            position_variance[pos] = np.var(values)

        # Identify field boundaries (where entropy/variance changes significantly)
        fields = []
        field_start = 0
        field_type = "unknown"

        # Simple field detection: constant fields (entropy ~0) and variable fields
        for pos in range(1, msg_len):
            # Detect boundary if entropy changes significantly
            if abs(position_entropy[pos] - position_entropy[pos - 1]) > 2.0:
                # Create field
                if pos - field_start >= min_field_size:
                    avg_entropy = float(np.mean(position_entropy[field_start:pos]))
                    is_constant = avg_entropy < 0.1

                    constant_val = None
                    if is_constant:
                        # All messages have same value
                        constant_val = messages[0][field_start:pos]
                        field_type = "constant"
                    elif avg_entropy > 6.0:
                        field_type = "high_entropy"
                    else:
                        field_type = "variable"

                    fields.append(
                        FieldDescriptor(
                            offset=field_start,
                            length=pos - field_start,
                            field_type=field_type,
                            entropy=avg_entropy,
                            is_constant=is_constant,
                            constant_value=constant_val,
                        )
                    )
                field_start = pos

        # Add final field
        if msg_len - field_start >= min_field_size:
            avg_entropy = float(np.mean(position_entropy[field_start:]))
            is_constant = avg_entropy < 0.1
            constant_val = messages[0][field_start:] if is_constant else None

            fields.append(
                FieldDescriptor(
                    offset=field_start,
                    length=msg_len - field_start,
                    field_type="constant" if is_constant else "variable",
                    entropy=avg_entropy,
                    is_constant=is_constant,
                    constant_value=constant_val,
                )
            )

        return fields

    def _detect_length_prefix(self, messages: list[bytes]) -> int | None:
        """Detect length prefix at start of messages."""
        if len(messages) < 3:
            return None

        # Try 1-byte length at offset 0
        if all(len(msg) > 1 for msg in messages):
            if all(msg[0] == len(msg) for msg in messages):
                return 0

        # Try 2-byte little-endian length at offset 0
        if all(len(msg) > 2 for msg in messages):
            matches = 0
            for msg in messages:
                length_field = int.from_bytes(msg[0:2], byteorder="little")
                if length_field == len(msg):
                    matches += 1
            if matches / len(messages) > 0.8:
                return 0

        return None

    def _detect_checksum_field(self, messages: list[bytes]) -> int | None:
        """Detect checksum field (simplified heuristic)."""
        # This is a simplified version - real implementation would test CRC8/16/32
        # For now, just detect if last 1-4 bytes have high variance (likely checksum)
        if len(messages) < 3:
            return None

        msg_len = len(messages[0])
        if not all(len(msg) == msg_len for msg in messages):
            return None

        # Check last few bytes for high variance
        for checksum_len in [1, 2, 4]:
            if msg_len > checksum_len:
                offset = msg_len - checksum_len
                values = [msg[offset:] for msg in messages]
                unique_values = len(set(values))
                # If almost all unique, likely a checksum
                if unique_values / len(messages) > 0.9:
                    return offset

        return None

    def _shannon_entropy_bytes(self, data: bytes) -> float:
        """Calculate Shannon entropy for byte sequence."""
        if not data:
            return 0.0

        byte_counts = np.bincount(np.frombuffer(data, dtype=np.uint8), minlength=256)
        probabilities = byte_counts[byte_counts > 0] / len(data)
        return float(-np.sum(probabilities * np.log2(probabilities)))

    def _calculate_analysis_confidence(
        self,
        entropy_result: EntropyResult,
        signatures: list[CandidateSignature],
        repeating: list[dict[str, Any]],
        anomalies: list[int],
    ) -> float:
        """Calculate overall confidence in binary analysis."""
        # Base confidence from entropy analysis
        confidence = entropy_result.confidence

        # Boost if we found signatures
        if signatures:
            confidence = min(1.0, confidence + 0.1 * len(signatures))

        # Boost if we found repeating patterns
        if repeating:
            confidence = min(1.0, confidence + 0.05 * len(repeating))

        return float(confidence)

    def _calculate_structure_confidence(
        self,
        fields: list[FieldDescriptor],
        is_fixed_length: bool,
        has_delimiter: bool,
    ) -> float:
        """Calculate confidence in protocol structure inference."""
        confidence = 0.5  # Base confidence

        # More confidence if we found fields
        if fields:
            confidence += 0.1 * min(len(fields), 5)

        # More confidence for fixed-length protocols
        if is_fixed_length:
            confidence += 0.2

        # More confidence if delimiter found
        if has_delimiter:
            confidence += 0.1

        return min(1.0, float(confidence))


# Convenience functions for common operations


def search_pattern(
    data: bytes,
    pattern: bytes | str,
    fuzzy: bool = False,
    max_distance: int = 2,
) -> list[int]:
    """Search for pattern in binary data with optional fuzzy matching.

    Args:
        data: Binary data to search.
        pattern: Pattern to search for (bytes or hex string).
        fuzzy: Enable fuzzy/approximate matching.
        max_distance: Maximum edit distance for fuzzy matching.

    Returns:
        List of match positions.

    Example:
        >>> positions = search_pattern(data, b'\\xff\\xfe', fuzzy=False)
        >>> print(f"Found at: {positions}")
    """
    # Convert hex string to bytes
    if isinstance(pattern, str):
        pattern = bytes.fromhex(pattern.replace(" ", ""))

    if fuzzy:
        matcher = FuzzyMatcher(max_edit_distance=max_distance)
        matches = matcher.search(data, pattern)
        return [m.offset for m in matches]
    else:
        # Simple exact search
        positions = []
        for i in range(len(data) - len(pattern) + 1):
            if data[i : i + len(pattern)] == pattern:
                positions.append(i)
        return positions


def shannon_entropy(data: bytes) -> float:
    """Calculate Shannon entropy of binary data.

    Convenience wrapper around CryptoDetector entropy calculation.

    Args:
        data: Binary data to analyze.

    Returns:
        Shannon entropy in bits per byte (0.0-8.0).

    Example:
        >>> entropy = shannon_entropy(b'\\x00' * 100)
        >>> print(f"{entropy:.2f}")
        0.00
        >>> entropy = shannon_entropy(os.urandom(100))
        >>> print(f"{entropy:.2f}")
        7.98
    """
    detector = CryptoDetector()
    return detector._shannon_entropy(data)


def byte_frequency_distribution(data: bytes) -> dict[int, int]:
    """Calculate byte frequency distribution.

    Args:
        data: Binary data to analyze.

    Returns:
        Dictionary mapping byte value (0-255) to count.

    Example:
        >>> freq = byte_frequency_distribution(b'AAABBC')
        >>> print(freq[ord('A')])
        3
    """
    byte_array = np.frombuffer(data, dtype=np.uint8)
    counts = np.bincount(byte_array, minlength=256)
    return {i: int(count) for i, count in enumerate(counts) if count > 0}


def sliding_entropy(
    data: bytes,
    window_size: int = 256,
    stride: int = 64,
) -> list[tuple[int, float]]:
    """Calculate entropy across sliding windows.

    Wrapper around CryptoDetector.sliding_window_entropy.

    Args:
        data: Binary data to analyze.
        window_size: Window size in bytes.
        stride: Step size between windows.

    Returns:
        List of (offset, entropy) tuples.

    Example:
        >>> windows = sliding_entropy(data, window_size=128)
        >>> for offset, ent in windows:
        ...     if ent > 7.5:
        ...         print(f"High entropy at offset {offset}")
    """
    detector = CryptoDetector()
    return detector.sliding_window_entropy(data, window_size=window_size, stride=stride)


def entropy_profile(data: bytes, window_size: int = 256) -> NDArray[np.float64]:
    """Generate entropy profile (entropy over time).

    Args:
        data: Binary data to analyze.
        window_size: Window size for entropy calculation.

    Returns:
        Array of entropy values for each window position.

    Example:
        >>> profile = entropy_profile(data, window_size=128)
        >>> plt.plot(profile)
        >>> plt.ylabel('Entropy (bits/byte)')
        >>> plt.xlabel('Position')
    """
    detector = CryptoDetector()
    windows = detector.sliding_window_entropy(data, window_size=window_size, stride=1)
    return np.array([ent for _, ent in windows])


def detect_encrypted_regions(
    data: bytes,
    window_size: int = 256,
    threshold: float = 7.5,
) -> list[tuple[int, int]]:
    """Detect regions with high entropy (likely encrypted).

    Args:
        data: Binary data to analyze.
        window_size: Window size for analysis.
        threshold: Entropy threshold for encryption (bits/byte).

    Returns:
        List of (start, end) tuples for encrypted regions.

    Example:
        >>> regions = detect_encrypted_regions(data)
        >>> for start, end in regions:
        ...     print(f"Encrypted: {start}-{end}")
    """
    detector = CryptoDetector()
    windows = detector.sliding_window_entropy(data, window_size=window_size, stride=1)

    regions = []
    in_region = False
    region_start = 0

    for offset, entropy in windows:
        if entropy > threshold:
            if not in_region:
                region_start = offset
                in_region = True
        else:
            if in_region:
                regions.append((region_start, offset))
                in_region = False

    # Close final region
    if in_region:
        regions.append((region_start, len(data)))

    return regions


def detect_compressed_regions(
    data: bytes,
    window_size: int = 256,
) -> list[tuple[int, int]]:
    """Detect regions with medium-high entropy (likely compressed).

    Args:
        data: Binary data to analyze.
        window_size: Window size for analysis.

    Returns:
        List of (start, end) tuples for compressed regions.

    Example:
        >>> regions = detect_compressed_regions(data)
        >>> for start, end in regions:
        ...     print(f"Compressed: {start}-{end}")
    """
    detector = CryptoDetector()
    windows = detector.sliding_window_entropy(data, window_size=window_size, stride=1)

    regions = []
    in_region = False
    region_start = 0

    for offset, entropy in windows:
        # Compressed: 6.5-7.5 bits/byte
        if 6.5 < entropy < 7.5:
            if not in_region:
                region_start = offset
                in_region = True
        else:
            if in_region:
                regions.append((region_start, offset))
                in_region = False

    if in_region:
        regions.append((region_start, len(data)))

    return regions


__all__ = [
    "BinaryAnalysisResult",
    "FieldDescriptor",
    "ProtocolStructure",
    "ReverseEngineer",
    # Convenience functions
    "byte_frequency_distribution",
    "detect_compressed_regions",
    "detect_encrypted_regions",
    "entropy_profile",
    "search_pattern",
    "shannon_entropy",
    "sliding_entropy",
]
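
Reviewer's note: for anyone who wants to exercise the new module, a minimal usage sketch follows. It is not part of the package; it assumes oscura 0.10.0 is installed, and the capture path 'capture.bin' and the fixed 16-byte message framing are placeholders chosen for illustration.

# Reviewer's usage sketch -- not from the package. Assumes oscura 0.10.0
# is installed; 'capture.bin' and the 16-byte framing are placeholders.
from oscura.analyzers.patterns.reverse_engineering import (
    ReverseEngineer,
    detect_encrypted_regions,
)

with open("capture.bin", "rb") as f:
    data = f.read()

re_tool = ReverseEngineer()

# Whole-buffer characterization
result = re_tool.analyze_binary(data)
print(f"type={result.data_type}, entropy={result.entropy:.2f} bits/byte")

# High-entropy windows that may hold encrypted payloads
for start, end in detect_encrypted_regions(data, window_size=256, threshold=7.5):
    print(f"high-entropy region: {start}-{end}")

# Structure inference operates on individual messages; real framing is
# protocol-specific, so the fixed 16-byte split here is purely illustrative.
messages = [data[i : i + 16] for i in range(0, len(data) - 15, 16)]
structure = re_tool.infer_protocol_structure(messages)
for fld in structure.fields:
    print(f"{fld.field_type}: offset={fld.offset}, length={fld.length}, entropy={fld.entropy:.2f}")
print(f"overall confidence: {structure.confidence:.2f}")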
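
Worth flagging in review: _detect_checksum_field is, per its own inline comment, a uniqueness heuristic rather than a real checksum test. A hedged sketch of what the alluded-to CRC8/16/32 verification could look like is below; it is standard library only, and the trailing-checksum layout and per-candidate byte orders are assumptions for illustration, not oscura behavior.

# Hedged sketch: verify a trailing checksum against candidate algorithms.
# Placement (last bytes, computed over the rest) and byte orders are assumed.
import zlib


def crc16_ccitt(data: bytes, poly: int = 0x1021, init: int = 0xFFFF) -> int:
    """Bitwise CRC-16/CCITT-FALSE; slow but dependency-free."""
    crc = init
    for byte in data:
        crc ^= byte << 8
        for _ in range(8):
            crc = ((crc << 1) ^ poly) if crc & 0x8000 else (crc << 1)
            crc &= 0xFFFF
    return crc


def verify_trailing_checksum(messages: list[bytes]) -> str | None:
    """Return the first candidate algorithm that validates every message."""
    candidates = {
        "crc32": lambda body: zlib.crc32(body).to_bytes(4, "little"),
        "crc16-ccitt": lambda body: crc16_ccitt(body).to_bytes(2, "big"),
        "sum8": lambda body: bytes([sum(body) & 0xFF]),
    }
    for name, checksum in candidates.items():
        width = len(checksum(b""))  # checksum width in bytes
        if all(len(m) > width and checksum(m[:-width]) == m[-width:] for m in messages):
            return name
    return None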