oscura-0.7.0-py3-none-any.whl → oscura-0.10.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175)
  1. oscura/__init__.py +19 -19
  2. oscura/analyzers/__init__.py +2 -0
  3. oscura/analyzers/digital/extraction.py +2 -3
  4. oscura/analyzers/digital/quality.py +1 -1
  5. oscura/analyzers/digital/timing.py +1 -1
  6. oscura/analyzers/eye/__init__.py +5 -1
  7. oscura/analyzers/eye/generation.py +501 -0
  8. oscura/analyzers/jitter/__init__.py +6 -6
  9. oscura/analyzers/jitter/timing.py +419 -0
  10. oscura/analyzers/patterns/__init__.py +94 -0
  11. oscura/analyzers/patterns/reverse_engineering.py +991 -0
  12. oscura/analyzers/power/__init__.py +35 -12
  13. oscura/analyzers/power/basic.py +3 -3
  14. oscura/analyzers/power/soa.py +1 -1
  15. oscura/analyzers/power/switching.py +3 -3
  16. oscura/analyzers/signal_classification.py +529 -0
  17. oscura/analyzers/signal_integrity/sparams.py +3 -3
  18. oscura/analyzers/statistics/__init__.py +4 -0
  19. oscura/analyzers/statistics/basic.py +152 -0
  20. oscura/analyzers/statistics/correlation.py +47 -6
  21. oscura/analyzers/validation.py +1 -1
  22. oscura/analyzers/waveform/__init__.py +2 -0
  23. oscura/analyzers/waveform/measurements.py +329 -163
  24. oscura/analyzers/waveform/measurements_with_uncertainty.py +91 -35
  25. oscura/analyzers/waveform/spectral.py +498 -54
  26. oscura/api/dsl/commands.py +15 -6
  27. oscura/api/server/templates/base.html +137 -146
  28. oscura/api/server/templates/export.html +84 -110
  29. oscura/api/server/templates/home.html +248 -267
  30. oscura/api/server/templates/protocols.html +44 -48
  31. oscura/api/server/templates/reports.html +27 -35
  32. oscura/api/server/templates/session_detail.html +68 -78
  33. oscura/api/server/templates/sessions.html +62 -72
  34. oscura/api/server/templates/waveforms.html +54 -64
  35. oscura/automotive/__init__.py +1 -1
  36. oscura/automotive/can/session.py +1 -1
  37. oscura/automotive/dbc/generator.py +638 -23
  38. oscura/automotive/dtc/data.json +102 -17
  39. oscura/automotive/uds/decoder.py +99 -6
  40. oscura/cli/analyze.py +8 -2
  41. oscura/cli/batch.py +36 -5
  42. oscura/cli/characterize.py +18 -4
  43. oscura/cli/export.py +47 -5
  44. oscura/cli/main.py +2 -0
  45. oscura/cli/onboarding/wizard.py +10 -6
  46. oscura/cli/pipeline.py +585 -0
  47. oscura/cli/visualize.py +6 -4
  48. oscura/convenience.py +400 -32
  49. oscura/core/config/loader.py +0 -1
  50. oscura/core/measurement_result.py +286 -0
  51. oscura/core/progress.py +1 -1
  52. oscura/core/schemas/device_mapping.json +8 -2
  53. oscura/core/schemas/packet_format.json +24 -4
  54. oscura/core/schemas/protocol_definition.json +12 -2
  55. oscura/core/types.py +300 -199
  56. oscura/correlation/multi_protocol.py +1 -1
  57. oscura/export/legacy/__init__.py +11 -0
  58. oscura/export/legacy/wav.py +75 -0
  59. oscura/exporters/__init__.py +19 -0
  60. oscura/exporters/wireshark.py +809 -0
  61. oscura/hardware/acquisition/file.py +5 -19
  62. oscura/hardware/acquisition/saleae.py +10 -10
  63. oscura/hardware/acquisition/socketcan.py +4 -6
  64. oscura/hardware/acquisition/synthetic.py +1 -5
  65. oscura/hardware/acquisition/visa.py +6 -6
  66. oscura/hardware/security/side_channel_detector.py +5 -508
  67. oscura/inference/message_format.py +686 -1
  68. oscura/jupyter/display.py +2 -2
  69. oscura/jupyter/magic.py +3 -3
  70. oscura/loaders/__init__.py +17 -12
  71. oscura/loaders/binary.py +1 -1
  72. oscura/loaders/chipwhisperer.py +1 -2
  73. oscura/loaders/configurable.py +1 -1
  74. oscura/loaders/csv_loader.py +2 -2
  75. oscura/loaders/hdf5_loader.py +1 -1
  76. oscura/loaders/lazy.py +6 -1
  77. oscura/loaders/mmap_loader.py +0 -1
  78. oscura/loaders/numpy_loader.py +8 -7
  79. oscura/loaders/preprocessing.py +3 -5
  80. oscura/loaders/rigol.py +21 -7
  81. oscura/loaders/sigrok.py +2 -5
  82. oscura/loaders/tdms.py +3 -2
  83. oscura/loaders/tektronix.py +38 -32
  84. oscura/loaders/tss.py +20 -27
  85. oscura/loaders/vcd.py +13 -8
  86. oscura/loaders/wav.py +1 -6
  87. oscura/pipeline/__init__.py +76 -0
  88. oscura/pipeline/handlers/__init__.py +165 -0
  89. oscura/pipeline/handlers/analyzers.py +1045 -0
  90. oscura/pipeline/handlers/decoders.py +899 -0
  91. oscura/pipeline/handlers/exporters.py +1103 -0
  92. oscura/pipeline/handlers/filters.py +891 -0
  93. oscura/pipeline/handlers/loaders.py +640 -0
  94. oscura/pipeline/handlers/transforms.py +768 -0
  95. oscura/reporting/__init__.py +88 -1
  96. oscura/reporting/automation.py +348 -0
  97. oscura/reporting/citations.py +374 -0
  98. oscura/reporting/core.py +54 -0
  99. oscura/reporting/formatting/__init__.py +11 -0
  100. oscura/reporting/formatting/measurements.py +320 -0
  101. oscura/reporting/html.py +57 -0
  102. oscura/reporting/interpretation.py +431 -0
  103. oscura/reporting/summary.py +329 -0
  104. oscura/reporting/templates/enhanced/protocol_re.html +504 -503
  105. oscura/reporting/visualization.py +542 -0
  106. oscura/side_channel/__init__.py +38 -57
  107. oscura/utils/builders/signal_builder.py +5 -5
  108. oscura/utils/comparison/compare.py +7 -9
  109. oscura/utils/comparison/golden.py +1 -1
  110. oscura/utils/filtering/convenience.py +2 -2
  111. oscura/utils/math/arithmetic.py +38 -62
  112. oscura/utils/math/interpolation.py +20 -20
  113. oscura/utils/pipeline/__init__.py +4 -17
  114. oscura/utils/progressive.py +1 -4
  115. oscura/utils/triggering/edge.py +1 -1
  116. oscura/utils/triggering/pattern.py +2 -2
  117. oscura/utils/triggering/pulse.py +2 -2
  118. oscura/utils/triggering/window.py +3 -3
  119. oscura/validation/hil_testing.py +11 -11
  120. oscura/visualization/__init__.py +47 -284
  121. oscura/visualization/batch.py +160 -0
  122. oscura/visualization/plot.py +542 -53
  123. oscura/visualization/styles.py +184 -318
  124. oscura/workflows/__init__.py +2 -0
  125. oscura/workflows/batch/advanced.py +1 -1
  126. oscura/workflows/batch/aggregate.py +7 -8
  127. oscura/workflows/complete_re.py +251 -23
  128. oscura/workflows/digital.py +27 -4
  129. oscura/workflows/multi_trace.py +136 -17
  130. oscura/workflows/waveform.py +788 -0
  131. {oscura-0.7.0.dist-info → oscura-0.10.0.dist-info}/METADATA +59 -79
  132. {oscura-0.7.0.dist-info → oscura-0.10.0.dist-info}/RECORD +135 -149
  133. oscura/side_channel/dpa.py +0 -1025
  134. oscura/utils/optimization/__init__.py +0 -19
  135. oscura/utils/optimization/parallel.py +0 -443
  136. oscura/utils/optimization/search.py +0 -532
  137. oscura/utils/pipeline/base.py +0 -338
  138. oscura/utils/pipeline/composition.py +0 -248
  139. oscura/utils/pipeline/parallel.py +0 -449
  140. oscura/utils/pipeline/pipeline.py +0 -375
  141. oscura/utils/search/__init__.py +0 -16
  142. oscura/utils/search/anomaly.py +0 -424
  143. oscura/utils/search/context.py +0 -294
  144. oscura/utils/search/pattern.py +0 -288
  145. oscura/utils/storage/__init__.py +0 -61
  146. oscura/utils/storage/database.py +0 -1166
  147. oscura/visualization/accessibility.py +0 -526
  148. oscura/visualization/annotations.py +0 -371
  149. oscura/visualization/axis_scaling.py +0 -305
  150. oscura/visualization/colors.py +0 -451
  151. oscura/visualization/digital.py +0 -436
  152. oscura/visualization/eye.py +0 -571
  153. oscura/visualization/histogram.py +0 -281
  154. oscura/visualization/interactive.py +0 -1035
  155. oscura/visualization/jitter.py +0 -1042
  156. oscura/visualization/keyboard.py +0 -394
  157. oscura/visualization/layout.py +0 -400
  158. oscura/visualization/optimization.py +0 -1079
  159. oscura/visualization/palettes.py +0 -446
  160. oscura/visualization/power.py +0 -508
  161. oscura/visualization/power_extended.py +0 -955
  162. oscura/visualization/presets.py +0 -469
  163. oscura/visualization/protocols.py +0 -1246
  164. oscura/visualization/render.py +0 -223
  165. oscura/visualization/rendering.py +0 -444
  166. oscura/visualization/reverse_engineering.py +0 -838
  167. oscura/visualization/signal_integrity.py +0 -989
  168. oscura/visualization/specialized.py +0 -643
  169. oscura/visualization/spectral.py +0 -1226
  170. oscura/visualization/thumbnails.py +0 -340
  171. oscura/visualization/time_axis.py +0 -351
  172. oscura/visualization/waveform.py +0 -454
  173. {oscura-0.7.0.dist-info → oscura-0.10.0.dist-info}/WHEEL +0 -0
  174. {oscura-0.7.0.dist-info → oscura-0.10.0.dist-info}/entry_points.txt +0 -0
  175. {oscura-0.7.0.dist-info → oscura-0.10.0.dist-info}/licenses/LICENSE +0 -0
oscura/analyzers/patterns/reverse_engineering.py (new file; entry 11 in the list above)
@@ -0,0 +1,991 @@
+ """Comprehensive reverse engineering toolkit for binary data and protocols.
+
+ This module provides a complete toolkit for reverse engineering unknown binary
+ protocols and data formats by integrating pattern analysis, entropy analysis,
+ field inference, and data classification.
+
+ Key capabilities:
+ - Pattern discovery and motif extraction
+ - N-gram frequency analysis for fingerprinting
+ - Signature and delimiter discovery
+ - Binary regex and multi-pattern search
+ - Fuzzy/approximate matching
+ - Anomaly detection
+ - Entropy analysis and crypto detection
+ - Data type classification (encrypted, compressed, structured)
+ - Field boundary inference
+ - Delimiter and length prefix detection
+ - Checksum field detection
+
+ Example workflow:
+     >>> from oscura.analyzers.patterns.reverse_engineering import ReverseEngineer
+     >>> re_tool = ReverseEngineer()
+     >>>
+     >>> # Analyze unknown binary data
+     >>> analysis = re_tool.analyze_binary(unknown_data)
+     >>> print(f"Data type: {analysis.data_type}")
+     >>> print(f"Entropy: {analysis.entropy:.2f} bits/byte")
+     >>> print(f"Detected signatures: {analysis.signatures}")
+     >>>
+     >>> # Infer protocol structure
+     >>> messages = [msg1, msg2, msg3, ...]
+     >>> structure = re_tool.infer_protocol_structure(messages)
+     >>> for field in structure.fields:
+     ...     print(f"Field at offset {field.offset}: {field.field_type}")
+
+ Author: Oscura Development Team
+ """
+
+ from __future__ import annotations
+
+ import logging
+ from dataclasses import dataclass, field
+ from typing import TYPE_CHECKING, Any
+
+ import numpy as np
+
+ # Import from existing modules
+ from oscura.analyzers.entropy import CryptoDetector, EntropyResult
+ from oscura.analyzers.patterns.discovery import (
+     CandidateSignature,
+     SignatureDiscovery,
+ )
+ from oscura.analyzers.patterns.matching import (
+     FuzzyMatcher,
+ )
+ from oscura.analyzers.patterns.periodic import detect_period
+ from oscura.analyzers.patterns.sequences import (
+     find_repeating_sequences,
+ )
+ from oscura.analyzers.statistical.ngrams import (
+     NGramAnalyzer,
+ )
+
+ if TYPE_CHECKING:
+     from numpy.typing import NDArray
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class FieldDescriptor:
+     """Descriptor for an inferred protocol field.
+
+     Attributes:
+         offset: Byte offset from start of message.
+         length: Field length in bytes.
+         field_type: Inferred type (constant, variable, high_entropy, encrypted_payload, etc.).
+         entropy: Average entropy of field across messages.
+         is_constant: True if field has same value across all messages.
+         constant_value: Value if is_constant is True.
+         description: Human-readable description.
+     """
+
+     offset: int
+     length: int
+     field_type: str
+     entropy: float
+     is_constant: bool = False
+     constant_value: bytes | None = None
+     description: str = ""
+
+
+ @dataclass
+ class ProtocolStructure:
+     """Inferred protocol message structure.
+
+     Attributes:
+         message_length: Fixed message length, or -1 for variable.
+         fields: List of inferred fields.
+         delimiter: Detected delimiter bytes, if any.
+         length_prefix_offset: Offset of length prefix field, if detected.
+         checksum_offset: Offset of checksum field, if detected.
+         payload_offset: Offset of variable payload, if detected.
+         confidence: Overall confidence in structure inference (0.0-1.0).
+     """
+
+     message_length: int
+     fields: list[FieldDescriptor] = field(default_factory=list)
+     delimiter: bytes | None = None
+     length_prefix_offset: int | None = None
+     checksum_offset: int | None = None
+     payload_offset: int | None = None
+     confidence: float = 0.0
+
+
+ @dataclass
+ class BinaryAnalysisResult:
+     """Complete analysis result for binary data.
+
+     Attributes:
+         data_type: Classified type (encrypted, compressed, structured, mixed).
+         entropy: Shannon entropy in bits/byte.
+         entropy_result: Detailed entropy analysis.
+         signatures: Discovered candidate signatures.
+         repeating_patterns: Detected repeating sequences.
+         ngram_profile: N-gram frequency distribution.
+         anomalies: Detected anomaly positions.
+         periodic_patterns: Detected periodic patterns.
+         confidence: Overall analysis confidence (0.0-1.0).
+     """
+
+     data_type: str
+     entropy: float
+     entropy_result: EntropyResult
+     signatures: list[CandidateSignature]
+     repeating_patterns: list[dict[str, Any]]
+     ngram_profile: dict[bytes, int]
+     anomalies: list[int]
+     periodic_patterns: list[dict[str, Any]]
+     confidence: float
+
+
+ class ReverseEngineer:
+     """Comprehensive reverse engineering toolkit for binary data and protocols.
+
+     Integrates multiple analysis techniques to help reverse engineer unknown
+     binary formats and protocols:
+
+     - Pattern analysis: motifs, repeating sequences, signatures
+     - Entropy analysis: crypto detection, compression detection
+     - N-gram analysis: fingerprinting, frequency analysis
+     - Field inference: automatic field boundary detection
+     - Structure learning: protocol message structure inference
+
+     Example:
+         >>> re_tool = ReverseEngineer()
+         >>>
+         >>> # Quick binary analysis
+         >>> result = re_tool.analyze_binary(data)
+         >>> result.data_type
+         'encrypted'
+         >>>
+         >>> # Protocol structure inference
+         >>> messages = [capture1, capture2, capture3]
+         >>> structure = re_tool.infer_protocol_structure(messages)
+         >>> for field in structure.fields:
+         ...     print(f"{field.field_type} at offset {field.offset}")
+     """
+
+     def __init__(
+         self,
+         min_signature_length: int = 4,
+         max_signature_length: int = 16,
+         ngram_size: int = 2,
+     ):
+         """Initialize reverse engineering toolkit.
+
+         Args:
+             min_signature_length: Minimum signature length for discovery.
+             max_signature_length: Maximum signature length for discovery.
+             ngram_size: Default n-gram size for frequency analysis.
+         """
+         self.crypto_detector = CryptoDetector()
+         self.signature_discovery = SignatureDiscovery(
+             min_length=min_signature_length,
+             max_length=max_signature_length,
+         )
+         self.ngram_analyzer = NGramAnalyzer(n=ngram_size)
+         self.fuzzy_matcher = FuzzyMatcher(max_edit_distance=2)
+
+     def analyze_binary(
+         self,
+         data: bytes,
+         detect_anomalies: bool = True,
+         detect_signatures: bool = True,
+     ) -> BinaryAnalysisResult:
+         """Perform comprehensive analysis of binary data.
+
+         Combines multiple analysis techniques to characterize binary data:
+         - Entropy analysis and crypto/compression detection
+         - Signature and header discovery
+         - Repeating pattern detection
+         - N-gram frequency profiling
+         - Anomaly detection
+         - Periodic pattern detection
+
+         Args:
+             data: Binary data to analyze.
+             detect_anomalies: Whether to run anomaly detection.
+             detect_signatures: Whether to run signature discovery.
+
+         Returns:
+             BinaryAnalysisResult with comprehensive analysis.
+
+         Raises:
+             ValueError: If data is empty.
+
+         Example:
+             >>> data = open('unknown.bin', 'rb').read()
+             >>> result = re_tool.analyze_binary(data)
+             >>> if result.entropy > 7.5:
+             ...     print("Likely encrypted")
+             >>> for sig in result.signatures:
+             ...     print(f"Signature: {sig.pattern.hex()}")
+         """
+         if not data:
+             raise ValueError("Cannot analyze empty data")
+
+         logger.info(f"Starting comprehensive analysis of {len(data)} bytes")
+
+         # 1. Entropy analysis
+         entropy_result = self.crypto_detector.analyze_entropy(data)
+         entropy_val = entropy_result.shannon_entropy
+
+         # 2. Classify data type
+         if entropy_result.encryption_likelihood > 0.7:
+             data_type = "encrypted"
+         elif entropy_result.compression_likelihood > 0.7:
+             data_type = "compressed"
+         elif entropy_val < 3.0:
+             data_type = "structured"
+         else:
+             data_type = "mixed"
+
+         # 3. Signature discovery (skip for encrypted/compressed)
+         signatures = []
+         if detect_signatures and data_type in ["structured", "mixed"]:
+             try:
+                 signatures = self.signature_discovery.discover_signatures(data)
+             except Exception as e:
+                 logger.warning(f"Signature discovery failed: {e}")
+
+         # 4. Repeating pattern detection
+         repeating = []
+         try:
+             sequences = find_repeating_sequences(data, min_length=4, min_count=3)
+             repeating = [
+                 {
+                     "pattern": seq.pattern.hex(),
+                     "length": seq.length,
+                     "count": seq.count,
+                     "frequency": seq.frequency,
+                 }
+                 for seq in sequences[:10]  # Top 10
+             ]
+         except Exception as e:
+             logger.warning(f"Repeating pattern detection failed: {e}")
+
+         # 5. N-gram profiling
+         ngram_profile = {}
+         try:
+             ngram_profile = self.ngram_analyzer.analyze(data)
+         except Exception as e:
+             logger.warning(f"N-gram analysis failed: {e}")
+
+         # 6. Anomaly detection (simple z-score based)
+         anomalies = []
+         if detect_anomalies:
+             try:
+                 byte_array = np.frombuffer(data, dtype=np.uint8)
+                 # Simple z-score anomaly detection
+                 mean = np.mean(byte_array)
+                 std = np.std(byte_array)
+                 if std > 0:
+                     z_scores = np.abs((byte_array - mean) / std)
+                     anomalies = np.where(z_scores > 3.0)[0].tolist()
+             except Exception as e:
+                 logger.warning(f"Anomaly detection failed: {e}")
+
+         # 7. Periodic pattern detection
+         periodic = []
+         try:
+             byte_data_uint8 = np.frombuffer(data, dtype=np.uint8)
+             byte_data_float: NDArray[np.float64] = byte_data_uint8.astype(np.float64)
+             period_result = detect_period(byte_data_float)
+             if period_result is not None:
+                 periodic.append(
+                     {
+                         "period": period_result.period,
+                         "confidence": period_result.confidence,
+                         "method": period_result.method,
+                     }
+                 )
+         except Exception as e:
+             logger.warning(f"Period detection failed: {e}")
+
+         # Calculate overall confidence
+         confidence = self._calculate_analysis_confidence(
+             entropy_result, signatures, repeating, anomalies
+         )
+
+         logger.info(
+             f"Analysis complete: type={data_type}, "
+             f"entropy={entropy_val:.2f}, "
+             f"signatures={len(signatures)}, "
+             f"confidence={confidence:.2f}"
+         )
+
+         return BinaryAnalysisResult(
+             data_type=data_type,
+             entropy=entropy_val,
+             entropy_result=entropy_result,
+             signatures=signatures,
+             repeating_patterns=repeating,
+             ngram_profile=ngram_profile,
+             anomalies=anomalies,
+             periodic_patterns=periodic,
+             confidence=confidence,
+         )
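[Editor's note] The classification above keys off three thresholds: encryption_likelihood > 0.7, compression_likelihood > 0.7, and entropy < 3.0 bits/byte. A minimal sketch of the two extremes, assuming the 0.10.0 public API exactly as added in this diff (whether random bytes clear the 'encrypted' branch depends on CryptoDetector's likelihood scoring, which is not shown here):

    import os
    from oscura.analyzers.patterns.reverse_engineering import ReverseEngineer

    re_tool = ReverseEngineer()

    # Uniform random bytes sit near 8 bits/byte; likely classified 'encrypted'
    high = re_tool.analyze_binary(os.urandom(4096))
    print(high.data_type, f"{high.entropy:.2f}")

    # A two-symbol alphabet is exactly 1 bit/byte, well under the 3.0 cutoff
    low = re_tool.analyze_binary(b"\x00\x01" * 2048)
    print(low.data_type, f"{low.entropy:.2f}")  # structured 1.00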
+
+     def infer_protocol_structure(
+         self,
+         messages: list[bytes],
+         min_field_size: int = 1,
+     ) -> ProtocolStructure:
+         """Infer protocol message structure from multiple captures.
+
+         Analyzes a collection of protocol messages to automatically infer:
+         - Fixed vs variable length messages
+         - Field boundaries and types
+         - Header/delimiter bytes
+         - Length prefix fields
+         - Checksum/CRC fields
+         - Encrypted payload regions
+
+         Args:
+             messages: List of captured protocol messages.
+             min_field_size: Minimum field size to detect.
+
+         Returns:
+             ProtocolStructure with inferred fields and metadata.
+
+         Raises:
+             ValueError: If messages list is empty.
+
+         Example:
+             >>> # Capture multiple protocol messages
+             >>> messages = [msg1, msg2, msg3, ...]
+             >>>
+             >>> # Infer structure
+             >>> structure = re_tool.infer_protocol_structure(messages)
+             >>>
+             >>> # Print discovered fields
+             >>> for field in structure.fields:
+             ...     print(f"{field.field_type}: offset={field.offset}, "
+             ...           f"length={field.length}, entropy={field.entropy:.2f}")
+         """
+         if not messages:
+             raise ValueError("Cannot infer structure from empty message list")
+
+         logger.info(f"Inferring protocol structure from {len(messages)} messages")
+
+         # 1. Determine message length (fixed or variable)
+         lengths = [len(msg) for msg in messages]
+         is_fixed_length = len(set(lengths)) == 1
+         msg_length = lengths[0] if is_fixed_length else -1
+
+         # 2. Detect delimiter (for variable-length protocols)
+         delimiter = None
+         if not is_fixed_length:
+             delimiter = self._detect_delimiter(messages)
+
+         # 3. Group by length for field inference
+         if is_fixed_length:
+             groups = {msg_length: messages}
+         else:
+             groups = {}
+             for msg in messages:
+                 groups.setdefault(len(msg), []).append(msg)
+
+         # 4. Infer fields for each length group
+         all_fields = []
+         for msg_group in groups.values():
+             fields = self._infer_fields(msg_group, min_field_size)
+             all_fields.extend(fields)
+
+         # 5. Detect special field types
+         length_prefix_offset = self._detect_length_prefix(messages) if not is_fixed_length else None
+         checksum_offset = self._detect_checksum_field(messages)
+
+         # 6. Detect encrypted payload regions
+         payload_offset = None
+         crypto_fields = self.crypto_detector.detect_crypto_fields(messages, min_field_size=8)
+         if crypto_fields:
+             # Mark crypto fields
+             for cf in crypto_fields:
+                 payload_offset = cf["offset"]
+                 all_fields.append(
+                     FieldDescriptor(
+                         offset=cf["offset"],
+                         length=cf["length"],
+                         field_type="encrypted_payload",
+                         entropy=cf["entropy"],
+                         is_constant=False,
+                         description="High entropy region (likely encrypted)",
+                     )
+                 )
+
+         # 7. Calculate confidence
+         confidence = self._calculate_structure_confidence(
+             all_fields, is_fixed_length, delimiter is not None
+         )
+
+         logger.info(
+             f"Structure inference complete: "
+             f"fields={len(all_fields)}, "
+             f"fixed_length={is_fixed_length}, "
+             f"confidence={confidence:.2f}"
+         )
+
+         return ProtocolStructure(
+             message_length=msg_length,
+             fields=all_fields,
+             delimiter=delimiter,
+             length_prefix_offset=length_prefix_offset,
+             checksum_offset=checksum_offset,
+             payload_offset=payload_offset,
+             confidence=confidence,
+         )
+
+     def detect_delimiter(self, messages: list[bytes]) -> bytes | None:
+         """Detect message delimiter bytes.
+
+         Finds byte sequences that consistently appear at message boundaries
+         across multiple messages.
+
+         Args:
+             messages: List of messages to analyze.
+
+         Returns:
+             Delimiter bytes if found, None otherwise.
+
+         Example:
+             >>> messages = [b'START' + data1 + b'END', b'START' + data2 + b'END']
+             >>> delim = re_tool.detect_delimiter(messages)
+             >>> print(delim)
+             b'D'
+
+         Note:
+             The shortest suffix meeting the 80% quorum wins, so this returns
+             b'D' rather than the full b'END' marker.
+         """
+         return self._detect_delimiter(messages)
+
+     def infer_fields(self, messages: list[bytes], min_field_size: int = 1) -> list[FieldDescriptor]:
+         """Infer field boundaries from message samples.
+
+         Analyzes byte-level entropy and variance across messages to detect
+         field boundaries. Fields with constant values, high entropy, or
+         distinct variance patterns are identified.
+
+         Args:
+             messages: List of messages (must all be same length).
+             min_field_size: Minimum field size in bytes.
+
+         Returns:
+             List of FieldDescriptor objects.
+
+         Raises:
+             ValueError: If messages have different lengths.
+
+         Example:
+             >>> messages = [msg1, msg2, msg3]  # Same length
+             >>> fields = re_tool.infer_fields(messages)
+             >>> for field in fields:
+             ...     print(f"Field: {field.field_type} at {field.offset}")
+         """
+         if not messages:
+             return []
+
+         # Validate all messages same length
+         msg_len = len(messages[0])
+         if not all(len(msg) == msg_len for msg in messages):
+             raise ValueError("All messages must have same length for field inference")
+
+         return self._infer_fields(messages, min_field_size)
+
+     def detect_length_prefix(self, messages: list[bytes]) -> int | None:
+         """Detect length prefix field in variable-length protocol.
+
+         Searches for a field at the beginning of messages that encodes the
+         total message length (prefix included). Currently tests 1-byte and
+         2-byte little-endian prefixes and requires at least 3 sample messages.
+
+         Args:
+             messages: List of variable-length messages.
+
+         Returns:
+             Offset of length prefix field, or None if not detected.
+
+         Example:
+             >>> # First byte encodes the total message length
+             >>> messages = [b'\\x06hello', b'\\x08goodbye', b'\\x04abc']
+             >>> offset = re_tool.detect_length_prefix(messages)
+             >>> print(offset)
+             0
+         """
+         return self._detect_length_prefix(messages)
+
+     def detect_checksum_field(self, messages: list[bytes]) -> int | None:
+         """Detect checksum/CRC field in protocol messages.
+
+         Attempts to identify fields that contain checksums by testing
+         common checksum algorithms (CRC8, CRC16, CRC32, simple sum).
+
+         Args:
+             messages: List of messages to analyze.
+
+         Returns:
+             Offset of checksum field, or None if not detected.
+
+         Example:
+             >>> # Messages with CRC at end
+             >>> messages = [msg1, msg2, msg3]
+             >>> offset = re_tool.detect_checksum_field(messages)
+             >>> if offset is not None:
+             ...     print(f"Checksum at offset {offset}")
+         """
+         return self._detect_checksum_field(messages)
+
+     def classify_data_type(self, data: bytes) -> str:
+         """Classify binary data type (encrypted, compressed, structured, mixed).
+
+         Uses entropy analysis and statistical tests to classify data.
+
+         Args:
+             data: Binary data to classify.
+
+         Returns:
+             Data type string: 'encrypted', 'compressed', 'structured', or
+             'mixed' ('empty' is returned for empty input).
+
+         Example:
+             >>> import os
+             >>> encrypted = os.urandom(256)
+             >>> re_tool.classify_data_type(encrypted)
+             'encrypted'
+         """
+         if not data:
+             return "empty"
+
+         result = self.crypto_detector.analyze_entropy(data)
+
+         if result.encryption_likelihood > 0.7:
+             return "encrypted"
+         elif result.compression_likelihood > 0.7:
+             return "compressed"
+         elif result.shannon_entropy < 3.0:
+             return "structured"
+         else:
+             return "mixed"
+
+     # =========================================================================
+     # Internal Helper Methods
+     # =========================================================================
+
+     def _detect_delimiter(self, messages: list[bytes]) -> bytes | None:
+         """Detect delimiter by finding common endings."""
+         if len(messages) < 2:
+             return None
+
+         # Look for common suffixes (last 1-4 bytes)
+         for delim_len in range(1, 5):
+             candidates: dict[bytes, int] = {}
+             for msg in messages:
+                 if len(msg) >= delim_len:
+                     suffix = msg[-delim_len:]
+                     candidates[suffix] = candidates.get(suffix, 0) + 1
+
+             # Check if any suffix appears in >80% of messages
+             for suffix, count in candidates.items():
+                 if count / len(messages) > 0.8:
+                     return suffix
+
+         return None
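[Editor's note] The suffix-voting loop above returns the shortest suffix that clears the 80% quorum, which is why the detect_delimiter doctest yields b'D' rather than b'END'. A self-contained restatement of the heuristic, illustrative only and not part of the package:

    def common_suffix(messages: list[bytes], max_len: int = 4, quorum: float = 0.8) -> bytes | None:
        # Try suffix lengths 1..max_len; the shortest qualifying suffix wins
        for n in range(1, max_len + 1):
            votes: dict[bytes, int] = {}
            for msg in messages:
                if len(msg) >= n:
                    votes[msg[-n:]] = votes.get(msg[-n:], 0) + 1
            for suffix, count in votes.items():
                if count / len(messages) > quorum:
                    return suffix
        return None

    assert common_suffix([b"STARTxEND", b"STARTyEND"]) == b"D"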
+
+     def _infer_fields(self, messages: list[bytes], min_field_size: int) -> list[FieldDescriptor]:
+         """Infer field boundaries using entropy and variance analysis."""
+         if not messages:
+             return []
+
+         msg_len = len(messages[0])
+         if msg_len < min_field_size:
+             return []
+
+         # Compute positional entropy and variance
+         position_entropy = np.zeros(msg_len)
+         position_variance = np.zeros(msg_len)
+
+         for pos in range(msg_len):
+             values = [msg[pos] for msg in messages]
+             position_entropy[pos] = self._shannon_entropy_bytes(bytes(values))
+             position_variance[pos] = np.var(values)
+
+         # Identify field boundaries (where entropy/variance changes significantly)
+         fields = []
+         field_start = 0
+         field_type = "unknown"
+
+         # Simple field detection: constant fields (entropy ~0) and variable fields
+         for pos in range(1, msg_len):
+             # Detect boundary if entropy changes significantly
+             if abs(position_entropy[pos] - position_entropy[pos - 1]) > 2.0:
+                 # Create field
+                 if pos - field_start >= min_field_size:
+                     avg_entropy = float(np.mean(position_entropy[field_start:pos]))
+                     is_constant = avg_entropy < 0.1
+
+                     constant_val = None
+                     if is_constant:
+                         # All messages have same value
+                         constant_val = messages[0][field_start:pos]
+                         field_type = "constant"
+                     elif avg_entropy > 6.0:
+                         field_type = "high_entropy"
+                     else:
+                         field_type = "variable"
+
+                     fields.append(
+                         FieldDescriptor(
+                             offset=field_start,
+                             length=pos - field_start,
+                             field_type=field_type,
+                             entropy=avg_entropy,
+                             is_constant=is_constant,
+                             constant_value=constant_val,
+                         )
+                     )
+                 field_start = pos
+
+         # Add final field
+         if msg_len - field_start >= min_field_size:
+             avg_entropy = float(np.mean(position_entropy[field_start:]))
+             is_constant = avg_entropy < 0.1
+             constant_val = messages[0][field_start:] if is_constant else None
+
+             fields.append(
+                 FieldDescriptor(
+                     offset=field_start,
+                     length=msg_len - field_start,
+                     field_type="constant" if is_constant else "variable",
+                     entropy=avg_entropy,
+                     is_constant=is_constant,
+                     constant_value=constant_val,
+                 )
+             )
+
+         return fields
+
+     def _detect_length_prefix(self, messages: list[bytes]) -> int | None:
+         """Detect length prefix at start of messages."""
+         if len(messages) < 3:
+             return None
+
+         # Try 1-byte length at offset 0
+         if all(len(msg) > 1 for msg in messages):
+             if all(msg[0] == len(msg) for msg in messages):
+                 return 0
+
+         # Try 2-byte little-endian length at offset 0
+         if all(len(msg) > 2 for msg in messages):
+             matches = 0
+             for msg in messages:
+                 length_field = int.from_bytes(msg[0:2], byteorder="little")
+                 if length_field == len(msg):
+                     matches += 1
+             if matches / len(messages) > 0.8:
+                 return 0
+
+         return None
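[Editor's note] The prefix test above compares the first byte (or a 2-byte little-endian word) against the total message length, prefix included, and needs at least three samples. A quick check under exactly those assumptions:

    msgs = [b"\x06hello", b"\x08goodbye", b"\x04abc"]  # first byte = total length
    assert all(m[0] == len(m) for m in msgs)           # so _detect_length_prefix returns 0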
+
+     def _detect_checksum_field(self, messages: list[bytes]) -> int | None:
+         """Detect checksum field (simplified heuristic)."""
+         # This is a simplified version - real implementation would test CRC8/16/32
+         # For now, just detect if last 1-4 bytes have high variance (likely checksum)
+         if len(messages) < 3:
+             return None
+
+         msg_len = len(messages[0])
+         if not all(len(msg) == msg_len for msg in messages):
+             return None
+
+         # Check last few bytes for high variance
+         for checksum_len in [1, 2, 4]:
+             if msg_len > checksum_len:
+                 offset = msg_len - checksum_len
+                 values = [msg[offset:] for msg in messages]
+                 unique_values = len(set(values))
+                 # If almost all unique, likely a checksum
+                 if unique_values / len(messages) > 0.9:
+                     return offset
+
+         return None
+
+     def _shannon_entropy_bytes(self, data: bytes) -> float:
+         """Calculate Shannon entropy for byte sequence."""
+         if not data:
+             return 0.0
+
+         byte_counts = np.bincount(np.frombuffer(data, dtype=np.uint8), minlength=256)
+         probabilities = byte_counts[byte_counts > 0] / len(data)
+         return float(-np.sum(probabilities * np.log2(probabilities)))
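[Editor's note] A worked instance of the Shannon entropy H = -sum(p_i * log2(p_i)) computed by the helper above, using the same numpy recipe:

    import numpy as np

    data = b"AABB"  # p('A') = p('B') = 0.5, so H = 1 bit/byte
    counts = np.bincount(np.frombuffer(data, dtype=np.uint8), minlength=256)
    p = counts[counts > 0] / len(data)
    print(-np.sum(p * np.log2(p)))  # 1.0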
+
+     def _calculate_analysis_confidence(
+         self,
+         entropy_result: EntropyResult,
+         signatures: list[CandidateSignature],
+         repeating: list[dict[str, Any]],
+         anomalies: list[int],
+     ) -> float:
+         """Calculate overall confidence in binary analysis."""
+         # Base confidence from entropy analysis
+         confidence = entropy_result.confidence
+
+         # Boost if we found signatures
+         if signatures:
+             confidence = min(1.0, confidence + 0.1 * len(signatures))
+
+         # Boost if we found repeating patterns
+         if repeating:
+             confidence = min(1.0, confidence + 0.05 * len(repeating))
+
+         return float(confidence)
+
+     def _calculate_structure_confidence(
+         self,
+         fields: list[FieldDescriptor],
+         is_fixed_length: bool,
+         has_delimiter: bool,
+     ) -> float:
+         """Calculate confidence in protocol structure inference."""
+         confidence = 0.5  # Base confidence
+
+         # More confidence if we found fields
+         if fields:
+             confidence += 0.1 * min(len(fields), 5)
+
+         # More confidence for fixed-length protocols
+         if is_fixed_length:
+             confidence += 0.2
+
+         # More confidence if delimiter found
+         if has_delimiter:
+             confidence += 0.1
+
+         return min(1.0, float(confidence))
+
+
+ # Convenience functions for common operations
+
+
+ def search_pattern(
+     data: bytes,
+     pattern: bytes | str,
+     fuzzy: bool = False,
+     max_distance: int = 2,
+ ) -> list[int]:
+     """Search for pattern in binary data with optional fuzzy matching.
+
+     Args:
+         data: Binary data to search.
+         pattern: Pattern to search for (bytes or hex string).
+         fuzzy: Enable fuzzy/approximate matching.
+         max_distance: Maximum edit distance for fuzzy matching.
+
+     Returns:
+         List of match positions.
+
+     Example:
+         >>> positions = search_pattern(data, b'\\xff\\xfe', fuzzy=False)
+         >>> print(f"Found at: {positions}")
+     """
+     # Convert hex string to bytes
+     if isinstance(pattern, str):
+         pattern = bytes.fromhex(pattern.replace(" ", ""))
+
+     if fuzzy:
+         matcher = FuzzyMatcher(max_edit_distance=max_distance)
+         matches = matcher.search(data, pattern)
+         return [m.offset for m in matches]
+     else:
+         # Simple exact search
+         positions = []
+         for i in range(len(data) - len(pattern) + 1):
+             if data[i : i + len(pattern)] == pattern:
+                 positions.append(i)
+         return positions
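[Editor's note] search_pattern accepts either raw bytes or a hex string (spaces are stripped before decoding). An exact-match example against the function as defined above:

    from oscura.analyzers.patterns.reverse_engineering import search_pattern

    data = b"\x00\xff\xfe\x00\xff\xfe"
    print(search_pattern(data, "ff fe"))  # [1, 4] - hex-string form of b'\xff\xfe'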
+
+
+ def shannon_entropy(data: bytes) -> float:
+     """Calculate Shannon entropy of binary data.
+
+     Convenience wrapper around CryptoDetector entropy calculation.
+
+     Args:
+         data: Binary data to analyze.
+
+     Returns:
+         Shannon entropy in bits per byte (0.0-8.0).
+
+     Example:
+         >>> entropy = shannon_entropy(b'\\x00' * 100)
+         >>> print(f"{entropy:.2f}")
+         0.00
+         >>> import os
+         >>> shannon_entropy(os.urandom(4096)) > 7.9
+         True
+     """
+     detector = CryptoDetector()
+     return detector._shannon_entropy(data)
+
+
+ def byte_frequency_distribution(data: bytes) -> dict[int, int]:
+     """Calculate byte frequency distribution.
+
+     Args:
+         data: Binary data to analyze.
+
+     Returns:
+         Dictionary mapping byte value (0-255) to count.
+
+     Example:
+         >>> freq = byte_frequency_distribution(b'AAABBC')
+         >>> print(freq[ord('A')])
+         3
+     """
+     byte_array = np.frombuffer(data, dtype=np.uint8)
+     counts = np.bincount(byte_array, minlength=256)
+     return {i: int(count) for i, count in enumerate(counts) if count > 0}
+
+
+ def sliding_entropy(
+     data: bytes,
+     window_size: int = 256,
+     stride: int = 64,
+ ) -> list[tuple[int, float]]:
+     """Calculate entropy across sliding windows.
+
+     Wrapper around CryptoDetector.sliding_window_entropy.
+
+     Args:
+         data: Binary data to analyze.
+         window_size: Window size in bytes.
+         stride: Step size between windows.
+
+     Returns:
+         List of (offset, entropy) tuples.
+
+     Example:
+         >>> windows = sliding_entropy(data, window_size=128)
+         >>> for offset, ent in windows:
+         ...     if ent > 7.5:
+         ...         print(f"High entropy at offset {offset}")
+     """
+     detector = CryptoDetector()
+     return detector.sliding_window_entropy(data, window_size=window_size, stride=stride)
+
+
+ def entropy_profile(data: bytes, window_size: int = 256) -> NDArray[np.float64]:
+     """Generate entropy profile (entropy over time).
+
+     Args:
+         data: Binary data to analyze.
+         window_size: Window size for entropy calculation.
+
+     Returns:
+         Array of entropy values for each window position.
+
+     Example:
+         >>> import matplotlib.pyplot as plt
+         >>> profile = entropy_profile(data, window_size=128)
+         >>> plt.plot(profile)
+         >>> plt.ylabel('Entropy (bits/byte)')
+         >>> plt.xlabel('Position')
+     """
+     detector = CryptoDetector()
+     windows = detector.sliding_window_entropy(data, window_size=window_size, stride=1)
+     return np.array([ent for _, ent in windows])
+
+
+ def detect_encrypted_regions(
+     data: bytes,
+     window_size: int = 256,
+     threshold: float = 7.5,
+ ) -> list[tuple[int, int]]:
+     """Detect regions with high entropy (likely encrypted).
+
+     Args:
+         data: Binary data to analyze.
+         window_size: Window size for analysis.
+         threshold: Entropy threshold for encryption (bits/byte).
+
+     Returns:
+         List of (start, end) tuples for encrypted regions.
+
+     Example:
+         >>> regions = detect_encrypted_regions(data)
+         >>> for start, end in regions:
+         ...     print(f"Encrypted: {start}-{end}")
+     """
+     detector = CryptoDetector()
+     windows = detector.sliding_window_entropy(data, window_size=window_size, stride=1)
+
+     regions = []
+     in_region = False
+     region_start = 0
+
+     for offset, entropy in windows:
+         if entropy > threshold:
+             if not in_region:
+                 region_start = offset
+                 in_region = True
+         else:
+             if in_region:
+                 regions.append((region_start, offset))
+                 in_region = False
+
+     # Close final region
+     if in_region:
+         regions.append((region_start, len(data)))
+
+     return regions
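[Editor's note] Two caveats when using this helper, assuming CryptoDetector's windowed entropy is the plug-in estimate shown elsewhere in this file. First, region bounds are window start offsets, so edges are only accurate to within window_size. Second, small windows understate entropy: a 256-byte window of uniform random data measures only about 7.2 bits/byte, below the default 7.5 threshold, so larger windows may be needed. A sketch under those assumptions:

    import os
    from oscura.analyzers.patterns.reverse_engineering import detect_encrypted_regions

    blob = b"HEADER--" * 64 + os.urandom(4096) + b"\x00" * 512
    for start, end in detect_encrypted_regions(blob, window_size=1024, threshold=7.5):
        print(f"high-entropy region: ~{start}..~{end}")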
+
+
+ def detect_compressed_regions(
+     data: bytes,
+     window_size: int = 256,
+ ) -> list[tuple[int, int]]:
+     """Detect regions with medium-high entropy (likely compressed).
+
+     Args:
+         data: Binary data to analyze.
+         window_size: Window size for analysis.
+
+     Returns:
+         List of (start, end) tuples for compressed regions.
+
+     Example:
+         >>> regions = detect_compressed_regions(data)
+         >>> for start, end in regions:
+         ...     print(f"Compressed: {start}-{end}")
+     """
+     detector = CryptoDetector()
+     windows = detector.sliding_window_entropy(data, window_size=window_size, stride=1)
+
+     regions = []
+     in_region = False
+     region_start = 0
+
+     for offset, entropy in windows:
+         # Compressed: 6.5-7.5 bits/byte
+         if 6.5 < entropy < 7.5:
+             if not in_region:
+                 region_start = offset
+                 in_region = True
+         else:
+             if in_region:
+                 regions.append((region_start, offset))
+                 in_region = False
+
+     if in_region:
+         regions.append((region_start, len(data)))
+
+     return regions
+
+
+ __all__ = [
+     "BinaryAnalysisResult",
+     "FieldDescriptor",
+     "ProtocolStructure",
+     "ReverseEngineer",
+     # Convenience functions
+     "byte_frequency_distribution",
+     "detect_compressed_regions",
+     "detect_encrypted_regions",
+     "entropy_profile",
+     "search_pattern",
+     "shannon_entropy",
+     "sliding_entropy",
+ ]
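[Editor's note] An end-to-end sketch tying the new module together, assuming the 0.10.0 API exactly as added above. Exact field splits depend on the 2.0-bit entropy-step threshold in _infer_fields and on the sample count, so treat the printed boundaries as indicative:

    import os
    import struct

    from oscura.analyzers.patterns.reverse_engineering import ReverseEngineer

    # Synthetic fixed-length frames: 2-byte magic, 2-byte big-endian counter, 8 random bytes
    messages = [b"\xaa\x55" + struct.pack(">H", i) + os.urandom(8) for i in range(64)]

    re_tool = ReverseEngineer()
    structure = re_tool.infer_protocol_structure(messages)
    print(structure.message_length)  # 12 (fixed length)
    for f in structure.fields:
        print(f"{f.field_type}: offset={f.offset} length={f.length} entropy={f.entropy:.2f}")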