oscura 0.11.0-py3-none-any.whl → 0.12.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oscura/__init__.py +1 -1
- oscura/analyzers/binary/__init__.py +36 -0
- oscura/analyzers/binary/core/__init__.py +29 -0
- oscura/analyzers/binary/core/file_access.py +193 -0
- oscura/analyzers/binary/core/pipeline.py +161 -0
- oscura/analyzers/binary/core/results.py +217 -0
- oscura/analyzers/binary/detection/__init__.py +10 -0
- oscura/analyzers/binary/detection/encoding.py +624 -0
- oscura/analyzers/binary/detection/patterns.py +320 -0
- oscura/analyzers/binary/detection/structure.py +630 -0
- oscura/analyzers/binary/export/__init__.py +9 -0
- oscura/analyzers/binary/export/dissector.py +174 -0
- oscura/analyzers/binary/inference/__init__.py +15 -0
- oscura/analyzers/binary/inference/checksums.py +214 -0
- oscura/analyzers/binary/inference/fields.py +150 -0
- oscura/analyzers/binary/inference/sequences.py +232 -0
- oscura/analyzers/binary/inference/timestamps.py +210 -0
- oscura/analyzers/binary/visualization/__init__.py +9 -0
- oscura/analyzers/binary/visualization/structure_view.py +182 -0
- oscura/automotive/__init__.py +1 -1
- oscura/automotive/dtc/data.json +102 -17
- oscura/core/schemas/device_mapping.json +8 -2
- oscura/core/schemas/packet_format.json +24 -4
- oscura/core/schemas/protocol_definition.json +12 -2
- oscura/loaders/__init__.py +4 -1
- oscura/loaders/binary.py +284 -1
- oscura/sessions/legacy.py +80 -19
- {oscura-0.11.0.dist-info → oscura-0.12.0.dist-info}/METADATA +3 -3
- {oscura-0.11.0.dist-info → oscura-0.12.0.dist-info}/RECORD +32 -14
- {oscura-0.11.0.dist-info → oscura-0.12.0.dist-info}/WHEEL +0 -0
- {oscura-0.11.0.dist-info → oscura-0.12.0.dist-info}/entry_points.txt +0 -0
- {oscura-0.11.0.dist-info → oscura-0.12.0.dist-info}/licenses/LICENSE +0 -0
oscura/analyzers/binary/detection/structure.py
@@ -0,0 +1,630 @@
"""Structure inference for message and field boundaries in binary files.

This module implements message boundary detection and field inference for
binary files containing structured data. Uses pattern analysis and entropy-based
field boundary detection to reverse engineer message formats.

Example:
    >>> from oscura.analyzers.binary.core.file_access import BinaryFile
    >>> from oscura.analyzers.binary.detection.patterns import PatternMiner
    >>> from oscura.analyzers.binary.detection.encoding import EncodingDetector
    >>> from oscura.analyzers.binary.detection.structure import StructureInferencer
    >>>
    >>> bf = BinaryFile("data.bin")
    >>> pattern_miner = PatternMiner()
    >>> encoding_detector = EncodingDetector()
    >>> patterns = pattern_miner.find_patterns(bf)
    >>> encoding = encoding_detector.detect(bf)
    >>> inferencer = StructureInferencer()
    >>> result = inferencer.infer(bf, patterns, encoding)
    >>> print(f"Found {result.message_count} messages")
    >>> for field in result.fields:
    ...     print(f"  {field.name}: {field.field_type.value}")
"""

from __future__ import annotations

from collections import Counter
from typing import TYPE_CHECKING

import numpy as np
from numpy.typing import NDArray

from oscura.analyzers.binary.core.results import (
    EncodingResult,
    Field,
    FieldType,
    Message,
    Pattern,
    PatternRole,
    StructureResult,
)

if TYPE_CHECKING:
    from oscura.analyzers.binary.core.file_access import BinaryFile

# Sample size for structure analysis (bytes)
STRUCTURE_SAMPLE_SIZE = 100_000  # 100KB

# Length field validation range (bytes)
MIN_MESSAGE_LENGTH = 10
MAX_MESSAGE_LENGTH = 10_000

# Maximum size for last message extraction (bytes)
MAX_LAST_MESSAGE_SIZE = 10_000


class StructureInferencer:
    """Infer message and field structure from binary files.

    Detects message boundaries using pattern analysis and infers field
    boundaries using entropy-based techniques. Designed for reverse
    engineering unknown binary protocols and file formats.

    Example:
        >>> inferencer = StructureInferencer()
        >>> bf = BinaryFile("data.bin")
        >>> result = inferencer.infer(bf, patterns, encoding)
        >>> if result.has_messages:
        ...     print(f"Message length: {result.message_length} bytes")
        ...     print(f"Fields: {len(result.fields)}")
    """

    def __init__(
        self,
        min_messages: int = 3,
        max_field_length: int = 1024,
        entropy_window: int = 8,
        entropy_threshold: float = 0.3,
    ):
        """Initialize structure inferencer.

        Args:
            min_messages: Minimum messages required to infer structure.
            max_field_length: Maximum field length in bytes.
            entropy_window: Window size for entropy analysis.
            entropy_threshold: Entropy change threshold for field boundaries.
        """
        self.min_messages = min_messages
        self.max_field_length = max_field_length
        self.entropy_window = entropy_window
        self.entropy_threshold = entropy_threshold

    def infer(
        self, file: BinaryFile, patterns: list[Pattern], encoding: EncodingResult
    ) -> StructureResult:
        """Infer file structure from patterns and encoding.

        Args:
            file: Binary file to analyze.
            patterns: Detected byte patterns from PatternMiner.
            encoding: Detected encoding from EncodingDetector.

        Returns:
            StructureResult with message and field information.

        Example:
            >>> bf = BinaryFile("data.bin")
            >>> patterns = miner.find_patterns(bf)
            >>> encoding = detector.detect(bf)
            >>> inferencer = StructureInferencer()
            >>> result = inferencer.infer(bf, patterns, encoding)
            >>> print(f"Confidence: {result.confidence:.1%}")
        """
        # Strategy 1: Detect message boundaries
        boundaries = self._detect_boundaries(file, patterns)

        if len(boundaries) < self.min_messages:
            # No clear message structure detected
            return StructureResult(
                has_messages=False,
                message_length=None,
                message_count=0,
                messages=[],
                fields=[],
                confidence=0.0,
            )

        # Strategy 2: Extract messages
        messages = self._extract_messages(file, boundaries)

        if len(messages) < self.min_messages:
            return StructureResult(
                has_messages=False,
                message_length=None,
                message_count=0,
                messages=[],
                fields=[],
                confidence=0.0,
            )

        # Strategy 3: Infer fields from messages
        fields = self._infer_fields(messages)

        # Calculate confidence based on consistency
        confidence = self._calculate_confidence(messages, fields, patterns)

        # Determine message length (use most common if variable)
        message_lengths = [msg.length for msg in messages]
        message_length = int(np.median(message_lengths))

        return StructureResult(
            has_messages=True,
            message_length=message_length,
            message_count=len(messages),
            messages=messages,
            fields=fields,
            confidence=confidence,
        )

    def _detect_boundaries(self, file: BinaryFile, patterns: list[Pattern]) -> list[int]:
        """Detect message boundaries using pattern analysis.

        Tries multiple strategies in order:
        1. Use header pattern positions
        2. Use delimiter pattern with regular spacing
        3. Fixed-length from regular spacing patterns
        4. Length field detection

        Args:
            file: Binary file to analyze.
            patterns: Detected byte patterns.

        Returns:
            List of byte offsets where messages start.
        """
        if not patterns:
            return []

        # Strategy 1: Use header patterns
        header_patterns = [p for p in patterns if p.role == PatternRole.HEADER]
        if header_patterns:
            # Use most frequent header pattern
            header = max(header_patterns, key=lambda p: p.count)
            if header.count >= self.min_messages:
                return sorted(header.positions)

        # Strategy 2: Use delimiter patterns with regular spacing
        delimiter_patterns = [p for p in patterns if p.role == PatternRole.DELIMITER and p.regular]
        if delimiter_patterns:
            # Use delimiter with most regular spacing
            delimiter = max(delimiter_patterns, key=lambda p: p.count)
            if delimiter.count >= self.min_messages:
                return sorted(delimiter.positions)

        # Strategy 3: Fixed-length messages from regular spacing
        regular_patterns = [p for p in patterns if p.regular and p.count >= self.min_messages]
        if regular_patterns:
            # Use pattern with largest average spacing (likely message boundaries)
            best_pattern = max(regular_patterns, key=lambda p: p.avg_spacing)
            # Import MIN_MESSAGE_SPACING from patterns module
            from oscura.analyzers.binary.detection.patterns import MIN_MESSAGE_SPACING

            if best_pattern.avg_spacing > MIN_MESSAGE_SPACING:
                return sorted(best_pattern.positions)

        # Strategy 4: Length field detection
        length_boundaries = self._detect_length_field_boundaries(file)
        if len(length_boundaries) >= self.min_messages:
            return length_boundaries

        # No clear boundaries found
        return []

    def _detect_length_field_boundaries(self, file: BinaryFile) -> list[int]:
        """Detect message boundaries using length field analysis.

        Scans file for uint16/uint32 values that might indicate message lengths
        and validates by checking if they form consistent message structures.

        Args:
            file: Binary file to analyze.

        Returns:
            List of detected message boundary positions.
        """
        # Sample beginning of file for length field detection
        sample_size = min(STRUCTURE_SAMPLE_SIZE, file.size)
        sample_data = file.read_bytes(0, sample_size)

        boundaries: list[int] = []

        # Try uint16 length fields (2 bytes)
        for offset in range(0, len(sample_data) - 2, 2):
            # Read potential length field (little-endian)
            length = int.from_bytes(sample_data[offset : offset + 2], byteorder="little")

            # Validate length is reasonable
            if MIN_MESSAGE_LENGTH <= length <= MAX_MESSAGE_LENGTH:
                # Check if this forms a valid message
                next_offset = offset + 2 + length
                if next_offset < len(sample_data) - 2:
                    # Check if next position also has valid length field
                    next_length = int.from_bytes(
                        sample_data[next_offset : next_offset + 2], byteorder="little"
                    )
                    if MIN_MESSAGE_LENGTH <= next_length <= MAX_MESSAGE_LENGTH:
                        boundaries.append(offset)

            # Found potential pattern, continue from next message
            if len(boundaries) >= self.min_messages:
                break

        return boundaries

    def _extract_messages(self, file: BinaryFile, boundaries: list[int]) -> list[Message]:
        """Extract messages from file using detected boundaries.

        Args:
            file: Binary file to read from.
            boundaries: List of message start positions.

        Returns:
            List of Message objects.
        """
        messages: list[Message] = []

        for i, start in enumerate(boundaries):
            # Calculate message length
            if i + 1 < len(boundaries):
                # Length is distance to next boundary
                length = boundaries[i + 1] - start
            else:
                # Last message: use remaining file data with size limit
                length = min(MAX_LAST_MESSAGE_SIZE, file.size - start)

            # Validate length is reasonable
            if length <= 0 or length > MAX_MESSAGE_LENGTH:
                continue

            # Read message data
            data = file.read_bytes(start, length)

            if len(data) < length:
                # End of file reached
                length = len(data)

            if length > 0:
                messages.append(Message(offset=start, length=length, data=data, index=i))

        return messages

    def _infer_fields(self, messages: list[Message]) -> list[Field]:
        """Infer field boundaries and types from messages.

        Uses entropy analysis and value analysis to detect field boundaries.

        Args:
            messages: List of messages to analyze.

        Returns:
            List of Field objects with inferred types.
        """
        if not messages:
            return []

        # Find consistent message length (use most common)
        lengths = [msg.length for msg in messages]
        length_counts = Counter(lengths)
        most_common_length = length_counts.most_common(1)[0][0]

        # Filter messages with consistent length
        consistent_messages = [msg for msg in messages if msg.length == most_common_length]

        if len(consistent_messages) < self.min_messages:
            # Not enough consistent messages
            return []

        # Align messages as 2D numpy array (rows=messages, cols=bytes)
        aligned = self._align_messages(consistent_messages, most_common_length)

        # Detect field boundaries using entropy analysis
        field_boundaries = self._detect_field_boundaries(aligned)

        # Classify fields
        fields: list[Field] = []
        for i, start_offset in enumerate(field_boundaries):
            # Calculate field length
            if i + 1 < len(field_boundaries):
                length = field_boundaries[i + 1] - start_offset
            else:
                length = most_common_length - start_offset

            # Ensure reasonable field length
            if length <= 0 or length > self.max_field_length:
                continue

            # Classify field type
            field = self._classify_field(aligned, start_offset, length)
            fields.append(field)

        # Merge adjacent similar fields
        fields = self._merge_adjacent_fields(fields)

        return fields

    def _align_messages(self, messages: list[Message], length: int) -> NDArray[np.uint8]:
        """Align messages as 2D numpy array.

        Args:
            messages: List of messages with same length.
            length: Message length in bytes.

        Returns:
            2D array with shape (n_messages, length).
        """
        n_messages = len(messages)
        aligned = np.zeros((n_messages, length), dtype=np.uint8)

        for i, msg in enumerate(messages):
            # Copy message bytes to row
            msg_bytes = np.frombuffer(msg.data[:length], dtype=np.uint8)
            aligned[i, : len(msg_bytes)] = msg_bytes

        return aligned

    def _detect_field_boundaries(self, aligned: NDArray[np.uint8]) -> list[int]:
        """Detect field boundaries using sliding window entropy analysis.

        Args:
            aligned: Aligned message array (n_messages x message_length).

        Returns:
            List of field boundary offsets.
        """
        if aligned.shape[0] < 2 or aligned.shape[1] < self.entropy_window:
            return [0]

        n_messages, msg_length = aligned.shape
        boundaries = [0]  # Always start at offset 0

        # Compute entropy for each byte position
        entropies = np.zeros(msg_length)
        for pos in range(msg_length):
            values = aligned[:, pos]
            entropies[pos] = self._compute_byte_entropy(values)

        # Smooth entropy using sliding window
        window = self.entropy_window
        smoothed = np.convolve(entropies, np.ones(window) / window, mode="valid")

        # Detect boundaries where entropy changes significantly
        entropy_diff = np.abs(np.diff(smoothed))
        threshold = np.mean(entropy_diff) + self.entropy_threshold * np.std(entropy_diff)

        for i in range(len(entropy_diff)):
            if entropy_diff[i] > threshold:
                # Found potential boundary
                boundary_pos = i + window // 2
                if boundary_pos > boundaries[-1] + 1:  # Avoid duplicate boundaries
                    boundaries.append(boundary_pos)

        # If no boundaries detected beyond start, add some default boundaries
        if len(boundaries) == 1 and msg_length >= 16:
            # Add boundaries at common field sizes (4, 8, 12, 16 bytes)
            for offset in [4, 8, 12, 16]:
                if offset < msg_length and offset not in boundaries:
                    boundaries.append(offset)

        return sorted(boundaries)

    def _compute_byte_entropy(self, values: NDArray[np.uint8]) -> float:
        """Compute Shannon entropy for byte values.

        Args:
            values: Array of byte values.

        Returns:
            Entropy in bits (0-8).
        """
        if len(values) == 0:
            return 0.0

        # Count value frequencies
        counts = np.bincount(values, minlength=256)
        probabilities = counts[counts > 0] / len(values)

        # Shannon entropy
        entropy = -np.sum(probabilities * np.log2(probabilities))
        return float(entropy)

    def _classify_field(self, aligned: NDArray[np.uint8], offset: int, length: int) -> Field:
        """Classify field type based on value analysis.

        Args:
            aligned: Aligned message array.
            offset: Field start offset.
            length: Field length in bytes.

        Returns:
            Classified Field object.
        """
        # Extract field values from all messages
        field_data = aligned[:, offset : offset + length]

        # Convert to comparable values based on length
        if length == 1:
            values_int = field_data[:, 0].astype(int)
        elif length == 2:
            # uint16 interpretation
            values_int = field_data[:, 0].astype(int) + field_data[:, 1].astype(int) * 256
        elif length == 4:
            # uint32 interpretation
            values_int = (
                field_data[:, 0].astype(int)
                + field_data[:, 1].astype(int) * 256
                + field_data[:, 2].astype(int) * 65536
                + field_data[:, 3].astype(int) * 16777216
            )
        else:
            # For longer fields, use bytes directly
            values_int = None

        # Check if field is constant
        unique_count = len(np.unique(field_data, axis=0))
        is_constant = unique_count == 1

        if is_constant:
            field_type = FieldType.CONSTANT
            name = f"const_{offset:04d}"
            sample_values = [field_data[0].tobytes()]
        elif values_int is not None:
            # Analyze numeric field
            field_type, name = self._classify_numeric_field(values_int, offset, length)
            sample_values = values_int[:5].tolist()
        else:
            # Unknown/payload field
            field_type = FieldType.UNKNOWN
            name = f"field_{offset:04d}"
            sample_values = [row.tobytes() for row in field_data[:5]]

        # Compute statistics
        statistics: dict[str, float] = {}
        if values_int is not None:
            statistics = {
                "mean": float(np.mean(values_int)),
                "std": float(np.std(values_int)),
                "min": float(np.min(values_int)),
                "max": float(np.max(values_int)),
                "unique_count": int(len(np.unique(values_int))),
            }

        return Field(
            name=name,
            offset=offset,
            length=length,
            field_type=field_type,
            constant=is_constant,
            values=sample_values,
            statistics=statistics,
            metadata={"entropy": self._compute_byte_entropy(field_data.flatten())},
        )

    def _classify_numeric_field(
        self, values: NDArray[np.int64], offset: int, length: int
    ) -> tuple[FieldType, str]:
        """Classify numeric field type.

        Args:
            values: Field values as integers.
            offset: Field offset in message.
            length: Field length in bytes.

        Returns:
            Tuple of (FieldType, field_name).
        """
        # Check for sequence (incrementing values)
        if len(values) > 2:
            diffs = np.diff(values)
            if np.all(diffs == 1):
                return FieldType.SEQUENCE, f"seq_{offset:04d}"
            if np.all(diffs >= 0) and np.mean(diffs) > 0.8:
                # Mostly incrementing
                return FieldType.SEQUENCE, f"counter_{offset:04d}"

        # Check for timestamp (large values, mostly increasing)
        if length >= 4:
            if np.min(values) > 1_000_000 and np.all(np.diff(values) >= 0):
                return FieldType.TIMESTAMP, f"timestamp_{offset:04d}"

        # Check for length field (values match some pattern)
        unique_count = len(np.unique(values))
        if unique_count < len(values) * 0.5 and np.max(values) < 10_000:
            return FieldType.LENGTH, f"length_{offset:04d}"

        # Check for checksum (appears random but not too large)
        entropy = self._compute_byte_entropy(values.astype(np.uint8))
        if entropy > 5.0 and length <= 4:
            return FieldType.CHECKSUM, f"checksum_{offset:04d}"

        # Default to unknown
        return FieldType.UNKNOWN, f"field_{offset:04d}"

    def _merge_adjacent_fields(self, fields: list[Field]) -> list[Field]:
        """Merge adjacent fields with similar characteristics.

        Args:
            fields: List of fields to merge.

        Returns:
            Merged field list.
        """
        if len(fields) <= 1:
            return fields

        merged: list[Field] = []
        current = fields[0]

        for next_field in fields[1:]:
            # Check if fields should be merged
            if (
                current.field_type == next_field.field_type
                and current.offset + current.length == next_field.offset
                and current.field_type in (FieldType.PAYLOAD, FieldType.UNKNOWN, FieldType.CONSTANT)
            ):
                # Merge fields
                current = Field(
                    name=current.name,
                    offset=current.offset,
                    length=current.length + next_field.length,
                    field_type=current.field_type,
                    constant=current.constant and next_field.constant,
                    values=current.values,
                    statistics=current.statistics,
                    metadata=current.metadata,
                )
            else:
                # Save current and start new
                merged.append(current)
                current = next_field

        # Add last field
        merged.append(current)

        return merged

    def _calculate_confidence(
        self, messages: list[Message], fields: list[Field], patterns: list[Pattern]
    ) -> float:
        """Calculate confidence in structure inference.

        Args:
            messages: Extracted messages.
            fields: Inferred fields.
            patterns: Detected patterns.

        Returns:
            Confidence score (0.0 to 1.0).
        """
        if not messages:
            return 0.0

        confidence = 0.0

        # Factor 1: Message length consistency (up to 0.4)
        lengths = [msg.length for msg in messages]
        if len(lengths) > 1:
            length_std = float(np.std(lengths))
            length_mean = float(np.mean(lengths))
            if length_mean > 0:
                cv = length_std / length_mean  # Coefficient of variation
                length_confidence = float(max(0.0, 1.0 - cv))  # Lower CV = higher confidence
                confidence += length_confidence * 0.4

        # Factor 2: Number of fields detected (up to 0.3)
        if fields:
            # More fields = better structure understanding
            field_confidence = min(1.0, len(fields) / 10.0)
            confidence += field_confidence * 0.3

        # Factor 3: Pattern support (up to 0.3)
        if patterns:
            # Having regular patterns increases confidence
            regular_patterns = sum(1 for p in patterns if p.regular)
            pattern_confidence = min(1.0, regular_patterns / 3.0)
            confidence += pattern_confidence * 0.3

        return min(1.0, confidence)


__all__ = ["StructureInferencer"]
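
For context, a minimal end-to-end sketch of how the new inferencer is driven by the rest of the added analyzers.binary package. It reuses only the names and signatures visible in this diff (BinaryFile, PatternMiner.find_patterns, EncodingDetector.detect, StructureInferencer.infer and the StructureResult/Field attributes); "data.bin" is a placeholder path, not something shipped with the package:

    from oscura.analyzers.binary.core.file_access import BinaryFile
    from oscura.analyzers.binary.detection.encoding import EncodingDetector
    from oscura.analyzers.binary.detection.patterns import PatternMiner
    from oscura.analyzers.binary.detection.structure import StructureInferencer

    # Pattern mining and encoding detection run first; their results feed infer().
    bf = BinaryFile("data.bin")  # placeholder capture file
    patterns = PatternMiner().find_patterns(bf)
    encoding = EncodingDetector().detect(bf)

    result = StructureInferencer(min_messages=3, entropy_threshold=0.3).infer(bf, patterns, encoding)
    if result.has_messages:
        print(f"{result.message_count} messages, ~{result.message_length} bytes each")
        print(f"confidence: {result.confidence:.1%}")
        for field in result.fields:
            # Each inferred field carries its offset, length, and heuristic type.
            print(f"  {field.name} @ {field.offset} (+{field.length}): {field.field_type.value}")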