oscura 0.11.0-py3-none-any.whl → 0.12.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. oscura/__init__.py +1 -1
  2. oscura/analyzers/binary/__init__.py +36 -0
  3. oscura/analyzers/binary/core/__init__.py +29 -0
  4. oscura/analyzers/binary/core/file_access.py +193 -0
  5. oscura/analyzers/binary/core/pipeline.py +161 -0
  6. oscura/analyzers/binary/core/results.py +217 -0
  7. oscura/analyzers/binary/detection/__init__.py +10 -0
  8. oscura/analyzers/binary/detection/encoding.py +624 -0
  9. oscura/analyzers/binary/detection/patterns.py +320 -0
  10. oscura/analyzers/binary/detection/structure.py +630 -0
  11. oscura/analyzers/binary/export/__init__.py +9 -0
  12. oscura/analyzers/binary/export/dissector.py +174 -0
  13. oscura/analyzers/binary/inference/__init__.py +15 -0
  14. oscura/analyzers/binary/inference/checksums.py +214 -0
  15. oscura/analyzers/binary/inference/fields.py +150 -0
  16. oscura/analyzers/binary/inference/sequences.py +232 -0
  17. oscura/analyzers/binary/inference/timestamps.py +210 -0
  18. oscura/analyzers/binary/visualization/__init__.py +9 -0
  19. oscura/analyzers/binary/visualization/structure_view.py +182 -0
  20. oscura/automotive/__init__.py +1 -1
  21. oscura/automotive/dtc/data.json +102 -17
  22. oscura/core/schemas/device_mapping.json +8 -2
  23. oscura/core/schemas/packet_format.json +24 -4
  24. oscura/core/schemas/protocol_definition.json +12 -2
  25. oscura/loaders/__init__.py +4 -1
  26. oscura/loaders/binary.py +284 -1
  27. oscura/sessions/legacy.py +80 -19
  28. {oscura-0.11.0.dist-info → oscura-0.12.0.dist-info}/METADATA +3 -3
  29. {oscura-0.11.0.dist-info → oscura-0.12.0.dist-info}/RECORD +32 -14
  30. {oscura-0.11.0.dist-info → oscura-0.12.0.dist-info}/WHEEL +0 -0
  31. {oscura-0.11.0.dist-info → oscura-0.12.0.dist-info}/entry_points.txt +0 -0
  32. {oscura-0.11.0.dist-info → oscura-0.12.0.dist-info}/licenses/LICENSE +0 -0
oscura/analyzers/binary/detection/structure.py
@@ -0,0 +1,630 @@
+ """Structure inference for message and field boundaries in binary files.
+
+ This module implements message boundary detection and field inference for
+ binary files containing structured data. Uses pattern analysis and entropy-based
+ field boundary detection to reverse engineer message formats.
+
+ Example:
+     >>> from oscura.analyzers.binary.core.file_access import BinaryFile
+     >>> from oscura.analyzers.binary.detection.patterns import PatternMiner
+     >>> from oscura.analyzers.binary.detection.encoding import EncodingDetector
+     >>> from oscura.analyzers.binary.detection.structure import StructureInferencer
+     >>>
+     >>> bf = BinaryFile("data.bin")
+     >>> pattern_miner = PatternMiner()
+     >>> encoding_detector = EncodingDetector()
+     >>> patterns = pattern_miner.find_patterns(bf)
+     >>> encoding = encoding_detector.detect(bf)
+     >>> inferencer = StructureInferencer()
+     >>> result = inferencer.infer(bf, patterns, encoding)
+     >>> print(f"Found {result.message_count} messages")
+     >>> for field in result.fields:
+     ...     print(f" {field.name}: {field.field_type.value}")
+ """
+
+ from __future__ import annotations
+
+ from collections import Counter
+ from typing import TYPE_CHECKING
+
+ import numpy as np
+ from numpy.typing import NDArray
+
+ from oscura.analyzers.binary.core.results import (
+     EncodingResult,
+     Field,
+     FieldType,
+     Message,
+     Pattern,
+     PatternRole,
+     StructureResult,
+ )
+
+ if TYPE_CHECKING:
+     from oscura.analyzers.binary.core.file_access import BinaryFile
+
+ # Sample size for structure analysis (bytes)
+ STRUCTURE_SAMPLE_SIZE = 100_000  # 100KB
+
+ # Length field validation range (bytes)
+ MIN_MESSAGE_LENGTH = 10
+ MAX_MESSAGE_LENGTH = 10_000
+
+ # Maximum size for last message extraction (bytes)
+ MAX_LAST_MESSAGE_SIZE = 10_000
+
+
+ class StructureInferencer:
+     """Infer message and field structure from binary files.
+
+     Detects message boundaries using pattern analysis and infers field
+     boundaries using entropy-based techniques. Designed for reverse
+     engineering unknown binary protocols and file formats.
+
+     Example:
+         >>> inferencer = StructureInferencer()
+         >>> bf = BinaryFile("data.bin")
+         >>> result = inferencer.infer(bf, patterns, encoding)
+         >>> if result.has_messages:
+         ...     print(f"Message length: {result.message_length} bytes")
+         ...     print(f"Fields: {len(result.fields)}")
+     """
+
+     def __init__(
+         self,
+         min_messages: int = 3,
+         max_field_length: int = 1024,
+         entropy_window: int = 8,
+         entropy_threshold: float = 0.3,
+     ):
+         """Initialize structure inferencer.
+
+         Args:
+             min_messages: Minimum messages required to infer structure.
+             max_field_length: Maximum field length in bytes.
+             entropy_window: Window size for entropy analysis.
+             entropy_threshold: Entropy change threshold for field boundaries.
+         """
+         self.min_messages = min_messages
+         self.max_field_length = max_field_length
+         self.entropy_window = entropy_window
+         self.entropy_threshold = entropy_threshold
+
+     def infer(
+         self, file: BinaryFile, patterns: list[Pattern], encoding: EncodingResult
+     ) -> StructureResult:
+         """Infer file structure from patterns and encoding.
+
+         Args:
+             file: Binary file to analyze.
+             patterns: Detected byte patterns from PatternMiner.
+             encoding: Detected encoding from EncodingDetector.
+
+         Returns:
+             StructureResult with message and field information.
+
+         Example:
+             >>> bf = BinaryFile("data.bin")
+             >>> patterns = miner.find_patterns(bf)
+             >>> encoding = detector.detect(bf)
+             >>> inferencer = StructureInferencer()
+             >>> result = inferencer.infer(bf, patterns, encoding)
+             >>> print(f"Confidence: {result.confidence:.1%}")
+         """
+         # Strategy 1: Detect message boundaries
+         boundaries = self._detect_boundaries(file, patterns)
+
+         if len(boundaries) < self.min_messages:
+             # No clear message structure detected
+             return StructureResult(
+                 has_messages=False,
+                 message_length=None,
+                 message_count=0,
+                 messages=[],
+                 fields=[],
+                 confidence=0.0,
+             )
+
+         # Strategy 2: Extract messages
+         messages = self._extract_messages(file, boundaries)
+
+         if len(messages) < self.min_messages:
+             return StructureResult(
+                 has_messages=False,
+                 message_length=None,
+                 message_count=0,
+                 messages=[],
+                 fields=[],
+                 confidence=0.0,
+             )
+
+         # Strategy 3: Infer fields from messages
+         fields = self._infer_fields(messages)
+
+         # Calculate confidence based on consistency
+         confidence = self._calculate_confidence(messages, fields, patterns)
+
+         # Determine message length (use most common if variable)
+         message_lengths = [msg.length for msg in messages]
+         message_length = int(np.median(message_lengths))
+
+         return StructureResult(
+             has_messages=True,
+             message_length=message_length,
+             message_count=len(messages),
+             messages=messages,
+             fields=fields,
+             confidence=confidence,
+         )
+
+     def _detect_boundaries(self, file: BinaryFile, patterns: list[Pattern]) -> list[int]:
+         """Detect message boundaries using pattern analysis.
+
+         Tries multiple strategies in order:
+         1. Use header pattern positions
+         2. Use delimiter pattern with regular spacing
+         3. Fixed-length from regular spacing patterns
+         4. Length field detection
+
+         Args:
+             file: Binary file to analyze.
+             patterns: Detected byte patterns.
+
+         Returns:
+             List of byte offsets where messages start.
+         """
+         if not patterns:
+             return []
+
+         # Strategy 1: Use header patterns
+         header_patterns = [p for p in patterns if p.role == PatternRole.HEADER]
+         if header_patterns:
+             # Use most frequent header pattern
+             header = max(header_patterns, key=lambda p: p.count)
+             if header.count >= self.min_messages:
+                 return sorted(header.positions)
+
+         # Strategy 2: Use delimiter patterns with regular spacing
+         delimiter_patterns = [p for p in patterns if p.role == PatternRole.DELIMITER and p.regular]
+         if delimiter_patterns:
+             # Use delimiter with most regular spacing
+             delimiter = max(delimiter_patterns, key=lambda p: p.count)
+             if delimiter.count >= self.min_messages:
+                 return sorted(delimiter.positions)
+
+         # Strategy 3: Fixed-length messages from regular spacing
+         regular_patterns = [p for p in patterns if p.regular and p.count >= self.min_messages]
+         if regular_patterns:
+             # Use pattern with largest average spacing (likely message boundaries)
+             best_pattern = max(regular_patterns, key=lambda p: p.avg_spacing)
+             # Import MIN_MESSAGE_SPACING from patterns module
+             from oscura.analyzers.binary.detection.patterns import MIN_MESSAGE_SPACING
+
+             if best_pattern.avg_spacing > MIN_MESSAGE_SPACING:
+                 return sorted(best_pattern.positions)
+
+         # Strategy 4: Length field detection
+         length_boundaries = self._detect_length_field_boundaries(file)
+         if len(length_boundaries) >= self.min_messages:
+             return length_boundaries
+
+         # No clear boundaries found
+         return []
+
+     def _detect_length_field_boundaries(self, file: BinaryFile) -> list[int]:
+         """Detect message boundaries using length field analysis.
+
+         Scans file for uint16/uint32 values that might indicate message lengths
+         and validates by checking if they form consistent message structures.
+
+         Args:
+             file: Binary file to analyze.
+
+         Returns:
+             List of detected message boundary positions.
+         """
+         # Sample beginning of file for length field detection
+         sample_size = min(STRUCTURE_SAMPLE_SIZE, file.size)
+         sample_data = file.read_bytes(0, sample_size)
+
+         boundaries: list[int] = []
+
+         # Try uint16 length fields (2 bytes)
+         for offset in range(0, len(sample_data) - 2, 2):
+             # Read potential length field (little-endian)
+             length = int.from_bytes(sample_data[offset : offset + 2], byteorder="little")
+
+             # Validate length is reasonable
+             if MIN_MESSAGE_LENGTH <= length <= MAX_MESSAGE_LENGTH:
+                 # Check if this forms a valid message
+                 next_offset = offset + 2 + length
+                 if next_offset < len(sample_data) - 2:
+                     # Check if next position also has valid length field
+                     next_length = int.from_bytes(
+                         sample_data[next_offset : next_offset + 2], byteorder="little"
+                     )
+                     if MIN_MESSAGE_LENGTH <= next_length <= MAX_MESSAGE_LENGTH:
+                         boundaries.append(offset)
+
+             # Found potential pattern, continue from next message
+             if len(boundaries) >= self.min_messages:
+                 break
+
+         return boundaries
+
+     def _extract_messages(self, file: BinaryFile, boundaries: list[int]) -> list[Message]:
+         """Extract messages from file using detected boundaries.
+
+         Args:
+             file: Binary file to read from.
+             boundaries: List of message start positions.
+
+         Returns:
+             List of Message objects.
+         """
+         messages: list[Message] = []
+
+         for i, start in enumerate(boundaries):
+             # Calculate message length
+             if i + 1 < len(boundaries):
+                 # Length is distance to next boundary
+                 length = boundaries[i + 1] - start
+             else:
+                 # Last message: use remaining file data with size limit
+                 length = min(MAX_LAST_MESSAGE_SIZE, file.size - start)
+
+             # Validate length is reasonable
+             if length <= 0 or length > MAX_MESSAGE_LENGTH:
+                 continue
+
+             # Read message data
+             data = file.read_bytes(start, length)
+
+             if len(data) < length:
+                 # End of file reached
+                 length = len(data)
+
+             if length > 0:
+                 messages.append(Message(offset=start, length=length, data=data, index=i))
+
+         return messages
+
+     def _infer_fields(self, messages: list[Message]) -> list[Field]:
+         """Infer field boundaries and types from messages.
+
+         Uses entropy analysis and value analysis to detect field boundaries.
+
+         Args:
+             messages: List of messages to analyze.
+
+         Returns:
+             List of Field objects with inferred types.
+         """
+         if not messages:
+             return []
+
+         # Find consistent message length (use most common)
+         lengths = [msg.length for msg in messages]
+         length_counts = Counter(lengths)
+         most_common_length = length_counts.most_common(1)[0][0]
+
+         # Filter messages with consistent length
+         consistent_messages = [msg for msg in messages if msg.length == most_common_length]
+
+         if len(consistent_messages) < self.min_messages:
+             # Not enough consistent messages
+             return []
+
+         # Align messages as 2D numpy array (rows=messages, cols=bytes)
+         aligned = self._align_messages(consistent_messages, most_common_length)
+
+         # Detect field boundaries using entropy analysis
+         field_boundaries = self._detect_field_boundaries(aligned)
+
+         # Classify fields
+         fields: list[Field] = []
+         for i, start_offset in enumerate(field_boundaries):
+             # Calculate field length
+             if i + 1 < len(field_boundaries):
+                 length = field_boundaries[i + 1] - start_offset
+             else:
+                 length = most_common_length - start_offset
+
+             # Ensure reasonable field length
+             if length <= 0 or length > self.max_field_length:
+                 continue
+
+             # Classify field type
+             field = self._classify_field(aligned, start_offset, length)
+             fields.append(field)
+
+         # Merge adjacent similar fields
+         fields = self._merge_adjacent_fields(fields)
+
+         return fields
+
+     def _align_messages(self, messages: list[Message], length: int) -> NDArray[np.uint8]:
+         """Align messages as 2D numpy array.
+
+         Args:
+             messages: List of messages with same length.
+             length: Message length in bytes.
+
+         Returns:
+             2D array with shape (n_messages, length).
+         """
+         n_messages = len(messages)
+         aligned = np.zeros((n_messages, length), dtype=np.uint8)
+
+         for i, msg in enumerate(messages):
+             # Copy message bytes to row
+             msg_bytes = np.frombuffer(msg.data[:length], dtype=np.uint8)
+             aligned[i, : len(msg_bytes)] = msg_bytes
+
+         return aligned
+
+     def _detect_field_boundaries(self, aligned: NDArray[np.uint8]) -> list[int]:
+         """Detect field boundaries using sliding window entropy analysis.
+
+         Args:
+             aligned: Aligned message array (n_messages x message_length).
+
+         Returns:
+             List of field boundary offsets.
+         """
+         if aligned.shape[0] < 2 or aligned.shape[1] < self.entropy_window:
+             return [0]
+
+         n_messages, msg_length = aligned.shape
+         boundaries = [0]  # Always start at offset 0
+
+         # Compute entropy for each byte position
+         entropies = np.zeros(msg_length)
+         for pos in range(msg_length):
+             values = aligned[:, pos]
+             entropies[pos] = self._compute_byte_entropy(values)
+
+         # Smooth entropy using sliding window
+         window = self.entropy_window
+         smoothed = np.convolve(entropies, np.ones(window) / window, mode="valid")
+
+         # Detect boundaries where entropy changes significantly
+         entropy_diff = np.abs(np.diff(smoothed))
+         threshold = np.mean(entropy_diff) + self.entropy_threshold * np.std(entropy_diff)
+
+         for i in range(len(entropy_diff)):
+             if entropy_diff[i] > threshold:
+                 # Found potential boundary
+                 boundary_pos = i + window // 2
+                 if boundary_pos > boundaries[-1] + 1:  # Avoid duplicate boundaries
+                     boundaries.append(boundary_pos)
+
+         # If no boundaries detected beyond start, add some default boundaries
+         if len(boundaries) == 1 and msg_length >= 16:
+             # Add boundaries at common field sizes (4, 8, 12, 16 bytes)
+             for offset in [4, 8, 12, 16]:
+                 if offset < msg_length and offset not in boundaries:
+                     boundaries.append(offset)
+
+         return sorted(boundaries)
+
+     def _compute_byte_entropy(self, values: NDArray[np.uint8]) -> float:
+         """Compute Shannon entropy for byte values.
+
+         Args:
+             values: Array of byte values.
+
+         Returns:
+             Entropy in bits (0-8).
+         """
+         if len(values) == 0:
+             return 0.0
+
+         # Count value frequencies
+         counts = np.bincount(values, minlength=256)
+         probabilities = counts[counts > 0] / len(values)
+
+         # Shannon entropy
+         entropy = -np.sum(probabilities * np.log2(probabilities))
+         return float(entropy)
+
+     def _classify_field(self, aligned: NDArray[np.uint8], offset: int, length: int) -> Field:
+         """Classify field type based on value analysis.
+
+         Args:
+             aligned: Aligned message array.
+             offset: Field start offset.
+             length: Field length in bytes.
+
+         Returns:
+             Classified Field object.
+         """
+         # Extract field values from all messages
+         field_data = aligned[:, offset : offset + length]
+
+         # Convert to comparable values based on length
+         if length == 1:
+             values_int = field_data[:, 0].astype(int)
+         elif length == 2:
+             # uint16 interpretation
+             values_int = field_data[:, 0].astype(int) + field_data[:, 1].astype(int) * 256
+         elif length == 4:
+             # uint32 interpretation
+             values_int = (
+                 field_data[:, 0].astype(int)
+                 + field_data[:, 1].astype(int) * 256
+                 + field_data[:, 2].astype(int) * 65536
+                 + field_data[:, 3].astype(int) * 16777216
+             )
+         else:
+             # For longer fields, use bytes directly
+             values_int = None
+
+         # Check if field is constant
+         unique_count = len(np.unique(field_data, axis=0))
+         is_constant = unique_count == 1
+
+         if is_constant:
+             field_type = FieldType.CONSTANT
+             name = f"const_{offset:04d}"
+             sample_values = [field_data[0].tobytes()]
+         elif values_int is not None:
+             # Analyze numeric field
+             field_type, name = self._classify_numeric_field(values_int, offset, length)
+             sample_values = values_int[:5].tolist()
+         else:
+             # Unknown/payload field
+             field_type = FieldType.UNKNOWN
+             name = f"field_{offset:04d}"
+             sample_values = [row.tobytes() for row in field_data[:5]]
+
+         # Compute statistics
+         statistics: dict[str, float] = {}
+         if values_int is not None:
+             statistics = {
+                 "mean": float(np.mean(values_int)),
+                 "std": float(np.std(values_int)),
+                 "min": float(np.min(values_int)),
+                 "max": float(np.max(values_int)),
+                 "unique_count": int(len(np.unique(values_int))),
+             }
+
+         return Field(
+             name=name,
+             offset=offset,
+             length=length,
+             field_type=field_type,
+             constant=is_constant,
+             values=sample_values,
+             statistics=statistics,
+             metadata={"entropy": self._compute_byte_entropy(field_data.flatten())},
+         )
+
+     def _classify_numeric_field(
+         self, values: NDArray[np.int64], offset: int, length: int
+     ) -> tuple[FieldType, str]:
+         """Classify numeric field type.
+
+         Args:
+             values: Field values as integers.
+             offset: Field offset in message.
+             length: Field length in bytes.
+
+         Returns:
+             Tuple of (FieldType, field_name).
+         """
+         # Check for sequence (incrementing values)
+         if len(values) > 2:
+             diffs = np.diff(values)
+             if np.all(diffs == 1):
+                 return FieldType.SEQUENCE, f"seq_{offset:04d}"
+             if np.all(diffs >= 0) and np.mean(diffs) > 0.8:
+                 # Mostly incrementing
+                 return FieldType.SEQUENCE, f"counter_{offset:04d}"
+
+         # Check for timestamp (large values, mostly increasing)
+         if length >= 4:
+             if np.min(values) > 1_000_000 and np.all(np.diff(values) >= 0):
+                 return FieldType.TIMESTAMP, f"timestamp_{offset:04d}"
+
+         # Check for length field (values match some pattern)
+         unique_count = len(np.unique(values))
+         if unique_count < len(values) * 0.5 and np.max(values) < 10_000:
+             return FieldType.LENGTH, f"length_{offset:04d}"
+
+         # Check for checksum (appears random but not too large)
+         entropy = self._compute_byte_entropy(values.astype(np.uint8))
+         if entropy > 5.0 and length <= 4:
+             return FieldType.CHECKSUM, f"checksum_{offset:04d}"
+
+         # Default to unknown
+         return FieldType.UNKNOWN, f"field_{offset:04d}"
+
+     def _merge_adjacent_fields(self, fields: list[Field]) -> list[Field]:
+         """Merge adjacent fields with similar characteristics.
+
+         Args:
+             fields: List of fields to merge.
+
+         Returns:
+             Merged field list.
+         """
+         if len(fields) <= 1:
+             return fields
+
+         merged: list[Field] = []
+         current = fields[0]
+
+         for next_field in fields[1:]:
+             # Check if fields should be merged
+             if (
+                 current.field_type == next_field.field_type
+                 and current.offset + current.length == next_field.offset
+                 and current.field_type in (FieldType.PAYLOAD, FieldType.UNKNOWN, FieldType.CONSTANT)
+             ):
+                 # Merge fields
+                 current = Field(
+                     name=current.name,
+                     offset=current.offset,
+                     length=current.length + next_field.length,
+                     field_type=current.field_type,
+                     constant=current.constant and next_field.constant,
+                     values=current.values,
+                     statistics=current.statistics,
+                     metadata=current.metadata,
+                 )
+             else:
+                 # Save current and start new
+                 merged.append(current)
+                 current = next_field
+
+         # Add last field
+         merged.append(current)
+
+         return merged
+
+     def _calculate_confidence(
+         self, messages: list[Message], fields: list[Field], patterns: list[Pattern]
+     ) -> float:
+         """Calculate confidence in structure inference.
+
+         Args:
+             messages: Extracted messages.
+             fields: Inferred fields.
+             patterns: Detected patterns.
+
+         Returns:
+             Confidence score (0.0 to 1.0).
+         """
+         if not messages:
+             return 0.0
+
+         confidence = 0.0
+
+         # Factor 1: Message length consistency (up to 0.4)
+         lengths = [msg.length for msg in messages]
+         if len(lengths) > 1:
+             length_std = float(np.std(lengths))
+             length_mean = float(np.mean(lengths))
+             if length_mean > 0:
+                 cv = length_std / length_mean  # Coefficient of variation
+                 length_confidence = float(max(0.0, 1.0 - cv))  # Lower CV = higher confidence
+                 confidence += length_confidence * 0.4
+
+         # Factor 2: Number of fields detected (up to 0.3)
+         if fields:
+             # More fields = better structure understanding
+             field_confidence = min(1.0, len(fields) / 10.0)
+             confidence += field_confidence * 0.3
+
+         # Factor 3: Pattern support (up to 0.3)
+         if patterns:
+             # Having regular patterns increases confidence
+             regular_patterns = sum(1 for p in patterns if p.regular)
+             pattern_confidence = min(1.0, regular_patterns / 3.0)
+             confidence += pattern_confidence * 0.3
+
+         return min(1.0, confidence)
+
+
+ __all__ = ["StructureInferencer"]
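A minimal standalone sketch of the field-boundary heuristic added above (not code shipped in the wheel): per-byte-position Shannon entropy over aligned messages, sliding-window smoothing, then a mean-plus-scaled-std threshold on the entropy differences, as in _compute_byte_entropy and _detect_field_boundaries. The synthetic message layout and the helper name byte_entropy are assumptions made for this example only.

import numpy as np

def byte_entropy(values: np.ndarray) -> float:
    # Shannon entropy of one byte column, in bits (0-8), mirroring _compute_byte_entropy.
    counts = np.bincount(values, minlength=256)
    probs = counts[counts > 0] / len(values)
    return float(-np.sum(probs * np.log2(probs)))

# Synthetic aligned messages: constant 4-byte header, 1-byte counter, random payload.
rng = np.random.default_rng(0)
n_messages, msg_length = 50, 24
aligned = np.zeros((n_messages, msg_length), dtype=np.uint8)
aligned[:, :4] = [0xAA, 0xBB, 0x00, 0x01]
aligned[:, 4] = np.arange(n_messages, dtype=np.uint8)
aligned[:, 5:] = rng.integers(0, 256, (n_messages, msg_length - 5), dtype=np.uint8)

# Per-position entropy, smoothed with the default window of 8 bytes.
entropies = np.array([byte_entropy(aligned[:, pos]) for pos in range(msg_length)])
window = 8
smoothed = np.convolve(entropies, np.ones(window) / window, mode="valid")

# Boundary wherever the smoothed entropy jumps by more than mean + 0.3 * std.
diff = np.abs(np.diff(smoothed))
threshold = diff.mean() + 0.3 * diff.std()
boundaries = [0]
for i, d in enumerate(diff):
    pos = i + window // 2
    if d > threshold and pos > boundaries[-1] + 1:
        boundaries.append(pos)
print(boundaries)  # boundaries cluster around offsets 4-6, where the constant header ends

On real captures, the same thresholding is what produces the Field boundaries reported through StructureResult.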
oscura/analyzers/binary/export/__init__.py
@@ -0,0 +1,9 @@
+ """Export tools for binary analysis."""
+
+ from __future__ import annotations
+
+ from oscura.analyzers.binary.export.dissector import DissectorGenerator
+
+ __all__ = [
+     "DissectorGenerator",
+ ]