oscura-0.11.0-py3-none-any.whl → oscura-0.12.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. oscura/__init__.py +1 -1
  2. oscura/analyzers/binary/__init__.py +36 -0
  3. oscura/analyzers/binary/core/__init__.py +29 -0
  4. oscura/analyzers/binary/core/file_access.py +193 -0
  5. oscura/analyzers/binary/core/pipeline.py +161 -0
  6. oscura/analyzers/binary/core/results.py +217 -0
  7. oscura/analyzers/binary/detection/__init__.py +10 -0
  8. oscura/analyzers/binary/detection/encoding.py +624 -0
  9. oscura/analyzers/binary/detection/patterns.py +320 -0
  10. oscura/analyzers/binary/detection/structure.py +630 -0
  11. oscura/analyzers/binary/export/__init__.py +9 -0
  12. oscura/analyzers/binary/export/dissector.py +174 -0
  13. oscura/analyzers/binary/inference/__init__.py +15 -0
  14. oscura/analyzers/binary/inference/checksums.py +214 -0
  15. oscura/analyzers/binary/inference/fields.py +150 -0
  16. oscura/analyzers/binary/inference/sequences.py +232 -0
  17. oscura/analyzers/binary/inference/timestamps.py +210 -0
  18. oscura/analyzers/binary/visualization/__init__.py +9 -0
  19. oscura/analyzers/binary/visualization/structure_view.py +182 -0
  20. oscura/automotive/__init__.py +1 -1
  21. oscura/automotive/dtc/data.json +102 -17
  22. oscura/core/schemas/device_mapping.json +8 -2
  23. oscura/core/schemas/packet_format.json +24 -4
  24. oscura/core/schemas/protocol_definition.json +12 -2
  25. oscura/loaders/__init__.py +4 -1
  26. oscura/loaders/binary.py +284 -1
  27. oscura/sessions/legacy.py +80 -19
  28. {oscura-0.11.0.dist-info → oscura-0.12.0.dist-info}/METADATA +3 -3
  29. {oscura-0.11.0.dist-info → oscura-0.12.0.dist-info}/RECORD +32 -14
  30. {oscura-0.11.0.dist-info → oscura-0.12.0.dist-info}/WHEEL +0 -0
  31. {oscura-0.11.0.dist-info → oscura-0.12.0.dist-info}/entry_points.txt +0 -0
  32. {oscura-0.11.0.dist-info → oscura-0.12.0.dist-info}/licenses/LICENSE +0 -0
oscura/analyzers/binary/detection/patterns.py
@@ -0,0 +1,320 @@
+"""Efficient pattern mining for repeating byte sequences in binary files.
+
+This module implements brute-force pattern detection for finding
+repeating byte sequences in binary files of any size. Designed for
+high-performance analysis of multi-GB files with minimal memory footprint.
+
+Example:
+    >>> from oscura.analyzers.binary.detection.patterns import PatternMiner
+    >>> from oscura.analyzers.binary.core.file_access import BinaryFile
+    >>> bf = BinaryFile("large_file.bin")
+    >>> miner = PatternMiner()
+    >>> patterns = miner.find_patterns(bf, min_length=4, max_length=64)
+    >>> for pattern in patterns:
+    ...     print(f"Found {len(pattern.bytes)}-byte pattern {pattern.count} times")
+    ...     print(f"  Role: {pattern.role.value}")
+"""
+
+from __future__ import annotations
+
+from collections import defaultdict
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+from oscura.analyzers.binary.core.results import Pattern, PatternRole
+
+if TYPE_CHECKING:
+    from oscura.analyzers.binary.core.file_access import BinaryFile
+
+# File size threshold for small files (read entire file vs sampling strategy)
+SMALL_FILE_THRESHOLD = 1_000_000  # 1MB
+
+# Sample size for pattern detection in large files
+PATTERN_SAMPLE_SIZE = 1_000_000  # 1MB per sample location
+
+# Number of sample locations to use for large files
+PATTERN_N_SAMPLES = 5
+
+# Maximum positions stored per pattern to prevent unbounded memory growth
+MAX_POSITIONS_PER_PATTERN = 10_000
+
+# Regular spacing detection threshold (coefficient of variation)
+REGULAR_SPACING_CV_THRESHOLD = 0.1  # 10% variation allowed
+
+# Minimum spacing for delimiter classification (bytes)
+MIN_DELIMITER_SPACING = 1000
+
+# Minimum message spacing for fixed-length detection (bytes)
+MIN_MESSAGE_SPACING = 100
+
+# File position threshold for footer detection (fraction of file size)
+FOOTER_POSITION_THRESHOLD = 0.95  # Last 5% of file
+
+# Pattern deduplication overlap threshold (fraction)
+PATTERN_OVERLAP_THRESHOLD = 0.9  # 90% position overlap
+
+
+class PatternMiner:
+    """Efficient pattern mining for repeating byte sequences.
+
+    Uses brute-force substring matching for pattern detection in large
+    binary files. Samples multiple file locations to avoid scanning
+    entire multi-GB files while maintaining high detection accuracy.
+
+    Example:
+        >>> miner = PatternMiner()
+        >>> bf = BinaryFile("data.bin")
+        >>> patterns = miner.find_patterns(bf, min_length=2, max_length=64)
+        >>> print(f"Found {len(patterns)} repeating patterns")
+    """
+
+    def __init__(self) -> None:
+        """Initialize pattern miner."""
+        self._hash_base = 257  # Prime number for hashing
+        self._hash_mod = 2**31 - 1  # Large prime for hash modulus
+
+    def find_patterns(
+        self,
+        file: BinaryFile,
+        min_length: int = 2,
+        max_length: int = 64,
+        min_occurrences: int = 3,
+    ) -> list[Pattern]:
+        """Find repeating byte patterns in binary file.
+
+        Scans file for repeating byte sequences using brute-force
+        substring matching. Samples multiple file locations for large
+        files to maintain performance.
+
+        Args:
+            file: Binary file to analyze.
+            min_length: Minimum pattern length in bytes (default: 2).
+            max_length: Maximum pattern length in bytes (default: 64).
+            min_occurrences: Minimum pattern occurrences to report (default: 3).
+
+        Returns:
+            List of Pattern objects sorted by occurrence count (descending).
+
+        Example:
+            >>> bf = BinaryFile("data.bin")
+            >>> miner = PatternMiner()
+            >>> patterns = miner.find_patterns(bf, min_length=4, min_occurrences=5)
+            >>> for p in patterns:
+            ...     print(f"{p.bytes.hex()}: {p.count} occurrences")
+        """
+        if min_length < 1:
+            raise ValueError(f"min_length must be >= 1, got {min_length}")
+        if max_length < min_length:
+            raise ValueError(f"max_length ({max_length}) must be >= min_length ({min_length})")
+        if min_occurrences < 1:
+            raise ValueError(f"min_occurrences must be >= 1, got {min_occurrences}")
+
+        # Handle empty files
+        if file.size == 0:
+            return []
+
+        # Sample multiple file locations for large files
+        sample_data = self._sample_file(file)
+
+        # Find patterns of each length
+        all_patterns: list[Pattern] = []
+        for length in range(min_length, min(max_length + 1, len(sample_data) + 1)):
+            length_patterns = self._find_patterns_of_length(sample_data, length, min_occurrences)
+            all_patterns.extend(length_patterns)
+
+        # Deduplicate patterns (remove subpatterns)
+        deduplicated = self._deduplicate_patterns(all_patterns)
+
+        # Classify pattern roles
+        classified = [self._classify_pattern_with_file_info(p, file) for p in deduplicated]
+
+        # Sort by occurrence count (most frequent first)
+        classified.sort(key=lambda p: p.count, reverse=True)
+
+        return classified
+
+    def _sample_file(self, file: BinaryFile) -> bytes:
+        """Sample file data for pattern analysis.
+
+        For large files, samples multiple locations instead of reading
+        entire file to maintain performance.
+
+        Args:
+            file: Binary file to sample.
+
+        Returns:
+            Concatenated sample data from multiple file locations.
+        """
+        # For small files, read entire file
+        if file.size < SMALL_FILE_THRESHOLD:
+            return file.read_bytes(0, file.size)
+
+        # For large files, sample multiple locations
+        n_samples = PATTERN_N_SAMPLES
+        sample_size = PATTERN_SAMPLE_SIZE
+
+        samples = file.sample_locations(n_samples=n_samples, sample_size=sample_size)
+
+        # Concatenate samples
+        return b"".join(samples)
+
+    def _find_patterns_of_length(
+        self, data: bytes, length: int, min_occurrences: int
+    ) -> list[Pattern]:
+        """Find all patterns of specific length in data.
+
+        Uses brute-force substring matching for pattern detection.
+
+        Args:
+            data: Byte data to search.
+            length: Pattern length in bytes.
+            min_occurrences: Minimum occurrences to report.
+
+        Returns:
+            List of Pattern objects for this length.
+        """
+        if len(data) < length:
+            return []
+
+        # Build hash table of patterns
+        pattern_positions: dict[bytes, list[int]] = defaultdict(list)
+
+        # Scan data for patterns
+        for i in range(len(data) - length + 1):
+            pattern_bytes = data[i : i + length]
+            if len(pattern_positions[pattern_bytes]) < MAX_POSITIONS_PER_PATTERN:
+                pattern_positions[pattern_bytes].append(i)
+
+        # Filter by minimum occurrences
+        patterns: list[Pattern] = []
+        for pattern_bytes, positions in pattern_positions.items():
+            count = len(positions)
+            if count >= min_occurrences:
+                # Calculate spacing statistics
+                spacings = np.diff(positions) if len(positions) > 1 else np.array([])
+                avg_spacing = float(np.mean(spacings)) if len(spacings) > 0 else 0.0
+                spacing_variance = float(np.var(spacings)) if len(spacings) > 0 else 0.0
+
+                # Determine if spacing is regular
+                regular = self._is_regular_spacing(spacings)
+
+                # Create pattern with initial role (will classify later)
+                patterns.append(
+                    Pattern(
+                        bytes=pattern_bytes,
+                        positions=positions,
+                        count=count,
+                        avg_spacing=avg_spacing,
+                        spacing_variance=spacing_variance,
+                        regular=regular,
+                        role=PatternRole.UNKNOWN,
+                    )
+                )
+
+        return patterns
+
+    def _is_regular_spacing(self, spacings: np.ndarray[int, np.dtype[np.int64]]) -> bool:
+        """Determine if pattern spacing is regular.
+
+        Args:
+            spacings: Array of spacing values between pattern occurrences.
+
+        Returns:
+            True if spacing is regular (coefficient of variation < 0.1).
+        """
+        if len(spacings) < 2:
+            return False
+
+        mean_spacing = float(np.mean(spacings))
+        if mean_spacing == 0:
+            return False
+
+        std_spacing = float(np.std(spacings))
+        coefficient_of_variation = std_spacing / mean_spacing
+
+        # Consider regular if CV below threshold
+        return coefficient_of_variation < REGULAR_SPACING_CV_THRESHOLD
+
+    def _classify_pattern_with_file_info(self, pattern: Pattern, file: BinaryFile) -> Pattern:
+        """Classify pattern role using file context.
+
+        Args:
+            pattern: Pattern to classify.
+            file: Binary file for context.
+
+        Returns:
+            Pattern with updated role classification.
+        """
+        if not pattern.positions:
+            return pattern
+
+        first_position = pattern.positions[0]
+
+        # Header: at file start (position 0)
+        if first_position == 0:
+            pattern.role = PatternRole.HEADER
+            return pattern
+
+        # Footer: at file end
+        last_position = pattern.positions[-1]
+        if last_position > FOOTER_POSITION_THRESHOLD * file.size:
+            pattern.role = PatternRole.FOOTER
+            return pattern
+
+        # Delimiter: regular spacing + large gaps
+        if pattern.regular and pattern.avg_spacing > MIN_DELIMITER_SPACING:
+            pattern.role = PatternRole.DELIMITER
+            return pattern
+
+        # Sync marker: high frequency + short pattern
+        if len(pattern.bytes) <= 4 and pattern.count > 10:
+            pattern.role = PatternRole.SYNC_MARKER
+            return pattern
+
+        # Default to unknown
+        pattern.role = PatternRole.UNKNOWN
+        return pattern
+
+    def _deduplicate_patterns(self, patterns: list[Pattern]) -> list[Pattern]:
+        """Remove duplicate and subpatterns from pattern list.
+
+        Keeps longer patterns when a pattern is a substring of another
+        with the same positions.
+
+        Args:
+            patterns: List of patterns to deduplicate.
+
+        Returns:
+            Deduplicated list of patterns.
+        """
+        if not patterns:
+            return []
+
+        # Sort by length (longest first) and count (most frequent first)
+        sorted_patterns = sorted(patterns, key=lambda p: (len(p.bytes), p.count), reverse=True)
+
+        # Keep track of deduplicated patterns
+        deduplicated: list[Pattern] = []
+        seen_position_sets: list[set[int]] = []
+
+        for pattern in sorted_patterns:
+            position_set = set(pattern.positions)
+
+            # Check if this pattern is a duplicate (same positions)
+            is_duplicate = False
+            for seen_set in seen_position_sets:
+                # If position sets are very similar, consider duplicate
+                overlap = len(position_set & seen_set) / max(len(position_set), len(seen_set))
+                if overlap > PATTERN_OVERLAP_THRESHOLD:
+                    is_duplicate = True
+                    break
+
+            if not is_duplicate:
+                deduplicated.append(pattern)
+                seen_position_sets.append(position_set)
+
+        return deduplicated
+
+
+__all__ = ["PatternMiner"]