oscura-0.10.0-py3-none-any.whl → oscura-0.12.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oscura/__init__.py +1 -1
- oscura/__main__.py +4 -0
- oscura/analyzers/binary/__init__.py +36 -0
- oscura/analyzers/binary/core/__init__.py +29 -0
- oscura/analyzers/binary/core/file_access.py +193 -0
- oscura/analyzers/binary/core/pipeline.py +161 -0
- oscura/analyzers/binary/core/results.py +217 -0
- oscura/analyzers/binary/detection/__init__.py +10 -0
- oscura/analyzers/binary/detection/encoding.py +624 -0
- oscura/analyzers/binary/detection/patterns.py +320 -0
- oscura/analyzers/binary/detection/structure.py +630 -0
- oscura/analyzers/binary/export/__init__.py +9 -0
- oscura/analyzers/binary/export/dissector.py +174 -0
- oscura/analyzers/binary/inference/__init__.py +15 -0
- oscura/analyzers/binary/inference/checksums.py +214 -0
- oscura/analyzers/binary/inference/fields.py +150 -0
- oscura/analyzers/binary/inference/sequences.py +232 -0
- oscura/analyzers/binary/inference/timestamps.py +210 -0
- oscura/analyzers/binary/visualization/__init__.py +9 -0
- oscura/analyzers/binary/visualization/structure_view.py +182 -0
- oscura/analyzers/ml/signal_classifier.py +6 -0
- oscura/analyzers/waveform/spectral.py +18 -11
- oscura/automotive/__init__.py +1 -1
- oscura/automotive/flexray/fibex.py +9 -1
- oscura/loaders/__init__.py +4 -1
- oscura/loaders/binary.py +284 -1
- oscura/loaders/validation.py +17 -10
- oscura/sessions/legacy.py +110 -1
- oscura/workflows/batch/aggregate.py +5 -1
- oscura-0.12.0.dist-info/METADATA +460 -0
- {oscura-0.10.0.dist-info → oscura-0.12.0.dist-info}/RECORD +34 -16
- oscura-0.10.0.dist-info/METADATA +0 -641
- {oscura-0.10.0.dist-info → oscura-0.12.0.dist-info}/WHEEL +0 -0
- {oscura-0.10.0.dist-info → oscura-0.12.0.dist-info}/entry_points.txt +0 -0
- {oscura-0.10.0.dist-info → oscura-0.12.0.dist-info}/licenses/LICENSE +0 -0
oscura/analyzers/binary/detection/patterns.py
@@ -0,0 +1,320 @@
+"""Efficient pattern mining for repeating byte sequences in binary files.
+
+This module implements brute-force pattern detection for finding
+repeating byte sequences in binary files of any size. Designed for
+high-performance analysis of multi-GB files with minimal memory footprint.
+
+Example:
+    >>> from oscura.analyzers.binary.detection.patterns import PatternMiner
+    >>> from oscura.analyzers.binary.core.file_access import BinaryFile
+    >>> bf = BinaryFile("large_file.bin")
+    >>> miner = PatternMiner()
+    >>> patterns = miner.find_patterns(bf, min_length=4, max_length=64)
+    >>> for pattern in patterns:
+    ...     print(f"Found {len(pattern.bytes)}-byte pattern {pattern.count} times")
+    ...     print(f"  Role: {pattern.role.value}")
+"""
+
+from __future__ import annotations
+
+from collections import defaultdict
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+from oscura.analyzers.binary.core.results import Pattern, PatternRole
+
+if TYPE_CHECKING:
+    from oscura.analyzers.binary.core.file_access import BinaryFile
+
+# File size threshold for small files (read entire file vs sampling strategy)
+SMALL_FILE_THRESHOLD = 1_000_000  # 1MB
+
+# Sample size for pattern detection in large files
+PATTERN_SAMPLE_SIZE = 1_000_000  # 1MB per sample location
+
+# Number of sample locations to use for large files
+PATTERN_N_SAMPLES = 5
+
+# Maximum positions stored per pattern to prevent unbounded memory growth
+MAX_POSITIONS_PER_PATTERN = 10_000
+
+# Regular spacing detection threshold (coefficient of variation)
+REGULAR_SPACING_CV_THRESHOLD = 0.1  # 10% variation allowed
+
+# Minimum spacing for delimiter classification (bytes)
+MIN_DELIMITER_SPACING = 1000
+
+# Minimum message spacing for fixed-length detection (bytes)
+MIN_MESSAGE_SPACING = 100
+
+# File position threshold for footer detection (fraction of file size)
+FOOTER_POSITION_THRESHOLD = 0.95  # Last 5% of file
+
+# Pattern deduplication overlap threshold (fraction)
+PATTERN_OVERLAP_THRESHOLD = 0.9  # 90% position overlap
+
+
+class PatternMiner:
+    """Efficient pattern mining for repeating byte sequences.
+
+    Uses brute-force substring matching for pattern detection in large
+    binary files. Samples multiple file locations to avoid scanning
+    entire multi-GB files while maintaining high detection accuracy.
+
+    Example:
+        >>> miner = PatternMiner()
+        >>> bf = BinaryFile("data.bin")
+        >>> patterns = miner.find_patterns(bf, min_length=2, max_length=64)
+        >>> print(f"Found {len(patterns)} repeating patterns")
+    """
+
+    def __init__(self) -> None:
+        """Initialize pattern miner."""
+        self._hash_base = 257  # Prime number for hashing
+        self._hash_mod = 2**31 - 1  # Large prime for hash modulus
+
+    def find_patterns(
+        self,
+        file: BinaryFile,
+        min_length: int = 2,
+        max_length: int = 64,
+        min_occurrences: int = 3,
+    ) -> list[Pattern]:
+        """Find repeating byte patterns in binary file.
+
+        Scans file for repeating byte sequences using brute-force
+        substring matching. Samples multiple file locations for large
+        files to maintain performance.
+
+        Args:
+            file: Binary file to analyze.
+            min_length: Minimum pattern length in bytes (default: 2).
+            max_length: Maximum pattern length in bytes (default: 64).
+            min_occurrences: Minimum pattern occurrences to report (default: 3).
+
+        Returns:
+            List of Pattern objects sorted by occurrence count (descending).
+
+        Example:
+            >>> bf = BinaryFile("data.bin")
+            >>> miner = PatternMiner()
+            >>> patterns = miner.find_patterns(bf, min_length=4, min_occurrences=5)
+            >>> for p in patterns:
+            ...     print(f"{p.bytes.hex()}: {p.count} occurrences")
+        """
+        if min_length < 1:
+            raise ValueError(f"min_length must be >= 1, got {min_length}")
+        if max_length < min_length:
+            raise ValueError(f"max_length ({max_length}) must be >= min_length ({min_length})")
+        if min_occurrences < 1:
+            raise ValueError(f"min_occurrences must be >= 1, got {min_occurrences}")
+
+        # Handle empty files
+        if file.size == 0:
+            return []
+
+        # Sample multiple file locations for large files
+        sample_data = self._sample_file(file)
+
+        # Find patterns of each length
+        all_patterns: list[Pattern] = []
+        for length in range(min_length, min(max_length + 1, len(sample_data) + 1)):
+            length_patterns = self._find_patterns_of_length(sample_data, length, min_occurrences)
+            all_patterns.extend(length_patterns)
+
+        # Deduplicate patterns (remove subpatterns)
+        deduplicated = self._deduplicate_patterns(all_patterns)
+
+        # Classify pattern roles
+        classified = [self._classify_pattern_with_file_info(p, file) for p in deduplicated]
+
+        # Sort by occurrence count (most frequent first)
+        classified.sort(key=lambda p: p.count, reverse=True)
+
+        return classified
+
+    def _sample_file(self, file: BinaryFile) -> bytes:
+        """Sample file data for pattern analysis.
+
+        For large files, samples multiple locations instead of reading
+        entire file to maintain performance.
+
+        Args:
+            file: Binary file to sample.
+
+        Returns:
+            Concatenated sample data from multiple file locations.
+        """
+        # For small files, read entire file
+        if file.size < SMALL_FILE_THRESHOLD:
+            return file.read_bytes(0, file.size)
+
+        # For large files, sample multiple locations
+        n_samples = PATTERN_N_SAMPLES
+        sample_size = PATTERN_SAMPLE_SIZE
+
+        samples = file.sample_locations(n_samples=n_samples, sample_size=sample_size)
+
+        # Concatenate samples
+        return b"".join(samples)
+
+    def _find_patterns_of_length(
+        self, data: bytes, length: int, min_occurrences: int
+    ) -> list[Pattern]:
+        """Find all patterns of specific length in data.
+
+        Uses brute-force substring matching for pattern detection.
+
+        Args:
+            data: Byte data to search.
+            length: Pattern length in bytes.
+            min_occurrences: Minimum occurrences to report.
+
+        Returns:
+            List of Pattern objects for this length.
+        """
+        if len(data) < length:
+            return []
+
+        # Build hash table of patterns
+        pattern_positions: dict[bytes, list[int]] = defaultdict(list)
+
+        # Scan data for patterns
+        for i in range(len(data) - length + 1):
+            pattern_bytes = data[i : i + length]
+            if len(pattern_positions[pattern_bytes]) < MAX_POSITIONS_PER_PATTERN:
+                pattern_positions[pattern_bytes].append(i)
+
+        # Filter by minimum occurrences
+        patterns: list[Pattern] = []
+        for pattern_bytes, positions in pattern_positions.items():
+            count = len(positions)
+            if count >= min_occurrences:
+                # Calculate spacing statistics
+                spacings = np.diff(positions) if len(positions) > 1 else np.array([])
+                avg_spacing = float(np.mean(spacings)) if len(spacings) > 0 else 0.0
+                spacing_variance = float(np.var(spacings)) if len(spacings) > 0 else 0.0
+
+                # Determine if spacing is regular
+                regular = self._is_regular_spacing(spacings)
+
+                # Create pattern with initial role (will classify later)
+                patterns.append(
+                    Pattern(
+                        bytes=pattern_bytes,
+                        positions=positions,
+                        count=count,
+                        avg_spacing=avg_spacing,
+                        spacing_variance=spacing_variance,
+                        regular=regular,
+                        role=PatternRole.UNKNOWN,
+                    )
+                )
+
+        return patterns
+
+    def _is_regular_spacing(self, spacings: np.ndarray[int, np.dtype[np.int64]]) -> bool:
+        """Determine if pattern spacing is regular.
+
+        Args:
+            spacings: Array of spacing values between pattern occurrences.
+
+        Returns:
+            True if spacing is regular (coefficient of variation < 0.1).
+        """
+        if len(spacings) < 2:
+            return False
+
+        mean_spacing = float(np.mean(spacings))
+        if mean_spacing == 0:
+            return False
+
+        std_spacing = float(np.std(spacings))
+        coefficient_of_variation = std_spacing / mean_spacing
+
+        # Consider regular if CV below threshold
+        return coefficient_of_variation < REGULAR_SPACING_CV_THRESHOLD
+
+    def _classify_pattern_with_file_info(self, pattern: Pattern, file: BinaryFile) -> Pattern:
+        """Classify pattern role using file context.
+
+        Args:
+            pattern: Pattern to classify.
+            file: Binary file for context.
+
+        Returns:
+            Pattern with updated role classification.
+        """
+        if not pattern.positions:
+            return pattern
+
+        first_position = pattern.positions[0]
+
+        # Header: at file start (position 0)
+        if first_position == 0:
+            pattern.role = PatternRole.HEADER
+            return pattern
+
+        # Footer: at file end
+        last_position = pattern.positions[-1]
+        if last_position > FOOTER_POSITION_THRESHOLD * file.size:
+            pattern.role = PatternRole.FOOTER
+            return pattern
+
+        # Delimiter: regular spacing + large gaps
+        if pattern.regular and pattern.avg_spacing > MIN_DELIMITER_SPACING:
+            pattern.role = PatternRole.DELIMITER
+            return pattern
+
+        # Sync marker: high frequency + short pattern
+        if len(pattern.bytes) <= 4 and pattern.count > 10:
+            pattern.role = PatternRole.SYNC_MARKER
+            return pattern
+
+        # Default to unknown
+        pattern.role = PatternRole.UNKNOWN
+        return pattern
+
+    def _deduplicate_patterns(self, patterns: list[Pattern]) -> list[Pattern]:
+        """Remove duplicate and subpatterns from pattern list.
+
+        Keeps longer patterns when a pattern is a substring of another
+        with the same positions.
+
+        Args:
+            patterns: List of patterns to deduplicate.
+
+        Returns:
+            Deduplicated list of patterns.
+        """
+        if not patterns:
+            return []
+
+        # Sort by length (longest first) and count (most frequent first)
+        sorted_patterns = sorted(patterns, key=lambda p: (len(p.bytes), p.count), reverse=True)
+
+        # Keep track of deduplicated patterns
+        deduplicated: list[Pattern] = []
+        seen_position_sets: list[set[int]] = []
+
+        for pattern in sorted_patterns:
+            position_set = set(pattern.positions)
+
+            # Check if this pattern is a duplicate (same positions)
+            is_duplicate = False
+            for seen_set in seen_position_sets:
+                # If position sets are very similar, consider duplicate
+                overlap = len(position_set & seen_set) / max(len(position_set), len(seen_set))
+                if overlap > PATTERN_OVERLAP_THRESHOLD:
+                    is_duplicate = True
+                    break
+
+            if not is_duplicate:
+                deduplicated.append(pattern)
+                seen_position_sets.append(position_set)
+
+        return deduplicated
+
+
+__all__ = ["PatternMiner"]
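
A few standalone notes for reviewing this hunk. The mining core is a plain sliding-window scan: every substring of the current candidate length becomes a dict key and its offsets are accumulated, capped at MAX_POSITIONS_PER_PATTERN (10,000) per pattern. A minimal sketch of that scan, using hypothetical data rather than the oscura BinaryFile API:

    >>> from collections import defaultdict
    >>> data = b"\xaa\x55ABC\xaa\x55DEF\xaa\x55GHI"  # made-up capture with a 2-byte sync word
    >>> positions = defaultdict(list)
    >>> length = 2
    >>> for i in range(len(data) - length + 1):
    ...     positions[data[i : i + length]].append(i)
    >>> positions[b"\xaa\x55"]
    [0, 5, 10]

With the default min_occurrences=3 this substring survives filtering; anything seen only once or twice is dropped before Pattern objects are built.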
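
_is_regular_spacing classifies the gap structure by coefficient of variation: the standard deviation of the position deltas divided by their mean, with anything under REGULAR_SPACING_CV_THRESHOLD (0.1) treated as regular. It returns False with fewer than two gaps, i.e. fewer than three occurrences. Worked through on the offsets from the sketch above:

    >>> import numpy as np
    >>> spacings = np.diff([0, 5, 10])  # gaps between occurrences -> array([5, 5])
    >>> cv = float(np.std(spacings) / np.mean(spacings))
    >>> cv
    0.0
    >>> cv < 0.1  # perfectly even spacing counts as regular
    True

Regularity then feeds the role heuristics: a regular pattern with an average gap above MIN_DELIMITER_SPACING (1000 bytes) is classified as DELIMITER, while short (≤ 4 bytes), frequent (> 10 hits) patterns become SYNC_MARKER.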
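
Deduplication prefers longer patterns over their substrings by comparing occurrence-position sets: the overlap measure is the intersection size over the larger set, and anything above PATTERN_OVERLAP_THRESHOLD (0.9) is treated as a duplicate of an already-kept, longer pattern. The same ratio, standalone:

    >>> longer = {0, 5, 10}  # kept first: patterns are sorted longest-first
    >>> prefix = {0, 5, 10}  # its 2-byte prefix matches at the same offsets
    >>> len(longer & prefix) / max(len(longer), len(prefix))
    1.0

Since 1.0 > 0.9, the prefix is discarded; a pattern whose position set overlaps an earlier one by 90% or less is kept as distinct.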