oscura 0.11.0__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
- oscura/__init__.py +1 -1
- oscura/analyzers/binary/__init__.py +36 -0
- oscura/analyzers/binary/core/__init__.py +29 -0
- oscura/analyzers/binary/core/file_access.py +193 -0
- oscura/analyzers/binary/core/pipeline.py +161 -0
- oscura/analyzers/binary/core/results.py +217 -0
- oscura/analyzers/binary/detection/__init__.py +10 -0
- oscura/analyzers/binary/detection/encoding.py +624 -0
- oscura/analyzers/binary/detection/patterns.py +320 -0
- oscura/analyzers/binary/detection/structure.py +630 -0
- oscura/analyzers/binary/export/__init__.py +9 -0
- oscura/analyzers/binary/export/dissector.py +174 -0
- oscura/analyzers/binary/inference/__init__.py +15 -0
- oscura/analyzers/binary/inference/checksums.py +214 -0
- oscura/analyzers/binary/inference/fields.py +150 -0
- oscura/analyzers/binary/inference/sequences.py +232 -0
- oscura/analyzers/binary/inference/timestamps.py +210 -0
- oscura/analyzers/binary/visualization/__init__.py +9 -0
- oscura/analyzers/binary/visualization/structure_view.py +182 -0
- oscura/automotive/__init__.py +1 -1
- oscura/automotive/dtc/data.json +102 -17
- oscura/core/schemas/device_mapping.json +8 -2
- oscura/core/schemas/packet_format.json +24 -4
- oscura/core/schemas/protocol_definition.json +12 -2
- oscura/loaders/__init__.py +4 -1
- oscura/loaders/binary.py +284 -1
- oscura/sessions/legacy.py +80 -19
- {oscura-0.11.0.dist-info → oscura-0.12.0.dist-info}/METADATA +3 -3
- {oscura-0.11.0.dist-info → oscura-0.12.0.dist-info}/RECORD +32 -14
- {oscura-0.11.0.dist-info → oscura-0.12.0.dist-info}/WHEEL +0 -0
- {oscura-0.11.0.dist-info → oscura-0.12.0.dist-info}/entry_points.txt +0 -0
- {oscura-0.11.0.dist-info → oscura-0.12.0.dist-info}/licenses/LICENSE +0 -0
oscura/analyzers/binary/detection/encoding.py (new file)
@@ -0,0 +1,624 @@
"""Enhanced encoding detection with multi-stage validation.

Provides robust encoding detection for binary files using:
- Statistical analysis (entropy, IEEE 754 validation, value ranges)
- Validation by loading and checking actual data
- Consensus detection from multiple file locations
- Prevents common misdetections (e.g., uint16 → float32)
"""

from __future__ import annotations

import struct
import warnings
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, TypedDict

import numpy as np
from numpy.typing import NDArray

from oscura.analyzers.binary.core.results import EncodingResult

if TYPE_CHECKING:
    from oscura.analyzers.binary.core.file_access import BinaryFile

# Number of float values to validate for IEEE 754 compliance
ENCODING_VALIDATION_COUNT = 100

# Minimum valid float ratio for IEEE 754 validation
MIN_VALID_FLOAT_RATIO = 0.8  # 80% of values must be valid


class DTypeInfo(TypedDict):
    """Data type information."""

    itemsize: int
    min: float
    max: float
    signed: bool
    float: bool


# Supported data types with their properties
SUPPORTED_DTYPES: dict[str, DTypeInfo] = {
    "uint8": {"itemsize": 1, "min": 0, "max": 255, "signed": False, "float": False},
    "int8": {"itemsize": 1, "min": -128, "max": 127, "signed": True, "float": False},
    "uint16": {"itemsize": 2, "min": 0, "max": 65535, "signed": False, "float": False},
    "int16": {"itemsize": 2, "min": -32768, "max": 32767, "signed": True, "float": False},
    "uint32": {"itemsize": 4, "min": 0, "max": 4294967295, "signed": False, "float": False},
    "int32": {
        "itemsize": 4,
        "min": -2147483648,
        "max": 2147483647,
        "signed": True,
        "float": False,
    },
    "float32": {"itemsize": 4, "min": -3.4e38, "max": 3.4e38, "signed": True, "float": True},
    "float64": {"itemsize": 8, "min": -1.7e308, "max": 1.7e308, "signed": True, "float": True},
}


@dataclass
class ValidationResult:
    """Result of encoding validation."""

    passed: bool
    issues: list[str] = field(default_factory=list)
    statistics: dict[str, float] = field(default_factory=dict)
    sample_data: NDArray[np.float64] = field(default_factory=lambda: np.array([]))


class EncodingDetector:
    """Multi-stage encoding detector with validation.

    Detects binary encoding using statistical analysis and validation.
    Prevents false positives by actually loading data and checking results.

    Example:
        >>> detector = EncodingDetector()
        >>> with BinaryFile("data.bin") as bf:
        ...     result = detector.detect(bf, validation=True)
        ...     print(f"Detected: {result.dtype} ({result.confidence:.1%})")
    """

    def __init__(
        self,
        min_std_threshold: float = 0.01,
        max_constant_ratio: float = 0.95,
        min_confidence: float = 0.5,
    ):
        """Initialize encoding detector.

        Args:
            min_std_threshold: Minimum standard deviation to avoid constant data.
            max_constant_ratio: Maximum ratio of constant values allowed.
            min_confidence: Minimum confidence to return a detection.
        """
        self.min_std_threshold = min_std_threshold
        self.max_constant_ratio = max_constant_ratio
        self.min_confidence = min_confidence

    def detect(
        self,
        file: BinaryFile,
        validation: bool = True,
        n_samples: int = 5,
        max_samples: int = 1000,
    ) -> EncodingResult:
        """Detect encoding with optional validation.

        Args:
            file: Binary file to analyze.
            validation: Whether to validate detection by loading data.
            n_samples: Number of file locations to sample.
            max_samples: Maximum number of data samples to load for validation.

        Returns:
            EncodingResult with detected dtype and confidence.

        Example:
            >>> detector = EncodingDetector()
            >>> with BinaryFile("data.bin") as bf:
            ...     result = detector.detect(bf)
            ...     if result.validation_passed:
            ...         print(f"Confident detection: {result.dtype}")
        """
        # Sample data from multiple locations
        samples = file.sample_locations(n_samples=n_samples, sample_size=8192)

        # Stage 1: Statistical detection
        candidates = self._statistical_detection(samples)

        if not candidates:
            # Fallback to uint8 with low confidence
            sample_data = self._load_as_dtype(file, "uint8", max_samples=max_samples)
            return EncodingResult(
                dtype="uint8",
                confidence=0.3,
                alternatives=[],
                validation_passed=False,
                sample_data=sample_data,
                statistics=self._compute_statistics(sample_data),
                issues=["No clear encoding detected, defaulting to uint8"],
            )

        # Stage 2: Consensus detection from multiple locations
        if validation and len(samples) > 1:
            candidates = self._consensus_detection(file, candidates, n_samples=n_samples)

        # Stage 3: Validation by loading and checking data
        best_dtype, best_confidence = candidates[0]
        validation_result = ValidationResult(passed=False)

        if validation:
            validation_result = self._validate_encoding(file, best_dtype, max_samples=max_samples)

            # If validation failed, try alternatives
            if not validation_result.passed and len(candidates) > 1:
                for dtype, conf in candidates[1:]:
                    alt_validation = self._validate_encoding(file, dtype, max_samples=max_samples)
                    if alt_validation.passed:
                        validation_result = alt_validation
                        best_dtype = dtype
                        best_confidence = conf
                        break

        # Use validated sample data if available
        sample_data = (
            validation_result.sample_data
            if validation_result.sample_data.size > 0
            else self._load_as_dtype(file, best_dtype, max_samples=max_samples)
        )

        statistics = (
            validation_result.statistics
            if validation_result.statistics
            else self._compute_statistics(sample_data)
        )

        return EncodingResult(
            dtype=best_dtype,
            confidence=best_confidence if validation_result.passed else best_confidence * 0.7,
            alternatives=[(dt, conf) for dt, conf in candidates[1:6]],
            validation_passed=validation_result.passed,
            sample_data=sample_data,
            statistics=statistics,
            issues=validation_result.issues if not validation_result.passed else [],
        )

    def _statistical_detection(self, samples: list[bytes]) -> list[tuple[str, float]]:
        """Detect encoding using statistical analysis.

        Args:
            samples: List of byte samples from file.

        Returns:
            List of (dtype, confidence) tuples, sorted by confidence descending.
        """
        scores: dict[str, float] = {}

        for dtype_name in SUPPORTED_DTYPES:
            score = self._score_dtype(samples, dtype_name)
            if score > 0:
                scores[dtype_name] = score

        # Sort by score descending
        candidates = sorted(scores.items(), key=lambda x: x[1], reverse=True)

        return candidates

    def _score_dtype(self, samples: list[bytes], dtype: str) -> float:
        """Score how well a dtype fits the data.

        Args:
            samples: List of byte samples.
            dtype: Data type to score.

        Returns:
            Score from 0.0 to 1.0, higher is better.
        """
        dtype_info = SUPPORTED_DTYPES[dtype]
        itemsize = dtype_info["itemsize"]
        is_float = dtype_info["float"]

        total_score = 0.0
        valid_samples = 0

        for sample in samples:
            if len(sample) < itemsize:
                continue

            # Parse sample data
            count = len(sample) // itemsize
            try:
                data = np.frombuffer(sample, dtype=dtype, count=count)
            except (ValueError, TypeError):
                continue

            if len(data) == 0:
                continue

            valid_samples += 1
            sample_score = 0.0

            # Check 1: Entropy (should be reasonable)
            entropy = self._compute_entropy(sample)
            if is_float:
                # Floats should have higher entropy
                if entropy > 4.0:
                    sample_score += 0.3
            else:
                # Integers should have moderate entropy
                if 1.0 < entropy < 7.0:
                    sample_score += 0.2

            # Check 2: Value range makes sense
            # Suppress warnings for invalid casts (expected when testing encodings)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", RuntimeWarning)
                if is_float:
                    # No conversion needed for floats
                    data_min = float(np.min(data))
                    data_max = float(np.max(data))
                else:
                    # Convert integers to float64 for range comparison
                    data_float = data.astype(np.float64)
                    data_min = float(np.min(data_float))
                    data_max = float(np.max(data_float))

            if is_float:
                # For floats, check IEEE 754 validity
                if self._is_valid_ieee754(sample, itemsize):
                    sample_score += 0.4
                # Check if values are in reasonable range
                if data_min > -1e6 and data_max < 1e6:
                    sample_score += 0.2
            else:
                # For integers, check range
                type_min = dtype_info["min"]
                type_max = dtype_info["max"]
                range_score = self._score_integer_range(data_min, data_max, type_min, type_max)
                sample_score += range_score * 0.5

                # Bonus: Prefer types that make better use of the value range
                # If data uses more of the type's range, that's a better fit
                data_range = data_max - data_min
                type_range = type_max - type_min
                if type_range > 0:
                    utilization = data_range / type_range
                    # Reward types with 1%-50% utilization (good fit)
                    if 0.01 <= utilization <= 0.5:
                        sample_score += 0.2
                    # Small reward for reasonable utilization
                    elif 0.001 <= utilization < 0.01:
                        sample_score += 0.1

            # Check 3: Not all constant
            # Suppress overflow warnings when testing invalid dtype interpretations
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", RuntimeWarning)
                if is_float:
                    std = float(np.std(data))
                else:
                    std = float(np.std(data_float))
            if std > self.min_std_threshold:
                sample_score += 0.2

            # Check 4: Alignment check (data should align well)
            if self._check_alignment(sample, itemsize):
                sample_score += 0.1

            total_score += sample_score

        if valid_samples == 0:
            return 0.0

        # Normalize to 0.0-1.0 range
        avg_score = total_score / valid_samples
        return min(avg_score, 1.0)

    def _compute_entropy(self, data: bytes) -> float:
        """Compute Shannon entropy of byte data.

        Args:
            data: Byte sequence.

        Returns:
            Entropy in bits (0-8 for bytes).
        """
        if len(data) == 0:
            return 0.0

        # Count byte frequencies
        counts = np.bincount(np.frombuffer(data, dtype=np.uint8), minlength=256)
        probabilities = counts[counts > 0] / len(data)

        # Shannon entropy
        entropy = -np.sum(probabilities * np.log2(probabilities))
        return float(entropy)

    def _is_valid_ieee754(self, data: bytes, itemsize: int) -> bool:
        """Check if data looks like valid IEEE 754 floats.

        Args:
            data: Byte data to check.
            itemsize: 4 for float32, 8 for float64.

        Returns:
            True if data appears to be valid IEEE 754.
        """
        if itemsize not in (4, 8):
            return False

        count = min(len(data) // itemsize, ENCODING_VALIDATION_COUNT)
        if count == 0:
            return False

        fmt = f"{'f' if itemsize == 4 else 'd'}"
        valid_count = 0

        for i in range(count):
            try:
                offset = i * itemsize
                value = struct.unpack(fmt, data[offset : offset + itemsize])[0]

                # Check for NaN, Inf
                if np.isnan(value) or np.isinf(value):
                    continue

                # Check if value is reasonable
                if abs(value) < 1e30:  # Not extremely large
                    valid_count += 1

            except (struct.error, ValueError):
                continue

        # Validate minimum ratio of valid floats
        return valid_count / count > MIN_VALID_FLOAT_RATIO

    def _score_integer_range(
        self, data_min: float, data_max: float, type_min: float, type_max: float
    ) -> float:
        """Score how well data fits integer type range.

        Args:
            data_min: Minimum value in data.
            data_max: Maximum value in data.
            type_min: Minimum value for type.
            type_max: Maximum value for type.

        Returns:
            Score from 0.0 to 1.0.
        """
        # Data must fit in type range
        if data_min < type_min or data_max > type_max:
            return 0.0

        # Score based on range utilization
        data_range = data_max - data_min
        type_range = type_max - type_min

        if type_range == 0:
            return 0.0

        utilization = data_range / type_range

        # Prefer types where data uses a reasonable portion of range
        # but not types that are way too large
        if utilization > 0.01:  # Uses at least 1% of range
            return 1.0
        elif utilization > 0.001:  # Uses at least 0.1%
            return 0.7
        else:
            return 0.3  # Very small utilization, might be oversized type

    def _check_alignment(self, data: bytes, itemsize: int) -> bool:
        """Check if data aligns well with item size.

        Args:
            data: Byte data.
            itemsize: Size of data type in bytes.

        Returns:
            True if data aligns well.
        """
        # File size should be multiple of itemsize (or close)
        remainder = len(data) % itemsize
        return remainder < itemsize * 0.1  # Within 10%

    def _consensus_detection(
        self, file: BinaryFile, candidates: list[tuple[str, float]], n_samples: int
    ) -> list[tuple[str, float]]:
        """Refine detection using consensus from multiple locations.

        Args:
            file: Binary file being analyzed.
            candidates: Initial candidate list.
            n_samples: Number of samples to check.

        Returns:
            Refined candidate list with adjusted confidences.
        """
        # Sample additional locations
        samples = file.sample_locations(n_samples=n_samples, sample_size=4096)

        # Score each candidate on all samples
        consensus_scores: dict[str, list[float]] = {dt: [] for dt, _ in candidates[:5]}

        for sample in samples:
            for dtype, scores_list in consensus_scores.items():
                score = self._score_dtype([sample], dtype)
                scores_list.append(score)

        # Compute average and consistency
        refined_candidates: list[tuple[str, float]] = []
        for dtype, scores in consensus_scores.items():
            if not scores:
                continue
            # Suppress overflow warnings when computing statistics
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", RuntimeWarning)
                avg_score = np.mean(scores)
                consistency = 1.0 - np.std(scores)  # Lower std = more consistent

            # Combine average score with consistency
            final_score = avg_score * 0.7 + consistency * 0.3
            refined_candidates.append((dtype, float(final_score)))

        # Sort by final score
        refined_candidates.sort(key=lambda x: x[1], reverse=True)

        return refined_candidates

    def _validate_encoding(
        self, file: BinaryFile, dtype: str, max_samples: int = 1000
    ) -> ValidationResult:
        """Validate encoding by loading and checking data.

        Args:
            file: Binary file to validate.
            dtype: Data type to validate.
            max_samples: Maximum number of data samples to load.

        Returns:
            ValidationResult with validation status and issues.
        """
        issues: list[str] = []

        # Load sample data
        try:
            sample_data = self._load_as_dtype(file, dtype, max_samples=max_samples)
        except (ValueError, TypeError) as e:
            return ValidationResult(passed=False, issues=[f"Failed to load as {dtype}: {e!s}"])

        if len(sample_data) == 0:
            return ValidationResult(passed=False, issues=["No data could be loaded"])

        # Convert to float for analysis
        # Suppress overflow warnings when testing invalid dtype interpretations
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            data_float = sample_data.astype(np.float64)

        # Check 1: Not all constant
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            std = float(np.std(data_float))
        if std < self.min_std_threshold:
            issues.append(f"Data appears constant (std={std:.2e})")

        # Check 2: No NaN/Inf for float types
        dtype_info = SUPPORTED_DTYPES[dtype]
        if dtype_info["float"]:
            nan_count = int(np.sum(np.isnan(data_float)))
            inf_count = int(np.sum(np.isinf(data_float)))

            if nan_count > len(data_float) * 0.1:  # More than 10% NaN
                issues.append(f"Too many NaN values: {nan_count}/{len(data_float)}")

            if inf_count > len(data_float) * 0.1:  # More than 10% Inf
                issues.append(f"Too many Inf values: {inf_count}/{len(data_float)}")

        # Check 3: Values in reasonable range
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            data_min = float(np.min(data_float))
            data_max = float(np.max(data_float))
        type_min = dtype_info["min"]
        type_max = dtype_info["max"]

        if dtype_info["float"]:
            # For floats, check reasonable range
            if abs(data_min) > 1e20 or abs(data_max) > 1e20:
                issues.append(f"Values extremely large: [{data_min:.2e}, {data_max:.2e}]")
        else:
            # For integers, must fit in type
            if data_min < type_min or data_max > type_max:
                issues.append(
                    f"Values outside type range: [{data_min}, {data_max}] "
                    f"not in [{type_min}, {type_max}]"
                )

        # Check 4: Reasonable entropy
        sample_bytes = file.read_bytes(0, min(8192, file.size))
        entropy = self._compute_entropy(sample_bytes)

        if entropy < 0.5:
            issues.append(f"Very low entropy: {entropy:.2f} bits")

        # Check 5: Not too many constant values
        unique_count = len(np.unique(sample_data))
        constant_ratio = 1.0 - (unique_count / len(sample_data))

        if constant_ratio > self.max_constant_ratio:
            issues.append(f"Too many repeated values: {constant_ratio:.1%} repetition")

        # Compute statistics
        statistics = self._compute_statistics(data_float)

        # Validation passes if no critical issues
        passed = len(issues) == 0

        return ValidationResult(
            passed=passed,
            issues=issues,
            statistics=statistics,
            sample_data=data_float,
        )

    def _load_as_dtype(
        self, file: BinaryFile, dtype: str, max_samples: int = 1000
    ) -> NDArray[np.float64]:
        """Load sample data as specified dtype.

        Args:
            file: Binary file to load from.
            dtype: Data type to use.
            max_samples: Maximum number of samples to load.

        Returns:
            Sample data as float64 array.
        """
        dtype_info = SUPPORTED_DTYPES[dtype]
        itemsize = dtype_info["itemsize"]

        # Calculate number of samples to load
        max_elements = file.size // itemsize
        count = min(max_samples, max_elements)

        if count <= 0:
            return np.array([], dtype=np.float64)

        # Load data
        data = file.read_array(offset=0, count=count, dtype=dtype)

        # Convert to float64
        # Suppress overflow warnings when converting invalid dtype interpretations
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            return data.astype(np.float64)

    def _compute_statistics(self, data: NDArray[np.float64]) -> dict[str, float]:
        """Compute statistics for sample data.

        Args:
            data: Sample data array.

        Returns:
            Dictionary of statistics.
        """
        if len(data) == 0:
            return {}

        # Suppress overflow warnings when computing statistics on invalid dtype interpretations
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            return {
                "mean": float(np.mean(data)),
                "std": float(np.std(data)),
                "min": float(np.min(data)),
                "max": float(np.max(data)),
                "median": float(np.median(data)),
                "unique_count": int(len(np.unique(data))),
                "total_count": int(len(data)),
            }
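
For readers who want to see the new detection pipeline end to end, here is a minimal usage sketch. It assumes only what the diff above shows: the EncodingDetector/BinaryFile call pattern from the class docstring and the EncodingResult fields populated in detect(). The synthetic input file, its path, and the NumPy setup are illustrative and not part of the package.

import numpy as np

from oscura.analyzers.binary.core.file_access import BinaryFile
from oscura.analyzers.binary.detection.encoding import EncodingDetector

# Illustrative input: 10,000 uint16 samples of a noisy sine wave written to a scratch file.
rng = np.random.default_rng(seed=0)
wave = 2048 + 500 * np.sin(np.linspace(0.0, 20.0, 10_000)) + rng.normal(0.0, 5.0, 10_000)
wave.astype(np.uint16).tofile("synthetic_uint16.bin")  # hypothetical scratch path

detector = EncodingDetector()
with BinaryFile("synthetic_uint16.bin") as bf:
    # validation=True runs all three stages: statistical scoring, consensus, load-and-check.
    result = detector.detect(bf, validation=True)

print(f"Detected: {result.dtype} ({result.confidence:.1%}), validated: {result.validation_passed}")
for alt_dtype, alt_conf in result.alternatives:
    print(f"  alternative: {alt_dtype} ({alt_conf:.1%})")

Note that detect() scales the reported confidence by 0.7 whenever validation fails, so result.validation_passed is the simplest signal to branch on when deciding whether to trust the detected dtype.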