oscura 0.10.0__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. oscura/__init__.py +1 -1
  2. oscura/__main__.py +4 -0
  3. oscura/analyzers/binary/__init__.py +36 -0
  4. oscura/analyzers/binary/core/__init__.py +29 -0
  5. oscura/analyzers/binary/core/file_access.py +193 -0
  6. oscura/analyzers/binary/core/pipeline.py +161 -0
  7. oscura/analyzers/binary/core/results.py +217 -0
  8. oscura/analyzers/binary/detection/__init__.py +10 -0
  9. oscura/analyzers/binary/detection/encoding.py +624 -0
  10. oscura/analyzers/binary/detection/patterns.py +320 -0
  11. oscura/analyzers/binary/detection/structure.py +630 -0
  12. oscura/analyzers/binary/export/__init__.py +9 -0
  13. oscura/analyzers/binary/export/dissector.py +174 -0
  14. oscura/analyzers/binary/inference/__init__.py +15 -0
  15. oscura/analyzers/binary/inference/checksums.py +214 -0
  16. oscura/analyzers/binary/inference/fields.py +150 -0
  17. oscura/analyzers/binary/inference/sequences.py +232 -0
  18. oscura/analyzers/binary/inference/timestamps.py +210 -0
  19. oscura/analyzers/binary/visualization/__init__.py +9 -0
  20. oscura/analyzers/binary/visualization/structure_view.py +182 -0
  21. oscura/analyzers/ml/signal_classifier.py +6 -0
  22. oscura/analyzers/waveform/spectral.py +18 -11
  23. oscura/automotive/__init__.py +1 -1
  24. oscura/automotive/flexray/fibex.py +9 -1
  25. oscura/loaders/__init__.py +4 -1
  26. oscura/loaders/binary.py +284 -1
  27. oscura/loaders/validation.py +17 -10
  28. oscura/sessions/legacy.py +110 -1
  29. oscura/workflows/batch/aggregate.py +5 -1
  30. oscura-0.12.0.dist-info/METADATA +460 -0
  31. {oscura-0.10.0.dist-info → oscura-0.12.0.dist-info}/RECORD +34 -16
  32. oscura-0.10.0.dist-info/METADATA +0 -641
  33. {oscura-0.10.0.dist-info → oscura-0.12.0.dist-info}/WHEEL +0 -0
  34. {oscura-0.10.0.dist-info → oscura-0.12.0.dist-info}/entry_points.txt +0 -0
  35. {oscura-0.10.0.dist-info → oscura-0.12.0.dist-info}/licenses/LICENSE +0 -0
oscura/analyzers/binary/detection/encoding.py
@@ -0,0 +1,624 @@
+ """Enhanced encoding detection with multi-stage validation.
+
+ Provides robust encoding detection for binary files using:
+ - Statistical analysis (entropy, IEEE 754 validation, value ranges)
+ - Validation by loading and checking actual data
+ - Consensus detection from multiple file locations
+ - Prevention of common misdetections (e.g., uint16 misread as float32)
+ """
+
+ from __future__ import annotations
+
+ import struct
+ import warnings
+ from dataclasses import dataclass, field
+ from typing import TYPE_CHECKING, TypedDict
+
+ import numpy as np
+ from numpy.typing import NDArray
+
+ from oscura.analyzers.binary.core.results import EncodingResult
+
+ if TYPE_CHECKING:
+     from oscura.analyzers.binary.core.file_access import BinaryFile
+
+ # Number of float values to validate for IEEE 754 compliance
+ ENCODING_VALIDATION_COUNT = 100
+
+ # Minimum valid float ratio for IEEE 754 validation
+ MIN_VALID_FLOAT_RATIO = 0.8  # 80% of values must be valid
+
+
+ class DTypeInfo(TypedDict):
+     """Data type information."""
+
+     itemsize: int
+     min: float
+     max: float
+     signed: bool
+     float: bool
+
+
+ # Supported data types with their properties
+ SUPPORTED_DTYPES: dict[str, DTypeInfo] = {
+     "uint8": {"itemsize": 1, "min": 0, "max": 255, "signed": False, "float": False},
+     "int8": {"itemsize": 1, "min": -128, "max": 127, "signed": True, "float": False},
+     "uint16": {"itemsize": 2, "min": 0, "max": 65535, "signed": False, "float": False},
+     "int16": {"itemsize": 2, "min": -32768, "max": 32767, "signed": True, "float": False},
+     "uint32": {"itemsize": 4, "min": 0, "max": 4294967295, "signed": False, "float": False},
+     "int32": {
+         "itemsize": 4,
+         "min": -2147483648,
+         "max": 2147483647,
+         "signed": True,
+         "float": False,
+     },
+     "float32": {"itemsize": 4, "min": -3.4e38, "max": 3.4e38, "signed": True, "float": True},
+     "float64": {"itemsize": 8, "min": -1.7e308, "max": 1.7e308, "signed": True, "float": True},
+ }
+
+
+ @dataclass
+ class ValidationResult:
+     """Result of encoding validation."""
+
+     passed: bool
+     issues: list[str] = field(default_factory=list)
+     statistics: dict[str, float] = field(default_factory=dict)
+     sample_data: NDArray[np.float64] = field(default_factory=lambda: np.array([]))
+
+
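For context on the registry above: the integer limits match numpy's own, and the float limits are rounded approximations. A standalone sanity check (illustrative only, not part of the package):

    import numpy as np

    # Integer limits agree with numpy's iinfo.
    for name in ("uint8", "int8", "uint16", "int16", "uint32", "int32"):
        info = np.iinfo(name)
        print(name, info.min, info.max)      # e.g. int16 -32768 32767

    # Float limits in the table are rounded; exact values come from finfo.
    print(np.finfo("float32").max)           # ~3.4028235e+38
    print(np.finfo("float64").max)           # ~1.7976931348623157e+308
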
+ class EncodingDetector:
+     """Multi-stage encoding detector with validation.
+
+     Detects binary encoding using statistical analysis and validation.
+     Prevents false positives by actually loading data and checking results.
+
+     Example:
+         >>> detector = EncodingDetector()
+         >>> with BinaryFile("data.bin") as bf:
+         ...     result = detector.detect(bf, validation=True)
+         ...     print(f"Detected: {result.dtype} ({result.confidence:.1%})")
+     """
+
+     def __init__(
+         self,
+         min_std_threshold: float = 0.01,
+         max_constant_ratio: float = 0.95,
+         min_confidence: float = 0.5,
+     ):
+         """Initialize encoding detector.
+
+         Args:
+             min_std_threshold: Minimum standard deviation to avoid constant data.
+             max_constant_ratio: Maximum ratio of constant values allowed.
+             min_confidence: Minimum confidence to return a detection.
+         """
+         self.min_std_threshold = min_std_threshold
+         self.max_constant_ratio = max_constant_ratio
+         self.min_confidence = min_confidence
+
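As a usage note, the thresholds can be tightened at construction time; the values below are illustrative, not the defaults:

    # Hypothetical stricter configuration: require more variation in the data,
    # tolerate fewer repeated values, and raise the reporting floor.
    detector = EncodingDetector(
        min_std_threshold=0.1,
        max_constant_ratio=0.8,
        min_confidence=0.7,
    )
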
+     def detect(
+         self,
+         file: BinaryFile,
+         validation: bool = True,
+         n_samples: int = 5,
+         max_samples: int = 1000,
+     ) -> EncodingResult:
+         """Detect encoding with optional validation.
+
+         Args:
+             file: Binary file to analyze.
+             validation: Whether to validate detection by loading data.
+             n_samples: Number of file locations to sample.
+             max_samples: Maximum number of data samples to load for validation.
+
+         Returns:
+             EncodingResult with detected dtype and confidence.
+
+         Example:
+             >>> detector = EncodingDetector()
+             >>> with BinaryFile("data.bin") as bf:
+             ...     result = detector.detect(bf)
+             ...     if result.validation_passed:
+             ...         print(f"Confident detection: {result.dtype}")
+         """
+         # Sample data from multiple locations
+         samples = file.sample_locations(n_samples=n_samples, sample_size=8192)
+
+         # Stage 1: Statistical detection
+         candidates = self._statistical_detection(samples)
+
+         if not candidates:
+             # Fall back to uint8 with low confidence
+             sample_data = self._load_as_dtype(file, "uint8", max_samples=max_samples)
+             return EncodingResult(
+                 dtype="uint8",
+                 confidence=0.3,
+                 alternatives=[],
+                 validation_passed=False,
+                 sample_data=sample_data,
+                 statistics=self._compute_statistics(sample_data),
+                 issues=["No clear encoding detected, defaulting to uint8"],
+             )
+
+         # Stage 2: Consensus detection from multiple locations
+         if validation and len(samples) > 1:
+             candidates = self._consensus_detection(file, candidates, n_samples=n_samples)
+
+         # Stage 3: Validation by loading and checking data
+         best_dtype, best_confidence = candidates[0]
+         validation_result = ValidationResult(passed=False)
+
+         if validation:
+             validation_result = self._validate_encoding(file, best_dtype, max_samples=max_samples)
+
+             # If validation failed, try alternatives
+             if not validation_result.passed and len(candidates) > 1:
+                 for dtype, conf in candidates[1:]:
+                     alt_validation = self._validate_encoding(file, dtype, max_samples=max_samples)
+                     if alt_validation.passed:
+                         validation_result = alt_validation
+                         best_dtype = dtype
+                         best_confidence = conf
+                         break
+
+         # Use validated sample data if available
+         sample_data = (
+             validation_result.sample_data
+             if validation_result.sample_data.size > 0
+             else self._load_as_dtype(file, best_dtype, max_samples=max_samples)
+         )
+
+         statistics = (
+             validation_result.statistics
+             if validation_result.statistics
+             else self._compute_statistics(sample_data)
+         )
+
+         return EncodingResult(
+             dtype=best_dtype,
+             confidence=best_confidence if validation_result.passed else best_confidence * 0.7,
+             alternatives=[(dt, conf) for dt, conf in candidates[1:6]],
+             validation_passed=validation_result.passed,
+             sample_data=sample_data,
+             statistics=statistics,
+             issues=validation_result.issues if not validation_result.passed else [],
+         )
+
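Putting the three stages together, everything is driven through detect(); a slightly fuller version of the docstring example (the file name and the fallback handling are illustrative):

    from oscura.analyzers.binary.core.file_access import BinaryFile

    detector = EncodingDetector()
    with BinaryFile("data.bin") as bf:
        result = detector.detect(bf, validation=True, n_samples=5)

    if result.validation_passed:
        print(f"Detected {result.dtype} at {result.confidence:.1%} confidence")
    else:
        # Confidence was scaled by 0.7 and the issues list explains why.
        print("Low-confidence detection:", result.dtype, result.issues)
        for alt_dtype, alt_conf in result.alternatives:
            print(f"  alternative: {alt_dtype} ({alt_conf:.1%})")
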
+     def _statistical_detection(self, samples: list[bytes]) -> list[tuple[str, float]]:
+         """Detect encoding using statistical analysis.
+
+         Args:
+             samples: List of byte samples from file.
+
+         Returns:
+             List of (dtype, confidence) tuples, sorted by confidence descending.
+         """
+         scores: dict[str, float] = {}
+
+         for dtype_name in SUPPORTED_DTYPES:
+             score = self._score_dtype(samples, dtype_name)
+             if score > 0:
+                 scores[dtype_name] = score
+
+         # Sort by score descending
+         candidates = sorted(scores.items(), key=lambda x: x[1], reverse=True)
+
+         return candidates
+
+     def _score_dtype(self, samples: list[bytes], dtype: str) -> float:
+         """Score how well a dtype fits the data.
+
+         Args:
+             samples: List of byte samples.
+             dtype: Data type to score.
+
+         Returns:
+             Score from 0.0 to 1.0, higher is better.
+         """
+         dtype_info = SUPPORTED_DTYPES[dtype]
+         itemsize = dtype_info["itemsize"]
+         is_float = dtype_info["float"]
+
+         total_score = 0.0
+         valid_samples = 0
+
+         for sample in samples:
+             if len(sample) < itemsize:
+                 continue
+
+             # Parse sample data
+             count = len(sample) // itemsize
+             try:
+                 data = np.frombuffer(sample, dtype=dtype, count=count)
+             except (ValueError, TypeError):
+                 continue
+
+             if len(data) == 0:
+                 continue
+
+             valid_samples += 1
+             sample_score = 0.0
+
+             # Check 1: Entropy (should be reasonable)
+             entropy = self._compute_entropy(sample)
+             if is_float:
+                 # Floats should have higher entropy
+                 if entropy > 4.0:
+                     sample_score += 0.3
+             else:
+                 # Integers should have moderate entropy
+                 if 1.0 < entropy < 7.0:
+                     sample_score += 0.2
+
+             # Check 2: Value range makes sense
+             # Suppress warnings for invalid casts (expected when testing encodings)
+             with warnings.catch_warnings():
+                 warnings.simplefilter("ignore", RuntimeWarning)
+                 if is_float:
+                     # No conversion needed for floats
+                     data_min = float(np.min(data))
+                     data_max = float(np.max(data))
+                 else:
+                     # Convert integers to float64 for range comparison
+                     data_float = data.astype(np.float64)
+                     data_min = float(np.min(data_float))
+                     data_max = float(np.max(data_float))
+
+             if is_float:
+                 # For floats, check IEEE 754 validity
+                 if self._is_valid_ieee754(sample, itemsize):
+                     sample_score += 0.4
+                 # Check if values are in reasonable range
+                 if data_min > -1e6 and data_max < 1e6:
+                     sample_score += 0.2
+             else:
+                 # For integers, check range
+                 type_min = dtype_info["min"]
+                 type_max = dtype_info["max"]
+                 range_score = self._score_integer_range(data_min, data_max, type_min, type_max)
+                 sample_score += range_score * 0.5
+
+                 # Bonus: Prefer types that make better use of the value range
+                 # If data uses more of the type's range, that's a better fit
+                 data_range = data_max - data_min
+                 type_range = type_max - type_min
+                 if type_range > 0:
+                     utilization = data_range / type_range
+                     # Reward types with 1%-50% utilization (good fit)
+                     if 0.01 <= utilization <= 0.5:
+                         sample_score += 0.2
+                     # Small reward for reasonable utilization
+                     elif 0.001 <= utilization < 0.01:
+                         sample_score += 0.1
+
+             # Check 3: Not all constant
+             # Suppress overflow warnings when testing invalid dtype interpretations
+             with warnings.catch_warnings():
+                 warnings.simplefilter("ignore", RuntimeWarning)
+                 if is_float:
+                     std = float(np.std(data))
+                 else:
+                     std = float(np.std(data_float))
+                 if std > self.min_std_threshold:
+                     sample_score += 0.2
+
+             # Check 4: Alignment check (data should align well)
+             if self._check_alignment(sample, itemsize):
+                 sample_score += 0.1
+
+             total_score += sample_score
+
+         if valid_samples == 0:
+             return 0.0
+
+         # Normalize to 0.0-1.0 range
+         avg_score = total_score / valid_samples
+         return min(avg_score, 1.0)
+
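For orientation, the per-sample weights above add up as follows (a rough tally restating the code; the per-file score is the average over samples, capped at 1.0):

    # float candidates:   0.3 (entropy > 4.0) + 0.4 (IEEE 754) + 0.2 (|values| < 1e6)
    #                      + 0.2 (std) + 0.1 (alignment) = 1.2
    # integer candidates: 0.2 (entropy 1-7) + 0.5 (range fit) + 0.2 (utilization bonus)
    #                      + 0.2 (std) + 0.1 (alignment) = 1.2
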
+     def _compute_entropy(self, data: bytes) -> float:
+         """Compute Shannon entropy of byte data.
+
+         Args:
+             data: Byte sequence.
+
+         Returns:
+             Entropy in bits (0-8 for bytes).
+         """
+         if len(data) == 0:
+             return 0.0
+
+         # Count byte frequencies
+         counts = np.bincount(np.frombuffer(data, dtype=np.uint8), minlength=256)
+         probabilities = counts[counts > 0] / len(data)
+
+         # Shannon entropy
+         entropy = -np.sum(probabilities * np.log2(probabilities))
+         return float(entropy)
+
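A small standalone example of the same entropy calculation, for intuition about the 0-8 bit scale:

    import numpy as np

    def shannon_entropy(data: bytes) -> float:
        counts = np.bincount(np.frombuffer(data, dtype=np.uint8), minlength=256)
        p = counts[counts > 0] / len(data)
        return float(-np.sum(p * np.log2(p)))

    print(shannon_entropy(bytes(64)))          # constant bytes -> 0 bits
    print(shannon_entropy(b"abab" * 16))       # two equally likely symbols -> 1 bit
    print(shannon_entropy(bytes(range(256))))  # every byte exactly once -> 8 bits
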
+     def _is_valid_ieee754(self, data: bytes, itemsize: int) -> bool:
+         """Check if data looks like valid IEEE 754 floats.
+
+         Args:
+             data: Byte data to check.
+             itemsize: 4 for float32, 8 for float64.
+
+         Returns:
+             True if data appears to be valid IEEE 754.
+         """
+         if itemsize not in (4, 8):
+             return False
+
+         count = min(len(data) // itemsize, ENCODING_VALIDATION_COUNT)
+         if count == 0:
+             return False
+
+         fmt = "f" if itemsize == 4 else "d"
+         valid_count = 0
+
+         for i in range(count):
+             try:
+                 offset = i * itemsize
+                 value = struct.unpack(fmt, data[offset : offset + itemsize])[0]
+
+                 # Check for NaN, Inf
+                 if np.isnan(value) or np.isinf(value):
+                     continue
+
+                 # Check if value is reasonable
+                 if abs(value) < 1e30:  # Not extremely large
+                     valid_count += 1
+
+             except (struct.error, ValueError):
+                 continue
+
+         # Validate minimum ratio of valid floats
+         return valid_count / count > MIN_VALID_FLOAT_RATIO
+
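This check is part of what guards against the uint16 → float32 misdetection called out in the module docstring: small integers reinterpreted as float32 decode without NaN/Inf, but they collapse to denormal-scale magnitudes, so the spread-based checks reject the float32 reading. An illustrative standalone sketch (sample values are made up):

    import numpy as np

    # Made-up ADC-style uint16 samples in the low thousands.
    raw = np.arange(1000, 1100, dtype=np.uint16).tobytes()

    as_u16 = np.frombuffer(raw, dtype=np.uint16)
    as_f32 = np.frombuffer(raw, dtype=np.float32)

    print(as_u16.std())               # ~28.9: healthy spread as uint16
    # Reinterpreted as float32: ~1e-36-scale values, std far below the
    # default min_std_threshold of 0.01, so the float32 reading is rejected.
    print(as_f32.max(), as_f32.std())
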
+     def _score_integer_range(
+         self, data_min: float, data_max: float, type_min: float, type_max: float
+     ) -> float:
+         """Score how well data fits integer type range.
+
+         Args:
+             data_min: Minimum value in data.
+             data_max: Maximum value in data.
+             type_min: Minimum value for type.
+             type_max: Maximum value for type.
+
+         Returns:
+             Score from 0.0 to 1.0.
+         """
+         # Data must fit in type range
+         if data_min < type_min or data_max > type_max:
+             return 0.0
+
+         # Score based on range utilization
+         data_range = data_max - data_min
+         type_range = type_max - type_min
+
+         if type_range == 0:
+             return 0.0
+
+         utilization = data_range / type_range
+
+         # Prefer types where data uses a reasonable portion of range
+         # but not types that are way too large
+         if utilization > 0.01:  # Uses at least 1% of range
+             return 1.0
+         elif utilization > 0.001:  # Uses at least 0.1%
+             return 0.7
+         else:
+             return 0.3  # Very small utilization, might be oversized type
+
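A worked example of the utilization thresholds above (the numbers just restate the code): readings spanning 0-1000 score well for 16-bit types but look oversized for uint32:

    # data_min = 0, data_max = 1000
    # uint16: 1000 / 65535      ≈ 0.0153  > 0.01  -> score 1.0
    # int16:  1000 / 65535      ≈ 0.0153  > 0.01  -> score 1.0
    # uint32: 1000 / 4294967295 ≈ 2.3e-7  < 0.001 -> score 0.3
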
+     def _check_alignment(self, data: bytes, itemsize: int) -> bool:
+         """Check if data aligns well with item size.
+
+         Args:
+             data: Byte data.
+             itemsize: Size of data type in bytes.
+
+         Returns:
+             True if data aligns well.
+         """
+         # Sample size should be a multiple of itemsize (or close)
+         remainder = len(data) % itemsize
+         return remainder < itemsize * 0.1  # Within 10%
+
+     def _consensus_detection(
+         self, file: BinaryFile, candidates: list[tuple[str, float]], n_samples: int
+     ) -> list[tuple[str, float]]:
+         """Refine detection using consensus from multiple locations.
+
+         Args:
+             file: Binary file being analyzed.
+             candidates: Initial candidate list.
+             n_samples: Number of samples to check.
+
+         Returns:
+             Refined candidate list with adjusted confidences.
+         """
+         # Sample additional locations
+         samples = file.sample_locations(n_samples=n_samples, sample_size=4096)
+
+         # Score each candidate on all samples
+         consensus_scores: dict[str, list[float]] = {dt: [] for dt, _ in candidates[:5]}
+
+         for sample in samples:
+             for dtype, scores_list in consensus_scores.items():
+                 score = self._score_dtype([sample], dtype)
+                 scores_list.append(score)
+
+         # Compute average and consistency
+         refined_candidates: list[tuple[str, float]] = []
+         for dtype, scores in consensus_scores.items():
+             if not scores:
+                 continue
+             # Suppress overflow warnings when computing statistics
+             with warnings.catch_warnings():
+                 warnings.simplefilter("ignore", RuntimeWarning)
+                 avg_score = np.mean(scores)
+                 consistency = 1.0 - np.std(scores)  # Lower std = more consistent
+
+             # Combine average score with consistency
+             final_score = avg_score * 0.7 + consistency * 0.3
+             refined_candidates.append((dtype, float(final_score)))
+
+         # Sort by final score
+         refined_candidates.sort(key=lambda x: x[1], reverse=True)
+
+         return refined_candidates
+
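To make the 0.7/0.3 blend concrete: a steadier candidate can outrank one whose average score is higher but erratic. A standalone check of the same formula (scores are made up):

    import numpy as np

    def blend(scores: list[float]) -> float:
        return float(np.mean(scores) * 0.7 + (1.0 - np.std(scores)) * 0.3)

    print(blend([0.72, 0.70, 0.71, 0.73]))  # steady  -> ~0.80
    print(blend([0.95, 0.55, 0.95, 0.55]))  # erratic -> ~0.77, despite the higher mean
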
+     def _validate_encoding(
+         self, file: BinaryFile, dtype: str, max_samples: int = 1000
+     ) -> ValidationResult:
+         """Validate encoding by loading and checking data.
+
+         Args:
+             file: Binary file to validate.
+             dtype: Data type to validate.
+             max_samples: Maximum number of data samples to load.
+
+         Returns:
+             ValidationResult with validation status and issues.
+         """
+         issues: list[str] = []
+
+         # Load sample data
+         try:
+             sample_data = self._load_as_dtype(file, dtype, max_samples=max_samples)
+         except (ValueError, TypeError) as e:
+             return ValidationResult(passed=False, issues=[f"Failed to load as {dtype}: {e!s}"])
+
+         if len(sample_data) == 0:
+             return ValidationResult(passed=False, issues=["No data could be loaded"])
+
+         # Convert to float for analysis
+         # Suppress overflow warnings when testing invalid dtype interpretations
+         with warnings.catch_warnings():
+             warnings.simplefilter("ignore", RuntimeWarning)
+             data_float = sample_data.astype(np.float64)
+
+         # Check 1: Not all constant
+         with warnings.catch_warnings():
+             warnings.simplefilter("ignore", RuntimeWarning)
+             std = float(np.std(data_float))
+         if std < self.min_std_threshold:
+             issues.append(f"Data appears constant (std={std:.2e})")
+
+         # Check 2: No NaN/Inf for float types
+         dtype_info = SUPPORTED_DTYPES[dtype]
+         if dtype_info["float"]:
+             nan_count = int(np.sum(np.isnan(data_float)))
+             inf_count = int(np.sum(np.isinf(data_float)))
+
+             if nan_count > len(data_float) * 0.1:  # More than 10% NaN
+                 issues.append(f"Too many NaN values: {nan_count}/{len(data_float)}")
+
+             if inf_count > len(data_float) * 0.1:  # More than 10% Inf
+                 issues.append(f"Too many Inf values: {inf_count}/{len(data_float)}")
+
+         # Check 3: Values in reasonable range
+         with warnings.catch_warnings():
+             warnings.simplefilter("ignore", RuntimeWarning)
+             data_min = float(np.min(data_float))
+             data_max = float(np.max(data_float))
+         type_min = dtype_info["min"]
+         type_max = dtype_info["max"]
+
+         if dtype_info["float"]:
+             # For floats, check reasonable range
+             if abs(data_min) > 1e20 or abs(data_max) > 1e20:
+                 issues.append(f"Values extremely large: [{data_min:.2e}, {data_max:.2e}]")
+         else:
+             # For integers, must fit in type
+             if data_min < type_min or data_max > type_max:
+                 issues.append(
+                     f"Values outside type range: [{data_min}, {data_max}] "
+                     f"not in [{type_min}, {type_max}]"
+                 )
+
+         # Check 4: Reasonable entropy
+         sample_bytes = file.read_bytes(0, min(8192, file.size))
+         entropy = self._compute_entropy(sample_bytes)
+
+         if entropy < 0.5:
+             issues.append(f"Very low entropy: {entropy:.2f} bits")
+
+         # Check 5: Not too many constant values
+         unique_count = len(np.unique(sample_data))
+         constant_ratio = 1.0 - (unique_count / len(sample_data))
+
+         if constant_ratio > self.max_constant_ratio:
+             issues.append(f"Too many repeated values: {constant_ratio:.1%} repetition")
+
+         # Compute statistics
+         statistics = self._compute_statistics(data_float)
+
+         # Validation passes if no critical issues
+         passed = len(issues) == 0
+
+         return ValidationResult(
+             passed=passed,
+             issues=issues,
+             statistics=statistics,
+             sample_data=data_float,
+         )
+
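In practice the most common rejection is the constant-data case; a standalone sketch that exercises the same two thresholds (default values assumed):

    import numpy as np

    data = np.full(1000, 42.0)                    # constant "signal"
    std = float(np.std(data))                     # 0.0, below min_std_threshold (0.01)
    constant_ratio = 1.0 - len(np.unique(data)) / len(data)
    print(std, constant_ratio)                    # 0.0 0.999 -> both checks add an issue
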
+     def _load_as_dtype(
+         self, file: BinaryFile, dtype: str, max_samples: int = 1000
+     ) -> NDArray[np.float64]:
+         """Load sample data as specified dtype.
+
+         Args:
+             file: Binary file to load from.
+             dtype: Data type to use.
+             max_samples: Maximum number of samples to load.
+
+         Returns:
+             Sample data as float64 array.
+         """
+         dtype_info = SUPPORTED_DTYPES[dtype]
+         itemsize = dtype_info["itemsize"]
+
+         # Calculate number of samples to load
+         max_elements = file.size // itemsize
+         count = min(max_samples, max_elements)
+
+         if count <= 0:
+             return np.array([], dtype=np.float64)
+
+         # Load data
+         data = file.read_array(offset=0, count=count, dtype=dtype)
+
+         # Convert to float64
+         # Suppress overflow warnings when converting invalid dtype interpretations
+         with warnings.catch_warnings():
+             warnings.simplefilter("ignore", RuntimeWarning)
+             return data.astype(np.float64)
+
+     def _compute_statistics(self, data: NDArray[np.float64]) -> dict[str, float]:
+         """Compute statistics for sample data.
+
+         Args:
+             data: Sample data array.
+
+         Returns:
+             Dictionary of statistics.
+         """
+         if len(data) == 0:
+             return {}
+
+         # Suppress overflow warnings when computing statistics on invalid dtype interpretations
+         with warnings.catch_warnings():
+             warnings.simplefilter("ignore", RuntimeWarning)
+             return {
+                 "mean": float(np.mean(data)),
+                 "std": float(np.std(data)),
+                 "min": float(np.min(data)),
+                 "max": float(np.max(data)),
+                 "median": float(np.median(data)),
+                 "unique_count": int(len(np.unique(data))),
+                 "total_count": int(len(data)),
+             }