oscura 0.10.0__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oscura/__init__.py +1 -1
- oscura/__main__.py +4 -0
- oscura/analyzers/binary/__init__.py +36 -0
- oscura/analyzers/binary/core/__init__.py +29 -0
- oscura/analyzers/binary/core/file_access.py +193 -0
- oscura/analyzers/binary/core/pipeline.py +161 -0
- oscura/analyzers/binary/core/results.py +217 -0
- oscura/analyzers/binary/detection/__init__.py +10 -0
- oscura/analyzers/binary/detection/encoding.py +624 -0
- oscura/analyzers/binary/detection/patterns.py +320 -0
- oscura/analyzers/binary/detection/structure.py +630 -0
- oscura/analyzers/binary/export/__init__.py +9 -0
- oscura/analyzers/binary/export/dissector.py +174 -0
- oscura/analyzers/binary/inference/__init__.py +15 -0
- oscura/analyzers/binary/inference/checksums.py +214 -0
- oscura/analyzers/binary/inference/fields.py +150 -0
- oscura/analyzers/binary/inference/sequences.py +232 -0
- oscura/analyzers/binary/inference/timestamps.py +210 -0
- oscura/analyzers/binary/visualization/__init__.py +9 -0
- oscura/analyzers/binary/visualization/structure_view.py +182 -0
- oscura/analyzers/ml/signal_classifier.py +6 -0
- oscura/analyzers/waveform/spectral.py +18 -11
- oscura/automotive/__init__.py +1 -1
- oscura/automotive/flexray/fibex.py +9 -1
- oscura/loaders/__init__.py +4 -1
- oscura/loaders/binary.py +284 -1
- oscura/loaders/validation.py +17 -10
- oscura/sessions/legacy.py +110 -1
- oscura/workflows/batch/aggregate.py +5 -1
- oscura-0.12.0.dist-info/METADATA +460 -0
- {oscura-0.10.0.dist-info → oscura-0.12.0.dist-info}/RECORD +34 -16
- oscura-0.10.0.dist-info/METADATA +0 -641
- {oscura-0.10.0.dist-info → oscura-0.12.0.dist-info}/WHEEL +0 -0
- {oscura-0.10.0.dist-info → oscura-0.12.0.dist-info}/entry_points.txt +0 -0
- {oscura-0.10.0.dist-info → oscura-0.12.0.dist-info}/licenses/LICENSE +0 -0
oscura/__init__.py
CHANGED
oscura/__main__.py
CHANGED
|
@@ -106,6 +106,10 @@ def download_file(url: str, dest: Path, checksum: str | None = None) -> bool:
|
|
|
106
106
|
# Create SSL context that works in most environments
|
|
107
107
|
context = ssl.create_default_context()
|
|
108
108
|
|
|
109
|
+
# SEC-003: Validate URL scheme to prevent file:// attacks
|
|
110
|
+
if not url.startswith(("http://", "https://")):
|
|
111
|
+
raise ValueError(f"Unsupported URL scheme (only http/https allowed): {url}")
|
|
112
|
+
|
|
109
113
|
print(f" Downloading: {url}")
|
|
110
114
|
|
|
111
115
|
with urllib.request.urlopen(url, context=context, timeout=30) as response:
|
|
"""Binary file analysis framework.

Provides comprehensive analysis of binary files including:
- Multi-stage dtype detection with validation
- Pattern mining for repeating sequences
- Message and field structure inference
- Semantic field classification
- Protocol identification
- Visualization and dissector generation
"""

from __future__ import annotations

from oscura.analyzers.binary.core.pipeline import BinaryAnalyzer
from oscura.analyzers.binary.core.results import (
    BinaryAnalysisResult,
    EncodingResult,
    Field,
    FieldType,
    Message,
    Pattern,
    PatternRole,
    StructureResult,
)

# Public API of the binary-analysis subpackage, sorted alphabetically.
__all__ = [
    "BinaryAnalysisResult",
    "BinaryAnalyzer",
    "EncodingResult",
    "Field",
    "FieldType",
    "Message",
    "Pattern",
    "PatternRole",
    "StructureResult",
]
"""Core binary analysis components."""

from __future__ import annotations

from oscura.analyzers.binary.core.file_access import BinaryFile
from oscura.analyzers.binary.core.pipeline import BinaryAnalyzer
from oscura.analyzers.binary.core.results import (
    BinaryAnalysisResult,
    EncodingResult,
    Field,
    FieldType,
    Message,
    Pattern,
    PatternRole,
    StructureResult,
)

# Re-exported names, sorted alphabetically.
__all__ = [
    "BinaryAnalysisResult",
    "BinaryAnalyzer",
    "BinaryFile",
    "EncodingResult",
    "Field",
    "FieldType",
    "Message",
    "Pattern",
    "PatternRole",
    "StructureResult",
]
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"""Efficient binary file access with memory mapping, caching, and thread safety."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import mmap
|
|
6
|
+
import threading
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import TYPE_CHECKING, Any
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from os import PathLike
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class BinaryFile:
|
|
17
|
+
"""Memory-efficient, thread-safe binary file access layer.
|
|
18
|
+
|
|
19
|
+
Provides efficient access to binary files of any size using memory mapping
|
|
20
|
+
and caching. Designed to handle multi-GB files with minimal memory footprint.
|
|
21
|
+
All operations are protected by an RLock for thread safety.
|
|
22
|
+
|
|
23
|
+
Example:
|
|
24
|
+
>>> bf = BinaryFile("large_file.bin")
|
|
25
|
+
>>> data = bf.read_bytes(0, 1024) # Read first 1KB
|
|
26
|
+
>>> samples = bf.sample_locations(n_samples=5) # Sample 5 locations
|
|
27
|
+
>>> bf.close()
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
def __init__(self, path: str | PathLike[str]):
|
|
31
|
+
"""Initialize binary file access.
|
|
32
|
+
|
|
33
|
+
Args:
|
|
34
|
+
path: Path to binary file.
|
|
35
|
+
"""
|
|
36
|
+
self.path = Path(path)
|
|
37
|
+
self.size = self.path.stat().st_size
|
|
38
|
+
self._file_handle: Any | None = None
|
|
39
|
+
self._mmap: mmap.mmap | None = None
|
|
40
|
+
self._lock = threading.RLock()
|
|
41
|
+
|
|
42
|
+
def _ensure_open(self) -> None:
|
|
43
|
+
"""Ensure file is open and memory mapped.
|
|
44
|
+
|
|
45
|
+
Thread-safe. Cleans up partial state on failure to prevent
|
|
46
|
+
resource leaks.
|
|
47
|
+
"""
|
|
48
|
+
with self._lock:
|
|
49
|
+
if self._mmap is None:
|
|
50
|
+
try:
|
|
51
|
+
self._file_handle = open(self.path, "rb") # noqa: SIM115
|
|
52
|
+
if self.size > 0:
|
|
53
|
+
self._mmap = mmap.mmap(
|
|
54
|
+
self._file_handle.fileno(), 0, access=mmap.ACCESS_READ
|
|
55
|
+
)
|
|
56
|
+
except Exception:
|
|
57
|
+
if self._file_handle is not None:
|
|
58
|
+
self._file_handle.close()
|
|
59
|
+
self._file_handle = None
|
|
60
|
+
raise
|
|
61
|
+
|
|
62
|
+
def read_bytes(self, offset: int, length: int) -> bytes:
|
|
63
|
+
"""Read bytes from file at given offset.
|
|
64
|
+
|
|
65
|
+
Args:
|
|
66
|
+
offset: Byte offset to start reading.
|
|
67
|
+
length: Number of bytes to read.
|
|
68
|
+
|
|
69
|
+
Returns:
|
|
70
|
+
Bytes read from file.
|
|
71
|
+
|
|
72
|
+
Example:
|
|
73
|
+
>>> bf = BinaryFile("data.bin")
|
|
74
|
+
>>> header = bf.read_bytes(0, 16)
|
|
75
|
+
"""
|
|
76
|
+
if offset + length > self.size:
|
|
77
|
+
length = max(0, self.size - offset)
|
|
78
|
+
|
|
79
|
+
if length <= 0:
|
|
80
|
+
return b""
|
|
81
|
+
|
|
82
|
+
self._ensure_open()
|
|
83
|
+
|
|
84
|
+
with self._lock:
|
|
85
|
+
if self._mmap is not None:
|
|
86
|
+
return bytes(self._mmap[offset : offset + length])
|
|
87
|
+
|
|
88
|
+
return b""
|
|
89
|
+
|
|
90
|
+
def read_regions(self, regions: list[tuple[int, int]]) -> list[bytes]:
|
|
91
|
+
"""Read multiple regions efficiently.
|
|
92
|
+
|
|
93
|
+
Args:
|
|
94
|
+
regions: List of (offset, length) tuples.
|
|
95
|
+
|
|
96
|
+
Returns:
|
|
97
|
+
List of bytes for each region.
|
|
98
|
+
|
|
99
|
+
Example:
|
|
100
|
+
>>> bf = BinaryFile("data.bin")
|
|
101
|
+
>>> regions = [(0, 16), (1000, 16), (2000, 16)]
|
|
102
|
+
>>> data = bf.read_regions(regions)
|
|
103
|
+
"""
|
|
104
|
+
return [self.read_bytes(offset, length) for offset, length in regions]
|
|
105
|
+
|
|
106
|
+
def sample_locations(self, n_samples: int = 5, sample_size: int = 8192) -> list[bytes]:
|
|
107
|
+
"""Sample data from multiple file locations.
|
|
108
|
+
|
|
109
|
+
Samples evenly distributed throughout the file for robust analysis.
|
|
110
|
+
|
|
111
|
+
Args:
|
|
112
|
+
n_samples: Number of locations to sample.
|
|
113
|
+
sample_size: Bytes to read at each location.
|
|
114
|
+
|
|
115
|
+
Returns:
|
|
116
|
+
List of byte samples from different file locations.
|
|
117
|
+
|
|
118
|
+
Example:
|
|
119
|
+
>>> bf = BinaryFile("data.bin")
|
|
120
|
+
>>> samples = bf.sample_locations(n_samples=5, sample_size=4096)
|
|
121
|
+
"""
|
|
122
|
+
if self.size < sample_size:
|
|
123
|
+
return [self.read_bytes(0, self.size)]
|
|
124
|
+
|
|
125
|
+
# Calculate evenly-spaced locations
|
|
126
|
+
locations = np.linspace(0, self.size - sample_size, n_samples, dtype=int)
|
|
127
|
+
|
|
128
|
+
return [self.read_bytes(int(loc), sample_size) for loc in locations]
|
|
129
|
+
|
|
130
|
+
def read_array(
|
|
131
|
+
self, offset: int, count: int, dtype: str | np.dtype[Any]
|
|
132
|
+
) -> np.ndarray[Any, Any]:
|
|
133
|
+
"""Read data as numpy array.
|
|
134
|
+
|
|
135
|
+
Args:
|
|
136
|
+
offset: Byte offset to start reading.
|
|
137
|
+
count: Number of elements to read (-1 for all).
|
|
138
|
+
dtype: NumPy dtype for interpretation.
|
|
139
|
+
|
|
140
|
+
Returns:
|
|
141
|
+
NumPy array of data.
|
|
142
|
+
|
|
143
|
+
Example:
|
|
144
|
+
>>> bf = BinaryFile("data.bin")
|
|
145
|
+
>>> data = bf.read_array(0, 1000, dtype="uint16")
|
|
146
|
+
"""
|
|
147
|
+
np_dtype = np.dtype(dtype)
|
|
148
|
+
bytes_per_element = np_dtype.itemsize
|
|
149
|
+
byte_offset = offset * bytes_per_element
|
|
150
|
+
|
|
151
|
+
if count == -1:
|
|
152
|
+
# Read all available data
|
|
153
|
+
available_bytes = self.size - byte_offset
|
|
154
|
+
count = available_bytes // bytes_per_element
|
|
155
|
+
|
|
156
|
+
if count <= 0:
|
|
157
|
+
return np.array([], dtype=np_dtype)
|
|
158
|
+
|
|
159
|
+
bytes_to_read = count * bytes_per_element
|
|
160
|
+
|
|
161
|
+
if byte_offset + bytes_to_read > self.size:
|
|
162
|
+
bytes_to_read = self.size - byte_offset
|
|
163
|
+
count = bytes_to_read // bytes_per_element
|
|
164
|
+
|
|
165
|
+
data_bytes = self.read_bytes(byte_offset, bytes_to_read)
|
|
166
|
+
return np.frombuffer(data_bytes, dtype=np_dtype, count=count)
|
|
167
|
+
|
|
168
|
+
def close(self) -> None:
|
|
169
|
+
"""Close file and release resources."""
|
|
170
|
+
with self._lock:
|
|
171
|
+
if self._mmap is not None:
|
|
172
|
+
self._mmap.close()
|
|
173
|
+
self._mmap = None
|
|
174
|
+
|
|
175
|
+
if self._file_handle is not None:
|
|
176
|
+
self._file_handle.close()
|
|
177
|
+
self._file_handle = None
|
|
178
|
+
|
|
179
|
+
def __enter__(self) -> BinaryFile:
|
|
180
|
+
"""Context manager entry."""
|
|
181
|
+
return self
|
|
182
|
+
|
|
183
|
+
def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
|
|
184
|
+
"""Context manager exit."""
|
|
185
|
+
self.close()
|
|
186
|
+
|
|
187
|
+
def __del__(self) -> None:
|
|
188
|
+
"""Destructor to ensure cleanup."""
|
|
189
|
+
self.close()
|
|
190
|
+
|
|
191
|
+
def __repr__(self) -> str:
|
|
192
|
+
"""String representation."""
|
|
193
|
+
return f"BinaryFile('{self.path}', size={self.size:,} bytes)"
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
"""Binary analysis pipeline orchestrator."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import TYPE_CHECKING, Any
|
|
7
|
+
|
|
8
|
+
from oscura.analyzers.binary.core.file_access import BinaryFile
|
|
9
|
+
from oscura.analyzers.binary.core.results import BinaryAnalysisResult
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from os import PathLike
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class BinaryAnalyzer:
|
|
16
|
+
"""Complete binary file analysis pipeline.
|
|
17
|
+
|
|
18
|
+
Orchestrates multi-stage analysis of binary files including encoding
|
|
19
|
+
detection, pattern mining, structure inference, and semantic analysis.
|
|
20
|
+
|
|
21
|
+
Example:
|
|
22
|
+
>>> analyzer = BinaryAnalyzer("unknown_file.bin")
|
|
23
|
+
>>> results = analyzer.analyze(max_samples=100_000)
|
|
24
|
+
>>> print(results)
|
|
25
|
+
>>> results.to_dict() # Export to dictionary
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
def __init__(self, file_path: str | PathLike[str]):
|
|
29
|
+
"""Initialize binary analyzer.
|
|
30
|
+
|
|
31
|
+
Args:
|
|
32
|
+
file_path: Path to binary file to analyze.
|
|
33
|
+
"""
|
|
34
|
+
self.file_path = Path(file_path)
|
|
35
|
+
self.binary_file = BinaryFile(self.file_path)
|
|
36
|
+
self.results: BinaryAnalysisResult | None = None
|
|
37
|
+
|
|
38
|
+
def analyze(
|
|
39
|
+
self,
|
|
40
|
+
max_samples: int = 100_000,
|
|
41
|
+
enable_structure_inference: bool = True,
|
|
42
|
+
enable_semantic_analysis: bool = True,
|
|
43
|
+
) -> BinaryAnalysisResult:
|
|
44
|
+
"""Run complete analysis pipeline.
|
|
45
|
+
|
|
46
|
+
Args:
|
|
47
|
+
max_samples: Maximum samples to load for analysis.
|
|
48
|
+
enable_structure_inference: Enable message/field structure detection.
|
|
49
|
+
enable_semantic_analysis: Enable field type classification.
|
|
50
|
+
|
|
51
|
+
Returns:
|
|
52
|
+
Complete analysis results.
|
|
53
|
+
|
|
54
|
+
Example:
|
|
55
|
+
>>> analyzer = BinaryAnalyzer("data.bin")
|
|
56
|
+
>>> results = analyzer.analyze(max_samples=100_000)
|
|
57
|
+
>>> print(f"Detected dtype: {results.encoding.dtype}")
|
|
58
|
+
>>> print(f"Found {len(results.patterns)} patterns")
|
|
59
|
+
"""
|
|
60
|
+
# Import analysis components (lazy import to avoid circular dependencies)
|
|
61
|
+
from oscura.analyzers.binary.detection.encoding import EncodingDetector
|
|
62
|
+
from oscura.analyzers.binary.detection.patterns import PatternMiner
|
|
63
|
+
|
|
64
|
+
# Stage 1: Encoding Detection
|
|
65
|
+
encoding_detector = EncodingDetector()
|
|
66
|
+
encoding = encoding_detector.detect(
|
|
67
|
+
self.binary_file, validation=True, max_samples=max_samples
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
# Stage 2: Pattern Mining
|
|
71
|
+
pattern_miner = PatternMiner()
|
|
72
|
+
patterns = pattern_miner.find_patterns(
|
|
73
|
+
self.binary_file, min_length=2, max_length=64, min_occurrences=3
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
# Stage 3: Structure Inference (if enabled and patterns found)
|
|
77
|
+
structure = None
|
|
78
|
+
if enable_structure_inference and patterns:
|
|
79
|
+
from oscura.analyzers.binary.detection.structure import StructureInferencer
|
|
80
|
+
|
|
81
|
+
structure_inferencer = StructureInferencer()
|
|
82
|
+
structure = structure_inferencer.infer(
|
|
83
|
+
self.binary_file, patterns=patterns, encoding=encoding
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
# Stage 4: Semantic Analysis (if enabled and structure found)
|
|
87
|
+
if enable_semantic_analysis and structure is not None and structure.has_messages:
|
|
88
|
+
from oscura.analyzers.binary.inference.fields import SemanticAnalyzer
|
|
89
|
+
|
|
90
|
+
semantic_analyzer = SemanticAnalyzer()
|
|
91
|
+
structure.fields = semantic_analyzer.analyze_fields(
|
|
92
|
+
self.binary_file, structure=structure
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
# Calculate overall confidence
|
|
96
|
+
confidence = self._calculate_confidence(encoding, patterns, structure)
|
|
97
|
+
|
|
98
|
+
# Build result
|
|
99
|
+
self.results = BinaryAnalysisResult(
|
|
100
|
+
encoding=encoding,
|
|
101
|
+
patterns=patterns,
|
|
102
|
+
structure=structure,
|
|
103
|
+
confidence=confidence,
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
return self.results
|
|
107
|
+
|
|
108
|
+
def _calculate_confidence(self, encoding: Any, patterns: list[Any], structure: Any) -> float:
|
|
109
|
+
"""Calculate overall analysis confidence.
|
|
110
|
+
|
|
111
|
+
Args:
|
|
112
|
+
encoding: Encoding detection result.
|
|
113
|
+
patterns: Pattern mining results.
|
|
114
|
+
structure: Structure inference result.
|
|
115
|
+
|
|
116
|
+
Returns:
|
|
117
|
+
Overall confidence score (0-1).
|
|
118
|
+
"""
|
|
119
|
+
scores = [encoding.confidence]
|
|
120
|
+
|
|
121
|
+
# Pattern quality score
|
|
122
|
+
if patterns:
|
|
123
|
+
regular_patterns = sum(1 for p in patterns if p.regular)
|
|
124
|
+
pattern_score = min(1.0, regular_patterns / max(1, len(patterns)))
|
|
125
|
+
scores.append(pattern_score)
|
|
126
|
+
|
|
127
|
+
# Structure quality score
|
|
128
|
+
if structure and structure.has_messages:
|
|
129
|
+
scores.append(structure.confidence)
|
|
130
|
+
|
|
131
|
+
return sum(scores) / len(scores) if scores else 0.0
|
|
132
|
+
|
|
133
|
+
def export_results(self, output_path: str | PathLike[str]) -> None:
|
|
134
|
+
"""Export analysis results to JSON.
|
|
135
|
+
|
|
136
|
+
Args:
|
|
137
|
+
output_path: Path to output JSON file.
|
|
138
|
+
|
|
139
|
+
Example:
|
|
140
|
+
>>> analyzer = BinaryAnalyzer("data.bin")
|
|
141
|
+
>>> results = analyzer.analyze()
|
|
142
|
+
>>> analyzer.export_results("analysis_results.json")
|
|
143
|
+
"""
|
|
144
|
+
if self.results is None:
|
|
145
|
+
msg = "No results to export. Run analyze() first."
|
|
146
|
+
raise ValueError(msg)
|
|
147
|
+
|
|
148
|
+
import json
|
|
149
|
+
|
|
150
|
+
output_path = Path(output_path)
|
|
151
|
+
output_path.write_text(json.dumps(self.results.to_dict(), indent=2))
|
|
152
|
+
|
|
153
|
+
def __repr__(self) -> str:
|
|
154
|
+
"""String representation."""
|
|
155
|
+
return f"BinaryAnalyzer('{self.file_path}')"
|
|
156
|
+
|
|
157
|
+
def __str__(self) -> str:
|
|
158
|
+
"""Human-readable string."""
|
|
159
|
+
if self.results:
|
|
160
|
+
return str(self.results)
|
|
161
|
+
return f"BinaryAnalyzer('{self.file_path}', not yet analyzed)"
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
"""Result types for binary analysis."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from enum import Enum
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
from numpy.typing import NDArray
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class PatternRole(Enum):
|
|
14
|
+
"""Role classification for repeating patterns."""
|
|
15
|
+
|
|
16
|
+
HEADER = "header"
|
|
17
|
+
FOOTER = "footer"
|
|
18
|
+
DELIMITER = "delimiter"
|
|
19
|
+
SYNC_MARKER = "sync_marker"
|
|
20
|
+
UNKNOWN = "unknown"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class FieldType(Enum):
|
|
24
|
+
"""Field type classification."""
|
|
25
|
+
|
|
26
|
+
CONSTANT = "constant"
|
|
27
|
+
SEQUENCE = "sequence"
|
|
28
|
+
TIMESTAMP = "timestamp"
|
|
29
|
+
CHECKSUM = "checksum"
|
|
30
|
+
PAYLOAD = "payload"
|
|
31
|
+
LENGTH = "length"
|
|
32
|
+
UNKNOWN = "unknown"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
|
|
36
|
+
class Pattern:
|
|
37
|
+
"""Repeating byte pattern in binary file."""
|
|
38
|
+
|
|
39
|
+
bytes: bytes
|
|
40
|
+
positions: list[int]
|
|
41
|
+
count: int
|
|
42
|
+
avg_spacing: float
|
|
43
|
+
spacing_variance: float
|
|
44
|
+
regular: bool
|
|
45
|
+
role: PatternRole
|
|
46
|
+
|
|
47
|
+
def to_dict(self) -> dict[str, Any]:
|
|
48
|
+
"""Export to dictionary."""
|
|
49
|
+
return {
|
|
50
|
+
"hex": self.bytes.hex(" ").upper(),
|
|
51
|
+
"length": len(self.bytes),
|
|
52
|
+
"occurrences": self.count,
|
|
53
|
+
"regular_spacing": self.regular,
|
|
54
|
+
"avg_spacing": self.avg_spacing,
|
|
55
|
+
"spacing_variance": self.spacing_variance,
|
|
56
|
+
"role": self.role.value,
|
|
57
|
+
"sample_positions": self.positions[:10],
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass
|
|
62
|
+
class Message:
|
|
63
|
+
"""Extracted message from binary file."""
|
|
64
|
+
|
|
65
|
+
offset: int
|
|
66
|
+
length: int
|
|
67
|
+
data: bytes
|
|
68
|
+
index: int
|
|
69
|
+
|
|
70
|
+
def to_dict(self) -> dict[str, Any]:
|
|
71
|
+
"""Export to dictionary."""
|
|
72
|
+
return {
|
|
73
|
+
"index": self.index,
|
|
74
|
+
"offset": self.offset,
|
|
75
|
+
"length": self.length,
|
|
76
|
+
"hex_preview": self.data[:64].hex(" ").upper()
|
|
77
|
+
if len(self.data) > 64
|
|
78
|
+
else self.data.hex(" ").upper(),
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@dataclass
|
|
83
|
+
class Field:
|
|
84
|
+
"""Message field with semantic information."""
|
|
85
|
+
|
|
86
|
+
name: str
|
|
87
|
+
offset: int
|
|
88
|
+
length: int
|
|
89
|
+
field_type: FieldType
|
|
90
|
+
constant: bool
|
|
91
|
+
values: list[Any] = field(default_factory=list)
|
|
92
|
+
statistics: dict[str, float] = field(default_factory=dict)
|
|
93
|
+
metadata: dict[str, Any] = field(default_factory=dict)
|
|
94
|
+
|
|
95
|
+
def to_dict(self) -> dict[str, Any]:
|
|
96
|
+
"""Export to dictionary."""
|
|
97
|
+
result = {
|
|
98
|
+
"name": self.name,
|
|
99
|
+
"offset": self.offset,
|
|
100
|
+
"length": self.length,
|
|
101
|
+
"type": self.field_type.value,
|
|
102
|
+
"constant": self.constant,
|
|
103
|
+
"statistics": self.statistics,
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
# Add sample values
|
|
107
|
+
if self.values:
|
|
108
|
+
if isinstance(self.values[0], bytes):
|
|
109
|
+
result["sample_values"] = [v.hex(" ").upper() for v in self.values[:5]]
|
|
110
|
+
else:
|
|
111
|
+
result["sample_values"] = self.values[:5]
|
|
112
|
+
|
|
113
|
+
# Add metadata
|
|
114
|
+
if self.metadata:
|
|
115
|
+
result["metadata"] = self.metadata
|
|
116
|
+
|
|
117
|
+
return result
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
@dataclass
|
|
121
|
+
class EncodingResult:
|
|
122
|
+
"""Encoding detection result with validation."""
|
|
123
|
+
|
|
124
|
+
dtype: str
|
|
125
|
+
confidence: float
|
|
126
|
+
alternatives: list[tuple[str, float]]
|
|
127
|
+
validation_passed: bool
|
|
128
|
+
sample_data: NDArray[np.float64]
|
|
129
|
+
statistics: dict[str, float]
|
|
130
|
+
issues: list[str] = field(default_factory=list)
|
|
131
|
+
|
|
132
|
+
def to_dict(self) -> dict[str, Any]:
|
|
133
|
+
"""Export to dictionary."""
|
|
134
|
+
return {
|
|
135
|
+
"detected_dtype": self.dtype,
|
|
136
|
+
"confidence": f"{self.confidence:.1%}",
|
|
137
|
+
"alternatives": [
|
|
138
|
+
{"dtype": dt, "confidence": f"{conf:.1%}"} for dt, conf in self.alternatives[:3]
|
|
139
|
+
],
|
|
140
|
+
"validation_passed": self.validation_passed,
|
|
141
|
+
"statistics": {
|
|
142
|
+
"mean": float(np.mean(self.sample_data)),
|
|
143
|
+
"std": float(np.std(self.sample_data)),
|
|
144
|
+
"min": float(np.min(self.sample_data)),
|
|
145
|
+
"max": float(np.max(self.sample_data)),
|
|
146
|
+
},
|
|
147
|
+
"issues": self.issues,
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
@dataclass
|
|
152
|
+
class StructureResult:
|
|
153
|
+
"""Inferred file structure result."""
|
|
154
|
+
|
|
155
|
+
has_messages: bool
|
|
156
|
+
message_length: int | None
|
|
157
|
+
message_count: int
|
|
158
|
+
messages: list[Message]
|
|
159
|
+
fields: list[Field]
|
|
160
|
+
confidence: float
|
|
161
|
+
|
|
162
|
+
def to_dict(self) -> dict[str, Any]:
|
|
163
|
+
"""Export to dictionary."""
|
|
164
|
+
result: dict[str, Any] = {
|
|
165
|
+
"has_messages": self.has_messages,
|
|
166
|
+
"message_length": self.message_length,
|
|
167
|
+
"message_count": self.message_count,
|
|
168
|
+
"confidence": f"{self.confidence:.1%}",
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
if self.fields:
|
|
172
|
+
result["fields"] = [f.to_dict() for f in self.fields]
|
|
173
|
+
|
|
174
|
+
if self.messages:
|
|
175
|
+
result["sample_messages"] = [m.to_dict() for m in self.messages[:5]]
|
|
176
|
+
|
|
177
|
+
return result
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
@dataclass
|
|
181
|
+
class BinaryAnalysisResult:
|
|
182
|
+
"""Complete binary analysis result."""
|
|
183
|
+
|
|
184
|
+
encoding: EncodingResult
|
|
185
|
+
patterns: list[Pattern]
|
|
186
|
+
structure: StructureResult | None = None
|
|
187
|
+
confidence: float = 0.0
|
|
188
|
+
|
|
189
|
+
def to_dict(self) -> dict[str, Any]:
|
|
190
|
+
"""Export complete results to dictionary."""
|
|
191
|
+
result = {
|
|
192
|
+
"encoding": self.encoding.to_dict(),
|
|
193
|
+
"patterns": [p.to_dict() for p in self.patterns],
|
|
194
|
+
"confidence": f"{self.confidence:.1%}",
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
if self.structure:
|
|
198
|
+
result["structure"] = self.structure.to_dict()
|
|
199
|
+
|
|
200
|
+
return result
|
|
201
|
+
|
|
202
|
+
def __str__(self) -> str:
|
|
203
|
+
"""Human-readable summary."""
|
|
204
|
+
lines = [
|
|
205
|
+
"=== Binary Analysis Results ===",
|
|
206
|
+
f"Encoding: {self.encoding.dtype} ({self.encoding.confidence:.1%} confidence)",
|
|
207
|
+
f"Patterns found: {len(self.patterns)}",
|
|
208
|
+
]
|
|
209
|
+
|
|
210
|
+
if self.structure and self.structure.has_messages:
|
|
211
|
+
lines.append(f"Messages: {self.structure.message_count}")
|
|
212
|
+
lines.append(f"Message length: {self.structure.message_length} bytes")
|
|
213
|
+
lines.append(f"Fields identified: {len(self.structure.fields)}")
|
|
214
|
+
|
|
215
|
+
lines.append(f"Overall confidence: {self.confidence:.1%}")
|
|
216
|
+
|
|
217
|
+
return "\n".join(lines)
|
|
"""Binary file detection modules.

Encoding detection, pattern mining, and structure inference.
"""

from oscura.analyzers.binary.detection.encoding import EncodingDetector
from oscura.analyzers.binary.detection.patterns import PatternMiner
from oscura.analyzers.binary.detection.structure import StructureInferencer

# Detection-stage public API.
__all__ = ["EncodingDetector", "PatternMiner", "StructureInferencer"]