oswatcher-plugins 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oswatcher_plugins-0.14.0.dist-info/METADATA +79 -0
- oswatcher_plugins-0.14.0.dist-info/RECORD +26 -0
- oswatcher_plugins-0.14.0.dist-info/WHEEL +4 -0
- oswatcher_plugins-0.14.0.dist-info/entry_points.txt +3 -0
- oswatcher_plugins-0.14.0.dist-info/licenses/LICENSE +202 -0
- plugins/__init__.py +0 -0
- plugins/__main__.py +69 -0
- plugins/config/__init__.py +18 -0
- plugins/config/default_settings.toml +1 -0
- plugins/plugins/__init__.py +29 -0
- plugins/plugins/filetype.py +58 -0
- plugins/plugins/linux_symbols.py +271 -0
- plugins/plugins/linux_symbols_service.py +373 -0
- plugins/plugins/registry.py +224 -0
- plugins/plugins/symbols.py +475 -0
- plugins/plugins/symbols_repository.py +142 -0
- plugins/plugins/symbols_service.py +47 -0
- plugins/plugins/syscalls.py +180 -0
- plugins/syscalls/exceptions.py +25 -0
- plugins/syscalls/filesystem.py +121 -0
- plugins/syscalls/kernel_parser.py +36 -0
- plugins/syscalls/kernel_repo_manager.py +106 -0
- plugins/syscalls/nodes.py +119 -0
- plugins/syscalls/syscall_table_parser.py +46 -0
- plugins/syscalls/syscalls_h_parser.py +52 -0
- plugins/types.py +108 -0
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
from pathlib import Path, PurePath
|
|
5
|
+
from typing import Dict, Iterator, List
|
|
6
|
+
|
|
7
|
+
from attrs import define, field
|
|
8
|
+
from neogit.core.merkle import MerkleVisitor
|
|
9
|
+
from neogit.core.model import MerkleLabel, MerkleNode, Node
|
|
10
|
+
from neogit.core.visitor import VisitedNode
|
|
11
|
+
from neogit.model.merkle import Blob
|
|
12
|
+
from neogit.model.neo import Commit, Tree
|
|
13
|
+
from regipy import NKRecord, RegistryHive, Subkey, Value
|
|
14
|
+
|
|
15
|
+
from plugins.types import AbstractPlugin, UniqueConstraint
|
|
16
|
+
|
|
17
|
+
# Directory holding the system-wide registry hive files.
S32_CONFIG = PurePath("/Windows/System32/config")

# HKEY_LOCAL_MACHINE root key path.
HKLM = PurePath("HKEY_LOCAL_MACHINE")
# HKEY_USERS root key path.
HKU = PurePath("HKEY_USERS")
# Registry key name of the BCD mount point in HKLM.
BCD_MOUNT_NAME = "BCD00000000"

# Mapping from a hive file path to the registry path it is mounted at.
# NOTE(review): the keys are plain PurePath objects, which compare
# case-sensitively on POSIX hosts. An earlier note here mentioned
# PureWindowsPath for case-insensitive matching -- confirm which is intended.
HIVE_MAPPING: Dict[PurePath, PurePath] = {
    # HKLM hives
    S32_CONFIG / "SAM": HKLM / "SAM",
    S32_CONFIG / "SECURITY": HKLM / "SECURITY",
    S32_CONFIG / "SOFTWARE": HKLM / "SOFTWARE",
    S32_CONFIG / "SYSTEM": HKLM / "SYSTEM",
    # BCD
    # Bios boot
    PurePath("/boot/BCD"): HKLM / BCD_MOUNT_NAME,
    # EFI boot
    PurePath("/EFI/Microsoft/Boot/BCD"): HKLM / BCD_MOUNT_NAME,
    # HKEY_USERS hives
    S32_CONFIG / "DEFAULT": HKU / ".Default",
}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@define(auto_attribs=True)
class CommonWinRegNode(Node):
    """Base class shared by the registry key and registry value nodes.

    ``path`` is the registry path of the container this node lives in;
    concrete subclasses provide ``name``.
    """

    path: PurePath = field()

    @property
    def name(self) -> str:
        """Name of the node. Subclasses must override this property."""
        raise NotImplementedError

    @property
    def fullpath(self) -> PurePath:
        """Return ``path`` joined with the node's ``name``."""
        return self.path.joinpath(self.name)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@define(auto_attribs=True)
class WinRegValueNode(CommonWinRegNode):
    """Leaf node wrapping a single registry value."""

    value: Value = field()

    @property
    def name(self) -> str:
        """Name of the wrapped registry value."""
        wrapped = self.value
        return wrapped.name
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@define(auto_attribs=True)
class WinRegKeyNode(CommonWinRegNode):
    """Node wrapping a registry key; its children are its subkeys and values."""

    key: NKRecord = field()

    @property
    def name(self) -> str:
        """Name of the wrapped registry key."""
        wrapped = self.key
        return wrapped.name

    def iter_child_nodes(self) -> Iterator[Node]:
        """Yield one node per subkey, then one node per value.

        Every child receives this key's ``fullpath`` as its ``path``.
        """
        yield from (WinRegKeyNode(self.fullpath, sub_key) for sub_key in self.key.iter_subkeys())
        yield from (WinRegValueNode(self.fullpath, value) for value in self.key.iter_values())
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@define(auto_attribs=True)
class WinRegValueMerkleNode(MerkleNode):
    """Merkle node built for a registry value; keeps a reference to that value."""

    # keyword-only: the registry value this merkle node was computed from
    value: Value = field(kw_only=True)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@define(auto_attribs=True)
class WinRegKeyMerkleNode(MerkleNode):
    """Merkle node built for a registry key; keeps a reference to that key."""

    # keyword-only: the registry key this merkle node was computed from.
    # Annotated as NKRecord (not Subkey) because the visitor passes
    # WinRegKeyNode.key, which is an NKRecord.
    key: NKRecord = field(kw_only=True)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class WinRegMerkleVisitor(MerkleVisitor):
    """Merkle visitor computing hashes for registry keys and values."""

    def visit_WinRegValueNode(self, node: WinRegValueNode, hash_obj: hashlib._Hash, *args, **kwargs) -> VisitedNode:
        """Hash a value from its name, data and type and wrap it in a Blob merkle node."""
        reg_value = node.value
        digest_input = f"{reg_value.name}{reg_value.value}{reg_value.value_type}"
        hash_obj.update(digest_input.encode())
        value_merkle = WinRegValueMerkleNode(  # type: ignore[call-arg]
            hash=hash_obj.hexdigest(), label=MerkleLabel.Blob, value=reg_value
        )
        return VisitedNode(node, value_merkle)

    def visit_WinRegKeyNode(self, node: WinRegKeyNode, hash_obj: hashlib._Hash, *args, **kwargs) -> VisitedNode:
        """Hash a key from the names and hashes of its children and wrap it in a Tree merkle node."""
        self.logger.debug("Visiting Key %s", node.fullpath)

        def keys_then_values(entry):
            # False sorts before True: keys come first, values second,
            # and entries of the same sort are ordered by name
            return (not isinstance(entry, WinRegKeyNode), entry.name)

        # NOTE(review): children are indexed by name only, so a subkey and a
        # value sharing the same name would end up under one entry -- confirm
        # whether that can happen with real hives.
        merkle_children = {}
        for child_node in sorted(node.iter_child_nodes(), key=keys_then_values):
            child_merkle = self.visit(child_node).return_value
            hash_obj.update(f"{child_node.name}{child_merkle.hash}\n".encode())
            # clear out the current merkle node children Dict
            # so we don't end up with a giant tree in memory
            # TODO: this breaks the plugin: when the node is about to be inserted,
            # it apparently has no children anymore
            # child_merkle.children.clear()
            merkle_children[child_node.name] = child_merkle
        # compute final hash
        key_merkle = WinRegKeyMerkleNode(  # type: ignore[call-arg]
            hash=hash_obj.hexdigest(), children=merkle_children, label=MerkleLabel.Tree, key=node.key
        )
        return VisitedNode(node, key_merkle)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
@define(auto_attribs=True)
class WinRegistryPlugin(AbstractPlugin):
    """Plugin dumping the registry hives listed in ``HIVE_MAPPING`` into the graph database."""

    def constraints_data(self) -> List[UniqueConstraint]:
        """Declare ``hash`` as unique for the WinRegKey and WinRegValue labels."""
        return [UniqueConstraint(label=label, property_list=["hash"]) for label in ("WinRegKey", "WinRegValue")]

    def run(self, commit: Commit):
        """Dump every known hive found in the commit's filesystem.

        Hives whose lookup raises ``FileNotFoundError`` are logged and skipped.
        """
        fs: Tree = commit.filesystem.single()
        for hive_path, root_hive in HIVE_MAPPING.items():
            try:
                blob = fs.get_blob_at_path(hive_path)
                with self.downloaded_file(blob.hash) as hive_local_path:
                    self.logger.info("Dumping %s", hive_path)
                    node = self.dump_hive(hive_path, hive_local_path, root_hive)
                    if node is None:
                        # the hive could not be loaded, nothing to attach
                        continue
                    # attach Key to blob
                    self.attach_root_key_to_blob(blob, node.return_value, root_hive.name)
            except FileNotFoundError:
                self.logger.warning("Not found: %s", hive_path)

    def dump_hive(self, hive_win_path: PurePath, hive_local_path: Path, root_hive: PurePath):
        """Dump a Windows registry hive

        :param hive_win_path: path of the hive, only used in log messages
        :param hive_local_path: host path to the downloaded hive file
        :param root_hive: registry path given to the hive's root key node
        :return: the last visited node, or None if the hive could not be loaded
        """
        # load hive
        try:
            hive = RegistryHive(hive_local_path)
        except Exception as e:
            self.logger.warning("Failed to load hive %s", hive_win_path)
            self.logger.debug(e)
            return None
        root_node = WinRegKeyNode(root_hive, key=hive.root)
        last_node = None
        with WinRegMerkleVisitor(thread=True) as visitor:
            visitor.run_visit(root_node)
            for node in visitor.as_gen():
                if isinstance(node.return_value, WinRegKeyMerkleNode):
                    self.insert_from_visited_node_cypher(node.return_value)
                last_node = node
                # clear children to save RAM
                node.return_value.children.clear()
        return last_node

    def insert_from_visited_node_cypher(self, node: WinRegKeyMerkleNode):
        """Merge a key, its child values and its child keys into the database."""
        query = """
        MERGE (p:WinRegKey {hash: $parent_hash})
        WITH p
        FOREACH (cv IN $child_values |
            MERGE (v:WinRegValue {hash: cv.hash, value: cv.value, type: cv.type})
            MERGE (p)-[:HAS_CHILD {name: cv.name}]->(v)
        )
        WITH p
        FOREACH (ck IN $child_keys |
            MERGE (k:WinRegKey {hash: ck.hash})
            MERGE (p)-[:HAS_CHILD {name: ck.name}]->(k)
        )
        """
        child_values = []
        child_keys = []
        for child_name, child_node in node.children.items():
            if child_node.label == MerkleLabel.Blob:
                # note: Neo4j can store integer as signed 64 bits number
                # however the Windows registry can contain REG_QWORD values up to 2^64 - 1
                # so we need to ensure the value is cast to a string here
                child_values.append(
                    {
                        "name": child_name,
                        "hash": child_node.hash,
                        "value": str(child_node.value.value),
                        "type": child_node.value.value_type,
                    }
                )
            elif child_node.label == MerkleLabel.Tree:
                child_keys.append({"name": child_name, "hash": child_node.hash})
        params = {
            "parent_hash": node.hash,
            "child_values": child_values,
            "child_keys": child_keys,
        }
        self.neogit.db.cypher_query(query, params)

    def attach_root_key_to_blob(self, blob: Blob, root_node: WinRegKeyMerkleNode, root_name: str):
        """Link the hive file's Blob to its root WinRegKey with a HAS_WINREG relationship."""
        query = """
        MATCH (b:Blob {hash: $blob_hash})
        WITH b
        MATCH (k:WinRegKey {hash: $root_hash})
        WITH b, k
        MERGE (b)-[:HAS_WINREG {name: $name}]->(k)
        """
        params = {"blob_hash": blob.hash, "root_hash": root_node.hash, "name": root_name}
        self.neogit.db.cypher_query(query, params)
|
|
@@ -0,0 +1,475 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import functools
|
|
4
|
+
import hashlib
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
import tempfile
|
|
9
|
+
from binascii import hexlify
|
|
10
|
+
from contextlib import contextmanager, suppress
|
|
11
|
+
from enum import Enum, auto
|
|
12
|
+
from pathlib import Path, PurePath
|
|
13
|
+
from typing import Dict, Generator, Optional, Tuple
|
|
14
|
+
from urllib import parse as urllib_parse
|
|
15
|
+
from urllib import request as urllib_request
|
|
16
|
+
|
|
17
|
+
import lief
|
|
18
|
+
import pypeln as pl
|
|
19
|
+
from attrs import define, field
|
|
20
|
+
from neogit.core.merkle import MerkleVisitor
|
|
21
|
+
from neogit.core.model import MerkleLabel, MerkleNode, Node
|
|
22
|
+
from neogit.core.visitor import VisitedNode
|
|
23
|
+
from neogit.model.neo import Commit
|
|
24
|
+
from volatility3.framework.contexts import Context
|
|
25
|
+
from volatility3.framework.symbols.windows.pdbconv import PdbReader, PdbRetreiver
|
|
26
|
+
|
|
27
|
+
from plugins.plugins.symbols_repository import SymbolsRepository
|
|
28
|
+
from plugins.plugins.symbols_service import filter_valid_filenames, parse_symbols_from_json
|
|
29
|
+
from plugins.types import AbstractPlugin, UniqueConstraint
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@contextmanager
def temporary_file_context(path):
    """Yield ``path`` and delete that file when the ``with`` block exits.

    The file is removed whether the block finishes normally or raises.
    """
    try:
        yield path
    finally:
        # os.unlink is the same operation as os.remove
        os.unlink(path)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def return_exceptions(f):
    """Decorate a one-argument method so exceptions are returned, not raised.

    If the argument is already an exception it is passed through untouched
    and ``f`` is not called. Otherwise ``f`` is called and anything it raises
    is returned as a value.
    """

    @functools.wraps(f)
    def wrapped(self, x):
        if not isinstance(x, BaseException):
            try:
                x = f(self, x)
            except BaseException as e:
                x = e
        return x

    return wrapped
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# User Types (structs)
|
|
54
|
+
class FieldKindType(Enum):
    """Kinds of data type a field can have."""

    # explicit values, numbered consecutively from 1
    Base = 1
    Pointer = 2
    Function = 3
    Enum = 4
    Array = 5
    Struct = 6
    Union = 7
    Bitfield = 8
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class UserTypeKindType(Enum):
    """Kinds of user-defined type: struct, union or enum."""

    # explicit values, numbered consecutively from 1
    Struct = 1
    Union = 2
    Enum = 3
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@define(auto_attribs=True)
class StructNode(Node):
    """Node for one user type (struct, union or enum) described by ``struct_data``."""

    name: str
    struct_data: Dict
    # either struct, union or enum
    kind: UserTypeKindType = field(init=False)
    size: int = field(init=False)

    def __attrs_post_init__(self):
        """Derive ``size`` and ``kind`` from ``struct_data``."""
        data = self.struct_data
        self.size = data["size"]
        # an entry carrying "constants" is an enum; otherwise the kind is
        # looked up from its "kind" string
        self.kind = UserTypeKindType.Enum if "constants" in data else UserTypeKindType[data["kind"].capitalize()]

    def iter_child_nodes(self) -> Generator[Node, None, None]:
        """Yield one StructFieldNode per enum constant or per field."""
        if self.kind == UserTypeKindType.Enum:
            base_kind_name = FieldKindType.Base.name
            for name, value in self.struct_data["constants"].items():
                # an enum constant becomes an "int" field whose offset is the constant's value
                yield StructFieldNode(
                    name=name, field_data={"offset": value, "type": {"kind": base_kind_name, "name": "int"}}
                )
            return
        # iterate on every field
        for field_name, field_data in self.struct_data["fields"].items():
            yield StructFieldNode(name=field_name, field_data=field_data)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
@define(auto_attribs=True)
class StructFieldNode(Node):
    """Node for one field of a user type, described by ``field_data``."""

    name: str = field()
    field_data: Dict = field()
    offset: int = field(init=False)
    data_type: str = field(init=False)

    def __attrs_post_init__(self):
        """Derive ``offset`` and the JSON-encoded ``data_type`` from ``field_data``."""
        data = self.field_data
        self.offset = data["offset"]
        # The data type is stored directly on the field node as a JSON string;
        # it is not exposed as a child DataTypeNode.
        self.data_type = json.dumps(data["type"])
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
@define(auto_attribs=True)
|
|
119
|
+
class DataTypeNode(Node):
|
|
120
|
+
"""Represents basic data types
|
|
121
|
+
- name: unsigned long
|
|
122
|
+
- name: unsigned long long
|
|
123
|
+
- name: void
|
|
124
|
+
- name: int
|
|
125
|
+
- name: void
|
|
126
|
+
"""
|
|
127
|
+
|
|
128
|
+
data_type: Dict = field()
|
|
129
|
+
kind: FieldKindType = field(init=False)
|
|
130
|
+
# if base, enum, struct, union
|
|
131
|
+
name: Optional[str] = field(init=False)
|
|
132
|
+
# if array
|
|
133
|
+
array_counter: Optional[int] = field(init=False)
|
|
134
|
+
# if bitfield
|
|
135
|
+
bit_length: Optional[int] = field(init=False)
|
|
136
|
+
bit_position: Optional[int] = field(init=False)
|
|
137
|
+
# if pointer or bitfield or array
|
|
138
|
+
subtype: Optional[Dict] = field(init=False)
|
|
139
|
+
|
|
140
|
+
def __attrs_post_init__(self):
|
|
141
|
+
self.kind = FieldKindType[self.data_type["kind"].capitalize()]
|
|
142
|
+
match self.kind:
|
|
143
|
+
case FieldKindType.Base | FieldKindType.Enum | FieldKindType.Struct | FieldKindType.Union:
|
|
144
|
+
self.name = self.data_type["name"]
|
|
145
|
+
case FieldKindType.Array:
|
|
146
|
+
self.array_counter = self.data_type["count"]
|
|
147
|
+
self.subtype = self.data_type["subtype"]
|
|
148
|
+
case FieldKindType.Bitfield:
|
|
149
|
+
self.bit_length = self.data_type["bit_length"]
|
|
150
|
+
self.bit_position = self.data_type["bit_position"]
|
|
151
|
+
self.subtype = self.data_type["type"]
|
|
152
|
+
case FieldKindType.Pointer:
|
|
153
|
+
self.subtype = self.data_type["subtype"]
|
|
154
|
+
case FieldKindType.Function:
|
|
155
|
+
self.name = "function"
|
|
156
|
+
|
|
157
|
+
def iter_child_nodes(self) -> Generator[Node, None, None]:
|
|
158
|
+
match self.kind:
|
|
159
|
+
case FieldKindType.Array | FieldKindType.Bitfield | FieldKindType.Pointer:
|
|
160
|
+
yield DataTypeNode(data_type=self.subtype) # type: ignore[arg-type]
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
@define(auto_attribs=True)
class DataTypeMerkleNode(MerkleNode):
    """Merkle node for a data type; attributes that do not apply stay None."""

    kind: FieldKindType = field(kw_only=True)
    # set by the visitor for base, enum, struct, union and function kinds
    name: Optional[str] = field(kw_only=True, default=None)
    # set by the visitor for arrays
    array_counter: Optional[int] = field(kw_only=True, default=None)
    # set by the visitor for bitfields
    bit_length: Optional[int] = field(kw_only=True, default=None)
    bit_position: Optional[int] = field(kw_only=True, default=None)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
@define(auto_attribs=True)
class StructFieldMerkleNode(MerkleNode):
    """Merkle node for one field of a user type."""

    # all attributes are keyword-only
    name: str = field(kw_only=True)
    offset: int = field(kw_only=True)
    # JSON string copied from the StructFieldNode by the visitor
    data_type: str = field(kw_only=True)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
@define(auto_attribs=True)
class StructMerkleNode(MerkleNode):
    """Merkle node for a user type (struct, union or enum)."""

    # all attributes are keyword-only and copied from the StructNode by the visitor
    name: str = field(kw_only=True)
    size: int = field(kw_only=True)
    kind: UserTypeKindType = field(kw_only=True)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
# define the visitor
|
|
187
|
+
class SymbolsMerkleVisitor(MerkleVisitor):
|
|
188
|
+
|
|
189
|
+
def visit_DataTypeNode(self, node: DataTypeNode, hash_obj: hashlib._Hash, *args, **kwargs) -> VisitedNode:
|
|
190
|
+
match node.kind:
|
|
191
|
+
case (
|
|
192
|
+
FieldKindType.Base
|
|
193
|
+
| FieldKindType.Enum
|
|
194
|
+
| FieldKindType.Struct
|
|
195
|
+
| FieldKindType.Union
|
|
196
|
+
| FieldKindType.Function
|
|
197
|
+
):
|
|
198
|
+
hash_obj.update(f"{node.name}".encode())
|
|
199
|
+
merkle_node = DataTypeMerkleNode( # type: ignore[call-arg]
|
|
200
|
+
hash=hash_obj.hexdigest(), label=MerkleLabel.Blob, kind=node.kind, name=node.name
|
|
201
|
+
)
|
|
202
|
+
return VisitedNode(node, merkle_node)
|
|
203
|
+
case FieldKindType.Array:
|
|
204
|
+
subtype = next(node.iter_child_nodes())
|
|
205
|
+
visited_node = self.visit(subtype)
|
|
206
|
+
merkle_node = visited_node.return_value
|
|
207
|
+
data = f"{merkle_node.hash}-{node.array_counter}\n".encode()
|
|
208
|
+
hash_obj.update(data)
|
|
209
|
+
merkle_node = DataTypeMerkleNode( # type: ignore[call-arg]
|
|
210
|
+
hash=hash_obj.hexdigest(),
|
|
211
|
+
label=MerkleLabel.Blob,
|
|
212
|
+
kind=node.kind,
|
|
213
|
+
array_counter=node.array_counter,
|
|
214
|
+
children={merkle_node.name: merkle_node},
|
|
215
|
+
)
|
|
216
|
+
return VisitedNode(node, merkle_node)
|
|
217
|
+
case FieldKindType.Bitfield:
|
|
218
|
+
subtype = next(node.iter_child_nodes())
|
|
219
|
+
visited_node = self.visit(subtype)
|
|
220
|
+
merkle_node = visited_node.return_value
|
|
221
|
+
data = f"{merkle_node.hash}-{node.bit_length}-{node.bit_position}\n".encode()
|
|
222
|
+
hash_obj.update(data)
|
|
223
|
+
merkle_node = DataTypeMerkleNode( # type: ignore[call-arg]
|
|
224
|
+
hash=hash_obj.hexdigest(),
|
|
225
|
+
label=MerkleLabel.Blob,
|
|
226
|
+
kind=node.kind,
|
|
227
|
+
bit_length=node.bit_length,
|
|
228
|
+
bit_position=node.bit_position,
|
|
229
|
+
children={merkle_node.name: merkle_node},
|
|
230
|
+
)
|
|
231
|
+
return VisitedNode(node, merkle_node)
|
|
232
|
+
case FieldKindType.Pointer:
|
|
233
|
+
subtype = next(node.iter_child_nodes())
|
|
234
|
+
visited_node = self.visit(subtype)
|
|
235
|
+
merkle_node = visited_node.return_value
|
|
236
|
+
data = f"{merkle_node.hash}\n".encode()
|
|
237
|
+
hash_obj.update(data)
|
|
238
|
+
merkle_node = DataTypeMerkleNode( # type: ignore[call-arg]
|
|
239
|
+
hash=hash_obj.hexdigest(),
|
|
240
|
+
label=MerkleLabel.Blob,
|
|
241
|
+
kind=node.kind,
|
|
242
|
+
children={merkle_node.name: merkle_node},
|
|
243
|
+
)
|
|
244
|
+
return VisitedNode(node, merkle_node)
|
|
245
|
+
|
|
246
|
+
def visit_StructFieldNode(self, node: StructFieldNode, hash_obj: hashlib._Hash, *args, **kwargs) -> VisitedNode:
|
|
247
|
+
children: dict = {}
|
|
248
|
+
# for data_type in node.iter_child_nodes():
|
|
249
|
+
# visited_node = self.visit(data_type)
|
|
250
|
+
# merkle_node = visited_node.return_value
|
|
251
|
+
# data = f"{merkle_node.hash}\n".encode()
|
|
252
|
+
# hash_obj.update(data)
|
|
253
|
+
# children[merkle_node.hash] = merkle_node
|
|
254
|
+
# merklize the offset and the data type string
|
|
255
|
+
hash_obj.update(f"{node.offset}-{node.data_type}".encode())
|
|
256
|
+
merkle_node = StructFieldMerkleNode( # type: ignore[call-arg]
|
|
257
|
+
hash=hash_obj.hexdigest(),
|
|
258
|
+
label=MerkleLabel.Blob,
|
|
259
|
+
name=node.name,
|
|
260
|
+
offset=node.offset,
|
|
261
|
+
data_type=node.data_type,
|
|
262
|
+
children=children,
|
|
263
|
+
)
|
|
264
|
+
return VisitedNode(node, merkle_node)
|
|
265
|
+
|
|
266
|
+
def visit_StructNode(self, node: StructNode, hash_obj: hashlib._Hash, *args, **kwargs) -> VisitedNode:
|
|
267
|
+
children = {}
|
|
268
|
+
for member in node.iter_child_nodes():
|
|
269
|
+
visited_node = self.visit(member)
|
|
270
|
+
merkle_node = visited_node.return_value
|
|
271
|
+
"field_name-field_hash"
|
|
272
|
+
data = f"{member.name}{merkle_node.hash}\n".encode()
|
|
273
|
+
hash_obj.update(data)
|
|
274
|
+
children[member.name] = merkle_node
|
|
275
|
+
hash_obj.update(f"{node.size}-{node.kind.name}".encode())
|
|
276
|
+
merkle_node = StructMerkleNode( # type: ignore[call-arg]
|
|
277
|
+
hash=hash_obj.hexdigest(),
|
|
278
|
+
children=children,
|
|
279
|
+
label=MerkleLabel.Tree,
|
|
280
|
+
name=node.name,
|
|
281
|
+
size=node.size,
|
|
282
|
+
kind=node.kind,
|
|
283
|
+
)
|
|
284
|
+
return VisitedNode(node, merkle_node)
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def parse_code_view(pe_path):
    """Extract the CodeView debug information of a PE file.

    :param pe_path: path to the PE file
    :return: ``(guid, age, pdb_filename)`` for the first CodeView debug
        entry, or None if the file has no such entry
    :raises ValueError: if the file cannot be parsed
    """
    pe = lief.parse(pe_path)
    if pe is None:
        # lief.parse returns None on failure; fail with a clear message
        # rather than an AttributeError on ``pe.debug``
        raise ValueError(f"Unable to parse PE file: {pe_path}")
    for debug_dir in pe.debug:
        if debug_dir.type != lief.PE.Debug.TYPES.CODEVIEW:
            continue
        signature = debug_dir.signature
        # the first three groups (4, 2 and 2 bytes) are byte-reversed,
        # the remaining bytes are kept in order
        guid = "".join(
            (
                bytes(reversed(signature[:4])).hex(),
                bytes(reversed(signature[4:6])).hex(),
                bytes(reversed(signature[6:8])).hex(),
                bytes(signature[8:]).hex(),
            )
        )
        # NOTE(review): the age is masked to its low 4 bits -- confirm this is intended
        return guid, debug_dir.age & 0xF, debug_dir.filename
    return None
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def _pdb_progress_callback(percentage, description):
|
|
310
|
+
"""Progress callback for PDB downloads - must be module-level for multiprocessing pickling.
|
|
311
|
+
|
|
312
|
+
Args:
|
|
313
|
+
percentage: Progress percentage (0-100)
|
|
314
|
+
description: Description of current operation
|
|
315
|
+
"""
|
|
316
|
+
logging.debug(f"PDB download progress: {percentage: .1f}% - {description}")
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def retrieve_pdb(guid, age, pdb_name) -> str:
    """Retrieve a PDB and return its location as a URL string.

    :param guid: GUID string of the PDB
    :param age: age of the PDB; its decimal string is appended to ``guid``
    :param pdb_name: file name of the PDB
    :return: the retrieved location as-is when it carries a non-file URL
        scheme, otherwise a ``file:`` URL for the absolute local path
    :raises ValueError: if the retriever returns nothing
    :raises FileNotFoundError: if the returned local file does not exist
    """
    logging.debug("Retrieving PDB: %s - GUID: %s - Age: %s", pdb_name, guid, age)
    filename = PdbRetreiver().retreive_pdb(
        guid + str(age), file_name=pdb_name, progress_callback=_pdb_progress_callback
    )
    logging.debug("filename: %s", filename)
    if not filename:
        raise ValueError("PDB file could not be retrieved from the internet")
    url = urllib_parse.urlparse(filename, scheme="file")
    if url.scheme != "file":
        return filename
    local_path = Path(filename)
    if not local_path.exists():
        # previously this was only logged and a location for the missing
        # file was still returned
        raise FileNotFoundError(f"PDB file {filename} does not exist")
    return "file:" + urllib_request.pathname2url(str(local_path.absolute()))
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
@define(auto_attribs=True)
class SymbolsPlugin(AbstractPlugin):
    """Plugin extracting symbols and user types from the PDBs of selected PE files."""

    # os.cpu_count() may return None; fall back to a single worker
    max_workers: int = field(init=False, default=os.cpu_count() or 1)
    _repository: Optional[SymbolsRepository] = field(init=False, default=None)

    PE_MIME_TYPE = "application/vnd.microsoft.portable-executable"
    # only process these filenames for now
    FILTER_FILENAME = [
        "ntoskrnl.exe",
        "ntdll.dll",
        "kernel32.dll",
        "win32kfull.sys",
        "win32kbase.sys",
        "win32k.sys",
    ]

    @property
    def repository(self) -> SymbolsRepository:
        """Lazy-initialize repository."""
        if self._repository is None:
            self._repository = SymbolsRepository(self.neogit)
        return self._repository

    def constraints_data(self) -> list[UniqueConstraint]:
        """Declare ``hash`` as unique for the Symbol, Struct, StructField and DataType labels."""
        return [
            UniqueConstraint(label="Symbol", property_list=["hash"]),
            UniqueConstraint(label="Struct", property_list=["hash"]),
            UniqueConstraint(label="StructField", property_list=["hash"]),
            UniqueConstraint(label="DataType", property_list=["hash"]),
        ]

    def run(self, commit: Commit):
        """Process the PDB of every matching PE blob of the commit.

        Blobs go through two pipeline stages (CodeView parsing, then PDB
        retrieval and conversion to a JSON temp file). Stage failures arrive
        here as exception objects and are logged and skipped.
        """
        # identify every PE file Blob
        fs = commit.filesystem.single()

        # Use repository to query PE blobs
        all_blobs = self.repository.query_pe_blobs(fs.hash, self.PE_MIME_TYPE)
        self.logger.info("Found %d PE blobs for commit %s", len(all_blobs), commit.hash)
        blob_results = filter_valid_filenames(all_blobs, self.FILTER_FILENAME)
        if not blob_results:
            self.logger.info(
                "No matching PE blobs after filename filter %s for commit %s",
                self.FILTER_FILENAME,
                commit.hash,
            )
            return
        stage = pl.process.map(self.stage_parse_code_view, blob_results, workers=4)
        stage = pl.process.map(self.stage_process_pdb, stage, workers=self.max_workers, maxsize=self.max_workers)
        for ret in stage:
            if isinstance(ret, BaseException):
                self.logger.error("Failed to process PDB: %s: %s", type(ret).__name__, ret)
                continue
            blob_hash, pdb_name, tmp_file_path = ret
            try:
                with open(tmp_file_path, "r") as f:
                    j_data = json.load(f)
                self.parse_pdb_json(blob_hash, pdb_name, j_data)
            finally:
                # the JSON temp file is owned by this loop once the stage returned it
                with suppress(FileNotFoundError):
                    os.remove(tmp_file_path)

    @return_exceptions
    def stage_parse_code_view(self, blob_result: Tuple[PurePath, str]) -> Tuple[str, str, int, str]:
        """Download a PE blob and read its CodeView information.

        :param blob_result: ``(path, blob_hash)`` of the PE file
        :return: ``(blob_hash, guid, age, pdb_name)``
        :raises ValueError: if no CodeView information is found
        """
        self.logger.info("Processing PE file: %s", blob_result[0])
        blob_hash = blob_result[1]
        with self.downloaded_file(blob_hash) as local_file:
            ret = parse_code_view(local_file)
        if not ret:
            raise ValueError("No CodeView found")
        guid, age, pdb_name = ret
        self.logger.info("Path: %s - PDB: %s - GUID: %s (%s)", blob_result[0], pdb_name, guid, age)
        return blob_hash, *ret

    @return_exceptions
    def stage_process_pdb(self, arg) -> Tuple[str, str, Path]:
        """Retrieve a PDB, convert it to JSON and dump the JSON to a temp file.

        :param arg: ``(blob_hash, guid, age, pdb_name)`` from the previous stage
        :return: ``(blob_hash, pdb_name, path of the JSON temp file)``; the
            caller is responsible for deleting the file
        :raises ValueError: if the PDB cannot be retrieved or parsed
        """
        blob_hash, guid, age, pdb_name = arg
        try:
            location = retrieve_pdb(guid, age, pdb_name)
        except Exception as e:
            self.logger.error("Failed to retrieve PDB for blob %s: %s", blob_hash, e)
            raise ValueError(f"Failed to retrieve PDB {pdb_name} on {blob_hash}") from e
        self.logger.debug("PDB: %s - location: %s", pdb_name, location)
        ctx = Context()
        try:
            j_data = PdbReader(ctx, location).get_json()
        except Exception as e:
            raise ValueError(f"Failed to parse PDB {pdb_name} on {blob_hash}") from e
        tmp_file = tempfile.NamedTemporaryFile(delete=False, mode="w+")
        try:
            with tmp_file:
                json.dump(j_data, tmp_file)
                tmp_file.flush()
        except BaseException:
            # delete=False means nobody else cleans up: remove the partial
            # file so it does not leak when dumping fails
            with suppress(FileNotFoundError):
                os.remove(tmp_file.name)
            raise
        self.logger.debug("PDB: %s - JSON file: %s", pdb_name, tmp_file.name)
        return blob_hash, pdb_name, Path(tmp_file.name)

    def parse_pdb_json(self, blob_hash: str, pdb_name: str, j_pdb: Dict):
        """Insert the enums, symbols and user types of a PDB JSON document."""
        self.logger.debug("PDB: %s - Parsing JSON", pdb_name)
        count_enum = self.parse_users_types(blob_hash, j_pdb["enums"])
        count_syms = self.insert_symbols(blob_hash, j_pdb["symbols"])
        count_types = self.parse_users_types(blob_hash, j_pdb["user_types"])
        self.logger.info(
            "PDB: %s - Inserted %d enums, %d symbols, %d user types", pdb_name, count_enum, count_syms, count_types
        )

    def parse_users_types(self, blob_hash: str, j_pdb: Dict) -> int:
        """Merklize and insert every user type of ``j_pdb``.

        :param j_pdb: mapping from type name to type description
        :return: number of entries in ``j_pdb``
        """
        with SymbolsMerkleVisitor(thread=True) as visitor:
            for struct_name, struct_data in sorted(j_pdb.items()):
                self.logger.debug("Struct: %s", struct_name)
                struct_node = StructNode(name=struct_name, struct_data=struct_data)
                visitor.run_visit(struct_node)
                for node in visitor.as_gen():
                    merkle_node = node.return_value
                    if isinstance(merkle_node, DataTypeMerkleNode):
                        self.repository.insert_data_type(merkle_node)
                    if isinstance(merkle_node, StructMerkleNode):
                        unwind_param = [
                            {
                                "hash": child_node.hash,
                                "name": child_name,
                                "offset": child_node.offset,
                                "data_type": child_node.data_type,
                            }
                            for child_name, child_node in merkle_node.children.items()
                        ]
                        self.repository.insert_struct(blob_hash, merkle_node, unwind_param)

        return len(j_pdb)

    def insert_symbols(self, blob_hash: str, symbols: Dict) -> int:
        """Parse and insert the symbols of a PDB.

        :return: number of entries in ``symbols``
        """
        # Parse symbols using pure function
        parsed_symbols = parse_symbols_from_json(symbols)

        # Convert to param format for Neo4j
        param_list = []
        for symbol in parsed_symbols:
            param_list.append({"hash": symbol["hash"], "sym_name": symbol["name"], "address": symbol["address"]})
            self.logger.debug("Symbol %s (%s)", symbol["name"], symbol["address"])

        # Use repository to insert symbols
        self.repository.insert_symbols(blob_hash, param_list)
        return len(symbols)
|