oswatcher-plugins 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,224 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ from pathlib import Path, PurePath
5
+ from typing import Dict, Iterator, List
6
+
7
+ from attrs import define, field
8
+ from neogit.core.merkle import MerkleVisitor
9
+ from neogit.core.model import MerkleLabel, MerkleNode, Node
10
+ from neogit.core.visitor import VisitedNode
11
+ from neogit.model.merkle import Blob
12
+ from neogit.model.neo import Commit, Tree
13
+ from regipy import NKRecord, RegistryHive, Subkey, Value
14
+
15
+ from plugins.types import AbstractPlugin, UniqueConstraint
16
+
#: Directory holding the system-wide registry hive files.
S32_CONFIG = PurePath("/Windows/System32/config")

#: HKEY_LOCAL_MACHINE root key path.
HKLM = PurePath("HKEY_LOCAL_MACHINE")
#: HKEY_USERS root key path.
HKU = PurePath("HKEY_USERS")
#: Registry key name of the BCD mount point in HKLM.
BCD_MOUNT_NAME = "BCD00000000"

#: Mapping from a hive file path to the registry key path it is mounted at.
#: NOTE(review): the previous note claimed PureWindowsPath is used for
#: case-insensitive matching, but the keys are plain PurePath objects, whose
#: comparison is case-sensitive on POSIX hosts -- confirm which is intended.
HIVE_MAPPING: Dict[PurePath, PurePath] = {
    # HKLM hives
    S32_CONFIG / "SAM": HKLM / "SAM",
    S32_CONFIG / "SECURITY": HKLM / "SECURITY",
    S32_CONFIG / "SOFTWARE": HKLM / "SOFTWARE",
    S32_CONFIG / "SYSTEM": HKLM / "SYSTEM",
    # BCD
    # Bios boot
    PurePath("/boot/BCD"): HKLM / BCD_MOUNT_NAME,
    # EFI boot
    PurePath("/EFI/Microsoft/Boot/BCD"): HKLM / BCD_MOUNT_NAME,
    # HKEY_USERS hives
    S32_CONFIG / "DEFAULT": HKU / ".Default",
}
43
+
44
+
45
@define(auto_attribs=True)
class CommonWinRegNode(Node):
    """Base node shared by registry key and value nodes.

    Stores the path of the containing key and derives the entry's full path
    from it; subclasses supply ``name``.
    """

    # path under which this entry lives (child nodes are created with their
    # parent's ``fullpath``)
    path: PurePath = field()

    @property
    def name(self) -> str:
        """Name of the entry; must be overridden by subclasses."""
        raise NotImplementedError

    @property
    def fullpath(self) -> PurePath:
        """Return ``path`` joined with ``name``."""
        return self.path / self.name
56
+
57
+
58
@define(auto_attribs=True)
class WinRegValueNode(CommonWinRegNode):
    """Node wrapping a single registry value."""

    # the wrapped registry value object
    value: Value = field()

    @property
    def name(self) -> str:
        """Return the name of the wrapped value."""
        return self.value.name
65
+
66
+
67
@define(auto_attribs=True)
class WinRegKeyNode(CommonWinRegNode):
    """Node wrapping a registry key record; its children are sub-keys and values."""

    # the wrapped registry key record
    key: NKRecord = field()

    @property
    def name(self) -> str:
        """Return the name of the wrapped key."""
        return self.key.name

    def iter_child_nodes(self) -> Iterator[Node]:
        """Yield a node for every sub-key, then a node for every value.

        Each child is created with this key's full path as its parent path.
        """
        parent_path = self.fullpath
        yield from (WinRegKeyNode(parent_path, record) for record in self.key.iter_subkeys())
        yield from (WinRegValueNode(parent_path, entry) for entry in self.key.iter_values())
80
+
81
+
82
@define(auto_attribs=True)
class WinRegValueMerkleNode(MerkleNode):
    """Merkle node for a registry value; keeps the value object alongside the hash."""

    # keyword-only: the registry value this Merkle node was computed from
    value: Value = field(kw_only=True)
85
+
86
+
87
@define(auto_attribs=True)
class WinRegKeyMerkleNode(MerkleNode):
    """Merkle node for a registry key; keeps the key object alongside the hash."""

    # keyword-only: the registry key this Merkle node was computed from
    # NOTE(review): the visitor passes WinRegKeyNode.key, which is annotated
    # as NKRecord there -- confirm whether Subkey is the intended annotation.
    key: Subkey = field(kw_only=True)
90
+
91
+
92
class WinRegMerkleVisitor(MerkleVisitor):
    """Merkle visitor that hashes registry value and key nodes."""

    def visit_WinRegValueNode(self, node: WinRegValueNode, hash_obj: hashlib._Hash, *args, **kwargs) -> VisitedNode:
        """Hash a value node from its name, data and type, and wrap it as a Blob."""
        reg_value = node.value
        digest_input = f"{reg_value.name}{reg_value.value}{reg_value.value_type}"
        hash_obj.update(digest_input.encode())
        value_merkle = WinRegValueMerkleNode(  # type: ignore[call-arg]
            hash=hash_obj.hexdigest(), label=MerkleLabel.Blob, value=reg_value
        )
        return VisitedNode(node, value_merkle)

    def visit_WinRegKeyNode(self, node: WinRegKeyNode, hash_obj: hashlib._Hash, *args, **kwargs) -> VisitedNode:
        """Hash a key node from the names and hashes of its children, and wrap it as a Tree."""
        self.logger.debug("Visiting Key %s", node.fullpath)

        def ordering(entry):
            # False sorts before True, so keys come first, values second,
            # and entries of the same sort are ordered by name
            return (not isinstance(entry, WinRegKeyNode), entry.name)

        children_by_name = {}
        for child in sorted(node.iter_child_nodes(), key=ordering):
            child_merkle = self.visit(child).return_value
            hash_obj.update(f"{child.name}{child_merkle.hash}\n".encode())
            # TODO: clearing child_merkle.children here to save memory breaks
            # the plugin: the node apparently has no children left when it is
            # about to be inserted
            # child_merkle.children.clear()
            children_by_name[child.name] = child_merkle

        # all children have been folded in: compute the final hash
        key_merkle = WinRegKeyMerkleNode(  # type: ignore[call-arg]
            hash=hash_obj.hexdigest(), children=children_by_name, label=MerkleLabel.Tree, key=node.key
        )
        return VisitedNode(node, key_merkle)
123
+
124
+
125
@define(auto_attribs=True)
class WinRegistryPlugin(AbstractPlugin):
    """Plugin that dumps the registry hives listed in HIVE_MAPPING into the graph database."""

    def constraints_data(self) -> List[UniqueConstraint]:
        """Return uniqueness constraints on ``hash`` for the WinRegKey and WinRegValue labels."""
        return [
            UniqueConstraint(label="WinRegKey", property_list=["hash"]),
            UniqueConstraint(label="WinRegValue", property_list=["hash"]),
        ]

    def run(self, commit: Commit):
        """Dump every hive of HIVE_MAPPING found in the commit's filesystem.

        For each hive path, the blob is looked up, downloaded, dumped, and the
        resulting root key is linked to the blob. A ``FileNotFoundError``
        raised anywhere in that sequence is logged as a warning and the next
        hive is processed.

        :param commit: commit whose filesystem tree is inspected
        """
        fs: Tree = commit.filesystem.single()
        for hive_path, root_hive in HIVE_MAPPING.items():
            try:
                blob = fs.get_blob_at_path(hive_path)
                with self.downloaded_file(blob.hash) as hive_local_path:
                    self.logger.info("Dumping %s", hive_path)
                    node = self.dump_hive(hive_path, hive_local_path, root_hive)
                    if node is not None:
                        # attach Key to blob
                        self.attach_root_key_to_blob(blob, node.return_value, root_hive.name)
            except FileNotFoundError:
                self.logger.warning("Not found: %s", hive_path)

    def dump_hive(self, hive_win_path: PurePath, hive_local_path: Path, root_hive: PurePath):
        """Dump a Windows registry hive

        :param hive_win_path: path of the hive inside the guest filesystem (used for logging)
        :param hive_local_path: host path to the download hive file
        :param root_hive: registry path used as parent path of the hive's root key
        :return: the last visited node whose Merkle node is a WinRegKeyMerkleNode,
            or None when the hive could not be loaded or no key node was produced
        """
        # load hive
        try:
            hive = RegistryHive(hive_local_path)
        except Exception as e:
            # best effort: an unreadable hive is reported and skipped
            self.logger.warning("Failed to load hive %s", hive_win_path)
            self.logger.debug(e)
            return
        root_node = WinRegKeyNode(root_hive, key=hive.root)
        with WinRegMerkleVisitor(thread=True) as visitor:
            visitor.run_visit(root_node)
            last_node = None
            for node in visitor.as_gen():
                if isinstance(node.return_value, WinRegKeyMerkleNode):
                    self.insert_from_visited_node_cypher(node.return_value)
                    # presumably the root key is yielded last -- TODO confirm
                    # against the visitor's ordering
                    last_node = node
                    # clear children to save RAM
                    node.return_value.children.clear()
        return last_node

    def insert_from_visited_node_cypher(self, node: WinRegKeyMerkleNode):
        """Merge a key, its child values and its child keys into the database.

        Children are split on their Merkle label: Blob children are inserted
        as WinRegValue nodes, Tree children as WinRegKey nodes, each linked to
        the parent by a HAS_CHILD relationship carrying the child's name.

        :param node: Merkle node of the key to insert
        """
        query = """
            MERGE (p:WinRegKey {hash: $parent_hash})
            WITH p
            FOREACH (cv IN $child_values |
                MERGE (v:WinRegValue {hash: cv.hash, value: cv.value, type: cv.type})
                MERGE (p)-[:HAS_CHILD {name: cv.name}]->(v)
            )
            WITH p
            FOREACH (ck IN $child_keys |
                MERGE (k:WinRegKey {hash: ck.hash})
                MERGE (p)-[:HAS_CHILD {name: ck.name}]->(k)
            )
        """
        # note: Neo4j can store integer as signed 64 bits number
        # however the Windows registry can contain REG_QWORD values up to 2^64 - 1
        # so we need to ensure the value is casted as a string here
        child_values = [
            {
                "name": child_name,
                "hash": child_node.hash,
                "value": str(child_node.value.value),
                "type": child_node.value.value_type,
            }
            for child_name, child_node in node.children.items()
            if child_node.label == MerkleLabel.Blob
        ]
        child_keys = [
            {
                "name": child_name,
                "hash": child_node.hash,
            }
            for child_name, child_node in node.children.items()
            if child_node.label == MerkleLabel.Tree
        ]
        self.neogit.db.cypher_query(
            query,
            {
                "parent_hash": node.hash,
                "child_values": child_values,
                "child_keys": child_keys,
            },
        )

    def attach_root_key_to_blob(self, blob: Blob, root_node: WinRegKeyMerkleNode, root_name: str):
        """Link a hive file blob to its root registry key with a HAS_WINREG relationship.

        Both nodes are matched by hash, so nothing is created when either is missing.

        :param blob: blob of the hive file
        :param root_node: Merkle node of the hive's root key
        :param root_name: value stored in the relationship's ``name`` property
        """
        query = """
            MATCH (b:Blob {hash: $blob_hash})
            WITH b
            MATCH (k:WinRegKey {hash: $root_hash})
            WITH b, k
            MERGE (b)-[:HAS_WINREG {name: $name}]->(k)
        """
        self.neogit.db.cypher_query(query, {"blob_hash": blob.hash, "root_hash": root_node.hash, "name": root_name})
@@ -0,0 +1,475 @@
1
+ from __future__ import annotations
2
+
3
+ import functools
4
+ import hashlib
5
+ import json
6
+ import logging
7
+ import os
8
+ import tempfile
9
+ from binascii import hexlify
10
+ from contextlib import contextmanager, suppress
11
+ from enum import Enum, auto
12
+ from pathlib import Path, PurePath
13
+ from typing import Dict, Generator, Optional, Tuple
14
+ from urllib import parse as urllib_parse
15
+ from urllib import request as urllib_request
16
+
17
+ import lief
18
+ import pypeln as pl
19
+ from attrs import define, field
20
+ from neogit.core.merkle import MerkleVisitor
21
+ from neogit.core.model import MerkleLabel, MerkleNode, Node
22
+ from neogit.core.visitor import VisitedNode
23
+ from neogit.model.neo import Commit
24
+ from volatility3.framework.contexts import Context
25
+ from volatility3.framework.symbols.windows.pdbconv import PdbReader, PdbRetreiver
26
+
27
+ from plugins.plugins.symbols_repository import SymbolsRepository
28
+ from plugins.plugins.symbols_service import filter_valid_filenames, parse_symbols_from_json
29
+ from plugins.types import AbstractPlugin, UniqueConstraint
30
+
31
+
32
@contextmanager
def temporary_file_context(path):
    """Yield *path* and delete the file at that path when the block exits.

    The file is removed whether the block finishes normally or raises.
    A file that no longer exists at exit time is ignored, so the cleanup
    never replaces an exception raised inside the block.

    :param path: path of the file to delete on exit
    """
    try:
        yield path
    finally:
        # the block may already have moved or deleted the file
        with suppress(FileNotFoundError):
            os.remove(path)
38
+
39
+
40
def return_exceptions(f):
    """Decorate a one-argument method so that it returns exceptions instead of raising.

    If the argument is itself an exception it is returned untouched and the
    method is not called; otherwise any exception raised by the method is
    returned as its result.
    """

    @functools.wraps(f)
    def wrapped(self, x):
        outcome = x
        if not isinstance(outcome, BaseException):
            try:
                outcome = f(self, x)
            except BaseException as error:
                outcome = error
        return outcome

    return wrapped
51
+
52
+
53
+ # User Types (structs)
54
class FieldKindType(Enum):
    """Kinds a field's data type can have; members are looked up by capitalized name."""

    Base = 1
    Pointer = 2
    Function = 3
    Enum = 4
    Array = 5
    Struct = 6
    Union = 7
    Bitfield = 8
63
+
64
+
65
class UserTypeKindType(Enum):
    """Kinds of user-defined type: struct, union or enum."""

    Struct = 1
    Union = 2
    Enum = 3
69
+
70
+
71
@define(auto_attribs=True)
class StructNode(Node):
    """Node for one user type (struct, union or enum) described by a dict.

    ``size`` and ``kind`` are derived from ``struct_data`` after construction.
    """

    name: str
    struct_data: Dict
    # either struct, union or enum
    kind: UserTypeKindType = field(init=False)
    size: int = field(init=False)

    def __attrs_post_init__(self):
        """Fill in ``size`` and ``kind`` from ``struct_data``."""
        description = self.struct_data
        self.size = description["size"]
        # a "constants" table marks an enum; otherwise the "kind" entry decides
        is_enum = "constants" in description
        self.kind = UserTypeKindType.Enum if is_enum else UserTypeKindType[description["kind"].capitalize()]

    def iter_child_nodes(self) -> Generator[Node, None, None]:
        """Yield one StructFieldNode per enum constant or per struct/union field."""
        if self.kind != UserTypeKindType.Enum:
            # struct or union: one node per declared field
            for member_name, member_data in self.struct_data["fields"].items():
                yield StructFieldNode(name=member_name, field_data=member_data)
            return
        # enum: each constant becomes a field whose offset is the constant's
        # value and whose type is a base "int"
        for constant_name, constant_value in self.struct_data["constants"].items():
            int_type = {"kind": FieldKindType.Base.name, "name": "int"}
            yield StructFieldNode(name=constant_name, field_data={"offset": constant_value, "type": int_type})
99
+
100
+
101
@define(auto_attribs=True)
class StructFieldNode(Node):
    """Node for one field of a user type.

    ``offset`` and ``data_type`` are derived from ``field_data`` after
    construction; the data type is kept as a JSON string.
    """

    name: str = field()
    # raw field description; must contain "offset" and "type"
    field_data: Dict = field()
    offset: int = field(init=False)
    # JSON dump of field_data["type"]
    data_type: str = field(init=False)

    def __attrs_post_init__(self):
        """Fill in ``offset`` and ``data_type`` from ``field_data``."""
        self.offset = self.field_data["offset"]
        self.data_type = json.dumps(self.field_data["type"])

    # store data type directly in the field node
    # def iter_child_nodes(self) -> Generator[Node, None, None]:
    #     # construct the subtype node
    #     yield DataTypeNode(data_type=self.field_data["type"])
116
+
117
+
118
+ @define(auto_attribs=True)
119
+ class DataTypeNode(Node):
120
+ """Represents basic data types
121
+ - name: unsigned long
122
+ - name: unsigned long long
123
+ - name: void
124
+ - name: int
125
+ - name: void
126
+ """
127
+
128
+ data_type: Dict = field()
129
+ kind: FieldKindType = field(init=False)
130
+ # if base, enum, struct, union
131
+ name: Optional[str] = field(init=False)
132
+ # if array
133
+ array_counter: Optional[int] = field(init=False)
134
+ # if bitfield
135
+ bit_length: Optional[int] = field(init=False)
136
+ bit_position: Optional[int] = field(init=False)
137
+ # if pointer or bitfield or array
138
+ subtype: Optional[Dict] = field(init=False)
139
+
140
+ def __attrs_post_init__(self):
141
+ self.kind = FieldKindType[self.data_type["kind"].capitalize()]
142
+ match self.kind:
143
+ case FieldKindType.Base | FieldKindType.Enum | FieldKindType.Struct | FieldKindType.Union:
144
+ self.name = self.data_type["name"]
145
+ case FieldKindType.Array:
146
+ self.array_counter = self.data_type["count"]
147
+ self.subtype = self.data_type["subtype"]
148
+ case FieldKindType.Bitfield:
149
+ self.bit_length = self.data_type["bit_length"]
150
+ self.bit_position = self.data_type["bit_position"]
151
+ self.subtype = self.data_type["type"]
152
+ case FieldKindType.Pointer:
153
+ self.subtype = self.data_type["subtype"]
154
+ case FieldKindType.Function:
155
+ self.name = "function"
156
+
157
+ def iter_child_nodes(self) -> Generator[Node, None, None]:
158
+ match self.kind:
159
+ case FieldKindType.Array | FieldKindType.Bitfield | FieldKindType.Pointer:
160
+ yield DataTypeNode(data_type=self.subtype) # type: ignore[arg-type]
161
+
162
+
163
@define(auto_attribs=True)
class DataTypeMerkleNode(MerkleNode):
    """Merkle node for a data type; all extra attributes are keyword-only."""

    kind: FieldKindType = field(kw_only=True)
    # the remaining attributes default to None and are passed by the visitor
    # only for the kinds that use them
    name: Optional[str] = field(default=None, kw_only=True)
    array_counter: Optional[int] = field(default=None, kw_only=True)
    bit_length: Optional[int] = field(default=None, kw_only=True)
    bit_position: Optional[int] = field(default=None, kw_only=True)
170
+
171
+
172
@define(auto_attribs=True)
class StructFieldMerkleNode(MerkleNode):
    """Merkle node for a struct field; all extra attributes are keyword-only."""

    name: str = field(kw_only=True)
    offset: int = field(kw_only=True)
    # the visitor passes StructFieldNode.data_type, a JSON string
    data_type: str = field(kw_only=True)
177
+
178
+
179
@define(auto_attribs=True)
class StructMerkleNode(MerkleNode):
    """Merkle node for a user type; all extra attributes are keyword-only."""

    name: str = field(kw_only=True)
    size: int = field(kw_only=True)
    kind: UserTypeKindType = field(kw_only=True)
184
+
185
+
186
+ # define the visitor
187
class SymbolsMerkleVisitor(MerkleVisitor):
    """Merkle visitor that hashes data type, struct field and struct nodes."""

    def visit_DataTypeNode(self, node: DataTypeNode, hash_obj: hashlib._Hash, *args, **kwargs) -> VisitedNode:
        """Hash a data type node.

        Named kinds are hashed from their name alone. Arrays, bitfields and
        pointers first visit their subtype, then hash the subtype's hash
        together with their own parameters and keep the subtype as only child.
        """
        kind = node.kind
        named_kinds = (
            FieldKindType.Base,
            FieldKindType.Enum,
            FieldKindType.Struct,
            FieldKindType.Union,
            FieldKindType.Function,
        )
        if kind in named_kinds:
            hash_obj.update(f"{node.name}".encode())
            named_merkle = DataTypeMerkleNode(  # type: ignore[call-arg]
                hash=hash_obj.hexdigest(), label=MerkleLabel.Blob, kind=kind, name=node.name
            )
            return VisitedNode(node, named_merkle)

        # every remaining FieldKindType member (Array, Bitfield, Pointer)
        # wraps exactly one subtype, which is visited first
        inner = self.visit(next(node.iter_child_nodes())).return_value
        if kind == FieldKindType.Array:
            payload = f"{inner.hash}-{node.array_counter}\n"
            extra = {"array_counter": node.array_counter}
        elif kind == FieldKindType.Bitfield:
            payload = f"{inner.hash}-{node.bit_length}-{node.bit_position}\n"
            extra = {"bit_length": node.bit_length, "bit_position": node.bit_position}
        else:
            # pointer: nothing but the subtype hash
            payload = f"{inner.hash}\n"
            extra = {}
        hash_obj.update(payload.encode())
        wrapper_merkle = DataTypeMerkleNode(  # type: ignore[call-arg]
            hash=hash_obj.hexdigest(),
            label=MerkleLabel.Blob,
            kind=kind,
            children={inner.name: inner},
            **extra,
        )
        return VisitedNode(node, wrapper_merkle)

    def visit_StructFieldNode(self, node: StructFieldNode, hash_obj: hashlib._Hash, *args, **kwargs) -> VisitedNode:
        """Hash a struct field from its offset and its data type string.

        The data type is stored on the field itself, so the resulting Merkle
        node has no children.
        """
        no_children: dict = {}
        hash_obj.update(f"{node.offset}-{node.data_type}".encode())
        field_merkle = StructFieldMerkleNode(  # type: ignore[call-arg]
            hash=hash_obj.hexdigest(),
            label=MerkleLabel.Blob,
            name=node.name,
            offset=node.offset,
            data_type=node.data_type,
            children=no_children,
        )
        return VisitedNode(node, field_merkle)

    def visit_StructNode(self, node: StructNode, hash_obj: hashlib._Hash, *args, **kwargs) -> VisitedNode:
        """Hash a user type from its members, then its size and kind."""
        members = {}
        for member in node.iter_child_nodes():
            member_merkle = self.visit(member).return_value
            # each member contributes "<field name><field hash>\n"
            hash_obj.update(f"{member.name}{member_merkle.hash}\n".encode())
            members[member.name] = member_merkle
        hash_obj.update(f"{node.size}-{node.kind.name}".encode())
        struct_merkle = StructMerkleNode(  # type: ignore[call-arg]
            hash=hash_obj.hexdigest(),
            children=members,
            label=MerkleLabel.Tree,
            name=node.name,
            size=node.size,
            kind=node.kind,
        )
        return VisitedNode(node, struct_merkle)
285
+
286
+
287
def parse_code_view(pe_path):
    """Extract the PDB identity from the first CodeView debug entry of a PE file.

    :param pe_path: path of the PE file to parse
    :return: ``(guid, age, filename)`` of the first CodeView entry, where
        ``guid`` is a lowercase hex string, or None when there is no such entry
    """
    binary = lief.parse(pe_path)
    for entry in binary.debug:
        if entry.type != lief.PE.Debug.TYPES.CODEVIEW:
            continue
        signature = entry.signature
        # the first three groups of the signature are byte-reversed,
        # the remaining bytes are kept in order
        groups = [
            bytearray(reversed(signature[:4])),
            bytearray(reversed(signature[4:6])),
            bytearray(reversed(signature[6:8])),
            bytearray(signature[8:]),
        ]
        guid = "".join(group.hex() for group in groups)
        # NOTE(review): only the low 4 bits of the age are kept -- confirm
        # this mask is intended
        return guid, entry.age & 0xF, entry.filename
    return None
307
+
308
+
309
+ def _pdb_progress_callback(percentage, description):
310
+ """Progress callback for PDB downloads - must be module-level for multiprocessing pickling.
311
+
312
+ Args:
313
+ percentage: Progress percentage (0-100)
314
+ description: Description of current operation
315
+ """
316
+ logging.debug(f"PDB download progress: {percentage: .1f}% - {description}")
317
+
318
+
319
def retrieve_pdb(guid, age, pdb_name) -> str:
    """Retrieve a PDB through ``PdbRetreiver`` and return its location.

    :param guid: GUID string of the PDB
    :param age: age of the PDB, appended to the GUID as a decimal string
    :param pdb_name: file name of the PDB
    :return: a ``file:`` URL when the retriever returned a plain path or a
        file URL, otherwise the retriever's result unchanged
    :raises ValueError: when the retriever returns nothing
    """
    logging.debug("Retrieving PDB: %s - GUID: %s - Age: %s", pdb_name, guid, age)
    retriever = PdbRetreiver()
    filename = retriever.retreive_pdb(guid + str(age), file_name=pdb_name, progress_callback=_pdb_progress_callback)
    logging.debug("filename: %s", filename)
    if not filename:
        raise ValueError("PDB file could not be retrieved from the internet")
    parsed = urllib_parse.urlparse(filename, scheme="file")
    if parsed.scheme != "file":
        # already a non-file URL: hand it back untouched
        return filename
    local_path = Path(filename)
    if not local_path.exists():
        # only reported; the location is still built and returned
        logging.error(f"File {filename} does not exists")
    return "file:" + urllib_request.pathname2url(str(local_path.absolute()))
335
+
336
+
337
@define(auto_attribs=True)
class SymbolsPlugin(AbstractPlugin):
    """Plugin that extracts PDB symbols and user types for selected PE files of a commit."""

    # os.cpu_count() may return None; fall back to a single worker
    max_workers: int = field(init=False, default=os.cpu_count() or 1)
    _repository: Optional[SymbolsRepository] = field(init=False, default=None)

    PE_MIME_TYPE = "application/vnd.microsoft.portable-executable"
    # only process these filenames for now
    FILTER_FILENAME = [
        "ntoskrnl.exe",
        "ntdll.dll",
        "kernel32.dll",
        "win32kfull.sys",
        "win32kbase.sys",
        "win32k.sys",
    ]

    @property
    def repository(self) -> SymbolsRepository:
        """Lazy-initialize repository."""
        if self._repository is None:
            self._repository = SymbolsRepository(self.neogit)
        return self._repository

    def constraints_data(self) -> list[UniqueConstraint]:
        """Return uniqueness constraints on ``hash`` for the labels this plugin inserts."""
        return [
            UniqueConstraint(label="Symbol", property_list=["hash"]),
            UniqueConstraint(label="Struct", property_list=["hash"]),
            UniqueConstraint(label="StructField", property_list=["hash"]),
            UniqueConstraint(label="DataType", property_list=["hash"]),
        ]

    def run(self, commit: Commit):
        """Process the PDB of every matching PE blob of the commit.

        The PE blobs are filtered by file name, then piped through two
        process stages (CodeView parsing, PDB retrieval and conversion).
        Stage failures arrive as exception objects and are logged; each
        successful result is a JSON temp file that is parsed, inserted and
        finally deleted.

        :param commit: commit whose filesystem is inspected
        """
        # identify every PE file Blob
        fs = commit.filesystem.single()

        # Use repository to query PE blobs
        all_blobs = self.repository.query_pe_blobs(fs.hash, self.__class__.PE_MIME_TYPE)
        self.logger.info("Found %d PE blobs for commit %s", len(all_blobs), commit.hash)
        blob_results = filter_valid_filenames(all_blobs, self.FILTER_FILENAME)
        if not blob_results:
            self.logger.info(
                "No matching PE blobs after filename filter %s for commit %s",
                self.FILTER_FILENAME,
                commit.hash,
            )
            return
        stage = pl.process.map(self.stage_parse_code_view, blob_results, workers=4)
        stage = pl.process.map(self.stage_process_pdb, stage, workers=self.max_workers, maxsize=self.max_workers)
        for ret in stage:
            if isinstance(ret, BaseException):
                self.logger.error("Failed to process PDB: %s: %s", type(ret).__name__, ret)
                continue
            blob_hash, pdb_name, tmp_file_path = ret
            try:
                with open(tmp_file_path, "r") as f:
                    j_data = json.load(f)
                self.parse_pdb_json(blob_hash, pdb_name, j_data)
            finally:
                # the temp file was created with delete=False by the stage
                with suppress(FileNotFoundError):
                    os.remove(tmp_file_path)

    @return_exceptions
    def stage_parse_code_view(self, blob_result: Tuple[PurePath, str]) -> Tuple[str, str, int, str]:
        """Pipeline stage: download a PE blob and read its CodeView entry.

        :param blob_result: ``(path, blob hash)`` of the PE file
        :return: ``(blob hash, guid, age, pdb name)``; because of the
            decorator, a failure is returned as an exception object
        """
        self.logger.info("Processing PE file: %s", blob_result[0])
        blob_hash = blob_result[1]
        with self.downloaded_file(blob_hash) as local_file:
            ret = parse_code_view(local_file)
        if not ret:
            raise ValueError("No CodeView found")
        guid, age, pdb_name = ret
        self.logger.info("Path: %s - PDB: %s - GUID: %s (%s)", blob_result[0], pdb_name, guid, age)
        return blob_hash, *ret

    @return_exceptions
    def stage_process_pdb(self, arg) -> Tuple[str, str, Path]:
        """Pipeline stage: retrieve a PDB, convert it to JSON and dump it to a temp file.

        :param arg: ``(blob hash, guid, age, pdb name)`` from the previous stage
        :return: ``(blob hash, pdb name, path of the JSON temp file)``; the
            caller is responsible for deleting that file. Because of the
            decorator, a failure is returned as an exception object
        """
        blob_hash, guid, age, pdb_name = arg
        try:
            location = retrieve_pdb(guid, age, pdb_name)
        except Exception as e:
            self.logger.error("Failed to retrieve PDB for blob %s: %s", blob_hash, e)
            raise ValueError(f"Failed to retrieve PDB {pdb_name} on {blob_hash}") from e
        logging.debug(location)
        ctx = Context()
        try:
            j_data = PdbReader(ctx, location).get_json()
        except Exception as e:
            raise ValueError(f"Failed to parse PDB {pdb_name} on {blob_hash}") from e
        tmp_file = tempfile.NamedTemporaryFile(delete=False, mode="w+")
        tmp_path = Path(tmp_file.name)
        try:
            # leaving the with block flushes and closes the file
            with tmp_file:
                json.dump(j_data, tmp_file)
        except BaseException:
            # nobody will ever receive this path: do not leave the file behind
            with suppress(FileNotFoundError):
                os.remove(tmp_path)
            raise
        self.logger.debug("PDB: %s - JSON file: %s", pdb_name, tmp_file.name)
        return blob_hash, pdb_name, tmp_path

    def parse_pdb_json(self, blob_hash: str, pdb_name: str, j_pdb: Dict):
        """Insert the enums, symbols and user types of a converted PDB.

        :param blob_hash: hash of the PE blob the PDB belongs to
        :param pdb_name: PDB file name (used for logging)
        :param j_pdb: converted PDB; must contain "enums", "symbols" and "user_types"
        """
        self.logger.debug("PDB: %s - Parsing JSON", pdb_name)
        count_enum = self.parse_users_types(blob_hash, j_pdb["enums"])
        count_syms = self.insert_symbols(blob_hash, j_pdb["symbols"])
        count_types = self.parse_users_types(blob_hash, j_pdb["user_types"])
        self.logger.info(
            "PDB: %s - Inserted %d enums, %d symbols, %d user types", pdb_name, count_enum, count_syms, count_types
        )

    def parse_users_types(self, blob_hash: str, j_pdb: Dict) -> int:
        """Hash every user type of a mapping and insert it through the repository.

        :param blob_hash: hash of the PE blob the types belong to
        :param j_pdb: mapping of type name to type description
        :return: number of entries in ``j_pdb``
        """
        with SymbolsMerkleVisitor(thread=True) as visitor:
            # sorted so that types are processed in a deterministic order
            for struct_name, struct_data in sorted(j_pdb.items()):
                self.logger.debug("Struct: %s", struct_name)
                struct_node = StructNode(name=struct_name, struct_data=struct_data)
                visitor.run_visit(struct_node)
                for node in visitor.as_gen():
                    merkle_node = node.return_value
                    if isinstance(merkle_node, DataTypeMerkleNode):
                        self.repository.insert_data_type(merkle_node)
                    if isinstance(merkle_node, StructMerkleNode):
                        unwind_param = [
                            {
                                "hash": child_node.hash,
                                "name": child_name,
                                "offset": child_node.offset,
                                "data_type": child_node.data_type,
                            }
                            for child_name, child_node in merkle_node.children.items()
                        ]
                        self.repository.insert_struct(blob_hash, merkle_node, unwind_param)

        return len(j_pdb)

    def insert_symbols(self, blob_hash: str, symbols: Dict) -> int:
        """Parse the symbols of a converted PDB and insert them through the repository.

        :param blob_hash: hash of the PE blob the symbols belong to
        :param symbols: "symbols" section of the converted PDB
        :return: number of entries in ``symbols``
        """
        # Parse symbols using pure function
        parsed_symbols = parse_symbols_from_json(symbols)

        # Convert to param format for Neo4j
        param_list = []
        for symbol in parsed_symbols:
            param_list.append({"hash": symbol["hash"], "sym_name": symbol["name"], "address": symbol["address"]})
            self.logger.debug("Symbol %s (%s)", symbol["name"], symbol["address"])

        # Use repository to insert symbols
        self.repository.insert_symbols(blob_hash, param_list)
        return len(symbols)