oswatcher-plugins 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,271 @@
1
+ """Linux kernel symbol extraction plugin using DWARF debug info from Ubuntu ddebs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import tempfile
6
+ from pathlib import Path
7
+ from typing import Dict, List, Optional
8
+
9
+ import requests
10
+ from attrs import define, field
11
+ from neogit.model.neo import Commit
12
+
13
+ from plugins.plugins.linux_symbols_service import (
14
+ detect_ubuntu_codename,
15
+ extract_vmlinux_from_ddeb,
16
+ parse_kernel_version_parts,
17
+ parse_symbols_for_neo4j,
18
+ resolve_ddeb_url_from_packages,
19
+ run_dwarf2json,
20
+ )
21
+ from plugins.plugins.symbols import DataTypeMerkleNode, StructMerkleNode, StructNode, SymbolsMerkleVisitor
22
+ from plugins.plugins.symbols_repository import SymbolsRepository
23
+ from plugins.syscalls.filesystem import KernelInfo, find_kernel_versions, get_boot_directory
24
+ from plugins.types import AbstractPlugin, UniqueConstraint
25
+
26
+
27
def lief_arch_to_ddeb_arch(lief_arch: str) -> str:
    """Map a lief architecture string onto the matching ddeb architecture name.

    The lookup is a case-insensitive substring search, so it accepts both
    ELF machine types (e.g., "ARCH.x86_64") and PE machine types reported
    for EFI boot stubs (e.g., "MACHINE_TYPES.AMD64").

    Args:
        lief_arch: Architecture string from lief

    Returns:
        Architecture for ddeb URL like "amd64" or "arm64"

    Raises:
        ValueError: If none of the known architecture markers is present
    """
    normalized = lief_arch.lower()

    # Rows are evaluated top to bottom; the 64-bit ARM spellings must come
    # before the bare "arm" marker, which would otherwise match them too.
    marker_table = (
        (("x86_64", "amd64"), "amd64"),
        (("aarch64", "arm64"), "arm64"),
        (("arm",), "armhf"),
        (("i386", "i686"), "i386"),
    )
    for markers, ddeb_arch in marker_table:
        if any(marker in normalized for marker in markers):
            return ddeb_arch

    raise ValueError(f"Unrecognized architecture: {lief_arch!r}")
49
+
50
+
51
@define(auto_attribs=True)
class LinuxSymbolsPlugin(AbstractPlugin):
    """Plugin to extract Linux kernel symbols and struct definitions from DWARF debug info.

    This plugin:
    1. Finds Linux kernels in /boot directory
    2. Downloads debug symbols from ddebs.ubuntu.com
    3. Parses DWARF debug info using dwarf2json (Volatility Foundation)
    4. Stores symbols and struct definitions in Neo4j
    """

    # Created on first access through the ``repository`` property; not an
    # __init__ argument.
    _repository: Optional[SymbolsRepository] = field(init=False, default=None)

    # Request timeout for downloading ddeb packages (in seconds)
    DOWNLOAD_TIMEOUT = 300

    @property
    def repository(self) -> SymbolsRepository:
        """Return the symbols repository, creating it from ``self.neogit`` on first use."""
        if self._repository is None:
            self._repository = SymbolsRepository(self.neogit)
        return self._repository

    def constraints_data(self) -> List[UniqueConstraint]:
        """Return constraints for symbol-related nodes.

        Reuses same constraints as SymbolsPlugin since we use the same node types.
        """
        return [
            UniqueConstraint(label="Symbol", property_list=["hash"]),
            UniqueConstraint(label="Struct", property_list=["hash"]),
            UniqueConstraint(label="StructField", property_list=["hash"]),
            UniqueConstraint(label="DataType", property_list=["hash"]),
        ]

    def run(self, commit: Commit):
        """Execute Linux symbol extraction for a commit.

        Every kernel found in /boot is processed independently: a failure on
        one kernel is logged with its traceback and the remaining kernels are
        still attempted.

        Args:
            commit: The Commit node to analyze
        """
        self.logger.info(f"Running Linux symbols plugin for commit {commit.hash}")

        # Get the root filesystem tree
        try:
            root_tree = commit.filesystem[0]
        except IndexError:
            self.logger.warning(f"No filesystem found for commit {commit.hash}")
            return

        # Navigate to /boot directory
        boot_tree = get_boot_directory(root_tree)
        if not boot_tree:
            self.logger.info("No /boot directory found, skipping Linux symbol extraction")
            return

        # A missing codename is not fatal: it is passed on as None and the
        # URL resolver decides how to search without it.
        codename = detect_ubuntu_codename(root_tree)
        if codename:
            self.logger.info(f"Detected Ubuntu codename: {codename}")
        else:
            self.logger.warning("Could not detect Ubuntu codename from /etc/os-release")

        # Find kernel versions from vmlinuz files
        kernel_info_list = find_kernel_versions(boot_tree, self)
        if not kernel_info_list:
            self.logger.info("No kernel files found in /boot")
            return

        self.logger.info(
            f"Found {len(kernel_info_list)} kernel(s): "
            + ", ".join(f"{k.filename} ({k.architecture})" for k in kernel_info_list)
        )

        # Process each kernel
        for kernel_info in kernel_info_list:
            try:
                self._process_kernel(kernel_info, codename)
            except Exception:
                # Boundary handler: keep going with the other kernels.
                self.logger.exception(f"Failed to process kernel {kernel_info.filename}")

        self.logger.info(f"Linux symbol extraction complete for commit {commit.hash}")

    def _process_kernel(self, kernel_info: KernelInfo, codename: Optional[str]):
        """Process a single kernel: download debug symbols and extract info.

        Each stage (version parsing, architecture mapping, URL resolution,
        download, vmlinux extraction, dwarf2json) logs its own failure and
        returns early, so nothing is inserted for a kernel that cannot be
        fully processed up to the dwarf2json step.

        Args:
            kernel_info: Information about the kernel to process
            codename: Ubuntu release codename used to resolve the debug
                package, or None when it could not be detected
        """
        self.logger.info(f"Processing kernel: {kernel_info.filename}")

        # Parse version components from filename
        try:
            version, build, flavor = parse_kernel_version_parts(kernel_info.filename)
        except ValueError as e:
            self.logger.warning(f"Cannot parse kernel version from {kernel_info.filename}: {e}")
            return

        # Convert architecture; an unknown architecture is handled like an
        # unparsable version instead of escaping as an unexpected exception.
        try:
            ddeb_arch = lief_arch_to_ddeb_arch(kernel_info.architecture)
        except ValueError as e:
            self.logger.warning(f"Unsupported architecture for {kernel_info.filename}: {e}")
            return

        # Resolve exact ddeb URL from repository metadata.
        ddeb_url = resolve_ddeb_url_from_packages(version, build, flavor, ddeb_arch, codename)
        if ddeb_url is None:
            self.logger.error(
                f"Could not resolve debug package URL for {kernel_info.filename} "
                f"(codename={codename}, arch={ddeb_arch})"
            )
            return

        self.logger.info(f"Debug package URL: {ddeb_url}")

        # Download and process ddeb; everything on disk lives in a temporary
        # directory that is removed when this block exits.
        with tempfile.TemporaryDirectory() as tmpdir:
            tmpdir_path = Path(tmpdir)

            # Download ddeb
            ddeb_path = tmpdir_path / "debug.ddeb"
            try:
                self._download_ddeb(ddeb_url, ddeb_path)
            except Exception as e:
                self.logger.error(f"Failed to download ddeb from {ddeb_url}: {e}")
                return

            # Extract vmlinux
            try:
                vmlinux_path = extract_vmlinux_from_ddeb(ddeb_path, tmpdir_path)
                self.logger.info(f"Extracted vmlinux: {vmlinux_path}")
            except Exception as e:
                self.logger.error(f"Failed to extract vmlinux: {e}")
                return

            # Parse everything in one dwarf2json call
            try:
                self.logger.info("Running dwarf2json...")
                dwarf_data = run_dwarf2json(vmlinux_path)
            except Exception as e:
                self.logger.error(f"Failed to run dwarf2json: {e}")
                return

        # Insert symbols
        symbols_dict = dwarf_data.get("symbols", {})
        self.logger.info(f"Found {len(symbols_dict)} symbols")
        param_list = parse_symbols_for_neo4j(symbols_dict)
        self.logger.info(f"Inserting {len(param_list)} symbols into Neo4j")
        self.repository.insert_symbols(kernel_info.blob_hash, param_list)

        # Insert enums
        enums = dwarf_data.get("enums", {})
        self.logger.info(f"Found {len(enums)} enums")
        count_enum = self._insert_user_types(kernel_info.blob_hash, enums)

        # Insert structs/unions
        types = dwarf_data.get("user_types", {})
        self.logger.info(f"Found {len(types)} struct/union types")
        count_types = self._insert_user_types(kernel_info.blob_hash, types)

        self.logger.info(f"Inserted {len(param_list)} symbols, {count_enum} enums, {count_types} types")

    def _download_ddeb(self, url: str, output_path: Path):
        """Download ddeb package from URL.

        The body is streamed to disk in small chunks so the whole package is
        never held in memory.

        Args:
            url: URL to download from
            output_path: Path to save the downloaded file

        Raises:
            requests.RequestException: On connection errors, timeouts or a
                non-success HTTP status
            OSError: If the output file cannot be written
        """
        self.logger.info(f"Downloading: {url}")

        chunk_size = 8192
        log_interval = 10 * 1024 * 1024  # Log every ~10MB
        next_log_at = log_interval
        downloaded = 0

        # The context manager releases the streamed connection even when the
        # status check, the transfer or the file write fails.
        with requests.get(url, timeout=self.DOWNLOAD_TIMEOUT, stream=True) as response:
            response.raise_for_status()

            # 0 means the server did not announce a size; progress
            # percentages are skipped in that case.
            total_size = int(response.headers.get("content-length", 0))

            with open(output_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
                    downloaded += len(chunk)
                    # Threshold-based so it does not rely on chunks arriving
                    # with exactly ``chunk_size`` bytes.
                    if total_size > 0 and downloaded >= next_log_at:
                        next_log_at += log_interval
                        percent = (downloaded / total_size) * 100
                        mb_downloaded = downloaded / 1024 / 1024
                        self.logger.debug(f"Downloaded {mb_downloaded:.1f}MB ({percent:.1f}%)")  # noqa: E231

        mb_downloaded = downloaded / 1024 / 1024
        self.logger.info(f"Downloaded {mb_downloaded:.1f}MB to {output_path}")  # noqa: E231

    def _insert_user_types(self, blob_hash: str, types_dict: Dict) -> int:
        """Insert user types (structs/unions/enums) into Neo4j using Merkle visitor.

        This method reuses the same logic as SymbolsPlugin.parse_users_types().

        Args:
            blob_hash: Hash of the kernel blob
            types_dict: Dictionary of type definitions in volatility3 format

        Returns:
            Number of types processed (the size of ``types_dict``)
        """
        with SymbolsMerkleVisitor(thread=True) as visitor:
            # Sorted so types are submitted in a deterministic order.
            for struct_name, struct_data in sorted(types_dict.items()):
                self.logger.debug(f"Processing type: {struct_name}")
                struct_node = StructNode(name=struct_name, struct_data=struct_data)
                visitor.run_visit(struct_node)

            for node in visitor.as_gen():
                merkle_node = node.return_value
                if isinstance(merkle_node, DataTypeMerkleNode):
                    self.repository.insert_data_type(merkle_node)
                if isinstance(merkle_node, StructMerkleNode):
                    # One parameter row per struct member, passed alongside
                    # the struct node itself.
                    unwind_param = [
                        {
                            "hash": child_node.hash,
                            "name": child_name,
                            "offset": child_node.offset,
                            "data_type": child_node.data_type,
                        }
                        for child_name, child_node in merkle_node.children.items()
                    ]
                    self.repository.insert_struct(blob_hash, merkle_node, unwind_param)

        return len(types_dict)
@@ -0,0 +1,373 @@
1
+ """Pure functions for Linux kernel symbol extraction.
2
+
3
+ This module provides pure functions for:
4
+ - Constructing ddebs.ubuntu.com URLs for debug packages
5
+ - Parsing kernel version strings
6
+ - Extracting vmlinux from ddeb archives
7
+ - Parsing DWARF debug info via dwarf2json (Volatility Foundation Go binary)
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import gzip
13
+ import hashlib
14
+ import json
15
+ import lzma
16
+ import re
17
+ import subprocess
18
+ import tarfile
19
+ import tempfile
20
+ from pathlib import Path, PurePath
21
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
22
+
23
+ import requests
24
+
25
+ if TYPE_CHECKING:
26
+ from neogit.model.merkle import Tree
27
+
28
+
29
# Kernel image names have the shape vmlinuz-{major}.{minor}.{patch}-{build}-{flavor};
# the three capture groups are the dotted version, the build number and the flavor.
# Example: vmlinuz-6.8.0-45-generic -> ("6.8.0", "45", "generic")
KERNEL_VERSION_PARTS_PATTERN = re.compile(r"^vmlinuz-(\d+\.\d+\.\d+)-(\d+)-(.+)$")


def parse_kernel_version_parts(filename: str) -> Tuple[str, str, str]:
    """Split a vmlinuz filename into its version, build and flavor parts.

    Args:
        filename: Kernel filename like "vmlinuz-6.8.0-45-generic"

    Returns:
        Tuple of (version, build, flavor) e.g., ("6.8.0", "45", "generic").
        Everything after the build number, further dashes included, is the
        flavor.

    Raises:
        ValueError: If filename format is invalid
    """
    parsed = KERNEL_VERSION_PARTS_PATTERN.match(filename)
    if parsed is None:
        raise ValueError(f"Invalid kernel filename format: {filename}")

    version, build, flavor = parsed.groups()
    return version, build, flavor
51
+
52
+
53
def construct_ddeb_url(kernel_version: str, build: str, flavor: str, arch: str) -> str:
    """Build the ddebs.ubuntu.com pool URL of an unsigned kernel dbgsym package.

    The URL pattern is:
    http://ddebs.ubuntu.com/pool/main/l/linux/linux-image-unsigned-{version}-{build}-{flavor}-dbgsym_{version}-{build}.{build}_{arch}.ddeb

    No network access happens here; the URL is only assembled from the
    arguments.

    Args:
        kernel_version: Kernel version like "6.8.0"
        build: Build number like "45"
        flavor: Kernel flavor like "generic"
        arch: Architecture like "amd64"

    Returns:
        Full URL to the ddeb package

    Example:
        >>> construct_ddeb_url("6.8.0", "45", "generic", "amd64")
        'http://ddebs.ubuntu.com/pool/main/l/linux/linux-image-unsigned-6.8.0-45-generic-dbgsym_6.8.0-45.45_amd64.ddeb'
    """
    pool_root = "http://ddebs.ubuntu.com/pool/main/l/linux"

    # Debian-style file name: {package}_{package version}_{arch}.ddeb
    file_parts = (
        f"linux-image-unsigned-{kernel_version}-{build}-{flavor}-dbgsym",
        f"{kernel_version}-{build}.{build}",
        f"{arch}.ddeb",
    )
    return "/".join((pool_root, "_".join(file_parts)))
79
+
80
+
81
+ def _parse_packages_index_for_filename(index_content: str, package_names: List[str], arch: str) -> Optional[str]:
82
+ """Parse Debian Packages index and return matching ddeb filename."""
83
+ package_set = set(package_names)
84
+ fields: Dict[str, str] = {}
85
+
86
+ def match_current_stanza() -> Optional[str]:
87
+ if fields.get("Package") not in package_set:
88
+ return None
89
+ if fields.get("Architecture") != arch:
90
+ return None
91
+ filename = fields.get("Filename")
92
+ if filename and filename.endswith(".ddeb"):
93
+ return filename
94
+ return None
95
+
96
+ for line in index_content.splitlines():
97
+ if not line.strip():
98
+ filename = match_current_stanza()
99
+ if filename:
100
+ return filename
101
+ fields = {}
102
+ continue
103
+
104
+ if line.startswith(" "):
105
+ continue
106
+
107
+ key, sep, value = line.partition(":")
108
+ if not sep:
109
+ continue
110
+ fields[key] = value.strip()
111
+
112
+ return match_current_stanza()
113
+
114
+
115
def resolve_ddeb_url_from_packages(
    kernel_version: str,
    build: str,
    flavor: str,
    arch: str,
    codename: Optional[str],
    timeout: int = 30,
) -> Optional[str]:
    """Resolve exact ddeb URL by scanning ddebs Packages indexes.

    Only searches for the unsigned dbgsym package, which contains
    the actual vmlinux with DWARF debug info. The signed variant
    is a stub with no debug symbols.

    Searches the provided codename's suites ({codename}, -updates,
    -proposed). If no codename is given, falls back to a list of known
    LTS/current releases. When no index yields a match, the pool directory
    listing is scanned as a last resort.

    Args:
        kernel_version: Kernel version like "6.8.0"
        build: Build number like "45"
        flavor: Kernel flavor like "generic"
        arch: ddeb architecture like "amd64"
        codename: Ubuntu release codename, or None/empty if unknown
        timeout: Per-request timeout in seconds

    Returns:
        Full URL of the ddeb package, or None if it could not be resolved
    """
    # Local import: only needed for the decompression error type below.
    import zlib

    package_name = f"linux-image-unsigned-{kernel_version}-{build}-{flavor}-dbgsym"
    base_url = "http://ddebs.ubuntu.com"

    # Determine which codenames to search
    codenames = [codename] if codename else ["noble", "jammy", "focal"]

    # Compression variants in order of preference, with their decompressor.
    index_variants = (
        ("Packages.xz", lzma.decompress),
        ("Packages.gz", gzip.decompress),
    )

    for release in codenames:
        for suite in (release, f"{release}-updates", f"{release}-proposed"):
            for index_name, decompress in index_variants:
                index_url = f"{base_url}/dists/{suite}/main/binary-{arch}/{index_name}"
                try:
                    response = requests.get(index_url, timeout=timeout)
                    response.raise_for_status()
                except requests.RequestException:
                    continue

                try:
                    content = decompress(response.content).decode("utf-8", errors="replace")
                except (lzma.LZMAError, OSError, EOFError, zlib.error):
                    # A truncated or corrupt index must not abort the search:
                    # gzip reports those as EOFError / zlib.error, and a bad
                    # header as BadGzipFile (an OSError). Try the next variant.
                    continue

                filename = _parse_packages_index_for_filename(content, [package_name], arch)
                if filename:
                    return f"{base_url}/{filename.lstrip('/')}"
                # Successfully parsed this index; skip other compression variants
                break

    # Fallback: scan pool directory listing directly.
    major_minor = ".".join(kernel_version.split(".")[:2])
    return _resolve_ddeb_url_from_pool_listing(kernel_version, build, flavor, arch, major_minor, timeout)
170
+
171
+
172
def _resolve_ddeb_url_from_pool_listing(
    kernel_version: str,
    build: str,
    flavor: str,
    arch: str,
    major_minor: str,
    timeout: int = 30,
) -> Optional[str]:
    """Look the ddeb up in the HTML directory listings of the ddebs pool.

    The main linux pool is tried first, then the HWE pool for
    ``major_minor``. Only unsigned dbgsym packages are considered. A pool
    that cannot be fetched is skipped.

    Args:
        kernel_version: Kernel version like "6.8.0"
        build: Build number like "45"
        flavor: Kernel flavor like "generic"
        arch: ddeb architecture like "amd64"
        major_minor: "{major}.{minor}" part of the kernel version
        timeout: Per-request timeout in seconds

    Returns:
        URL of the matching ddeb in the first pool that lists one, or None
    """
    package_prefix = f"linux-image-unsigned-{kernel_version}-{build}-{flavor}-dbgsym_"
    href_regex = re.compile(rf'href="({re.escape(package_prefix)}[^"]*_{re.escape(arch)}\.ddeb)"')

    candidate_pools = (
        "http://ddebs.ubuntu.com/pool/main/l/linux/",
        f"http://ddebs.ubuntu.com/pool/main/l/linux-hwe-{major_minor}/",
    )

    for listing_url in candidate_pools:
        try:
            listing = requests.get(listing_url, timeout=timeout)
            listing.raise_for_status()
        except requests.RequestException:
            continue

        linked_files = href_regex.findall(listing.text)
        if not linked_files:
            continue

        # Deterministic choice: the lexically greatest file name wins.
        # This is a plain string comparison, not a Debian version comparison.
        return f"{listing_url}{max(linked_files)}"

    return None
207
+
208
+
209
def detect_ubuntu_codename(root_tree: "Tree") -> Optional[str]:
    """Extract Ubuntu codename from /etc/os-release in filesystem.

    Reads the first ``VERSION_CODENAME=`` line. The value may be bare or
    wrapped in double or single quotes; surrounding quotes are removed.

    Args:
        root_tree: Root filesystem tree

    Returns:
        Ubuntu codename like "noble", "jammy", or None if the file is
        missing, has no readable content, or carries no (or an empty)
        VERSION_CODENAME
    """
    key = "VERSION_CODENAME="

    try:
        os_release = root_tree.get_child_at_path(PurePath("/etc/os-release"))
        if not hasattr(os_release, "content"):
            return None
        text = os_release.content.decode("utf-8", errors="replace")
    except (FileNotFoundError, AttributeError):
        return None

    for line in text.splitlines():
        if line.startswith(key):
            # os-release values may use either quote style.
            value = line[len(key):].strip().strip("\"'")
            # An empty assignment carries no information: report "not found".
            return value or None

    return None
228
+
229
+
230
def extract_vmlinux_from_ddeb(ddeb_path: Path, output_dir: Path) -> Path:
    """Extract vmlinux from ddeb archive.

    ddeb structure:
    - ddeb is an ar archive
    - Contains data.tar.xz (or data.tar.zst)
    - vmlinux is at usr/lib/debug/boot/vmlinux-*

    Requires the ``ar`` binary on PATH, plus ``zstd`` when the payload is
    zstd-compressed.

    Args:
        ddeb_path: Path to downloaded ddeb file (absolute or relative)
        output_dir: Directory to extract vmlinux to

    Returns:
        Path to extracted vmlinux file, placed directly inside ``output_dir``

    Raises:
        ValueError: If data.tar or vmlinux is not found in the archive
        subprocess.CalledProcessError: If ``ar`` or ``zstd`` exits non-zero
        FileNotFoundError: If ``ar`` or ``zstd`` is not installed
    """
    # `ar -x` below runs with a different working directory, so a relative
    # path has to be anchored to the current directory first.
    ddeb_abs = Path(ddeb_path).resolve()

    # List the ar members to learn how the data tarball is compressed.
    listing = subprocess.run(
        ["ar", "-t", str(ddeb_abs)],
        capture_output=True,
        text=True,
        check=True,
    )
    data_tar_name = next(
        (member for member in listing.stdout.splitlines() if member.startswith("data.tar")),
        None,
    )
    if not data_tar_name:
        raise ValueError(f"No data.tar found in ddeb: {ddeb_path}")

    # Unpack data.tar.* into a scratch directory that is removed on exit.
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir_path = Path(tmpdir)
        subprocess.run(
            ["ar", "-x", str(ddeb_abs), data_tar_name],
            cwd=tmpdir_path,  # ar extracts members into its working directory
            check=True,
        )

        data_tar_path = tmpdir_path / data_tar_name

        # tarfile doesn't support zstd until Python 3.14, decompress first
        if data_tar_name.endswith(".zst"):
            plain_tar_path = tmpdir_path / "data.tar"
            subprocess.run(
                ["zstd", "-d", str(data_tar_path), "-o", str(plain_tar_path)],
                check=True,
            )
            data_tar_path = plain_tar_path

        # Open data.tar (handles .xz, .gz, .bz2 and plain tar)
        with tarfile.open(data_tar_path, "r:*") as tar:
            tar_members: List[tarfile.TarInfo] = tar.getmembers()

            # First regular file that looks like the kernel image; directory
            # and link entries with a matching name are skipped.
            vmlinux_member: Optional[tarfile.TarInfo] = next(
                (
                    entry
                    for entry in tar_members
                    if (entry.name.endswith("/vmlinux") or "/boot/vmlinux-" in entry.name) and entry.isfile()
                ),
                None,
            )

            if vmlinux_member is None:
                # List the first members for debugging
                members_list = [m.name for m in tar_members][:20]
                raise ValueError(f"Could not find vmlinux in ddeb. Members: {members_list}")

            # Flatten path so the file lands directly in output_dir instead
            # of recreating usr/lib/debug/boot/ below it.
            vmlinux_member.name = Path(vmlinux_member.name).name
            tar.extract(vmlinux_member, path=output_dir)
            return output_dir / vmlinux_member.name
308
+
309
+
310
+ # --- DWARF/Symbol Parsing with dwarf2json ---
311
+
312
+
313
def run_dwarf2json(vmlinux_path: Path) -> Dict[str, Any]:
    """Invoke ``dwarf2json linux --elf`` on a vmlinux and parse what it prints.

    dwarf2json is a Go binary from the Volatility Foundation that extracts
    DWARF debug info into the Volatility3 Intermediate Symbol Format (ISF).
    Its standard output is captured in full and decoded as JSON.

    Args:
        vmlinux_path: Path to vmlinux ELF file with debug symbols

    Returns:
        Dictionary with keys: "symbols", "user_types", "enums", "base_types", "metadata"

    Raises:
        FileNotFoundError: If dwarf2json binary is not found on PATH
        RuntimeError: If dwarf2json exits with non-zero status
    """
    command = ["dwarf2json", "linux", "--elf", str(vmlinux_path)]

    try:
        process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except FileNotFoundError:
        # Re-raise with an actionable message instead of the bare OS error.
        missing_binary_hint = (
            "dwarf2json not found on PATH. Install from https://github.com/volatilityfoundation/dwarf2json"
        )
        raise FileNotFoundError(missing_binary_hint)

    # communicate() drains both pipes and waits for the process to exit.
    raw_stdout, raw_stderr = process.communicate()

    exit_code = process.returncode
    if exit_code != 0:
        stderr = raw_stderr.decode("utf-8", errors="replace")
        raise RuntimeError(f"dwarf2json failed with exit code {exit_code}: {stderr}")

    return json.loads(raw_stdout)
347
+
348
+
349
def parse_symbols_for_neo4j(symbols_dict: Dict[str, Dict]) -> List[Dict[str, str]]:
    """Turn a symbol-name -> info mapping into rows ready for Neo4j insertion.

    Symbols whose name starts with "__" (internal/compiler symbols) are left
    out. Rows come back ordered by symbol name.

    Args:
        symbols_dict: Mapping of symbol name to a dict that has an
            "address" entry

    Returns:
        List of symbol dictionaries with sym_name, address, and hash fields;
        ``address`` is the stringified address and ``hash`` is the SHA-1 hex
        digest of that string
    """

    def to_row(name: str) -> Dict[str, str]:
        address_text = str(symbols_dict[name]["address"])
        return {
            "sym_name": name,
            "address": address_text,
            "hash": hashlib.sha1(address_text.encode()).hexdigest(),
        }

    return [to_row(name) for name in sorted(symbols_dict) if not name.startswith("__")]