oswatcher-plugins 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oswatcher_plugins-0.14.0.dist-info/METADATA +79 -0
- oswatcher_plugins-0.14.0.dist-info/RECORD +26 -0
- oswatcher_plugins-0.14.0.dist-info/WHEEL +4 -0
- oswatcher_plugins-0.14.0.dist-info/entry_points.txt +3 -0
- oswatcher_plugins-0.14.0.dist-info/licenses/LICENSE +202 -0
- plugins/__init__.py +0 -0
- plugins/__main__.py +69 -0
- plugins/config/__init__.py +18 -0
- plugins/config/default_settings.toml +1 -0
- plugins/plugins/__init__.py +29 -0
- plugins/plugins/filetype.py +58 -0
- plugins/plugins/linux_symbols.py +271 -0
- plugins/plugins/linux_symbols_service.py +373 -0
- plugins/plugins/registry.py +224 -0
- plugins/plugins/symbols.py +475 -0
- plugins/plugins/symbols_repository.py +142 -0
- plugins/plugins/symbols_service.py +47 -0
- plugins/plugins/syscalls.py +180 -0
- plugins/syscalls/exceptions.py +25 -0
- plugins/syscalls/filesystem.py +121 -0
- plugins/syscalls/kernel_parser.py +36 -0
- plugins/syscalls/kernel_repo_manager.py +106 -0
- plugins/syscalls/nodes.py +119 -0
- plugins/syscalls/syscall_table_parser.py +46 -0
- plugins/syscalls/syscalls_h_parser.py +52 -0
- plugins/types.py +108 -0
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
"""Linux kernel symbol extraction plugin using DWARF debug info from Ubuntu ddebs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import tempfile
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
import requests
|
|
10
|
+
from attrs import define, field
|
|
11
|
+
from neogit.model.neo import Commit
|
|
12
|
+
|
|
13
|
+
from plugins.plugins.linux_symbols_service import (
|
|
14
|
+
detect_ubuntu_codename,
|
|
15
|
+
extract_vmlinux_from_ddeb,
|
|
16
|
+
parse_kernel_version_parts,
|
|
17
|
+
parse_symbols_for_neo4j,
|
|
18
|
+
resolve_ddeb_url_from_packages,
|
|
19
|
+
run_dwarf2json,
|
|
20
|
+
)
|
|
21
|
+
from plugins.plugins.symbols import DataTypeMerkleNode, StructMerkleNode, StructNode, SymbolsMerkleVisitor
|
|
22
|
+
from plugins.plugins.symbols_repository import SymbolsRepository
|
|
23
|
+
from plugins.syscalls.filesystem import KernelInfo, find_kernel_versions, get_boot_directory
|
|
24
|
+
from plugins.types import AbstractPlugin, UniqueConstraint
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def lief_arch_to_ddeb_arch(lief_arch: str) -> str:
    """Map a lief architecture string onto the ddeb architecture name.

    The lookup is a case-insensitive substring search, so it accepts both
    ELF machine types (e.g., "ARCH.x86_64") and PE machine types reported
    for EFI boot stubs (e.g., "MACHINE_TYPES.AMD64").

    Args:
        lief_arch: Architecture string from lief

    Returns:
        Architecture for ddeb URL like "amd64" or "arm64"

    Raises:
        ValueError: If none of the known architecture markers occurs in
            ``lief_arch``
    """
    # Order matters: the 64-bit ARM markers must be tried before the bare
    # "arm" marker, which would otherwise also match "arm64".
    marker_table = (
        (("x86_64", "amd64"), "amd64"),
        (("aarch64", "arm64"), "arm64"),
        (("arm",), "armhf"),
        (("i386", "i686"), "i386"),
    )

    haystack = lief_arch.lower()
    for markers, ddeb_arch in marker_table:
        if any(marker in haystack for marker in markers):
            return ddeb_arch

    raise ValueError(f"Unrecognized architecture: {lief_arch!r}")
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@define(auto_attribs=True)
class LinuxSymbolsPlugin(AbstractPlugin):
    """Plugin to extract Linux kernel symbols and struct definitions from DWARF debug info.

    This plugin:
    1. Finds Linux kernels in /boot directory
    2. Downloads debug symbols from ddebs.ubuntu.com
    3. Parses DWARF debug info using dwarf2json (Volatility Foundation)
    4. Stores symbols and struct definitions in Neo4j
    """

    # Created on first access through the ``repository`` property; not an
    # __init__ argument.
    _repository: Optional[SymbolsRepository] = field(init=False, default=None)

    # Request timeout for downloading ddeb packages (in seconds)
    DOWNLOAD_TIMEOUT = 300

    @property
    def repository(self) -> SymbolsRepository:
        """Return the symbols repository, creating it on first access."""
        if self._repository is None:
            self._repository = SymbolsRepository(self.neogit)
        return self._repository

    def constraints_data(self) -> List[UniqueConstraint]:
        """Return constraints for symbol-related nodes.

        Reuses same constraints as SymbolsPlugin since we use the same node types.

        Returns:
            One uniqueness constraint on the ``hash`` property for each of
            the Symbol, Struct, StructField and DataType labels.
        """
        return [
            UniqueConstraint(label="Symbol", property_list=["hash"]),
            UniqueConstraint(label="Struct", property_list=["hash"]),
            UniqueConstraint(label="StructField", property_list=["hash"]),
            UniqueConstraint(label="DataType", property_list=["hash"]),
        ]

    def run(self, commit: Commit):
        """Execute Linux symbol extraction for a commit.

        Returns early (after logging) when the commit has no filesystem, no
        /boot directory, or no kernel files. A failure while processing one
        kernel is logged and does not prevent the remaining kernels from
        being processed.

        Args:
            commit: The Commit node to analyze
        """
        self.logger.info(f"Running Linux symbols plugin for commit {commit.hash}")

        # Get the root filesystem tree
        try:
            root_tree = commit.filesystem[0]
        except IndexError:
            self.logger.warning(f"No filesystem found for commit {commit.hash}")
            return

        # Navigate to /boot directory
        boot_tree = get_boot_directory(root_tree)
        if not boot_tree:
            self.logger.info("No /boot directory found, skipping Linux symbol extraction")
            return

        # A missing codename is not fatal: the URL resolver accepts None.
        codename = detect_ubuntu_codename(root_tree)
        if codename:
            self.logger.info(f"Detected Ubuntu codename: {codename}")
        else:
            self.logger.warning("Could not detect Ubuntu codename from /etc/os-release")

        # Find kernel versions from vmlinuz files
        kernel_info_list = find_kernel_versions(boot_tree, self)
        if not kernel_info_list:
            self.logger.info("No kernel files found in /boot")
            return

        self.logger.info(
            f"Found {len(kernel_info_list)} kernel(s): "
            + ", ".join(f"{k.filename} ({k.architecture})" for k in kernel_info_list)
        )

        # Process each kernel; isolate failures so one bad kernel does not
        # abort the whole commit.
        for kernel_info in kernel_info_list:
            try:
                self._process_kernel(kernel_info, codename)
            except Exception:
                self.logger.exception(f"Failed to process kernel {kernel_info.filename}")

        self.logger.info(f"Linux symbol extraction complete for commit {commit.hash}")

    def _process_kernel(self, kernel_info: KernelInfo, codename: Optional[str]):
        """Process a single kernel: download debug symbols and extract info.

        Each stage (version parsing, architecture mapping, URL resolution,
        download, vmlinux extraction, dwarf2json) logs and returns on
        failure, so nothing is inserted unless every earlier stage succeeded.

        Args:
            kernel_info: Information about the kernel to process
            codename: Ubuntu release codename used to pick the ddeb suites,
                or None when it could not be detected
        """
        self.logger.info(f"Processing kernel: {kernel_info.filename}")

        # Parse version components from filename
        try:
            version, build, flavor = parse_kernel_version_parts(kernel_info.filename)
        except ValueError as e:
            self.logger.warning(f"Cannot parse kernel version from {kernel_info.filename}: {e}")
            return

        # Convert architecture; an unsupported architecture is handled like
        # an unparsable version: warn and skip this kernel.
        try:
            ddeb_arch = lief_arch_to_ddeb_arch(kernel_info.architecture)
        except ValueError as e:
            self.logger.warning(f"Unsupported architecture for {kernel_info.filename}: {e}")
            return

        # Resolve exact ddeb URL from repository metadata.
        ddeb_url = resolve_ddeb_url_from_packages(version, build, flavor, ddeb_arch, codename)
        if ddeb_url is None:
            self.logger.error(
                f"Could not resolve debug package URL for {kernel_info.filename} "
                f"(codename={codename}, arch={ddeb_arch})"
            )
            return

        self.logger.info(f"Debug package URL: {ddeb_url}")

        # Download and process ddeb; everything on disk lives in a temporary
        # directory that is removed when this block exits.
        with tempfile.TemporaryDirectory() as tmpdir:
            tmpdir_path = Path(tmpdir)

            # Download ddeb
            ddeb_path = tmpdir_path / "debug.ddeb"
            try:
                self._download_ddeb(ddeb_url, ddeb_path)
            except Exception as e:
                self.logger.error(f"Failed to download ddeb from {ddeb_url}: {e}")
                return

            # Extract vmlinux
            try:
                vmlinux_path = extract_vmlinux_from_ddeb(ddeb_path, tmpdir_path)
                self.logger.info(f"Extracted vmlinux: {vmlinux_path}")
            except Exception as e:
                self.logger.error(f"Failed to extract vmlinux: {e}")
                return

            # Parse everything in one dwarf2json call
            try:
                self.logger.info("Running dwarf2json...")
                dwarf_data = run_dwarf2json(vmlinux_path)
            except Exception as e:
                self.logger.error(f"Failed to run dwarf2json: {e}")
                return

        # Insert symbols
        symbols_dict = dwarf_data.get("symbols", {})
        self.logger.info(f"Found {len(symbols_dict)} symbols")
        param_list = parse_symbols_for_neo4j(symbols_dict)
        self.logger.info(f"Inserting {len(param_list)} symbols into Neo4j")
        self.repository.insert_symbols(kernel_info.blob_hash, param_list)

        # Insert enums
        enums = dwarf_data.get("enums", {})
        self.logger.info(f"Found {len(enums)} enums")
        count_enum = self._insert_user_types(kernel_info.blob_hash, enums)

        # Insert structs/unions
        types = dwarf_data.get("user_types", {})
        self.logger.info(f"Found {len(types)} struct/union types")
        count_types = self._insert_user_types(kernel_info.blob_hash, types)

        self.logger.info(f"Inserted {len(param_list)} symbols, {count_enum} enums, {count_types} types")

    def _download_ddeb(self, url: str, output_path: Path):
        """Download ddeb package from URL.

        The body is streamed to disk in 8 KiB chunks. When the server
        reports a content length, progress is logged at debug level about
        every 10MB.

        Args:
            url: URL to download from
            output_path: Path to save the downloaded file

        Raises:
            requests.RequestException: If the request fails or the server
                answers with an error status
            OSError: If the output file cannot be written
        """
        self.logger.info(f"Downloading: {url}")

        log_step = 10 * 1024 * 1024  # progress message roughly every 10MB
        next_log_at = log_step
        downloaded = 0

        # The context manager closes the streamed response (and releases its
        # connection) even when the transfer or the file write fails midway.
        with requests.get(url, timeout=self.DOWNLOAD_TIMEOUT, stream=True) as response:
            response.raise_for_status()

            total_size = int(response.headers.get("content-length", 0))

            with open(output_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
                    downloaded += len(chunk)
                    # Threshold-based so that it works for any chunk size.
                    if total_size > 0 and downloaded >= next_log_at:
                        percent = (downloaded / total_size) * 100
                        mb_downloaded = downloaded / 1024 / 1024
                        self.logger.debug(f"Downloaded {mb_downloaded:.1f}MB ({percent:.1f}%)")  # noqa: E231
                        next_log_at = (downloaded // log_step + 1) * log_step

        mb_downloaded = downloaded / 1024 / 1024
        self.logger.info(f"Downloaded {mb_downloaded:.1f}MB to {output_path}")  # noqa: E231

    def _insert_user_types(self, blob_hash: str, types_dict: Dict) -> int:
        """Insert user types (structs/unions/enums) into Neo4j using Merkle visitor.

        This method reuses the same logic as SymbolsPlugin.parse_users_types().

        Args:
            blob_hash: Hash of the kernel blob
            types_dict: Dictionary of type definitions in volatility3 format

        Returns:
            Number of types processed
        """
        with SymbolsMerkleVisitor(thread=True) as visitor:
            # Sorted so that types are submitted in a deterministic order.
            for struct_name, struct_data in sorted(types_dict.items()):
                self.logger.debug(f"Processing type: {struct_name}")
                struct_node = StructNode(name=struct_name, struct_data=struct_data)
                visitor.run_visit(struct_node)

            for node in visitor.as_gen():
                merkle_node = node.return_value
                if isinstance(merkle_node, DataTypeMerkleNode):
                    self.repository.insert_data_type(merkle_node)
                if isinstance(merkle_node, StructMerkleNode):
                    # One parameter row per struct member.
                    unwind_param = [
                        {
                            "hash": child_node.hash,
                            "name": child_name,
                            "offset": child_node.offset,
                            "data_type": child_node.data_type,
                        }
                        for child_name, child_node in merkle_node.children.items()
                    ]
                    self.repository.insert_struct(blob_hash, merkle_node, unwind_param)

        return len(types_dict)
|
|
@@ -0,0 +1,373 @@
|
|
|
1
|
+
"""Pure functions for Linux kernel symbol extraction.
|
|
2
|
+
|
|
3
|
+
This module provides pure functions for:
|
|
4
|
+
- Constructing ddebs.ubuntu.com URLs for debug packages
|
|
5
|
+
- Parsing kernel version strings
|
|
6
|
+
- Extracting vmlinux from ddeb archives
|
|
7
|
+
- Parsing DWARF debug info via dwarf2json (Volatility Foundation Go binary)
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import gzip
|
|
13
|
+
import hashlib
|
|
14
|
+
import json
|
|
15
|
+
import lzma
|
|
16
|
+
import re
|
|
17
|
+
import subprocess
|
|
18
|
+
import tarfile
|
|
19
|
+
import tempfile
|
|
20
|
+
from pathlib import Path, PurePath
|
|
21
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
|
|
22
|
+
|
|
23
|
+
import requests
|
|
24
|
+
|
|
25
|
+
if TYPE_CHECKING:
|
|
26
|
+
from neogit.model.merkle import Tree
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Pattern: vmlinuz-{major}.{minor}.{patch}-{build}-{flavor}
# Example: vmlinuz-6.8.0-45-generic -> ("6.8.0", "45", "generic")
KERNEL_VERSION_PARTS_PATTERN = re.compile(r"^vmlinuz-(\d+\.\d+\.\d+)-(\d+)-(.+)$")


def parse_kernel_version_parts(filename: str) -> Tuple[str, str, str]:
    """Split a vmlinuz filename into its version, build and flavor parts.

    Args:
        filename: Kernel filename like "vmlinuz-6.8.0-45-generic"

    Returns:
        Tuple of (version, build, flavor) e.g., ("6.8.0", "45", "generic").
        Everything after the build number is the flavor, so it may itself
        contain dashes.

    Raises:
        ValueError: If filename format is invalid
    """
    parsed = KERNEL_VERSION_PARTS_PATTERN.match(filename)
    if parsed is None:
        raise ValueError(f"Invalid kernel filename format: {filename}")

    version, build, flavor = parsed.group(1, 2, 3)
    return version, build, flavor
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def construct_ddeb_url(kernel_version: str, build: str, flavor: str, arch: str) -> str:
    """Build the ddebs.ubuntu.com pool URL of a kernel debug package.

    The result is assembled purely from the arguments; no request is made,
    so the URL is not checked for existence. It follows the pattern:
    http://ddebs.ubuntu.com/pool/main/l/linux/linux-image-unsigned-{version}-{build}-{flavor}-dbgsym_{version}-{build}.{build}_{arch}.ddeb

    Args:
        kernel_version: Kernel version like "6.8.0"
        build: Build number like "45"
        flavor: Kernel flavor like "generic"
        arch: Architecture like "amd64"

    Returns:
        Full URL to the ddeb package

    Example:
        >>> construct_ddeb_url("6.8.0", "45", "generic", "amd64")
        'http://ddebs.ubuntu.com/pool/main/l/linux/linux-image-unsigned-6.8.0-45-generic-dbgsym_6.8.0-45.45_amd64.ddeb'
    """
    pool_root = "http://ddebs.ubuntu.com/pool/main/l/linux"

    # A ddeb file name is "<package>_<package version>_<arch>.ddeb".
    name_parts = (
        f"linux-image-unsigned-{kernel_version}-{build}-{flavor}-dbgsym",
        f"{kernel_version}-{build}.{build}",
        arch,
    )
    ddeb_file = "_".join(name_parts) + ".ddeb"

    return "/".join((pool_root, ddeb_file))
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _parse_packages_index_for_filename(index_content: str, package_names: List[str], arch: str) -> Optional[str]:
    """Return the ddeb ``Filename`` of the first matching stanza in a Packages index.

    A stanza matches when its ``Package`` field is one of ``package_names``,
    its ``Architecture`` field equals ``arch`` and its ``Filename`` field
    ends in ".ddeb".

    Args:
        index_content: Decompressed text of a Debian Packages index
        package_names: Acceptable package names
        arch: Required architecture like "amd64"

    Returns:
        The ``Filename`` value of the first matching stanza, or None when
        no stanza matches
    """
    wanted_packages = set(package_names)

    def iter_stanzas():
        """Yield one ``{field: value}`` dict per blank-line separated stanza."""
        stanza: Dict[str, str] = {}
        for raw_line in index_content.splitlines():
            if raw_line.strip() == "":
                # Blank (or whitespace-only) line closes the current stanza.
                yield stanza
                stanza = {}
            elif not raw_line.startswith(" "):
                # Lines starting with a space are continuation lines and are
                # skipped; lines without a colon carry no field either.
                name, colon, raw_value = raw_line.partition(":")
                if colon:
                    stanza[name] = raw_value.strip()
        # The last stanza need not be followed by a blank line.
        yield stanza

    for stanza in iter_stanzas():
        if stanza.get("Package") not in wanted_packages:
            continue
        if stanza.get("Architecture") != arch:
            continue
        candidate = stanza.get("Filename")
        if candidate and candidate.endswith(".ddeb"):
            return candidate

    return None
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def resolve_ddeb_url_from_packages(
    kernel_version: str,
    build: str,
    flavor: str,
    arch: str,
    codename: Optional[str],
    timeout: int = 30,
) -> Optional[str]:
    """Resolve exact ddeb URL by scanning ddebs Packages indexes.

    Only searches for the unsigned dbgsym package, which contains
    the actual vmlinux with DWARF debug info. The signed variant
    is a stub with no debug symbols.

    When a codename is given, only that release's suites (release,
    -updates, -proposed) are searched. If no codename is given, a list of
    known LTS/current releases is searched instead. If no index contains
    the package, the pool directory listing is scanned as a last resort.

    Args:
        kernel_version: Kernel version like "6.8.0"
        build: Build number like "45"
        flavor: Kernel flavor like "generic"
        arch: ddeb architecture like "amd64"
        codename: Ubuntu release codename like "noble", or None if unknown
        timeout: Timeout in seconds passed to each HTTP request

    Returns:
        Full URL of the ddeb package, or None if it could not be resolved
    """
    # Local import: zlib is needed only to recognise corrupt gzip payloads.
    import zlib

    package_name = f"linux-image-unsigned-{kernel_version}-{build}-{flavor}-dbgsym"
    base_url = "http://ddebs.ubuntu.com"

    # Determine which codenames to search
    codenames = [codename] if codename else ["noble", "jammy", "focal"]

    # Index variants in order of preference, each with its decompressor.
    index_variants = (
        ("Packages.xz", lzma.decompress),
        ("Packages.gz", gzip.decompress),
    )

    for release in codenames:
        for suite in (release, f"{release}-updates", f"{release}-proposed"):
            for index_name, decompress in index_variants:
                index_url = f"{base_url}/dists/{suite}/main/binary-{arch}/{index_name}"
                try:
                    response = requests.get(index_url, timeout=timeout)
                    response.raise_for_status()
                except requests.RequestException:
                    # Unavailable index: try the next compression variant.
                    continue

                try:
                    # errors="replace" means decoding itself cannot fail.
                    content = decompress(response.content).decode("utf-8", errors="replace")
                except (lzma.LZMAError, gzip.BadGzipFile, EOFError, zlib.error):
                    # Truncated or corrupt gzip data surfaces as EOFError or
                    # zlib.error rather than BadGzipFile; treat every
                    # undecodable index the same way and try the next variant.
                    continue

                filename = _parse_packages_index_for_filename(content, [package_name], arch)
                if filename:
                    return f"{base_url}/{filename.lstrip('/')}"
                # Successfully parsed this index; skip other compression variants
                break

    # Fallback: scan pool directory listing directly.
    major_minor = ".".join(kernel_version.split(".")[:2])
    return _resolve_ddeb_url_from_pool_listing(kernel_version, build, flavor, arch, major_minor, timeout)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _resolve_ddeb_url_from_pool_listing(
    kernel_version: str,
    build: str,
    flavor: str,
    arch: str,
    major_minor: str,
    timeout: int = 30,
) -> Optional[str]:
    """Find the ddeb URL by scanning pool directory listings for a matching file name.

    The main linux pool is tried first, then the HWE pool for
    ``major_minor``. Only unsigned dbgsym packages are considered.

    Args:
        kernel_version: Kernel version like "6.8.0"
        build: Build number like "45"
        flavor: Kernel flavor like "generic"
        arch: ddeb architecture like "amd64"
        major_minor: "major.minor" part of the kernel version, used to name
            the HWE pool directory
        timeout: Timeout in seconds passed to each HTTP request

    Returns:
        URL of the matching ddeb in the first pool that lists one, or None
    """
    name_prefix = f"linux-image-unsigned-{kernel_version}-{build}-{flavor}-dbgsym_"
    # The link pattern is the same for every pool, so build it once.
    link_pattern = re.compile(rf'href="({re.escape(name_prefix)}[^"]*_{re.escape(arch)}\.ddeb)"')

    candidate_pools = (
        "http://ddebs.ubuntu.com/pool/main/l/linux/",
        f"http://ddebs.ubuntu.com/pool/main/l/linux-hwe-{major_minor}/",
    )

    for pool_url in candidate_pools:
        try:
            listing = requests.get(pool_url, timeout=timeout)
            listing.raise_for_status()
        except requests.RequestException:
            continue

        found = link_pattern.findall(listing.text)
        if not found:
            continue

        # Keep deterministic behavior and prefer highest lexical revision.
        return f"{pool_url}{max(found)}"

    return None
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def detect_ubuntu_codename(root_tree: "Tree") -> Optional[str]:
    """Extract Ubuntu codename from /etc/os-release in filesystem.

    The value of the first ``VERSION_CODENAME=`` line is returned with
    surrounding whitespace and quotes (double or single) removed.

    Args:
        root_tree: Root filesystem tree

    Returns:
        Ubuntu codename like "noble", "jammy", or None if the file is
        missing, has no readable content, or carries no non-empty
        VERSION_CODENAME entry
    """
    try:
        os_release = root_tree.get_child_at_path(PurePath("/etc/os-release"))
        if hasattr(os_release, "content"):
            content = os_release.content.decode("utf-8", errors="replace")
            for line in content.splitlines():
                if line.startswith("VERSION_CODENAME="):
                    # os-release values may be wrapped in double or single
                    # quotes; strip either kind.
                    value = line.split("=", 1)[1].strip().strip("\"'")
                    # An empty value means "no codename", not "".
                    return value or None
    except (FileNotFoundError, AttributeError):
        # Best effort: a missing file or an unexpected node type simply
        # means the codename is unknown.
        pass
    return None
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def extract_vmlinux_from_ddeb(ddeb_path: Path, output_dir: Path) -> Path:
    """Extract vmlinux from ddeb archive.

    ddeb structure:
    - ddeb is an ar archive
    - Contains data.tar.xz (or data.tar.zst)
    - vmlinux is at usr/lib/debug/boot/vmlinux-*

    Requires the ``ar`` command, and ``zstd`` for zstd-compressed packages.

    Args:
        ddeb_path: Path to downloaded ddeb file
        output_dir: Directory to extract vmlinux to

    Returns:
        Path to extracted vmlinux file (placed directly in ``output_dir``,
        without the directory structure it had inside the package)

    Raises:
        ValueError: If data.tar or vmlinux is not found in the archive
        subprocess.CalledProcessError: If ``ar`` or ``zstd`` exits with a
            non-zero status
        FileNotFoundError: If ``ar`` or ``zstd`` is not installed
    """
    # `ar -x` below runs with a different working directory, so a relative
    # ddeb path has to be made absolute first or ar would not find the file.
    ddeb_abs_path = Path(ddeb_path).absolute()

    # Extract data.tar from ar archive using ar command
    # ar archives are simple and ar is universally available
    listing = subprocess.run(
        ["ar", "-t", str(ddeb_abs_path)],
        capture_output=True,
        text=True,
        check=True,
    )

    # Find data.tar.* file
    data_tar_name = next(
        (member for member in listing.stdout.splitlines() if member.startswith("data.tar")),
        None,
    )
    if not data_tar_name:
        raise ValueError(f"No data.tar found in ddeb: {ddeb_path}")

    # Extract data.tar to temp location
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir_path = Path(tmpdir)
        # ar extracts into its current working directory.
        subprocess.run(
            ["ar", "-x", str(ddeb_abs_path), data_tar_name],
            cwd=tmpdir_path,
            check=True,
        )

        data_tar_path = tmpdir_path / data_tar_name

        # tarfile doesn't support zstd until Python 3.14, decompress first
        if data_tar_name.endswith(".zst"):
            plain_tar_path = tmpdir_path / "data.tar"
            subprocess.run(
                ["zstd", "-d", str(data_tar_path), "-o", str(plain_tar_path)],
                check=True,
            )
            data_tar_path = plain_tar_path

        # Open data.tar (handles .xz, .gz, .bz2 and plain tar)
        with tarfile.open(data_tar_path, "r:*") as tar:
            # Find vmlinux file: the first regular file with a matching name
            tar_members: List[tarfile.TarInfo] = tar.getmembers()
            vmlinux_member: Optional[tarfile.TarInfo] = next(
                (
                    entry
                    for entry in tar_members
                    if (entry.name.endswith("/vmlinux") or "/boot/vmlinux-" in entry.name) and entry.isfile()
                ),
                None,
            )

            if vmlinux_member is None:
                # List the first members for debugging
                members_list = [m.name for m in tar_members][:20]
                raise ValueError(f"Could not find vmlinux in ddeb. Members: {members_list}")

            # Extract vmlinux
            vmlinux_member.name = Path(vmlinux_member.name).name  # Flatten path
            if hasattr(tarfile, "data_filter"):
                # The package is downloaded content: where the interpreter
                # supports extraction filters, use the restrictive "data" one.
                tar.extract(vmlinux_member, path=output_dir, filter="data")
            else:
                tar.extract(vmlinux_member, path=output_dir)
            return output_dir / vmlinux_member.name
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
# --- DWARF/Symbol Parsing with dwarf2json ---
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def run_dwarf2json(vmlinux_path: Path) -> Dict[str, Any]:
    """Invoke dwarf2json on a vmlinux image and return its decoded JSON output.

    dwarf2json is a Go binary from the Volatility Foundation that extracts
    DWARF debug info into the Volatility3 Intermediate Symbol Format (ISF).
    The complete output is collected in memory before it is decoded.

    Args:
        vmlinux_path: Path to vmlinux ELF file with debug symbols

    Returns:
        Dictionary with keys: "symbols", "user_types", "enums", "base_types", "metadata"

    Raises:
        FileNotFoundError: If dwarf2json binary is not found on PATH
        RuntimeError: If dwarf2json exits with non-zero status
    """
    command = ["dwarf2json", "linux", "--elf", str(vmlinux_path)]

    try:
        completed = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except FileNotFoundError:
        # Replace the generic OS error with an actionable message.
        raise FileNotFoundError(
            "dwarf2json not found on PATH. "
            "Install from https://github.com/volatilityfoundation/dwarf2json"
        )

    exit_code = completed.returncode
    if exit_code != 0:
        stderr = completed.stderr.decode("utf-8", errors="replace")
        raise RuntimeError(f"dwarf2json failed with exit code {exit_code}: {stderr}")

    return json.loads(completed.stdout)
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def parse_symbols_for_neo4j(symbols_dict: Dict[str, Dict]) -> List[Dict[str, str]]:
    """Turn a symbol table into the parameter rows used for Neo4j insertion.

    Symbols whose name starts with a double underscore are treated as
    internal/compiler symbols and left out. The remaining symbols are
    emitted in name order.

    Args:
        symbols_dict: Mapping of symbol name to a dict that has an
            "address" entry

    Returns:
        List of symbol dictionaries with sym_name, address, and hash fields,
        where address is the address rendered as a string and hash is the
        SHA-1 hex digest of that string
    """

    def build_row(name: str, info: Dict) -> Dict[str, str]:
        """Build the row for a single symbol."""
        address_text = str(info["address"])
        return {
            "sym_name": name,
            "address": address_text,
            "hash": hashlib.sha1(address_text.encode()).hexdigest(),
        }

    return [
        build_row(name, symbols_dict[name])
        for name in sorted(symbols_dict)
        if not name.startswith("__")
    ]
|