oswatcher-plugins 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oswatcher_plugins-0.14.0.dist-info/METADATA +79 -0
- oswatcher_plugins-0.14.0.dist-info/RECORD +26 -0
- oswatcher_plugins-0.14.0.dist-info/WHEEL +4 -0
- oswatcher_plugins-0.14.0.dist-info/entry_points.txt +3 -0
- oswatcher_plugins-0.14.0.dist-info/licenses/LICENSE +202 -0
- plugins/__init__.py +0 -0
- plugins/__main__.py +69 -0
- plugins/config/__init__.py +18 -0
- plugins/config/default_settings.toml +1 -0
- plugins/plugins/__init__.py +29 -0
- plugins/plugins/filetype.py +58 -0
- plugins/plugins/linux_symbols.py +271 -0
- plugins/plugins/linux_symbols_service.py +373 -0
- plugins/plugins/registry.py +224 -0
- plugins/plugins/symbols.py +475 -0
- plugins/plugins/symbols_repository.py +142 -0
- plugins/plugins/symbols_service.py +47 -0
- plugins/plugins/syscalls.py +180 -0
- plugins/syscalls/exceptions.py +25 -0
- plugins/syscalls/filesystem.py +121 -0
- plugins/syscalls/kernel_parser.py +36 -0
- plugins/syscalls/kernel_repo_manager.py +106 -0
- plugins/syscalls/nodes.py +119 -0
- plugins/syscalls/syscall_table_parser.py +46 -0
- plugins/syscalls/syscalls_h_parser.py +52 -0
- plugins/types.py +108 -0
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""Repository layer for symbols plugin Neo4j operations."""
|
|
2
|
+
|
|
3
|
+
from pathlib import PurePath
|
|
4
|
+
from typing import Dict, List, Tuple
|
|
5
|
+
|
|
6
|
+
from neogit.service.neogit import cypher_query_with_backoff
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class SymbolsRepository:
    """Neo4j persistence layer for the symbols plugin.

    The read query goes through ``self.neogit.db.cypher_query``; the write
    queries go through the module-level ``cypher_query_with_backoff`` helper.
    """

    def __init__(self, neogit):
        """Keep a reference to the neogit service.

        Args:
            neogit: Neogit service instance; ``query_pe_blobs`` calls
                ``neogit.db.cypher_query`` on it.
        """
        self.neogit = neogit

    def query_pe_blobs(self, root_hash: str, mime_type: str) -> List[Tuple[PurePath, str]]:
        """Find every Blob below a root Tree that carries the given MIME type.

        The file path of each hit is rebuilt from the ``name`` property of the
        relationships traversed between the root Tree and the Blob.

        Args:
            root_hash: Hash of the root Tree node
            mime_type: MIME type to filter by

        Returns:
            List of (file_path, blob_hash) tuples
        """
        query = """
            MATCH path = (r:Tree {hash: $root_hash})-[:HAS_CHILD_TREE|HAS_CHILD_BLOB*]->(b:Blob)
            WHERE EXISTS {
                MATCH (b)-[:HAS_MIME_TYPE]->(m:MimeType)
                WHERE m.mime = $mime_type
            }
            RETURN [rel IN relationships(path) | rel.name] AS parts, b.hash
        """
        rows, _ = self.neogit.db.cypher_query(query, {"mime_type": mime_type, "root_hash": root_hash})
        # row[0] is the list of path components, row[1] the blob hash.
        return [(PurePath(*row[0]), row[1]) for row in rows]

    def insert_symbols(self, blob_hash: str, param_list: List[Dict]) -> None:
        """Merge Symbol nodes and link them to their Blob.

        Args:
            blob_hash: Hash of the PE file blob
            param_list: List of dicts with ``hash``, ``address`` and a symbol
                name stored under ``sym_name`` or ``name`` (``sym_name`` wins
                when both are present, see the ``coalesce`` in the query)
        """
        query = """
            MATCH (b:Blob {hash: $blob_hash})
            WITH b
            UNWIND $unwind as p
            MERGE (s:Symbol {hash: p.hash, address: p.address})
            MERGE (b)-[:HAS_SYMBOL {name: coalesce(p.sym_name, p.name)}]->(s)
        """
        cypher_query_with_backoff(query, {"blob_hash": blob_hash, "unwind": param_list})

    def insert_struct(self, blob_hash: str, struct_node, unwind_param: List[Dict]) -> None:
        """Merge a Struct node with its fields and link it to a Blob.

        Args:
            blob_hash: Hash of the PE file blob
            struct_node: StructMerkleNode; ``hash``, ``name``, ``size`` and
                ``kind.name`` are read from it
            unwind_param: List of field dicts with hash, name, offset, data_type
        """
        query = """
            MERGE (s:Struct {hash: $hash, size: $size, kind: $kind})
            WITH s
            UNWIND $unwind_param as x
            MERGE (f:StructField {hash: x.hash, offset: x.offset, data_type: x.data_type})
            MERGE (s)-[:HAS_FIELD {name: x.name}]->(f)
            WITH s
            MATCH (b:Blob {hash: $blob_hash})
            WITH b, s
            MERGE (b)-[:HAS_STRUCT {name: $name}]->(s)
        """
        cypher_query_with_backoff(
            query,
            {
                "blob_hash": blob_hash,
                "unwind_param": unwind_param,
                "hash": struct_node.hash,
                "name": struct_node.name,
                "size": struct_node.size,
                # The enum member's name is stored, not the enum object.
                "kind": struct_node.kind.name,
            },
        )

    @staticmethod
    def _data_type_properties(node) -> Dict:
        """Return the query parameters describing one data type node."""
        return {
            "hash": node.hash,
            "type": node.kind.name,
            "name": node.name,
            "array_counter": node.array_counter,
            "bit_position": node.bit_position,
            "bit_length": node.bit_length,
        }

    def insert_data_type(self, node) -> None:
        """Merge a DataType node together with its direct children.

        Args:
            node: DataTypeMerkleNode; ``hash``, ``kind.name``, ``name``,
                ``array_counter``, ``bit_position``, ``bit_length`` are read
                from it and from every value of its ``children`` mapping
        """
        query = """
            MERGE (d:DataType {hash: $hash}) // Ensure 'hash' uniquely identifies 'DataType'
            ON CREATE SET
                d.type = CASE WHEN $type IS NOT NULL THEN $type END,
                d.name = CASE WHEN $name IS NOT NULL THEN $name END,
                d.array_counter = CASE WHEN $array_counter IS NOT NULL THEN $array_counter END,
                d.bit_position = CASE WHEN $bit_position IS NOT NULL THEN $bit_position END,
                d.bit_length = CASE WHEN $bit_length IS NOT NULL THEN $bit_length END
            ON MATCH SET
                d.type = CASE WHEN $type IS NOT NULL THEN $type END,
                d.name = CASE WHEN $name IS NOT NULL THEN $name END,
                d.array_counter = CASE WHEN $array_counter IS NOT NULL THEN $array_counter END,
                d.bit_position = CASE WHEN $bit_position IS NOT NULL THEN $bit_position END,
                d.bit_length = CASE WHEN $bit_length IS NOT NULL THEN $bit_length END
            WITH d
            UNWIND $children AS child
            MERGE (c:DataType {hash: child.hash}) // Assuming 'hash' is unique for child nodes too
            ON CREATE SET
                c.type = CASE WHEN child.type IS NOT NULL THEN child.type END,
                c.name = CASE WHEN child.name IS NOT NULL THEN child.name END,
                c.array_counter = CASE WHEN child.array_counter IS NOT NULL THEN child.array_counter END,
                c.bit_position = CASE WHEN child.bit_position IS NOT NULL THEN child.bit_position END,
                c.bit_length = CASE WHEN child.bit_length IS NOT NULL THEN child.bit_length END
            MERGE (d)-[:HAS_DATA_TYPE]->(c)
        """
        # Only the child nodes are needed; the mapping keys are not used.
        children = [self._data_type_properties(child) for child in node.children.values()]
        params = self._data_type_properties(node)
        params["children"] = children
        cypher_query_with_backoff(query, params)
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Pure functions for symbols plugin business logic."""
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
from pathlib import PurePath
|
|
5
|
+
from typing import Dict, List, Tuple
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def filter_valid_filenames(
    blob_results: List[Tuple[PurePath, str]], allowed_filenames: List[str]
) -> List[Tuple[PurePath, str]]:
    """Keep only the blob results whose file name is in the allow-list.

    This is a pure function with no side effects. Matching is an exact,
    case-sensitive comparison against ``path.name``.

    Args:
        blob_results: List of (path, blob_hash) tuples
        allowed_filenames: List of allowed filenames to keep

    Returns:
        Filtered list of (path, blob_hash) tuples, in input order
    """
    # Build the lookup once so each membership test is O(1) instead of
    # scanning the allow-list for every blob.
    allowed = frozenset(allowed_filenames)
    return [(path, blob_hash) for path, blob_hash in blob_results if path.name in allowed]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def parse_symbols_from_json(symbols_dict: Dict) -> List[Dict[str, str]]:
    """Turn a PDB symbol mapping into a name-ordered list of symbol records.

    This is a pure function with no side effects.

    Symbols whose name starts with '?' or '$' (mangled/compiler-generated)
    are dropped. The ``hash`` of each record is the SHA-1 hex digest of the
    stringified address.

    Args:
        symbols_dict: Dictionary mapping symbol names to symbol data; each
            value must provide an ``"address"`` entry

    Returns:
        List of symbol dictionaries with name, address, and hash fields
    """
    kept_names = (name for name in sorted(symbols_dict) if not name.startswith(("?", "$")))

    records = []
    for name in kept_names:
        address_text = str(symbols_dict[name]["address"])
        digest = hashlib.sha1(address_text.encode()).hexdigest()
        records.append({"name": name, "address": address_text, "hash": digest})
    return records
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
"""Linux syscall extraction plugin using kernel filesystem analysis and Git repository."""
|
|
2
|
+
|
|
3
|
+
from typing import Dict, List
|
|
4
|
+
|
|
5
|
+
import appdirs
|
|
6
|
+
from attrs import define
|
|
7
|
+
from neogit.model.neo import Commit
|
|
8
|
+
|
|
9
|
+
from plugins.syscalls.exceptions import KernelVersionNotFoundError, PreKernel2011Error, SyscallFileNotFoundError
|
|
10
|
+
from plugins.syscalls.filesystem import KernelInfo, find_kernel_versions, get_boot_directory
|
|
11
|
+
from plugins.syscalls.kernel_repo_manager import ensure_kernel_repo, get_syscall_files
|
|
12
|
+
from plugins.syscalls.nodes import SyscallsMerkleVisitor, SyscallTableNode
|
|
13
|
+
from plugins.syscalls.syscall_table_parser import parse_syscall_table_line
|
|
14
|
+
from plugins.syscalls.syscalls_h_parser import parse_syscall_signature
|
|
15
|
+
from plugins.types import AbstractPlugin
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@define(auto_attribs=True)
class SyscallsPlugin(AbstractPlugin):
    """Extract Linux syscall tables for the kernels found in a commit's /boot.

    Steps performed by :meth:`run`:
    1. Take the first filesystem tree attached to the commit
    2. Locate its /boot directory
    3. Derive kernel versions from the vmlinuz files found there
    4. Read the syscall table and header from the Linux kernel Git repository
    5. Wrap the result in SyscallTableNode objects and visit them with
       SyscallsMerkleVisitor
    """

    def run(self, commit: Commit):
        """Run the whole extraction pipeline for one commit.

        Returns early (after logging) when the commit has no filesystem, no
        /boot directory, or no recognised kernel file.

        Args:
            commit: The Commit node to analyze
        """
        self.logger.info(f"Running syscall plugin for commit {commit.hash}")

        # The first filesystem entry is used as the root tree.
        try:
            root_tree = commit.filesystem[0]
        except IndexError:
            self.logger.warning(f"No filesystem found for commit {commit.hash}")
            return

        boot_tree = get_boot_directory(root_tree)
        if not boot_tree:
            self.logger.info("No /boot directory found, skipping syscall extraction")
            return

        kernel_info_list = find_kernel_versions(boot_tree, self)
        if not kernel_info_list:
            self.logger.info("No kernel files found in /boot")
            return

        kernel_summary = ", ".join(f"{k.filename} ({k.architecture})" for k in kernel_info_list)
        self.logger.info(f"Found {len(kernel_info_list)} kernel(s): {kernel_summary}")

        # Fetch the raw syscall data, then turn it into Merkle nodes.
        self._transform_and_visit(kernel_info_list, self._extract_syscalls_from_repo(kernel_info_list))

        self.logger.info(f"Syscall extraction complete for commit {commit.hash}")

    def _extract_syscalls_from_repo(self, kernel_info_list: List[KernelInfo]) -> Dict[str, List[dict]]:
        """Collect syscall data for every kernel from the cached kernel repository.

        A failure for one version is logged and does not stop the remaining
        versions from being processed.

        Args:
            kernel_info_list: List of kernel information to extract

        Returns:
            Dictionary mapping kernel version to list of syscall data; versions
            that failed or yielded no syscalls are absent
        """
        cache_dir = appdirs.user_cache_dir("oswatcher-plugins")
        self.logger.info(f"Cloning/updating Linux kernel repository to {cache_dir}")
        repo = ensure_kernel_repo(cache_dir)

        by_version = {}
        for kernel_info in kernel_info_list:
            version = kernel_info.version
            try:
                extracted = self._extract_version_syscalls(repo, version)
                if extracted:
                    by_version[version] = extracted
                    self.logger.info(
                        f"Extracted {len(extracted)} syscalls for {version} ({kernel_info.architecture})"
                    )
            except PreKernel2011Error as err:
                self.logger.warning(f"Skipping {version}: {err}")
            except KernelVersionNotFoundError as err:
                self.logger.warning(f"Version {version} not found in repository: {err}")
            except SyscallFileNotFoundError as err:
                self.logger.warning(f"Syscall files not found for {version}: {err}")
            except Exception as err:
                # Catch-all so one broken version cannot abort the whole run.
                self.logger.error(f"Failed to extract syscalls for {version}: {err}")

        return by_version

    def _extract_version_syscalls(self, repo, version: str) -> List[dict]:
        """Build the syscall list for one kernel version.

        Each table line that parses becomes a dict with ``name``, ``index``,
        ``entry_point`` (``sys_<name>``) and ``parameters`` (``None`` when no
        signature is found in the header).

        Args:
            repo: Git repository object
            version: Kernel version like 'v5.15'

        Returns:
            List of dictionaries with syscall information
        """
        table_content, header_content = get_syscall_files(repo, version)

        collected = []
        for line in table_content.splitlines():
            table_entry = parse_syscall_table_line(line)
            if not table_entry:
                # Line did not yield a syscall entry.
                continue

            entry_name = f"sys_{table_entry.name}"
            signature = parse_syscall_signature(header_content, entry_name)
            if not signature:
                self.logger.debug(f"No signature found for {entry_name}")

            collected.append(
                {
                    "name": table_entry.name,
                    "index": table_entry.index,
                    "entry_point": entry_name,
                    "parameters": signature.parameters if signature else None,
                }
            )

        return collected

    def _transform_and_visit(self, kernel_info_list: List[KernelInfo], syscall_data: Dict[str, List[dict]]):
        """Create a SyscallTableNode per kernel and run the Merkle visitor on it.

        Kernels without an entry in ``syscall_data`` are skipped.

        Args:
            kernel_info_list: List of kernel information
            syscall_data: Dictionary mapping kernel version to syscall list
        """
        if not syscall_data:
            self.logger.info("No syscall data to transform")
            return

        with SyscallsMerkleVisitor(thread=True) as visitor:
            for kernel_info in kernel_info_list:
                version_syscalls = syscall_data.get(kernel_info.version)
                if not version_syscalls:
                    continue

                table_node = SyscallTableNode(architecture=kernel_info.architecture, syscalls=version_syscalls)

                self.logger.info(
                    f"Creating SyscallTableNode for {kernel_info.version} "
                    f"({kernel_info.architecture}) with {len(version_syscalls)} syscalls"
                )
                visitor.run_visit(table_node)

                # The visitor yields every visited node; only those exposing an
                # ``architecture`` attribute (the table-level nodes) are logged.
                for visited in visitor.as_gen():
                    merkle_node = visited.return_value
                    if hasattr(merkle_node, "architecture"):
                        self.logger.info(
                            f"Created SyscallTableMerkleNode: hash={merkle_node.hash[:8]}... "
                            f"arch={merkle_node.architecture} children={len(merkle_node.children)}"
                        )
                        # TODO: Insert into Neo4j (next phase)
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Syscall extraction specific exceptions."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class SyscallExtractionError(Exception):
    """Common base class for every error raised during syscall extraction."""


class KernelVersionNotFoundError(SyscallExtractionError):
    """The requested kernel version is not present in the repository."""


class PreKernel2011Error(SyscallExtractionError):
    """The kernel is older than the 2011 syscall table format (unsupported)."""


class SyscallFileNotFoundError(SyscallExtractionError):
    """The syscall files required for this kernel version are missing."""
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""Filesystem navigation utilities for Linux kernel analysis."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import PurePath
|
|
5
|
+
from typing import TYPE_CHECKING, List, Optional
|
|
6
|
+
|
|
7
|
+
import lief
|
|
8
|
+
from neogit.model.merkle import Blob, Tree
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from plugins.types import AbstractPlugin
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass(frozen=True)
class KernelInfo:
    """Immutable description of one kernel found in the /boot directory.

    Attributes:
        version: Kernel version, e.g. "v5.15"
        blob_hash: Neo4j Blob hash of the kernel file
        filename: Name of the kernel file, e.g. "vmlinuz-5.15.0-91-generic"
        architecture: Architecture string taken from a lief enum,
            e.g. "x86_64", "AARCH64"
    """

    version: str
    blob_hash: str
    filename: str
    architecture: str
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def detect_kernel_arch(vmlinuz_path: str) -> str:
    """Return the architecture of a kernel image as reported by lief.

    Two formats are recognised: raw ELF kernels (vmlinux) and compressed
    vmlinuz files that carry an EFI boot stub (PE header).

    Args:
        vmlinuz_path: Path to vmlinuz file (local filesystem)

    Returns:
        ``str()`` of the lief enum value read from the binary header
        (e.g., "ARCH.x86_64", "MACHINE_TYPES.AMD64")

    Raises:
        ValueError: If file cannot be parsed or architecture cannot be determined
    """
    parsed = lief.parse(vmlinuz_path)
    if parsed is None:
        raise ValueError(f"Failed to parse {vmlinuz_path}")

    if isinstance(parsed, lief.ELF.Binary):
        # Raw ELF kernel: the machine lives in the ELF header.
        machine = parsed.header.machine_type
    elif isinstance(parsed, lief.PE.Binary):
        # EFI boot stub: the machine lives in the PE header.
        machine = parsed.header.machine
    else:
        raise ValueError(f"Unsupported binary format: {vmlinuz_path}")

    return str(machine)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def get_boot_directory(root: Tree) -> Optional[Tree]:
    """Look up the /boot directory below a root filesystem tree.

    Args:
        root: Root filesystem tree

    Returns:
        The Tree found at /boot, or None when the lookup raises
        FileNotFoundError or the entry found there is not a Tree
    """
    try:
        candidate = root.get_child_at_path(PurePath("/boot"))
    except FileNotFoundError:
        return None
    return candidate if isinstance(candidate, Tree) else None
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def find_kernel_versions(boot: Tree, plugin: "AbstractPlugin") -> List[KernelInfo]:
    """Collect one KernelInfo per kernel version found in /boot.

    Only Blob children whose name starts with "vmlinuz-" are considered. Each
    candidate is downloaded through the plugin so its architecture can be
    detected; the first file seen for a version wins.

    Args:
        boot: /boot directory tree
        plugin: Plugin instance (for downloading blobs and logging)

    Returns:
        List of unique kernel information (version, hash, filename,
        architecture), ordered by the version string
    """
    from plugins.syscalls.kernel_parser import parse_kernel_version

    found = []
    known_versions = set()

    for entry_name, entry in boot.iter_children():
        is_kernel_image = isinstance(entry, Blob) and entry_name.startswith("vmlinuz-")
        if not is_kernel_image:
            continue

        try:
            version = parse_kernel_version(entry_name)

            # Several builds of one version may coexist; keep the first only.
            if version in known_versions:
                continue

            with plugin.downloaded_file(entry.hash) as local_path:
                try:
                    detected_arch = detect_kernel_arch(local_path)
                except ValueError as err:
                    plugin.logger.warning(f"Failed to detect architecture for {entry_name}: {err}")
                    continue

                found.append(
                    KernelInfo(
                        version=version,
                        blob_hash=entry.hash,
                        filename=entry_name,
                        architecture=detected_arch,
                    )
                )
                known_versions.add(version)

        except ValueError:
            # File name does not parse as a kernel version: ignore the file.
            continue

    # Plain string ordering (so "v5.15" sorts before "v5.4"); it is used to
    # make the result deterministic.
    found.sort(key=lambda info: info.version)
    return found
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""Kernel version and syscall parsing from boot filenames and headers."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
|
|
6
|
+
# Compiled regex pattern for kernel filename parsing
|
|
7
|
+
# Pattern: vmlinuz-{major}.{minor}.{patch}-{build}-{flavor}
|
|
8
|
+
# Only the major and minor numbers are captured. The literal "\." after the
# minor group means a name matches only if a dot follows the minor number;
# everything after that dot is accepted unchecked.
KERNEL_VERSION_PATTERN = re.compile(r"^vmlinuz-(\d+)\.(\d+)\..*")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass(frozen=True)
class SyscallIndex:
    """Immutable pairing of a syscall name with its index.

    Attributes:
        name: Syscall name
        index: Syscall index
    """

    name: str
    index: int
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def parse_kernel_version(filename: str) -> str:
    """Derive a ``v<major>.<minor>`` version from a vmlinuz filename.

    Args:
        filename: Boot filename like 'vmlinuz-5.15.0-91-generic'

    Returns:
        Kernel version like 'v5.15'

    Raises:
        ValueError: If the filename does not match KERNEL_VERSION_PATTERN
    """
    parsed = KERNEL_VERSION_PATTERN.match(filename)
    if parsed is None:
        raise ValueError(f"Invalid kernel filename format: {filename}")

    # Group 1 is the major number, group 2 the minor number.
    return f"v{parsed.group(1)}.{parsed.group(2)}"
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Kernel repository management using git show for blob extraction."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import git
|
|
6
|
+
from git.exc import GitCommandError
|
|
7
|
+
|
|
8
|
+
from .exceptions import KernelVersionNotFoundError, PreKernel2011Error, SyscallFileNotFoundError
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def ensure_kernel_repo(cache_dir: str) -> git.Repo:
    """Open the cached Linux kernel clone, creating it on first use.

    The clone lives in ``<cache_dir>/linux``. Tags are fetched from ``origin``
    on every call, whether the repository was just cloned or already existed.

    Args:
        cache_dir: Cache directory path

    Returns:
        Git repository object
    """
    checkout = Path(cache_dir) / "linux"
    already_cloned = checkout.exists() and (checkout / ".git").exists()

    if already_cloned:
        repo = git.Repo(checkout)
    else:
        repo = git.Repo.clone_from("https://github.com/torvalds/linux.git", checkout)

    # Fetch tags in both cases: an existing clone may lack newer tags, and a
    # fresh clone doesn't fetch tags by default in some Git versions.
    repo.remotes.origin.fetch(tags=True)
    return repo
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def get_file_content(repo: git.Repo, version: str, file_path: str) -> str:
    """Return the content of a file as stored at a given version.

    Runs ``git show <version>:<file_path>`` and translates the failure
    messages it recognises into this package's exceptions.

    Args:
        repo: Git repository object
        version: Git tag/commit like 'v5.15'
        file_path: File path relative to repo root

    Returns:
        File content as string

    Raises:
        KernelVersionNotFoundError: If kernel version doesn't exist
        SyscallFileNotFoundError: If file doesn't exist at that version
        GitCommandError: Any other git failure, re-raised unchanged
    """
    try:
        return repo.git.show(f"{version}:{file_path}")  # noqa: E231
    except GitCommandError as e:
        # Prefer git's own output over str(e): the full message also embeds the
        # command line, which always contains `version` and `file_path` and
        # would make a "does the message mention the version?" test always true.
        detail = str(getattr(e, "stderr", "") or e).lower()

        # Unknown revision. "invalid object name" is what git reports for an
        # unknown <rev> in the <rev>:<path> form.
        revision_markers = ("bad revision", "unknown revision", "invalid object name")
        if any(marker in detail for marker in revision_markers):
            raise KernelVersionNotFoundError(f"Kernel version {version} not found in repository") from e

        # Known revision, missing path:
        #   "path 'X' does not exist in 'REV'"
        #   "path 'X' exists on disk, but not in 'REV'"
        if "path" in detail and ("not in" in detail or "does not exist in" in detail):
            raise SyscallFileNotFoundError(f"File {file_path} not found in version {version}") from e

        # Any other "does not exist" message: decide by what git itself named.
        if "does not exist" in detail:
            if version.lower() in detail:
                raise KernelVersionNotFoundError(f"Kernel version {version} not found in repository") from e
            raise SyscallFileNotFoundError(f"File {file_path} not found in version {version}") from e

        # Not a failure we know how to classify.
        raise
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def get_syscall_files(repo: git.Repo, version: str) -> tuple[str, str]:
    """Get syscall table and header file contents for a kernel version.

    Only the x86-64 table (``syscall_64.tbl``) is read. The table is looked up
    at its current location first and then at the location used before the
    ``arch/x86/entry/`` directory existed.

    Args:
        repo: Git repository object
        version: Kernel version like 'v5.15'

    Returns:
        Tuple of (table_content, header_content)

    Raises:
        PreKernel2011Error: If no table file exists at any known location but
            the header does (kernel predates the 2011 syscall table format)
        KernelVersionNotFoundError: If kernel version doesn't exist
        SyscallFileNotFoundError: If the table exists but the header doesn't
    """
    table_paths = (
        "arch/x86/entry/syscalls/syscall_64.tbl",  # current location
        "arch/x86/syscalls/syscall_64.tbl",  # location before arch/x86/entry/ was introduced
    )
    header_path = "include/linux/syscalls.h"

    table_content = None
    for table_path in table_paths:
        try:
            table_content = get_file_content(repo, version, table_path)
        except SyscallFileNotFoundError:
            # Not at this location; try the next candidate.
            continue
        break

    if table_content is None:
        # No .tbl file anywhere. Probe the header to tell a kernel that
        # predates the table format apart from a version that is not there.
        try:
            get_file_content(repo, version, header_path)
        except (SyscallFileNotFoundError, KernelVersionNotFoundError) as e:
            raise KernelVersionNotFoundError(f"Kernel version {version} not found") from e
        raise PreKernel2011Error(f"Kernel {version} predates 2011 syscall table format")

    # A missing header propagates as SyscallFileNotFoundError.
    header_content = get_file_content(repo, version, header_path)
    return table_content, header_content
|