malwi 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- malwi-0.0.1/MANIFEST.in +2 -0
- malwi-0.0.1/PKG-INFO +97 -0
- malwi-0.0.1/README.md +67 -0
- malwi-0.0.1/pyproject.toml +52 -0
- malwi-0.0.1/setup.cfg +4 -0
- malwi-0.0.1/setup.py +36 -0
- malwi-0.0.1/src/cli/__init__.py +0 -0
- malwi-0.0.1/src/cli/entry.py +128 -0
- malwi-0.0.1/src/cli/predict.py +99 -0
- malwi-0.0.1/src/common/__init__.py +0 -0
- malwi-0.0.1/src/common/files.py +20 -0
- malwi-0.0.1/src/malwi.egg-info/PKG-INFO +97 -0
- malwi-0.0.1/src/malwi.egg-info/SOURCES.txt +31 -0
- malwi-0.0.1/src/malwi.egg-info/dependency_links.txt +1 -0
- malwi-0.0.1/src/malwi.egg-info/entry_points.txt +2 -0
- malwi-0.0.1/src/malwi.egg-info/requires.txt +16 -0
- malwi-0.0.1/src/malwi.egg-info/top_level.txt +3 -0
- malwi-0.0.1/src/research/__init__.py +0 -0
- malwi-0.0.1/src/research/analyze_data.py +274 -0
- malwi-0.0.1/src/research/download_data.py +358 -0
- malwi-0.0.1/src/research/filter_data.py +136 -0
- malwi-0.0.1/src/research/normalize_data.py +679 -0
- malwi-0.0.1/src/research/syntax_mapping/__init__.py +0 -0
- malwi-0.0.1/src/research/syntax_mapping/compression_mapping.json +38 -0
- malwi-0.0.1/src/research/syntax_mapping/function_mapping.json +826 -0
- malwi-0.0.1/src/research/syntax_mapping/import_mapping.json +189 -0
- malwi-0.0.1/src/research/syntax_mapping/node_mapping.json +239 -0
- malwi-0.0.1/src/research/syntax_mapping/node_targets.json +6 -0
- malwi-0.0.1/src/research/syntax_mapping/sensitive_files.json +28 -0
- malwi-0.0.1/src/research/syntax_mapping/special_tokens.json +3559 -0
- malwi-0.0.1/src/research/syntax_mapping/target_files.json +4 -0
- malwi-0.0.1/src/research/train.py +677 -0
- malwi-0.0.1/tests/test_map_ast.py +279 -0
malwi-0.0.1/MANIFEST.in
ADDED
malwi-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: malwi
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: malwi - AI Python Malware Scanner
|
|
5
|
+
Home-page: https://github.com/schirrmacher/malwi
|
|
6
|
+
Author: Marvin Schirrmacher
|
|
7
|
+
Author-email: Marvin Schirrmacher <m@schirrmacher.io>
|
|
8
|
+
License: MIT
|
|
9
|
+
Requires-Python: >=3.7
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: accelerate>=1.6.0
|
|
12
|
+
Requires-Dist: datasets>=3.6.0
|
|
13
|
+
Requires-Dist: nltk>=3.9.1
|
|
14
|
+
Requires-Dist: pandas>=2.2.3
|
|
15
|
+
Requires-Dist: pytest>=8.3.5
|
|
16
|
+
Requires-Dist: scikit-learn>=1.6.1
|
|
17
|
+
Requires-Dist: tokenizers>=0.21.1
|
|
18
|
+
Requires-Dist: torch>=2.7.0
|
|
19
|
+
Requires-Dist: tqdm>=4.67.1
|
|
20
|
+
Requires-Dist: transformers>=4.51.3
|
|
21
|
+
Requires-Dist: tree-sitter>=0.24.0
|
|
22
|
+
Requires-Dist: tree-sitter-javascript>=0.23.1
|
|
23
|
+
Requires-Dist: tree-sitter-languages>=1.10.2
|
|
24
|
+
Requires-Dist: tree-sitter-python>=0.23.6
|
|
25
|
+
Requires-Dist: tree-sitter-rust>=0.24.0
|
|
26
|
+
Requires-Dist: tree-sitter-typescript>=0.23.2
|
|
27
|
+
Dynamic: author
|
|
28
|
+
Dynamic: home-page
|
|
29
|
+
Dynamic: requires-python
|
|
30
|
+
|
|
31
|
+
# malwi - AI Python Malware Scanner
|
|
32
|
+
|
|
33
|
+
<img src="malwi-logo.png" alt="Logo">
|
|
34
|
+
|
|
35
|
+
Detect Python malware _fast_ - no internet, no expensive hardware, no fees.
|
|
36
|
+
|
|
37
|
+
malwi specializes in detecting **zero-day vulnerabilities** by classifying code as safe or harmful.
|
|
38
|
+
|
|
39
|
+
Open-source software made in Europe.
|
|
40
|
+
Based on open research, open code, open data.
|
|
41
|
+
🇪🇺🤘🕊️
|
|
42
|
+
|
|
43
|
+
## Why malwi?
|
|
44
|
+
|
|
45
|
+
[The number of _malicious open-source packages_ is growing](https://arxiv.org/pdf/2404.04991). This is not just a threat to your business but also to the open-source community.
|
|
46
|
+
|
|
47
|
+
Typical malware behaviors include:
|
|
48
|
+
|
|
49
|
+
- _Exfiltration_ of data: Stealing credentials, API keys, or sensitive user data.
|
|
50
|
+
- _Backdoors_: Allowing remote attackers to gain unauthorized access to your system.
|
|
51
|
+
- _Destructive_ actions: Deleting files, corrupting databases, or sabotaging applications.
|
|
52
|
+
|
|
53
|
+
> **Attention**: Malicious packages might execute code during installation (e.g. through `setup.py`).
|
|
54
|
+
Make sure to *NOT* download or install malicious packages from the dataset with commands like `uv add`, `pip install`, `poetry add`.
|
|
55
|
+
|
|
56
|
+
## What's next?
|
|
57
|
+
|
|
58
|
+
The first iteration focuses on **maliciousness of Python source code**.
|
|
59
|
+
|
|
60
|
+
Future iterations will cover malware scanning for more languages (JavaScript, Rust, Go) and more formats (binaries, logs).
|
|
61
|
+
|
|
62
|
+
## How does it work?
|
|
63
|
+
|
|
64
|
+
malwi applies [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert) and Support Vector Machines (SVM) based on the design of [_Zero Day Malware Detection with Alpha: Fast DBI with Transformer Models for Real World Application_ (2025)](https://arxiv.org/pdf/2504.14886v1).
|
|
65
|
+
Additionally, malwi applies [Tree-sitter](https://tree-sitter.github.io/tree-sitter/) to create Abstract Syntax Trees (ASTs), which are mapped to a unified, security-sensitive syntax used as training input. The Python malware dataset can be found [here](https://github.com/lxyeternal/pypi_malregistry). After 3 epochs of training you will get: Loss: `0.0986`, Accuracy: `0.9669`, F1: `0.9666`.
|
|
66
|
+
|
|
67
|
+
High-level training pipeline:
|
|
68
|
+
|
|
69
|
+
- Create dataset from malicious/benign repositories and map code to malwi syntax
|
|
70
|
+
- Remove code duplications based on hashes
|
|
71
|
+
- Train DistilBert based on the malwi samples for categorizing malicious/benign
|
|
72
|
+
|
|
73
|
+
## Support
|
|
74
|
+
|
|
75
|
+
Do you have access to malicious Rust, Go, whatever packages? **Contact me.**
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
### Develop
|
|
79
|
+
|
|
80
|
+
Prerequisites: [uv](https://docs.astral.sh/uv/)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
```
|
|
84
|
+
# Download and process data
|
|
85
|
+
cmds/download_and_preprocess.sh
|
|
86
|
+
|
|
87
|
+
# Only process data
|
|
88
|
+
cmds/preprocess.sh
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
```
|
|
92
|
+
# Preprocess then start training
|
|
93
|
+
cmds/preprocess_and_train.sh
|
|
94
|
+
|
|
95
|
+
# Only start training
|
|
96
|
+
cmds/train.sh
|
|
97
|
+
```
|
malwi-0.0.1/README.md
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# malwi - AI Python Malware Scanner
|
|
2
|
+
|
|
3
|
+
<img src="malwi-logo.png" alt="Logo">
|
|
4
|
+
|
|
5
|
+
Detect Python malware _fast_ - no internet, no expensive hardware, no fees.
|
|
6
|
+
|
|
7
|
+
malwi specializes in detecting **zero-day vulnerabilities** by classifying code as safe or harmful.
|
|
8
|
+
|
|
9
|
+
Open-source software made in Europe.
|
|
10
|
+
Based on open research, open code, open data.
|
|
11
|
+
🇪🇺🤘🕊️
|
|
12
|
+
|
|
13
|
+
## Why malwi?
|
|
14
|
+
|
|
15
|
+
[The number of _malicious open-source packages_ is growing](https://arxiv.org/pdf/2404.04991). This is not just a threat to your business but also to the open-source community.
|
|
16
|
+
|
|
17
|
+
Typical malware behaviors include:
|
|
18
|
+
|
|
19
|
+
- _Exfiltration_ of data: Stealing credentials, API keys, or sensitive user data.
|
|
20
|
+
- _Backdoors_: Allowing remote attackers to gain unauthorized access to your system.
|
|
21
|
+
- _Destructive_ actions: Deleting files, corrupting databases, or sabotaging applications.
|
|
22
|
+
|
|
23
|
+
> **Attention**: Malicious packages might execute code during installation (e.g. through `setup.py`).
|
|
24
|
+
Make sure to *NOT* download or install malicious packages from the dataset with commands like `uv add`, `pip install`, `poetry add`.
|
|
25
|
+
|
|
26
|
+
## What's next?
|
|
27
|
+
|
|
28
|
+
The first iteration focuses on **maliciousness of Python source code**.
|
|
29
|
+
|
|
30
|
+
Future iterations will cover malware scanning for more languages (JavaScript, Rust, Go) and more formats (binaries, logs).
|
|
31
|
+
|
|
32
|
+
## How does it work?
|
|
33
|
+
|
|
34
|
+
malwi applies [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert) and Support Vector Machines (SVM) based on the design of [_Zero Day Malware Detection with Alpha: Fast DBI with Transformer Models for Real World Application_ (2025)](https://arxiv.org/pdf/2504.14886v1).
|
|
35
|
+
Additionally, malwi applies [Tree-sitter](https://tree-sitter.github.io/tree-sitter/) to create Abstract Syntax Trees (ASTs), which are mapped to a unified, security-sensitive syntax used as training input. The Python malware dataset can be found [here](https://github.com/lxyeternal/pypi_malregistry). After 3 epochs of training you will get: Loss: `0.0986`, Accuracy: `0.9669`, F1: `0.9666`.
|
|
36
|
+
|
|
37
|
+
High-level training pipeline:
|
|
38
|
+
|
|
39
|
+
- Create dataset from malicious/benign repositories and map code to malwi syntax
|
|
40
|
+
- Remove code duplications based on hashes
|
|
41
|
+
- Train DistilBert based on the malwi samples for categorizing malicious/benign
|
|
42
|
+
|
|
43
|
+
## Support
|
|
44
|
+
|
|
45
|
+
Do you have access to malicious Rust, Go, whatever packages? **Contact me.**
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
### Develop
|
|
49
|
+
|
|
50
|
+
Prerequisites: [uv](https://docs.astral.sh/uv/)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
# Download and process data
|
|
55
|
+
cmds/download_and_preprocess.sh
|
|
56
|
+
|
|
57
|
+
# Only process data
|
|
58
|
+
cmds/preprocess.sh
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
```
|
|
62
|
+
# Preprocess then start training
|
|
63
|
+
cmds/preprocess_and_train.sh
|
|
64
|
+
|
|
65
|
+
# Only start training
|
|
66
|
+
cmds/train.sh
|
|
67
|
+
```
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=80.4.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "malwi"
|
|
7
|
+
version = "0.0.1"
|
|
8
|
+
description = "malwi - AI Python Malware Scanner"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.12"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
|
|
13
|
+
authors = [
|
|
14
|
+
{ name = "Marvin Schirrmacher", email = "m@schirrmacher.io" },
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
dependencies = [
|
|
18
|
+
"accelerate>=1.6.0",
|
|
19
|
+
"datasets>=3.6.0",
|
|
20
|
+
"nltk>=3.9.1",
|
|
21
|
+
"pandas>=2.2.3",
|
|
22
|
+
"pytest>=8.3.5",
|
|
23
|
+
"scikit-learn>=1.6.1",
|
|
24
|
+
"tokenizers>=0.21.1",
|
|
25
|
+
"torch>=2.7.0",
|
|
26
|
+
"tqdm>=4.67.1",
|
|
27
|
+
"transformers>=4.51.3",
|
|
28
|
+
"tree-sitter>=0.24.0",
|
|
29
|
+
"tree-sitter-javascript>=0.23.1",
|
|
30
|
+
"tree-sitter-languages>=1.10.2",
|
|
31
|
+
"tree-sitter-python>=0.23.6",
|
|
32
|
+
"tree-sitter-rust>=0.24.0",
|
|
33
|
+
"tree-sitter-typescript>=0.23.2",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[project.scripts]
|
|
37
|
+
malwi = "cli.entry:main"
|
|
38
|
+
|
|
39
|
+
[tool.setuptools]
|
|
40
|
+
package-dir = { "" = "src" }
|
|
41
|
+
|
|
42
|
+
[tool.setuptools.packages.find]
|
|
43
|
+
where = ["src"]
|
|
44
|
+
|
|
45
|
+
[tool.setuptools.package-data]
|
|
46
|
+
"research.syntax_mapping" = ["*.json"]
|
|
47
|
+
|
|
48
|
+
[dependency-groups]
|
|
49
|
+
dev = [
|
|
50
|
+
"pytest>=8.3.5",
|
|
51
|
+
"ruff>=0.11.7",
|
|
52
|
+
]
|
malwi-0.0.1/setup.cfg
ADDED
malwi-0.0.1/setup.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from setuptools import setup, find_packages
|
|
2
|
+
|
|
3
|
+
# Read the long description up front so the file handle is closed
# deterministically and decoding does not depend on the platform locale.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

setup(
    name="malwi",
    version="0.0.1",
    author="Marvin Schirrmacher",
    author_email="m@schirrmacher.io",
    description="malwi - AI Python Malware Scanner",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/schirrmacher/malwi",
    packages=find_packages(where="src"),
    package_dir={"": "src"},
    include_package_data=True,
    # JSON mapping tables that must be shipped next to the Python modules.
    package_data={
        "research.syntax_mapping": [
            "compression_mapping.json",
            "function_mapping.json",
            "import_mapping.json",
            "node_mapping.json",
            "node_targets.json",
            "sensitive_files.json",
            "special_tokens.json",
            "target_files.json",
        ]
    },
    install_requires=[
        # Add any dependencies here
    ],
    # Kept in sync with `requires-python` in pyproject.toml.
    python_requires=">=3.12",
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
)
|
|
File without changes
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import argparse
|
|
3
|
+
from typing import List
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from src.research.normalize_data import MalwiNode, create_malwi_nodes_from_file
|
|
7
|
+
from src.cli.predict import initialize_hf_model_components, get_node_text_prediction
|
|
8
|
+
|
|
9
|
+
# Configure the root logger at import time: bare messages at INFO level.
logging.basicConfig(format="%(message)s", level=logging.INFO)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def process_source_path(
    input_path: str,
) -> List[MalwiNode]:
    """Collect the nodes produced for a single file or for a directory tree.

    Args:
        input_path: Path to a file or a directory. Directories are walked
            recursively and every regular file is handed to
            ``create_malwi_nodes_from_file``.

    Returns:
        All nodes returned by ``create_malwi_nodes_from_file`` for the visited
        files, in traversal order. The list is empty when nothing was
        produced or when ``input_path`` is neither a file nor a directory.
    """
    # Exact extension match. A substring test would wrongly accept an empty
    # suffix or partial ones such as "p" or "j".
    supported_extensions = {"js", "ts", "rs", "py"}

    path_obj = Path(input_path)
    all_nodes: List[MalwiNode] = []

    if path_obj.is_file():
        nodes = create_malwi_nodes_from_file(file_path=str(path_obj))
        if nodes:
            all_nodes.extend(nodes)
        elif path_obj.suffix.lstrip(".") not in supported_extensions:
            logging.info(f"File '{input_path}' is not a supported file type.")
        else:
            logging.info(
                f"No processable AST nodes found in '{input_path}' or relevant targets missing/empty in NODE_TARGETS for its language."
            )

    elif path_obj.is_dir():
        logging.info(f"Processing directory: {input_path}")
        # Becomes True as soon as any file in the tree yields at least one node.
        processed_files_in_dir = False
        for file_path in path_obj.rglob("*"):  # rglob walks the tree recursively
            if not file_path.is_file():
                continue
            nodes = create_malwi_nodes_from_file(file_path=str(file_path))
            if nodes:
                all_nodes.extend(nodes)
                processed_files_in_dir = True
        if not processed_files_in_dir:
            logging.info(f"No processable files found in directory '{input_path}'.")
    else:
        logging.error(f"Path '{input_path}' is neither a file nor a directory.")

    return all_nodes
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def main():
    """Run the command-line scanner.

    Parses the CLI arguments, loads the model and tokenizer, collects the
    nodes for the given path and prints one line for every node whose
    malicious probability exceeds 0.5. Prediction failures are logged and do
    not abort the scan.
    """
    parser = argparse.ArgumentParser(description="malwi - AI Python Malware Scanner")
    parser.add_argument(
        "path", metavar="PATH", help="Specify the package file or folder path."
    )
    parser.add_argument(
        "--debug",
        "-d",
        action="store_true",
        help="Print the model input before prediction.",
    )
    parser.add_argument(
        "--tokenizer-path",
        "-t",
        metavar="PATH",
        help="Specify the custom tokenizer path (directory or file).",
        default=None,
    )
    parser.add_argument(
        "--model-path",
        "-m",
        metavar="PATH",
        help="Specify the custom model path (directory or file).",
        default=None,
    )

    args = parser.parse_args()

    # Fallback configuration for the case that no handler is installed yet.
    if not logging.getLogger().hasHandlers():
        logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

    logging.info(
        """
                  __          __
  .--------.---.-|  .--.--.--|__|
  |        |  _  |  |  |  |  |  |
  |__|__|__|___._|__|________|__|
     AI Python Malware Scanner\n\n"""
    )

    # argparse already rejects a missing PATH; this catches an empty string.
    if not args.path:
        parser.print_help()
        return

    initialize_hf_model_components(
        model_path=args.model_path, tokenizer_path=args.tokenizer_path
    )

    all_collected_nodes = process_source_path(
        input_path=args.path,
    )

    if not all_collected_nodes:
        logging.info(
            f"No processable AST nodes found for the given path: '{args.path}'."
        )
        return

    for n in all_collected_nodes:
        node_ast_one_line = n.to_string()

        if args.debug:
            print(f"\nInput:\n{n.file_path}\n\n{node_ast_one_line}\n\n")

        prediction_data = get_node_text_prediction(node_ast_one_line)

        if prediction_data["status"] != "success":
            logging.error(
                f"Prediction error for node in {n.file_path}: {prediction_data['message']}"
            )
            continue

        # Index 1 holds the probability of the "Malicious" class.
        maliciousness = prediction_data["probabilities"][1]
        if maliciousness > 0.5:
            print(f"{n.file_path}: 🛑 malicious {maliciousness:.2f}")
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
# Run the CLI only when this module is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Dict, Any, Optional
|
|
3
|
+
|
|
4
|
+
import torch
|
|
5
|
+
import torch.nn.functional as F
|
|
6
|
+
from transformers import AutoTokenizer, DistilBertForSequenceClassification
|
|
7
|
+
|
|
8
|
+
# Default identifiers handed to `from_pretrained` when no custom path is given.
HF_TOKENIZER_NAME = "schirrmacher/malwi-tokenizer"
HF_MODEL_NAME = "schirrmacher/malwi"
# Module-level cache filled by initialize_hf_model_components(); None until
# loading has succeeded.
HF_TOKENIZER_INSTANCE = None
HF_MODEL_INSTANCE = None
HF_DEVICE_INSTANCE = None
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def initialize_hf_model_components(
    model_path: Optional[str] = None, tokenizer_path: Optional[str] = None
):
    """Load the tokenizer and the classifier once and cache them globally.

    Does nothing when a model is already cached. On any loading failure the
    error is logged and all three cached globals are reset to ``None``, so the
    failure is not raised to the caller.

    Args:
        model_path: Custom model location; falls back to ``HF_MODEL_NAME``
            when empty or ``None``.
        tokenizer_path: Custom tokenizer location; falls back to
            ``HF_TOKENIZER_NAME`` when empty or ``None``.
    """
    # Only the cache slots are rebound here; the *_NAME defaults are just read.
    global HF_TOKENIZER_INSTANCE, HF_MODEL_INSTANCE, HF_DEVICE_INSTANCE

    if HF_MODEL_INSTANCE is not None:
        return

    tokenizer_source = tokenizer_path or HF_TOKENIZER_NAME
    model_source = model_path or HF_MODEL_NAME

    try:
        # NOTE(review): trust_remote_code=True lets the loaded repository or
        # path supply Python code that gets executed -- confirm this is
        # really required, especially for user-supplied paths.
        HF_TOKENIZER_INSTANCE = AutoTokenizer.from_pretrained(
            tokenizer_source, trust_remote_code=True
        )
        HF_MODEL_INSTANCE = DistilBertForSequenceClassification.from_pretrained(
            model_source, trust_remote_code=True
        )
        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        HF_DEVICE_INSTANCE = torch.device(device_name)
        HF_MODEL_INSTANCE.to(HF_DEVICE_INSTANCE)
        HF_MODEL_INSTANCE.eval()  # inference mode
        logging.info(f"HF model '{model_source}' loaded on {HF_DEVICE_INSTANCE}.")
    except Exception as e:
        logging.error(f"Failed to load HF model/tokenizer: {e}")
        # Leave the cache fully empty so a half-initialised state is never used.
        HF_TOKENIZER_INSTANCE = None
        HF_MODEL_INSTANCE = None
        HF_DEVICE_INSTANCE = None
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def get_node_text_prediction(text_input: str) -> Dict[str, Any]:
    """Classify one text input with the cached model.

    Args:
        text_input: The text handed to the tokenizer (truncated to 512 tokens).

    Returns:
        On success a dict with ``status`` ``"success"``, the predicted
        ``index``, its ``label`` and the list of class ``probabilities``.
        Otherwise a dict with ``status`` ``"error"`` and a ``message`` that is
        one of ``Model_Not_Loaded``, ``Input_Error``, ``No_Logits`` or
        ``Inference_Err``.
    """
    components = (HF_MODEL_INSTANCE, HF_TOKENIZER_INSTANCE, HF_DEVICE_INSTANCE)
    if any(component is None for component in components):
        return {"status": "error", "message": "Model_Not_Loaded"}

    try:
        encoded = HF_TOKENIZER_INSTANCE(
            text_input,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )

        # Forward only the tensors the model is called with, moved to its device.
        model_inputs = {
            key: encoded[key].to(HF_DEVICE_INSTANCE)
            for key in ("input_ids", "attention_mask")
            if key in encoded
        }
        if "input_ids" not in model_inputs:
            return {"status": "error", "message": "Input_Error"}

        with torch.no_grad():
            outputs = HF_MODEL_INSTANCE(**model_inputs)

        if not hasattr(outputs, "logits"):
            return {"status": "error", "message": "No_Logits"}

        # A single text was passed in, so only row 0 of the batch is relevant.
        first_item_probabilities = F.softmax(outputs.logits, dim=-1).cpu()[0]
        prediction_idx = torch.argmax(first_item_probabilities).item()
        label_map = {0: "Benign", 1: "Malicious"}
        return {
            "status": "success",
            "index": prediction_idx,
            "label": label_map.get(prediction_idx, f"Unknown_Index_{prediction_idx}"),
            "probabilities": first_item_probabilities.tolist(),
        }
    except Exception as e:
        logging.error(
            f"Exception during model inference for input '{text_input[:100]}...': {e}",
            exc_info=True,
        )
        return {"status": "error", "message": "Inference_Err"}
|
|
File without changes
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
import pathlib
|
|
4
|
+
from typing import Dict, Any
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def read_json_from_file(filepath: pathlib.Path) -> Dict[str, Any]:
    """Load a JSON document from ``filepath``.

    Args:
        filepath: Location of the UTF-8 encoded JSON file.

    Returns:
        The parsed JSON content. If the file is missing, is not valid JSON,
        or any other error occurs, the problem is logged and an empty dict is
        returned so that callers can carry on with empty mappings.
    """
    parsed: Dict[str, Any] = {}  # stays empty on every error path
    try:
        with open(filepath, "r", encoding="utf-8") as handle:
            parsed = json.load(handle)
    except FileNotFoundError:
        logging.error(f"Mapping file not found: {filepath}")
    except json.JSONDecodeError:
        logging.error(f"Could not decode JSON from file {filepath}. Check format.")
    except Exception as e:
        logging.error(f"An unexpected error occurred reading {filepath}: {e}")
    return parsed
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: malwi
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: malwi - AI Python Malware Scanner
|
|
5
|
+
Home-page: https://github.com/schirrmacher/malwi
|
|
6
|
+
Author: Marvin Schirrmacher
|
|
7
|
+
Author-email: Marvin Schirrmacher <m@schirrmacher.io>
|
|
8
|
+
License: MIT
|
|
9
|
+
Requires-Python: >=3.7
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: accelerate>=1.6.0
|
|
12
|
+
Requires-Dist: datasets>=3.6.0
|
|
13
|
+
Requires-Dist: nltk>=3.9.1
|
|
14
|
+
Requires-Dist: pandas>=2.2.3
|
|
15
|
+
Requires-Dist: pytest>=8.3.5
|
|
16
|
+
Requires-Dist: scikit-learn>=1.6.1
|
|
17
|
+
Requires-Dist: tokenizers>=0.21.1
|
|
18
|
+
Requires-Dist: torch>=2.7.0
|
|
19
|
+
Requires-Dist: tqdm>=4.67.1
|
|
20
|
+
Requires-Dist: transformers>=4.51.3
|
|
21
|
+
Requires-Dist: tree-sitter>=0.24.0
|
|
22
|
+
Requires-Dist: tree-sitter-javascript>=0.23.1
|
|
23
|
+
Requires-Dist: tree-sitter-languages>=1.10.2
|
|
24
|
+
Requires-Dist: tree-sitter-python>=0.23.6
|
|
25
|
+
Requires-Dist: tree-sitter-rust>=0.24.0
|
|
26
|
+
Requires-Dist: tree-sitter-typescript>=0.23.2
|
|
27
|
+
Dynamic: author
|
|
28
|
+
Dynamic: home-page
|
|
29
|
+
Dynamic: requires-python
|
|
30
|
+
|
|
31
|
+
# malwi - AI Python Malware Scanner
|
|
32
|
+
|
|
33
|
+
<img src="malwi-logo.png" alt="Logo">
|
|
34
|
+
|
|
35
|
+
Detect Python malware _fast_ - no internet, no expensive hardware, no fees.
|
|
36
|
+
|
|
37
|
+
malwi specializes in detecting **zero-day vulnerabilities** by classifying code as safe or harmful.
|
|
38
|
+
|
|
39
|
+
Open-source software made in Europe.
|
|
40
|
+
Based on open research, open code, open data.
|
|
41
|
+
🇪🇺🤘🕊️
|
|
42
|
+
|
|
43
|
+
## Why malwi?
|
|
44
|
+
|
|
45
|
+
[The number of _malicious open-source packages_ is growing](https://arxiv.org/pdf/2404.04991). This is not just a threat to your business but also to the open-source community.
|
|
46
|
+
|
|
47
|
+
Typical malware behaviors include:
|
|
48
|
+
|
|
49
|
+
- _Exfiltration_ of data: Stealing credentials, API keys, or sensitive user data.
|
|
50
|
+
- _Backdoors_: Allowing remote attackers to gain unauthorized access to your system.
|
|
51
|
+
- _Destructive_ actions: Deleting files, corrupting databases, or sabotaging applications.
|
|
52
|
+
|
|
53
|
+
> **Attention**: Malicious packages might execute code during installation (e.g. through `setup.py`).
|
|
54
|
+
Make sure to *NOT* download or install malicious packages from the dataset with commands like `uv add`, `pip install`, `poetry add`.
|
|
55
|
+
|
|
56
|
+
## What's next?
|
|
57
|
+
|
|
58
|
+
The first iteration focuses on **maliciousness of Python source code**.
|
|
59
|
+
|
|
60
|
+
Future iterations will cover malware scanning for more languages (JavaScript, Rust, Go) and more formats (binaries, logs).
|
|
61
|
+
|
|
62
|
+
## How does it work?
|
|
63
|
+
|
|
64
|
+
malwi applies [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert) and Support Vector Machines (SVM) based on the design of [_Zero Day Malware Detection with Alpha: Fast DBI with Transformer Models for Real World Application_ (2025)](https://arxiv.org/pdf/2504.14886v1).
|
|
65
|
+
Additionally, malwi applies [Tree-sitter](https://tree-sitter.github.io/tree-sitter/) to create Abstract Syntax Trees (ASTs), which are mapped to a unified, security-sensitive syntax used as training input. The Python malware dataset can be found [here](https://github.com/lxyeternal/pypi_malregistry). After 3 epochs of training you will get: Loss: `0.0986`, Accuracy: `0.9669`, F1: `0.9666`.
|
|
66
|
+
|
|
67
|
+
High-level training pipeline:
|
|
68
|
+
|
|
69
|
+
- Create dataset from malicious/benign repositories and map code to malwi syntax
|
|
70
|
+
- Remove code duplications based on hashes
|
|
71
|
+
- Train DistilBert based on the malwi samples for categorizing malicious/benign
|
|
72
|
+
|
|
73
|
+
## Support
|
|
74
|
+
|
|
75
|
+
Do you have access to malicious Rust, Go, whatever packages? **Contact me.**
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
### Develop
|
|
79
|
+
|
|
80
|
+
Prerequisites: [uv](https://docs.astral.sh/uv/)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
```
|
|
84
|
+
# Download and process data
|
|
85
|
+
cmds/download_and_preprocess.sh
|
|
86
|
+
|
|
87
|
+
# Only process data
|
|
88
|
+
cmds/preprocess.sh
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
```
|
|
92
|
+
# Preprocess then start training
|
|
93
|
+
cmds/preprocess_and_train.sh
|
|
94
|
+
|
|
95
|
+
# Only start training
|
|
96
|
+
cmds/train.sh
|
|
97
|
+
```
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
MANIFEST.in
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
setup.py
|
|
5
|
+
src/cli/__init__.py
|
|
6
|
+
src/cli/entry.py
|
|
7
|
+
src/cli/predict.py
|
|
8
|
+
src/common/__init__.py
|
|
9
|
+
src/common/files.py
|
|
10
|
+
src/malwi.egg-info/PKG-INFO
|
|
11
|
+
src/malwi.egg-info/SOURCES.txt
|
|
12
|
+
src/malwi.egg-info/dependency_links.txt
|
|
13
|
+
src/malwi.egg-info/entry_points.txt
|
|
14
|
+
src/malwi.egg-info/requires.txt
|
|
15
|
+
src/malwi.egg-info/top_level.txt
|
|
16
|
+
src/research/__init__.py
|
|
17
|
+
src/research/analyze_data.py
|
|
18
|
+
src/research/download_data.py
|
|
19
|
+
src/research/filter_data.py
|
|
20
|
+
src/research/normalize_data.py
|
|
21
|
+
src/research/train.py
|
|
22
|
+
src/research/syntax_mapping/__init__.py
|
|
23
|
+
src/research/syntax_mapping/compression_mapping.json
|
|
24
|
+
src/research/syntax_mapping/function_mapping.json
|
|
25
|
+
src/research/syntax_mapping/import_mapping.json
|
|
26
|
+
src/research/syntax_mapping/node_mapping.json
|
|
27
|
+
src/research/syntax_mapping/node_targets.json
|
|
28
|
+
src/research/syntax_mapping/sensitive_files.json
|
|
29
|
+
src/research/syntax_mapping/special_tokens.json
|
|
30
|
+
src/research/syntax_mapping/target_files.json
|
|
31
|
+
tests/test_map_ast.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
accelerate>=1.6.0
|
|
2
|
+
datasets>=3.6.0
|
|
3
|
+
nltk>=3.9.1
|
|
4
|
+
pandas>=2.2.3
|
|
5
|
+
pytest>=8.3.5
|
|
6
|
+
scikit-learn>=1.6.1
|
|
7
|
+
tokenizers>=0.21.1
|
|
8
|
+
torch>=2.7.0
|
|
9
|
+
tqdm>=4.67.1
|
|
10
|
+
transformers>=4.51.3
|
|
11
|
+
tree-sitter>=0.24.0
|
|
12
|
+
tree-sitter-javascript>=0.23.1
|
|
13
|
+
tree-sitter-languages>=1.10.2
|
|
14
|
+
tree-sitter-python>=0.23.6
|
|
15
|
+
tree-sitter-rust>=0.24.0
|
|
16
|
+
tree-sitter-typescript>=0.23.2
|
|
File without changes
|