malwi 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. malwi-0.0.1/MANIFEST.in +2 -0
  2. malwi-0.0.1/PKG-INFO +97 -0
  3. malwi-0.0.1/README.md +67 -0
  4. malwi-0.0.1/pyproject.toml +52 -0
  5. malwi-0.0.1/setup.cfg +4 -0
  6. malwi-0.0.1/setup.py +36 -0
  7. malwi-0.0.1/src/cli/__init__.py +0 -0
  8. malwi-0.0.1/src/cli/entry.py +128 -0
  9. malwi-0.0.1/src/cli/predict.py +99 -0
  10. malwi-0.0.1/src/common/__init__.py +0 -0
  11. malwi-0.0.1/src/common/files.py +20 -0
  12. malwi-0.0.1/src/malwi.egg-info/PKG-INFO +97 -0
  13. malwi-0.0.1/src/malwi.egg-info/SOURCES.txt +31 -0
  14. malwi-0.0.1/src/malwi.egg-info/dependency_links.txt +1 -0
  15. malwi-0.0.1/src/malwi.egg-info/entry_points.txt +2 -0
  16. malwi-0.0.1/src/malwi.egg-info/requires.txt +16 -0
  17. malwi-0.0.1/src/malwi.egg-info/top_level.txt +3 -0
  18. malwi-0.0.1/src/research/__init__.py +0 -0
  19. malwi-0.0.1/src/research/analyze_data.py +274 -0
  20. malwi-0.0.1/src/research/download_data.py +358 -0
  21. malwi-0.0.1/src/research/filter_data.py +136 -0
  22. malwi-0.0.1/src/research/normalize_data.py +679 -0
  23. malwi-0.0.1/src/research/syntax_mapping/__init__.py +0 -0
  24. malwi-0.0.1/src/research/syntax_mapping/compression_mapping.json +38 -0
  25. malwi-0.0.1/src/research/syntax_mapping/function_mapping.json +826 -0
  26. malwi-0.0.1/src/research/syntax_mapping/import_mapping.json +189 -0
  27. malwi-0.0.1/src/research/syntax_mapping/node_mapping.json +239 -0
  28. malwi-0.0.1/src/research/syntax_mapping/node_targets.json +6 -0
  29. malwi-0.0.1/src/research/syntax_mapping/sensitive_files.json +28 -0
  30. malwi-0.0.1/src/research/syntax_mapping/special_tokens.json +3559 -0
  31. malwi-0.0.1/src/research/syntax_mapping/target_files.json +4 -0
  32. malwi-0.0.1/src/research/train.py +677 -0
  33. malwi-0.0.1/tests/test_map_ast.py +279 -0
@@ -0,0 +1,2 @@
1
+ include README.md
2
+ recursive-include src/research/syntax_mapping *.json
malwi-0.0.1/PKG-INFO ADDED
@@ -0,0 +1,97 @@
1
+ Metadata-Version: 2.4
2
+ Name: malwi
3
+ Version: 0.0.1
4
+ Summary: malwi - AI Python Malware Scanner
5
+ Home-page: https://github.com/schirrmacher/malwi
6
+ Author: Marvin Schirrmacher
7
+ Author-email: Marvin Schirrmacher <m@schirrmacher.io>
8
+ License: MIT
9
+ Requires-Python: >=3.7
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: accelerate>=1.6.0
12
+ Requires-Dist: datasets>=3.6.0
13
+ Requires-Dist: nltk>=3.9.1
14
+ Requires-Dist: pandas>=2.2.3
15
+ Requires-Dist: pytest>=8.3.5
16
+ Requires-Dist: scikit-learn>=1.6.1
17
+ Requires-Dist: tokenizers>=0.21.1
18
+ Requires-Dist: torch>=2.7.0
19
+ Requires-Dist: tqdm>=4.67.1
20
+ Requires-Dist: transformers>=4.51.3
21
+ Requires-Dist: tree-sitter>=0.24.0
22
+ Requires-Dist: tree-sitter-javascript>=0.23.1
23
+ Requires-Dist: tree-sitter-languages>=1.10.2
24
+ Requires-Dist: tree-sitter-python>=0.23.6
25
+ Requires-Dist: tree-sitter-rust>=0.24.0
26
+ Requires-Dist: tree-sitter-typescript>=0.23.2
27
+ Dynamic: author
28
+ Dynamic: home-page
29
+ Dynamic: requires-python
30
+
31
+ # malwi - AI Python Malware Scanner
32
+
33
+ <img src="malwi-logo.png" alt="Logo">
34
+
35
+ Detect Python malware _fast_ - no internet, no expensive hardware, no fees.
36
+
37
+ malwi specializes in detecting **zero-day vulnerabilities** by classifying code as safe or harmful.
38
+
39
+ Open-source software made in Europe.
40
+ Based on open research, open code, open data.
41
+ 🇪🇺🤘🕊️
42
+
43
+ ## Why malwi?
44
+
45
+ [The number of _malicious open-source packages_ is growing](https://arxiv.org/pdf/2404.04991). This is not just a threat to your business but also to the open-source community.
46
+
47
+ Typical malware behaviors include:
48
+
49
+ - _Exfiltration_ of data: Stealing credentials, API keys, or sensitive user data.
50
+ - _Backdoors_: Allowing remote attackers to gain unauthorized access to your system.
51
+ - _Destructive_ actions: Deleting files, corrupting databases, or sabotaging applications.
52
+
53
+ > **Attention**: Malicious packages might execute code during installation (e.g. through `setup.py`).
54
+ Make sure to *NOT* download or install malicious packages from the dataset with commands like `uv add`, `pip install`, `poetry add`.
55
+
56
+ ## What's next?
57
+
58
+ The first iteration focuses on **maliciousness of Python source code**.
59
+
60
+ Future iterations will cover malware scanning for more languages (JavaScript, Rust, Go) and more formats (binaries, logs).
61
+
62
+ ## How does it work?
63
+
64
+ malwi applies [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert) and Support Vector Machines (SVM) based on the design of [_Zero Day Malware Detection with Alpha: Fast DBI with Transformer Models for Real World Application_ (2025)](https://arxiv.org/pdf/2504.14886v1).
65
+ Additionally, malwi applies [Tree-sitter](https://tree-sitter.github.io/tree-sitter/) to create Abstract Syntax Trees (ASTs), which are mapped to a unified, security-sensitive syntax used as training input. The Python malware dataset can be found [here](https://github.com/lxyeternal/pypi_malregistry). After 3 epochs of training, you will get: Loss: `0.0986`, Accuracy: `0.9669`, F1: `0.9666`.
66
+
67
+ High-level training pipeline:
68
+
69
+ - Create dataset from malicious/benign repositories and map code to malwi syntax
70
+ - Remove code duplications based on hashes
71
+ - Train DistilBert based on the malwi samples for categorizing malicious/benign
72
+
73
+ ## Support
74
+
75
+ Do you have access to malicious Rust, Go, whatever packages? **Contact me.**
76
+
77
+
78
+ ### Develop
79
+
80
+ Prerequisites: [uv](https://docs.astral.sh/uv/)
81
+
82
+
83
+ ```
84
+ # Download and process data
85
+ cmds/download_and_preprocess.sh
86
+
87
+ # Only process data
88
+ cmds/preprocess.sh
89
+ ```
90
+
91
+ ```
92
+ # Preprocess then start training
93
+ cmds/preprocess_and_train.sh
94
+
95
+ # Only start training
96
+ cmds/train.sh
97
+ ```
malwi-0.0.1/README.md ADDED
@@ -0,0 +1,67 @@
1
+ # malwi - AI Python Malware Scanner
2
+
3
+ <img src="malwi-logo.png" alt="Logo">
4
+
5
+ Detect Python malware _fast_ - no internet, no expensive hardware, no fees.
6
+
7
+ malwi specializes in detecting **zero-day vulnerabilities** by classifying code as safe or harmful.
8
+
9
+ Open-source software made in Europe.
10
+ Based on open research, open code, open data.
11
+ 🇪🇺🤘🕊️
12
+
13
+ ## Why malwi?
14
+
15
+ [The number of _malicious open-source packages_ is growing](https://arxiv.org/pdf/2404.04991). This is not just a threat to your business but also to the open-source community.
16
+
17
+ Typical malware behaviors include:
18
+
19
+ - _Exfiltration_ of data: Stealing credentials, API keys, or sensitive user data.
20
+ - _Backdoors_: Allowing remote attackers to gain unauthorized access to your system.
21
+ - _Destructive_ actions: Deleting files, corrupting databases, or sabotaging applications.
22
+
23
+ > **Attention**: Malicious packages might execute code during installation (e.g. through `setup.py`).
24
+ Make sure to *NOT* download or install malicious packages from the dataset with commands like `uv add`, `pip install`, `poetry add`.
25
+
26
+ ## What's next?
27
+
28
+ The first iteration focuses on **maliciousness of Python source code**.
29
+
30
+ Future iterations will cover malware scanning for more languages (JavaScript, Rust, Go) and more formats (binaries, logs).
31
+
32
+ ## How does it work?
33
+
34
+ malwi applies [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert) and Support Vector Machines (SVM) based on the design of [_Zero Day Malware Detection with Alpha: Fast DBI with Transformer Models for Real World Application_ (2025)](https://arxiv.org/pdf/2504.14886v1).
35
+ Additionally, malwi applies [Tree-sitter](https://tree-sitter.github.io/tree-sitter/) to create Abstract Syntax Trees (ASTs), which are mapped to a unified, security-sensitive syntax used as training input. The Python malware dataset can be found [here](https://github.com/lxyeternal/pypi_malregistry). After 3 epochs of training, you will get: Loss: `0.0986`, Accuracy: `0.9669`, F1: `0.9666`.
36
+
37
+ High-level training pipeline:
38
+
39
+ - Create dataset from malicious/benign repositories and map code to malwi syntax
40
+ - Remove code duplications based on hashes
41
+ - Train DistilBert based on the malwi samples for categorizing malicious/benign
42
+
43
+ ## Support
44
+
45
+ Do you have access to malicious Rust, Go, whatever packages? **Contact me.**
46
+
47
+
48
+ ### Develop
49
+
50
+ Prerequisites: [uv](https://docs.astral.sh/uv/)
51
+
52
+
53
+ ```
54
+ # Download and process data
55
+ cmds/download_and_preprocess.sh
56
+
57
+ # Only process data
58
+ cmds/preprocess.sh
59
+ ```
60
+
61
+ ```
62
+ # Preprocess then start training
63
+ cmds/preprocess_and_train.sh
64
+
65
+ # Only start training
66
+ cmds/train.sh
67
+ ```
@@ -0,0 +1,52 @@
1
+ [build-system]
2
+ requires = ["setuptools>=80.4.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "malwi"
7
+ version = "0.0.1"
8
+ description = "malwi - AI Python Malware Scanner"
9
+ readme = "README.md"
10
+ requires-python = ">=3.12"
11
+ license = { text = "MIT" }
12
+
13
+ authors = [
14
+ { name = "Marvin Schirrmacher", email = "m@schirrmacher.io" },
15
+ ]
16
+
17
+ dependencies = [
18
+ "accelerate>=1.6.0",
19
+ "datasets>=3.6.0",
20
+ "nltk>=3.9.1",
21
+ "pandas>=2.2.3",
22
+ "pytest>=8.3.5",
23
+ "scikit-learn>=1.6.1",
24
+ "tokenizers>=0.21.1",
25
+ "torch>=2.7.0",
26
+ "tqdm>=4.67.1",
27
+ "transformers>=4.51.3",
28
+ "tree-sitter>=0.24.0",
29
+ "tree-sitter-javascript>=0.23.1",
30
+ "tree-sitter-languages>=1.10.2",
31
+ "tree-sitter-python>=0.23.6",
32
+ "tree-sitter-rust>=0.24.0",
33
+ "tree-sitter-typescript>=0.23.2",
34
+ ]
35
+
36
+ [project.scripts]
37
+ malwi = "cli.entry:main"
38
+
39
+ [tool.setuptools]
40
+ package-dir = { "" = "src" }
41
+
42
+ [tool.setuptools.packages.find]
43
+ where = ["src"]
44
+
45
+ [tool.setuptools.package-data]
46
+ "research.syntax_mapping" = ["*.json"]
47
+
48
+ [dependency-groups]
49
+ dev = [
50
+ "pytest>=8.3.5",
51
+ "ruff>=0.11.7",
52
+ ]
malwi-0.0.1/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
malwi-0.0.1/setup.py ADDED
@@ -0,0 +1,36 @@
1
+ from setuptools import setup, find_packages
2
+
3
# Read the long description up front so the file handle is closed
# deterministically and decoding does not depend on the build machine's locale.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

setup(
    name="malwi",
    version="0.0.1",
    author="Marvin Schirrmacher",
    author_email="m@schirrmacher.io",
    description="malwi - AI Python Malware Scanner",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/schirrmacher/malwi",
    packages=find_packages(where="src"),
    package_dir={"": "src"},
    include_package_data=True,
    # JSON mapping tables that must ship inside the ``research.syntax_mapping``
    # package.
    package_data={
        "research.syntax_mapping": [
            "compression_mapping.json",
            "function_mapping.json",
            "import_mapping.json",
            "node_mapping.json",
            "node_targets.json",
            "sensitive_files.json",
            "special_tokens.json",
            "target_files.json",
        ]
    },
    # Runtime dependencies are declared in pyproject.toml ([project].dependencies);
    # they are intentionally not repeated here.
    # Keep in sync with ``requires-python`` in pyproject.toml.
    python_requires=">=3.12",
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
)
File without changes
@@ -0,0 +1,128 @@
1
+ import logging
2
+ import argparse
3
+ from typing import List
4
+ from pathlib import Path
5
+
6
+ from src.research.normalize_data import MalwiNode, create_malwi_nodes_from_file
7
+ from src.cli.predict import initialize_hf_model_components, get_node_text_prediction
8
+
9
+ logging.basicConfig(format="%(message)s", level=logging.INFO)
10
+
11
+
12
def process_source_path(
    input_path: str,
) -> List[MalwiNode]:
    """Collect malwi AST nodes from a single file or, recursively, a directory.

    Args:
        input_path: Path to a source file, or to a directory whose files are
            visited recursively.

    Returns:
        Every node returned by ``create_malwi_nodes_from_file`` for the visited
        files. The list is empty when nothing could be processed or when the
        path is neither a file nor a directory (both cases are logged).
    """
    supported_extensions = {"js", "ts", "rs", "py"}
    path_obj = Path(input_path)
    all_nodes: List[MalwiNode] = []

    if path_obj.is_file():
        nodes = create_malwi_nodes_from_file(file_path=str(path_obj))
        if nodes:
            all_nodes.extend(nodes)
        elif path_obj.suffix.lstrip(".") not in supported_extensions:
            # Exact membership on purpose: a substring test would treat an
            # empty suffix (or ".p", ".s", ...) as a supported extension.
            logging.info(f"File '{input_path}' is not a supported file type.")
        else:
            logging.info(
                f"No processable AST nodes found in '{input_path}' or relevant targets missing/empty in NODE_TARGETS for its language."
            )

    elif path_obj.is_dir():
        logging.info(f"Processing directory: {input_path}")
        processed_files_in_dir = False
        # rglob("*") walks the whole tree; directories themselves are skipped.
        for file_path in path_obj.rglob("*"):
            if not file_path.is_file():
                continue
            nodes = create_malwi_nodes_from_file(file_path=str(file_path))
            if nodes:
                all_nodes.extend(nodes)
                processed_files_in_dir = True
        if not processed_files_in_dir:
            logging.info(f"No processable files found in directory '{input_path}'.")
    else:
        logging.error(f"Path '{input_path}' is neither a file nor a directory.")
    return all_nodes
46
+
47
+
48
def main():
    """Run the malwi command-line scanner.

    Parses the command line, loads the Hugging Face model and tokenizer,
    collects AST nodes from the given path and prints one line for every node
    whose predicted maliciousness probability is above 0.5. Nodes classified
    as benign produce no output. A failed prediction is logged as an error and
    the scan continues with the next node.
    """
    parser = argparse.ArgumentParser(description="malwi - AI Python Malware Scanner")
    parser.add_argument(
        "path", metavar="PATH", help="Specify the package file or folder path."
    )
    parser.add_argument(
        "--debug",
        "-d",
        action="store_true",
        help="Print the model input before prediction.",
    )
    parser.add_argument(
        "--tokenizer-path",
        "-t",
        metavar="PATH",
        help="Specify the custom tokenizer path (directory or file).",
        default=None,
    )
    parser.add_argument(
        "--model-path",
        "-m",
        metavar="PATH",
        help="Specify the custom model path (directory or file).",
        default=None,
    )

    args = parser.parse_args()

    # Fallback for callers that reach main() without any root handler installed.
    if not logging.getLogger().hasHandlers():
        logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

    logging.info(
        """
                  __          __
  .--------.---.-|  .--.--.--|__|
  |        |  _  |  |  |  |  |  |
  |__|__|__|___._|__|________|__|
     AI Python Malware Scanner\n\n"""
    )

    # PATH is a mandatory positional, so this only triggers for an empty string.
    if not args.path:
        parser.print_help()
        return

    initialize_hf_model_components(
        model_path=args.model_path, tokenizer_path=args.tokenizer_path
    )

    all_collected_nodes = process_source_path(
        input_path=args.path,
    )

    if not all_collected_nodes:
        logging.info(
            f"No processable AST nodes found for the given path: '{args.path}'."
        )
        return

    for n in all_collected_nodes:
        node_ast_one_line = n.to_string()

        if args.debug:
            print(f"\nInput:\n{n.file_path}\n\n{node_ast_one_line}\n\n")

        prediction_data = get_node_text_prediction(node_ast_one_line)

        if prediction_data["status"] != "success":
            logging.error(
                f"Prediction error for node in {n.file_path}: {prediction_data['message']}"
            )
            continue

        # probabilities[0] is the benign score, probabilities[1] the malicious one.
        maliciousness = prediction_data["probabilities"][1]
        if maliciousness > 0.5:
            print(f"{n.file_path}: 🛑 malicious {maliciousness:.2f}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,99 @@
1
+ import logging
2
+ from typing import Dict, Any, Optional
3
+
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from transformers import AutoTokenizer, DistilBertForSequenceClassification
7
+
8
# Default Hugging Face identifiers, used when no custom path is supplied.
HF_TOKENIZER_NAME = "schirrmacher/malwi-tokenizer"
HF_MODEL_NAME = "schirrmacher/malwi"

# Lazily populated by initialize_hf_model_components(); all three stay None
# until loading succeeds and are reset to None when loading fails.
HF_TOKENIZER_INSTANCE = None
HF_MODEL_INSTANCE = None
HF_DEVICE_INSTANCE = None


def initialize_hf_model_components(
    model_path: Optional[str] = None, tokenizer_path: Optional[str] = None
):
    """Load the tokenizer and classification model into the module globals.

    The call is a no-op once a model is loaded, so arguments passed on later
    calls are ignored. Any loading failure is logged and leaves all three
    globals set to ``None``; no exception is propagated.

    Args:
        model_path: Model location; falls back to ``HF_MODEL_NAME`` when falsy.
        tokenizer_path: Tokenizer location; falls back to
            ``HF_TOKENIZER_NAME`` when falsy.
    """
    # Only the instance globals are rebound here; the *_NAME constants are
    # merely read and need no ``global`` declaration.
    global HF_TOKENIZER_INSTANCE, HF_MODEL_INSTANCE, HF_DEVICE_INSTANCE

    if HF_MODEL_INSTANCE is not None:
        return

    tokenizer_source = tokenizer_path or HF_TOKENIZER_NAME
    model_source = model_path or HF_MODEL_NAME

    try:
        # NOTE(review): trust_remote_code=True lets the loaded repository or
        # path supply Python code that may be executed locally. Confirm this is
        # really required, especially for user-supplied paths.
        HF_TOKENIZER_INSTANCE = AutoTokenizer.from_pretrained(
            tokenizer_source, trust_remote_code=True
        )
        HF_MODEL_INSTANCE = DistilBertForSequenceClassification.from_pretrained(
            model_source, trust_remote_code=True
        )
        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        HF_DEVICE_INSTANCE = torch.device(device_name)
        HF_MODEL_INSTANCE.to(HF_DEVICE_INSTANCE)
        HF_MODEL_INSTANCE.eval()  # inference mode
        logging.info(f"HF model '{model_source}' loaded on {HF_DEVICE_INSTANCE}.")
    except Exception as e:
        logging.error(f"Failed to load HF model/tokenizer: {e}")
        # Never leave a half-initialised state behind.
        HF_TOKENIZER_INSTANCE = None
        HF_MODEL_INSTANCE = None
        HF_DEVICE_INSTANCE = None
47
+
48
+
49
def get_node_text_prediction(text_input: str) -> Dict[str, Any]:
    """Classify one serialized AST node as benign or malicious.

    Args:
        text_input: Text handed to the tokenizer; it is truncated to at most
            512 tokens.

    Returns:
        On success, a dict with ``"status": "success"``, the predicted class
        ``"index"``, its ``"label"`` ("Benign" for 0, "Malicious" for 1) and
        the softmax ``"probabilities"`` of the first batch item as a list.
        On failure, ``{"status": "error", "message": <code>}`` where the code
        is one of ``Model_Not_Loaded``, ``Input_Error``, ``No_Logits`` or
        ``Inference_Err``. Exceptions raised during inference are logged with
        a traceback and reported as ``Inference_Err`` rather than propagated.
    """
    if (
        HF_MODEL_INSTANCE is None
        or HF_TOKENIZER_INSTANCE is None
        or HF_DEVICE_INSTANCE is None
    ):
        return {"status": "error", "message": "Model_Not_Loaded"}
    try:
        inputs = HF_TOKENIZER_INSTANCE(
            text_input,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        # Forward only the tensors the model call needs, moved to its device.
        model_inputs = {
            key: inputs[key].to(HF_DEVICE_INSTANCE)
            for key in ("input_ids", "attention_mask")
            if key in inputs
        }

        if "input_ids" not in model_inputs:
            return {"status": "error", "message": "Input_Error"}

        with torch.no_grad():
            outputs = HF_MODEL_INSTANCE(**model_inputs)

        if not hasattr(outputs, "logits"):
            return {"status": "error", "message": "No_Logits"}

        probabilities = F.softmax(outputs.logits, dim=-1).cpu()
        # A single text is submitted, so only the first batch row is relevant.
        first_item_probabilities = probabilities[0]
        prediction_idx = torch.argmax(first_item_probabilities).item()
        label_map = {0: "Benign", 1: "Malicious"}
        predicted_label = label_map.get(
            prediction_idx, f"Unknown_Index_{prediction_idx}"
        )
        return {
            "status": "success",
            "index": prediction_idx,
            "label": predicted_label,
            "probabilities": first_item_probabilities.tolist(),
        }
    except Exception as e:
        logging.error(
            f"Exception during model inference for input '{text_input[:100]}...': {e}",
            exc_info=True,
        )
        return {"status": "error", "message": "Inference_Err"}
File without changes
@@ -0,0 +1,20 @@
1
+ import json
2
+ import logging
3
+ import pathlib
4
+ from typing import Dict, Any
5
+
6
+
7
def read_json_from_file(filepath: pathlib.Path) -> Dict[str, Any]:
    """Load and parse the JSON document stored at ``filepath``.

    Failures are logged instead of raised so that callers can continue with
    empty mappings.

    Args:
        filepath: Location of the UTF-8 encoded JSON file.

    Returns:
        The parsed JSON content, or an empty dict when the file is missing,
        is not valid JSON, or cannot be read for any other reason.
    """
    try:
        with open(filepath, "r", encoding="utf-8") as handle:
            parsed = json.load(handle)
    except FileNotFoundError:
        logging.error(f"Mapping file not found: {filepath}")
    except json.JSONDecodeError:
        logging.error(f"Could not decode JSON from file {filepath}. Check format.")
    except Exception as e:
        logging.error(f"An unexpected error occurred reading {filepath}: {e}")
    else:
        return parsed
    # Every error path ends here: fall back to an empty mapping.
    return {}
@@ -0,0 +1,97 @@
1
+ Metadata-Version: 2.4
2
+ Name: malwi
3
+ Version: 0.0.1
4
+ Summary: malwi - AI Python Malware Scanner
5
+ Home-page: https://github.com/schirrmacher/malwi
6
+ Author: Marvin Schirrmacher
7
+ Author-email: Marvin Schirrmacher <m@schirrmacher.io>
8
+ License: MIT
9
+ Requires-Python: >=3.7
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: accelerate>=1.6.0
12
+ Requires-Dist: datasets>=3.6.0
13
+ Requires-Dist: nltk>=3.9.1
14
+ Requires-Dist: pandas>=2.2.3
15
+ Requires-Dist: pytest>=8.3.5
16
+ Requires-Dist: scikit-learn>=1.6.1
17
+ Requires-Dist: tokenizers>=0.21.1
18
+ Requires-Dist: torch>=2.7.0
19
+ Requires-Dist: tqdm>=4.67.1
20
+ Requires-Dist: transformers>=4.51.3
21
+ Requires-Dist: tree-sitter>=0.24.0
22
+ Requires-Dist: tree-sitter-javascript>=0.23.1
23
+ Requires-Dist: tree-sitter-languages>=1.10.2
24
+ Requires-Dist: tree-sitter-python>=0.23.6
25
+ Requires-Dist: tree-sitter-rust>=0.24.0
26
+ Requires-Dist: tree-sitter-typescript>=0.23.2
27
+ Dynamic: author
28
+ Dynamic: home-page
29
+ Dynamic: requires-python
30
+
31
+ # malwi - AI Python Malware Scanner
32
+
33
+ <img src="malwi-logo.png" alt="Logo">
34
+
35
+ Detect Python malware _fast_ - no internet, no expensive hardware, no fees.
36
+
37
+ malwi specializes in detecting **zero-day vulnerabilities** by classifying code as safe or harmful.
38
+
39
+ Open-source software made in Europe.
40
+ Based on open research, open code, open data.
41
+ 🇪🇺🤘🕊️
42
+
43
+ ## Why malwi?
44
+
45
+ [The number of _malicious open-source packages_ is growing](https://arxiv.org/pdf/2404.04991). This is not just a threat to your business but also to the open-source community.
46
+
47
+ Typical malware behaviors include:
48
+
49
+ - _Exfiltration_ of data: Stealing credentials, API keys, or sensitive user data.
50
+ - _Backdoors_: Allowing remote attackers to gain unauthorized access to your system.
51
+ - _Destructive_ actions: Deleting files, corrupting databases, or sabotaging applications.
52
+
53
+ > **Attention**: Malicious packages might execute code during installation (e.g. through `setup.py`).
54
+ Make sure to *NOT* download or install malicious packages from the dataset with commands like `uv add`, `pip install`, `poetry add`.
55
+
56
+ ## What's next?
57
+
58
+ The first iteration focuses on **maliciousness of Python source code**.
59
+
60
+ Future iterations will cover malware scanning for more languages (JavaScript, Rust, Go) and more formats (binaries, logs).
61
+
62
+ ## How does it work?
63
+
64
+ malwi applies [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert) and Support Vector Machines (SVM) based on the design of [_Zero Day Malware Detection with Alpha: Fast DBI with Transformer Models for Real World Application_ (2025)](https://arxiv.org/pdf/2504.14886v1).
65
+ Additionally, malwi applies [Tree-sitter](https://tree-sitter.github.io/tree-sitter/) to create Abstract Syntax Trees (ASTs), which are mapped to a unified, security-sensitive syntax used as training input. The Python malware dataset can be found [here](https://github.com/lxyeternal/pypi_malregistry). After 3 epochs of training, you will get: Loss: `0.0986`, Accuracy: `0.9669`, F1: `0.9666`.
66
+
67
+ High-level training pipeline:
68
+
69
+ - Create dataset from malicious/benign repositories and map code to malwi syntax
70
+ - Remove code duplications based on hashes
71
+ - Train DistilBert based on the malwi samples for categorizing malicious/benign
72
+
73
+ ## Support
74
+
75
+ Do you have access to malicious Rust, Go, whatever packages? **Contact me.**
76
+
77
+
78
+ ### Develop
79
+
80
+ Prerequisites: [uv](https://docs.astral.sh/uv/)
81
+
82
+
83
+ ```
84
+ # Download and process data
85
+ cmds/download_and_preprocess.sh
86
+
87
+ # Only process data
88
+ cmds/preprocess.sh
89
+ ```
90
+
91
+ ```
92
+ # Preprocess then start training
93
+ cmds/preprocess_and_train.sh
94
+
95
+ # Only start training
96
+ cmds/train.sh
97
+ ```
@@ -0,0 +1,31 @@
1
+ MANIFEST.in
2
+ README.md
3
+ pyproject.toml
4
+ setup.py
5
+ src/cli/__init__.py
6
+ src/cli/entry.py
7
+ src/cli/predict.py
8
+ src/common/__init__.py
9
+ src/common/files.py
10
+ src/malwi.egg-info/PKG-INFO
11
+ src/malwi.egg-info/SOURCES.txt
12
+ src/malwi.egg-info/dependency_links.txt
13
+ src/malwi.egg-info/entry_points.txt
14
+ src/malwi.egg-info/requires.txt
15
+ src/malwi.egg-info/top_level.txt
16
+ src/research/__init__.py
17
+ src/research/analyze_data.py
18
+ src/research/download_data.py
19
+ src/research/filter_data.py
20
+ src/research/normalize_data.py
21
+ src/research/train.py
22
+ src/research/syntax_mapping/__init__.py
23
+ src/research/syntax_mapping/compression_mapping.json
24
+ src/research/syntax_mapping/function_mapping.json
25
+ src/research/syntax_mapping/import_mapping.json
26
+ src/research/syntax_mapping/node_mapping.json
27
+ src/research/syntax_mapping/node_targets.json
28
+ src/research/syntax_mapping/sensitive_files.json
29
+ src/research/syntax_mapping/special_tokens.json
30
+ src/research/syntax_mapping/target_files.json
31
+ tests/test_map_ast.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ malwi = cli.entry:main
@@ -0,0 +1,16 @@
1
+ accelerate>=1.6.0
2
+ datasets>=3.6.0
3
+ nltk>=3.9.1
4
+ pandas>=2.2.3
5
+ pytest>=8.3.5
6
+ scikit-learn>=1.6.1
7
+ tokenizers>=0.21.1
8
+ torch>=2.7.0
9
+ tqdm>=4.67.1
10
+ transformers>=4.51.3
11
+ tree-sitter>=0.24.0
12
+ tree-sitter-javascript>=0.23.1
13
+ tree-sitter-languages>=1.10.2
14
+ tree-sitter-python>=0.23.6
15
+ tree-sitter-rust>=0.24.0
16
+ tree-sitter-typescript>=0.23.2
@@ -0,0 +1,3 @@
1
+ cli
2
+ common
3
+ research
File without changes