sentinelcodeai 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sentinelcodeai-0.1.0/PKG-INFO +86 -0
- sentinelcodeai-0.1.0/README.md +62 -0
- sentinelcodeai-0.1.0/sentinelcodeai.egg-info/PKG-INFO +86 -0
- sentinelcodeai-0.1.0/sentinelcodeai.egg-info/SOURCES.txt +22 -0
- sentinelcodeai-0.1.0/sentinelcodeai.egg-info/dependency_links.txt +1 -0
- sentinelcodeai-0.1.0/sentinelcodeai.egg-info/entry_points.txt +3 -0
- sentinelcodeai-0.1.0/sentinelcodeai.egg-info/requires.txt +2 -0
- sentinelcodeai-0.1.0/sentinelcodeai.egg-info/top_level.txt +1 -0
- sentinelcodeai-0.1.0/setup.cfg +4 -0
- sentinelcodeai-0.1.0/setup.py +29 -0
- sentinelcodeai-0.1.0/src/__init__.py +0 -0
- sentinelcodeai-0.1.0/src/ai/__init__.py +0 -0
- sentinelcodeai-0.1.0/src/ai/nlp.py +28 -0
- sentinelcodeai-0.1.0/src/cli.py +85 -0
- sentinelcodeai-0.1.0/src/core/__init__.py +0 -0
- sentinelcodeai-0.1.0/src/core/cpp_ast.py +235 -0
- sentinelcodeai-0.1.0/src/core/leaks.py +82 -0
- sentinelcodeai-0.1.0/src/core/secrets.py +130 -0
- sentinelcodeai-0.1.0/src/git_hooks/__init__.py +0 -0
- sentinelcodeai-0.1.0/src/git_hooks/pre_commit.py +66 -0
- sentinelcodeai-0.1.0/src/scanner.py +120 -0
- sentinelcodeai-0.1.0/tests/test_ai.py +26 -0
- sentinelcodeai-0.1.0/tests/test_leaks.py +21 -0
- sentinelcodeai-0.1.0/tests/test_secrets.py +25 -0
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sentinelcodeai
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Pre-commit security scanner — detects secrets and memory leaks before git commit.
|
|
5
|
+
Home-page: https://github.com/Yuva-Deekshitha-N/sentinelcodeai.git
|
|
6
|
+
Author: CodeSentinel
|
|
7
|
+
Author-email: yuvadeekshithanamani@gmail.com
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.10
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Requires-Dist: rich
|
|
14
|
+
Requires-Dist: pycparser>=2.21
|
|
15
|
+
Dynamic: author
|
|
16
|
+
Dynamic: author-email
|
|
17
|
+
Dynamic: classifier
|
|
18
|
+
Dynamic: description
|
|
19
|
+
Dynamic: description-content-type
|
|
20
|
+
Dynamic: home-page
|
|
21
|
+
Dynamic: requires-dist
|
|
22
|
+
Dynamic: requires-python
|
|
23
|
+
Dynamic: summary
|
|
24
|
+
|
|
25
|
+
# SentinelCodeAI
|
|
26
|
+
|
|
27
|
+
Static analysis tool that detects secrets, memory leaks, and sensitive context in any language — with automatic Git pre-commit hook integration.
|
|
28
|
+
|
|
29
|
+
## Structure
|
|
30
|
+
|
|
31
|
+
```
|
|
32
|
+
src/core/secrets.py # Regex-based secret detection (11 patterns)
|
|
33
|
+
src/core/leaks.py # Regex-based resource and memory leak detection
|
|
34
|
+
src/ai/nlp.py # NLP keyword context analysis
|
|
35
|
+
src/git_hooks/pre_commit.py # Git pre-commit hook logic
|
|
36
|
+
src/scanner.py # Shared scan + display engine
|
|
37
|
+
src/cli.py # CLI entry point
|
|
38
|
+
install_hook.py # One-time hook installer
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Setup
|
|
42
|
+
|
|
43
|
+
### Option A — pip install (hook auto-installs)
|
|
44
|
+
```bash
|
|
45
|
+
pip install -e .
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### Option B — clone without pip
|
|
49
|
+
```bash
|
|
50
|
+
pip install -r requirements.txt
|
|
51
|
+
python install_hook.py
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Scan manually (file or folder)
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
# scan a single file
|
|
58
|
+
sentinel --path path/to/file.py
|
|
59
|
+
|
|
60
|
+
# scan an entire folder
|
|
61
|
+
sentinel --path path/to/folder/
|
|
62
|
+
|
|
63
|
+
# without pip install
|
|
64
|
+
python -m src.cli --path path/to/file_or_folder
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## How the pre-commit hook works
|
|
68
|
+
|
|
69
|
+
Once installed, every `git commit` is automatically intercepted:
|
|
70
|
+
|
|
71
|
+
```
|
|
72
|
+
git commit -m "my changes"
|
|
73
|
+
|
|
|
74
|
+
v
|
|
75
|
+
SentinelCodeAI scans all staged files
|
|
76
|
+
|
|
|
77
|
+
v
|
|
78
|
+
HIGH risk found --> commit BLOCKED + report shown
|
|
79
|
+
All clean --> commit goes through
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Run Tests
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
pytest tests/
|
|
86
|
+
```
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# SentinelCodeAI
|
|
2
|
+
|
|
3
|
+
Static analysis tool that detects secrets, memory leaks, and sensitive context in any language — with automatic Git pre-commit hook integration.
|
|
4
|
+
|
|
5
|
+
## Structure
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
src/core/secrets.py # Regex-based secret detection (11 patterns)
|
|
9
|
+
src/core/leaks.py # Regex-based resource and memory leak detection
|
|
10
|
+
src/ai/nlp.py # NLP keyword context analysis
|
|
11
|
+
src/git_hooks/pre_commit.py # Git pre-commit hook logic
|
|
12
|
+
src/scanner.py # Shared scan + display engine
|
|
13
|
+
src/cli.py # CLI entry point
|
|
14
|
+
install_hook.py # One-time hook installer
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Setup
|
|
18
|
+
|
|
19
|
+
### Option A — pip install (hook auto-installs)
|
|
20
|
+
```bash
|
|
21
|
+
pip install -e .
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
### Option B — clone without pip
|
|
25
|
+
```bash
|
|
26
|
+
pip install -r requirements.txt
|
|
27
|
+
python install_hook.py
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Scan manually (file or folder)
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
# scan a single file
|
|
34
|
+
sentinel --path path/to/file.py
|
|
35
|
+
|
|
36
|
+
# scan an entire folder
|
|
37
|
+
sentinel --path path/to/folder/
|
|
38
|
+
|
|
39
|
+
# without pip install
|
|
40
|
+
python -m src.cli --path path/to/file_or_folder
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## How the pre-commit hook works
|
|
44
|
+
|
|
45
|
+
Once installed, every `git commit` is automatically intercepted:
|
|
46
|
+
|
|
47
|
+
```
|
|
48
|
+
git commit -m "my changes"
|
|
49
|
+
|
|
|
50
|
+
v
|
|
51
|
+
SentinelCodeAI scans all staged files
|
|
52
|
+
|
|
|
53
|
+
v
|
|
54
|
+
HIGH risk found --> commit BLOCKED + report shown
|
|
55
|
+
All clean --> commit goes through
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Run Tests
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
pytest tests/
|
|
62
|
+
```
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sentinelcodeai
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Pre-commit security scanner — detects secrets and memory leaks before git commit.
|
|
5
|
+
Home-page: https://github.com/Yuva-Deekshitha-N/sentinelcodeai.git
|
|
6
|
+
Author: CodeSentinel
|
|
7
|
+
Author-email: yuvadeekshithanamani@gmail.com
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.10
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Requires-Dist: rich
|
|
14
|
+
Requires-Dist: pycparser>=2.21
|
|
15
|
+
Dynamic: author
|
|
16
|
+
Dynamic: author-email
|
|
17
|
+
Dynamic: classifier
|
|
18
|
+
Dynamic: description
|
|
19
|
+
Dynamic: description-content-type
|
|
20
|
+
Dynamic: home-page
|
|
21
|
+
Dynamic: requires-dist
|
|
22
|
+
Dynamic: requires-python
|
|
23
|
+
Dynamic: summary
|
|
24
|
+
|
|
25
|
+
# SentinelCodeAI
|
|
26
|
+
|
|
27
|
+
Static analysis tool that detects secrets, memory leaks, and sensitive context in any language — with automatic Git pre-commit hook integration.
|
|
28
|
+
|
|
29
|
+
## Structure
|
|
30
|
+
|
|
31
|
+
```
|
|
32
|
+
src/core/secrets.py # Regex-based secret detection (11 patterns)
|
|
33
|
+
src/core/leaks.py # Regex-based resource and memory leak detection
|
|
34
|
+
src/ai/nlp.py # NLP keyword context analysis
|
|
35
|
+
src/git_hooks/pre_commit.py # Git pre-commit hook logic
|
|
36
|
+
src/scanner.py # Shared scan + display engine
|
|
37
|
+
src/cli.py # CLI entry point
|
|
38
|
+
install_hook.py # One-time hook installer
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Setup
|
|
42
|
+
|
|
43
|
+
### Option A — pip install (hook auto-installs)
|
|
44
|
+
```bash
|
|
45
|
+
pip install -e .
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### Option B — clone without pip
|
|
49
|
+
```bash
|
|
50
|
+
pip install -r requirements.txt
|
|
51
|
+
python install_hook.py
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Scan manually (file or folder)
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
# scan a single file
|
|
58
|
+
sentinel --path path/to/file.py
|
|
59
|
+
|
|
60
|
+
# scan an entire folder
|
|
61
|
+
sentinel --path path/to/folder/
|
|
62
|
+
|
|
63
|
+
# without pip install
|
|
64
|
+
python -m src.cli --path path/to/file_or_folder
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## How the pre-commit hook works
|
|
68
|
+
|
|
69
|
+
Once installed, every `git commit` is automatically intercepted:
|
|
70
|
+
|
|
71
|
+
```
|
|
72
|
+
git commit -m "my changes"
|
|
73
|
+
|
|
|
74
|
+
v
|
|
75
|
+
SentinelCodeAI scans all staged files
|
|
76
|
+
|
|
|
77
|
+
v
|
|
78
|
+
HIGH risk found --> commit BLOCKED + report shown
|
|
79
|
+
All clean --> commit goes through
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Run Tests
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
pytest tests/
|
|
86
|
+
```
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
setup.py
|
|
3
|
+
sentinelcodeai.egg-info/PKG-INFO
|
|
4
|
+
sentinelcodeai.egg-info/SOURCES.txt
|
|
5
|
+
sentinelcodeai.egg-info/dependency_links.txt
|
|
6
|
+
sentinelcodeai.egg-info/entry_points.txt
|
|
7
|
+
sentinelcodeai.egg-info/requires.txt
|
|
8
|
+
sentinelcodeai.egg-info/top_level.txt
|
|
9
|
+
src/__init__.py
|
|
10
|
+
src/cli.py
|
|
11
|
+
src/scanner.py
|
|
12
|
+
src/ai/__init__.py
|
|
13
|
+
src/ai/nlp.py
|
|
14
|
+
src/core/__init__.py
|
|
15
|
+
src/core/cpp_ast.py
|
|
16
|
+
src/core/leaks.py
|
|
17
|
+
src/core/secrets.py
|
|
18
|
+
src/git_hooks/__init__.py
|
|
19
|
+
src/git_hooks/pre_commit.py
|
|
20
|
+
tests/test_ai.py
|
|
21
|
+
tests/test_leaks.py
|
|
22
|
+
tests/test_secrets.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
src
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Packaging script for the ``sentinelcodeai`` distribution."""
from pathlib import Path

from setuptools import setup, find_packages

# Read the long description through pathlib so the file handle is closed
# deterministically, and locate README.md next to this script instead of
# relying on the current working directory.
_LONG_DESCRIPTION = (Path(__file__).resolve().parent / "README.md").read_text(
    encoding="utf-8"
)

setup(
    name="sentinelcodeai",
    version="0.1.0",
    author="CodeSentinel",
    author_email="yuvadeekshithanamani@gmail.com",
    description="Pre-commit security scanner — detects secrets and memory leaks before git commit.",
    long_description=_LONG_DESCRIPTION,
    long_description_content_type="text/markdown",
    url="https://github.com/Yuva-Deekshitha-N/sentinelcodeai.git",
    packages=find_packages(),
    python_requires=">=3.10",
    install_requires=[
        "rich",
        "pycparser>=2.21",
    ],
    # Both console commands dispatch to the same CLI entry point.
    entry_points={
        "console_scripts": [
            "sentinel=src.cli:main",
            "sca=src.cli:main",
        ]
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
)
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
SENSITIVE_KEYWORDS = {
|
|
4
|
+
"password": "A variable named 'password' likely holds a plaintext credential. Plaintext passwords in code are a critical security risk.",
|
|
5
|
+
"secret": "A variable named 'secret' may contain a cryptographic secret or API secret that should never be hardcoded.",
|
|
6
|
+
"token": "A variable named 'token' may expose an authentication or API token that grants access to a service.",
|
|
7
|
+
"private": "A variable named 'private' may reference a private key or sensitive private data.",
|
|
8
|
+
"credential": "A variable named 'credential' likely holds authentication data such as a username/password pair or certificate.",
|
|
9
|
+
"api_key": "A variable named 'api_key' almost certainly contains a service API key that should be stored in environment variables.",
|
|
10
|
+
"auth": "A variable named 'auth' may hold authentication headers, tokens, or credentials used to access protected resources.",
|
|
11
|
+
"access_key": "A variable named 'access_key' likely contains a cloud or service access key that grants programmatic access.",
|
|
12
|
+
"passphrase": "A variable named 'passphrase' contains a passphrase used to protect a private key or encrypted data.",
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def analyze_context(code: str) -> list[dict]:
    """Flag every line of *code* that mentions a sensitive keyword.

    Each key of ``SENSITIVE_KEYWORDS`` is searched for as a whole word,
    case-insensitively, on every line. One finding is produced per
    (line, keyword) hit, so a single line can yield several findings.

    Returns:
        A list of dicts with the keys ``line`` (1-based), ``keyword``,
        ``content`` (the stripped line), ``risk`` (always ``"MEDIUM"``) and
        ``explanation``, ordered by line and then by keyword order.
    """
    # Build one matcher per keyword up front; the patterns are the same for
    # every line.
    matchers = [
        (word, re.compile(rf"\b{word}\b", re.IGNORECASE), reason)
        for word, reason in SENSITIVE_KEYWORDS.items()
    ]
    return [
        {
            "line": number,
            "keyword": word,
            "content": text.strip(),
            "risk": "MEDIUM",
            "explanation": reason,
        }
        for number, text in enumerate(code.splitlines(), start=1)
        for word, matcher, reason in matchers
        if matcher.search(text)
    ]
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import subprocess
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from src.scanner import collect_files, run_scan
|
|
6
|
+
from rich.console import Console
|
|
7
|
+
|
|
8
|
+
console = Console()
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def install_global_hook():
    """Install the SentinelCodeAI pre-commit hook for every repo on this machine.

    Copies ``git_hooks/pre_commit_hook.sh`` (located next to this module) to
    ``<hooks_dir>/pre-commit``, marks it executable, and points git's global
    ``core.hooksPath`` setting at ``<hooks_dir>``.

    On any failure an error message is printed and the process exits with
    status 1; on success a confirmation is printed and the function returns.
    """
    import shutil

    # NOTE(review): parents[2] is two levels above this module's directory.
    # The original comment described this folder as living "inside
    # SentinelCodeAI" — confirm that this is the intended location.
    hooks_dir = Path(__file__).resolve().parents[2] / "global_hooks"
    hook_src = Path(__file__).resolve().parent / "git_hooks" / "pre_commit_hook.sh"
    hook_dest = hooks_dir / "pre-commit"

    # Validate the source first so a failed install does not leave an empty
    # hooks directory behind.
    if not hook_src.exists():
        console.print("[red]ERROR: Hook source file not found.[/red]")
        sys.exit(1)

    hooks_dir.mkdir(exist_ok=True)
    shutil.copy(str(hook_src), str(hook_dest))
    hook_dest.chmod(0o755)  # git only runs hooks that are executable

    # Tell git to use this folder for hooks in every repo
    try:
        result = subprocess.run(
            ["git", "config", "--global", "core.hooksPath", str(hooks_dir)],
            capture_output=True, text=True
        )
    except FileNotFoundError:
        # subprocess raises instead of returning when the executable is missing.
        console.print("[red]ERROR: Failed to set global hooks path: git executable not found.[/red]")
        sys.exit(1)

    if result.returncode != 0:
        console.print(f"[red]ERROR: Failed to set global hooks path: {result.stderr}[/red]")
        sys.exit(1)

    console.print("[bold green]SentinelCodeAI global hook installed successfully.[/bold green]")
    console.print(f"Hooks folder : {hooks_dir}")
    console.print("Every git commit on this machine is now protected automatically.")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def main():
    """Command-line entry point: install the global hook or scan a path.

    ``--install-global`` installs the global Git hook and exits with status 0.
    Otherwise ``--path`` is required: without it the help text is printed and
    the process exits with status 1. The files collected from the path are
    scanned; the exit status is 1 when the scan reports a HIGH risk issue and
    0 otherwise (including when no scannable files are found).
    """
    cli = argparse.ArgumentParser(
        prog="sentinel",
        description="SentinelCodeAI — scan a file or folder for secrets, leaks, and sensitive context.",
    )
    cli.add_argument("--path", help="Path to a file or folder to scan.")
    cli.add_argument(
        "--install-global",
        action="store_true",
        help="Install SentinelCodeAI as a global Git hook (runs on every repo on this machine).",
    )
    options = cli.parse_args()

    # Hook installation takes precedence over scanning.
    if options.install_global:
        install_global_hook()
        sys.exit(0)

    target = options.path
    if not target:
        cli.print_help()
        sys.exit(1)

    targets = collect_files(target)
    if not targets:
        console.print("[yellow]No scannable files found.[/yellow]")
        sys.exit(0)

    console.print(f"\n[bold]Scanning {len(targets)} file(s) in: {target}[/bold]\n")

    # run_scan's result is used as the "HIGH risk found" flag.
    if run_scan(targets):
        console.print("\n[bold red]HIGH risk issues found. Fix them before committing.[/bold red]")
        sys.exit(1)

    console.print("\n[bold green]Scan complete. No HIGH risk issues found.[/bold green]")
    sys.exit(0)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
if __name__ == "__main__":
|
|
85
|
+
main()
|
|
File without changes
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
"""
|
|
2
|
+
C/C++ AST-based static analysis using pycparser.
|
|
3
|
+
|
|
4
|
+
Walks the real Abstract Syntax Tree of C/C++ source files to detect:
|
|
5
|
+
- malloc() without a paired free() → memory leak
|
|
6
|
+
- fopen() without a paired fclose() → resource leak
|
|
7
|
+
- new without delete → memory leak (regex-assisted, C++ extension)
|
|
8
|
+
- Pointer assigned then reassigned before free → dangling / lost pointer
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
from pycparser import c_parser, c_ast, parse_file
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# ---------------------------------------------------------------------------
|
|
16
|
+
# Helpers
|
|
17
|
+
# ---------------------------------------------------------------------------
|
|
18
|
+
|
|
19
|
+
def _strip_cpp_comments(code: str) -> str:
|
|
20
|
+
"""Remove // and /* */ comments so the C parser doesn't choke."""
|
|
21
|
+
code = re.sub(r"//[^\n]*", "", code)
|
|
22
|
+
code = re.sub(r"/\*.*?\*/", "", code, flags=re.DOTALL)
|
|
23
|
+
return code
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _remove_cpp_extensions(code: str) -> str:
|
|
27
|
+
"""
|
|
28
|
+
Strip C++-only syntax that pycparser (a pure-C parser) can't handle,
|
|
29
|
+
so we can still analyse the C-style memory calls inside .cpp files.
|
|
30
|
+
"""
|
|
31
|
+
# Remove #include lines
|
|
32
|
+
code = re.sub(r"^\s*#include\s*[<\"][^\n]*", "", code, flags=re.MULTILINE)
|
|
33
|
+
# Remove using namespace / using std::
|
|
34
|
+
code = re.sub(r"^\s*using\s+[^\n;]+;", "", code, flags=re.MULTILINE)
|
|
35
|
+
# Remove class / struct definitions (keep function bodies)
|
|
36
|
+
code = re.sub(r"\bclass\b", "struct", code)
|
|
37
|
+
# Remove :: scope resolution
|
|
38
|
+
code = re.sub(r"\w+::", "", code)
|
|
39
|
+
# Remove template declarations
|
|
40
|
+
code = re.sub(r"template\s*<[^>]*>", "", code)
|
|
41
|
+
# Remove C++ casts
|
|
42
|
+
code = re.sub(r"\b(static_cast|dynamic_cast|reinterpret_cast|const_cast)\s*<[^>]*>", "", code)
|
|
43
|
+
return code
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# ---------------------------------------------------------------------------
|
|
47
|
+
# AST visitor — collects malloc/free/fopen/fclose call sites
|
|
48
|
+
# ---------------------------------------------------------------------------
|
|
49
|
+
|
|
50
|
+
class _MemoryCallVisitor(c_ast.NodeVisitor):
    """AST visitor that logs calls to the tracked allocation and file functions.

    Tracked names are malloc, calloc, realloc, free, fopen and fclose. Only
    calls whose callee is a plain identifier are recorded.
    """

    def __init__(self):
        # One entry per tracked call site: {"name": str, "line": int}
        self.calls: list[dict] = []

    def visit_FuncCall(self, node):
        """Record *node* if it calls a tracked function, then visit its children."""
        callee = node.name
        tracked = ("malloc", "calloc", "realloc", "free", "fopen", "fclose")
        if callee and isinstance(callee, c_ast.ID) and callee.name in tracked:
            # A node without coordinates is recorded with line 0.
            where = node.coord.line if node.coord else 0
            self.calls.append({"name": callee.name, "line": where})
        # Keep descending so calls nested in the arguments are seen as well.
        self.generic_visit(node)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# ---------------------------------------------------------------------------
|
|
66
|
+
# Public API
|
|
67
|
+
# ---------------------------------------------------------------------------
|
|
68
|
+
|
|
69
|
+
def analyze_cpp_ast(code: str) -> list[dict]:
    """Run every C/C++ leak check on *code* and return the combined findings.

    Three analyses run in order and their results are concatenated:
    the pycparser-based malloc/free and fopen/fclose pairing check, the
    regex-based new/delete check, and the regex-based lost-pointer check.

    Returns:
        A list of finding dicts with the keys
        {type, line, content, explanation, languages, engine}.
    """
    source_lines = code.splitlines()
    return [
        *_ast_analysis(code, source_lines),
        *_cpp_new_delete_analysis(code, source_lines),
        *_dangling_pointer_analysis(code, source_lines),
    ]
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# ---------------------------------------------------------------------------
|
|
91
|
+
# AST analysis (malloc/free, fopen/fclose pairing)
|
|
92
|
+
# ---------------------------------------------------------------------------
|
|
93
|
+
|
|
94
|
+
def _ast_analysis(code: str, lines: list[str]) -> list[dict]:
    """Report allocations / file opens that have no release call at all.

    The source is cleaned of comments and C++-only syntax, prefixed with a
    small set of libc declarations and parsed with pycparser. If the parsed
    unit contains malloc/calloc/realloc calls but no ``free`` call, every
    allocation site is reported; likewise for ``fopen`` without ``fclose``.

    Args:
        code: Raw C/C++ source text.
        lines: ``code`` split into lines; used to fill in ``content``.

    Returns:
        A list of finding dicts (type, line, content, explanation, languages,
        engine). The list is empty when the source cannot be parsed.
    """
    findings: list[dict] = []

    # Inject minimal typedefs so the parser doesn't fail on FILE*, size_t etc.
    preamble = (
        "typedef unsigned long size_t;\n"
        "typedef struct _IO_FILE FILE;\n"
        "void *malloc(size_t size);\n"
        "void *calloc(size_t n, size_t size);\n"
        "void *realloc(void *ptr, size_t size);\n"
        "void free(void *ptr);\n"
        "FILE *fopen(const char *path, const char *mode);\n"
        "int fclose(FILE *stream);\n"
    )
    # The parser numbers lines from the start of the text it is given, so
    # every coordinate is shifted by the preamble and must be corrected
    # before it is reported or used to index ``lines``.
    # NOTE: the result is only exact if the cleaning steps below keep the
    # number of lines unchanged.
    line_offset = preamble.count("\n")

    try:
        clean = _remove_cpp_extensions(_strip_cpp_comments(code))
        ast = c_parser.CParser().parse(preamble + clean, filename="<input>")
        visitor = _MemoryCallVisitor()
        visitor.visit(ast)
    except Exception:
        # Parser failed (complex C++ syntax) — deliberate best-effort: return
        # no AST findings; the regex-based checks still run in the caller.
        return findings

    def _call_lines(*names):
        # Source line numbers of the recorded calls to any of *names*.
        # Calls without a coordinate (recorded as 0) stay at 0.
        return [
            max(call["line"] - line_offset, 0)
            for call in visitor.calls
            if call["name"] in names
        ]

    # (acquire functions, release functions, finding type, explanation)
    rules = (
        (
            ("malloc", "calloc", "realloc"),
            ("free",),
            "ast_malloc_no_free",
            "[AST] malloc/calloc/realloc detected but no free() found in this "
            "translation unit. Heap memory will never be returned to the OS.",
        ),
        (
            ("fopen",),
            ("fclose",),
            "ast_fopen_no_fclose",
            "[AST] fopen() detected but no fclose() found in this translation unit. "
            "The file descriptor will leak until the process exits.",
        ),
    )

    for acquire, release, kind, explanation in rules:
        acquired_at = _call_lines(*acquire)
        # Only report when there is no release call anywhere in the unit.
        if not acquired_at or _call_lines(*release):
            continue
        for ln in acquired_at:
            src_line = lines[ln - 1].strip() if 0 < ln <= len(lines) else ""
            findings.append({
                "type": kind,
                "line": ln,
                "content": src_line,
                "explanation": explanation,
                "languages": "C/C++",
                "engine": "AST",
            })

    return findings
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
# ---------------------------------------------------------------------------
|
|
164
|
+
# C++ new / delete analysis (regex-assisted, AST-style pairing logic)
|
|
165
|
+
# ---------------------------------------------------------------------------
|
|
166
|
+
|
|
167
|
+
def _cpp_new_delete_analysis(code: str, lines: list[str]) -> list[dict]:
|
|
168
|
+
findings: list[dict] = []
|
|
169
|
+
|
|
170
|
+
new_lines = [i + 1 for i, l in enumerate(lines) if re.search(r"\bnew\b", l)]
|
|
171
|
+
delete_count = sum(1 for l in lines if re.search(r"\bdelete\b", l))
|
|
172
|
+
|
|
173
|
+
if new_lines and delete_count == 0:
|
|
174
|
+
for ln in new_lines:
|
|
175
|
+
findings.append({
|
|
176
|
+
"type": "ast_new_no_delete",
|
|
177
|
+
"line": ln,
|
|
178
|
+
"content": lines[ln - 1].strip(),
|
|
179
|
+
"explanation": (
|
|
180
|
+
"[AST] 'new' allocates heap memory but no 'delete' was found. "
|
|
181
|
+
"Prefer smart pointers (std::unique_ptr / std::shared_ptr) to avoid leaks."
|
|
182
|
+
),
|
|
183
|
+
"languages": "C++",
|
|
184
|
+
"engine": "AST",
|
|
185
|
+
})
|
|
186
|
+
|
|
187
|
+
return findings
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
# ---------------------------------------------------------------------------
|
|
191
|
+
# Dangling pointer detection
|
|
192
|
+
# ---------------------------------------------------------------------------
|
|
193
|
+
|
|
194
|
+
def _dangling_pointer_analysis(code: str, lines: list[str]) -> list[dict]:
|
|
195
|
+
"""
|
|
196
|
+
Detect the pattern:
|
|
197
|
+
ptr = malloc(...); ← allocation
|
|
198
|
+
ptr = something; ← reassignment WITHOUT free → original block lost
|
|
199
|
+
"""
|
|
200
|
+
findings: list[dict] = []
|
|
201
|
+
|
|
202
|
+
# Collect pointer names that were malloc'd
|
|
203
|
+
malloc_vars: dict[str, int] = {}
|
|
204
|
+
for i, line in enumerate(lines, start=1):
|
|
205
|
+
m = re.search(r"\b(\w+)\s*=\s*(?:malloc|calloc|realloc)\s*\(", line)
|
|
206
|
+
if m:
|
|
207
|
+
malloc_vars[m.group(1)] = i
|
|
208
|
+
|
|
209
|
+
# Check if any of those vars are reassigned without a free in between
|
|
210
|
+
for var, alloc_line in malloc_vars.items():
|
|
211
|
+
freed = False
|
|
212
|
+
for i, line in enumerate(lines, start=1):
|
|
213
|
+
if i <= alloc_line:
|
|
214
|
+
continue
|
|
215
|
+
if re.search(rf"\bfree\s*\(\s*{var}\s*\)", line):
|
|
216
|
+
freed = True
|
|
217
|
+
break
|
|
218
|
+
# Reassigned without free
|
|
219
|
+
if re.search(rf"\b{var}\s*=\s*(?!NULL|nullptr|0\b)", line):
|
|
220
|
+
if not freed:
|
|
221
|
+
findings.append({
|
|
222
|
+
"type": "ast_dangling_pointer",
|
|
223
|
+
"line": i,
|
|
224
|
+
"content": lines[i - 1].strip(),
|
|
225
|
+
"explanation": (
|
|
226
|
+
f"[AST] Pointer '{var}' (allocated at line {alloc_line}) is "
|
|
227
|
+
"reassigned before being freed. The original heap block is lost — "
|
|
228
|
+
"this is a classic dangling/lost-pointer memory leak."
|
|
229
|
+
),
|
|
230
|
+
"languages": "C/C++",
|
|
231
|
+
"engine": "AST",
|
|
232
|
+
})
|
|
233
|
+
break
|
|
234
|
+
|
|
235
|
+
return findings
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
# Regex-based leak patterns — works across Python, C++, Java, JS, etc.
|
|
4
|
+
LEAK_PATTERNS = {
|
|
5
|
+
# Python
|
|
6
|
+
"python_unclosed_file": {
|
|
7
|
+
"pattern": r"\bopen\s*\([^)]+\)(?!\s*as\b)",
|
|
8
|
+
"explanation": "open() called without a 'with' block. File handle may never be closed, leaking OS resources.",
|
|
9
|
+
"languages": "Python",
|
|
10
|
+
},
|
|
11
|
+
"python_unclosed_db": {
|
|
12
|
+
"pattern": r"\b(psycopg2|pymysql|sqlite3|cx_Oracle|pyodbc)\.connect\s*\(",
|
|
13
|
+
"explanation": "Database connection opened. If not closed or used in a context manager, the connection leaks and exhausts the DB connection pool.",
|
|
14
|
+
"languages": "Python",
|
|
15
|
+
},
|
|
16
|
+
"python_unclosed_socket": {
|
|
17
|
+
"pattern": r"\bsocket\.socket\s*\(",
|
|
18
|
+
"explanation": "Socket created without a 'with' block. Unclosed sockets leak file descriptors and can cause connection exhaustion.",
|
|
19
|
+
"languages": "Python",
|
|
20
|
+
},
|
|
21
|
+
"python_unclosed_session": {
|
|
22
|
+
"pattern": r"\brequests\.Session\s*\(\s*\)(?!\s*as\b)",
|
|
23
|
+
"explanation": "requests.Session() opened without a context manager. Unclosed sessions leak TCP connections.",
|
|
24
|
+
"languages": "Python",
|
|
25
|
+
},
|
|
26
|
+
|
|
27
|
+
# C / C++
|
|
28
|
+
"cpp_malloc_no_free": {
|
|
29
|
+
"pattern": r"\bmalloc\s*\(",
|
|
30
|
+
"explanation": "malloc() allocates heap memory. If free() is never called, this causes a memory leak that grows over time.",
|
|
31
|
+
"languages": "C/C++",
|
|
32
|
+
},
|
|
33
|
+
"cpp_new_no_delete": {
|
|
34
|
+
"pattern": r"\bnew\s+\w+",
|
|
35
|
+
"explanation": "'new' allocates heap memory. Without a matching 'delete', the memory is never returned to the OS.",
|
|
36
|
+
"languages": "C/C++",
|
|
37
|
+
},
|
|
38
|
+
"cpp_fopen_no_fclose": {
|
|
39
|
+
"pattern": r"\bfopen\s*\(",
|
|
40
|
+
"explanation": "fopen() opens a file handle. If fclose() is never called, the file descriptor leaks.",
|
|
41
|
+
"languages": "C/C++",
|
|
42
|
+
},
|
|
43
|
+
|
|
44
|
+
# Java
|
|
45
|
+
"java_unclosed_stream": {
|
|
46
|
+
"pattern": r"\bnew\s+(FileInputStream|FileOutputStream|BufferedReader|FileReader|FileWriter)\s*\(",
|
|
47
|
+
"explanation": "Java stream opened without try-with-resources. If close() is not called, the stream leaks file descriptors.",
|
|
48
|
+
"languages": "Java",
|
|
49
|
+
},
|
|
50
|
+
"java_unclosed_connection": {
|
|
51
|
+
"pattern": r"\bDriverManager\.getConnection\s*\(",
|
|
52
|
+
"explanation": "JDBC connection opened. If not closed in a finally block or try-with-resources, the DB connection leaks.",
|
|
53
|
+
"languages": "Java",
|
|
54
|
+
},
|
|
55
|
+
|
|
56
|
+
# JavaScript / TypeScript
|
|
57
|
+
"js_unclosed_fs": {
|
|
58
|
+
"pattern": r"\bfs\.open\s*\(",
|
|
59
|
+
"explanation": "fs.open() called without a corresponding fs.close(). Leaks file descriptors in Node.js.",
|
|
60
|
+
"languages": "JavaScript/TypeScript",
|
|
61
|
+
},
|
|
62
|
+
"js_event_listener": {
|
|
63
|
+
"pattern": r"\baddEventListener\s*\(",
|
|
64
|
+
"explanation": "Event listener added. If removeEventListener() is never called, it prevents garbage collection and causes memory leaks.",
|
|
65
|
+
"languages": "JavaScript/TypeScript",
|
|
66
|
+
},
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def detect_leaks(code: str) -> list[dict]:
    """Match every line of *code* against each entry of ``LEAK_PATTERNS``.

    All patterns are applied to all lines regardless of the source language,
    and a line can produce several findings.

    Returns:
        A list of dicts with the keys ``type`` (the pattern's name), ``line``
        (1-based), ``content`` (the stripped line), ``explanation`` and
        ``languages``, ordered by line and then by pattern order.
    """
    hits = []
    for number, text in enumerate(code.splitlines(), start=1):
        hits.extend(
            {
                "type": kind,
                "line": number,
                "content": text.strip(),
                "explanation": spec["explanation"],
                "languages": spec["languages"],
            }
            for kind, spec in LEAK_PATTERNS.items()
            if re.search(spec["pattern"], text)
        )
    return hits
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import List, Dict
|
|
3
|
+
|
|
4
|
+
# Secret patterns with risk levels and explanations
|
|
5
|
+
SECRET_PATTERNS = {
    # Each entry maps a finding type to:
    #   "pattern"     - regular expression applied to one source line at a time
    #   "risk"        - "HIGH", "MEDIUM" or "LOW"
    #   "explanation" - human-readable reason reported with the finding
    #
    # Patterns deliberately contain only non-capturing groups: with a
    # capturing group, re.findall() returns the group's text (e.g. just
    # "password") instead of the full matched secret.
    "aws_access_key": {
        "pattern": r"AKIA[0-9A-Z]{16}",
        "risk": "HIGH",
        "explanation": "Hardcoded AWS Access Key ID detected. Attackers can use this to access your AWS account, spin up resources, steal data, or incur massive charges.",
    },
    "aws_secret_key": {
        "pattern": r"(?i)aws.{0,20}['\"][0-9a-zA-Z/+]{40}['\"]",
        "risk": "HIGH",
        "explanation": "Hardcoded AWS Secret Access Key detected. Combined with an Access Key ID, this grants full programmatic access to your AWS account.",
    },
    "generic_api_key": {
        "pattern": r"(?i)(?:api_key|apikey|api-key)\s*=\s*['\"][a-zA-Z0-9]{16,}['\"]",
        "risk": "HIGH",
        "explanation": "A hardcoded API key was found. If this key is pushed to a public repo, any third party can authenticate as you and abuse the associated service.",
    },
    "private_key": {
        # The key-type word is optional so PKCS#8 headers
        # ("BEGIN PRIVATE KEY" / "BEGIN ENCRYPTED PRIVATE KEY") are caught too.
        "pattern": r"-----BEGIN (?:(?:RSA|EC|DSA|OPENSSH|ENCRYPTED) )?PRIVATE KEY-----",
        "risk": "HIGH",
        "explanation": "A private cryptographic key is embedded in the code. This can be used to impersonate your server, decrypt communications, or forge signatures.",
    },
    "password": {
        "pattern": r"(?i)(?:password|passwd|pwd)\s*=\s*['\"].{6,}['\"]",
        "risk": "HIGH",
        "explanation": "A plaintext password is hardcoded. Passwords in source code are permanently stored in Git history even after deletion and can be extracted by anyone with repo access.",
    },
    "github_token": {
        "pattern": r"ghp_[A-Za-z0-9]{36}",
        "risk": "HIGH",
        "explanation": "A GitHub Personal Access Token was found. This grants the holder read/write access to your repositories and account settings.",
    },
    "jwt_token": {
        "pattern": r"eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+",
        "risk": "MEDIUM",
        "explanation": "A JWT token is hardcoded. If unexpired, it can be replayed to authenticate as the token's subject without needing credentials.",
    },
    "google_api_key": {
        "pattern": r"AIza[0-9A-Za-z\-_]{35}",
        "risk": "HIGH",
        "explanation": "A Google API key was found. Exposure can lead to quota theft, unauthorized use of Google services, and unexpected billing on your account.",
    },
    "slack_token": {
        "pattern": r"xox[baprs]-[0-9a-zA-Z]{10,48}",
        "risk": "HIGH",
        "explanation": "A Slack token is hardcoded. This allows an attacker to read messages, post as your bot/user, and access private channels in your workspace.",
    },
    "database_url": {
        # Accepts both "postgres://" and "postgresql://". The user and
        # password parts are bounded to the URL authority (no whitespace,
        # "/" or "@"), so an "@" later in the path or query string of a
        # credential-free URL does not trigger a finding.
        "pattern": r"(?:postgres(?:ql)?|mysql)://[^\s:/@]*:[^\s/@]+@",
        "risk": "HIGH",
        "explanation": "A database connection URL with embedded credentials was found. This exposes your database host, username, and password to anyone who reads the code.",
    },
    "mongodb_url": {
        "pattern": r"mongodb(?:\+srv)?://[^:]+:[^@]+@",
        "risk": "HIGH",
        "explanation": "A MongoDB connection string with embedded credentials was found. Exposes your database host, username, and password publicly.",
    },
    "firebase_api_key": {
        "pattern": r"(?i)firebase.*api.?key\s*[=:]\s*['\"][A-Za-z0-9_\-]{20,}['\"]",
        "risk": "HIGH",
        "explanation": "A Firebase API key was found. Exposes your Firebase project to unauthorized reads, writes, and abuse of Firebase services.",
    },
    "firebase_db_url": {
        "pattern": r"https://[a-z0-9-]+\.firebaseio\.com",
        "risk": "HIGH",
        "explanation": "A Firebase Realtime Database URL was found. If database rules are misconfigured, attackers can read or write all data.",
    },
    "firebase_secret": {
        "pattern": r"(?i)firebase.{0,20}secret\s*[=:]\s*['\"][A-Za-z0-9]{20,}['\"]",
        "risk": "HIGH",
        "explanation": "A Firebase legacy secret was found. This grants full admin access to your Firebase project.",
    },
    "test_key": {
        "pattern": r"(?i)test[_-]?key",
        "risk": "LOW",
        "explanation": "A test key identifier was found. While likely not a real secret, test keys are sometimes accidentally swapped with production keys.",
    },
}
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def detect_secrets(code: str) -> List[Dict]:
    """
    Scan code line by line and detect potential secrets.

    Args:
        code: Source text to scan.

    Returns:
        List of findings, ordered by line and then by the iteration order of
        SECRET_PATTERNS. Each finding is a dict with:
        - type: key of the SECRET_PATTERNS entry that matched
        - risk: risk level configured for that entry
        - line: 1-based line number
        - content: the matching line with surrounding whitespace stripped
        - matched: the full text matched by the pattern
        - explanation: explanation configured for that entry
    """
    findings = []

    for line_num, line in enumerate(code.splitlines(), start=1):
        content = line.strip()
        for secret_type, config in SECRET_PATTERNS.items():
            # finditer + group(0) always yields the whole match. re.findall
            # returns only the captured group(s) when a pattern contains any,
            # which would report e.g. "password" or "" instead of the secret.
            for match in re.finditer(config["pattern"], line):
                findings.append({
                    "type": secret_type,
                    "risk": config["risk"],
                    "line": line_num,
                    "content": content,
                    "matched": match.group(0),
                    "explanation": config["explanation"],
                })

    return findings
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def summarize_findings(findings: List[Dict]) -> Dict:
    """
    Count findings per risk level.

    Returns a dict with the keys "HIGH", "MEDIUM" and "LOW" (in that order),
    each mapped to the number of findings whose "risk" equals that key.
    A finding with any other risk value raises KeyError.
    """
    counts = dict.fromkeys(("HIGH", "MEDIUM", "LOW"), 0)
    for finding in findings:
        level = finding["risk"]
        counts[level] = counts[level] + 1
    return counts
|
|
File without changes
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
import sys
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from src.scanner import collect_files, run_scan
|
|
5
|
+
from rich.console import Console
|
|
6
|
+
|
|
7
|
+
# Console used by main() to print the hook's status messages.
console = Console()

# Repository root as printed by `git rev-parse --show-toplevel`, resolved to
# an absolute path. Evaluated once, at import time; the command's return code
# is not checked.
# NOTE(review): if git prints nothing (e.g. when run outside a repository)
# this evaluates Path("").resolve(), which looks like it falls back to the
# current working directory — confirm that fallback is intended.
repo_root = Path(subprocess.run(
    ["git", "rev-parse", "--show-toplevel"],
    capture_output=True, text=True
).stdout.strip()).resolve()


# Files belonging to SentinelCodeAI itself — skip to avoid false positives
# Entries are compared verbatim against the names printed by
# `git diff --cached --name-only` in get_staged_files().
SENTINEL_OWN_FILES = {
    "src/core/secrets.py",
    "src/core/leaks.py",
    "src/ai/nlp.py",
    "src/scanner.py",
    "src/cli.py",
    "src/git_hooks/pre_commit.py",
    "README.md",
    "tests/test_secrets.py",
    "tests/test_leaks.py",
    "tests/test_ai.py",
}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def get_staged_files() -> list[Path]:
    """Return the staged files that should be scanned, as absolute paths.

    Runs ``git diff --cached --name-only``, drops names listed in
    ``SENTINEL_OWN_FILES``, resolves each remaining name against
    ``repo_root`` and keeps those that resolve to an existing regular file
    inside the repository.

    Returns:
        The resolved paths, in the order git printed them. An empty list is
        returned if anything raises (git missing, command failure, ...).
    """
    try:
        output = subprocess.check_output(
            ["git", "diff", "--cached", "--name-only"]
        )
        files = []
        # NOTE(review): without `-z`, git may print non-ASCII paths in a
        # quoted/escaped form; such names would not resolve to a file here
        # and would be skipped — confirm whether `-z` output should be used.
        for f in output.decode().splitlines():
            if f in SENTINEL_OWN_FILES:
                continue
            resolved = (repo_root / f).resolve()
            # Component-wise containment check. A plain string-prefix test
            # would also accept paths in sibling directories such as
            # "/work/repo-other" when the root is "/work/repo" (reachable
            # through a staged symlink).
            if resolved.is_relative_to(repo_root) and resolved.is_file():
                files.append(resolved)
        return files
    except Exception:
        # NOTE(review): any failure yields an empty list, which main()
        # treats as "nothing to scan" and exits 0 — confirm that failing
        # open is the intended behavior for the hook.
        return []
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def main():
    """Run the pre-commit scan and terminate with the hook's exit status.

    Exits 0 immediately when get_staged_files() returns nothing. Otherwise
    the staged files are passed to run_scan(); the process exits 1 when it
    reports a HIGH risk finding and 0 when it does not.
    """
    staged = get_staged_files()
    if not staged:
        sys.exit(0)

    console.print(f"[bold]SentinelCodeAI scanning {len(staged)} staged file(s)...[/bold]\n")

    blocked = run_scan(staged)
    if not blocked:
        console.print("\nCommit Allowed", style="bold green")
        sys.exit(0)

    console.print("\nCommit BLOCKED due to HIGH risk issues!", style="bold red")
    sys.exit(1)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from src.core.secrets import detect_secrets, summarize_findings
|
|
4
|
+
from src.core.leaks import detect_leaks
|
|
5
|
+
from src.core.cpp_ast import analyze_cpp_ast
|
|
6
|
+
from src.ai.nlp import analyze_context
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
|
|
9
|
+
# Switch stdout to UTF-8 when the stream object offers reconfigure().
# NOTE(review): presumably this avoids encoding errors when printing
# non-ASCII text on consoles with a legacy default encoding — confirm.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")

# Console used by display_results() for all scan output.
console = Console()
|
|
13
|
+
|
|
14
|
+
# File types to skip (binaries, media, etc.)
|
|
15
|
+
SKIP_EXTENSIONS = {
    ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico",
    ".pdf", ".zip", ".tar", ".gz", ".exe", ".bin",
    ".pyc", ".pyo", ".so", ".dll", ".class",
}


def collect_files(path: str) -> list[Path]:
    """Return all scannable files from a file path or folder.

    Args:
        path: A file or directory path; it is resolved to an absolute path.

    Returns:
        ``[path]`` when *path* is a file (the extension filter is not applied
        to an explicitly named file). Otherwise every regular file found
        recursively below *path* whose suffix is not in ``SKIP_EXTENSIONS``.
    """
    target = Path(path).resolve()
    if target.is_file():
        return [target]
    return [
        f for f in target.rglob("*")
        # Lower-case the suffix so "PHOTO.PNG" is skipped just like
        # "photo.png"; SKIP_EXTENSIONS holds lower-case entries only.
        if f.is_file() and f.suffix.lower() not in SKIP_EXTENSIONS
    ]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
CPP_EXTENSIONS = {".c", ".cpp", ".cc", ".cxx", ".h", ".hpp"}


def scan_file(file_path: Path) -> tuple:
    """Run every detector over one file.

    The file is read as UTF-8 with undecodable bytes ignored. Secrets, the
    per-risk summary, regex leak findings and context findings are always
    computed; for files whose lower-cased suffix is in ``CPP_EXTENSIONS``
    the results of ``analyze_cpp_ast`` are appended to the leak findings.

    Returns:
        ``(findings, summary, leaks, nlp_findings)``. If anything raises
        while reading or analysing the file, empty results with an all-zero
        summary are returned instead.
    """
    empty_result = ([], {"HIGH": 0, "MEDIUM": 0, "LOW": 0}, [], [])
    try:
        source = file_path.read_text(encoding="utf-8", errors="ignore")
        secret_hits = detect_secrets(source)
        risk_counts = summarize_findings(secret_hits)
        leak_hits = detect_leaks(source)
        # Run AST engine for C/C++ files
        is_cpp = file_path.suffix.lower() in CPP_EXTENSIONS
        if is_cpp:
            leak_hits = leak_hits + analyze_cpp_ast(source)
        return secret_hits, risk_counts, leak_hits, analyze_context(source)
    except Exception:
        return empty_result
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def display_results(file: str, findings, summary, leaks, nlp_findings) -> bool:
    """Print findings for one file. Returns True if HIGH risk was found.

    Args:
        file: Path of the scanned file, used in the section headers.
        findings: Secret findings; each has "type", "risk", "line",
            "matched" and "explanation".
        summary: Mapping with the counts for "HIGH", "MEDIUM" and "LOW".
        leaks: Leak findings; each has "type", "line", "languages",
            "content", "explanation" and optionally "engine".
        nlp_findings: Context findings; each has "keyword", "line", "risk",
            "content" and "explanation".

    Returns:
        True when ``summary["HIGH"]`` is greater than zero, else False.
    """
    # rich is already a dependency of this module (Console); imported here
    # so the function does not rely on a new module-level import.
    from rich.markup import escape

    # The file name and the scanned source text are untrusted: left
    # unescaped, bracketed text such as "[/b]" would be interpreted as Rich
    # markup instead of being shown literally.
    safe_file = escape(file)

    # (risk level, header style, per-finding style), in print order.
    risk_sections = (
        ("HIGH", "bold red", "red"),
        ("MEDIUM", "bold yellow", "yellow"),
        ("LOW", "dim yellow", "yellow"),
    )
    for level, header_style, item_style in risk_sections:
        if summary[level] > 0:
            console.print(f"[{header_style}]>> {level} RISK in {safe_file}[/{header_style}]")
            for f in findings:
                if f["risk"] == level:
                    console.print(f"[{item_style}] {f['type']} (line {f['line']})[/{item_style}]")
                    console.print(f" Detected : {escape(str(f['matched']))}")
                    console.print(f" Why : {f['explanation']}")

    # Leaks
    if leaks:
        console.print(f"[yellow]>> Leak Issues in {safe_file}[/yellow]")
        for leak in leaks:
            engine = leak.get("engine")
            engine_tag = f" [{engine}]" if engine else ""
            # Escape the whole label: the bracketed engine/language tags are
            # meant to be printed literally, not parsed as markup.
            label = escape(
                f"{leak['type']}{engine_tag} (line {leak['line']}) [{leak['languages']}]"
            )
            console.print(f"[yellow] {label}[/yellow]")
            console.print(f" Code : {escape(str(leak['content']))}")
            console.print(f" Why : {leak['explanation']}")

    # NLP
    if nlp_findings:
        console.print(f"[bold cyan]>> NLP Findings in {safe_file}[/bold cyan]")
        for n in nlp_findings:
            console.print(f"[cyan] '{n['keyword']}' (line {n['line']}) - {n['risk']}[/cyan]")
            console.print(f" Code : {escape(str(n['content']))}")
            console.print(f" Why : {n['explanation']}")

    # SAFE: nothing at all was reported for this file.
    if (
        summary["HIGH"] == 0
        and summary["MEDIUM"] == 0
        and summary["LOW"] == 0
        and not leaks
        and not nlp_findings
    ):
        console.print(f"[bold green]SAFE: {safe_file}[/bold green]")

    return summary["HIGH"] > 0
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def run_scan(files: list[Path]) -> bool:
    """Scan and report every file in *files*.

    Each file is passed through scan_file() and its results are printed by
    display_results(). All files are processed even after a HIGH risk
    finding has been seen.

    Returns:
        True if display_results() reported HIGH risk for at least one file.
    """
    # Build the full list first so no file is skipped by short-circuiting.
    high_flags = [
        display_results(str(path), *scan_file(path))
        for path in files
    ]
    return any(high_flags)
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from src.ai.nlp import analyze_context
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def test_detects_sensitive_keyword():
    """A line using 'token' as an identifier yields a 'token' finding."""
    # 'token' appears as a standalone word on this line
    hits = analyze_context('token = get_token()')
    assert any(hit["keyword"] == "token" for hit in hits)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_case_insensitive():
    """An upper-case identifier is reported under the lower-case keyword."""
    source = 'PASSWORD = os.environ["DB_PASS"]'
    hits = analyze_context(source)
    assert any(hit["keyword"] == "password" for hit in hits)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def test_no_findings_on_clean_code():
    """Code without sensitive keywords produces an empty result list."""
    source = 'def add(a, b):\n return a + b'
    hits = analyze_context(source)
    assert hits == []
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_returns_line_number():
    """The first finding carries the 1-based line number of the keyword."""
    hits = analyze_context('x = 1\nsecret = "abc"')
    first_hit = hits[0]
    assert first_hit["line"] == 2
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from src.core.leaks import detect_leaks
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def test_detects_unclosed_file():
    """A bare open() assignment is reported as python_unclosed_file."""
    source = 'f = open("data.txt", "r")\ndata = f.read()'
    hits = detect_leaks(source)
    assert any(hit["type"] == "python_unclosed_file" for hit in hits)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def test_no_leak_with_context_manager():
    """open() used inside a with-statement produces no leak findings."""
    source = 'with open("data.txt") as f:\n data = f.read()'
    assert detect_leaks(source) == []
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def test_syntax_error_handled():
    """Syntactically invalid input still returns a list."""
    # detect_leaks is regex-based and does not raise on syntax errors
    hits = detect_leaks("def broken(:")
    assert isinstance(hits, list)
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from src.core.secrets import detect_secrets
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def test_detects_aws_access_key():
    """A quoted AKIA-prefixed key is reported as aws_access_key."""
    source = 'key = "AKIAIOSFODNN7EXAMPLE"'
    hits = detect_secrets(source)
    assert any(hit["type"] == "aws_access_key" for hit in hits)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def test_detects_password():
    """A quoted password assignment is reported as a password finding."""
    source = 'password = "supersecret123"'
    hits = detect_secrets(source)
    assert any(hit["type"] == "password" for hit in hits)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def test_no_false_positive():
    """Harmless code yields no secret findings."""
    source = 'x = 42\nprint("hello world")'
    hits = detect_secrets(source)
    assert hits == []
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_returns_correct_line_number():
    """The first finding carries the 1-based line number of the secret."""
    hits = detect_secrets("x = 1\npassword = 'mypassword'")
    first_hit = hits[0]
    assert first_hit["line"] == 2
|