sentinelcodeai 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,86 @@
1
+ Metadata-Version: 2.4
2
+ Name: sentinelcodeai
3
+ Version: 0.1.0
4
+ Summary: Pre-commit security scanner — detects secrets and memory leaks before git commit.
5
+ Home-page: https://github.com/Yuva-Deekshitha-N/sentinelcodeai.git
6
+ Author: CodeSentinel
7
+ Author-email: yuvadeekshithanamani@gmail.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: rich
14
+ Requires-Dist: pycparser>=2.21
15
+ Dynamic: author
16
+ Dynamic: author-email
17
+ Dynamic: classifier
18
+ Dynamic: description
19
+ Dynamic: description-content-type
20
+ Dynamic: home-page
21
+ Dynamic: requires-dist
22
+ Dynamic: requires-python
23
+ Dynamic: summary
24
+
25
+ # SentinelCodeAI
26
+
27
+ Static analysis tool that detects secrets, memory leaks, and sensitive context in any language — with automatic Git pre-commit hook integration.
28
+
29
+ ## Structure
30
+
31
+ ```
32
+ src/core/secrets.py # Regex-based secret detection (11 patterns)
33
+ src/core/leaks.py # AST-based memory leak detection
34
+ src/ai/nlp.py # NLP keyword context analysis
35
+ src/git_hooks/pre_commit.py # Git pre-commit hook logic
36
+ src/scanner.py # Shared scan + display engine
37
+ src/cli.py # CLI entry point
38
+ install_hook.py # One-time hook installer
39
+ ```
40
+
41
+ ## Setup
42
+
43
+ ### Option A — pip install (hook auto-installs)
44
+ ```bash
45
+ pip install -e .
46
+ ```
47
+
48
+ ### Option B — clone without pip
49
+ ```bash
50
+ pip install -r requirements.txt
51
+ python install_hook.py
52
+ ```
53
+
54
+ ## Scan manually (file or folder)
55
+
56
+ ```bash
57
+ # scan a single file
58
+ sentinel --path path/to/file.py
59
+
60
+ # scan an entire folder
61
+ sentinel --path path/to/folder/
62
+
63
+ # without pip install
64
+ python -m src.cli --path path/to/file_or_folder
65
+ ```
66
+
67
+ ## How the pre-commit hook works
68
+
69
+ Once installed, every `git commit` is automatically intercepted:
70
+
71
+ ```
72
+ git commit -m "my changes"
73
+ |
74
+ v
75
+ SentinelCodeAI scans all staged files
76
+ |
77
+ v
78
+ HIGH risk found --> commit BLOCKED + report shown
79
+ All clean --> commit goes through
80
+ ```
81
+
82
+ ## Run Tests
83
+
84
+ ```bash
85
+ pytest tests/
86
+ ```
@@ -0,0 +1,62 @@
1
+ # SentinelCodeAI
2
+
3
+ Static analysis tool that detects secrets, memory leaks, and sensitive context in any language — with automatic Git pre-commit hook integration.
4
+
5
+ ## Structure
6
+
7
+ ```
8
+ src/core/secrets.py # Regex-based secret detection (15 patterns)
9
+ src/core/leaks.py # Regex-based resource/memory leak detection (C/C++ AST checks live in src/core/cpp_ast.py)
10
+ src/ai/nlp.py # NLP keyword context analysis
11
+ src/git_hooks/pre_commit.py # Git pre-commit hook logic
12
+ src/scanner.py # Shared scan + display engine
13
+ src/cli.py # CLI entry point
14
+ install_hook.py # One-time hook installer
15
+ ```
16
+
17
+ ## Setup
18
+
19
+ ### Option A — pip install (hook auto-installs)
20
+ ```bash
21
+ pip install -e .
22
+ ```
23
+
24
+ ### Option B — clone without pip
25
+ ```bash
26
+ pip install -r requirements.txt
27
+ python install_hook.py
28
+ ```
29
+
30
+ ## Scan manually (file or folder)
31
+
32
+ ```bash
33
+ # scan a single file
34
+ sentinel --path path/to/file.py
35
+
36
+ # scan an entire folder
37
+ sentinel --path path/to/folder/
38
+
39
+ # without pip install
40
+ python -m src.cli --path path/to/file_or_folder
41
+ ```
42
+
43
+ ## How the pre-commit hook works
44
+
45
+ Once installed, every `git commit` is automatically intercepted:
46
+
47
+ ```
48
+ git commit -m "my changes"
49
+ |
50
+ v
51
+ SentinelCodeAI scans all staged files
52
+ |
53
+ v
54
+ HIGH risk found --> commit BLOCKED + report shown
55
+ All clean --> commit goes through
56
+ ```
57
+
58
+ ## Run Tests
59
+
60
+ ```bash
61
+ pytest tests/
62
+ ```
@@ -0,0 +1,86 @@
1
+ Metadata-Version: 2.4
2
+ Name: sentinelcodeai
3
+ Version: 0.1.0
4
+ Summary: Pre-commit security scanner — detects secrets and memory leaks before git commit.
5
+ Home-page: https://github.com/Yuva-Deekshitha-N/sentinelcodeai.git
6
+ Author: CodeSentinel
7
+ Author-email: yuvadeekshithanamani@gmail.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: rich
14
+ Requires-Dist: pycparser>=2.21
15
+ Dynamic: author
16
+ Dynamic: author-email
17
+ Dynamic: classifier
18
+ Dynamic: description
19
+ Dynamic: description-content-type
20
+ Dynamic: home-page
21
+ Dynamic: requires-dist
22
+ Dynamic: requires-python
23
+ Dynamic: summary
24
+
25
+ # SentinelCodeAI
26
+
27
+ Static analysis tool that detects secrets, memory leaks, and sensitive context in any language — with automatic Git pre-commit hook integration.
28
+
29
+ ## Structure
30
+
31
+ ```
32
+ src/core/secrets.py # Regex-based secret detection (11 patterns)
33
+ src/core/leaks.py # AST-based memory leak detection
34
+ src/ai/nlp.py # NLP keyword context analysis
35
+ src/git_hooks/pre_commit.py # Git pre-commit hook logic
36
+ src/scanner.py # Shared scan + display engine
37
+ src/cli.py # CLI entry point
38
+ install_hook.py # One-time hook installer
39
+ ```
40
+
41
+ ## Setup
42
+
43
+ ### Option A — pip install (hook auto-installs)
44
+ ```bash
45
+ pip install -e .
46
+ ```
47
+
48
+ ### Option B — clone without pip
49
+ ```bash
50
+ pip install -r requirements.txt
51
+ python install_hook.py
52
+ ```
53
+
54
+ ## Scan manually (file or folder)
55
+
56
+ ```bash
57
+ # scan a single file
58
+ sentinel --path path/to/file.py
59
+
60
+ # scan an entire folder
61
+ sentinel --path path/to/folder/
62
+
63
+ # without pip install
64
+ python -m src.cli --path path/to/file_or_folder
65
+ ```
66
+
67
+ ## How the pre-commit hook works
68
+
69
+ Once installed, every `git commit` is automatically intercepted:
70
+
71
+ ```
72
+ git commit -m "my changes"
73
+ |
74
+ v
75
+ SentinelCodeAI scans all staged files
76
+ |
77
+ v
78
+ HIGH risk found --> commit BLOCKED + report shown
79
+ All clean --> commit goes through
80
+ ```
81
+
82
+ ## Run Tests
83
+
84
+ ```bash
85
+ pytest tests/
86
+ ```
@@ -0,0 +1,22 @@
1
+ README.md
2
+ setup.py
3
+ sentinelcodeai.egg-info/PKG-INFO
4
+ sentinelcodeai.egg-info/SOURCES.txt
5
+ sentinelcodeai.egg-info/dependency_links.txt
6
+ sentinelcodeai.egg-info/entry_points.txt
7
+ sentinelcodeai.egg-info/requires.txt
8
+ sentinelcodeai.egg-info/top_level.txt
9
+ src/__init__.py
10
+ src/cli.py
11
+ src/scanner.py
12
+ src/ai/__init__.py
13
+ src/ai/nlp.py
14
+ src/core/__init__.py
15
+ src/core/cpp_ast.py
16
+ src/core/leaks.py
17
+ src/core/secrets.py
18
+ src/git_hooks/__init__.py
19
+ src/git_hooks/pre_commit.py
20
+ tests/test_ai.py
21
+ tests/test_leaks.py
22
+ tests/test_secrets.py
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ sca = src.cli:main
3
+ sentinel = src.cli:main
@@ -0,0 +1,2 @@
1
+ rich
2
+ pycparser>=2.21
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,29 @@
1
"""Packaging script for the sentinelcodeai distribution."""
from pathlib import Path

from setuptools import setup, find_packages

# Locate the README next to this script instead of relying on the current
# working directory, and let read_text() open *and close* the file; the
# previous bare open(...).read() left the handle for the garbage collector.
_README = Path(__file__).resolve().parent / "README.md"

setup(
    name="sentinelcodeai",
    version="0.1.0",
    author="CodeSentinel",
    author_email="yuvadeekshithanamani@gmail.com",
    description="Pre-commit security scanner — detects secrets and memory leaks before git commit.",
    long_description=_README.read_text(encoding="utf-8"),
    long_description_content_type="text/markdown",
    url="https://github.com/Yuva-Deekshitha-N/sentinelcodeai.git",
    packages=find_packages(),
    python_requires=">=3.10",
    install_requires=[
        "rich",
        "pycparser>=2.21",
    ],
    # Both console commands resolve to the same CLI entry point.
    entry_points={
        "console_scripts": [
            "sentinel=src.cli:main",
            "sca=src.cli:main",
        ]
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
)
File without changes
File without changes
@@ -0,0 +1,28 @@
1
+ import re
2
+
3
# Keyword -> explanation shown when that keyword appears as a whole word on a
# line. Dictionary order is the order in which findings for one line are
# reported.
SENSITIVE_KEYWORDS = {
    "password": "A variable named 'password' likely holds a plaintext credential. Plaintext passwords in code are a critical security risk.",
    "secret": "A variable named 'secret' may contain a cryptographic secret or API secret that should never be hardcoded.",
    "token": "A variable named 'token' may expose an authentication or API token that grants access to a service.",
    "private": "A variable named 'private' may reference a private key or sensitive private data.",
    "credential": "A variable named 'credential' likely holds authentication data such as a username/password pair or certificate.",
    "api_key": "A variable named 'api_key' almost certainly contains a service API key that should be stored in environment variables.",
    "auth": "A variable named 'auth' may hold authentication headers, tokens, or credentials used to access protected resources.",
    "access_key": "A variable named 'access_key' likely contains a cloud or service access key that grants programmatic access.",
    "passphrase": "A variable named 'passphrase' contains a passphrase used to protect a private key or encrypted data.",
}


def analyze_context(code: str) -> list[dict]:
    """Flag every line of *code* that mentions a sensitive keyword.

    Each keyword in ``SENSITIVE_KEYWORDS`` is searched case-insensitively
    between ``\\b`` word boundaries, so ``Password`` matches while
    ``my_password`` does not (``_`` counts as a word character).

    Returns:
        One dict per (line, keyword) hit with the keys ``line`` (1-based),
        ``keyword``, ``content`` (the stripped line), ``risk`` (always
        ``"MEDIUM"``) and ``explanation``. Hits are ordered by line, then by
        keyword order in ``SENSITIVE_KEYWORDS``.
    """
    report: list[dict] = []
    for number, text in enumerate(code.splitlines(), start=1):
        report.extend(
            {
                "line": number,
                "keyword": word,
                "content": text.strip(),
                "risk": "MEDIUM",
                "explanation": reason,
            }
            for word, reason in SENSITIVE_KEYWORDS.items()
            if re.search(rf"\b{word}\b", text, re.IGNORECASE)
        )
    return report
@@ -0,0 +1,85 @@
1
+ import argparse
2
+ import subprocess
3
+ import sys
4
+ from pathlib import Path
5
+ from src.scanner import collect_files, run_scan
6
+ from rich.console import Console
7
+
8
+ console = Console()
9
+
10
+
11
def install_global_hook():
    """Install the bundled pre-commit script as a machine-wide git hook.

    Creates a ``global_hooks`` folder, copies ``git_hooks/pre_commit_hook.sh``
    into it as ``pre-commit`` (mode 0o755) and sets git's global
    ``core.hooksPath`` to that folder. Prints an error and exits with status 1
    when the script is missing or ``git config`` fails.
    """
    import shutil

    module_path = Path(__file__).resolve()

    # NOTE(review): parents[2] is two directories above the one containing
    # this module — confirm that is the intended home for "global_hooks".
    hooks_dir = module_path.parents[2] / "global_hooks"
    hooks_dir.mkdir(exist_ok=True)

    hook_src = module_path.parent / "git_hooks" / "pre_commit_hook.sh"
    hook_dest = hooks_dir / "pre-commit"

    if not hook_src.exists():
        console.print("[red]ERROR: Hook source file not found.[/red]")
        sys.exit(1)

    shutil.copy(str(hook_src), str(hook_dest))
    hook_dest.chmod(0o755)

    # Point every repository on this machine at the shared hooks folder.
    git_command = ["git", "config", "--global", "core.hooksPath", str(hooks_dir)]
    outcome = subprocess.run(git_command, capture_output=True, text=True)

    if outcome.returncode != 0:
        console.print(f"[red]ERROR: Failed to set global hooks path: {outcome.stderr}[/red]")
        sys.exit(1)

    for message in (
        "[bold green]SentinelCodeAI global hook installed successfully.[/bold green]",
        f"Hooks folder : {hooks_dir}",
        "Every git commit on this machine is now protected automatically.",
    ):
        console.print(message)
40
+
41
+
42
def main():
    """Command-line entry point for the ``sentinel`` program.

    ``--install-global`` installs the global hook and exits 0. Otherwise
    ``--path`` is required (help is printed and the exit status is 1 when it
    is missing); the collected files are scanned and the process exits 1 if
    the scan reports HIGH risk, else 0.
    """
    parser = argparse.ArgumentParser(
        prog="sentinel",
        description="SentinelCodeAI — scan a file or folder for secrets, leaks, and sensitive context.",
    )
    parser.add_argument("--path", help="Path to a file or folder to scan.")
    parser.add_argument(
        "--install-global",
        action="store_true",
        help="Install SentinelCodeAI as a global Git hook (runs on every repo on this machine).",
    )
    options = parser.parse_args()

    if options.install_global:
        install_global_hook()
        sys.exit(0)

    if not options.path:
        parser.print_help()
        sys.exit(1)

    targets = collect_files(options.path)
    if not targets:
        console.print("[yellow]No scannable files found.[/yellow]")
        sys.exit(0)

    console.print(f"\n[bold]Scanning {len(targets)} file(s) in: {options.path}[/bold]\n")

    if run_scan(targets):
        verdict = "\n[bold red]HIGH risk issues found. Fix them before committing.[/bold red]"
        status = 1
    else:
        verdict = "\n[bold green]Scan complete. No HIGH risk issues found.[/bold green]"
        status = 0

    console.print(verdict)
    sys.exit(status)


if __name__ == "__main__":
    main()
File without changes
@@ -0,0 +1,235 @@
1
+ """
2
+ C/C++ AST-based static analysis using pycparser.
3
+
4
+ Walks the real Abstract Syntax Tree of C/C++ source files to detect:
5
+ - malloc() without a paired free() → memory leak
6
+ - fopen() without a paired fclose() → resource leak
7
+ - new without delete → memory leak (regex-assisted, C++ extension)
8
+ - Pointer assigned then reassigned before free → dangling / lost pointer
9
+ """
10
+
11
+ import re
12
+ from pycparser import c_parser, c_ast, parse_file
13
+
14
+
15
+ # ---------------------------------------------------------------------------
16
+ # Helpers
17
+ # ---------------------------------------------------------------------------
18
+
19
+ def _strip_cpp_comments(code: str) -> str:
20
+ """Remove // and /* */ comments so the C parser doesn't choke."""
21
+ code = re.sub(r"//[^\n]*", "", code)
22
+ code = re.sub(r"/\*.*?\*/", "", code, flags=re.DOTALL)
23
+ return code
24
+
25
+
26
+ def _remove_cpp_extensions(code: str) -> str:
27
+ """
28
+ Strip C++-only syntax that pycparser (a pure-C parser) can't handle,
29
+ so we can still analyse the C-style memory calls inside .cpp files.
30
+ """
31
+ # Remove #include lines
32
+ code = re.sub(r"^\s*#include\s*[<\"][^\n]*", "", code, flags=re.MULTILINE)
33
+ # Remove using namespace / using std::
34
+ code = re.sub(r"^\s*using\s+[^\n;]+;", "", code, flags=re.MULTILINE)
35
+ # Remove class / struct definitions (keep function bodies)
36
+ code = re.sub(r"\bclass\b", "struct", code)
37
+ # Remove :: scope resolution
38
+ code = re.sub(r"\w+::", "", code)
39
+ # Remove template declarations
40
+ code = re.sub(r"template\s*<[^>]*>", "", code)
41
+ # Remove C++ casts
42
+ code = re.sub(r"\b(static_cast|dynamic_cast|reinterpret_cast|const_cast)\s*<[^>]*>", "", code)
43
+ return code
44
+
45
+
46
+ # ---------------------------------------------------------------------------
47
+ # AST visitor — collects malloc/free/fopen/fclose call sites
48
+ # ---------------------------------------------------------------------------
49
+
50
class _MemoryCallVisitor(c_ast.NodeVisitor):
    """AST visitor that logs each direct call to a tracked libc routine.

    Tracked names are malloc, calloc, realloc, free, fopen and fclose. Only
    calls whose callee is a plain identifier are recorded.
    """

    _TRACKED = frozenset({"malloc", "calloc", "realloc", "free", "fopen", "fclose"})

    def __init__(self):
        # One entry per matching call site: {"name": str, "line": int}
        self.calls: list[dict] = []

    def visit_FuncCall(self, node):
        callee = node.name
        if callee and isinstance(callee, c_ast.ID) and callee.name in self._TRACKED:
            # Line 0 is recorded when the node carries no coordinate.
            where = node.coord.line if node.coord else 0
            self.calls.append({"name": callee.name, "line": where})
        # Keep walking so calls nested inside this call are visited too.
        self.generic_visit(node)
63
+
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # Public API
67
+ # ---------------------------------------------------------------------------
68
+
69
def analyze_cpp_ast(code: str) -> list[dict]:
    """Run every C/C++ leak check on *code* and merge the results.

    Three passes run in a fixed order: the pycparser-based malloc/free and
    fopen/fclose pairing, the regex-based new/delete pairing, and the
    regex-based lost-pointer check.

    Returns:
        A list of dicts compatible with the existing leak format:
        {type, line, content, explanation, languages, engine}
    """
    source_lines = code.splitlines()
    passes = (
        _ast_analysis,
        _cpp_new_delete_analysis,
        _dangling_pointer_analysis,
    )

    combined: list[dict] = []
    for run_pass in passes:
        combined.extend(run_pass(code, source_lines))
    return combined
88
+
89
+
90
+ # ---------------------------------------------------------------------------
91
+ # AST analysis (malloc/free, fopen/fclose pairing)
92
+ # ---------------------------------------------------------------------------
93
+
94
def _ast_analysis(code: str, lines: list[str]) -> list[dict]:
    """Pair allocation and release calls found in the pycparser AST.

    The source is cleaned, prefixed with a small libc preamble and parsed.
    If the translation unit calls malloc/calloc/realloc but never free, each
    allocation call is reported; likewise for fopen without any fclose.

    Line numbers from the parser count from the first line of the text it is
    given, which includes the injected preamble. They are shifted back by the
    preamble length so ``line`` and ``content`` refer to the caller's source.
    (This assumes the two cleaning helpers keep the line count unchanged —
    TODO confirm.)

    Args:
        code: Raw C/C++ source text.
        lines: ``code`` split into lines, used to fetch the ``content``.

    Returns:
        Findings in the {type, line, content, explanation, languages, engine}
        format; an empty list when the source cannot be parsed.
    """
    # Minimal declarations so the parser doesn't fail on FILE*, size_t and
    # the libc functions of interest.
    preamble = (
        "typedef unsigned long size_t;\n"
        "typedef struct _IO_FILE FILE;\n"
        "void *malloc(size_t size);\n"
        "void *calloc(size_t n, size_t size);\n"
        "void *realloc(void *ptr, size_t size);\n"
        "void free(void *ptr);\n"
        "FILE *fopen(const char *path, const char *mode);\n"
        "int fclose(FILE *stream);\n"
    )
    preamble_lines = preamble.count("\n")

    try:
        clean = _strip_cpp_comments(code)
        clean = _remove_cpp_extensions(clean)
        ast = c_parser.CParser().parse(preamble + clean, filename="<input>")
        visitor = _MemoryCallVisitor()
        visitor.visit(ast)
    except Exception:
        # Parser failed (complex C++ syntax) — deliberate best-effort
        # fallback; the regex-based checks still run elsewhere.
        return []

    def _source_line(call: dict) -> int:
        # 0 means "no coordinate available" and is passed through unchanged.
        return call["line"] - preamble_lines if call["line"] else 0

    # (acquire names, release name, finding type, explanation)
    checks = (
        (
            ("malloc", "calloc", "realloc"),
            "free",
            "ast_malloc_no_free",
            "[AST] malloc/calloc/realloc detected but no free() found in this "
            "translation unit. Heap memory will never be returned to the OS.",
        ),
        (
            ("fopen",),
            "fclose",
            "ast_fopen_no_fclose",
            "[AST] fopen() detected but no fclose() found in this translation unit. "
            "The file descriptor will leak until the process exits.",
        ),
    )

    findings: list[dict] = []
    for acquire_names, release_name, finding_type, explanation in checks:
        if any(call["name"] == release_name for call in visitor.calls):
            continue  # at least one release call exists: nothing reported
        for call in visitor.calls:
            if call["name"] not in acquire_names:
                continue
            ln = _source_line(call)
            src_line = lines[ln - 1].strip() if 0 < ln <= len(lines) else ""
            findings.append({
                "type": finding_type,
                "line": ln,
                "content": src_line,
                "explanation": explanation,
                "languages": "C/C++",
                "engine": "AST",
            })

    return findings
161
+
162
+
163
+ # ---------------------------------------------------------------------------
164
+ # C++ new / delete analysis (regex-assisted, AST-style pairing logic)
165
+ # ---------------------------------------------------------------------------
166
+
167
+ def _cpp_new_delete_analysis(code: str, lines: list[str]) -> list[dict]:
168
+ findings: list[dict] = []
169
+
170
+ new_lines = [i + 1 for i, l in enumerate(lines) if re.search(r"\bnew\b", l)]
171
+ delete_count = sum(1 for l in lines if re.search(r"\bdelete\b", l))
172
+
173
+ if new_lines and delete_count == 0:
174
+ for ln in new_lines:
175
+ findings.append({
176
+ "type": "ast_new_no_delete",
177
+ "line": ln,
178
+ "content": lines[ln - 1].strip(),
179
+ "explanation": (
180
+ "[AST] 'new' allocates heap memory but no 'delete' was found. "
181
+ "Prefer smart pointers (std::unique_ptr / std::shared_ptr) to avoid leaks."
182
+ ),
183
+ "languages": "C++",
184
+ "engine": "AST",
185
+ })
186
+
187
+ return findings
188
+
189
+
190
+ # ---------------------------------------------------------------------------
191
+ # Dangling pointer detection
192
+ # ---------------------------------------------------------------------------
193
+
194
+ def _dangling_pointer_analysis(code: str, lines: list[str]) -> list[dict]:
195
+ """
196
+ Detect the pattern:
197
+ ptr = malloc(...); ← allocation
198
+ ptr = something; ← reassignment WITHOUT free → original block lost
199
+ """
200
+ findings: list[dict] = []
201
+
202
+ # Collect pointer names that were malloc'd
203
+ malloc_vars: dict[str, int] = {}
204
+ for i, line in enumerate(lines, start=1):
205
+ m = re.search(r"\b(\w+)\s*=\s*(?:malloc|calloc|realloc)\s*\(", line)
206
+ if m:
207
+ malloc_vars[m.group(1)] = i
208
+
209
+ # Check if any of those vars are reassigned without a free in between
210
+ for var, alloc_line in malloc_vars.items():
211
+ freed = False
212
+ for i, line in enumerate(lines, start=1):
213
+ if i <= alloc_line:
214
+ continue
215
+ if re.search(rf"\bfree\s*\(\s*{var}\s*\)", line):
216
+ freed = True
217
+ break
218
+ # Reassigned without free
219
+ if re.search(rf"\b{var}\s*=\s*(?!NULL|nullptr|0\b)", line):
220
+ if not freed:
221
+ findings.append({
222
+ "type": "ast_dangling_pointer",
223
+ "line": i,
224
+ "content": lines[i - 1].strip(),
225
+ "explanation": (
226
+ f"[AST] Pointer '{var}' (allocated at line {alloc_line}) is "
227
+ "reassigned before being freed. The original heap block is lost — "
228
+ "this is a classic dangling/lost-pointer memory leak."
229
+ ),
230
+ "languages": "C/C++",
231
+ "engine": "AST",
232
+ })
233
+ break
234
+
235
+ return findings
@@ -0,0 +1,82 @@
1
+ import re
2
+
3
# Line-oriented regex rules for spotting resources that are opened but may
# never be released. Every rule is tried on every line regardless of the
# file's language; "languages" is descriptive metadata for the report.
LEAK_PATTERNS = {
    # ---- Python ----
    "python_unclosed_file": {
        "pattern": r"\bopen\s*\([^)]+\)(?!\s*as\b)",
        "explanation": "open() called without a 'with' block. File handle may never be closed, leaking OS resources.",
        "languages": "Python",
    },
    "python_unclosed_db": {
        "pattern": r"\b(psycopg2|pymysql|sqlite3|cx_Oracle|pyodbc)\.connect\s*\(",
        "explanation": "Database connection opened. If not closed or used in a context manager, the connection leaks and exhausts the DB connection pool.",
        "languages": "Python",
    },
    "python_unclosed_socket": {
        "pattern": r"\bsocket\.socket\s*\(",
        "explanation": "Socket created without a 'with' block. Unclosed sockets leak file descriptors and can cause connection exhaustion.",
        "languages": "Python",
    },
    "python_unclosed_session": {
        "pattern": r"\brequests\.Session\s*\(\s*\)(?!\s*as\b)",
        "explanation": "requests.Session() opened without a context manager. Unclosed sessions leak TCP connections.",
        "languages": "Python",
    },

    # ---- C / C++ ----
    "cpp_malloc_no_free": {
        "pattern": r"\bmalloc\s*\(",
        "explanation": "malloc() allocates heap memory. If free() is never called, this causes a memory leak that grows over time.",
        "languages": "C/C++",
    },
    "cpp_new_no_delete": {
        "pattern": r"\bnew\s+\w+",
        "explanation": "'new' allocates heap memory. Without a matching 'delete', the memory is never returned to the OS.",
        "languages": "C/C++",
    },
    "cpp_fopen_no_fclose": {
        "pattern": r"\bfopen\s*\(",
        "explanation": "fopen() opens a file handle. If fclose() is never called, the file descriptor leaks.",
        "languages": "C/C++",
    },

    # ---- Java ----
    "java_unclosed_stream": {
        "pattern": r"\bnew\s+(FileInputStream|FileOutputStream|BufferedReader|FileReader|FileWriter)\s*\(",
        "explanation": "Java stream opened without try-with-resources. If close() is not called, the stream leaks file descriptors.",
        "languages": "Java",
    },
    "java_unclosed_connection": {
        "pattern": r"\bDriverManager\.getConnection\s*\(",
        "explanation": "JDBC connection opened. If not closed in a finally block or try-with-resources, the DB connection leaks.",
        "languages": "Java",
    },

    # ---- JavaScript / TypeScript ----
    "js_unclosed_fs": {
        "pattern": r"\bfs\.open\s*\(",
        "explanation": "fs.open() called without a corresponding fs.close(). Leaks file descriptors in Node.js.",
        "languages": "JavaScript/TypeScript",
    },
    "js_event_listener": {
        "pattern": r"\baddEventListener\s*\(",
        "explanation": "Event listener added. If removeEventListener() is never called, it prevents garbage collection and causes memory leaks.",
        "languages": "JavaScript/TypeScript",
    },
}


def detect_leaks(code: str) -> list[dict]:
    """Apply every rule in ``LEAK_PATTERNS`` to each line of *code*.

    Returns:
        One dict per (line, rule) match with the keys ``type`` (the rule
        name), ``line`` (1-based), ``content`` (the stripped line),
        ``explanation`` and ``languages``. A line is reported at most once
        per rule; results are ordered by line, then by rule order.
    """
    report: list[dict] = []
    for number, text in enumerate(code.splitlines(), start=1):
        for rule_name, rule in LEAK_PATTERNS.items():
            if re.search(rule["pattern"], text) is None:
                continue
            report.append(dict(
                type=rule_name,
                line=number,
                content=text.strip(),
                explanation=rule["explanation"],
                languages=rule["languages"],
            ))
    return report
@@ -0,0 +1,130 @@
1
+ import re
2
+ from typing import List, Dict
3
+
4
# Secret patterns with risk levels and explanations.
# Each entry: regex ("pattern"), severity ("risk": HIGH / MEDIUM / LOW) and
# the human-readable "explanation" shown in the report.
SECRET_PATTERNS = {
    "aws_access_key": {
        "pattern": r"AKIA[0-9A-Z]{16}",
        "risk": "HIGH",
        "explanation": "Hardcoded AWS Access Key ID detected. Attackers can use this to access your AWS account, spin up resources, steal data, or incur massive charges."
    },
    "aws_secret_key": {
        "pattern": r"(?i)aws(.{0,20})?['\"][0-9a-zA-Z/+]{40}['\"]",
        "risk": "HIGH",
        "explanation": "Hardcoded AWS Secret Access Key detected. Combined with an Access Key ID, this grants full programmatic access to your AWS account."
    },
    "generic_api_key": {
        "pattern": r"(?i)(api_key|apikey|api-key)\s*=\s*['\"][a-zA-Z0-9]{16,}['\"]",
        "risk": "HIGH",
        "explanation": "A hardcoded API key was found. If this key is pushed to a public repo, any third party can authenticate as you and abuse the associated service."
    },
    "private_key": {
        "pattern": r"-----BEGIN (RSA|EC|DSA|OPENSSH) PRIVATE KEY-----",
        "risk": "HIGH",
        "explanation": "A private cryptographic key is embedded in the code. This can be used to impersonate your server, decrypt communications, or forge signatures."
    },
    "password": {
        "pattern": r"(?i)(password|passwd|pwd)\s*=\s*['\"].{6,}['\"]",
        "risk": "HIGH",
        "explanation": "A plaintext password is hardcoded. Passwords in source code are permanently stored in Git history even after deletion and can be extracted by anyone with repo access."
    },
    "github_token": {
        "pattern": r"ghp_[A-Za-z0-9]{36}",
        "risk": "HIGH",
        "explanation": "A GitHub Personal Access Token was found. This grants the holder read/write access to your repositories and account settings."
    },
    "jwt_token": {
        "pattern": r"eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+",
        "risk": "MEDIUM",
        "explanation": "A JWT token is hardcoded. If unexpired, it can be replayed to authenticate as the token's subject without needing credentials."
    },
    "google_api_key": {
        "pattern": r"AIza[0-9A-Za-z\-_]{35}",
        "risk": "HIGH",
        "explanation": "A Google API key was found. Exposure can lead to quota theft, unauthorized use of Google services, and unexpected billing on your account."
    },
    "slack_token": {
        "pattern": r"xox[baprs]-[0-9a-zA-Z]{10,48}",
        "risk": "HIGH",
        "explanation": "A Slack token is hardcoded. This allows an attacker to read messages, post as your bot/user, and access private channels in your workspace."
    },
    "database_url": {
        "pattern": r"(postgres|mysql)://.*:.*@",
        "risk": "HIGH",
        "explanation": "A database connection URL with embedded credentials was found. This exposes your database host, username, and password to anyone who reads the code."
    },
    "mongodb_url": {
        "pattern": r"mongodb(\+srv)?://[^:]+:[^@]+@",
        "risk": "HIGH",
        "explanation": "A MongoDB connection string with embedded credentials was found. Exposes your database host, username, and password publicly."
    },
    "firebase_api_key": {
        "pattern": r"(?i)firebase.*api.?key\s*[=:]\s*['\"][A-Za-z0-9_\-]{20,}['\"]",
        "risk": "HIGH",
        "explanation": "A Firebase API key was found. Exposes your Firebase project to unauthorized reads, writes, and abuse of Firebase services."
    },
    "firebase_db_url": {
        "pattern": r"https://[a-z0-9-]+\.firebaseio\.com",
        "risk": "HIGH",
        "explanation": "A Firebase Realtime Database URL was found. If database rules are misconfigured, attackers can read or write all data."
    },
    "firebase_secret": {
        "pattern": r"(?i)firebase.{0,20}secret\s*[=:]\s*['\"][A-Za-z0-9]{20,}['\"]",
        "risk": "HIGH",
        "explanation": "A Firebase legacy secret was found. This grants full admin access to your Firebase project."
    },
    "test_key": {
        "pattern": r"(?i)test[_-]?key",
        "risk": "LOW",
        "explanation": "A test key identifier was found. While likely not a real secret, test keys are sometimes accidentally swapped with production keys."
    }
}


def detect_secrets(code: str) -> List[Dict]:
    """
    Scan code line by line and detect potential secrets.

    Every pattern in ``SECRET_PATTERNS`` is applied to every line; each
    non-overlapping match produces one finding.

    Returns:
        List of findings, each a dict with:
        - type: the pattern name
        - risk: "HIGH", "MEDIUM" or "LOW"
        - line: 1-based line number
        - content: the stripped source line
        - matched: the full text matched by the pattern
        - explanation: human-readable description of the risk
    """
    findings = []

    for line_num, line in enumerate(code.splitlines(), start=1):
        stripped = line.strip()
        for secret_type, config in SECRET_PATTERNS.items():
            # finditer + group(0) yields the whole match. re.findall would
            # return only the capture group for patterns that contain one
            # (e.g. just "postgres" for a database URL).
            for match in re.finditer(config["pattern"], line):
                findings.append({
                    "type": secret_type,
                    "risk": config["risk"],
                    "line": line_num,
                    "content": stripped,
                    "matched": match.group(0),
                    "explanation": config["explanation"],
                })

    return findings
115
+
116
+
117
def summarize_findings(findings: List[Dict]) -> Dict:
    """
    Count findings per risk level.

    Returns a dict with exactly the keys "HIGH", "MEDIUM" and "LOW"; levels
    with no findings are reported as 0. A finding whose "risk" is none of
    these raises KeyError.
    """
    tally = dict.fromkeys(("HIGH", "MEDIUM", "LOW"), 0)

    for finding in findings:
        level = finding["risk"]
        tally[level] = tally[level] + 1

    return tally
File without changes
@@ -0,0 +1,66 @@
1
+ import subprocess
2
+ import sys
3
+ from pathlib import Path
4
+ from src.scanner import collect_files, run_scan
5
+ from rich.console import Console
6
+
7
console = Console()

# Ask git (at import time) for the top-level directory of the repository the
# hook is running in, then normalise it to an absolute path.
_toplevel = subprocess.run(
    ["git", "rev-parse", "--show-toplevel"],
    capture_output=True,
    text=True,
)
repo_root = Path(_toplevel.stdout.strip()).resolve()


# Files belonging to SentinelCodeAI itself — skip to avoid false positives.
# Entries are compared against the repo-relative names git prints for staged
# files.
SENTINEL_OWN_FILES = {
    # scanner sources
    "src/core/secrets.py",
    "src/core/leaks.py",
    "src/ai/nlp.py",
    "src/scanner.py",
    "src/cli.py",
    "src/git_hooks/pre_commit.py",
    # documentation
    "README.md",
    # test suite
    "tests/test_secrets.py",
    "tests/test_leaks.py",
    "tests/test_ai.py",
}
28
+
29
+
30
def get_staged_files() -> list[Path]:
    """Return the staged files that should be scanned, as absolute paths.

    Runs ``git diff --cached --name-only`` and keeps each listed path that:

    * is not one of SentinelCodeAI's own files (``SENTINEL_OWN_FILES``),
    * resolves to a location inside ``repo_root``, and
    * exists as a regular file (so staged deletions are dropped).

    Returns:
        The resolved paths, in the order git printed them; an empty list if
        git cannot be run or any other error occurs (best effort: the hook
        then has nothing to scan).
    """
    try:
        output = subprocess.check_output(
            ["git", "diff", "--cached", "--name-only"]
        )
        files = []
        for name in output.decode().splitlines():
            if name in SENTINEL_OWN_FILES:
                continue
            resolved = (repo_root / name).resolve()
            # Compare whole path components. A plain string prefix test would
            # also accept siblings such as "/work/repo-other" for a root of
            # "/work/repo".
            if resolved.is_relative_to(repo_root) and resolved.is_file():
                files.append(resolved)
        return files
    except Exception:
        return []
45
+
46
+
47
def main():
    """Pre-commit hook entry point.

    Exits 0 immediately when there is nothing staged to scan. Otherwise the
    staged files are scanned and the process exits 1 (blocking the commit)
    if the scan reports HIGH risk, else 0.
    """
    staged = get_staged_files()
    if not staged:
        sys.exit(0)

    console.print(f"[bold]SentinelCodeAI scanning {len(staged)} staged file(s)...[/bold]\n")

    if run_scan(staged):
        message, colour, status = "\nCommit BLOCKED due to HIGH risk issues!", "bold red", 1
    else:
        message, colour, status = "\nCommit Allowed", "bold green", 0

    console.print(message, style=colour)
    sys.exit(status)


if __name__ == "__main__":
    main()
@@ -0,0 +1,120 @@
1
+ import sys
2
+ from pathlib import Path
3
+ from src.core.secrets import detect_secrets, summarize_findings
4
+ from src.core.leaks import detect_leaks
5
+ from src.core.cpp_ast import analyze_cpp_ast
6
+ from src.ai.nlp import analyze_context
7
+ from rich.console import Console
8
+
9
# Switch stdout to UTF-8 when the stream object offers reconfigure()
# (presumably so non-ASCII output does not fail on consoles that default to
# a legacy encoding — confirm).
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")

# Module-wide rich console used for all scan output.
console = Console()
13
+
14
# File types to skip (binaries, media, etc.). Entries are lowercase; file
# suffixes are lowercased before they are looked up here.
SKIP_EXTENSIONS = {
    ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico",
    ".pdf", ".zip", ".tar", ".gz", ".exe", ".bin",
    ".pyc", ".pyo", ".so", ".dll", ".class",
}


def collect_files(path: str) -> list[Path]:
    """Return all scannable files from a file path or folder.

    Args:
        path: A file or directory path; it is resolved to an absolute path.

    Returns:
        ``[path]`` when it names a regular file (an explicitly named file is
        returned even if its extension is in SKIP_EXTENSIONS). Otherwise
        every regular file found recursively below it whose extension is
        not in SKIP_EXTENSIONS.
    """
    target = Path(path).resolve()
    if target.is_file():
        return [target]
    return [
        f for f in target.rglob("*")
        # Compare case-insensitively so "IMAGE.PNG" is skipped like
        # "image.png", matching how scan_file lowercases suffixes.
        if f.is_file() and f.suffix.lower() not in SKIP_EXTENSIONS
    ]
31
+
32
+
33
# Suffixes (lowercase) that additionally go through the C/C++ AST engine.
CPP_EXTENSIONS = {".c", ".cpp", ".cc", ".cxx", ".h", ".hpp"}


def _run_analyzer(label: str, analyzer, code: str, file_path: Path) -> list:
    """Run one analyzer on ``code``; warn and return [] if it raises."""
    try:
        return analyzer(code)
    except Exception as exc:
        # markup=False: paths and error text may contain "[...]" sequences.
        console.print(
            f"WARNING: {label} analysis failed for {file_path}: {exc}",
            style="yellow",
            markup=False,
        )
        return []


def scan_file(file_path: Path) -> tuple:
    """Run every analyzer on one file.

    Each stage is isolated: a failure in one analyzer is reported on the
    console and yields an empty result for that stage only, so findings
    from the other stages (notably detected secrets) are never discarded.

    Args:
        file_path: File to scan; read as UTF-8 with undecodable bytes
            ignored.

    Returns:
        A ``(findings, summary, leaks, nlp_findings)`` tuple. When the file
        cannot be read, all lists are empty and every summary count is 0.
    """
    try:
        code = file_path.read_text(encoding="utf-8", errors="ignore")
    except OSError as exc:
        console.print(
            f"WARNING: could not read {file_path}: {exc}",
            style="yellow",
            markup=False,
        )
        return [], {"HIGH": 0, "MEDIUM": 0, "LOW": 0}, [], []

    findings = _run_analyzer("secret", detect_secrets, code, file_path)
    try:
        summary = summarize_findings(findings)
    except Exception as exc:
        console.print(
            f"WARNING: could not summarize findings for {file_path}: {exc}",
            style="yellow",
            markup=False,
        )
        # Fall back to counting the known levels so HIGH findings still
        # block the commit.
        summary = {"HIGH": 0, "MEDIUM": 0, "LOW": 0}
        for finding in findings:
            if finding.get("risk") in summary:
                summary[finding["risk"]] += 1

    leaks = _run_analyzer("leak", detect_leaks, code, file_path)
    # Run AST engine for C/C++ files
    if file_path.suffix.lower() in CPP_EXTENSIONS:
        leaks = leaks + _run_analyzer("C/C++ AST", analyze_cpp_ast, code, file_path)

    nlp_findings = _run_analyzer("NLP", analyze_context, code, file_path)
    return findings, summary, leaks, nlp_findings
49
+
50
+
51
def display_results(file: str, findings, summary, leaks, nlp_findings) -> bool:
    """Print findings for one file. Returns True if HIGH risk was found.

    All text is printed with rich markup disabled and colours applied via
    ``style=``. Scanned content, file paths and the bracketed engine and
    language tags can contain "[...]" sequences that rich would otherwise
    interpret as style tags (hiding them, restyling the line, or raising on
    an unmatched closing tag).

    Args:
        file: Display name of the scanned file.
        findings: Secret findings; dicts with "risk", "type", "line",
            "matched" and "explanation" keys.
        summary: Counts per risk level under "HIGH", "MEDIUM" and "LOW".
        leaks: Leak findings; dicts with "type", "line", "languages",
            "content", "explanation" and optionally "engine".
        nlp_findings: Context findings; dicts with "keyword", "line",
            "risk", "content" and "explanation" keys.

    Returns:
        True when ``summary["HIGH"]`` is greater than zero.
    """

    def emit(text: str, style=None) -> None:
        # emoji=False keeps ":name:" sequences in scanned code verbatim.
        console.print(text, style=style, markup=False, emoji=False)

    # (risk level, header style, per-finding style)
    tiers = (
        ("HIGH", "bold red", "red"),
        ("MEDIUM", "bold yellow", "yellow"),
        ("LOW", "dim yellow", "yellow"),
    )
    for level, header_style, item_style in tiers:
        if summary[level] > 0:
            emit(f">> {level} RISK in {file}", header_style)
            for f in findings:
                if f["risk"] == level:
                    emit(f" {f['type']} (line {f['line']})", item_style)
                    emit(f" Detected : {f['matched']}")
                    emit(f" Why : {f['explanation']}")

    # Leaks
    if leaks:
        emit(f">> Leak Issues in {file}", "yellow")
        for leak in leaks:
            engine = leak.get("engine")
            engine_tag = f" [{engine}]" if engine else ""
            emit(
                f" {leak['type']}{engine_tag} (line {leak['line']}) [{leak['languages']}]",
                "yellow",
            )
            emit(f" Code : {leak['content']}")
            emit(f" Why : {leak['explanation']}")

    # NLP
    if nlp_findings:
        emit(f">> NLP Findings in {file}", "bold cyan")
        for n in nlp_findings:
            emit(f" '{n['keyword']}' (line {n['line']}) - {n['risk']}", "cyan")
            emit(f" Code : {n['content']}")
            emit(f" Why : {n['explanation']}")

    # SAFE: nothing was reported by any analyzer
    nothing_counted = all(summary[level] == 0 for level, _, _ in tiers)
    if nothing_counted and not leaks and not nlp_findings:
        emit(f"SAFE: {file}", "bold green")

    return summary["HIGH"] > 0
111
+
112
+
113
def run_scan(files: list[Path]) -> bool:
    """Scan and display every file; report whether any had a HIGH risk.

    Every file is processed even after a HIGH risk has been seen, so the
    full report is always printed.
    """
    # A list (not a generator) so no file is skipped by short-circuiting.
    verdicts = [
        display_results(str(target), *scan_file(target))
        for target in files
    ]
    return True if any(verdicts) else False
@@ -0,0 +1,26 @@
1
+ import pytest
2
+ from src.ai.nlp import analyze_context
3
+
4
+
5
def test_detects_sensitive_keyword():
    """A standalone `token` word on a line is reported as keyword "token"."""
    source = 'token = get_token()'
    keywords = [entry["keyword"] for entry in analyze_context(source)]
    assert "token" in keywords
10
+
11
+
12
def test_case_insensitive():
    """An upper-case PASSWORD is reported under the lower-case keyword."""
    source = 'PASSWORD = os.environ["DB_PASS"]'
    keywords = [entry["keyword"] for entry in analyze_context(source)]
    assert "password" in keywords
16
+
17
+
18
def test_no_findings_on_clean_code():
    """Code without sensitive keywords yields an empty result list."""
    source = 'def add(a, b):\n return a + b'
    result = analyze_context(source)
    assert result == []
21
+
22
+
23
def test_returns_line_number():
    """The first finding carries the 1-based line of the keyword."""
    source = 'x = 1\nsecret = "abc"'
    first = analyze_context(source)[0]
    assert first["line"] == 2
@@ -0,0 +1,21 @@
1
+ import pytest
2
+ from src.core.leaks import detect_leaks
3
+
4
+
5
def test_detects_unclosed_file():
    """An open() without a matching close is reported as an unclosed file."""
    source = 'f = open("data.txt", "r")\ndata = f.read()'
    kinds = [entry["type"] for entry in detect_leaks(source)]
    assert "python_unclosed_file" in kinds
9
+
10
+
11
def test_no_leak_with_context_manager():
    """A file opened through a `with` block produces no leak findings."""
    source = 'with open("data.txt") as f:\n data = f.read()'
    result = detect_leaks(source)
    assert result == []
15
+
16
+
17
def test_syntax_error_handled():
    """Syntactically invalid input still returns a list instead of raising."""
    # detect_leaks is regex-based and does not raise on syntax errors
    source = "def broken(:"
    result = detect_leaks(source)
    assert isinstance(result, list)
@@ -0,0 +1,25 @@
1
+ import pytest
2
+ from src.core.secrets import detect_secrets
3
+
4
+
5
def test_detects_aws_access_key():
    """An AKIA-prefixed key literal is reported as "aws_access_key"."""
    source = 'key = "AKIAIOSFODNN7EXAMPLE"'
    kinds = [entry["type"] for entry in detect_secrets(source)]
    assert "aws_access_key" in kinds
9
+
10
+
11
def test_detects_password():
    """A hard-coded password assignment is reported as "password"."""
    source = 'password = "supersecret123"'
    kinds = [entry["type"] for entry in detect_secrets(source)]
    assert "password" in kinds
15
+
16
+
17
def test_no_false_positive():
    """Harmless code yields no secret findings."""
    source = 'x = 42\nprint("hello world")'
    result = detect_secrets(source)
    assert result == []
20
+
21
+
22
def test_returns_correct_line_number():
    """The first finding carries the 1-based line of the secret."""
    source = "x = 1\npassword = 'mypassword'"
    first = detect_secrets(source)[0]
    assert first["line"] == 2