credactor 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- credactor/__init__.py +3 -0
- credactor/__main__.py +6 -0
- credactor/cli.py +228 -0
- credactor/config.py +136 -0
- credactor/gitignore.py +73 -0
- credactor/patterns.py +204 -0
- credactor/redactor.py +319 -0
- credactor/report.py +219 -0
- credactor/scanner.py +406 -0
- credactor/suppressions.py +90 -0
- credactor/utils.py +67 -0
- credactor/walker.py +301 -0
- credactor-2.0.0.dist-info/METADATA +125 -0
- credactor-2.0.0.dist-info/RECORD +18 -0
- credactor-2.0.0.dist-info/WHEEL +5 -0
- credactor-2.0.0.dist-info/entry_points.txt +2 -0
- credactor-2.0.0.dist-info/licenses/LICENSE +201 -0
- credactor-2.0.0.dist-info/top_level.txt +1 -0
credactor/__init__.py
ADDED
credactor/__main__.py
ADDED
credactor/cli.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CLI entry point using argparse.
|
|
3
|
+
|
|
4
|
+
Addresses: #6 (--staged), #7 (--format), #8 (--dry-run), #24 (argparse),
|
|
5
|
+
#33 (--fix-all), #34 (exit codes: 0=clean, 1=findings, 2=error)
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import argparse
|
|
11
|
+
import os
|
|
12
|
+
import sys
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
from .config import Config, apply_config_file, load_config_file
|
|
16
|
+
from .redactor import fix_all, interactive_review
|
|
17
|
+
from .report import json_report, print_gitignore_skipped, print_report, sarif_report
|
|
18
|
+
from .scanner import scan_file
|
|
19
|
+
from .suppressions import AllowList
|
|
20
|
+
from .walker import scan_git_history, scan_staged_files, select_json_files, walk_and_scan
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def build_parser() -> argparse.ArgumentParser:
    """Construct the ``credactor`` command-line parser.

    The parser accepts one optional positional ``target`` (default ``'.'``)
    and four option groups: *mode*, *output*, *replacement* and
    *configuration*.

    Returns:
        A fully configured :class:`argparse.ArgumentParser`.
    """
    cli = argparse.ArgumentParser(
        prog='credactor',
        description='Scan source files for hardcoded credentials and optionally redact them.',
        epilog='Exit codes: 0 = clean, 1 = unresolved findings, 2 = error',
    )
    cli.add_argument(
        'target',
        nargs='?',
        default='.',
        help='Directory or file to scan (default: current directory)',
    )

    # Mode flags -- every one is a plain boolean switch, so they are
    # declared as (flag, help) pairs and registered in a loop.
    mode_group = cli.add_argument_group('mode')
    mode_switches = (
        ('--ci', 'CI mode: report only (no prompts), exit 1 on findings'),
        ('--dry-run', 'Show what would be found/replaced without modifying files'),
        ('--fix-all', 'Replace all findings without prompting'),
        ('--staged', 'Scan only git-staged files (for pre-commit hooks)'),
        ('--scan-history', 'Scan git commit history for leaked credentials'),
    )
    for flag, description in mode_switches:
        mode_group.add_argument(flag, action='store_true', help=description)

    # Output flags
    output_group = cli.add_argument_group('output')
    output_group.add_argument(
        '--format', '-f',
        dest='output_format',
        choices=['text', 'json', 'sarif'],
        default='text',
        help='Output format (default: text)',
    )
    output_group.add_argument(
        '--no-color',
        action='store_true',
        help='Disable ANSI color output',
    )

    # Replacement flags
    replacement_group = cli.add_argument_group('replacement')
    replacement_group.add_argument(
        '--replace-with',
        dest='replace_mode',
        choices=['sentinel', 'env', 'custom'],
        default='sentinel',
        help='Replacement strategy: sentinel (default), env (language-aware env var ref), custom',
    )
    replacement_group.add_argument(
        '--replacement',
        type=str,
        default='REDACTED_BY_CREDACTOR',
        help='Custom replacement string (used with --replace-with=sentinel or custom)',
    )
    replacement_group.add_argument(
        '--no-backup',
        action='store_true',
        help='Skip creating .bak backup files before modifying',
    )

    # Configuration
    configuration_group = cli.add_argument_group('configuration')
    configuration_group.add_argument(
        '--config',
        type=str,
        default=None,
        help='Path to .credactor.toml config file',
    )
    configuration_group.add_argument(
        '--scan-json',
        action='store_true',
        help='Include .json files in the scan',
    )

    return cli
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def main(argv: list[str] | None = None) -> None:
    """Run the credactor command-line interface.

    Parses *argv* (``sys.argv[1:]`` when ``None``), builds the runtime
    :class:`Config`, merges an optional ``.credactor.toml`` file, scans the
    target in the selected mode and reports or fixes the findings.

    This function never returns normally; it always terminates through
    ``sys.exit`` with one of the documented exit codes:

    * ``0`` -- no findings, or every finding was resolved
    * ``1`` -- unresolved findings remain (or the user aborted ``--fix-all``)
    * ``2`` -- usage or environment error (missing path, protected
      directory, missing or unreadable/invalid config file)
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    # Build Config
    config = Config(
        ci_mode=args.ci,
        dry_run=args.dry_run,
        fix_all=args.fix_all,
        staged_only=args.staged,
        scan_history=args.scan_history,
        scan_json=args.scan_json,
        no_backup=args.no_backup,
        no_color=args.no_color,
        replace_mode=args.replace_mode,
        custom_replacement=args.replacement,
        output_format=args.output_format,
        target=args.target,
        config_path=args.config,
    )

    target = config.target
    if not os.path.exists(target):
        print(f'Error: path not found: {target}', file=sys.stderr)
        sys.exit(2)

    # Guard against scanning system directories
    _PROTECTED_DIRS = {'/', '/etc', '/usr', '/var', '/boot', '/sys', '/proc',
                       '/bin', '/sbin', '/lib', '/opt', '/root',
                       'C:\\', 'C:\\Windows', 'C:\\Program Files'}
    resolved = str(Path(target).resolve())
    if resolved in _PROTECTED_DIRS:
        print(f'Error: refusing to scan system directory: {resolved}',
              file=sys.stderr)
        print(' Use a project directory instead.', file=sys.stderr)
        sys.exit(2)

    # Load config file (#25)
    # An explicitly requested config file that does not exist is a user
    # error; silently ignoring it would run the scan with the wrong settings.
    if config.config_path and not os.path.isfile(config.config_path):
        print(f'Error: config file not found: {config.config_path}',
              file=sys.stderr)
        sys.exit(2)
    try:
        file_data = load_config_file(target, config.config_path)
        if file_data:
            apply_config_file(config, file_data)
    except (OSError, ValueError, TypeError, AttributeError) as exc:
        # TOML decode errors are ValueError subclasses; badly typed values
        # surface as ValueError/TypeError/AttributeError while being applied.
        # Without this handler the traceback would exit with status 1, which
        # callers interpret as "findings present" rather than "error".
        print(f'Error: could not load config file: {exc}', file=sys.stderr)
        sys.exit(2)

    # Suppressions (#3, #4)
    allowlist = AllowList(target)

    print(f'Scanning: {Path(target).resolve()}', file=sys.stderr)

    # --- Dispatch based on mode ---
    findings: list[dict] = []

    if config.staged_only:
        # #6 — staged files only
        findings = scan_staged_files(target, config, allowlist)
    elif config.scan_history:
        # #11 — git history
        findings = scan_git_history(target, config, allowlist)
    else:
        # Normal directory scan (#26 single walk)
        dir_findings, gitignore_skipped, json_files = walk_and_scan(target, config, allowlist)
        findings = dir_findings

        # Report gitignored files
        if config.output_format == 'text':
            print_gitignore_skipped(gitignore_skipped, target, no_color=config.no_color)

        # Optionally scan JSON files
        if config.scan_json and json_files:
            # Skip interactive selection when non-interactive
            if (config.ci_mode or config.dry_run or config.fix_all
                    or config.output_format != 'text'):
                json_paths = json_files
            else:
                json_paths = select_json_files(json_files, target)

            for path in json_paths:
                findings.extend(scan_file(path, config=config, allowlist=allowlist))

    # --- Output ---
    if not findings:
        # Machine-readable formats still emit a (empty) report document.
        if config.output_format == 'json':
            print(json_report(findings, target))
        elif config.output_format == 'sarif':
            print(sarif_report(findings, target))
        else:
            print('\n[OK] No hardcoded credentials detected. Safe for commits.\n')
        sys.exit(0)

    # We have findings — report them
    if config.output_format == 'json':
        print(json_report(findings, target))
    elif config.output_format == 'sarif':
        print(sarif_report(findings, target))
    else:
        print_report(findings, target, no_color=config.no_color)

    # #34 — exit code semantics (consistent across all formats)
    if config.ci_mode or config.dry_run:
        sys.exit(1)

    if config.fix_all:
        # Confirmation before destructive batch operation
        by_file: dict[str, list] = {}
        for f in findings:
            by_file.setdefault(f['file'], []).append(f)
        print(f'\n --fix-all will modify {len(by_file)} file(s) '
              f'with {len(findings)} replacement(s).')
        if not config.no_backup:
            print(' .bak backups will be created (contain original secrets).')
        else:
            print(' WARNING: --no-backup is set. No backups will be created.')
        try:
            answer = input(' Proceed? [y/N]: ').strip().lower()
        except (KeyboardInterrupt, EOFError):
            print('\n Aborted.')
            sys.exit(1)
        if answer not in ('y', 'yes'):
            print(' Aborted.')
            sys.exit(1)

        unresolved = fix_all(findings, target, config)
        sys.exit(1 if unresolved > 0 else 0)

    # Non-text formats in non-CI mode: report and exit 1
    if config.output_format != 'text':
        sys.exit(1)

    # Interactive mode (default, text only)
    unresolved = interactive_review(findings, target, config)
    sys.exit(1 if unresolved > 0 else 0)
|
credactor/config.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Configuration loading from ``.credactor.toml`` files.
|
|
3
|
+
|
|
4
|
+
Addresses: #25 (config file support)
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import sys
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Optional
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class Config:
    """Settings for one credactor run.

    Every field has a default; the CLI overrides the behaviour flags and a
    ``.credactor.toml`` file may override thresholds, skip lists and the
    replacement string.
    """

    # --- Detection thresholds -------------------------------------------
    entropy_threshold: float = 3.5
    min_value_length: int = 8

    # --- Extra skip lists / extensions / safe values ----------------------
    # Each instance gets its own fresh, empty set.
    skip_dirs: set[str] = field(default_factory=set)
    skip_files: set[str] = field(default_factory=set)
    extra_extensions: set[str] = field(default_factory=set)
    extra_safe_values: set[str] = field(default_factory=set)

    # --- Behaviour flags (filled in from the command line) ----------------
    ci_mode: bool = False
    dry_run: bool = False
    fix_all: bool = False
    staged_only: bool = False
    scan_history: bool = False
    scan_json: bool = False
    no_backup: bool = False
    no_color: bool = False
    # One of: 'sentinel', 'env', 'custom'
    replace_mode: str = 'sentinel'
    custom_replacement: str = 'REDACTED_BY_CREDACTOR'
    # One of: 'text', 'json', 'sarif'
    output_format: str = 'text'
    target: str = '.'
    config_path: Optional[str] = None
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def load_config_file(root: str, explicit_path: Optional[str] = None) -> dict:
    """Locate a ``.credactor.toml`` file and return its parsed contents.

    When *explicit_path* is given, only that path is considered. Otherwise
    the resolved *root* and its parent directories are searched nearest
    first, looking at no more than five directories in total.

    Returns:
        The parsed config as a dict, or ``{}`` when no candidate is an
        existing file.
    """
    if explicit_path:
        search_order = [Path(explicit_path)]
    else:
        # HIGH-06: Limit traversal depth to prevent picking up config files
        # from shared parent directories (e.g. /tmp/.credactor.toml).
        # Walk up at most 5 levels — enough for monorepo nesting.
        max_depth = 5
        start = Path(root).resolve()
        search_order = [
            directory / '.credactor.toml'
            for directory in (start, *start.parents)[:max_depth]
        ]

    found = next((path for path in search_order if path.is_file()), None)
    return {} if found is None else _parse_toml(found)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _parse_toml(path: Path) -> dict:
|
|
74
|
+
"""Parse a TOML file. Uses tomllib (3.11+) or tomli as fallback."""
|
|
75
|
+
if sys.version_info >= (3, 11):
|
|
76
|
+
import tomllib
|
|
77
|
+
with open(path, 'rb') as fh:
|
|
78
|
+
return tomllib.load(fh)
|
|
79
|
+
else:
|
|
80
|
+
try:
|
|
81
|
+
import tomli
|
|
82
|
+
with open(path, 'rb') as fh:
|
|
83
|
+
return tomli.load(fh)
|
|
84
|
+
except ImportError:
|
|
85
|
+
# Fall back to very basic key=value parsing for simple configs
|
|
86
|
+
return _basic_toml_parse(path)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _basic_toml_parse(path: Path) -> dict:
|
|
90
|
+
"""Minimal TOML-like parser for key = value pairs (no nested tables)."""
|
|
91
|
+
result: dict = {}
|
|
92
|
+
try:
|
|
93
|
+
with open(path, encoding='utf-8') as fh:
|
|
94
|
+
for line in fh:
|
|
95
|
+
stripped = line.strip()
|
|
96
|
+
if not stripped or stripped.startswith('#') or stripped.startswith('['):
|
|
97
|
+
continue
|
|
98
|
+
if '=' not in stripped:
|
|
99
|
+
continue
|
|
100
|
+
key, _, val = stripped.partition('=')
|
|
101
|
+
key = key.strip()
|
|
102
|
+
val = val.strip().strip('"').strip("'")
|
|
103
|
+
# Try to parse as list
|
|
104
|
+
if val.startswith('[') and val.endswith(']'):
|
|
105
|
+
items = val[1:-1].split(',')
|
|
106
|
+
result[key] = [i.strip().strip('"').strip("'") for i in items if i.strip()]
|
|
107
|
+
elif val.lower() in ('true', 'false'):
|
|
108
|
+
result[key] = val.lower() == 'true'
|
|
109
|
+
elif val.isdigit():
|
|
110
|
+
result[key] = int(val)
|
|
111
|
+
else:
|
|
112
|
+
try:
|
|
113
|
+
result[key] = float(val)
|
|
114
|
+
except ValueError:
|
|
115
|
+
result[key] = val
|
|
116
|
+
except (OSError, PermissionError):
|
|
117
|
+
pass
|
|
118
|
+
return result
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _config_str_items(value) -> list[str]:
    """Normalise a config value to a list of strings.

    A scalar (for example ``skip_dirs = "vendor"``) becomes a one-element
    list; passing the bare string to ``set.update`` would otherwise add its
    individual characters.
    """
    if isinstance(value, (list, tuple, set, frozenset)):
        return [str(item) for item in value]
    return [str(value)]


def apply_config_file(config: Config, file_data: dict) -> None:
    """Merge values from a parsed config file into *config* in place.

    Recognised keys: ``entropy_threshold``, ``min_value_length``,
    ``skip_dirs``, ``skip_files``, ``extra_extensions``,
    ``extra_safe_values`` (stored lower-cased) and ``replacement``.
    Set-valued keys are added to the existing sets; other keys are ignored.

    Raises:
        ValueError / TypeError: if a threshold cannot be converted to a
            number.
    """
    if 'entropy_threshold' in file_data:
        config.entropy_threshold = float(file_data['entropy_threshold'])
    if 'min_value_length' in file_data:
        config.min_value_length = int(file_data['min_value_length'])

    for key in ('skip_dirs', 'skip_files', 'extra_extensions'):
        if key in file_data:
            getattr(config, key).update(_config_str_items(file_data[key]))

    if 'extra_safe_values' in file_data:
        config.extra_safe_values.update(
            item.lower() for item in _config_str_items(file_data['extra_safe_values'])
        )
    if 'replacement' in file_data:
        config.custom_replacement = str(file_data['replacement'])
|
credactor/gitignore.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""
|
|
2
|
+
.gitignore pattern loading and matching.
|
|
3
|
+
|
|
4
|
+
Extracted from the original credential_redactor.py with no logic changes.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import fnmatch
|
|
8
|
+
import os
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from .patterns import SKIP_DIRS
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def load_gitignore_patterns(root: str) -> list[tuple[str, str]]:
    """Collect ``(pattern, base_dir)`` pairs from every ``.gitignore`` under *root*.

    Directories listed in ``SKIP_DIRS`` are not descended into. Blank lines,
    ``#`` comments and ``!`` negation lines are dropped. A ``.gitignore``
    that cannot be read is skipped silently.
    """
    collected: list[tuple[str, str]] = []

    for current_dir, subdirs, files in os.walk(Path(root).resolve()):
        # Prune in place so os.walk never enters the skipped directories.
        subdirs[:] = [name for name in subdirs if name not in SKIP_DIRS]

        if '.gitignore' not in files:
            continue

        ignore_file = os.path.join(current_dir, '.gitignore')
        try:
            with open(ignore_file, encoding='utf-8', errors='replace') as handle:
                for raw_line in handle:
                    entry = raw_line.strip()
                    if entry and not entry.startswith(('#', '!')):
                        collected.append((entry, current_dir))
        except (OSError, PermissionError):
            pass

    return collected
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _dir_pattern_matches(parent_parts: tuple, dir_pattern: str) -> bool:
    """Match a directory-only pattern (trailing ``/`` already removed).

    *parent_parts* are the directory components of the file, relative to the
    directory that holds the ``.gitignore``.

    * ``build``        -- matches a directory of that name at any depth
    * ``/build``       -- matches only directly below the base directory
    * ``docs/_build``  -- anchored, compared segment by segment from the base
    * ``**/a/b``       -- the segment sequence may start at any depth
    """
    floating = dir_pattern.startswith('**/')
    if floating:
        dir_pattern = dir_pattern[3:]

    if '/' not in dir_pattern:
        return any(fnmatch.fnmatch(part, dir_pattern) for part in parent_parts)

    segments = dir_pattern.strip('/').split('/')
    last_start = len(parent_parts) - len(segments)
    if last_start < 0:
        return False
    starts = range(last_start + 1) if floating else (0,)
    return any(
        all(
            fnmatch.fnmatch(part, segment)
            for part, segment in zip(parent_parts[start:], segments)
        )
        for start in starts
    )


def matches_gitignore(filepath: str, patterns: list[tuple[str, str]]) -> bool:
    """Return True if *filepath* is covered by any collected ``.gitignore`` pattern.

    Args:
        filepath: Path of the file to test.
        patterns: ``(pattern, base_dir)`` pairs, where *base_dir* is the
            directory containing the ``.gitignore`` the pattern came from.
            A pattern is only applied to files located below its base.

    Matching is an ``fnmatch``-based approximation of git's rules; note that
    ``*`` may also match ``/`` in patterns that contain a slash.
    """
    file_path = Path(filepath).resolve()
    # Many patterns share one base directory; resolve each only once.
    resolved_bases: dict[str, Path] = {}

    for pattern, base_dir in patterns:
        base_path = resolved_bases.get(base_dir)
        if base_path is None:
            base_path = resolved_bases[base_dir] = Path(base_dir).resolve()

        try:
            rel = file_path.relative_to(base_path)
        except ValueError:
            # File is not below this .gitignore's directory.
            continue

        rel_str = rel.as_posix()
        rel_parts = rel.parts

        # Pattern ending with '/' targets directories
        if pattern.endswith('/'):
            if _dir_pattern_matches(rel_parts[:-1], pattern.rstrip('/')):
                return True
            continue

        clean = pattern.lstrip('/')
        if '/' in clean:
            # Pattern with '/' is anchored to the .gitignore directory
            if clean.startswith('**/'):
                sub = clean[3:]
                if fnmatch.fnmatch(rel_str, sub) or fnmatch.fnmatch(rel.name, sub):
                    return True
            elif fnmatch.fnmatch(rel_str, clean):
                return True
        elif clean != pattern:
            # Only a leading '/': anchored to the base directory, so it can
            # match the top-level entry only (the file itself or the
            # top-level directory containing it).
            if rel_parts and fnmatch.fnmatch(rel_parts[0], clean):
                return True
        else:
            # Bare name/glob: matches the file name or any parent directory.
            if fnmatch.fnmatch(rel.name, pattern):
                return True
            if any(fnmatch.fnmatch(part, pattern) for part in rel_parts[:-1]):
                return True

    return False
|
credactor/patterns.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Regex patterns, constants, and safe-value lists for credential detection.
|
|
3
|
+
|
|
4
|
+
Addresses: #17 (connection strings), #18 (PEM keys), #19 (provider prefixes),
|
|
5
|
+
#20 (Vault/SOPS dynamic lookups), #21 (XML attributes)
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
|
|
10
|
+
# ---------------------------------------------------------------------------
|
|
11
|
+
# File types to scan
|
|
12
|
+
# ---------------------------------------------------------------------------
|
|
13
|
+
SCAN_EXTENSIONS = {
    # scripting / application code
    '.py', '.rb', '.php', '.sh', '.bash',
    '.js', '.jsx', '.ts', '.tsx',
    '.go', '.java', '.kt', '.cs',
    # configuration and infrastructure
    '.env', '.cfg', '.ini', '.conf', '.properties', '.toml',
    '.yaml', '.yml',
    '.tf', '.hcl',
    '.xml',
}

# Directory names that are never descended into.
SKIP_DIRS = {
    '.git',
    '__pycache__', '.mypy_cache', '.pytest_cache', '.tox', '.eggs',
    '.venv', 'venv', 'node_modules',
    'dist', 'build',
}

# Lock files that are skipped entirely.
SKIP_FILES = {
    'package-lock.json',
    'pnpm-lock.yaml',
    'poetry.lock',
    'yarn.lock',
}
|
|
26
|
+
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
# Placeholder / safe values – findings with these values are suppressed
|
|
29
|
+
# ---------------------------------------------------------------------------
|
|
30
|
+
SAFE_VALUES = {
    # empty / literal non-values
    '', 'none', 'null', 'true', 'false',
    # generic placeholders
    'changeme', 'dummy', 'enter_here', 'example', 'placeholder',
    'replace_me', 'test', 'todo',
    # "your_..." template values
    'your_api_key', 'your_client_id', 'your_client_secret', 'your_key_here',
    'your_password', 'your_secret', 'your_tenant_id', 'your_token',
    # angle-bracket templates
    '<api_key>', '<your_key>',
    # runs of 4 to 8 'x'
    'xxxx', 'xxxxx', 'xxxxxx', 'xxxxxxx', 'xxxxxxxx',
    # lower-cased form of the default replacement sentinel
    'redacted_by_credactor',
}
|
|
39
|
+
|
|
40
|
+
# ---------------------------------------------------------------------------
|
|
41
|
+
# Dynamic / runtime secret-retrieval patterns
|
|
42
|
+
# Lines containing these patterns fetch secrets at runtime — not hardcoded.
|
|
43
|
+
# Addresses #20: added Vault and SOPS patterns.
|
|
44
|
+
# ---------------------------------------------------------------------------
|
|
45
|
+
# Each entry is one regex alternative; they are OR-ed together below.
_DYNAMIC_LOOKUP_ALTERNATIVES = (
    r'Variable\.get',                    # Apache Airflow Variable store
    r'os\.getenv',                       # os.getenv('KEY')
    r'os\.environ(?:\.get)?\s*[\[({]',   # os.environ['KEY'] / os.environ.get(
    r'environ\.get\s*\(',                # environ.get(
    r'getenv\s*\(',                      # standalone getenv(
    r'config\.get\s*\(',                 # config.get(
    r'settings\.get\s*\(',               # settings.get(
    r'SecretClient.*\.get_secret',       # Azure Key Vault
    r'boto3.*\.get_secret',              # AWS Secrets Manager
    r'keyring\.get_password',            # system keyring
    # #20 – Hashicorp Vault / SOPS
    r'vault:secret/',                    # Vault secret reference
    r'ENC\[AES256_GCM,',                 # SOPS-encrypted value
    r'hvac\.Client',                     # Hashicorp Vault Python client
    r'Vault\.read\s*\(',                 # Vault read call
)

# Case-insensitive search pattern: one non-capturing group of all alternatives.
DYNAMIC_LOOKUP_RE = re.compile(
    '(?:' + '|'.join(_DYNAMIC_LOOKUP_ALTERNATIVES) + ')',
    re.IGNORECASE,
)
|
|
65
|
+
|
|
66
|
+
# ---------------------------------------------------------------------------
|
|
67
|
+
# Suspicious variable name patterns (case-insensitive)
|
|
68
|
+
# ---------------------------------------------------------------------------
|
|
69
|
+
# One regex alternative per suspicious name; ``[_\-]?`` allows an optional
# underscore or hyphen between the words.
_CRED_VAR_ALTERNATIVES = (
    r'api[_\-]?key', r'apikey', r'api[_\-]?token',
    r'auth[_\-]?token', r'access[_\-]?token', r'bearer[_\-]?token',
    r'client[_\-]?secret', r'secret[_\-]?key', r'app[_\-]?secret',
    r'private[_\-]?key', r'signing[_\-]?key',
    r'password', r'passwd', r'passphrase', r'pwd',
    r'access[_\-]?key', r'access[_\-]?id', r'secret[_\-]?id',
    r'client[_\-]?id', r'tenant[_\-]?id', r'app[_\-]?id',
    r'ssh[_\-]?key', r'encryption[_\-]?key',
    r'db[_\-]?password', r'database[_\-]?password',
    r'db[_\-]?pass', r'db[_\-]?pwd',
    r'postgres[_\-]?password', r'mysql[_\-]?(?:root[_\-]?)?password',
    r'mongo[_\-]?(?:uri|url|password)', r'redis[_\-]?(?:url|password)',
    r'database[_\-]?url', r'db[_\-]?(?:url|uri)',
    r'db[_\-]?conn(?:ection)?(?:[_\-]?string)?',
    r'smtp[_\-]?password', r'mail[_\-]?password',
    r'webhook[_\-]?secret', r'bot[_\-]?token',
    r'consumer[_\-]?key', r'consumer[_\-]?secret',
    r'refresh[_\-]?token', r'oauth[_\-]?token',
)

# Case-insensitive, word-bounded capture group over all alternatives.
CRED_VAR_PATTERNS = re.compile(
    r'(?i)\b(' + '|'.join(_CRED_VAR_ALTERNATIVES) + r')\b'
)
|
|
90
|
+
|
|
91
|
+
# ---------------------------------------------------------------------------
|
|
92
|
+
# High-value credential value patterns — (regex, label, min_entropy, severity)
|
|
93
|
+
# #19: Added GCP, Stripe, Slack, GitHub, GitLab, npm, PyPI prefixes.
|
|
94
|
+
# #17: Added connection string pattern.
|
|
95
|
+
# #18: Added PEM private key header.
|
|
96
|
+
# ---------------------------------------------------------------------------
|
|
97
|
+
# JWT: three base64url segments separated by dots, starting with 'eyJ'.
_JWT_RE = re.compile(
    r'eyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}'
)
# AWS key IDs: a known 4-letter prefix followed by 16 upper-case/digit chars.
_AWS_RE = re.compile(
    r'\b(AKIA|ASIA|AROA|AIDA|ANPA|ANVA|AIPA)[A-Z0-9]{16}\b'
)
_HEX_RE = re.compile(r'\b[0-9a-fA-F]{32,64}\b')
_B64_RE = re.compile(r'[A-Za-z0-9+/=_\-]{60,}')

# #19 – Provider-specific token prefixes (deterministic, near-zero false positives)
_GCP_RE = re.compile(r'\bAIza[0-9A-Za-z_-]{35}\b')
_STRIPE_LIVE_RE = re.compile(r'\b[sr]k_live_[0-9a-zA-Z]{24,}\b')
_STRIPE_TEST_RE = re.compile(r'\b[sr]k_test_[0-9a-zA-Z]{24,}\b')
_SLACK_RE = re.compile(r'\bxox[bpsa]-[0-9A-Za-z-]{10,}\b')
# GitHub prefixes: ghp_ (personal access), gho_ (OAuth), ghu_ (user-to-server),
# ghs_ (server-to-server), ghr_ (refresh) and github_pat_ (fine-grained PAT).
_GITHUB_RE = re.compile(
    r'\b(?:ghp_|gho_|ghs_|ghu_|ghr_|github_pat_)[0-9A-Za-z_]{16,}\b'
)
_GITLAB_RE = re.compile(r'\bglpat-[0-9A-Za-z_-]{20,}\b')
_NPM_RE = re.compile(r'\bnpm_[0-9a-zA-Z]{36}\b')
_PYPI_RE = re.compile(r'\bpypi-[0-9a-zA-Z_-]{16,}\b')

# #17 – Connection strings with embedded credentials (scheme://user:pass@host)
_CONN_STRING_RE = re.compile(
    r'[a-zA-Z][a-zA-Z0-9+.-]*://[^:@\s]+:[^@\s]+@[^\s"\']{3,}'
)

# #18 – PEM private key header
_PEM_KEY_RE = re.compile(r'-----BEGIN\s+(?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----')

# Each entry is (compiled regex, label, min_entropy, severity).
# severity: critical > high > medium > low
VALUE_PATTERNS = [
    # Deterministic provider prefixes — critical severity
    (_AWS_RE, 'AWS access key', 3.0, 'critical'),
    (_GCP_RE, 'GCP API key', 3.0, 'critical'),
    (_STRIPE_LIVE_RE, 'Stripe live key', 3.0, 'critical'),
    (_GITHUB_RE, 'GitHub token', 3.0, 'critical'),
    (_GITLAB_RE, 'GitLab token', 3.0, 'critical'),
    (_SLACK_RE, 'Slack token', 3.0, 'critical'),
    (_NPM_RE, 'npm token', 3.0, 'critical'),
    (_PYPI_RE, 'PyPI token', 3.0, 'critical'),
    (_PEM_KEY_RE, 'private key header', 0.0, 'critical'),
    # Structural patterns — high severity
    (_JWT_RE, 'JWT token', 3.3, 'high'),
    (_CONN_STRING_RE, 'connection string', 2.5, 'high'),
    # Test-mode provider key — medium severity
    (_STRIPE_TEST_RE, 'Stripe test key', 3.0, 'medium'),
    # Heuristic patterns — medium/low severity
    (_HEX_RE, 'hex credential', 3.5, 'medium'),
    (_B64_RE, 'high-entropy string', 3.8, 'low'),
]
|
|
146
|
+
|
|
147
|
+
# ---------------------------------------------------------------------------
|
|
148
|
+
# Assignment detection — variable/key name on the left, value on the right
|
|
149
|
+
# #13: Fixed greedy capture for unquoted values.
|
|
150
|
+
# #21: Added XML attribute pattern.
|
|
151
|
+
# ---------------------------------------------------------------------------
|
|
152
|
+
|
|
153
|
+
# Standard assignment: VAR = "value" / VAR = 'value' / "key": "value"
|
|
154
|
+
# #13 fix: quoted values capture up to closing quote; unquoted values stop
|
|
155
|
+
# at whitespace or comment characters.
|
|
156
|
+
# Matches a single ``name = value`` or ``name: value`` pair.
#
# The pattern is compiled with re.VERBOSE, so the whitespace and the
# '#' comments inside the raw string below are ignored by the regex engine.
#
# Named groups:
#   var   - the key / variable name (word characters, '.' and '-')
#   q     - the opening quote character of a quoted value
#   val_q - quoted value: one or more characters other than the quote in ``q``
#   val_u - unquoted value: a run of characters containing no whitespace and
#           none of  # ; , ] } " '
# Exactly one of val_q / val_u takes part in any given match.
ASSIGNMENT_RE = re.compile(
    r'''
["']? # optional quote around key name
(?P<var>[\w.\-]+) # variable or key name
["']? # optional closing quote around key name
\s*[:=]\s* # assignment or dict colon
(?:
(?P<q>["']) # opening quote
(?P<val_q>(?:(?!(?P=q)).)+) # value: everything up to matching quote
(?P=q) # closing quote
|
(?P<val_u>[^\s#;,\]}"']+) # unquoted: stop at whitespace/comment/delimiters/quotes
)
''',
    re.VERBOSE,
)
|
|
172
|
+
|
|
173
|
+
# #21 – XML attribute: <... key="Password" value="secret" ...>
|
|
174
|
+
# Supports both orderings: key/name before or after value.
|
|
175
|
+
# Tag in which a key=/name= attribute comes before the value= attribute.
_XML_KEY_FIRST = re.compile(
    r'<[^>]*?\b(?:key|name)\s*=\s*["\'](?P<xml_key>[^"\']+)["\']'
    r'[^>]*?\bvalue\s*=\s*["\'](?P<xml_val>[^"\']+)["\']',
    flags=re.IGNORECASE,
)
# Same tag shape with the attribute order reversed: value= first.
_XML_VAL_FIRST = re.compile(
    r'<[^>]*?\bvalue\s*=\s*["\'](?P<xml_val>[^"\']+)["\']'
    r'[^>]*?\b(?:key|name)\s*=\s*["\'](?P<xml_key>[^"\']+)["\']',
    flags=re.IGNORECASE,
)


def xml_attr_finditer(line: str):
    """Yield each distinct ``(xml_key, xml_val)`` pair found in *line*.

    Key-first matches are produced before value-first matches; a pair that
    was already yielded is not repeated.
    """
    emitted = set()
    for regex in (_XML_KEY_FIRST, _XML_VAL_FIRST):
        for match in regex.finditer(line):
            pair = (match['xml_key'], match['xml_val'])
            if pair in emitted:
                continue
            emitted.add(pair)
            yield pair


# Keep for backward compat in tests
XML_ATTR_RE = _XML_KEY_FIRST
|
|
200
|
+
|
|
201
|
+
# ---------------------------------------------------------------------------
|
|
202
|
+
# Inline suppression comment pattern (#3)
|
|
203
|
+
# ---------------------------------------------------------------------------
|
|
204
|
+
# Matches the marker "credactor: ignore" (any case, optional whitespace after the colon).
SUPPRESS_RE = re.compile(r'credactor:\s*ignore', flags=re.IGNORECASE)
|