credactor 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
credactor/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """Credactor — credential redactor for source code."""
2
+
3
+ __version__ = '2.0.0'
credactor/__main__.py ADDED
@@ -0,0 +1,6 @@
1
+ """Allow running as ``python -m credactor``."""
2
+
3
+ from .cli import main
4
+
5
+ if __name__ == '__main__':
6
+ main()
credactor/cli.py ADDED
@@ -0,0 +1,228 @@
1
+ """
2
+ CLI entry point using argparse.
3
+
4
+ Addresses: #6 (--staged), #7 (--format), #8 (--dry-run), #24 (argparse),
5
+ #33 (--fix-all), #34 (exit codes: 0=clean, 1=findings, 2=error)
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import argparse
11
+ import os
12
+ import sys
13
+ from pathlib import Path
14
+
15
+ from .config import Config, apply_config_file, load_config_file
16
+ from .redactor import fix_all, interactive_review
17
+ from .report import json_report, print_gitignore_skipped, print_report, sarif_report
18
+ from .scanner import scan_file
19
+ from .suppressions import AllowList
20
+ from .walker import scan_git_history, scan_staged_files, select_json_files, walk_and_scan
21
+
22
+
23
def build_parser() -> argparse.ArgumentParser:
    """Assemble the argument parser for the ``credactor`` command.

    Returns:
        A parser with one optional positional (``target``) and four option
        groups: mode, output, replacement and configuration.
    """
    cli = argparse.ArgumentParser(
        prog='credactor',
        description='Scan source files for hardcoded credentials and optionally redact them.',
        epilog='Exit codes: 0 = clean, 1 = unresolved findings, 2 = error',
    )
    cli.add_argument(
        'target',
        nargs='?',
        default='.',
        help='Directory or file to scan (default: current directory)',
    )

    # Mode flags: plain on/off switches, registered in the order they appear
    # in ``--help``.
    mode_switches = (
        ('--ci', 'CI mode: report only (no prompts), exit 1 on findings'),
        ('--dry-run', 'Show what would be found/replaced without modifying files'),
        ('--fix-all', 'Replace all findings without prompting'),
        ('--staged', 'Scan only git-staged files (for pre-commit hooks)'),
        ('--scan-history', 'Scan git commit history for leaked credentials'),
    )
    mode_group = cli.add_argument_group('mode')
    for switch, blurb in mode_switches:
        mode_group.add_argument(switch, action='store_true', help=blurb)

    # Output flags
    output_group = cli.add_argument_group('output')
    output_group.add_argument(
        '--format', '-f',
        dest='output_format',
        choices=['text', 'json', 'sarif'],
        default='text',
        help='Output format (default: text)',
    )
    output_group.add_argument(
        '--no-color',
        action='store_true',
        help='Disable ANSI color output',
    )

    # Replacement flags
    replacement_group = cli.add_argument_group('replacement')
    replacement_group.add_argument(
        '--replace-with',
        dest='replace_mode',
        choices=['sentinel', 'env', 'custom'],
        default='sentinel',
        help='Replacement strategy: sentinel (default), env (language-aware env var ref), custom',
    )
    replacement_group.add_argument(
        '--replacement',
        type=str,
        default='REDACTED_BY_CREDACTOR',
        help='Custom replacement string (used with --replace-with=sentinel or custom)',
    )
    replacement_group.add_argument(
        '--no-backup',
        action='store_true',
        help='Skip creating .bak backup files before modifying',
    )

    # Configuration
    configuration_group = cli.add_argument_group('configuration')
    configuration_group.add_argument(
        '--config',
        type=str,
        default=None,
        help='Path to .credactor.toml config file',
    )
    configuration_group.add_argument(
        '--scan-json',
        action='store_true',
        help='Include .json files in the scan',
    )

    return cli
98
+
99
+
100
def main(argv: list[str] | None = None) -> None:
    """Run the credactor command-line interface.

    Args:
        argv: Arguments to parse. ``None`` lets argparse read ``sys.argv``.

    Raises:
        SystemExit: Always; every path ends in ``sys.exit``. Code 0 means no
            findings (or all findings resolved), 1 means findings remain or
            the ``--fix-all`` confirmation was declined, 2 means the target
            is missing or is a protected system directory.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    # Build Config
    config = Config(
        ci_mode=args.ci,
        dry_run=args.dry_run,
        fix_all=args.fix_all,
        staged_only=args.staged,
        scan_history=args.scan_history,
        scan_json=args.scan_json,
        no_backup=args.no_backup,
        no_color=args.no_color,
        replace_mode=args.replace_mode,
        custom_replacement=args.replacement,
        output_format=args.output_format,
        target=args.target,
        config_path=args.config,
    )

    target = config.target
    if not os.path.exists(target):
        print(f'Error: path not found: {target}', file=sys.stderr)
        sys.exit(2)

    # Guard against scanning system directories
    protected_dirs = {'/', '/etc', '/usr', '/var', '/boot', '/sys', '/proc',
                      '/bin', '/sbin', '/lib', '/opt', '/root',
                      'C:\\', 'C:\\Windows', 'C:\\Program Files'}
    resolved = str(Path(target).resolve())
    if resolved in protected_dirs:
        print(f'Error: refusing to scan system directory: {resolved}',
              file=sys.stderr)
        print('  Use a project directory instead.', file=sys.stderr)
        sys.exit(2)

    # Load config file (#25)
    file_data = load_config_file(target, config.config_path)
    if file_data:
        apply_config_file(config, file_data)
        # The file is merged after the CLI values, so without this an explicit
        # --replacement would be silently replaced by the file's "replacement".
        # A value different from the parser default means the user passed it.
        if args.replacement != parser.get_default('replacement'):
            config.custom_replacement = args.replacement

    # Suppressions (#3, #4)
    allowlist = AllowList(target)

    print(f'Scanning: {resolved}', file=sys.stderr)

    # --- Dispatch based on mode ---
    findings: list[dict] = []

    if config.staged_only:
        # #6 — staged files only
        findings = scan_staged_files(target, config, allowlist)
    elif config.scan_history:
        # #11 — git history
        findings = scan_git_history(target, config, allowlist)
    else:
        # Normal directory scan (#26 single walk)
        findings, gitignore_skipped, json_files = walk_and_scan(target, config, allowlist)

        # Report gitignored files
        if config.output_format == 'text':
            print_gitignore_skipped(gitignore_skipped, target, no_color=config.no_color)

        # Optionally scan JSON files
        if config.scan_json and json_files:
            non_interactive = (config.ci_mode or config.dry_run or config.fix_all
                               or config.output_format != 'text')
            # Skip interactive selection when non-interactive
            json_paths = json_files if non_interactive else select_json_files(json_files, target)
            for path in json_paths:
                findings.extend(scan_file(path, config=config, allowlist=allowlist))

    # --- Output ---
    # Machine-readable formats always emit a document, even when it is empty.
    if config.output_format == 'json':
        print(json_report(findings, target))
    elif config.output_format == 'sarif':
        print(sarif_report(findings, target))
    elif findings:
        print_report(findings, target, no_color=config.no_color)
    else:
        print('\n[OK] No hardcoded credentials detected. Safe for commits.\n')

    if not findings:
        sys.exit(0)

    # #34 — exit code semantics (consistent across all formats)
    if config.ci_mode or config.dry_run:
        sys.exit(1)

    if config.fix_all:
        # Confirmation before destructive batch operation
        affected_files = {finding['file'] for finding in findings}
        print(f'\n  --fix-all will modify {len(affected_files)} file(s) '
              f'with {len(findings)} replacement(s).')
        if not config.no_backup:
            print('  .bak backups will be created (contain original secrets).')
        else:
            print('  WARNING: --no-backup is set. No backups will be created.')
        try:
            answer = input('  Proceed? [y/N]: ').strip().lower()
        except (KeyboardInterrupt, EOFError):
            print('\n  Aborted.')
            sys.exit(1)
        if answer not in ('y', 'yes'):
            print('  Aborted.')
            sys.exit(1)

        unresolved = fix_all(findings, target, config)
        sys.exit(1 if unresolved > 0 else 0)

    # Non-text formats in non-CI mode: report and exit 1
    if config.output_format != 'text':
        sys.exit(1)

    # Interactive mode (default, text only)
    unresolved = interactive_review(findings, target, config)
    sys.exit(1 if unresolved > 0 else 0)
credactor/config.py ADDED
@@ -0,0 +1,136 @@
1
+ """
2
+ Configuration loading from ``.credactor.toml`` files.
3
+
4
+ Addresses: #25 (config file support)
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import sys
10
+ from dataclasses import dataclass, field
11
+ from pathlib import Path
12
+ from typing import Optional
13
+
14
+
15
@dataclass
class Config:
    """Settings for one credactor run.

    Field defaults live here; the CLI fills the behaviour flags and a
    ``.credactor.toml`` file may adjust the thresholds, the extra sets and
    the replacement string.
    """

    # --- Thresholds ---
    entropy_threshold: float = 3.5
    min_value_length: int = 8

    # --- Directories / files ---
    # Each instance gets its own fresh, empty set.
    skip_dirs: set[str] = field(default_factory=set)
    skip_files: set[str] = field(default_factory=set)
    extra_extensions: set[str] = field(default_factory=set)
    extra_safe_values: set[str] = field(default_factory=set)

    # --- Behaviour flags (populated by CLI) ---
    ci_mode: bool = False
    dry_run: bool = False
    fix_all: bool = False
    staged_only: bool = False
    scan_history: bool = False
    scan_json: bool = False
    no_backup: bool = False
    no_color: bool = False
    # One of: 'sentinel' | 'env' | 'custom'
    replace_mode: str = 'sentinel'
    custom_replacement: str = 'REDACTED_BY_CREDACTOR'
    # One of: 'text' | 'json' | 'sarif'
    output_format: str = 'text'
    target: str = '.'
    config_path: Optional[str] = None
43
+
44
+
45
def load_config_file(root: str, explicit_path: Optional[str] = None) -> dict:
    """Locate a ``.credactor.toml`` config file and return its parsed contents.

    Args:
        root: Directory (or file path) where the search starts.
        explicit_path: If given, only this path is tried and no directory
            search takes place.

    Returns:
        The parsed config as a dict, or ``{}`` when no config file is found.
        A missing *explicit_path* is reported on stderr before returning
        ``{}``, because the user asked for that file by name.
    """
    if explicit_path:
        explicit = Path(explicit_path)
        if explicit.is_file():
            return _parse_toml(explicit)
        print(f'Warning: config file not found: {explicit_path}', file=sys.stderr)
        return {}

    # HIGH-06: Limit traversal depth to prevent picking up config files
    # from shared parent directories (e.g. /tmp/.credactor.toml).
    # Look in root and at most four ancestors (five levels in total) —
    # enough for monorepo nesting.
    max_depth = 5
    directory = Path(root).resolve()
    for _ in range(max_depth):
        candidate = directory / '.credactor.toml'
        if candidate.is_file():
            return _parse_toml(candidate)
        if directory.parent == directory:
            # Reached the filesystem root.
            break
        directory = directory.parent

    return {}
71
+
72
+
73
def _parse_toml(path: Path) -> dict:
    """Read *path* as TOML and return the resulting dict.

    Python 3.11+ uses the standard-library ``tomllib``. Older interpreters
    try the ``tomli`` package and, when it is not importable, fall back to
    the minimal key/value parser.
    """
    if sys.version_info < (3, 11):
        try:
            import tomli
            with open(path, 'rb') as stream:
                return tomli.load(stream)
        except ImportError:
            # Fall back to very basic key=value parsing for simple configs
            return _basic_toml_parse(path)

    import tomllib
    with open(path, 'rb') as stream:
        return tomllib.load(stream)
87
+
88
+
89
def _strip_inline_comment(text: str) -> str:
    """Return *text* without a trailing ``# comment`` that sits outside quotes."""
    open_quote = ''
    for idx, ch in enumerate(text):
        if open_quote:
            if ch == open_quote:
                open_quote = ''
        elif ch in '"\'':
            open_quote = ch
        elif ch == '#':
            return text[:idx].rstrip()
    return text


def _coerce_basic_value(raw: str):
    """Convert one raw right-hand side into a list, str, bool, int or float."""
    # Lists: items are always returned as strings with quotes removed.
    if raw.startswith('[') and raw.endswith(']'):
        return [item.strip().strip('"').strip("'")
                for item in raw[1:-1].split(',') if item.strip()]

    # A fully quoted value is a string, even if it looks like a number or bool.
    if len(raw) >= 2 and raw[0] in '"\'' and raw[-1] == raw[0]:
        return raw[1:-1]

    bare = raw.strip('"').strip("'")
    lowered = bare.lower()
    if lowered in ('true', 'false'):
        return lowered == 'true'
    # int() before float() so that negative integers stay integers.
    try:
        return int(bare)
    except ValueError:
        pass
    try:
        return float(bare)
    except ValueError:
        return bare


def _basic_toml_parse(path: Path) -> dict:
    """Minimal TOML-like parser for ``key = value`` pairs (no nested tables).

    Blank lines, ``#`` comment lines and ``[table]`` header lines are
    skipped, so keys from every table end up in one flat dict. Inline
    ``# comments`` outside quotes are removed before the value is typed.

    Args:
        path: File to read as UTF-8 text.

    Returns:
        Mapping of key to parsed value. An unreadable file yields whatever
        was parsed before the error (``{}`` if nothing was).
    """
    result: dict = {}
    try:
        with open(path, encoding='utf-8') as fh:
            for line in fh:
                stripped = line.strip()
                if not stripped or stripped.startswith(('#', '[')):
                    continue
                key, sep, raw = stripped.partition('=')
                if not sep:
                    continue
                value = _strip_inline_comment(raw.strip())
                result[key.strip()] = _coerce_basic_value(value)
    except OSError:
        # Best effort: PermissionError and friends are OSError subclasses.
        pass
    return result
119
+
120
+
121
def _as_str_list(value) -> list:
    """Return *value* as a list of strings; a bare string becomes one item."""
    if isinstance(value, str):
        # set.update('vendor') would add the single characters v, e, n, ...
        return [value]
    return [str(item) for item in value]


def apply_config_file(config: Config, file_data: dict) -> None:
    """Merge values from a parsed config file into *config*, in place.

    Only these keys are read: ``entropy_threshold``, ``min_value_length``,
    ``skip_dirs``, ``skip_files``, ``extra_extensions``,
    ``extra_safe_values`` and ``replacement``. Any other key is ignored.
    Set-valued keys are added to the existing sets rather than replacing
    them, and accept either a list or a single string.

    Args:
        config: Object whose attributes are updated.
        file_data: Dict produced by the config-file parser.

    Raises:
        ValueError: If a threshold value cannot be converted to a number.
    """
    if 'entropy_threshold' in file_data:
        config.entropy_threshold = float(file_data['entropy_threshold'])
    if 'min_value_length' in file_data:
        config.min_value_length = int(file_data['min_value_length'])

    for key in ('skip_dirs', 'skip_files', 'extra_extensions'):
        if key in file_data:
            getattr(config, key).update(_as_str_list(file_data[key]))

    if 'extra_safe_values' in file_data:
        # Stored lowercased.
        config.extra_safe_values.update(
            value.lower() for value in _as_str_list(file_data['extra_safe_values'])
        )
    if 'replacement' in file_data:
        config.custom_replacement = str(file_data['replacement'])
credactor/gitignore.py ADDED
@@ -0,0 +1,73 @@
1
+ """
2
+ .gitignore pattern loading and matching.
3
+
4
+ Extracted from the original credential_redactor.py with no logic changes.
5
+ """
6
+
7
+ import fnmatch
8
+ import os
9
+ from pathlib import Path
10
+
11
+ from .patterns import SKIP_DIRS
12
+
13
+
14
def load_gitignore_patterns(root: str) -> list[tuple[str, str]]:
    """Gather ``(pattern, base_dir)`` pairs from every ``.gitignore`` under *root*.

    Directories named in ``SKIP_DIRS`` are not descended into. Blank lines,
    ``#`` comments and ``!`` negation lines are dropped. A ``.gitignore``
    that cannot be opened or read is skipped silently.
    """
    collected: list[tuple[str, str]] = []

    for current_dir, subdirs, files in os.walk(Path(root).resolve()):
        # Prune in place so os.walk does not enter the skipped directories.
        subdirs[:] = [name for name in subdirs if name not in SKIP_DIRS]
        if '.gitignore' not in files:
            continue

        ignore_file = os.path.join(current_dir, '.gitignore')
        try:
            with open(ignore_file, encoding='utf-8', errors='replace') as handle:
                for raw_line in handle:
                    entry = raw_line.strip()
                    if entry and not entry.startswith(('#', '!')):
                        collected.append((entry, current_dir))
        except (OSError, PermissionError):
            pass

    return collected
34
+
35
+
36
def _gitignore_pattern_matches(rel: Path, pattern: str) -> bool:
    """Check one ``.gitignore`` pattern against a path relative to its directory."""
    parts = rel.parts
    if not parts:
        return False
    parent_dirs = parts[:-1]

    # A trailing '/' restricts the pattern to directories.
    dir_only = pattern.endswith('/')
    body = pattern.rstrip('/')
    # A leading or interior '/' anchors the pattern to the .gitignore directory.
    anchored = '/' in body
    body = body.lstrip('/')
    if not body:
        return False

    if not anchored:
        # Bare name: may match a directory at any depth, or the file itself.
        if any(fnmatch.fnmatch(part, body) for part in parent_dirs):
            return True
        return not dir_only and fnmatch.fnmatch(rel.name, body)

    if body.startswith('**/'):
        # "**/" explicitly means "at any depth".
        sub = body[3:]
        if dir_only:
            return any(fnmatch.fnmatch(part, sub) for part in parent_dirs)
        return fnmatch.fnmatch(rel.as_posix(), sub) or fnmatch.fnmatch(rel.name, sub)

    if '/' not in body:
        # Root-anchored single name such as "/.env" or "/build/": only the
        # first path component can match. If that component is the file
        # itself, a directory-only pattern does not apply.
        return fnmatch.fnmatch(parts[0], body) and (len(parts) > 1 or not dir_only)

    if not dir_only and fnmatch.fnmatch(rel.as_posix(), body):
        return True
    # The pattern may name a directory; everything beneath it is ignored.
    return any(
        fnmatch.fnmatch('/'.join(parent_dirs[:depth]), body)
        for depth in range(1, len(parent_dirs) + 1)
    )


def matches_gitignore(filepath: str, patterns: list[tuple[str, str]]) -> bool:
    """Return True if *filepath* is covered by any collected ``.gitignore`` pattern.

    Args:
        filepath: Path of the file to test.
        patterns: ``(pattern, base_dir)`` pairs, where *base_dir* is the
            directory holding the ``.gitignore`` the pattern came from.

    Returns:
        True when at least one pattern whose *base_dir* contains the file
        matches it. Negation patterns are not supported.
    """
    file_path = Path(filepath).resolve()
    # Many patterns share a base_dir; resolve each directory only once.
    resolved_bases: dict = {}

    for pattern, base_dir in patterns:
        base_path = resolved_bases.get(base_dir)
        if base_path is None:
            base_path = resolved_bases[base_dir] = Path(base_dir).resolve()

        try:
            rel = file_path.relative_to(base_path)
        except ValueError:
            # The file is not located under this .gitignore's directory.
            continue

        if _gitignore_pattern_matches(rel, pattern):
            return True

    return False
credactor/patterns.py ADDED
@@ -0,0 +1,204 @@
1
+ """
2
+ Regex patterns, constants, and safe-value lists for credential detection.
3
+
4
+ Addresses: #17 (connection strings), #18 (PEM keys), #19 (provider prefixes),
5
+ #20 (Vault/SOPS dynamic lookups), #21 (XML attributes)
6
+ """
7
+
8
+ import re
9
+
10
+ # ---------------------------------------------------------------------------
11
+ # File types to scan
12
+ # ---------------------------------------------------------------------------
13
SCAN_EXTENSIONS = {
    '.py',
    '.js', '.ts', '.jsx', '.tsx',
    '.sh', '.bash',
    '.env', '.cfg', '.ini', '.toml', '.conf', '.properties',
    '.yaml', '.yml',
    '.rb', '.go', '.java', '.php', '.cs', '.kt',
    '.tf', '.hcl',
    '.xml',
}

# Directory names that are never descended into.
SKIP_DIRS = {
    '.git',
    '__pycache__', '.mypy_cache', '.pytest_cache',
    'node_modules',
    '.venv', 'venv', '.tox',
    'dist', 'build', '.eggs',
}

# File names that are skipped outright.
SKIP_FILES = {
    'package-lock.json',
    'yarn.lock',
    'poetry.lock',
    'pnpm-lock.yaml',
}
26
+
27
+ # ---------------------------------------------------------------------------
28
+ # Placeholder / safe values – findings with these values are suppressed
29
+ # ---------------------------------------------------------------------------
30
SAFE_VALUES = {
    # empty / literal-like values
    '', 'none', 'null', 'true', 'false',
    # generic placeholders
    'replace_me', 'changeme', 'placeholder', 'todo',
    'example', 'test', 'dummy', 'enter_here',
    # "your_..." templates
    'your_key_here', 'your_api_key', 'your_secret', 'your_token',
    'your_password', 'your_client_id', 'your_client_secret', 'your_tenant_id',
    # angle-bracket templates
    '<your_key>', '<api_key>',
    # runs of "x" (4 to 8 characters)
    'xxxx', 'xxxxx', 'xxxxxx', 'xxxxxxx', 'xxxxxxxx',
    # lowercase form of the default replacement string
    'redacted_by_credactor',
}
39
+
40
+ # ---------------------------------------------------------------------------
41
+ # Dynamic / runtime secret-retrieval patterns
42
+ # Lines containing these patterns fetch secrets at runtime — not hardcoded.
43
+ # Addresses #20: added Vault and SOPS patterns.
44
+ # ---------------------------------------------------------------------------
45
# One unanchored, case-insensitive alternation: a ``search`` succeeds if any
# of the call/marker spellings below occurs anywhere in the text. The two
# ``.*`` alternatives (SecretClient / boto3) allow arbitrary characters
# between the client name and ``.get_secret``.
DYNAMIC_LOOKUP_RE = re.compile(
    r'(?:'
    r'Variable\.get'                        # Apache Airflow Variable store
    r'|os\.getenv'                          # os.getenv('KEY')
    r'|os\.environ(?:\.get)?\s*[\[({]'      # os.environ['KEY'] / os.environ.get(
    r'|environ\.get\s*\('                   # environ.get(
    r'|getenv\s*\('                         # standalone getenv(
    r'|config\.get\s*\('                    # config.get(
    r'|settings\.get\s*\('                  # settings.get(
    r'|SecretClient.*\.get_secret'          # Azure Key Vault
    r'|boto3.*\.get_secret'                 # AWS Secrets Manager
    r'|keyring\.get_password'               # system keyring
    # #20 – Hashicorp Vault / SOPS
    r'|vault:secret/'                       # Vault secret reference
    r'|ENC\[AES256_GCM,'                    # SOPS-encrypted value
    r'|hvac\.Client'                        # Hashicorp Vault Python client
    r'|Vault\.read\s*\('                    # Vault read call
    r')',
    re.IGNORECASE,
)
65
+
66
+ # ---------------------------------------------------------------------------
67
+ # Suspicious variable name patterns (case-insensitive)
68
+ # ---------------------------------------------------------------------------
69
# Alternation of credential-like names, case-insensitive via the inline
# ``(?i)`` flag. ``[_\-]?`` lets each compound name appear with an
# underscore, a hyphen, or no separator. The single capture group holds the
# matched name.
#
# NOTE(review): the surrounding ``\b`` treats ``_`` as a word character, so a
# name embedded after an underscore (e.g. ``user_password``,
# ``AWS_SECRET_ACCESS_KEY``) does not match unless a listed alternative
# covers the whole run (as ``db_password`` does). Confirm whether the
# scanner relies on this or splits identifiers first.
CRED_VAR_PATTERNS = re.compile(
    r'(?i)\b('
    r'api[_\-]?key|apikey|api[_\-]?token|'
    r'auth[_\-]?token|access[_\-]?token|bearer[_\-]?token|'
    r'client[_\-]?secret|secret[_\-]?key|app[_\-]?secret|'
    r'private[_\-]?key|signing[_\-]?key|'
    r'password|passwd|passphrase|pwd|'
    r'access[_\-]?key|access[_\-]?id|secret[_\-]?id|'
    r'client[_\-]?id|tenant[_\-]?id|app[_\-]?id|'
    r'ssh[_\-]?key|encryption[_\-]?key|'
    r'db[_\-]?password|database[_\-]?password|'
    r'db[_\-]?pass|db[_\-]?pwd|'
    r'postgres[_\-]?password|mysql[_\-]?(?:root[_\-]?)?password|'
    r'mongo[_\-]?(?:uri|url|password)|redis[_\-]?(?:url|password)|'
    r'database[_\-]?url|db[_\-]?(?:url|uri)|db[_\-]?conn(?:ection)?(?:[_\-]?string)?|'
    r'smtp[_\-]?password|mail[_\-]?password|'
    r'webhook[_\-]?secret|bot[_\-]?token|'
    r'consumer[_\-]?key|consumer[_\-]?secret|'
    r'refresh[_\-]?token|oauth[_\-]?token'
    r')\b'
)
90
+
91
+ # ---------------------------------------------------------------------------
92
+ # High-value credential value patterns — (regex, label, min_entropy, severity)
93
+ # #19: Added GCP, Stripe, Slack, GitHub, GitLab, npm, PyPI prefixes.
94
+ # #17: Added connection string pattern.
95
+ # #18: Added PEM private key header.
96
+ # ---------------------------------------------------------------------------
97
# Three dot-separated base64url segments, the first starting with "eyJ".
_JWT_RE = re.compile(
    r'eyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}'
)
# Four-letter AWS identifier prefix followed by 16 upper-case/digit characters.
_AWS_RE = re.compile(
    r'\b(AKIA|ASIA|AROA|AIDA|ANPA|ANVA|AIPA)[A-Z0-9]{16}\b'
)
_HEX_RE = re.compile(r'\b[0-9a-fA-F]{32,64}\b')
_B64_RE = re.compile(r'[A-Za-z0-9+/=_\-]{60,}')

# #19 – Provider-specific token prefixes (deterministic, near-zero false positives)
_GCP_RE = re.compile(r'\bAIza[0-9A-Za-z_-]{35}\b')
_STRIPE_LIVE_RE = re.compile(r'\b[sr]k_live_[0-9a-zA-Z]{24,}\b')
_STRIPE_TEST_RE = re.compile(r'\b[sr]k_test_[0-9a-zA-Z]{24,}\b')
# Slack: xoxa/xoxb/xoxp/xoxs plus the xoxe, xoxo and xoxr prefixes.
_SLACK_RE = re.compile(r'\bxox[abeoprs]-[0-9A-Za-z-]{10,}\b')
# GitHub: ghp_/gho_/ghs_/ghu_/github_pat_ plus ghr_ (refresh tokens).
_GITHUB_RE = re.compile(
    r'\b(?:ghp_|gho_|ghs_|ghu_|ghr_|github_pat_)[0-9A-Za-z_]{16,}\b'
)
_GITLAB_RE = re.compile(r'\bglpat-[0-9A-Za-z_-]{20,}\b')
_NPM_RE = re.compile(r'\bnpm_[0-9a-zA-Z]{36}\b')
_PYPI_RE = re.compile(r'\bpypi-[0-9a-zA-Z_-]{16,}\b')

# #17 – Connection strings with embedded credentials (scheme://user:pass@host)
_CONN_STRING_RE = re.compile(
    r'[a-zA-Z][a-zA-Z0-9+.-]*://[^:@\s]+:[^@\s]+@[^\s"\']{3,}'
)

# #18 – PEM private key header. Also covers PKCS#8 "ENCRYPTED PRIVATE KEY"
# and armored "PGP PRIVATE KEY BLOCK" headers.
_PEM_KEY_RE = re.compile(
    r'-----BEGIN\s+(?:RSA |EC |DSA |OPENSSH |ENCRYPTED |PGP )?PRIVATE KEY(?: BLOCK)?-----'
)

# Each entry is (regex, label, min_entropy, severity).
# severity: critical > high > medium > low
VALUE_PATTERNS = [
    # Deterministic provider prefixes — critical severity
    (_AWS_RE, 'AWS access key', 3.0, 'critical'),
    (_GCP_RE, 'GCP API key', 3.0, 'critical'),
    (_STRIPE_LIVE_RE, 'Stripe live key', 3.0, 'critical'),
    (_GITHUB_RE, 'GitHub token', 3.0, 'critical'),
    (_GITLAB_RE, 'GitLab token', 3.0, 'critical'),
    (_SLACK_RE, 'Slack token', 3.0, 'critical'),
    (_NPM_RE, 'npm token', 3.0, 'critical'),
    (_PYPI_RE, 'PyPI token', 3.0, 'critical'),
    (_PEM_KEY_RE, 'private key header', 0.0, 'critical'),
    # Structural patterns — high severity
    (_JWT_RE, 'JWT token', 3.3, 'high'),
    (_CONN_STRING_RE, 'connection string', 2.5, 'high'),
    (_STRIPE_TEST_RE, 'Stripe test key', 3.0, 'medium'),
    # Heuristic patterns — medium/low severity
    (_HEX_RE, 'hex credential', 3.5, 'medium'),
    (_B64_RE, 'high-entropy string', 3.8, 'low'),
]
146
+
147
+ # ---------------------------------------------------------------------------
148
+ # Assignment detection — variable/key name on the left, value on the right
149
+ # #13: Fixed greedy capture for unquoted values.
150
+ # #21: Added XML attribute pattern.
151
+ # ---------------------------------------------------------------------------
152
+
153
+ # Standard assignment: VAR = "value" / VAR = 'value' / "key": "value"
154
+ # #13 fix: quoted values capture up to closing quote; unquoted values stop
155
+ # at whitespace or comment characters.
156
# Named groups:
#   var   – the key/variable name (word characters, '.' and '-')
#   q     – the opening quote character of a quoted value
#   val_q – the quoted value (one or more characters, none equal to ``q``)
#   val_u – the unquoted value (set only when the quoted branch did not match)
# Exactly one of val_q / val_u is set on a match. Compiled with re.VERBOSE,
# so whitespace and the ``#`` comments inside the pattern are ignored.
ASSIGNMENT_RE = re.compile(
    r'''
    ["']?                               # optional quote around key name
    (?P<var>[\w.\-]+)                   # variable or key name
    ["']?                               # optional closing quote around key name
    \s*[:=]\s*                          # assignment or dict colon
    (?:
    (?P<q>["'])                         # opening quote
    (?P<val_q>(?:(?!(?P=q)).)+)         # value: everything up to matching quote
    (?P=q)                              # closing quote
    |
    (?P<val_u>[^\s#;,\]}"']+)           # unquoted: stop at whitespace/comment/delimiters/quotes
    )
    ''',
    re.VERBOSE,
)
172
+
173
+ # #21 – XML attribute: <... key="Password" value="secret" ...>
174
+ # Supports both orderings: key/name before or after value.
175
# Tag where a key=/name= attribute comes before value=.
_XML_KEY_FIRST = re.compile(
    r'<[^>]*?\b(?:key|name)\s*=\s*["\'](?P<xml_key>[^"\']+)["\']'
    r'[^>]*?\bvalue\s*=\s*["\'](?P<xml_val>[^"\']+)["\']',
    re.IGNORECASE,
)
# Tag where value= comes before a key=/name= attribute.
_XML_VAL_FIRST = re.compile(
    r'<[^>]*?\bvalue\s*=\s*["\'](?P<xml_val>[^"\']+)["\']'
    r'[^>]*?\b(?:key|name)\s*=\s*["\'](?P<xml_key>[^"\']+)["\']',
    re.IGNORECASE,
)


def xml_attr_finditer(line: str):
    """Generate ``(xml_key, xml_val)`` pairs for XML attributes found in *line*.

    Key-first matches are produced before value-first matches, and a pair
    that has already been produced is not repeated.
    """
    emitted = set()
    for regex in (_XML_KEY_FIRST, _XML_VAL_FIRST):
        for match in regex.finditer(line):
            pair = (match.group('xml_key'), match.group('xml_val'))
            if pair in emitted:
                continue
            emitted.add(pair)
            yield pair


# Keep for backward compat in tests
XML_ATTR_RE = _XML_KEY_FIRST
200
+
201
+ # ---------------------------------------------------------------------------
202
+ # Inline suppression comment pattern (#3)
203
+ # ---------------------------------------------------------------------------
204
# Matches the marker text "credactor:" followed by optional whitespace and
# "ignore", in any letter case, anywhere in the searched text.
SUPPRESS_RE = re.compile(r'credactor:\s*ignore', re.IGNORECASE)