devguard 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- devguard/INTEGRATION_SUMMARY.md +121 -0
- devguard/__init__.py +3 -0
- devguard/__main__.py +6 -0
- devguard/checkers/__init__.py +41 -0
- devguard/checkers/api_usage.py +523 -0
- devguard/checkers/aws_cost.py +331 -0
- devguard/checkers/aws_iam.py +284 -0
- devguard/checkers/base.py +25 -0
- devguard/checkers/container.py +137 -0
- devguard/checkers/domain.py +189 -0
- devguard/checkers/firecrawl.py +117 -0
- devguard/checkers/fly.py +225 -0
- devguard/checkers/github.py +210 -0
- devguard/checkers/npm.py +327 -0
- devguard/checkers/npm_security.py +244 -0
- devguard/checkers/redteam.py +290 -0
- devguard/checkers/secret.py +279 -0
- devguard/checkers/swarm.py +376 -0
- devguard/checkers/tailscale.py +143 -0
- devguard/checkers/tailsnitch.py +303 -0
- devguard/checkers/tavily.py +179 -0
- devguard/checkers/vercel.py +192 -0
- devguard/cli.py +1510 -0
- devguard/cli_helpers.py +189 -0
- devguard/config.py +249 -0
- devguard/core.py +293 -0
- devguard/dashboard.py +715 -0
- devguard/discovery.py +363 -0
- devguard/http_client.py +142 -0
- devguard/llm_service.py +481 -0
- devguard/mcp_server.py +259 -0
- devguard/metrics.py +144 -0
- devguard/models.py +208 -0
- devguard/reporting.py +1571 -0
- devguard/sarif.py +295 -0
- devguard/scripts/ANALYSIS_SUMMARY.md +141 -0
- devguard/scripts/README.md +221 -0
- devguard/scripts/auto_fix_recommendations.py +145 -0
- devguard/scripts/generate_npmignore.py +175 -0
- devguard/scripts/generate_security_report.py +324 -0
- devguard/scripts/prepublish_check.sh +29 -0
- devguard/scripts/redteam_npm_packages.py +1262 -0
- devguard/scripts/review_all_repos.py +300 -0
- devguard/spec.py +617 -0
- devguard/sweeps/__init__.py +23 -0
- devguard/sweeps/ai_editor_config_audit.py +697 -0
- devguard/sweeps/cargo_publish_audit.py +655 -0
- devguard/sweeps/dependency_audit.py +419 -0
- devguard/sweeps/gitignore_audit.py +336 -0
- devguard/sweeps/local_dev.py +260 -0
- devguard/sweeps/local_dirty_worktree_secrets.py +521 -0
- devguard/sweeps/project_flaudit.py +636 -0
- devguard/sweeps/public_github_secrets.py +680 -0
- devguard/sweeps/publish_audit.py +478 -0
- devguard/sweeps/ssh_key_audit.py +327 -0
- devguard/utils.py +174 -0
- devguard-0.2.0.dist-info/METADATA +225 -0
- devguard-0.2.0.dist-info/RECORD +60 -0
- devguard-0.2.0.dist-info/WHEEL +4 -0
- devguard-0.2.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,521 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import fnmatch
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import subprocess
|
|
7
|
+
from collections import Counter
|
|
8
|
+
from collections.abc import Iterable
|
|
9
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from datetime import UTC, datetime
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from threading import Lock
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _utc_now() -> str:
|
|
18
|
+
return datetime.now(UTC).isoformat().replace("+00:00", "Z")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _default_dev_root() -> Path:
|
|
22
|
+
return Path(os.getenv("DEV_DIR") or "~/Documents/dev").expanduser()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Files where hash-like strings routinely trigger false positives (e.g. SentryToken on uv.lock).
# Secret-scanner findings whose basename matches an entry here are dropped during
# scanning: lock files contain dependency pin hashes, not credentials.
LOCK_FILE_BASENAMES: frozenset[str] = frozenset({
    "uv.lock",
    "Cargo.lock",
    "package-lock.json",
    "pnpm-lock.yaml",
    "yarn.lock",
    "poetry.lock",
    "Gemfile.lock",
    "composer.lock",
    "Pipfile.lock",
    "requirements.lock",
})
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _iter_git_repos(root: Path, max_depth: int) -> Iterable[Path]:
|
|
41
|
+
"""Discover git repos under root, bounded by max_depth."""
|
|
42
|
+
root = root.resolve()
|
|
43
|
+
try:
|
|
44
|
+
max_depth = int(max_depth)
|
|
45
|
+
except Exception:
|
|
46
|
+
max_depth = 2
|
|
47
|
+
max_depth = max(0, min(max_depth, 6))
|
|
48
|
+
|
|
49
|
+
# BFS-ish walk with depth bound.
|
|
50
|
+
stack: list[tuple[Path, int]] = [(root, 0)]
|
|
51
|
+
seen: set[Path] = set()
|
|
52
|
+
|
|
53
|
+
junk_top = {
|
|
54
|
+
"node_modules",
|
|
55
|
+
".venv",
|
|
56
|
+
"venv",
|
|
57
|
+
"dist",
|
|
58
|
+
"build",
|
|
59
|
+
".git",
|
|
60
|
+
".cache",
|
|
61
|
+
".state",
|
|
62
|
+
"__pycache__",
|
|
63
|
+
"_trash",
|
|
64
|
+
"_scratch",
|
|
65
|
+
"_external",
|
|
66
|
+
"_archive",
|
|
67
|
+
"_forks",
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
while stack:
|
|
71
|
+
cur, depth = stack.pop()
|
|
72
|
+
if cur in seen:
|
|
73
|
+
continue
|
|
74
|
+
seen.add(cur)
|
|
75
|
+
|
|
76
|
+
# If this directory *is* a repo root, yield it and don't descend further.
|
|
77
|
+
if (cur / ".git").exists():
|
|
78
|
+
yield cur
|
|
79
|
+
continue
|
|
80
|
+
|
|
81
|
+
if depth >= max_depth:
|
|
82
|
+
continue
|
|
83
|
+
|
|
84
|
+
try:
|
|
85
|
+
for child in cur.iterdir():
|
|
86
|
+
if not child.is_dir():
|
|
87
|
+
continue
|
|
88
|
+
name = child.name
|
|
89
|
+
if depth == 0 and name in junk_top:
|
|
90
|
+
continue
|
|
91
|
+
# Skip hidden dirs by default (except `_infra` pattern is handled in other sweeps;
|
|
92
|
+
# here we only care about local worktrees).
|
|
93
|
+
if name.startswith("."):
|
|
94
|
+
continue
|
|
95
|
+
stack.append((child, depth + 1))
|
|
96
|
+
except Exception:
|
|
97
|
+
continue
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _dirty_paths(repo: Path, timeout_s: int = 8) -> tuple[list[str], str | None]:
    """Return a list of dirty file paths (relative to repo) from `git status --porcelain`.

    This includes modified, added, deleted (ignored), renamed (new path), and untracked files.
    Returns ([], None) when the repo is clean (not an error).
    """
    # Run git in the repo's own directory; -z uses NUL separators so paths with
    # spaces or newlines survive parsing. Any launch failure becomes an error string.
    try:
        res = subprocess.run(
            ["git", "status", "--porcelain=v1", "-z"],
            cwd=str(repo),
            capture_output=True,
            text=True,
            timeout=timeout_s,
            env=os.environ.copy(),
        )
    except Exception as e:
        return [], str(e)

    # Non-zero exit: surface (truncated) stderr, or a synthetic message if empty.
    if res.returncode != 0:
        return [], (res.stderr or "").strip()[:300] or f"git status exit={res.returncode}"

    out = res.stdout or ""
    if not out:
        # Empty porcelain output means the worktree is clean.
        return [], None

    paths: list[str] = []
    for entry in out.split("\0"):
        if not entry:
            continue
        # Porcelain v1 format begins with XY status and a space, then path.
        # For renames, it can be "R old -> new" (in -z form it's "R old\0new\0" in some modes),
        # but we keep this parser simple and best-effort.
        # NOTE(review): with -z, the rename *source* path arrives as a bare NUL-separated
        # field with no "XY " prefix; it falls into the `else` branch below and is kept
        # as a path too — harmless for this sweep, since nonexistent paths are filtered
        # later by callers.
        if len(entry) >= 4 and entry[2] == " ":
            p = entry[3:]
        else:
            p = entry
        # Handle the "old -> new" display form (non -z) defensively.
        if " -> " in p:
            p = p.split(" -> ", 1)[1]
        p = p.strip()
        if not p:
            continue
        paths.append(p)

    # Dedup and drop obviously non-files.
    uniq = sorted(set(paths))
    return uniq, None
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
@dataclass(frozen=True)
|
|
150
|
+
class LocalDirtyFinding:
|
|
151
|
+
repo_path: str
|
|
152
|
+
engine: str
|
|
153
|
+
type: str
|
|
154
|
+
file: str | None
|
|
155
|
+
line: int | None
|
|
156
|
+
git_tracked: bool | None = None
|
|
157
|
+
git_ignored: bool | None = None
|
|
158
|
+
exposure: str | None = None
|
|
159
|
+
|
|
160
|
+
def to_dict(self) -> dict[str, Any]:
|
|
161
|
+
return {
|
|
162
|
+
"repo_path": self.repo_path,
|
|
163
|
+
"engine": self.engine,
|
|
164
|
+
"type": self.type,
|
|
165
|
+
"file": self.file,
|
|
166
|
+
"line": self.line,
|
|
167
|
+
"git_tracked": self.git_tracked,
|
|
168
|
+
"git_ignored": self.git_ignored,
|
|
169
|
+
"exposure": self.exposure,
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _parse_trufflehog_filesystem_json(stdout: str, repo_path: str) -> list[LocalDirtyFinding]:
    """Parse trufflehog's JSONL stdout into findings; malformed lines are skipped."""
    results: list[LocalDirtyFinding] = []
    for raw in (stdout or "").splitlines():
        text = raw.strip()
        if not text:
            continue
        try:
            record = json.loads(text)
        except Exception:
            continue
        if not isinstance(record, dict):
            continue

        # Detector name field varies across trufflehog versions; try each in turn.
        detector = (
            record.get("DetectorName")
            or record.get("Detector")
            or record.get("DetectorType")
            or "unknown"
        )

        location = None
        line_number = None
        meta = record.get("SourceMetadata") or {}
        data = meta.get("Data") if isinstance(meta, dict) else {}
        fs_info = data.get("Filesystem") if isinstance(data, dict) else {}
        if isinstance(fs_info, dict):
            location = fs_info.get("file") or fs_info.get("path")
            raw_line = fs_info.get("line")
            if isinstance(raw_line, int):
                line_number = raw_line
            elif isinstance(raw_line, str):
                try:
                    line_number = int(raw_line)
                except Exception:
                    line_number = None

        results.append(
            LocalDirtyFinding(
                repo_path=repo_path,
                engine="trufflehog",
                type=str(detector),
                file=str(location) if location is not None else None,
                line=line_number,
            )
        )
    return results
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def scan_dirty_worktrees(
    *,
    dev_root: Path | None,
    max_depth: int,
    only_dirty: bool,
    exclude_repo_globs: list[str] | None = None,
    check_upstream: bool = True,
    fetch_remotes: bool = False,
    max_paths_per_repo: int = 50,
    include_ignored_files: bool = False,
    max_concurrency: int,
    timeout_s: int,
) -> tuple[dict[str, Any], list[str]]:
    """Scan dirty worktree files of git repos under ``dev_root`` for secrets.

    Discovers repos with :func:`_iter_git_repos`, collects each repo's dirty
    paths via ``git status``, and runs ``trufflehog filesystem`` over the
    surviving files concurrently. Lock-file findings (see LOCK_FILE_BASENAMES)
    are dropped; gitignored-and-untracked files are skipped unless
    ``include_ignored_files`` is True.

    Returns ``(report, errors)`` where ``report`` is a JSON-serializable dict
    (scope, engine config, per-repo metadata, findings, summary, errors) and
    ``errors`` is the same list stored under ``report["errors"]``.
    """
    errors: list[str] = []
    root = dev_root if dev_root is not None else _default_dev_root()

    # Discovery + user-supplied exclusion globs (matched against absolute repo paths).
    repos = sorted({str(p) for p in _iter_git_repos(root, max_depth=max_depth)})
    globs = [g for g in (exclude_repo_globs or []) if isinstance(g, str) and g.strip()]
    if globs:
        repos = [r for r in repos if not any(fnmatch.fnmatch(r, g) for g in globs)]
    # When only_dirty, we filter inside _scan_one (single git status call per repo)
    # rather than calling _is_repo_dirty + _dirty_paths (two calls per repo).
    dirty_repos = repos

    # Clamp concurrency to 1..12 and fall back to 4 on a non-numeric value.
    try:
        max_concurrency = int(max_concurrency)
    except Exception:
        max_concurrency = 4
    max_concurrency = max(1, min(max_concurrency, 12))

    # Per-repo trufflehog timeout clamped to 30s..600s.
    per_repo_timeout = max(30, min(int(timeout_s), 600))

    # Shared accumulators; ignored_skipped_basenames is updated from worker
    # threads, hence the lock around it.
    findings: list[LocalDirtyFinding] = []
    repo_meta: list[dict[str, Any]] = []
    ignored_paths_skipped_total = 0
    repos_with_ignored_skips = 0
    repos_with_truncated_paths = 0
    ignored_skipped_basenames: Counter[str] = Counter()
    ignored_skipped_lock = Lock()

    # Clamp per-repo path cap to 1..500, defaulting to 50 on bad input.
    try:
        max_paths_per_repo = int(max_paths_per_repo)
    except Exception:
        max_paths_per_repo = 50
    max_paths_per_repo = max(1, min(max_paths_per_repo, 500))

    def _classify(
        repo: Path, repo_path: str, abs_path: str
    ) -> tuple[bool | None, bool | None, str | None, str]:
        # Derive the repo-relative path, then ask git whether the file is
        # tracked and/or ignored. None means "could not determine".
        rel = abs_path[len(repo_path.rstrip("/") + "/") :] if abs_path.startswith(repo_path.rstrip("/") + "/") else ""
        tracked = _git_check_tracked(repo, rel) if rel else None
        ignored = _git_check_ignored(repo, rel) if rel else None
        exposure = None
        if tracked is True:
            exposure = "tracked"
        elif ignored is True:
            exposure = "untracked_ignored"
        elif ignored is False:
            exposure = "untracked_not_ignored"
        return tracked, ignored, exposure, rel

    def _git_check_ignored(repo: Path, rel_path: str) -> bool | None:
        # `git check-ignore -q` exits 0 when ignored, 1 when not ignored.
        try:
            r = subprocess.run(
                ["git", "check-ignore", "-q", rel_path],
                cwd=str(repo),
                capture_output=True,
                text=True,
                timeout=5,
                env=os.environ.copy(),
            )
            if r.returncode == 0:
                return True
            if r.returncode == 1:
                return False
            return None
        except Exception:
            return None

    def _git_check_tracked(repo: Path, rel_path: str) -> bool | None:
        # `git ls-files --error-unmatch` exits 0 when tracked, 1 when not.
        try:
            r = subprocess.run(
                ["git", "ls-files", "--error-unmatch", rel_path],
                cwd=str(repo),
                capture_output=True,
                text=True,
                timeout=5,
                env=os.environ.copy(),
            )
            if r.returncode == 0:
                return True
            if r.returncode == 1:
                return False
            return None
        except Exception:
            return None

    def _maybe_fetch(repo: Path) -> str | None:
        # Best-effort `git fetch` when enabled; returns an error string or None.
        if not fetch_remotes:
            return None
        try:
            res = subprocess.run(
                ["git", "fetch", "--prune", "--quiet"],
                cwd=str(repo),
                capture_output=True,
                text=True,
                timeout=min(30, per_repo_timeout),
                env=os.environ.copy(),
            )
            if res.returncode != 0:
                return (res.stderr or "").strip()[:300] or f"git fetch exit={res.returncode}"
            return None
        except Exception as e:
            return str(e)

    def _ahead_behind(repo: Path) -> tuple[int | None, int | None, str | None]:
        # Returns (ahead, behind, err). Values are None if no upstream.
        try:
            up = subprocess.run(
                ["git", "rev-parse", "--abbrev-ref", "--symbolic-full-name", "@{u}"],
                cwd=str(repo),
                capture_output=True,
                text=True,
                timeout=8,
                env=os.environ.copy(),
            )
            if up.returncode != 0:
                return None, None, None
            cnt = subprocess.run(
                ["git", "rev-list", "--left-right", "--count", "HEAD...@{u}"],
                cwd=str(repo),
                capture_output=True,
                text=True,
                timeout=10,
                env=os.environ.copy(),
            )
            if cnt.returncode != 0:
                return None, None, (cnt.stderr or "").strip()[:200] or f"rev-list exit={cnt.returncode}"
            # output: "<left>\t<right>" where left=behind? Actually for HEAD...@{u}, left=commits unique to HEAD, right=unique to upstream.
            parts = (cnt.stdout or "").strip().split()
            if len(parts) >= 2:
                ahead = int(parts[0])
                behind = int(parts[1])
                return ahead, behind, None
            return None, None, "unexpected rev-list output"
        except Exception as e:
            return None, None, str(e)

    def _scan_one(repo_path: str) -> tuple[str, list[LocalDirtyFinding], list[str], dict | None]:
        """Returns (repo_path, findings, errors, repo_meta_entry_or_None)."""
        repo_errors: list[str] = []
        repo_findings: list[LocalDirtyFinding] = []

        repo = Path(repo_path)
        rel_paths, err = _dirty_paths(repo)
        if err:
            return repo_path, [], [f"git status failed for {repo_path}: {err}"], None
        if not rel_paths:
            return repo_path, [], [], None

        # Cap the number of dirty paths handed to the scanner per repo.
        rel_paths_sorted = sorted(rel_paths)
        truncated = False
        if len(rel_paths_sorted) > max_paths_per_repo:
            rel_paths_sorted = rel_paths_sorted[:max_paths_per_repo]
            truncated = True

        # Keep only existing regular files; optionally drop ignored+untracked ones,
        # while recording sample basenames for the summary.
        abs_paths: list[str] = []
        skipped_ignored = 0
        skipped_ignored_sample: list[str] = []
        for rp in rel_paths_sorted:
            ap = repo / rp
            if not (ap.exists() and ap.is_file()):
                continue
            abs_p = str(ap)
            tracked, ignored, _exposure, _rel = _classify(repo, repo_path, abs_p)
            if ignored is True and tracked is False and not include_ignored_files:
                skipped_ignored += 1
                bn = Path(rp).name
                if bn and len(skipped_ignored_sample) < 5 and bn not in skipped_ignored_sample:
                    skipped_ignored_sample.append(bn)
                if bn:
                    with ignored_skipped_lock:
                        ignored_skipped_basenames[bn] += 1
                continue
            abs_paths.append(abs_p)
        if not abs_paths:
            return repo_path, [], [], None

        # Optional upstream state (fetch + ahead/behind counts).
        fetch_err = _maybe_fetch(repo) if check_upstream else None
        ahead, behind, ab_err = _ahead_behind(repo) if check_upstream else (None, None, None)
        meta_entry = {
            "repo_path": repo_path,
            "dirty_paths_count": len(rel_paths),
            "scanned_paths_count": len(abs_paths),
            "paths_truncated": truncated,
            "max_paths_per_repo": max_paths_per_repo,
            "ignored_paths_skipped": skipped_ignored,
            "ignored_paths_skipped_sample": skipped_ignored_sample,
            "ahead": ahead,
            "behind": behind,
            "upstream_checked": bool(check_upstream),
            "fetched": bool(fetch_remotes),
            "upstream_error": ab_err,
            "fetch_error": fetch_err,
        }

        cmd = [
            "trufflehog",
            "filesystem",
            "--json",
            "--no-update",
            "--no-verification",
            "--no-fail-on-scan-errors",
            f"--concurrency={max_concurrency}",
            *abs_paths,
        ]
        try:
            res = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=per_repo_timeout,
                env=os.environ.copy(),
            )
        except Exception as e:
            return repo_path, [], [f"trufflehog filesystem failed for {repo_path}: {e}"], None

        # TruffleHog may exit non-zero on some errors; we tolerate and record stderr.
        # NOTE(review): exit 183 is accepted alongside 0 — presumably trufflehog's
        # "findings present" exit code; confirm against the trufflehog CLI docs.
        if res.returncode not in (0, 183):
            stderr = (res.stderr or "").strip()
            if stderr:
                repo_errors.append(f"trufflehog filesystem error for {repo_path}: exit={res.returncode} stderr={stderr[:600]}")

        if res.stdout:
            parsed = _parse_trufflehog_filesystem_json(res.stdout, repo_path=repo_path)
            for f in parsed:
                # Skip findings in lock files -- they contain dependency hashes that
                # routinely trigger false positives (e.g. SentryToken on uv.lock).
                if f.file and Path(f.file).name in LOCK_FILE_BASENAMES:
                    continue
                if f.file:
                    tracked, ignored, exposure, _rel = _classify(repo, repo_path, f.file)
                    # Frozen dataclass: enrich the finding in place via object.__setattr__.
                    object.__setattr__(f, "git_tracked", tracked)
                    object.__setattr__(f, "git_ignored", ignored)
                    object.__setattr__(f, "exposure", exposure)
                repo_findings.append(f)

        return repo_path, repo_findings, repo_errors, meta_entry

    # Fan out one worker per repo; a crashed worker becomes an error entry,
    # never an exception out of this function.
    with ThreadPoolExecutor(max_workers=max_concurrency) as ex:
        futures = [ex.submit(_scan_one, r) for r in dirty_repos]
        for fut in as_completed(futures):
            try:
                _repo, fs, es, meta_entry = fut.result()
            except Exception as e:
                errors.append(f"dirty worktree scan worker crashed: {e}")
                continue
            if fs:
                findings.extend(fs)
            if es:
                errors.extend(es)
            if meta_entry:
                repo_meta.append(meta_entry)
                ignored_paths_skipped_total += meta_entry["ignored_paths_skipped"]
                if meta_entry["ignored_paths_skipped"] > 0:
                    repos_with_ignored_skips += 1
                if meta_entry["paths_truncated"]:
                    repos_with_truncated_paths += 1

    # Assemble the JSON-serializable report; repos/findings lists are capped at
    # 500 entries each, but summary counts reflect the full totals.
    report: dict[str, Any] = {
        "generated_at": _utc_now(),
        "scope": {
            "dev_root": str(root),
            "repos_discovered_count": len(repos),
            "repos_scanned_count": len(repo_meta),
            "only_dirty": bool(only_dirty),
            "max_depth": int(max_depth),
            "exclude_repo_globs": globs,
            "check_upstream": bool(check_upstream),
            "fetch_remotes": bool(fetch_remotes),
            "max_paths_per_repo": max_paths_per_repo,
            "include_ignored_files": bool(include_ignored_files),
        },
        "engine": {
            "name": "trufflehog",
            "mode": "filesystem",
            "max_concurrency": max_concurrency,
            "per_repo_timeout_s": per_repo_timeout,
        },
        "repos": repo_meta[:500],
        "findings": [f.to_dict() for f in findings[:500]],
        "summary": {
            "findings_total": len(findings),
            "ignored_paths_skipped_total": ignored_paths_skipped_total,
            "repos_with_ignored_paths_skipped": repos_with_ignored_skips,
            "repos_with_paths_truncated": repos_with_truncated_paths,
            "ignored_paths_skipped_top_basenames": ignored_skipped_basenames.most_common(15),
        },
        "errors": errors,
    }

    return report, errors
|
|
517
|
+
|
|
518
|
+
|
|
519
|
+
def write_report(path: Path, report: dict[str, Any]) -> None:
    """Write *report* as pretty-printed JSON to *path*, creating parent dirs.

    The file is written with an explicit UTF-8 encoding: ``Path.write_text``
    otherwise uses the host's preferred locale encoding, which can raise
    ``UnicodeEncodeError`` on non-ASCII repo paths or finding text (e.g. on
    systems with a POSIX/ASCII locale). ``ensure_ascii=False`` keeps the
    output human-readable instead of escape-encoding every non-ASCII char.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
|