devguard 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. devguard/INTEGRATION_SUMMARY.md +121 -0
  2. devguard/__init__.py +3 -0
  3. devguard/__main__.py +6 -0
  4. devguard/checkers/__init__.py +41 -0
  5. devguard/checkers/api_usage.py +523 -0
  6. devguard/checkers/aws_cost.py +331 -0
  7. devguard/checkers/aws_iam.py +284 -0
  8. devguard/checkers/base.py +25 -0
  9. devguard/checkers/container.py +137 -0
  10. devguard/checkers/domain.py +189 -0
  11. devguard/checkers/firecrawl.py +117 -0
  12. devguard/checkers/fly.py +225 -0
  13. devguard/checkers/github.py +210 -0
  14. devguard/checkers/npm.py +327 -0
  15. devguard/checkers/npm_security.py +244 -0
  16. devguard/checkers/redteam.py +290 -0
  17. devguard/checkers/secret.py +279 -0
  18. devguard/checkers/swarm.py +376 -0
  19. devguard/checkers/tailscale.py +143 -0
  20. devguard/checkers/tailsnitch.py +303 -0
  21. devguard/checkers/tavily.py +179 -0
  22. devguard/checkers/vercel.py +192 -0
  23. devguard/cli.py +1510 -0
  24. devguard/cli_helpers.py +189 -0
  25. devguard/config.py +249 -0
  26. devguard/core.py +293 -0
  27. devguard/dashboard.py +715 -0
  28. devguard/discovery.py +363 -0
  29. devguard/http_client.py +142 -0
  30. devguard/llm_service.py +481 -0
  31. devguard/mcp_server.py +259 -0
  32. devguard/metrics.py +144 -0
  33. devguard/models.py +208 -0
  34. devguard/reporting.py +1571 -0
  35. devguard/sarif.py +295 -0
  36. devguard/scripts/ANALYSIS_SUMMARY.md +141 -0
  37. devguard/scripts/README.md +221 -0
  38. devguard/scripts/auto_fix_recommendations.py +145 -0
  39. devguard/scripts/generate_npmignore.py +175 -0
  40. devguard/scripts/generate_security_report.py +324 -0
  41. devguard/scripts/prepublish_check.sh +29 -0
  42. devguard/scripts/redteam_npm_packages.py +1262 -0
  43. devguard/scripts/review_all_repos.py +300 -0
  44. devguard/spec.py +617 -0
  45. devguard/sweeps/__init__.py +23 -0
  46. devguard/sweeps/ai_editor_config_audit.py +697 -0
  47. devguard/sweeps/cargo_publish_audit.py +655 -0
  48. devguard/sweeps/dependency_audit.py +419 -0
  49. devguard/sweeps/gitignore_audit.py +336 -0
  50. devguard/sweeps/local_dev.py +260 -0
  51. devguard/sweeps/local_dirty_worktree_secrets.py +521 -0
  52. devguard/sweeps/project_flaudit.py +636 -0
  53. devguard/sweeps/public_github_secrets.py +680 -0
  54. devguard/sweeps/publish_audit.py +478 -0
  55. devguard/sweeps/ssh_key_audit.py +327 -0
  56. devguard/utils.py +174 -0
  57. devguard-0.2.0.dist-info/METADATA +225 -0
  58. devguard-0.2.0.dist-info/RECORD +60 -0
  59. devguard-0.2.0.dist-info/WHEEL +4 -0
  60. devguard-0.2.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,680 @@
1
+ from __future__ import annotations
2
+
3
+ import fnmatch
4
+ import json
5
+ import logging
6
+ import os
7
+ import subprocess
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
+ from dataclasses import dataclass
10
+ from datetime import UTC, datetime
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ import httpx
15
+
16
+ logging.getLogger("httpx").setLevel(logging.WARNING)
17
+
18
+
19
# Dependency lock files routinely contain integrity hashes/checksums that
# secret scanners misclassify as credentials; findings whose file basename
# is in this set are dropped by the scan helpers below.
_LOCK_FILE_BASENAMES: frozenset[str] = frozenset({
    "uv.lock", "Cargo.lock", "package-lock.json", "pnpm-lock.yaml",
    "yarn.lock", "poetry.lock", "Gemfile.lock", "composer.lock",
    "Pipfile.lock", "requirements.lock",
})
24
+
25
+
26
+ def _utc_now() -> str:
27
+ return datetime.now(UTC).isoformat().replace("+00:00", "Z")
28
+
29
+
30
+ def _run(cmd: list[str], timeout_s: int) -> subprocess.CompletedProcess[str]:
31
+ return subprocess.run(
32
+ cmd,
33
+ capture_output=True,
34
+ text=True,
35
+ timeout=timeout_s,
36
+ env=os.environ.copy(),
37
+ )
38
+
39
+
40
+ def _match_any(name: str, patterns: list[str]) -> bool:
41
+ return any(fnmatch.fnmatch(name, p) for p in patterns)
42
+
43
+
44
def _list_public_repos(owner: str, include_forks: bool, timeout_s: int = 30) -> tuple[list[str], list[str]]:
    """List public repos for a GitHub owner via `gh repo list`.

    Returns (sorted unique "owner/name" strings, error messages). Forks are
    dropped unless *include_forks* is set. Any failure yields an empty repo
    list plus a diagnostic string rather than raising.
    """
    errors: list[str] = []
    cmd = [
        "gh", "repo", "list", owner,
        "--visibility", "public",
        "--limit", "1000",
        "--json", "nameWithOwner,isFork",
    ]

    try:
        res = _run(cmd, timeout_s=timeout_s)
    except Exception as e:
        return [], [f"gh repo list failed for {owner}: {e}"]

    if res.returncode != 0:
        errors.append(f"gh repo list failed for {owner}: exit={res.returncode} stderr={res.stderr.strip()[:300]}")
        return [], errors

    try:
        data = json.loads(res.stdout)
    except Exception as e:
        errors.append(f"gh repo list JSON parse failed for {owner}: {e}")
        return [], errors

    repos: list[str] = []
    for entry in data or []:
        # Malformed entries are skipped rather than failing the whole listing.
        try:
            full_name = entry.get("nameWithOwner")
            if not full_name:
                continue
            if bool(entry.get("isFork")) and not include_forks:
                continue
        except Exception:
            continue
        repos.append(full_name)

    return sorted(set(repos)), errors
88
+
89
+
90
def _list_public_repos_via_api(owner: str, include_forks: bool, token: str | None) -> tuple[list[str], list[str]]:
    """List public repos via GitHub REST API (token-only; no gh required).

    Tries /orgs/{owner}/repos first and falls back to /users/{owner}/repos,
    paginating 100 repos per request. Returns (sorted unique full names,
    errors). NOTE(review): errors accumulated from a failed first endpoint
    are still returned alongside a successful fallback result — callers
    should treat them as advisory, not fatal.
    """
    errors: list[str] = []
    headers = {"Accept": "application/vnd.github+json"}
    if token:
        headers["Authorization"] = f"Bearer {token}"

    # Try orgs endpoint first; fall back to users endpoint.
    endpoints = [
        f"https://api.github.com/orgs/{owner}/repos",
        f"https://api.github.com/users/{owner}/repos",
    ]

    repos: list[str] = []
    with httpx.Client(timeout=20.0, headers=headers) as client:
        for base_url in endpoints:
            # Discard partial results from a previously failed endpoint.
            repos.clear()
            try:
                page = 1
                while True:
                    resp = client.get(
                        base_url,
                        params={
                            "type": "public",
                            "per_page": 100,
                            "page": page,
                            "sort": "full_name",
                            "direction": "asc",
                        },
                    )
                    # If org endpoint doesn't match, it commonly returns 404.
                    if resp.status_code == 404 and "orgs/" in base_url:
                        raise RuntimeError("not an org")
                    resp.raise_for_status()
                    data = resp.json()
                    if not isinstance(data, list) or not data:
                        break
                    for r in data:
                        if not isinstance(r, dict):
                            continue
                        full = r.get("full_name")
                        if not isinstance(full, str) or not full:
                            continue
                        if r.get("fork") and not include_forks:
                            continue
                        repos.append(full)
                    # A short page (fewer than per_page items) is the last page.
                    if len(data) < 100:
                        break
                    page += 1
                # success for this endpoint
                return sorted(set(repos)), errors
            except Exception as e:
                # try next endpoint
                errors.append(f"github api list repos failed for {owner} via {base_url}: {e}")
                continue

    # Both endpoints failed.
    return [], errors
147
+
148
+
149
+ def _get_github_token() -> tuple[str | None, list[str]]:
150
+ """Best-effort token retrieval.
151
+
152
+ Priority:
153
+ 1) GITHUB_TOKEN env
154
+ 2) GH_TOKEN env
155
+ 3) `gh auth token` (requires prior gh login; non-interactive)
156
+ """
157
+ errors: list[str] = []
158
+ token = (os.getenv("GITHUB_TOKEN") or os.getenv("GH_TOKEN") or "").strip()
159
+ if token:
160
+ return token, errors
161
+
162
+ # Best effort: derive from gh if logged in.
163
+ try:
164
+ if subprocess.run(
165
+ ["gh", "auth", "status"],
166
+ capture_output=True,
167
+ text=True,
168
+ timeout=5,
169
+ env=os.environ.copy(),
170
+ ).returncode == 0:
171
+ res = subprocess.run(
172
+ ["gh", "auth", "token"],
173
+ capture_output=True,
174
+ text=True,
175
+ timeout=10,
176
+ env=os.environ.copy(),
177
+ )
178
+ t = (res.stdout or "").strip()
179
+ if t:
180
+ return t, errors
181
+ except FileNotFoundError:
182
+ # gh not installed
183
+ pass
184
+ except Exception as e:
185
+ errors.append(f"gh auth token failed: {e}")
186
+
187
+ return None, errors
188
+
189
+
190
def _github_api_get_json(url: str, token: str | None) -> tuple[Any | None, str | None]:
    """GET *url* from the GitHub API and return (parsed JSON body, error).

    On success returns (decoded JSON, None); on any failure (network error,
    non-2xx status, JSON decode) returns (None, stringified exception).
    The bearer token is attached only when provided.
    """
    headers = {"Accept": "application/vnd.github+json"}
    if token:
        headers["Authorization"] = f"Bearer {token}"
    try:
        with httpx.Client(timeout=20.0, headers=headers) as client:
            r = client.get(url)
            r.raise_for_status()
            return r.json(), None
    except Exception as e:
        return None, str(e)
201
+
202
+
203
def _expand_owners(owners: list[str], token: str | None) -> tuple[list[str], list[str]]:
    """Expand sentinel owners into real owners.

    Supported sentinels:
    - "@me": current authenticated user
    - "@orgs": all orgs for current user
    - "@all": @me + @orgs

    Non-sentinel entries pass through unchanged. An empty *owners* list
    defaults to ["@me"]. Returns (sorted unique owner logins, errors).
    """
    errs: list[str] = []

    requested = [o.strip() for o in owners if o and o.strip()]
    if not requested:
        requested = ["@me"]

    want_me = "@all" in requested or "@me" in requested
    want_orgs = "@all" in requested or "@orgs" in requested

    # Keep explicit owners too (anything not a sentinel).
    expanded: list[str] = [o for o in requested if not o.startswith("@")]

    # If a token isn't available, don't even try to resolve @me/@orgs via API.
    # This avoids noisy 401s and makes the failure mode clearer.
    if (want_me or want_orgs) and not token:
        errs.append("cannot expand @me/@orgs without a GitHub token (set GITHUB_TOKEN or GH_TOKEN)")
        return sorted(set(expanded)), errs

    # Resolve the authenticated login once; both @me and @orgs need it.
    me_login: str | None = None
    if want_me or want_orgs:
        user_obj, err = _github_api_get_json("https://api.github.com/user", token)
        if isinstance(user_obj, dict) and isinstance(user_obj.get("login"), str):
            me_login = user_obj["login"]
        else:
            if err:
                errs.append(f"failed to resolve @me via GitHub API: {err}")

    if want_me and me_login:
        expanded.append(me_login)

    if want_orgs and me_login:
        # /user/orgs returns orgs for the authenticated user.
        orgs = []
        page = 1
        with httpx.Client(
            timeout=20.0,
            headers={
                "Accept": "application/vnd.github+json",
                **({"Authorization": f"Bearer {token}"} if token else {}),
            },
        ) as client:
            while True:
                try:
                    r = client.get("https://api.github.com/user/orgs", params={"per_page": 100, "page": page})
                    r.raise_for_status()
                    data = r.json()
                    if not isinstance(data, list) or not data:
                        break
                    for o in data:
                        if isinstance(o, dict) and isinstance(o.get("login"), str):
                            orgs.append(o["login"])
                    # A short page marks the end of pagination.
                    if len(data) < 100:
                        break
                    page += 1
                except Exception as e:
                    # Partial org results gathered so far are still used below.
                    errs.append(f"failed to resolve @orgs via GitHub API: {e}")
                    break

        expanded.extend(orgs)

    # Dedup, preserve readability.
    expanded = sorted(set(expanded))
    return expanded, errs
274
+
275
+
276
+ @dataclass
277
+ class RedactedFinding:
278
+ repo: str
279
+ type: str
280
+ verified: bool | None
281
+ file: str | None
282
+ commit: str | None
283
+ line: int | None
284
+
285
+ def to_dict(self) -> dict[str, Any]:
286
+ return {
287
+ "repo": self.repo,
288
+ "type": self.type,
289
+ "verified": self.verified,
290
+ "file": self.file,
291
+ "commit": self.commit,
292
+ "line": self.line,
293
+ }
294
+
295
+
296
def _extract_finding(obj: dict[str, Any], repo: str) -> RedactedFinding | None:
    """Extract a safe/redacted finding from TruffleHog JSON.

    Only location metadata is copied out (detector name, verification flag,
    file, commit, line) — never the matched secret itself. Returns None when
    *obj* is not a dict; otherwise always returns a finding, falling back to
    detector "unknown" and None fields for anything missing.
    """
    if not isinstance(obj, dict):
        return None

    # Detector key name varies across TruffleHog versions/output layouts.
    detector = obj.get("DetectorName") or obj.get("Detector") or obj.get("DetectorType") or "unknown"
    verified = obj.get("Verified")
    if verified is not None:
        verified = bool(verified)

    file_path = None
    commit = None
    line = None

    # Common v3 layout: SourceMetadata.Data.Git
    sm = obj.get("SourceMetadata") or {}
    data = sm.get("Data") if isinstance(sm, dict) else {}
    git = data.get("Git") if isinstance(data, dict) else {}
    if isinstance(git, dict):
        file_path = git.get("file") or git.get("path")
        commit = git.get("commit")
        line_val = git.get("line")
        # Line number may arrive as int or numeric string; anything else -> None.
        if isinstance(line_val, int):
            line = line_val
        elif isinstance(line_val, str):
            try:
                line = int(line_val)
            except Exception:
                line = None

    # Fallbacks (older layouts)
    if file_path is None and isinstance(obj.get("File"), str):
        file_path = obj.get("File")
    if commit is None and isinstance(obj.get("Commit"), str):
        commit = obj.get("Commit")

    # A short (8-char) commit hash is enough to locate the finding without
    # quoting full history identifiers in the report.
    if isinstance(commit, str) and len(commit) > 8:
        commit = commit[:8]

    return RedactedFinding(
        repo=repo,
        type=str(detector),
        verified=verified,
        file=str(file_path) if file_path is not None else None,
        commit=str(commit) if commit is not None else None,
        line=line,
    )
343
+
344
+
345
def scan_public_github_repos(
    *,
    owners: list[str],
    include_repos: list[str],
    exclude_repos: list[str],
    include_forks: bool,
    max_repos: int,
    engines: list[str] | None = None,
    timeout_s: int = 900,
    max_concurrency: int = 4,
) -> tuple[dict[str, Any], list[str]]:
    """Scan public repos for the given owners and return a redacted report.

    Keyword-only args:
        owners: owner logins and/or sentinels (@me/@orgs/@all) to enumerate.
        include_repos / exclude_repos: fnmatch globs matched against "owner/name".
        include_forks: keep forked repos when True.
        max_repos: hard cap on repos scanned (falsy disables the cap).
        engines: subset of {"trufflehog", "kingfisher"}; defaults to trufflehog.
        timeout_s: per-repo scan timeout upper bound (clamped to 30..600 s).
        max_concurrency: worker threads per engine (clamped to 1..12).

    Returns (report dict, errors list). The same errors list is also embedded
    in the report under "errors". Findings are redacted: repo/type/file/commit/
    line only, never secret values, and at most 500 are included in the report.
    """
    errors: list[str] = []

    # 0) Token + owner expansion.
    repos: list[str] = []
    discovery_errors: list[str] = []
    discovery_method = "gh"

    token, token_errors = _get_github_token()
    errors.extend(token_errors)

    expanded_owners, owner_errs = _expand_owners(owners, token)
    discovery_errors.extend(owner_errs)

    # 1) Discover repos.
    for owner in expanded_owners:
        # Prefer `gh` if available because it respects local auth and avoids rate limits,
        # but fall back to token-only GitHub API when `gh` isn't usable.
        rs, es = _list_public_repos(owner, include_forks=include_forks)
        if rs:
            repos.extend(rs)
        else:
            discovery_method = "github_api"
            rs2, es2 = _list_public_repos_via_api(owner, include_forks=include_forks, token=token)
            repos.extend(rs2)
            discovery_errors.extend(es + es2)

    # Dedup, then apply include/exclude globs and the repo cap.
    repos = sorted(set(repos))
    if include_repos:
        repos = [r for r in repos if _match_any(r, include_repos)]
    if exclude_repos:
        repos = [r for r in repos if not _match_any(r, exclude_repos)]
    if max_repos and len(repos) > max_repos:
        repos = repos[:max_repos]

    # Normalize requested engine names (case-insensitive, whitespace-stripped).
    requested_engines = [e.strip().lower() for e in (engines or ["trufflehog"]) if e and e.strip()]
    # Dedup while preserving order.
    seen: set[str] = set()
    requested_engines = [e for e in requested_engines if not (e in seen or seen.add(e))]
    supported = {"trufflehog", "kingfisher"}
    unknown = [e for e in requested_engines if e not in supported]
    if unknown:
        errors.append(f"Unknown engines: {unknown}. Supported: {sorted(supported)}")
        requested_engines = [e for e in requested_engines if e in supported]
    if not requested_engines:
        requested_engines = ["trufflehog"]

    # Clamp concurrency to a safe range.
    try:
        max_concurrency = int(max_concurrency)
    except Exception:
        max_concurrency = 4
    max_concurrency = max(1, min(max_concurrency, 12))

    # 2) Run scan engines per repo.
    if not token and any(e in ("trufflehog", "kingfisher") for e in requested_engines):
        errors.append(
            "Missing GitHub token for public GitHub scans. "
            "Set GITHUB_TOKEN/GH_TOKEN or run `gh auth login` then rerun."
        )

    findings: list[RedactedFinding] = []
    engine_summaries: dict[str, Any] = {}

    # Scans only run when there is something to scan AND a token is available;
    # otherwise the report still carries discovery results and errors.
    if repos and token:
        # Avoid passing tokens on argv (shows up in process lists).
        env = os.environ.copy()
        env["GITHUB_TOKEN"] = token
        env["KF_GITHUB_TOKEN"] = token

        # Interpret `timeout_s` as a *per-repo* timeout upper bound (not total),
        # but clamp it so CI doesn't get stuck.
        per_repo_timeout = max(30, min(int(timeout_s), 600))

        def _scan_one_repo_trufflehog(repo_full: str) -> tuple[str, list[RedactedFinding], list[str]]:
            # Scan one repo with trufflehog; returns (repo, findings, errors).
            repo_errors: list[str] = []
            repo_findings: list[RedactedFinding] = []

            cmd = [
                "trufflehog",
                "github",
                "--json",
                "--no-update",
                "--results",
                "verified,unverified,unknown",
                "--filter-unverified",
                "--no-fail-on-scan-errors",
                "--repo",
                f"https://github.com/{repo_full}",
            ]
            try:
                res = subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    timeout=per_repo_timeout,
                    env=env,
                )
            except Exception as e:
                return repo_full, [], [f"trufflehog github failed for {repo_full}: {e}"]

            # Some orgs enforce SAML SSO on tokens, which can block even public repo API
            # access. For public repos, retry once without auth (best-effort).
            # NOTE(review): exit code 183 is treated as success alongside 0 —
            # presumably trufflehog's "findings detected" exit code; confirm.
            if res.returncode not in (0, 183):
                stderr = (res.stderr or "").strip()
                if "Resource protected by organization SAML" in stderr:
                    env_no_token = os.environ.copy()
                    env_no_token.pop("GITHUB_TOKEN", None)
                    retry_cmd = [
                        "trufflehog",
                        "github",
                        "--json",
                        "--no-update",
                        "--results",
                        "verified,unverified,unknown",
                        "--filter-unverified",
                        "--no-fail-on-scan-errors",
                        "--no-verification",
                        "--repo",
                        f"https://github.com/{repo_full}",
                    ]
                    try:
                        retry = subprocess.run(
                            retry_cmd,
                            capture_output=True,
                            text=True,
                            timeout=per_repo_timeout,
                            env=env_no_token,
                        )
                        if retry.returncode in (0, 183):
                            res = retry
                        else:
                            repo_errors.append(
                                f"trufflehog github scan error for {repo_full}: exit={res.returncode} "
                                f"stderr={stderr[:600]}"
                            )
                    except Exception:
                        repo_errors.append(
                            f"trufflehog github scan error for {repo_full}: exit={res.returncode} "
                            f"stderr={stderr[:600]}"
                        )
                else:
                    repo_errors.append(
                        f"trufflehog github scan error for {repo_full}: exit={res.returncode} "
                        f"stderr={stderr[:600]}"
                    )

            # Parse newline-delimited JSON findings from stdout (best effort:
            # non-JSON lines are skipped silently).
            if res.stdout:
                for line in res.stdout.splitlines():
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        obj = json.loads(line)
                    except Exception:
                        continue
                    repo_name = (
                        obj.get("SourceMetadata", {})
                        .get("Data", {})
                        .get("Git", {})
                        .get("repository", None)
                    )
                    repo_name = repo_name if isinstance(repo_name, str) and repo_name else repo_full
                    f = _extract_finding(obj, repo=repo_name)
                    if f:
                        # Skip lock file false positives (dependency hashes).
                        if f.file and Path(f.file).name in _LOCK_FILE_BASENAMES:
                            continue
                        # Encode engine in the type for now to keep output schema stable.
                        f.type = f"trufflehog:{f.type}"
                        repo_findings.append(f)

            return repo_full, repo_findings, repo_errors

        def _scan_one_repo_kingfisher(repo_full: str) -> tuple[str, list[RedactedFinding], list[str]]:
            # Scan one repo with kingfisher; returns (repo, findings, errors).
            repo_errors: list[str] = []
            repo_findings: list[RedactedFinding] = []

            # Use --git-url so we can keep our own repo enumeration/filtering.
            # Use --redact so output never includes plaintext secrets.
            cmd = [
                "kingfisher",
                "scan",
                "--git-url",
                f"https://github.com/{repo_full}.git",
                "--format",
                "jsonl",
                "--redact",
                "--no-update-check",
                "--no-validate",
            ]
            try:
                res = subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    timeout=per_repo_timeout,
                    env=env,
                )
            except Exception as e:
                return repo_full, [], [f"kingfisher failed for {repo_full}: {e}"]

            # Kingfisher writes logs + JSONL to stdout. Parse JSON objects from lines.
            summary_obj: dict[str, Any] | None = None
            for line in (res.stdout or "").splitlines():
                s = line.strip()
                if not s.startswith("{"):
                    continue
                try:
                    obj = json.loads(s)
                except Exception:
                    continue
                # The run summary object is recognized by its distinctive keys.
                if isinstance(obj, dict) and "scan_date" in obj and "findings" in obj:
                    summary_obj = obj
                    continue
                if not isinstance(obj, dict):
                    continue
                # Key names vary by kingfisher version — try all known spellings.
                rule = obj.get("rule") or obj.get("rule_id") or obj.get("id")
                path = obj.get("path") or obj.get("file") or obj.get("Path") or obj.get("File")
                # Require at least a rule or path to distinguish findings from log lines.
                if not rule and not path:
                    continue
                rule = rule or "kingfisher"
                line_val = obj.get("line") or obj.get("line_num") or obj.get("line_number")
                commit = obj.get("commit") or obj.get("Commit")
                try:
                    line_i = int(line_val) if line_val is not None else None
                except Exception:
                    line_i = None
                if isinstance(commit, str) and len(commit) > 8:
                    commit = commit[:8]
                file_str = str(path) if path is not None else None
                # Skip lock file false positives.
                if file_str and Path(file_str).name in _LOCK_FILE_BASENAMES:
                    continue
                repo_findings.append(
                    RedactedFinding(
                        repo=repo_full,
                        type=f"kingfisher:{rule}",
                        verified=None,
                        file=file_str,
                        commit=str(commit) if commit is not None else None,
                        line=line_i,
                    )
                )

            # If we couldn't parse anything, surface stderr to make it debuggable.
            if summary_obj is None and not repo_findings:
                stderr = (res.stderr or "").strip()
                repo_errors.append(
                    f"kingfisher produced no parseable JSON for {repo_full}: exit={res.returncode} stderr={stderr[:600]}"
                )

            return repo_full, repo_findings, repo_errors

        def _run_engine(
            engine: str,
            scan_one_repo,
        ) -> None:
            # Fan one engine out over all repos with a bounded thread pool,
            # then merge results into the enclosing findings/errors/summaries.
            per_engine_findings: list[RedactedFinding] = []
            per_engine_errors: list[str] = []
            with ThreadPoolExecutor(max_workers=max_concurrency) as ex:
                futures = [ex.submit(scan_one_repo, r) for r in repos]
                for fut in as_completed(futures):
                    _repo_full, repo_findings, repo_errors = fut.result()
                    if repo_errors:
                        per_engine_errors.extend(repo_errors)
                    if repo_findings:
                        per_engine_findings.extend(repo_findings)
            findings.extend(per_engine_findings)
            errors.extend(per_engine_errors)
            engine_summaries[engine] = {
                "findings_total": len(per_engine_findings),
                "errors_total": len(per_engine_errors),
            }

        if "trufflehog" in requested_engines:
            _run_engine("trufflehog", _scan_one_repo_trufflehog)
        if "kingfisher" in requested_engines:
            _run_engine("kingfisher", _scan_one_repo_kingfisher)

    # Summaries
    # NOTE: `unknown` deliberately reuses (shadows) the earlier unknown-engines
    # list, which has already been consumed above.
    verified = sum(1 for f in findings if f.verified is True)
    unverified = sum(1 for f in findings if f.verified is False)
    unknown = sum(1 for f in findings if f.verified is None)

    report: dict[str, Any] = {
        "generated_at": _utc_now(),
        "scope": {
            "owners": owners,
            "owners_expanded": expanded_owners,
            "repos_scanned": repos,
            "repos_scanned_count": len(repos),
            "max_repos": max_repos,
            "include_repos": include_repos,
            "exclude_repos": exclude_repos,
            "include_forks": include_forks,
        },
        "discovery": {
            "method": discovery_method,
            "errors": discovery_errors,
        },
        "engine": {
            "requested_engines": requested_engines,
            "max_concurrency": max_concurrency,
            # per_repo_timeout only exists when a scan actually ran; the
            # condition guards the NameError in the no-scan case.
            "per_repo_timeout_s": per_repo_timeout if repos and token else None,
            "summaries": engine_summaries,
        },
        # Redacted: no secret values/snippets included.
        "findings": [f.to_dict() for f in findings[:500]],
        "summary": {
            "findings_total": len(findings),
            "verified": verified,
            "unverified": unverified,
            "unknown": unknown,
        },
        "errors": errors,
    }

    return report, errors
676
+
677
+
678
def write_report(path: Path, report: dict[str, Any]) -> None:
    """Serialize *report* as pretty-printed JSON at *path*.

    Parent directories are created as needed. The file is written with an
    explicit UTF-8 encoding: ``Path.write_text`` otherwise uses the platform
    locale encoding, which can raise ``UnicodeEncodeError`` on non-ASCII
    report content (e.g. repo names or scanner stderr) on Windows.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(report, indent=2), encoding="utf-8")