devguard 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- devguard/INTEGRATION_SUMMARY.md +121 -0
- devguard/__init__.py +3 -0
- devguard/__main__.py +6 -0
- devguard/checkers/__init__.py +41 -0
- devguard/checkers/api_usage.py +523 -0
- devguard/checkers/aws_cost.py +331 -0
- devguard/checkers/aws_iam.py +284 -0
- devguard/checkers/base.py +25 -0
- devguard/checkers/container.py +137 -0
- devguard/checkers/domain.py +189 -0
- devguard/checkers/firecrawl.py +117 -0
- devguard/checkers/fly.py +225 -0
- devguard/checkers/github.py +210 -0
- devguard/checkers/npm.py +327 -0
- devguard/checkers/npm_security.py +244 -0
- devguard/checkers/redteam.py +290 -0
- devguard/checkers/secret.py +279 -0
- devguard/checkers/swarm.py +376 -0
- devguard/checkers/tailscale.py +143 -0
- devguard/checkers/tailsnitch.py +303 -0
- devguard/checkers/tavily.py +179 -0
- devguard/checkers/vercel.py +192 -0
- devguard/cli.py +1510 -0
- devguard/cli_helpers.py +189 -0
- devguard/config.py +249 -0
- devguard/core.py +293 -0
- devguard/dashboard.py +715 -0
- devguard/discovery.py +363 -0
- devguard/http_client.py +142 -0
- devguard/llm_service.py +481 -0
- devguard/mcp_server.py +259 -0
- devguard/metrics.py +144 -0
- devguard/models.py +208 -0
- devguard/reporting.py +1571 -0
- devguard/sarif.py +295 -0
- devguard/scripts/ANALYSIS_SUMMARY.md +141 -0
- devguard/scripts/README.md +221 -0
- devguard/scripts/auto_fix_recommendations.py +145 -0
- devguard/scripts/generate_npmignore.py +175 -0
- devguard/scripts/generate_security_report.py +324 -0
- devguard/scripts/prepublish_check.sh +29 -0
- devguard/scripts/redteam_npm_packages.py +1262 -0
- devguard/scripts/review_all_repos.py +300 -0
- devguard/spec.py +617 -0
- devguard/sweeps/__init__.py +23 -0
- devguard/sweeps/ai_editor_config_audit.py +697 -0
- devguard/sweeps/cargo_publish_audit.py +655 -0
- devguard/sweeps/dependency_audit.py +419 -0
- devguard/sweeps/gitignore_audit.py +336 -0
- devguard/sweeps/local_dev.py +260 -0
- devguard/sweeps/local_dirty_worktree_secrets.py +521 -0
- devguard/sweeps/project_flaudit.py +636 -0
- devguard/sweeps/public_github_secrets.py +680 -0
- devguard/sweeps/publish_audit.py +478 -0
- devguard/sweeps/ssh_key_audit.py +327 -0
- devguard/utils.py +174 -0
- devguard-0.2.0.dist-info/METADATA +225 -0
- devguard-0.2.0.dist-info/RECORD +60 -0
- devguard-0.2.0.dist-info/WHEEL +4 -0
- devguard-0.2.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,680 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import fnmatch
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
import os
|
|
7
|
+
import subprocess
|
|
8
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from datetime import UTC, datetime
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
import httpx
|
|
15
|
+
|
|
16
|
+
# httpx logs one INFO line per request; raise its level so scan output stays readable.
logging.getLogger("httpx").setLevel(logging.WARNING)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# Dependency lock files routinely contain integrity hashes that look like
# secrets to scanners; findings located in any of these basenames are dropped
# as false positives by the per-repo scan helpers.
_LOCK_FILE_BASENAMES: frozenset[str] = frozenset({
    "uv.lock", "Cargo.lock", "package-lock.json", "pnpm-lock.yaml",
    "yarn.lock", "poetry.lock", "Gemfile.lock", "composer.lock",
    "Pipfile.lock", "requirements.lock",
})
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _utc_now() -> str:
|
|
27
|
+
return datetime.now(UTC).isoformat().replace("+00:00", "Z")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _run(cmd: list[str], timeout_s: int) -> subprocess.CompletedProcess[str]:
|
|
31
|
+
return subprocess.run(
|
|
32
|
+
cmd,
|
|
33
|
+
capture_output=True,
|
|
34
|
+
text=True,
|
|
35
|
+
timeout=timeout_s,
|
|
36
|
+
env=os.environ.copy(),
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _match_any(name: str, patterns: list[str]) -> bool:
|
|
41
|
+
return any(fnmatch.fnmatch(name, p) for p in patterns)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _list_public_repos(owner: str, include_forks: bool, timeout_s: int = 30) -> tuple[list[str], list[str]]:
    """List public repos for a GitHub owner via `gh repo list`."""
    errors: list[str] = []
    cmd = [
        "gh", "repo", "list", owner,
        "--visibility", "public",
        "--limit", "1000",
        "--json", "nameWithOwner,isFork",
    ]
    try:
        res = _run(cmd, timeout_s=timeout_s)
    except Exception as e:
        # gh missing or timed out: report and let the caller fall back to the API.
        return [], [f"gh repo list failed for {owner}: {e}"]

    if res.returncode != 0:
        errors.append(f"gh repo list failed for {owner}: exit={res.returncode} stderr={res.stderr.strip()[:300]}")
        return [], errors

    try:
        data = json.loads(res.stdout)
    except Exception as e:
        errors.append(f"gh repo list JSON parse failed for {owner}: {e}")
        return [], errors

    repos: list[str] = []
    for entry in data or []:
        try:
            full_name = entry.get("nameWithOwner")
            if not full_name:
                continue
            if bool(entry.get("isFork")) and not include_forks:
                continue
            repos.append(full_name)
        except Exception:
            # Malformed entry: skip it rather than failing the whole listing.
            continue

    return sorted(set(repos)), errors
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _list_public_repos_via_api(owner: str, include_forks: bool, token: str | None) -> tuple[list[str], list[str]]:
    """List public repos via GitHub REST API (token-only; no gh required)."""
    errors: list[str] = []
    headers = {"Accept": "application/vnd.github+json"}
    if token:
        headers["Authorization"] = f"Bearer {token}"

    # Try orgs endpoint first; fall back to users endpoint.
    endpoints = [
        f"https://api.github.com/orgs/{owner}/repos",
        f"https://api.github.com/users/{owner}/repos",
    ]

    repos: list[str] = []
    with httpx.Client(timeout=20.0, headers=headers) as client:
        for base_url in endpoints:
            # Start each endpoint attempt from a clean slate.
            repos.clear()
            try:
                page = 1
                while True:
                    params = {
                        "type": "public",
                        "per_page": 100,
                        "page": page,
                        "sort": "full_name",
                        "direction": "asc",
                    }
                    resp = client.get(base_url, params=params)
                    # If org endpoint doesn't match, it commonly returns 404.
                    if resp.status_code == 404 and "orgs/" in base_url:
                        raise RuntimeError("not an org")
                    resp.raise_for_status()
                    data = resp.json()
                    if not isinstance(data, list) or not data:
                        break
                    for entry in data:
                        if not isinstance(entry, dict):
                            continue
                        full_name = entry.get("full_name")
                        if not isinstance(full_name, str) or not full_name:
                            continue
                        if entry.get("fork") and not include_forks:
                            continue
                        repos.append(full_name)
                    # A short page means we just consumed the final page.
                    if len(data) < 100:
                        break
                    page += 1
                # success for this endpoint
                return sorted(set(repos)), errors
            except Exception as e:
                # try next endpoint
                errors.append(f"github api list repos failed for {owner} via {base_url}: {e}")
                continue

    return [], errors
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _get_github_token() -> tuple[str | None, list[str]]:
|
|
150
|
+
"""Best-effort token retrieval.
|
|
151
|
+
|
|
152
|
+
Priority:
|
|
153
|
+
1) GITHUB_TOKEN env
|
|
154
|
+
2) GH_TOKEN env
|
|
155
|
+
3) `gh auth token` (requires prior gh login; non-interactive)
|
|
156
|
+
"""
|
|
157
|
+
errors: list[str] = []
|
|
158
|
+
token = (os.getenv("GITHUB_TOKEN") or os.getenv("GH_TOKEN") or "").strip()
|
|
159
|
+
if token:
|
|
160
|
+
return token, errors
|
|
161
|
+
|
|
162
|
+
# Best effort: derive from gh if logged in.
|
|
163
|
+
try:
|
|
164
|
+
if subprocess.run(
|
|
165
|
+
["gh", "auth", "status"],
|
|
166
|
+
capture_output=True,
|
|
167
|
+
text=True,
|
|
168
|
+
timeout=5,
|
|
169
|
+
env=os.environ.copy(),
|
|
170
|
+
).returncode == 0:
|
|
171
|
+
res = subprocess.run(
|
|
172
|
+
["gh", "auth", "token"],
|
|
173
|
+
capture_output=True,
|
|
174
|
+
text=True,
|
|
175
|
+
timeout=10,
|
|
176
|
+
env=os.environ.copy(),
|
|
177
|
+
)
|
|
178
|
+
t = (res.stdout or "").strip()
|
|
179
|
+
if t:
|
|
180
|
+
return t, errors
|
|
181
|
+
except FileNotFoundError:
|
|
182
|
+
# gh not installed
|
|
183
|
+
pass
|
|
184
|
+
except Exception as e:
|
|
185
|
+
errors.append(f"gh auth token failed: {e}")
|
|
186
|
+
|
|
187
|
+
return None, errors
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _github_api_get_json(url: str, token: str | None) -> tuple[Any | None, str | None]:
    """GET *url* as GitHub JSON; return (payload, None) on success or (None, error)."""
    headers = {"Accept": "application/vnd.github+json"}
    if token:
        headers["Authorization"] = f"Bearer {token}"
    try:
        with httpx.Client(timeout=20.0, headers=headers) as client:
            response = client.get(url)
            response.raise_for_status()
            return response.json(), None
    except Exception as exc:
        # Collapse transport + HTTP + decode failures into one error string.
        return None, str(exc)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _expand_owners(owners: list[str], token: str | None) -> tuple[list[str], list[str]]:
|
|
204
|
+
"""Expand sentinel owners into real owners.
|
|
205
|
+
|
|
206
|
+
Supported sentinels:
|
|
207
|
+
- "@me": current authenticated user
|
|
208
|
+
- "@orgs": all orgs for current user
|
|
209
|
+
- "@all": @me + @orgs
|
|
210
|
+
"""
|
|
211
|
+
errs: list[str] = []
|
|
212
|
+
|
|
213
|
+
requested = [o.strip() for o in owners if o and o.strip()]
|
|
214
|
+
if not requested:
|
|
215
|
+
requested = ["@me"]
|
|
216
|
+
|
|
217
|
+
want_me = "@all" in requested or "@me" in requested
|
|
218
|
+
want_orgs = "@all" in requested or "@orgs" in requested
|
|
219
|
+
|
|
220
|
+
# Keep explicit owners too (anything not a sentinel).
|
|
221
|
+
expanded: list[str] = [o for o in requested if not o.startswith("@")]
|
|
222
|
+
|
|
223
|
+
# If a token isn't available, don't even try to resolve @me/@orgs via API.
|
|
224
|
+
# This avoids noisy 401s and makes the failure mode clearer.
|
|
225
|
+
if (want_me or want_orgs) and not token:
|
|
226
|
+
errs.append("cannot expand @me/@orgs without a GitHub token (set GITHUB_TOKEN or GH_TOKEN)")
|
|
227
|
+
return sorted(set(expanded)), errs
|
|
228
|
+
|
|
229
|
+
me_login: str | None = None
|
|
230
|
+
if want_me or want_orgs:
|
|
231
|
+
user_obj, err = _github_api_get_json("https://api.github.com/user", token)
|
|
232
|
+
if isinstance(user_obj, dict) and isinstance(user_obj.get("login"), str):
|
|
233
|
+
me_login = user_obj["login"]
|
|
234
|
+
else:
|
|
235
|
+
if err:
|
|
236
|
+
errs.append(f"failed to resolve @me via GitHub API: {err}")
|
|
237
|
+
|
|
238
|
+
if want_me and me_login:
|
|
239
|
+
expanded.append(me_login)
|
|
240
|
+
|
|
241
|
+
if want_orgs and me_login:
|
|
242
|
+
# /user/orgs returns orgs for the authenticated user.
|
|
243
|
+
orgs = []
|
|
244
|
+
page = 1
|
|
245
|
+
with httpx.Client(
|
|
246
|
+
timeout=20.0,
|
|
247
|
+
headers={
|
|
248
|
+
"Accept": "application/vnd.github+json",
|
|
249
|
+
**({"Authorization": f"Bearer {token}"} if token else {}),
|
|
250
|
+
},
|
|
251
|
+
) as client:
|
|
252
|
+
while True:
|
|
253
|
+
try:
|
|
254
|
+
r = client.get("https://api.github.com/user/orgs", params={"per_page": 100, "page": page})
|
|
255
|
+
r.raise_for_status()
|
|
256
|
+
data = r.json()
|
|
257
|
+
if not isinstance(data, list) or not data:
|
|
258
|
+
break
|
|
259
|
+
for o in data:
|
|
260
|
+
if isinstance(o, dict) and isinstance(o.get("login"), str):
|
|
261
|
+
orgs.append(o["login"])
|
|
262
|
+
if len(data) < 100:
|
|
263
|
+
break
|
|
264
|
+
page += 1
|
|
265
|
+
except Exception as e:
|
|
266
|
+
errs.append(f"failed to resolve @orgs via GitHub API: {e}")
|
|
267
|
+
break
|
|
268
|
+
|
|
269
|
+
expanded.extend(orgs)
|
|
270
|
+
|
|
271
|
+
# Dedup, preserve readability.
|
|
272
|
+
expanded = sorted(set(expanded))
|
|
273
|
+
return expanded, errs
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
@dataclass
|
|
277
|
+
class RedactedFinding:
|
|
278
|
+
repo: str
|
|
279
|
+
type: str
|
|
280
|
+
verified: bool | None
|
|
281
|
+
file: str | None
|
|
282
|
+
commit: str | None
|
|
283
|
+
line: int | None
|
|
284
|
+
|
|
285
|
+
def to_dict(self) -> dict[str, Any]:
|
|
286
|
+
return {
|
|
287
|
+
"repo": self.repo,
|
|
288
|
+
"type": self.type,
|
|
289
|
+
"verified": self.verified,
|
|
290
|
+
"file": self.file,
|
|
291
|
+
"commit": self.commit,
|
|
292
|
+
"line": self.line,
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def _extract_finding(obj: dict[str, Any], repo: str) -> RedactedFinding | None:
    """Extract a safe/redacted finding from TruffleHog JSON (no secret material)."""
    if not isinstance(obj, dict):
        return None

    detector = obj.get("DetectorName") or obj.get("Detector") or obj.get("DetectorType") or "unknown"

    raw_verified = obj.get("Verified")
    verified = None if raw_verified is None else bool(raw_verified)

    file_path = None
    commit = None
    line: int | None = None

    # Common v3 layout: SourceMetadata.Data.Git
    source_meta = obj.get("SourceMetadata") or {}
    payload = source_meta.get("Data") if isinstance(source_meta, dict) else {}
    git_meta = payload.get("Git") if isinstance(payload, dict) else {}
    if isinstance(git_meta, dict):
        file_path = git_meta.get("file") or git_meta.get("path")
        commit = git_meta.get("commit")
        raw_line = git_meta.get("line")
        if isinstance(raw_line, int):
            line = raw_line
        elif isinstance(raw_line, str):
            try:
                line = int(raw_line)
            except Exception:
                line = None

    # Fallbacks (older layouts)
    if file_path is None and isinstance(obj.get("File"), str):
        file_path = obj.get("File")
    if commit is None and isinstance(obj.get("Commit"), str):
        commit = obj.get("Commit")

    # Only a short commit prefix is kept in the redacted report.
    if isinstance(commit, str) and len(commit) > 8:
        commit = commit[:8]

    return RedactedFinding(
        repo=repo,
        type=str(detector),
        verified=verified,
        file=str(file_path) if file_path is not None else None,
        commit=str(commit) if commit is not None else None,
        line=line,
    )
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def scan_public_github_repos(
    *,
    owners: list[str],
    include_repos: list[str],
    exclude_repos: list[str],
    include_forks: bool,
    max_repos: int,
    engines: list[str] | None = None,
    timeout_s: int = 900,
    max_concurrency: int = 4,
) -> tuple[dict[str, Any], list[str]]:
    """Scan public repos for the given owners and return a redacted report.

    Args:
        owners: GitHub owners; may contain the sentinels "@me"/"@orgs"/"@all"
            understood by ``_expand_owners``.
        include_repos: fnmatch patterns; when non-empty, only matching
            "owner/name" repos are kept.
        exclude_repos: fnmatch patterns; matching repos are dropped.
        include_forks: whether forks are included during discovery.
        max_repos: hard cap on repos scanned (falsy disables the cap).
        engines: scanner engines to run; supported: "trufflehog", "kingfisher".
            Unknown names are reported and dropped; defaults to ["trufflehog"].
        timeout_s: per-repo scan timeout upper bound (clamped to 30..600s).
        max_concurrency: worker threads per engine (clamped to 1..12).

    Returns:
        (report, errors): ``report`` is a JSON-serializable dict whose
        "findings" list is redacted (no secret values) and capped at 500
        entries; ``errors`` is the same list stored under report["errors"].
    """
    errors: list[str] = []

    # 0) Token + owner expansion.
    repos: list[str] = []
    discovery_errors: list[str] = []
    discovery_method = "gh"

    token, token_errors = _get_github_token()
    errors.extend(token_errors)

    expanded_owners, owner_errs = _expand_owners(owners, token)
    discovery_errors.extend(owner_errs)

    # 1) Discover repos.
    for owner in expanded_owners:
        # Prefer `gh` if available because it respects local auth and avoids rate limits,
        # but fall back to token-only GitHub API when `gh` isn't usable.
        rs, es = _list_public_repos(owner, include_forks=include_forks)
        if rs:
            repos.extend(rs)
        else:
            discovery_method = "github_api"
            rs2, es2 = _list_public_repos_via_api(owner, include_forks=include_forks, token=token)
            repos.extend(rs2)
            discovery_errors.extend(es + es2)

    # Normalize + apply include/exclude filters and the repo cap.
    repos = sorted(set(repos))
    if include_repos:
        repos = [r for r in repos if _match_any(r, include_repos)]
    if exclude_repos:
        repos = [r for r in repos if not _match_any(r, exclude_repos)]
    if max_repos and len(repos) > max_repos:
        repos = repos[:max_repos]

    # Normalize the requested engine list: lowercase, trimmed, deduped.
    requested_engines = [e.strip().lower() for e in (engines or ["trufflehog"]) if e and e.strip()]
    # Dedup while preserving order.
    seen: set[str] = set()
    requested_engines = [e for e in requested_engines if not (e in seen or seen.add(e))]
    supported = {"trufflehog", "kingfisher"}
    unknown = [e for e in requested_engines if e not in supported]
    if unknown:
        errors.append(f"Unknown engines: {unknown}. Supported: {sorted(supported)}")
        requested_engines = [e for e in requested_engines if e in supported]
    if not requested_engines:
        requested_engines = ["trufflehog"]

    # Clamp concurrency to a safe range.
    try:
        max_concurrency = int(max_concurrency)
    except Exception:
        max_concurrency = 4
    max_concurrency = max(1, min(max_concurrency, 12))

    # 2) Run scan engines per repo.
    if not token and any(e in ("trufflehog", "kingfisher") for e in requested_engines):
        errors.append(
            "Missing GitHub token for public GitHub scans. "
            "Set GITHUB_TOKEN/GH_TOKEN or run `gh auth login` then rerun."
        )

    findings: list[RedactedFinding] = []
    engine_summaries: dict[str, Any] = {}

    # Scanning only happens when we have both repos and a token; otherwise we
    # fall through and emit an empty (but well-formed) report.
    if repos and token:
        # Avoid passing tokens on argv (shows up in process lists).
        env = os.environ.copy()
        env["GITHUB_TOKEN"] = token
        env["KF_GITHUB_TOKEN"] = token

        # Interpret `timeout_s` as a *per-repo* timeout upper bound (not total),
        # but clamp it so CI doesn't get stuck.
        per_repo_timeout = max(30, min(int(timeout_s), 600))

        def _scan_one_repo_trufflehog(repo_full: str) -> tuple[str, list[RedactedFinding], list[str]]:
            # Scan a single repo with trufflehog; returns (repo, findings, errors).
            repo_errors: list[str] = []
            repo_findings: list[RedactedFinding] = []

            cmd = [
                "trufflehog",
                "github",
                "--json",
                "--no-update",
                "--results",
                "verified,unverified,unknown",
                "--filter-unverified",
                "--no-fail-on-scan-errors",
                "--repo",
                f"https://github.com/{repo_full}",
            ]
            try:
                res = subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    timeout=per_repo_timeout,
                    env=env,
                )
            except Exception as e:
                return repo_full, [], [f"trufflehog github failed for {repo_full}: {e}"]

            # Exit codes 0 and 183 are both treated as success here (183
            # presumably signals "findings present" — confirm against the
            # trufflehog CLI docs).
            # Some orgs enforce SAML SSO on tokens, which can block even public repo API
            # access. For public repos, retry once without auth (best-effort).
            if res.returncode not in (0, 183):
                stderr = (res.stderr or "").strip()
                if "Resource protected by organization SAML" in stderr:
                    env_no_token = os.environ.copy()
                    env_no_token.pop("GITHUB_TOKEN", None)
                    retry_cmd = [
                        "trufflehog",
                        "github",
                        "--json",
                        "--no-update",
                        "--results",
                        "verified,unverified,unknown",
                        "--filter-unverified",
                        "--no-fail-on-scan-errors",
                        "--no-verification",
                        "--repo",
                        f"https://github.com/{repo_full}",
                    ]
                    try:
                        retry = subprocess.run(
                            retry_cmd,
                            capture_output=True,
                            text=True,
                            timeout=per_repo_timeout,
                            env=env_no_token,
                        )
                        if retry.returncode in (0, 183):
                            res = retry
                        else:
                            repo_errors.append(
                                f"trufflehog github scan error for {repo_full}: exit={res.returncode} "
                                f"stderr={stderr[:600]}"
                            )
                    except Exception:
                        repo_errors.append(
                            f"trufflehog github scan error for {repo_full}: exit={res.returncode} "
                            f"stderr={stderr[:600]}"
                        )
                else:
                    repo_errors.append(
                        f"trufflehog github scan error for {repo_full}: exit={res.returncode} "
                        f"stderr={stderr[:600]}"
                    )

            # Best-effort: parse whatever JSON lines stdout contains, even after
            # a scan error (partial results are still useful).
            if res.stdout:
                for line in res.stdout.splitlines():
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        obj = json.loads(line)
                    except Exception:
                        continue
                    repo_name = (
                        obj.get("SourceMetadata", {})
                        .get("Data", {})
                        .get("Git", {})
                        .get("repository", None)
                    )
                    repo_name = repo_name if isinstance(repo_name, str) and repo_name else repo_full
                    f = _extract_finding(obj, repo=repo_name)
                    if f:
                        # Skip lock file false positives (dependency hashes).
                        if f.file and Path(f.file).name in _LOCK_FILE_BASENAMES:
                            continue
                        # Encode engine in the type for now to keep output schema stable.
                        f.type = f"trufflehog:{f.type}"
                        repo_findings.append(f)

            return repo_full, repo_findings, repo_errors

        def _scan_one_repo_kingfisher(repo_full: str) -> tuple[str, list[RedactedFinding], list[str]]:
            # Scan a single repo with kingfisher; returns (repo, findings, errors).
            repo_errors: list[str] = []
            repo_findings: list[RedactedFinding] = []

            # Use --git-url so we can keep our own repo enumeration/filtering.
            # Use --redact so output never includes plaintext secrets.
            cmd = [
                "kingfisher",
                "scan",
                "--git-url",
                f"https://github.com/{repo_full}.git",
                "--format",
                "jsonl",
                "--redact",
                "--no-update-check",
                "--no-validate",
            ]
            try:
                res = subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    timeout=per_repo_timeout,
                    env=env,
                )
            except Exception as e:
                return repo_full, [], [f"kingfisher failed for {repo_full}: {e}"]

            # Kingfisher writes logs + JSONL to stdout. Parse JSON objects from lines.
            summary_obj: dict[str, Any] | None = None
            for line in (res.stdout or "").splitlines():
                s = line.strip()
                if not s.startswith("{"):
                    continue
                try:
                    obj = json.loads(s)
                except Exception:
                    continue
                # An object with both keys looks like the run summary, not a
                # finding — remember it so we know parsing succeeded.
                if isinstance(obj, dict) and "scan_date" in obj and "findings" in obj:
                    summary_obj = obj
                    continue
                if not isinstance(obj, dict):
                    continue
                rule = obj.get("rule") or obj.get("rule_id") or obj.get("id")
                path = obj.get("path") or obj.get("file") or obj.get("Path") or obj.get("File")
                # Require at least a rule or path to distinguish findings from log lines.
                if not rule and not path:
                    continue
                rule = rule or "kingfisher"
                line_val = obj.get("line") or obj.get("line_num") or obj.get("line_number")
                commit = obj.get("commit") or obj.get("Commit")
                try:
                    line_i = int(line_val) if line_val is not None else None
                except Exception:
                    line_i = None
                if isinstance(commit, str) and len(commit) > 8:
                    commit = commit[:8]
                file_str = str(path) if path is not None else None
                # Skip lock file false positives.
                if file_str and Path(file_str).name in _LOCK_FILE_BASENAMES:
                    continue
                repo_findings.append(
                    RedactedFinding(
                        repo=repo_full,
                        type=f"kingfisher:{rule}",
                        verified=None,
                        file=file_str,
                        commit=str(commit) if commit is not None else None,
                        line=line_i,
                    )
                )

            # If we couldn't parse anything, surface stderr to make it debuggable.
            if summary_obj is None and not repo_findings:
                stderr = (res.stderr or "").strip()
                repo_errors.append(
                    f"kingfisher produced no parseable JSON for {repo_full}: exit={res.returncode} stderr={stderr[:600]}"
                )

            return repo_full, repo_findings, repo_errors

        def _run_engine(
            engine: str,
            scan_one_repo,
        ) -> None:
            # Fan a single engine out over all repos; accumulate into the
            # enclosing `findings`/`errors`/`engine_summaries`.
            per_engine_findings: list[RedactedFinding] = []
            per_engine_errors: list[str] = []
            with ThreadPoolExecutor(max_workers=max_concurrency) as ex:
                futures = [ex.submit(scan_one_repo, r) for r in repos]
                for fut in as_completed(futures):
                    _repo_full, repo_findings, repo_errors = fut.result()
                    if repo_errors:
                        per_engine_errors.extend(repo_errors)
                    if repo_findings:
                        per_engine_findings.extend(repo_findings)
            findings.extend(per_engine_findings)
            errors.extend(per_engine_errors)
            engine_summaries[engine] = {
                "findings_total": len(per_engine_findings),
                "errors_total": len(per_engine_errors),
            }

        if "trufflehog" in requested_engines:
            _run_engine("trufflehog", _scan_one_repo_trufflehog)
        if "kingfisher" in requested_engines:
            _run_engine("kingfisher", _scan_one_repo_kingfisher)

    # Summaries
    verified = sum(1 for f in findings if f.verified is True)
    unverified = sum(1 for f in findings if f.verified is False)
    unknown = sum(1 for f in findings if f.verified is None)

    report: dict[str, Any] = {
        "generated_at": _utc_now(),
        "scope": {
            "owners": owners,
            "owners_expanded": expanded_owners,
            "repos_scanned": repos,
            "repos_scanned_count": len(repos),
            "max_repos": max_repos,
            "include_repos": include_repos,
            "exclude_repos": exclude_repos,
            "include_forks": include_forks,
        },
        "discovery": {
            "method": discovery_method,
            "errors": discovery_errors,
        },
        "engine": {
            "requested_engines": requested_engines,
            "max_concurrency": max_concurrency,
            "per_repo_timeout_s": per_repo_timeout if repos and token else None,
            "summaries": engine_summaries,
        },
        # Redacted: no secret values/snippets included.
        # Capped at 500 entries to keep report size bounded; "findings_total"
        # below still reflects the uncapped count.
        "findings": [f.to_dict() for f in findings[:500]],
        "summary": {
            "findings_total": len(findings),
            "verified": verified,
            "unverified": unverified,
            "unknown": unknown,
        },
        "errors": errors,
    }

    return report, errors
|
|
676
|
+
|
|
677
|
+
|
|
678
|
+
def write_report(path: Path, report: dict[str, Any]) -> None:
    """Serialize *report* to *path* as pretty-printed JSON.

    Parent directories are created as needed. The payload is written to a
    temporary sibling file first and atomically renamed into place, so a
    crash mid-write can never leave a truncated/corrupt report behind.
    UTF-8 is used explicitly rather than relying on the locale default.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(report, indent=2)
    tmp = path.with_suffix(path.suffix + ".tmp")
    tmp.write_text(payload, encoding="utf-8")
    # Atomic on POSIX; replaces any existing report at `path`.
    os.replace(tmp, path)
|