semhound 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- semhound/__init__.py +1 -0
- semhound/ai_client.py +167 -0
- semhound/cli.py +139 -0
- semhound/scanner.py +466 -0
- semhound-0.1.0.dist-info/METADATA +255 -0
- semhound-0.1.0.dist-info/RECORD +10 -0
- semhound-0.1.0.dist-info/WHEEL +5 -0
- semhound-0.1.0.dist-info/entry_points.txt +2 -0
- semhound-0.1.0.dist-info/licenses/LICENSE +21 -0
- semhound-0.1.0.dist-info/top_level.txt +1 -0
semhound/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Package version — keep in sync with the wheel/dist-info metadata.
__version__ = "0.1.0"
|
semhound/ai_client.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import re
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional, Tuple
|
|
6
|
+
|
|
7
|
+
import yaml
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# System prompt used for all providers unless the AI config file supplies
# its own "system_prompt" value (see BaseAIClient.__init__).
_SYSTEM_PROMPT_DEFAULT = (
    "You are a senior application security engineer performing code review. "
    "Evaluate whether the provided code snippet is a true positive security finding. "
    "Be concise and precise."
)
|
|
15
|
+
|
|
16
|
+
_USER_PROMPT_TEMPLATE = """\
|
|
17
|
+
Rule: {rule_message}
|
|
18
|
+
|
|
19
|
+
Code snippet:
|
|
20
|
+
{code_snippet}
|
|
21
|
+
|
|
22
|
+
Respond in JSON only — no markdown, no explanation outside the JSON object:
|
|
23
|
+
{{"confidence": <integer 0-100>, "true_positive": <true or false>, "reasoning": "<one sentence>"}}"""
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _parse_ai_response(text: str) -> Tuple[str, str]:
|
|
27
|
+
match = re.search(r"\{.*?\}", text, re.DOTALL)
|
|
28
|
+
if not match:
|
|
29
|
+
return "ERROR", "ERROR"
|
|
30
|
+
try:
|
|
31
|
+
data = json.loads(match.group())
|
|
32
|
+
confidence = str(data.get("confidence", "ERROR"))
|
|
33
|
+
true_positive = str(data.get("true_positive", "ERROR"))
|
|
34
|
+
return confidence, true_positive
|
|
35
|
+
except (json.JSONDecodeError, KeyError):
|
|
36
|
+
return "ERROR", "ERROR"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class BaseAIClient(ABC):
    """Shared plumbing for the provider-specific AI triage clients.

    Subclasses implement :meth:`analyze`; this base pulls the common
    configuration keys and renders the shared user-prompt template.
    """

    def __init__(self, config: dict):
        # An empty model string means "use the provider's default"; the
        # system prompt falls back to the built-in security-review prompt.
        self.model = config.get("model", "")
        self.system_prompt = config.get("system_prompt", _SYSTEM_PROMPT_DEFAULT)

    @abstractmethod
    def analyze(self, code_snippet: str, rule_message: str) -> Tuple[str, str]:
        """Return (confidence_score, true_positive) as strings."""

    def _build_user_prompt(self, code_snippet: str, rule_message: str) -> str:
        """Render the shared per-finding prompt for one Semgrep result."""
        rendered = _USER_PROMPT_TEMPLATE.format(
            rule_message=rule_message, code_snippet=code_snippet
        )
        return rendered
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class ClaudeClient(BaseAIClient):
    """AI triage client backed by Anthropic's Messages API."""

    def __init__(self, config: dict):
        super().__init__(config)
        # The SDK is imported lazily so the other providers work without it.
        try:
            import anthropic
        except ImportError:
            raise ImportError(
                "Anthropic SDK not installed. Run: pip install semhound"
            ) from None
        self._client = anthropic.Anthropic(api_key=config["api_key"])

    def analyze(self, code_snippet: str, rule_message: str) -> tuple[str, str]:
        """Triage one finding via Claude; returns (confidence, true_positive)."""
        user_prompt = self._build_user_prompt(code_snippet, rule_message)
        reply = self._client.messages.create(
            model=self.model or "claude-sonnet-4-6",
            max_tokens=256,
            system=self.system_prompt,
            messages=[{"role": "user", "content": user_prompt}],
        )
        return _parse_ai_response(reply.content[0].text)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class GeminiClient(BaseAIClient):
    """AI triage client backed by Google's Generative AI SDK."""

    def __init__(self, config: dict):
        super().__init__(config)
        # Lazy SDK import: only this provider needs google-generativeai.
        try:
            import google.generativeai as genai
        except ImportError:
            raise ImportError(
                "Google Generative AI SDK not installed. Run: pip install semhound"
            ) from None
        genai.configure(api_key=config["api_key"])
        # The system prompt is bound to the model object itself.
        self._model = genai.GenerativeModel(
            model_name=self.model or "gemini-1.5-pro",
            system_instruction=self.system_prompt,
        )

    def analyze(self, code_snippet: str, rule_message: str) -> tuple[str, str]:
        """Triage one finding via Gemini; returns (confidence, true_positive)."""
        user_prompt = self._build_user_prompt(code_snippet, rule_message)
        reply = self._model.generate_content(user_prompt)
        return _parse_ai_response(reply.text)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class OpenAIClient(BaseAIClient):
    """AI triage client backed by OpenAI's chat-completions API."""

    def __init__(self, config: dict):
        super().__init__(config)
        # Lazy SDK import keeps other providers usable without openai.
        try:
            import openai
        except ImportError:
            raise ImportError(
                "OpenAI SDK not installed. Run: pip install semhound"
            ) from None
        self._client = openai.OpenAI(api_key=config["api_key"])

    def analyze(self, code_snippet: str, rule_message: str) -> tuple[str, str]:
        """Triage one finding via OpenAI; returns (confidence, true_positive)."""
        user_prompt = self._build_user_prompt(code_snippet, rule_message)
        conversation = [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        reply = self._client.chat.completions.create(
            model=self.model or "gpt-4o",
            max_tokens=256,
            messages=conversation,
        )
        return _parse_ai_response(reply.choices[0].message.content)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class BedrockClient(BaseAIClient):
    """Uses the Bedrock Converse API — model-agnostic, works with any Bedrock-hosted model."""

    def __init__(self, config: dict):
        super().__init__(config)
        # Lazy import: boto3 is only needed for this provider.
        try:
            import boto3
        except ImportError:
            raise ImportError(
                "boto3 not installed. Run: pip install semhound"
            ) from None
        profile = config.get("aws_profile")
        region = config.get("aws_region", "us-east-1")
        # A named profile is honoured when configured; otherwise fall back to
        # boto3's default credential chain.
        if profile:
            session = boto3.Session(profile_name=profile)
        else:
            session = boto3.Session()
        self._client = session.client("bedrock-runtime", region_name=region)
        self._model_id = self.model or "anthropic.claude-3-5-sonnet-20241022-v2:0"

    def analyze(self, code_snippet: str, rule_message: str) -> tuple[str, str]:
        """Triage one finding via Bedrock Converse; returns (confidence, true_positive)."""
        user_prompt = self._build_user_prompt(code_snippet, rule_message)
        reply = self._client.converse(
            modelId=self._model_id,
            system=[{"text": self.system_prompt}],
            messages=[{"role": "user", "content": [{"text": user_prompt}]}],
            inferenceConfig={"maxTokens": 256},
        )
        answer_text = reply["output"]["message"]["content"][0]["text"]
        return _parse_ai_response(answer_text)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
# Maps the "provider" value from the AI config file (lower-cased in
# get_ai_client) to the client class that implements it.
_PROVIDER_MAP = {
    "claude": ClaudeClient,
    "gemini": GeminiClient,
    "openai": OpenAIClient,
    "bedrock": BedrockClient,
}
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def get_ai_client(config_path: Optional[str]) -> Optional[BaseAIClient]:
    """Load the YAML AI config and construct the matching provider client.

    Args:
        config_path: Path to the config file, or None to skip AI analysis.

    Returns:
        A configured client, or None when config_path is None.

    Raises:
        FileNotFoundError: the config file does not exist.
        ValueError: the config file is empty/malformed or names an unknown
            provider.
        ImportError: the chosen provider's SDK is not installed.
    """
    if config_path is None:
        return None
    path = Path(config_path)
    if not path.exists():
        raise FileNotFoundError(f"AI config not found: {config_path}")
    with open(path) as f:
        config = yaml.safe_load(f)
    # safe_load returns None for an empty file (and may return a scalar for
    # junk input); fail with a clear message instead of crashing on
    # config.get below.
    if not isinstance(config, dict):
        raise ValueError(f"AI config is empty or malformed: {config_path}")
    provider = config.get("provider", "").lower()
    cls = _PROVIDER_MAP.get(provider)
    if cls is None:
        raise ValueError(f"Unknown AI provider '{provider}'. Choose from: {', '.join(_PROVIDER_MAP)}")
    return cls(config)
|
semhound/cli.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import shutil
|
|
3
|
+
import sys
|
|
4
|
+
|
|
5
|
+
from .ai_client import get_ai_client
|
|
6
|
+
from .scanner import discover_repos, download_rules, run_preflight, run_scan
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def main() -> None:
    """Command-line entry point.

    Parses arguments, validates targets and rule sources, runs environment
    preflight checks, then scans every repository of each requested GitHub
    org/user.  Exits non-zero on invalid arguments, failed preflight, rule
    download failure, or an unusable AI config.
    """
    parser = argparse.ArgumentParser(
        prog="semhound",
        description=(
            "Scan every repository across one or more GitHub organisations or users "
            "using Semgrep rules."
        ),
    )
    parser.add_argument(
        "github_targets",
        nargs="*",
        metavar="ORG_OR_USER",
        help=(
            "One or more GitHub organization or username targets to scan "
            "(e.g. my-org another-org someuser)"
        ),
    )
    parser.add_argument(
        "--orgs-file",
        default=None,
        metavar="PATH",
        help=(
            "Path to a text file listing GitHub org names to scan, one per line. "
            "Blank lines and lines starting with '#' are ignored. "
            "Can be combined with inline ORG_OR_USER arguments."
        ),
    )
    parser.add_argument(
        "--rules-dir",
        default=None,
        metavar="PATH",
        help="Path to a local folder containing Semgrep .yaml rule files",
    )
    parser.add_argument(
        "--rules-url",
        action="append",
        default=[],
        metavar="URL",
        help=(
            "HTTPS URL of a Semgrep .yaml rule file to download before scanning. "
            "Can be specified multiple times to download several rules."
        ),
    )
    parser.add_argument(
        "--ai-config",
        default=None,
        metavar="PATH",
        help="Path to AI config file (ai.config). Omit to skip AI analysis.",
    )
    parser.add_argument(
        "--threads",
        type=int,
        default=5,
        metavar="N",
        help="Number of parallel worker threads (default: 5)",
    )
    parser.add_argument(
        "--sarif",
        action="store_true",
        default=False,
        help="Also write a SARIF 2.1.0 report (<target>_scan.sarif) alongside the CSV",
    )
    args = parser.parse_args()

    targets: list[str] = list(args.github_targets)

    # Merge targets from --orgs-file: one name per line, '#' starts a comment.
    if args.orgs_file:
        try:
            with open(args.orgs_file, encoding="utf-8") as fh:
                for line in fh:
                    name = line.strip()
                    if name and not name.startswith("#"):
                        targets.append(name)
        except OSError as exc:
            parser.error(f"Cannot read --orgs-file: {exc}")

    # Deduplicate while preserving order
    seen: set[str] = set()
    unique_targets: list[str] = []
    for t in targets:
        if t not in seen:
            seen.add(t)
            unique_targets.append(t)
    targets = unique_targets

    if not targets:
        parser.error(
            "At least one GitHub org or username must be provided "
            "(inline or via --orgs-file)."
        )

    if not args.rules_dir and not args.rules_url:
        parser.error("At least one of --rules-dir or --rules-url must be provided.")

    for url in args.rules_url:
        if not url.lower().startswith("https://"):
            parser.error(f"--rules-url only accepts HTTPS URLs: {url}")

    run_preflight()

    # Build the AI client BEFORE downloading rules: previously a bad
    # --ai-config caused sys.exit(1) after download_rules had succeeded,
    # leaking the downloaded-rules temp directory.
    try:
        ai_client = get_ai_client(args.ai_config)
    except (FileNotFoundError, ValueError, ImportError) as exc:
        print(f"[error] {exc}", file=sys.stderr)
        sys.exit(1)

    if ai_client is None:
        print("[info] No --ai-config provided; AI analysis will be skipped.")

    rules_sources: list[str] = []
    if args.rules_dir:
        rules_sources.append(args.rules_dir)

    downloaded_tmpdir: str | None = None
    try:
        if args.rules_url:
            try:
                downloaded_tmpdir = download_rules(args.rules_url)
                rules_sources.append(downloaded_tmpdir)
            except (ValueError, RuntimeError) as exc:
                print(f"[error] {exc}", file=sys.stderr)
                sys.exit(1)

        for target in targets:
            print(f"\n[info] Discovering repositories for '{target}' ...")
            repos = discover_repos(target)
            print(f"[info] Found {len(repos)} repository/repositories.")
            run_scan(repos, target, rules_sources, ai_client, args.threads, output_sarif=args.sarif)
    finally:
        # Always clean up the downloaded-rules temp dir, even when a scan
        # raises or sys.exit() fires inside the try block.
        if downloaded_tmpdir:
            shutil.rmtree(downloaded_tmpdir, ignore_errors=True)
|
semhound/scanner.py
ADDED
|
@@ -0,0 +1,466 @@
|
|
|
1
|
+
import csv
import json
import shutil
import subprocess
import sys
import tempfile
import threading
import time
import urllib.error
import urllib.parse
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Optional

from tqdm import tqdm

from .ai_client import BaseAIClient
|
|
18
|
+
|
|
19
|
+
# Maximum attempts per AI analysis call; exponential backoff between tries.
_MAX_RETRIES = 3
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
_TOOL_INSTALL = {
|
|
23
|
+
"gh": {
|
|
24
|
+
"name": "GitHub CLI",
|
|
25
|
+
"url": "https://cli.github.com",
|
|
26
|
+
"mac": "brew install gh",
|
|
27
|
+
"linux": "sudo apt install gh # Debian/Ubuntu (apt.cli.github.com)\n"
|
|
28
|
+
" sudo dnf install gh # Fedora/RHEL\n"
|
|
29
|
+
" See https://github.com/cli/cli/blob/trunk/docs/install_linux.md",
|
|
30
|
+
"windows": "winget install --id GitHub.cli\n"
|
|
31
|
+
" # or: choco install gh",
|
|
32
|
+
},
|
|
33
|
+
"git": {
|
|
34
|
+
"name": "Git",
|
|
35
|
+
"url": "https://git-scm.com",
|
|
36
|
+
"mac": "brew install git",
|
|
37
|
+
"linux": "sudo apt install git # Debian/Ubuntu\n"
|
|
38
|
+
" sudo dnf install git # Fedora/RHEL",
|
|
39
|
+
"windows": "winget install --id Git.Git\n"
|
|
40
|
+
" # or: choco install git — then restart your terminal",
|
|
41
|
+
},
|
|
42
|
+
"semgrep": {
|
|
43
|
+
"name": "Semgrep",
|
|
44
|
+
"url": "https://semgrep.dev",
|
|
45
|
+
"mac": "brew install semgrep\n"
|
|
46
|
+
" # or: pip install semgrep",
|
|
47
|
+
"linux": "pip install semgrep",
|
|
48
|
+
"windows": "pip install semgrep",
|
|
49
|
+
},
|
|
50
|
+
"ssh": {
|
|
51
|
+
"name": "OpenSSH client",
|
|
52
|
+
"url": "https://www.openssh.com",
|
|
53
|
+
"mac": "ssh ships with macOS — no install needed",
|
|
54
|
+
"linux": "sudo apt install openssh-client # Debian/Ubuntu\n"
|
|
55
|
+
" sudo dnf install openssh # Fedora/RHEL",
|
|
56
|
+
"windows": "# OpenSSH ships with Windows 10/11. If missing, run in PowerShell (Admin):\n"
|
|
57
|
+
" Add-WindowsCapability -Online -Name OpenSSH.Client~~~~0.0.1.0",
|
|
58
|
+
},
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _print_install_hint(tool: str, system: str) -> None:
|
|
63
|
+
info = _TOOL_INSTALL.get(tool, {"name": tool, "url": "", "mac": "", "linux": "", "windows": ""})
|
|
64
|
+
if system == "Darwin":
|
|
65
|
+
cmd = info.get("mac", "")
|
|
66
|
+
elif system == "Windows":
|
|
67
|
+
cmd = info.get("windows", "")
|
|
68
|
+
else:
|
|
69
|
+
cmd = info.get("linux", "")
|
|
70
|
+
print(f"\n {info['name']} — {info['url']}", file=sys.stderr)
|
|
71
|
+
print(f" Install: {cmd}", file=sys.stderr)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def run_preflight() -> None:
    """Verify the local environment before any scanning starts.

    Checks, in order: required CLI tools on PATH (gh, git, semgrep, ssh),
    GitHub CLI authentication, token scopes (warning only), and SSH
    connectivity to github.com.  Prints remediation guidance and exits with
    status 1 on any hard failure.
    """
    import platform
    system = platform.system()

    # 1) Every external tool semhound shells out to must be on PATH.
    missing = [t for t in ("gh", "git", "semgrep", "ssh") if shutil.which(t) is None]
    if missing:
        print("[error] The following required tools are missing from PATH:", file=sys.stderr)
        for tool in missing:
            _print_install_hint(tool, system)
        print(file=sys.stderr)
        print(" Install the tools above, then re-run semhound.", file=sys.stderr)
        sys.exit(1)

    # 2) Repository discovery uses the GitHub CLI, so it must be logged in.
    auth = subprocess.run(["gh", "auth", "status"], capture_output=True, text=True)
    if auth.returncode != 0:
        print("[error] GitHub CLI is not authenticated.", file=sys.stderr)
        print(" Run: gh auth login", file=sys.stderr)
        print(" Docs: https://cli.github.com/manual/gh_auth_login", file=sys.stderr)
        sys.exit(1)

    # Warn if the token is visibly missing scopes semhound needs.
    # Fine-grained PATs may not list scopes, so only warn when we can
    # positively confirm scopes are present but the required ones are absent.
    auth_output = auth.stdout + auth.stderr
    if "Token scopes:" in auth_output:
        missing_scopes = [s for s in ("repo", "read:org") if f"'{s}'" not in auth_output]
        if missing_scopes:
            print(
                f"[warn] GitHub token may be missing scopes: {', '.join(missing_scopes)}",
                file=sys.stderr,
            )
            print(
                " Private repos and org membership require these scopes.",
                file=sys.stderr,
            )
            print(
                f" Re-authenticate if needed: gh auth login --scopes {','.join(missing_scopes)}",
                file=sys.stderr,
            )

    # 3) Clones use SSH URLs, so SSH auth against github.com must work.
    # NOTE: success is detected via the "Hi <user>!" greeting in the output
    # rather than the exit code (ssh -T to GitHub exits non-zero by design).
    result = subprocess.run(
        ["ssh", "-T", "-o", "StrictHostKeyChecking=no", "git@github.com"],
        capture_output=True,
        text=True,
    )
    combined = result.stdout + result.stderr
    if "Hi " not in combined:
        print("[error] GitHub SSH authentication failed.", file=sys.stderr)
        # Classify the most common failure modes for actionable guidance.
        if "Permission denied" in combined:
            print(
                " Your SSH key is not registered with GitHub or does not have access.",
                file=sys.stderr,
            )
            print(" Add your public key at: https://github.com/settings/keys", file=sys.stderr)
        elif "Connection refused" in combined or "timed out" in combined.lower():
            print(
                " Could not reach GitHub over SSH — port 22 may be blocked by your network.",
                file=sys.stderr,
            )
            print(
                " Try SSH over HTTPS port 443: "
                "https://docs.github.com/en/authentication/troubleshooting-ssh/using-ssh-over-the-https-port",
                file=sys.stderr,
            )
        else:
            print(
                " Ensure an SSH key is added to your GitHub account:",
                file=sys.stderr,
            )
            print(
                " https://docs.github.com/en/authentication/connecting-to-github-with-ssh",
                file=sys.stderr,
            )
        print(f" SSH output: {combined.strip()}", file=sys.stderr)
        sys.exit(1)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def discover_repos(target: str) -> list[dict]:
    """List the repositories of a GitHub org/user via the GitHub CLI.

    Returns a list of dicts with keys "name", "sshUrl" and "defaultBranch"
    (falling back to "main" when the repo has no default branch, e.g. an
    empty repository).  On any `gh` failure the error is classified into a
    human-readable message and the process exits with status 1.
    """
    result = subprocess.run(
        ["gh", "repo", "list", target, "--limit", "1000", "--json", "sshUrl,name,defaultBranchRef"],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        # gh writes errors to stderr in most cases, but fall back to stdout.
        err = (result.stderr or result.stdout).strip()
        err_lower = err.lower()
        # Classify the failure for a targeted hint; order matters — the
        # first matching branch wins.
        if "could not resolve" in err_lower or "not found" in err_lower:
            print(
                f"[error] '{target}' was not found on GitHub. "
                "Check the org name or username spelling.",
                file=sys.stderr,
            )
        elif (
            "must have admin rights" in err_lower
            or "permission" in err_lower
            or "403" in err
            or "forbidden" in err_lower
        ):
            print(
                f"[error] Permission denied when listing repositories for '{target}'.",
                file=sys.stderr,
            )
            print(
                " Your GitHub token may be missing the 'repo' or 'read:org' scope.",
                file=sys.stderr,
            )
            print(
                " Re-authenticate: gh auth login --scopes repo,read:org",
                file=sys.stderr,
            )
        elif "token" in err_lower and ("scope" in err_lower or "grant" in err_lower):
            print(
                f"[error] GitHub token lacks the required scopes to list repositories for '{target}'.",
                file=sys.stderr,
            )
            print(
                " Re-authenticate: gh auth login --scopes repo,read:org",
                file=sys.stderr,
            )
        elif "rate limit" in err_lower or "429" in err:
            print(
                "[error] GitHub API rate limit exceeded. Wait a few minutes and retry.",
                file=sys.stderr,
            )
        else:
            print(
                f"[error] Failed to list repositories for '{target}': {err}",
                file=sys.stderr,
            )
        sys.exit(1)

    raw = json.loads(result.stdout)
    repos = []
    for r in raw:
        # defaultBranchRef is null for empty repositories.
        branch_ref = r.get("defaultBranchRef") or {}
        repos.append({
            "name": r["name"],
            "sshUrl": r["sshUrl"],
            "defaultBranch": branch_ref.get("name", "main"),
        })
    return repos
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def download_rules(urls: list[str]) -> str:
    """Download semgrep rule files from HTTPS URLs into a new temp directory.

    Each file keeps the basename of the URL *path* — query strings and
    fragments never leak into the filename — and name collisions get a
    numeric suffix (name_1.yaml, name_2.yaml, ...).  On any failure the
    temp directory is removed before the error propagates.

    Args:
        urls: HTTPS URLs of Semgrep .yaml rule files.

    Returns:
        The temp directory path. Caller is responsible for cleanup.

    Raises:
        ValueError: a URL does not use https://.
        RuntimeError: a download failed.
    """
    tmpdir = tempfile.mkdtemp(prefix="semhound_rules_")
    for url in urls:
        if not url.lower().startswith("https://"):
            shutil.rmtree(tmpdir, ignore_errors=True)
            raise ValueError(f"Only HTTPS URLs are allowed for rule downloads: {url}")
        # urlsplit isolates the path component, stripping both the query
        # string and the fragment (the old split("?") kept fragments).
        filename = Path(urllib.parse.urlsplit(url).path).name or "rule.yaml"
        base = Path(tmpdir) / filename
        dest = base
        counter = 1
        # Always suffix the ORIGINAL stem; the previous code suffixed the
        # already-suffixed name, producing rule_1_2.yaml on a third clash.
        while dest.exists():
            dest = base.with_name(f"{base.stem}_{counter}{base.suffix}")
            counter += 1
        print(f"[info] Downloading rule: {url}")
        try:
            urllib.request.urlretrieve(url, dest)  # noqa: S310 – URL validated above
        except urllib.error.URLError as exc:
            shutil.rmtree(tmpdir, ignore_errors=True)
            raise RuntimeError(f"Failed to download rule from {url}: {exc}") from exc
    return tmpdir
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def _run_cmd(args: list, cwd: Optional[str] = None) -> subprocess.CompletedProcess:
|
|
243
|
+
return subprocess.run(args, capture_output=True, text=True, cwd=cwd)
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def _analyze_with_retry(
    ai_client: BaseAIClient,
    snippet: str,
    message: str,
    name: str,
    rule_id: str,
) -> tuple[str, str]:
    """Call ai_client.analyze with up to _MAX_RETRIES attempts.

    Retries with exponential backoff (1s, 2s, ...) both when the call raises
    and when it reports an "ERROR" confidence.  An exception on the final
    attempt is surfaced as ("ERROR", <truncated message>); a persistent
    "ERROR" result falls through to ("ERROR", "ERROR").
    """
    last = _MAX_RETRIES - 1
    for attempt in range(_MAX_RETRIES):
        wait = 2 ** attempt
        try:
            confidence, true_positive = ai_client.analyze(snippet, message)
        except Exception as exc:
            if attempt == last:
                return "ERROR", str(exc)[:80]
            tqdm.write(f" [retry] {name} — {rule_id} (attempt {attempt + 1}, error: {str(exc)[:60]}, retrying in {wait}s)")
            time.sleep(wait)
            continue
        if confidence != "ERROR":
            return confidence, true_positive
        if attempt < last:
            tqdm.write(f" [retry] {name} — {rule_id} (attempt {attempt + 1}, retrying in {wait}s)")
            time.sleep(wait)
    return "ERROR", "ERROR"
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def _write_sarif(results: list[dict], output_file: str) -> None:
|
|
273
|
+
rules_seen: dict[str, str] = {}
|
|
274
|
+
for r in results:
|
|
275
|
+
rules_seen.setdefault(r["rule_id"], r["message"])
|
|
276
|
+
|
|
277
|
+
sarif = {
|
|
278
|
+
"$schema": "https://raw.githubusercontent.com/oasis-tcs/sarif-spec/master/Schemata/sarif-schema-2.1.0.json",
|
|
279
|
+
"version": "2.1.0",
|
|
280
|
+
"runs": [{
|
|
281
|
+
"tool": {
|
|
282
|
+
"driver": {
|
|
283
|
+
"name": "semhound",
|
|
284
|
+
"version": "0.1.0",
|
|
285
|
+
"rules": [
|
|
286
|
+
{
|
|
287
|
+
"id": rid,
|
|
288
|
+
"shortDescription": {"text": msg[:200]},
|
|
289
|
+
"fullDescription": {"text": msg},
|
|
290
|
+
}
|
|
291
|
+
for rid, msg in rules_seen.items()
|
|
292
|
+
],
|
|
293
|
+
}
|
|
294
|
+
},
|
|
295
|
+
"results": [
|
|
296
|
+
{
|
|
297
|
+
"ruleId": r["rule_id"],
|
|
298
|
+
"message": {"text": r["message"]},
|
|
299
|
+
"locations": [{
|
|
300
|
+
"physicalLocation": {
|
|
301
|
+
"artifactLocation": {"uri": r["permalink"]},
|
|
302
|
+
"region": {"startLine": r["line"]},
|
|
303
|
+
}
|
|
304
|
+
}],
|
|
305
|
+
"properties": {
|
|
306
|
+
"repository": r["repo"],
|
|
307
|
+
"confidence": r["confidence"],
|
|
308
|
+
"truePositive": r["true_positive"],
|
|
309
|
+
},
|
|
310
|
+
}
|
|
311
|
+
for r in results
|
|
312
|
+
],
|
|
313
|
+
}],
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
with open(output_file, "w", encoding="utf-8") as fh:
|
|
317
|
+
json.dump(sarif, fh, indent=2)
|
|
318
|
+
print(f"SARIF report written to: {output_file}")
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def _scan_repo(
    repo: dict,
    org: str,
    rules_sources: list[str],
    ai_client: Optional[BaseAIClient],
    csv_writer: "csv.writer",
    csv_lock: threading.Lock,
    sarif_results: list,
    sarif_lock: threading.Lock,
    progress: tqdm,
) -> None:
    """Clone one repository, run Semgrep over it, optionally AI-triage each
    finding, and record results via the shared CSV writer and SARIF list.

    Runs on a thread-pool worker: writes to csv_writer and sarif_results are
    serialized through their respective locks, and the progress bar is
    advanced exactly once in the finally block regardless of outcome.
    """
    name = repo["name"]
    ssh_url = repo["sshUrl"]

    # Throwaway checkout directory per repo; removed in the finally block.
    tempdir = tempfile.mkdtemp(prefix=f"semhound_{name}_")
    try:
        tqdm.write(f" [clone] {name}")
        # Shallow, single-branch, tag-less clone — only HEAD is scanned.
        clone = _run_cmd([
            "git", "clone",
            "--depth", "1",
            "--single-branch",
            "--no-tags",
            ssh_url,
            tempdir,
        ])
        if clone.returncode != 0:
            err = clone.stderr.strip()
            # Classify the common clone failures; a failed repo is skipped,
            # never fatal for the whole scan.
            if "Permission denied (publickey)" in err:
                tqdm.write(
                    f" [skip] {name} — SSH key rejected by GitHub. "
                    "Ensure your key has read access to this repository."
                )
            elif "Repository not found" in err or "Could not read from remote repository" in err:
                tqdm.write(
                    f" [skip] {name} — repository not found or your account lacks read access."
                )
            else:
                tqdm.write(f" [skip] {name} — clone failed: {err[:200]}")
            return

        # Resolve the exact commit so permalinks stay stable; fall back to
        # the symbolic "HEAD" if rev-parse fails.
        rev = _run_cmd(["git", "rev-parse", "HEAD"], cwd=tempdir)
        commit_id = rev.stdout.strip() if rev.returncode == 0 else "HEAD"

        tqdm.write(f" [scan] {name}")
        # --jobs 1: parallelism comes from our thread pool, not from Semgrep.
        semgrep_cmd = ["semgrep", "--jobs", "1"]
        for src in rules_sources:
            semgrep_cmd += ["--config", src]
        semgrep_cmd += ["--json", "--quiet", tempdir]
        semgrep = _run_cmd(semgrep_cmd)

        # Exit code 1 just means "findings exist"; anything else is an error.
        if semgrep.returncode not in (0, 1):
            tqdm.write(f" [warn] {name} — semgrep exited {semgrep.returncode}")

        try:
            raw_findings = json.loads(semgrep.stdout).get("results", [])
        except json.JSONDecodeError:
            tqdm.write(f" [warn] {name} — could not parse semgrep output")
            raw_findings = []

        sarif_batch: list[dict] = []

        for finding in raw_findings:
            # Path relative to the checkout root, used in the permalink.
            rel_path = Path(finding["path"]).relative_to(tempdir)
            line = finding["start"]["line"]
            rule_id = finding.get("check_id", "unknown")
            message = finding.get("extra", {}).get("message", rule_id)
            snippet = finding.get("extra", {}).get("lines", "").strip()
            permalink = f"https://github.com/{org}/{name}/blob/{commit_id}/{rel_path}#L{line}"

            # Empty strings mean "AI analysis skipped" in the CSV/SARIF.
            confidence, true_positive = "", ""
            if ai_client is not None:
                tqdm.write(f" [analyze] {name} — {rule_id}")
                confidence, true_positive = _analyze_with_retry(
                    ai_client, snippet, message, name, rule_id
                )
                tqdm.write(
                    f" [ai] {name} — {rule_id} | "
                    f"confidence={confidence} true_positive={true_positive}"
                )

            # csv.writer is not thread-safe; serialize row writes.
            with csv_lock:
                csv_writer.writerow([name, rule_id, message, permalink, confidence, true_positive])

            sarif_batch.append({
                "repo": name,
                "rule_id": rule_id,
                "message": message,
                "permalink": permalink,
                "line": line,
                "confidence": confidence,
                "true_positive": true_positive,
            })

        # Publish this repo's findings to the shared list in one locked step.
        with sarif_lock:
            sarif_results.extend(sarif_batch)

        tqdm.write(f" [done] {name} — {len(raw_findings)} finding(s)")

    finally:
        shutil.rmtree(tempdir, ignore_errors=True)
        progress.update(1)
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
def run_scan(
    repos: list,
    org: str,
    rules_sources: list[str],
    ai_client: Optional[BaseAIClient],
    threads: int,
    output_sarif: bool = False,
) -> None:
    """Scan all *repos* of *org* in parallel and write <org>_scan.csv
    (plus <org>_scan.sarif when output_sarif is True).

    Each repository is processed by _scan_repo on a thread-pool worker.  The
    CSV writer and the SARIF result list are shared across workers and
    guarded by dedicated locks; a failing repo is reported but never aborts
    the scan of the remaining repositories.
    """
    if not repos:
        print("[info] No repositories found for this organization.")
        return

    output_file = f"{org}_scan.csv"
    csv_lock = threading.Lock()
    sarif_results: list[dict] = []
    sarif_lock = threading.Lock()

    with open(output_file, "w", newline="", encoding="utf-8") as fh:
        writer = csv.writer(fh)
        writer.writerow([
            "Repository", "Rule", "Issue Description", "Location",
            "Confidence Score (AI)", "True Positive (AI)",
        ])

        progress = tqdm(total=len(repos), desc=f"Scanning {org}", unit="repo")
        with ThreadPoolExecutor(max_workers=threads) as pool:
            futures = {
                pool.submit(
                    _scan_repo, repo, org, rules_sources, ai_client,
                    writer, csv_lock, sarif_results, sarif_lock, progress,
                ): repo["name"]
                for repo in repos
            }
            # Surface (but don't re-raise) worker exceptions so one bad repo
            # cannot take down the whole run.
            for future in as_completed(futures):
                exc = future.exception()
                if exc:
                    tqdm.write(f" [error] {futures[future]} — {exc}")
        progress.close()

    print(f"\nResults written to: {output_file}")

    if output_sarif:
        _write_sarif(sarif_results, f"{org}_scan.sarif")
|
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: semhound
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Scan every repository across your GitHub organisations using Semgrep rules, with optional AI triage
|
|
5
|
+
Author-email: Rohit Salecha <i@rohitsalecha.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/salecharohit/semhound
|
|
8
|
+
Project-URL: Issues, https://github.com/salecharohit/semhound/issues
|
|
9
|
+
Keywords: security,semgrep,github,appsec,threat-hunting,sast
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Information Technology
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Security
|
|
22
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
23
|
+
Requires-Python: >=3.9
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Requires-Dist: tqdm>=4.66
|
|
27
|
+
Requires-Dist: pyyaml>=6.0
|
|
28
|
+
Requires-Dist: anthropic>=0.25
|
|
29
|
+
Requires-Dist: google-generativeai>=0.5
|
|
30
|
+
Requires-Dist: openai>=1.30
|
|
31
|
+
Requires-Dist: boto3>=1.34
|
|
32
|
+
Dynamic: license-file
|
|
33
|
+
|
|
34
|
+
# semhound
|
|
35
|
+
|
|
36
|
+
[](https://github.com/salecharohit/semhound/actions/workflows/release.yml)
|
|
37
|
+
[](https://pypi.org/project/semhound)
|
|
38
|
+
[](https://pypi.org/project/semhound)
|
|
39
|
+
[](https://pypi.org/project/semhound)
|
|
40
|
+
[](LICENSE)
|
|
41
|
+
|
|
42
|
+
**semhound** automates Semgrep scanning at org scale — you bring the rules, it handles discovery, cloning, scanning, and reporting across every repository in one or more GitHub organisations or user accounts. Optionally route each finding through an AI provider to triage true vs. false positives with a customised prompt.
|
|
43
|
+
|
|
44
|
+
Just like [TruffleHog](https://github.com/trufflesecurity/trufflehog) sweeps repos for secrets, semhound sweeps repos for any code pattern you define.
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## How it works
|
|
49
|
+
|
|
50
|
+
1. **Discover** — uses `gh repo list` to find every repository for each target (org or user)
|
|
51
|
+
2. **Clone** — shallow-clones each repo in parallel (`--depth 1`) via SSH
|
|
52
|
+
3. **Scan** — runs your Semgrep rules across every cloned repo
|
|
53
|
+
4. **Report** — writes a consolidated CSV (and optional SARIF) per target, with GitHub permalinks to every finding
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## Use-cases
|
|
58
|
+
|
|
59
|
+
**Bug bounty SQL injection — identify the same pattern across all repos**
|
|
60
|
+
A bug bounty report flagged a SQL injection in one of your apps. Write a Semgrep rule for that pattern and sweep your entire org to find every other repo where the same issue exists.
|
|
61
|
+
|
|
62
|
+
**Zero-day in a third-party OSS library — find every repo still running the vulnerable version**
|
|
63
|
+
A zero-day drops for a widely-used library — think log4j. Write a Semgrep rule that matches that version string in dependency files and sweep all your orgs in one pass. You get an immediate list of every repo still running the vulnerable version so you can prioritise upgrades before the exploit is weaponised.
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## Prerequisites
|
|
68
|
+
|
|
69
|
+
The following tools must be installed and on your `PATH`. semhound checks for all of them at startup and prints platform-specific install instructions for anything missing.
|
|
70
|
+
|
|
71
|
+
| Tool | macOS | Linux | Windows |
|
|
72
|
+
|------|-------|-------|---------|
|
|
73
|
+
| [GitHub CLI `gh`](https://cli.github.com) — repo discovery | `brew install gh` | [install guide](https://github.com/cli/cli/blob/trunk/docs/install_linux.md) | `winget install --id GitHub.cli` |
|
|
74
|
+
| `git` — shallow cloning | `brew install git` | `sudo apt install git` | `winget install --id Git.Git` |
|
|
75
|
+
| [Semgrep](https://semgrep.dev) — static analysis | `brew install semgrep` | `pip install semgrep` | `pip install semgrep` |
|
|
76
|
+
| OpenSSH — cloning via SSH | ships with macOS | `sudo apt install openssh-client` | ships with Windows 10/11 |
|
|
77
|
+
|
|
78
|
+
**Authenticate the GitHub CLI** (once):
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
gh auth login
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
**Register an SSH key** with your GitHub account (once) so semhound can clone private repos:
|
|
85
|
+
[docs.github.com/en/authentication/connecting-to-github-with-ssh](https://docs.github.com/en/authentication/connecting-to-github-with-ssh)
|
|
86
|
+
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
## Installation
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
pip install semhound
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
**From source** (for local development):
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
git clone git@github.com:salecharohit/semhound.git
|
|
99
|
+
cd semhound
|
|
100
|
+
pip install -e .
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## Usage
|
|
106
|
+
|
|
107
|
+
```
|
|
108
|
+
semhound [ORG_OR_USER ...] [--orgs-file PATH]
|
|
109
|
+
--rules-dir PATH Local folder of Semgrep .yaml rule files
|
|
110
|
+
--rules-url URL HTTPS URL of a Semgrep rule file (repeatable)
|
|
111
|
+
--ai-config PATH AI provider config file (omit to skip AI triage)
|
|
112
|
+
--threads N Parallel worker threads per target (default: 5)
|
|
113
|
+
--sarif Also write a SARIF 2.1.0 report alongside the CSV
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Pass one or more GitHub org names or usernames inline, load a list from `--orgs-file`, or mix both. All targets are deduplicated and scanned sequentially; each produces its own `<target>_scan.csv`.
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
# Single org
|
|
120
|
+
semhound acme-corp --rules-dir ./rules
|
|
121
|
+
|
|
122
|
+
# Single user account
|
|
123
|
+
semhound octocat --rules-dir ./rules
|
|
124
|
+
|
|
125
|
+
# Mix orgs and users inline
|
|
126
|
+
semhound acme-corp octocat --rules-dir ./rules
|
|
127
|
+
|
|
128
|
+
# Load orgs from a file
|
|
129
|
+
semhound --orgs-file orgs.txt --rules-dir ./rules
|
|
130
|
+
|
|
131
|
+
# Org file + inline username
|
|
132
|
+
semhound octocat --orgs-file orgs.txt --rules-dir ./rules
|
|
133
|
+
|
|
134
|
+
# Remote rule — no local files needed
|
|
135
|
+
semhound acme-corp \
|
|
136
|
+
--rules-url https://raw.githubusercontent.com/example/rules/main/sqli.yaml
|
|
137
|
+
|
|
138
|
+
# Full sweep: org file + remote rule + AI triage + 10 threads
|
|
139
|
+
semhound --orgs-file orgs.txt \
|
|
140
|
+
--rules-dir ./rules \
|
|
141
|
+
--rules-url https://raw.githubusercontent.com/example/rules/main/extra.yaml \
|
|
142
|
+
--ai-config ai.config \
|
|
143
|
+
--threads 10
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
`orgs.txt` — one org name or username per line; blank lines and `#` comments ignored.
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
## Semgrep Rules
|
|
151
|
+
|
|
152
|
+
Rules come from a local directory (`--rules-dir`), one or more HTTPS URLs (`--rules-url`), or both. At least one source is required. Rules must be valid Semgrep `.yaml` files. Files downloaded via `--rules-url` are placed in a temporary directory and deleted after the scan.
|
|
153
|
+
|
|
154
|
+
---
|
|
155
|
+
|
|
156
|
+
## AI Analysis (optional)
|
|
157
|
+
|
|
158
|
+
Copy `ai.config.example` to `ai.config`, fill in your credentials, and pass `--ai-config ai.config`. Each finding is sent to the model, which returns a **confidence score** (0–100) and a **true positive** verdict. Without `--ai-config` those columns are left blank.
|
|
159
|
+
|
|
160
|
+
### Supported providers
|
|
161
|
+
|
|
162
|
+
| Provider | Required fields | Notes |
|
|
163
|
+
|----------|----------------|-------|
|
|
164
|
+
| `claude` | `api_key`, `model` | Anthropic direct API |
|
|
165
|
+
| `openai` | `api_key`, `model` | OpenAI API |
|
|
166
|
+
| `gemini` | `api_key`, `model` | Google Gemini API |
|
|
167
|
+
| `bedrock` | `aws_region`, `model` | Uses standard AWS credential chain — no API key needed |
|
|
168
|
+
|
|
169
|
+
The `system_prompt` field is optional but strongly recommended — tailoring it to your scenario produces sharper verdicts. Use the examples below as a starting point.
|
|
170
|
+
|
|
171
|
+
### Example: Bug bounty SQL injection sweep — AWS Bedrock
|
|
172
|
+
|
|
173
|
+
No API key needed; credentials come from `~/.aws/credentials`, an IAM role, SSO, etc. Find model IDs in the AWS Console under **Bedrock → Model access**.
|
|
174
|
+
|
|
175
|
+
```yaml
|
|
176
|
+
provider: bedrock
|
|
177
|
+
aws_profile: default # omit to use the default credential chain
|
|
178
|
+
aws_region: us-east-1
|
|
179
|
+
model: anthropic.claude-3-5-sonnet-20241022-v2:0
|
|
180
|
+
|
|
181
|
+
system_prompt: >
|
|
182
|
+
You are an application security engineer triaging SQL injection findings
|
|
183
|
+
flagged by a Semgrep rule after a bug bounty report.
|
|
184
|
+
For each code snippet, assess whether user-controlled input reaches a
|
|
185
|
+
database query without going through a parameterised query or ORM.
|
|
186
|
+
Rate confidence based on how directly the input flows into the query.
|
|
187
|
+
Be concise and precise.
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
### Example: Zero-day library sweep — OpenAI
|
|
191
|
+
|
|
192
|
+
```yaml
|
|
193
|
+
provider: openai
|
|
194
|
+
api_key: sk-...
|
|
195
|
+
model: gpt-4o
|
|
196
|
+
|
|
197
|
+
system_prompt: >
|
|
198
|
+
You are an application security engineer triaging findings from a
|
|
199
|
+
zero-day sweep across the org.
|
|
200
|
+
A CVE has been published for a specific function in a third-party library.
|
|
201
|
+
For each code snippet, assess whether the flagged function call matches the
|
|
202
|
+
vulnerable usage pattern described in the CVE, and whether any caller-side
|
|
203
|
+
mitigations such as input validation or version guards are already present.
|
|
204
|
+
Prioritise findings where the dangerous call is reachable with no mitigations.
|
|
205
|
+
Be concise and precise.
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
**Live triage output:**
|
|
209
|
+
|
|
210
|
+
```
|
|
211
|
+
[analyze] my-repo — sqli-raw-format
|
|
212
|
+
[ai] my-repo — sqli-raw-format | confidence=91 true_positive=true
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
If a provider returns an unparseable response, the tool retries up to 3 times with exponential backoff (1 s → 2 s → 4 s) before recording `ERROR`.
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
## Output
|
|
220
|
+
|
|
221
|
+
Results are written to `<target>_scan.csv`. Pass `--sarif` to also produce `<target>_scan.sarif`.
|
|
222
|
+
|
|
223
|
+
| Column | Description |
|
|
224
|
+
|--------|-------------|
|
|
225
|
+
| Repository | Repository name |
|
|
226
|
+
| Rule | Semgrep rule ID |
|
|
227
|
+
| Issue Description | Rule message |
|
|
228
|
+
| Location | GitHub permalink to the exact line |
|
|
229
|
+
| Confidence Score (AI) | 0–100 (blank without `--ai-config`) |
|
|
230
|
+
| True Positive (AI) | `true` / `false` (blank without `--ai-config`) |
|
|
231
|
+
|
|
232
|
+
---
|
|
233
|
+
|
|
234
|
+
## FAQ
|
|
235
|
+
|
|
236
|
+
**Who is this tool for?**
|
|
237
|
+
semhound is built for **Purple and Blue teams** — security engineers who need to identify vulnerable code patterns at org scale, not one repo at a time. Whether you're responding to a bug bounty report, sweeping for a CVE across an acquired company's codebase, or enforcing a security pattern across 200 repos, semhound gives you the answer in one command.
|
|
238
|
+
|
|
239
|
+
**What authentication is needed?**
|
|
240
|
+
semhound uses two mechanisms. `gh auth login` creates an OAuth token used for repository discovery via `gh repo list`. Cloning uses SSH with a key registered in your GitHub account — preferred over HTTPS because keys don't expire, are never embedded in URLs, and incur no credential-helper overhead when cloning hundreds of repos in parallel.
|
|
241
|
+
|
|
242
|
+
**Does it scan git history?**
|
|
243
|
+
No. semhound does a shallow clone of the default branch (`--depth 1`) and scans the current state of the code. It is designed for broad, fast coverage across many repos, not deep forensic history analysis.
|
|
244
|
+
|
|
245
|
+
**How is this different from TruffleHog or Gitleaks?**
|
|
246
|
+
TruffleHog and Gitleaks are purpose-built secrets scanners — they detect API keys, tokens, and credentials using their own built-in signatures. semhound is not a secrets scanner. It runs any Semgrep rule you give it — security vulnerabilities, dangerous function calls, vulnerable dependency versions, custom code patterns. Use TruffleHog for secrets; use semhound when you need to hunt for arbitrary code patterns at org scale.
|
|
247
|
+
|
|
248
|
+
**How is this different from running Semgrep directly?**
|
|
249
|
+
Semgrep is a scanner; it needs a target. Running it directly means you clone each repo yourself, run the command, collect results, repeat. semhound wraps that entire loop — it discovers every repo in an org or user account, clones them in parallel, runs your rules across all of them, and writes a consolidated CSV. One command replaces what would otherwise be a shell script across dozens or hundreds of repos.
|
|
250
|
+
|
|
251
|
+
**How is this different from GitHub Advanced Security (GHAS)?**
|
|
252
|
+
GHAS must be enabled repository by repository and requires a GitHub Enterprise licence for private repos. semhound works with any GitHub account, needs no per-repo setup, and lets you bring your own Semgrep rules. It runs on demand from anywhere, against any org or user you have access to.
|
|
253
|
+
|
|
254
|
+
**How is this different from git-secrets?**
|
|
255
|
+
git-secrets is a pre-commit hook that stops developers from committing secrets at commit time. semhound is a retrospective org-wide scanner — it sweeps repositories that already exist, across teams and orgs, looking for patterns you define. Different problem, different tool.
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
semhound/__init__.py,sha256=kUR5RAFc7HCeiqdlX36dZOHkUI5wI6V_43RpEcD8b-0,22
|
|
2
|
+
semhound/ai_client.py,sha256=Tax77vJWEAaCJuONKAIX7gVQ9dHvrWC46NlEcSu0P-A,5980
|
|
3
|
+
semhound/cli.py,sha256=sa0zPH5aotLdvzWdPy_1K5S3STKXKqrbylC0BEDE47I,4349
|
|
4
|
+
semhound/scanner.py,sha256=-lMFpjq1idw8dOmcRUaHaK5qDOiZMme3KWOSun6fA-A,17113
|
|
5
|
+
semhound-0.1.0.dist-info/licenses/LICENSE,sha256=rFlLsEXWg6OiR87ZpTTpbfrnDONkPDo4xfO2RLtFJ0I,1070
|
|
6
|
+
semhound-0.1.0.dist-info/METADATA,sha256=t36kR4EHHm_mceCz3_YWQk9iGCZ-NJRaDQzem694-0k,12142
|
|
7
|
+
semhound-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
8
|
+
semhound-0.1.0.dist-info/entry_points.txt,sha256=4z61JRy6aNlmbI8lUSAnUVUopYayblJVC6BCmrlQzQg,47
|
|
9
|
+
semhound-0.1.0.dist-info/top_level.txt,sha256=-jRzGRa_6b3W3moaNb6RxvSJzJ4JZHKATOKcyxc5C0E,9
|
|
10
|
+
semhound-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Rohit Salecha
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
semhound
|