semhound 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
semhound-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Rohit Salecha
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,255 @@
1
+ Metadata-Version: 2.4
2
+ Name: semhound
3
+ Version: 0.1.0
4
+ Summary: Scan every repository across your GitHub organisations using Semgrep rules, with optional AI triage
5
+ Author-email: Rohit Salecha <i@rohitsalecha.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/salecharohit/semhound
8
+ Project-URL: Issues, https://github.com/salecharohit/semhound/issues
9
+ Keywords: security,semgrep,github,appsec,threat-hunting,sast
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Environment :: Console
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Information Technology
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Security
22
+ Classifier: Topic :: Software Development :: Quality Assurance
23
+ Requires-Python: >=3.9
24
+ Description-Content-Type: text/markdown
25
+ License-File: LICENSE
26
+ Requires-Dist: tqdm>=4.66
27
+ Requires-Dist: pyyaml>=6.0
28
+ Requires-Dist: anthropic>=0.25
29
+ Requires-Dist: google-generativeai>=0.5
30
+ Requires-Dist: openai>=1.30
31
+ Requires-Dist: boto3>=1.34
32
+ Dynamic: license-file
33
+
34
+ # semhound
35
+
36
+ [![Release](https://github.com/salecharohit/semhound/actions/workflows/release.yml/badge.svg)](https://github.com/salecharohit/semhound/actions/workflows/release.yml)
37
+ [![PyPI version](https://img.shields.io/pypi/v/semhound)](https://pypi.org/project/semhound)
38
+ [![Python versions](https://img.shields.io/pypi/pyversions/semhound)](https://pypi.org/project/semhound)
39
+ [![PyPI downloads](https://img.shields.io/pypi/dm/semhound)](https://pypi.org/project/semhound)
40
+ [![License: MIT](https://img.shields.io/badge/license-MIT-blue)](LICENSE)
41
+
42
+ **semhound** automates Semgrep scanning at org scale — you bring the rules, it handles discovery, cloning, scanning, and reporting across every repository in one or more GitHub organisations or user accounts. Optionally route each finding through an AI provider to triage true vs. false positives with a customised prompt.
43
+
44
+ Just like [TruffleHog](https://github.com/trufflesecurity/trufflehog) sweeps repos for secrets, semhound sweeps repos for any code pattern you define.
45
+
46
+ ---
47
+
48
+ ## How it works
49
+
50
+ 1. **Discover** — uses `gh repo list` to find every repository for each target (org or user)
51
+ 2. **Clone** — shallow-clones each repo in parallel (`--depth 1`) via SSH
52
+ 3. **Scan** — runs your Semgrep rules across every cloned repo
53
+ 4. **Report** — writes a consolidated CSV (and optional SARIF) per target, with GitHub permalinks to every finding
54
+
55
+ ---
56
+
57
+ ## Use-cases
58
+
59
+ **Bug bounty SQL injection — identify the same pattern across all repos**
60
+ A bug bounty report flagged a SQL injection in one of your apps. Write a Semgrep rule for that pattern and sweep your entire org to find every other repo where the same issue exists.
61
+
62
+ **Zero-day in a third-party OSS library — find every repo still running the vulnerable version**
63
+ A zero-day drops for a widely used library — think log4j. Write a Semgrep rule that matches the vulnerable version string in dependency files and sweep all your orgs in one pass. You get an immediate list of every repo still running the vulnerable version so you can prioritise upgrades before the exploit is weaponised.
64
+
65
+ ---
66
+
67
+ ## Prerequisites
68
+
69
+ The following tools must be installed and on your `PATH`. semhound checks for all of them at startup and prints platform-specific install instructions for anything missing.
70
+
71
+ | Tool | macOS | Linux | Windows |
72
+ |------|-------|-------|---------|
73
+ | [GitHub CLI `gh`](https://cli.github.com) — repo discovery | `brew install gh` | [install guide](https://github.com/cli/cli/blob/trunk/docs/install_linux.md) | `winget install --id GitHub.cli` |
74
+ | `git` — shallow cloning | `brew install git` | `sudo apt install git` | `winget install --id Git.Git` |
75
+ | [Semgrep](https://semgrep.dev) — static analysis | `brew install semgrep` | `pip install semgrep` | `pip install semgrep` |
76
+ | OpenSSH — cloning via SSH | ships with macOS | `sudo apt install openssh-client` | ships with Windows 10/11 |
77
+
78
+ **Authenticate the GitHub CLI** (once):
79
+
80
+ ```bash
81
+ gh auth login
82
+ ```
83
+
84
+ **Register an SSH key** with your GitHub account (once) so semhound can clone repositories over SSH, including private ones:
85
+ [docs.github.com/en/authentication/connecting-to-github-with-ssh](https://docs.github.com/en/authentication/connecting-to-github-with-ssh)
86
+
87
+ ---
88
+
89
+ ## Installation
90
+
91
+ ```bash
92
+ pip install semhound
93
+ ```
94
+
95
+ **From source** (for local development):
96
+
97
+ ```bash
98
+ git clone git@github.com:salecharohit/semhound.git
99
+ cd semhound
100
+ pip install -e .
101
+ ```
102
+
103
+ ---
104
+
105
+ ## Usage
106
+
107
+ ```
108
+ semhound [ORG_OR_USER ...] [--orgs-file PATH]
109
+ --rules-dir PATH Local folder of Semgrep .yaml rule files
110
+ --rules-url URL HTTPS URL of a Semgrep rule file (repeatable)
111
+ --ai-config PATH AI provider config file (omit to skip AI triage)
112
+ --threads N Parallel worker threads per target (default: 5)
113
+ --sarif Also write a SARIF 2.1.0 report alongside the CSV
114
+ ```
115
+
116
+ Pass one or more GitHub org names or usernames inline, load a list from `--orgs-file`, or mix both. All targets are deduplicated and scanned sequentially; each produces its own `<target>_scan.csv`.
117
+
118
+ ```bash
119
+ # Single org
120
+ semhound acme-corp --rules-dir ./rules
121
+
122
+ # Single user account
123
+ semhound octocat --rules-dir ./rules
124
+
125
+ # Mix orgs and users inline
126
+ semhound acme-corp octocat --rules-dir ./rules
127
+
128
+ # Load orgs from a file
129
+ semhound --orgs-file orgs.txt --rules-dir ./rules
130
+
131
+ # Org file + inline username
132
+ semhound octocat --orgs-file orgs.txt --rules-dir ./rules
133
+
134
+ # Remote rule — no local files needed
135
+ semhound acme-corp \
136
+ --rules-url https://raw.githubusercontent.com/example/rules/main/sqli.yaml
137
+
138
+ # Full sweep: org file + remote rule + AI triage + 10 threads
139
+ semhound --orgs-file orgs.txt \
140
+ --rules-dir ./rules \
141
+ --rules-url https://raw.githubusercontent.com/example/rules/main/extra.yaml \
142
+ --ai-config ai.config \
143
+ --threads 10
144
+ ```
145
+
146
+ `orgs.txt` — one org name or username per line; blank lines and `#` comments are ignored.
147
+
148
+ ---
149
+
150
+ ## Semgrep Rules
151
+
152
+ Rules come from a local directory (`--rules-dir`), one or more HTTPS URLs (`--rules-url`), or both. At least one source is required. Rules must be valid Semgrep `.yaml` files. Files downloaded via `--rules-url` are placed in a temporary directory and deleted after the scan.
153
+
154
+ ---
155
+
156
+ ## AI Analysis (optional)
157
+
158
+ Copy `ai.config.example` to `ai.config`, fill in your credentials, and pass `--ai-config ai.config`. Each finding is sent to the model, which returns a **confidence score** (0–100) and a **true positive** verdict. Without `--ai-config` those columns are left blank.
159
+
160
+ ### Supported providers
161
+
162
+ | Provider | Required fields | Notes |
163
+ |----------|----------------|-------|
164
+ | `claude` | `api_key`, `model` | Anthropic direct API |
165
+ | `openai` | `api_key`, `model` | OpenAI API |
166
+ | `gemini` | `api_key`, `model` | Google Gemini API |
167
+ | `bedrock` | `aws_region`, `model` | Uses standard AWS credential chain — no API key needed |
168
+
169
+ The `system_prompt` field is optional but strongly recommended — tailoring it to your scenario produces sharper verdicts. Use the examples below as a starting point.
170
+
171
+ ### Example: Bug bounty SQL injection sweep — AWS Bedrock
172
+
173
+ No API key needed; credentials come from `~/.aws/credentials`, an IAM role, SSO, etc. Find model IDs in the AWS Console under **Bedrock → Model access**.
174
+
175
+ ```yaml
176
+ provider: bedrock
177
+ aws_profile: default # omit to use the default credential chain
178
+ aws_region: us-east-1
179
+ model: anthropic.claude-3-5-sonnet-20241022-v2:0
180
+
181
+ system_prompt: >
182
+ You are an application security engineer triaging SQL injection findings
183
+ flagged by a Semgrep rule after a bug bounty report.
184
+ For each code snippet, assess whether user-controlled input reaches a
185
+ database query without going through a parameterised query or ORM.
186
+ Rate confidence based on how directly the input flows into the query.
187
+ Be concise and precise.
188
+ ```
189
+
190
+ ### Example: Zero-day library sweep — OpenAI
191
+
192
+ ```yaml
193
+ provider: openai
194
+ api_key: sk-...
195
+ model: gpt-4o
196
+
197
+ system_prompt: >
198
+ You are an application security engineer triaging findings from a
199
+ zero-day sweep across the org.
200
+ A CVE has been published for a specific function in a third-party library.
201
+ For each code snippet, assess whether the flagged function call matches the
202
+ vulnerable usage pattern described in the CVE, and whether any caller-side
203
+ mitigations such as input validation or version guards are already present.
204
+ Prioritise findings where the dangerous call is reachable with no mitigations.
205
+ Be concise and precise.
206
+ ```
207
+
208
+ **Live triage output:**
209
+
210
+ ```
211
+ [analyze] my-repo — sqli-raw-format
212
+ [ai] my-repo — sqli-raw-format | confidence=91 true_positive=true
213
+ ```
214
+
215
+ If a provider returns an unparseable response, the tool retries up to 3 times with exponential backoff (1 s → 2 s → 4 s) before recording `ERROR`.
216
+
217
+ ---
218
+
219
+ ## Output
220
+
221
+ Results are written to `<target>_scan.csv`. Pass `--sarif` to also produce `<target>_scan.sarif`.
222
+
223
+ | Column | Description |
224
+ |--------|-------------|
225
+ | Repository | Repository name |
226
+ | Rule | Semgrep rule ID |
227
+ | Issue Description | Rule message |
228
+ | Location | GitHub permalink to the exact line |
229
+ | Confidence Score (AI) | 0–100 (blank without `--ai-config`) |
230
+ | True Positive (AI) | `true` / `false` (blank without `--ai-config`) |
231
+
232
+ ---
233
+
234
+ ## FAQ
235
+
236
+ **Who is this tool for?**
237
+ semhound is built for **Purple and Blue teams** — security engineers who need to identify vulnerable code patterns at org scale, not one repo at a time. Whether you're responding to a bug bounty report, sweeping for a CVE across an acquired company's codebase, or enforcing a security pattern across 200 repos, semhound gives you the answer in one command.
238
+
239
+ **What authentication is needed?**
240
+ semhound uses two mechanisms. `gh auth login` creates an OAuth token used for repository discovery via `gh repo list`. Cloning uses SSH with a key registered in your GitHub account — preferred over HTTPS because keys don't expire, are never embedded in URLs, and have no credential helper overhead when cloning hundreds of repos in parallel.
241
+
242
+ **Does it scan git history?**
243
+ No. semhound does a shallow clone of the default branch (`--depth 1`) and scans the current state of the code. It is designed for broad, fast coverage across many repos, not deep forensic history analysis.
244
+
245
+ **How is this different from TruffleHog or Gitleaks?**
246
+ TruffleHog and Gitleaks are purpose-built secrets scanners — they detect API keys, tokens, and credentials using their own built-in signatures. semhound is not a secrets scanner. It runs any Semgrep rule you give it, whether that rule targets security vulnerabilities, dangerous function calls, vulnerable dependency versions, or custom code patterns. Use TruffleHog or Gitleaks for secrets; use semhound when you need to hunt for arbitrary code patterns at org scale.
247
+
248
+ **How is this different from running Semgrep directly?**
249
+ Semgrep is a scanner; it needs a target. Running it directly means you clone each repo yourself, run the command, collect results, repeat. semhound wraps that entire loop — it discovers every repo in an org or user account, clones them in parallel, runs your rules across all of them, and writes a consolidated CSV. One command replaces what would otherwise be a shell script across dozens or hundreds of repos.
250
+
251
+ **How is this different from GitHub Advanced Security (GHAS)?**
252
+ GHAS must be enabled repository by repository and requires a GitHub Enterprise licence for private repos. semhound works with any GitHub account, needs no per-repo setup, and lets you bring your own Semgrep rules. It runs on demand from anywhere, against any org or user you have access to.
253
+
254
+ **How is this different from git-secrets?**
255
+ git-secrets is a pre-commit hook that stops developers from committing secrets at commit time. semhound is a retrospective org-wide scanner — it sweeps repositories that already exist, across teams and orgs, looking for patterns you define. Different problem, different tool.
@@ -0,0 +1,222 @@
1
+ # semhound
2
+
3
+ [![Release](https://github.com/salecharohit/semhound/actions/workflows/release.yml/badge.svg)](https://github.com/salecharohit/semhound/actions/workflows/release.yml)
4
+ [![PyPI version](https://img.shields.io/pypi/v/semhound)](https://pypi.org/project/semhound)
5
+ [![Python versions](https://img.shields.io/pypi/pyversions/semhound)](https://pypi.org/project/semhound)
6
+ [![PyPI downloads](https://img.shields.io/pypi/dm/semhound)](https://pypi.org/project/semhound)
7
+ [![License: MIT](https://img.shields.io/badge/license-MIT-blue)](LICENSE)
8
+
9
+ **semhound** automates Semgrep scanning at org scale — you bring the rules, it handles discovery, cloning, scanning, and reporting across every repository in one or more GitHub organisations or user accounts. Optionally route each finding through an AI provider to triage true vs. false positives with a customised prompt.
10
+
11
+ Just like [TruffleHog](https://github.com/trufflesecurity/trufflehog) sweeps repos for secrets, semhound sweeps repos for any code pattern you define.
12
+
13
+ ---
14
+
15
+ ## How it works
16
+
17
+ 1. **Discover** — uses `gh repo list` to find every repository for each target (org or user)
18
+ 2. **Clone** — shallow-clones each repo in parallel (`--depth 1`) via SSH
19
+ 3. **Scan** — runs your Semgrep rules across every cloned repo
20
+ 4. **Report** — writes a consolidated CSV (and optional SARIF) per target, with GitHub permalinks to every finding
21
+
22
+ ---
23
+
24
+ ## Use-cases
25
+
26
+ **Bug bounty SQL injection — identify the same pattern across all repos**
27
+ A bug bounty report flagged a SQL injection in one of your apps. Write a Semgrep rule for that pattern and sweep your entire org to find every other repo where the same issue exists.
28
+
29
+ **Zero-day in a third-party OSS library — find every repo still running the vulnerable version**
30
+ A zero-day drops for a widely used library — think log4j. Write a Semgrep rule that matches the vulnerable version string in dependency files and sweep all your orgs in one pass. You get an immediate list of every repo still running the vulnerable version so you can prioritise upgrades before the exploit is weaponised.
31
+
32
+ ---
33
+
34
+ ## Prerequisites
35
+
36
+ The following tools must be installed and on your `PATH`. semhound checks for all of them at startup and prints platform-specific install instructions for anything missing.
37
+
38
+ | Tool | macOS | Linux | Windows |
39
+ |------|-------|-------|---------|
40
+ | [GitHub CLI `gh`](https://cli.github.com) — repo discovery | `brew install gh` | [install guide](https://github.com/cli/cli/blob/trunk/docs/install_linux.md) | `winget install --id GitHub.cli` |
41
+ | `git` — shallow cloning | `brew install git` | `sudo apt install git` | `winget install --id Git.Git` |
42
+ | [Semgrep](https://semgrep.dev) — static analysis | `brew install semgrep` | `pip install semgrep` | `pip install semgrep` |
43
+ | OpenSSH — cloning via SSH | ships with macOS | `sudo apt install openssh-client` | ships with Windows 10/11 |
44
+
45
+ **Authenticate the GitHub CLI** (once):
46
+
47
+ ```bash
48
+ gh auth login
49
+ ```
50
+
51
+ **Register an SSH key** with your GitHub account (once) so semhound can clone repositories over SSH, including private ones:
52
+ [docs.github.com/en/authentication/connecting-to-github-with-ssh](https://docs.github.com/en/authentication/connecting-to-github-with-ssh)
53
+
54
+ ---
55
+
56
+ ## Installation
57
+
58
+ ```bash
59
+ pip install semhound
60
+ ```
61
+
62
+ **From source** (for local development):
63
+
64
+ ```bash
65
+ git clone git@github.com:salecharohit/semhound.git
66
+ cd semhound
67
+ pip install -e .
68
+ ```
69
+
70
+ ---
71
+
72
+ ## Usage
73
+
74
+ ```
75
+ semhound [ORG_OR_USER ...] [--orgs-file PATH]
76
+ --rules-dir PATH Local folder of Semgrep .yaml rule files
77
+ --rules-url URL HTTPS URL of a Semgrep rule file (repeatable)
78
+ --ai-config PATH AI provider config file (omit to skip AI triage)
79
+ --threads N Parallel worker threads per target (default: 5)
80
+ --sarif Also write a SARIF 2.1.0 report alongside the CSV
81
+ ```
82
+
83
+ Pass one or more GitHub org names or usernames inline, load a list from `--orgs-file`, or mix both. All targets are deduplicated and scanned sequentially; each produces its own `<target>_scan.csv`.
84
+
85
+ ```bash
86
+ # Single org
87
+ semhound acme-corp --rules-dir ./rules
88
+
89
+ # Single user account
90
+ semhound octocat --rules-dir ./rules
91
+
92
+ # Mix orgs and users inline
93
+ semhound acme-corp octocat --rules-dir ./rules
94
+
95
+ # Load orgs from a file
96
+ semhound --orgs-file orgs.txt --rules-dir ./rules
97
+
98
+ # Org file + inline username
99
+ semhound octocat --orgs-file orgs.txt --rules-dir ./rules
100
+
101
+ # Remote rule — no local files needed
102
+ semhound acme-corp \
103
+ --rules-url https://raw.githubusercontent.com/example/rules/main/sqli.yaml
104
+
105
+ # Full sweep: org file + remote rule + AI triage + 10 threads
106
+ semhound --orgs-file orgs.txt \
107
+ --rules-dir ./rules \
108
+ --rules-url https://raw.githubusercontent.com/example/rules/main/extra.yaml \
109
+ --ai-config ai.config \
110
+ --threads 10
111
+ ```
112
+
113
+ `orgs.txt` — one org name or username per line; blank lines and `#` comments are ignored.
114
+
115
+ ---
116
+
117
+ ## Semgrep Rules
118
+
119
+ Rules come from a local directory (`--rules-dir`), one or more HTTPS URLs (`--rules-url`), or both. At least one source is required. Rules must be valid Semgrep `.yaml` files. Files downloaded via `--rules-url` are placed in a temporary directory and deleted after the scan.
120
+
121
+ ---
122
+
123
+ ## AI Analysis (optional)
124
+
125
+ Copy `ai.config.example` to `ai.config`, fill in your credentials, and pass `--ai-config ai.config`. Each finding is sent to the model, which returns a **confidence score** (0–100) and a **true positive** verdict. Without `--ai-config` those columns are left blank.
126
+
127
+ ### Supported providers
128
+
129
+ | Provider | Required fields | Notes |
130
+ |----------|----------------|-------|
131
+ | `claude` | `api_key`, `model` | Anthropic direct API |
132
+ | `openai` | `api_key`, `model` | OpenAI API |
133
+ | `gemini` | `api_key`, `model` | Google Gemini API |
134
+ | `bedrock` | `aws_region`, `model` | Uses standard AWS credential chain — no API key needed |
135
+
136
+ The `system_prompt` field is optional but strongly recommended — tailoring it to your scenario produces sharper verdicts. Use the examples below as a starting point.
137
+
138
+ ### Example: Bug bounty SQL injection sweep — AWS Bedrock
139
+
140
+ No API key needed; credentials come from `~/.aws/credentials`, an IAM role, SSO, etc. Find model IDs in the AWS Console under **Bedrock → Model access**.
141
+
142
+ ```yaml
143
+ provider: bedrock
144
+ aws_profile: default # omit to use the default credential chain
145
+ aws_region: us-east-1
146
+ model: anthropic.claude-3-5-sonnet-20241022-v2:0
147
+
148
+ system_prompt: >
149
+ You are an application security engineer triaging SQL injection findings
150
+ flagged by a Semgrep rule after a bug bounty report.
151
+ For each code snippet, assess whether user-controlled input reaches a
152
+ database query without going through a parameterised query or ORM.
153
+ Rate confidence based on how directly the input flows into the query.
154
+ Be concise and precise.
155
+ ```
156
+
157
+ ### Example: Zero-day library sweep — OpenAI
158
+
159
+ ```yaml
160
+ provider: openai
161
+ api_key: sk-...
162
+ model: gpt-4o
163
+
164
+ system_prompt: >
165
+ You are an application security engineer triaging findings from a
166
+ zero-day sweep across the org.
167
+ A CVE has been published for a specific function in a third-party library.
168
+ For each code snippet, assess whether the flagged function call matches the
169
+ vulnerable usage pattern described in the CVE, and whether any caller-side
170
+ mitigations such as input validation or version guards are already present.
171
+ Prioritise findings where the dangerous call is reachable with no mitigations.
172
+ Be concise and precise.
173
+ ```
174
+
175
+ **Live triage output:**
176
+
177
+ ```
178
+ [analyze] my-repo — sqli-raw-format
179
+ [ai] my-repo — sqli-raw-format | confidence=91 true_positive=true
180
+ ```
181
+
182
+ If a provider returns an unparseable response, the tool retries up to 3 times with exponential backoff (1 s → 2 s → 4 s) before recording `ERROR`.
183
+
184
+ ---
185
+
186
+ ## Output
187
+
188
+ Results are written to `<target>_scan.csv`. Pass `--sarif` to also produce `<target>_scan.sarif`.
189
+
190
+ | Column | Description |
191
+ |--------|-------------|
192
+ | Repository | Repository name |
193
+ | Rule | Semgrep rule ID |
194
+ | Issue Description | Rule message |
195
+ | Location | GitHub permalink to the exact line |
196
+ | Confidence Score (AI) | 0–100 (blank without `--ai-config`) |
197
+ | True Positive (AI) | `true` / `false` (blank without `--ai-config`) |
198
+
199
+ ---
200
+
201
+ ## FAQ
202
+
203
+ **Who is this tool for?**
204
+ semhound is built for **Purple and Blue teams** — security engineers who need to identify vulnerable code patterns at org scale, not one repo at a time. Whether you're responding to a bug bounty report, sweeping for a CVE across an acquired company's codebase, or enforcing a security pattern across 200 repos, semhound gives you the answer in one command.
205
+
206
+ **What authentication is needed?**
207
+ semhound uses two mechanisms. `gh auth login` creates an OAuth token used for repository discovery via `gh repo list`. Cloning uses SSH with a key registered in your GitHub account — preferred over HTTPS because keys don't expire, are never embedded in URLs, and have no credential helper overhead when cloning hundreds of repos in parallel.
208
+
209
+ **Does it scan git history?**
210
+ No. semhound does a shallow clone of the default branch (`--depth 1`) and scans the current state of the code. It is designed for broad, fast coverage across many repos, not deep forensic history analysis.
211
+
212
+ **How is this different from TruffleHog or Gitleaks?**
213
+ TruffleHog and Gitleaks are purpose-built secrets scanners — they detect API keys, tokens, and credentials using their own built-in signatures. semhound is not a secrets scanner. It runs any Semgrep rule you give it, whether that rule targets security vulnerabilities, dangerous function calls, vulnerable dependency versions, or custom code patterns. Use TruffleHog or Gitleaks for secrets; use semhound when you need to hunt for arbitrary code patterns at org scale.
214
+
215
+ **How is this different from running Semgrep directly?**
216
+ Semgrep is a scanner; it needs a target. Running it directly means you clone each repo yourself, run the command, collect results, repeat. semhound wraps that entire loop — it discovers every repo in an org or user account, clones them in parallel, runs your rules across all of them, and writes a consolidated CSV. One command replaces what would otherwise be a shell script across dozens or hundreds of repos.
217
+
218
+ **How is this different from GitHub Advanced Security (GHAS)?**
219
+ GHAS must be enabled repository by repository and requires a GitHub Enterprise licence for private repos. semhound works with any GitHub account, needs no per-repo setup, and lets you bring your own Semgrep rules. It runs on demand from anywhere, against any org or user you have access to.
220
+
221
+ **How is this different from git-secrets?**
222
+ git-secrets is a pre-commit hook that stops developers from committing secrets at commit time. semhound is a retrospective org-wide scanner — it sweeps repositories that already exist, across teams and orgs, looking for patterns you define. Different problem, different tool.
@@ -0,0 +1,46 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "semhound"
7
+ version = "0.1.0"
8
+ description = "Scan every repository across your GitHub organisations using Semgrep rules, with optional AI triage"
9
+ readme = "README.md"
10
+ license = { text = "MIT" }
11
+ authors = [{ name = "Rohit Salecha", email = "i@rohitsalecha.com" }]
12
+ requires-python = ">=3.9"
13
+ keywords = ["security", "semgrep", "github", "appsec", "threat-hunting", "sast"]
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "Environment :: Console",
17
+ "Intended Audience :: Developers",
18
+ "Intended Audience :: Information Technology",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Operating System :: OS Independent",
21
+ "Programming Language :: Python :: 3",
22
+ "Programming Language :: Python :: 3.9",
23
+ "Programming Language :: Python :: 3.10",
24
+ "Programming Language :: Python :: 3.11",
25
+ "Programming Language :: Python :: 3.12",
26
+ "Topic :: Security",
27
+ "Topic :: Software Development :: Quality Assurance",
28
+ ]
29
+ dependencies = [
30
+ "tqdm>=4.66",
31
+ "pyyaml>=6.0",
32
+ "anthropic>=0.25",
33
+ "google-generativeai>=0.5",
34
+ "openai>=1.30",
35
+ "boto3>=1.34",
36
+ ]
37
+
38
+ [project.urls]
39
+ Homepage = "https://github.com/salecharohit/semhound"
40
+ Issues = "https://github.com/salecharohit/semhound/issues"
41
+
42
+ [project.scripts]
43
+ semhound = "semhound.cli:main"
44
+
45
+ [tool.setuptools.packages.find]
46
+ where = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"