gitcolombo 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2020-2026 Soxoj
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,151 @@
1
+ Metadata-Version: 2.4
2
+ Name: gitcolombo
3
+ Version: 0.3.0
4
+ Summary: 🕵️ OSINT tool to extract identities (names, emails, GitHub logins) from git repositories and the GitHub API.
5
+ License: MIT
6
+ License-File: LICENSE
7
+ Keywords: osint,git,github,email,investigation,recon,doxing
8
+ Author: Soxoj
9
+ Author-email: soxoj@protonmail.com
10
+ Requires-Python: >=3.10,<4.0
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Information Technology
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Natural Language :: English
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Programming Language :: Python :: 3.14
22
+ Classifier: Topic :: Security
23
+ Classifier: Topic :: Software Development :: Version Control :: Git
24
+ Project-URL: Bug Tracker, https://github.com/soxoj/gitcolombo/issues
25
+ Project-URL: Homepage, https://pypi.org/project/gitcolombo
26
+ Project-URL: Repository, https://github.com/soxoj/gitcolombo
27
+ Project-URL: Web version, https://gitcolombo.soxoj.com
28
+ Description-Content-Type: text/markdown
29
+
30
+ # Gitcolombo
31
+
32
+ <p align="center">
33
+ <img src="https://telegra.ph/file/0730b125282266989e861.png" alt="Gitcolombo" width="320">
34
+ </p>
35
+
36
+ OSINT tool that extracts identities — names, emails, and links between
37
+ seemingly unrelated accounts — from git repositories and GitHub.
38
+
39
+ - **Python CLI** (`gitcolombo.py`) — clones repos, walks `git log`, and can
40
+ call the GitHub API for richer signals.
41
+ - **Web version** (`gitcolombo.html`) — a single static HTML file; open it
42
+ in a browser and query the GitHub API directly, no install.
43
+
44
+ For the full breakdown of where each email/name comes from
45
+ (PGP keys, public events, commit search, commit-message trailers, etc.)
46
+ see **[docs.md](./docs.md)**.
47
+
48
+ ## Web version
49
+
50
+ Hosted at **<https://gitcolombo.soxoj.com>** — or open `gitcolombo.html`
51
+ locally. A single static HTML file that queries the GitHub API straight
52
+ from your browser; no install, no backend.
53
+
54
+ <p align="center">
55
+ <img src="gitcolombo.png" alt="Gitcolombo web version" width="640">
56
+ </p>
57
+
58
+ ## Install
59
+
60
+ Requires Python 3.10+ and a working `git` binary. No third-party
61
+ Python dependencies.
62
+
63
+ ```sh
64
+ pip install gitcolombo
65
+ ```
66
+
67
+ Or from source:
68
+
69
+ ```sh
70
+ git clone https://github.com/Soxoj/gitcolombo
71
+ cd gitcolombo
72
+ pip install -e .
73
+ ```
74
+
75
+ ## Usage
76
+
77
+ ```sh
78
+ # from any git URL
79
+ gitcolombo -u https://github.com/Soxoj/maigret
80
+
81
+ # from a local directory, recursively
82
+ gitcolombo -d ./maigret -r
83
+
84
+ # clone and scan every public repo of a GitHub user/org
85
+ gitcolombo --nickname octocat
86
+
87
+ # API-only: find emails for a GitHub username without cloning
88
+ gitcolombo --search Soxoj
89
+
90
+ # change where remote repos get cloned (default: ./repos)
91
+ gitcolombo -u https://github.com/Soxoj/maigret --repos-dir ./clones
92
+ ```
93
+
94
+ `python -m gitcolombo` works equivalently if you'd rather not put the
95
+ script on `$PATH`.
96
+
97
+ Remote repositories are cloned into `./repos/` by default; override
98
+ with `--repos-dir`. For batch cloning from GitLab and Bitbucket groups,
99
+ use [ghorg](https://github.com/gabrie30/ghorg).
100
+
101
+ ## Output
102
+
103
+ - Per-person details: name, email, author/committer counts, and other
104
+ identities that may belong to the same person.
105
+ - Emails that share a name.
106
+ - Different names tied to the same email.
107
+ - General statistics across the scanned repos.
108
+
109
+ ## Why it works
110
+
111
+ Developers often commit with one identity (e.g. work account), then
112
+ switch to another (e.g. personal account) and run `git commit --amend`,
113
+ forgetting that this rewrites the *committer* but leaves the original
114
+ *author* in place. The two roles drift apart, and that mismatch is
115
+ exactly what gitcolombo correlates.
116
+
117
+ Short explainer on author vs. committer:
118
+ <https://stackoverflow.com/questions/18750808/difference-between-author-and-committer-in-git>
119
+
120
+ ## Testing
121
+
122
+ Stdlib-only test suite — no third-party dependencies. From the repo root
123
+ (after `pip install -e .`):
124
+
125
+ ```sh
126
+ python3 -m unittest test_gitcolombo -v
127
+ ```
128
+
129
+ The end-to-end test creates a real git repository in a temp directory,
130
+ so a working `git` binary is required (the test is skipped if `git` is
131
+ missing).
132
+
133
+ Tests run on every push and pull request via GitHub Actions
134
+ (`.github/workflows/tests.yml`) across Python 3.10–3.13.
135
+
136
+ ## Further reading
137
+
138
+ - [docs.md](./docs.md) — extraction methods, ranking, filters, rate limits
139
+ - [RUS] <https://telegra.ph/Gitcolombo---OSINT-v-GitHub-03-02>
140
+
141
+ ## Roadmap
142
+
143
+ - [x] Total statistics for repos in a directory
144
+ - [x] GitHub support: clone all repos from account/group
145
+ - [x] GitHub support: extract links to accounts from commit info
146
+ - [x] GitHub support: API pagination
147
+ - [x] Exclude "system" accounts (e.g. `noreply@github.com`, `@users.noreply.github.com`)
148
+ - [ ] Reverse mapping email → names (currently only name → emails)
149
+ - [ ] Probabilistic graph links based on shared names/emails and Levenshtein distance
150
+ - [ ] Other popular git platforms: GitLab, Bitbucket
151
+
@@ -0,0 +1,121 @@
1
+ # Gitcolombo
2
+
3
+ <p align="center">
4
+ <img src="https://telegra.ph/file/0730b125282266989e861.png" alt="Gitcolombo" width="320">
5
+ </p>
6
+
7
+ OSINT tool that extracts identities — names, emails, and links between
8
+ seemingly unrelated accounts — from git repositories and GitHub.
9
+
10
+ - **Python CLI** (`gitcolombo.py`) — clones repos, walks `git log`, and can
11
+ call the GitHub API for richer signals.
12
+ - **Web version** (`gitcolombo.html`) — a single static HTML file; open it
13
+ in a browser and query the GitHub API directly, no install.
14
+
15
+ For the full breakdown of where each email/name comes from
16
+ (PGP keys, public events, commit search, commit-message trailers, etc.)
17
+ see **[docs.md](./docs.md)**.
18
+
19
+ ## Web version
20
+
21
+ Hosted at **<https://gitcolombo.soxoj.com>** — or open `gitcolombo.html`
22
+ locally. A single static HTML file that queries the GitHub API straight
23
+ from your browser; no install, no backend.
24
+
25
+ <p align="center">
26
+ <img src="gitcolombo.png" alt="Gitcolombo web version" width="640">
27
+ </p>
28
+
29
+ ## Install
30
+
31
+ Requires Python 3.10+ and a working `git` binary. No third-party
32
+ Python dependencies.
33
+
34
+ ```sh
35
+ pip install gitcolombo
36
+ ```
37
+
38
+ Or from source:
39
+
40
+ ```sh
41
+ git clone https://github.com/Soxoj/gitcolombo
42
+ cd gitcolombo
43
+ pip install -e .
44
+ ```
45
+
46
+ ## Usage
47
+
48
+ ```sh
49
+ # from any git URL
50
+ gitcolombo -u https://github.com/Soxoj/maigret
51
+
52
+ # from a local directory, recursively
53
+ gitcolombo -d ./maigret -r
54
+
55
+ # clone and scan every public repo of a GitHub user/org
56
+ gitcolombo --nickname octocat
57
+
58
+ # API-only: find emails for a GitHub username without cloning
59
+ gitcolombo --search Soxoj
60
+
61
+ # change where remote repos get cloned (default: ./repos)
62
+ gitcolombo -u https://github.com/Soxoj/maigret --repos-dir ./clones
63
+ ```
64
+
65
+ `python -m gitcolombo` works equivalently if you'd rather not put the
66
+ script on `$PATH`.
67
+
68
+ Remote repositories are cloned into `./repos/` by default; override
69
+ with `--repos-dir`. For batch cloning from GitLab and Bitbucket groups,
70
+ use [ghorg](https://github.com/gabrie30/ghorg).
71
+
72
+ ## Output
73
+
74
+ - Per-person details: name, email, author/committer counts, and other
75
+ identities that may belong to the same person.
76
+ - Emails that share a name.
77
+ - Different names tied to the same email.
78
+ - General statistics across the scanned repos.
79
+
80
+ ## Why it works
81
+
82
+ Developers often commit with one identity (e.g. work account), then
83
+ switch to another (e.g. personal account) and run `git commit --amend`,
84
+ forgetting that this rewrites the *committer* but leaves the original
85
+ *author* in place. The two roles drift apart, and that mismatch is
86
+ exactly what gitcolombo correlates.
87
+
88
+ Short explainer on author vs. committer:
89
+ <https://stackoverflow.com/questions/18750808/difference-between-author-and-committer-in-git>
90
+
91
+ ## Testing
92
+
93
+ Stdlib-only test suite — no third-party dependencies. From the repo root
94
+ (after `pip install -e .`):
95
+
96
+ ```sh
97
+ python3 -m unittest test_gitcolombo -v
98
+ ```
99
+
100
+ The end-to-end test creates a real git repository in a temp directory,
101
+ so a working `git` binary is required (the test is skipped if `git` is
102
+ missing).
103
+
104
+ Tests run on every push and pull request via GitHub Actions
105
+ (`.github/workflows/tests.yml`) across Python 3.10–3.13.
106
+
107
+ ## Further reading
108
+
109
+ - [docs.md](./docs.md) — extraction methods, ranking, filters, rate limits
110
+ - [RUS] <https://telegra.ph/Gitcolombo---OSINT-v-GitHub-03-02>
111
+
112
+ ## Roadmap
113
+
114
+ - [x] Total statistics for repos in a directory
115
+ - [x] GitHub support: clone all repos from account/group
116
+ - [x] GitHub support: extract links to accounts from commit info
117
+ - [x] GitHub support: API pagination
118
+ - [x] Exclude "system" accounts (e.g. `noreply@github.com`, `@users.noreply.github.com`)
119
+ - [ ] Reverse mapping email → names (currently only name → emails)
120
+ - [ ] Probabilistic graph links based on shared names/emails and Levenshtein distance
121
+ - [ ] Other popular git platforms: GitLab, Bitbucket
@@ -0,0 +1,965 @@
1
+ """Gitcolombo — OSINT tool: extract account info from git repositories.
2
+
3
+ Walks one or more git repositories and aggregates per-person stats
4
+ (name, email, author/committer counts, alternate identities) and detects
5
+ identity overlaps via shared emails or shared names. Optionally resolves
6
+ GitHub logins by scraping commit pages.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ __title__ = "Gitcolombo"
11
+ __author__ = "Soxoj"
12
+ __author_email__ = "soxoj@protonmail.com"
13
+ __license__ = "MIT"
14
+
15
+ from .__version__ import __version__
16
+
17
+ import argparse
18
+ import json
19
+ import logging
20
+ import os
21
+ import re
22
+ import subprocess
23
+ import sys
24
+ import threading
25
+ import time
26
+ import urllib.error
27
+ import urllib.request
28
+ from collections import defaultdict
29
+ from concurrent.futures import ThreadPoolExecutor
30
+ from dataclasses import dataclass, field
31
+ from typing import Iterable
32
+
33
+
34
DELIMITER = "-" * 15

# ---- git log parsing ----
# `git log --pretty` template; each line looks like
#   hash;"author_name author_email";"committer_name committer_email"
GIT_LOG_FORMAT = r'%H;"%an %ae";"%cn %ce"'
GIT_LOG_LINE_RE = re.compile(r'(\w+);"(.*?)";"(.*?)"')
# Splits "name email" on the last whitespace run: everything before is the name.
GIT_NAME_EMAIL_RE = re.compile(r"^(.*?)\s+(\S+)$")
GITHUB_COMMIT_AUTHOR_RE = re.compile(r'<a href=".+?commits\?author=(.+?)"')

# ---- GitHub REST endpoints ----
_GITHUB_API = "https://api.github.com"
GITHUB_USER_URL = _GITHUB_API + "/users/{nickname}"
GITHUB_REPOS_URL = GITHUB_USER_URL + "/repos?per_page={per_page}&page={page}"
GITHUB_GPG_KEYS_URL = GITHUB_USER_URL + "/gpg_keys"
GITHUB_SEARCH_COMMITS_URL = (
    _GITHUB_API
    + "/search/commits?q=author:{nickname}"
    + "&per_page={per_page}&page={page}"
)
GITHUB_PER_PAGE = 100
GITHUB_SEARCH_MAX_PAGES = 10  # /search/* caps results at 1000

# ---- networking / concurrency knobs ----
HTTP_TIMEOUT = 15
HTTP_USER_AGENT = f"gitcolombo/{__version__}"
RESOLVE_WORKERS = 8
CLONE_WORKERS = 8
DEFAULT_REPOS_DIR = "repos"

# ---- commit-message trailers ----
# Well-known git trailer keys (DCO sign-off, GitHub co-authorship, kernel
# reviews). A real email in any of these is a strong identity signal:
# trailers are typically added intentionally by tooling (`git commit -s`,
# GitHub UI's "Co-authored-by", patch-review workflows) rather than being
# auto-generated.
_TRAILER_KEYS = (
    "Signed-off-by",
    "Co-authored-by",
    "Reviewed-by",
    "Tested-by",
    "Reported-by",
    "Acked-by",
    "Suggested-by",
    "Cc",
)
TRAILER_RE = re.compile(
    r"^(?P<key>" + "|".join(_TRAILER_KEYS) + r"):\s+"
    r"(?P<name>[^<]+?)\s+<(?P<email>[^>]+)>\s*$",
    re.MULTILINE | re.IGNORECASE,
)

# ---- service ("system") addresses ----
SYSTEM_EMAILS = frozenset({"noreply@github.com"})

# Matches service noreply addresses from any vendor (github, anthropic,
# gitlab, ...) by local part, plus GitHub's user-private
# `{id}+{login}@users.noreply.github.com` pattern by domain.
SYSTEM_EMAIL_RE = re.compile(
    r'(^(?:noreply|no-reply|donotreply|do-not-reply)@|@users\.noreply\.github\.com$)',
    re.IGNORECASE,
)

logger = logging.getLogger("gitcolombo")
82
+
83
+
84
def is_system_email(email):
    """Return True when *email* matches SYSTEM_EMAIL_RE.

    Falsy input (None, empty string) is never a system address.
    """
    if not email:
        return False
    return SYSTEM_EMAIL_RE.search(email) is not None
86
+
87
+
88
+ # ---------- Terminal styling ----------
89
+
90
+ # ANSI 256-color palette, picked to mirror the web UI's green-on-black look.
91
# ANSI 256-colour foreground escape; the slot takes the palette index.
_FG_256 = "\033[38;5;{}m"

NEON = _FG_256.format(46)        # primary bright green
LIME = _FG_256.format(82)        # highlight (slightly lighter)
GREEN_DIM = _FG_256.format(34)   # secondary green
GREY = _FG_256.format(240)       # faint borders / dot-leaders
RED = _FG_256.format(196)        # warnings / noreply tags
BOLD = "\033[1m"
RESET = "\033[0m"
98
+
99
+ BANNER = r"""
100
+ ░██████╗░██╗████████╗░█████╗░░█████╗░██╗░░░░░░█████╗░███╗░░░███╗██████╗░░█████╗░
101
+ ██╔════╝░██║╚══██╔══╝██╔══██╗██╔══██╗██║░░░░░██╔══██╗████╗░████║██╔══██╗██╔══██╗
102
+ ██║░░██╗░██║░░░██║░░░██║░░╚═╝██║░░██║██║░░░░░██║░░██║██╔████╔██║██████╦╝██║░░██║
103
+ ██║░░╚██╗██║░░░██║░░░██║░░██╗██║░░██║██║░░░░░██║░░██║██║╚██╔╝██║██╔══██╗██║░░██║
104
+ ╚██████╔╝██║░░░██║░░░╚█████╔╝╚█████╔╝███████╗╚█████╔╝██║░╚═╝░██║██████╦╝╚█████╔╝
105
+ ░╚═════╝░╚═╝░░░╚═╝░░░░╚════╝░░╚════╝░╚══════╝░╚════╝░╚═╝░░░░░╚═╝╚═════╝░░╚════╝░
106
+ :: git commit osint ::
107
+ """
108
+
109
+ _COLOR_ENABLED = False
110
+ RULE_WIDTH = 80
111
+
112
+
113
+ def _setup_colors(force_off: bool) -> None:
114
+ global _COLOR_ENABLED
115
+ if force_off or os.environ.get("NO_COLOR"):
116
+ _COLOR_ENABLED = False
117
+ return
118
+ try:
119
+ _COLOR_ENABLED = sys.stdout.isatty()
120
+ except Exception:
121
+ _COLOR_ENABLED = False
122
+
123
+
124
def _c(code: str, text: str) -> str:
    """Wrap *text* in *code* ... RESET when colours are enabled, else return it as-is."""
    if not _COLOR_ENABLED:
        return text
    return "{}{}{}".format(code, text, RESET)
126
+
127
+
128
def _rule(width: int = RULE_WIDTH) -> str:
    """Return a grey horizontal rule made of *width* box-drawing dashes."""
    bar = "─" * width
    return _c(GREY, bar)
130
+
131
+
132
def _tag(text: str, color: str = GREEN_DIM) -> str:
    """Return *text* in square brackets, coloured with *color*."""
    label = "[" + text + "]"
    return _c(color, label)
134
+
135
+
136
def _email_with_tag(email: str) -> str:
    """Render a bare email; service addresses get a trailing red [noreply] tag."""
    rendered = _c(NEON, email)
    if not is_system_email(email):
        return rendered
    return rendered + " " + _tag("noreply", RED)
142
+
143
+
144
def _email_brackets(email: str) -> str:
    """Render `<email>`; a [noreply] tag, if any, goes after the closing bracket."""
    pieces = [_c(GREEN_DIM, "<"), _c(NEON, email), _c(GREEN_DIM, ">")]
    if is_system_email(email):
        pieces.append(" " + _tag("noreply", RED))
    return "".join(pieces)
150
+
151
+
152
def _section(title: str) -> list[str]:
    """Return the lines of a section header: blank, rule, `[ title ]`, rule, blank."""
    divider = _rule()
    heading = _c(GREEN_DIM, f"[ {title} ]")
    return ["", divider, heading, divider, ""]
154
+
155
+
156
+ # ---------- HTTP helpers ----------
157
+
158
def _http_get(url: str) -> bytes | None:
    """GET *url* and return the raw response body, or None on any transport error.

    This is a best-effort helper: failures are logged at DEBUG and reported
    as None rather than raised.
    """
    # Local import: only the exception base class is needed here.
    import http.client

    req = urllib.request.Request(url, headers={"User-Agent": HTTP_USER_AGENT})
    try:
        with urllib.request.urlopen(req, timeout=HTTP_TIMEOUT) as resp:
            return resp.read()
    # URLError and TimeoutError are both OSError subclasses, so everything the
    # original caught is still caught. OSError additionally covers connection
    # resets raised while reading the body; HTTPException covers
    # IncompleteRead and InvalidURL, which urlopen does not wrap.
    except (OSError, http.client.HTTPException) as exc:
        logger.debug("GET %s failed: %s", url, exc)
        return None
166
+
167
+
168
def _http_get_json(url: str):
    """Fetch *url* via _http_get and decode it as UTF-8 JSON.

    Returns the parsed value, or None when the request failed or the body
    is not valid UTF-8 JSON (the latter is logged at DEBUG).
    """
    body = _http_get(url)
    if body is not None:
        try:
            return json.loads(body.decode("utf-8"))
        except (json.JSONDecodeError, UnicodeDecodeError) as exc:
            logger.debug("Bad JSON from %s: %s", url, exc)
    return None
177
+
178
+
179
+ # ---------- GitHub API ----------
180
+
181
def _gh_authed(url: str, token: str | None):
    """GET a GitHub API *url* and return the decoded JSON, or None on failure.

    Like _http_get_json but sends the GitHub JSON `Accept` header and, when
    *token* is given, an `Authorization: Bearer` header (for higher rate
    limits). Transport errors and undecodable bodies are logged at DEBUG.
    """
    # Local import: only the exception base class is needed here.
    import http.client

    headers = {
        "Accept": "application/vnd.github+json",
        "User-Agent": HTTP_USER_AGENT,
    }
    if token:
        headers["Authorization"] = "Bearer " + token
    req = urllib.request.Request(url, headers=headers)
    try:
        with urllib.request.urlopen(req, timeout=HTTP_TIMEOUT) as resp:
            payload = resp.read()
    # Superset of the original (URLError, TimeoutError): OSError also covers
    # resets while reading the body; HTTPException covers IncompleteRead and
    # InvalidURL, which would otherwise escape this best-effort helper.
    except (OSError, http.client.HTTPException) as exc:
        logger.debug("GET %s failed: %s", url, exc)
        return None
    try:
        return json.loads(payload.decode("utf-8"))
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        logger.debug("Bad JSON from %s: %s", url, exc)
        return None
201
+
202
+
203
def get_public_repos_count(nickname: str) -> int:
    """Return the `public_repos` value from the GitHub user endpoint, or 0.

    0 is returned when the lookup fails, returns an empty payload, or the
    field is absent.
    """
    profile = _http_get_json(GITHUB_USER_URL.format(nickname=nickname))
    return int(profile.get("public_repos", 0)) if profile else 0
208
+
209
+
210
+ def get_github_repos(
211
+ nickname: str, repos_count: int, include_forks: bool = False,
212
+ ) -> set[str]:
213
+ """Return URLs of *nickname*'s repos. Forks dropped unless include_forks.
214
+
215
+ Logs a per-call summary (seen / forks-skipped / failed-pages) at INFO so
216
+ the caller can explain a "245 found → only 31 cloned" gap.
217
+ """
218
+ if repos_count <= 0:
219
+ return set()
220
+ last_page = (repos_count + GITHUB_PER_PAGE - 1) // GITHUB_PER_PAGE
221
+ repos: set[str] = set()
222
+ seen = 0
223
+ forks_skipped = 0
224
+ failed_pages = 0
225
+ for page in range(1, last_page + 1):
226
+ data = _http_get_json(
227
+ GITHUB_REPOS_URL.format(
228
+ nickname=nickname, per_page=GITHUB_PER_PAGE, page=page,
229
+ )
230
+ )
231
+ if not data:
232
+ failed_pages += 1
233
+ logger.warning(
234
+ "repos listing page %d/%d returned no data (rate limit? "
235
+ "try GITHUB_TOKEN env var)", page, last_page,
236
+ )
237
+ continue
238
+ for repo in data:
239
+ seen += 1
240
+ if repo.get("fork") and not include_forks:
241
+ forks_skipped += 1
242
+ continue
243
+ repos.add(repo["html_url"])
244
+ logger.info(
245
+ "listing: %d seen, %d forks %s, %d kept%s",
246
+ seen,
247
+ forks_skipped,
248
+ "kept" if include_forks else "skipped",
249
+ len(repos),
250
+ f", {failed_pages} page(s) failed" if failed_pages else "",
251
+ )
252
+ return repos
253
+
254
+
255
+ def resolve_github_username(repo_url: str, commit_hash: str) -> str | None:
256
+ """Scrape commit page to find the GitHub login behind an email."""
257
+ if not repo_url.startswith("https://github.com/"):
258
+ return None
259
+ commit_url = f"{repo_url.rstrip('/')}/commit/{commit_hash}"
260
+ page = _http_get(commit_url)
261
+ if page is None:
262
+ return None
263
+ match = GITHUB_COMMIT_AUTHOR_RE.search(page.decode("utf-8", errors="replace"))
264
+ return match.group(1) if match else None
265
+
266
+
267
def get_gpg_keys_emails(nickname: str, token: str | None = None):
    """Yield the emails listed on a user's uploaded PGP keys (/users/{u}/gpg_keys).

    These emails come from the key's UIDs — the user uploaded them themselves,
    so this is a direct identity disclosure. `verified=True` means GitHub has
    confirmed the user controls that mailbox.

    Revoked keys are skipped and each email (case-insensitively) is yielded
    once. Primary keys are walked before their subkeys.

    Yields dicts: {email, verified, key_id, created_at, source}.
    """
    keys = _gh_authed(GITHUB_GPG_KEYS_URL.format(nickname=nickname), token)
    if not keys:
        return
    already_yielded: set[str] = set()

    def _emails_of(key, source):
        # Empty/missing and revoked keys contribute nothing.
        if not key or key.get("revoked"):
            return
        key_id = key.get("key_id", "")
        created_at = key.get("created_at", "")
        for uid in key.get("emails") or []:
            address = uid.get("email")
            if not address or address.lower() in already_yielded:
                continue
            already_yielded.add(address.lower())
            yield {
                "email": address,
                "verified": bool(uid.get("verified")),
                "key_id": key_id,
                "created_at": created_at,
                "source": source,
            }

    for primary in keys:
        yield from _emails_of(primary, "primary")
        for subkey in primary.get("subkeys") or []:
            yield from _emails_of(subkey, "subkey")
306
+
307
+
308
def print_gpg_results(results, ignore_noreply: bool = True) -> bool:
    """Pretty-print get_gpg_keys_emails() output. Returns True if printed.

    *results* is an iterable of dicts with at least `email`, `verified`,
    `key_id` and `source`. When *ignore_noreply* is true, service addresses
    are dropped first. Rows are sorted verified-first, then by email.
    Returns False (printing nothing) when no rows remain.
    """
    rows = [
        r for r in results
        if not (ignore_noreply and is_system_email(r["email"]))
    ]
    if not rows:
        return False
    for line in _section("pgp key uids"):
        print(line)
    print(" " + _c(GREEN_DIM, "source: /users/{u}/gpg_keys (user-uploaded)"))
    print()
    rows.sort(key=lambda r: (not r["verified"], r["email"]))
    for r in rows:
        email = r["email"]
        flag_color = LIME if r["verified"] else GREEN_DIM
        flag = _tag("verified" if r["verified"] else "unverified", flag_color)
        # Pad from the *visible* width. Padding the rendered string (as
        # `{email:40}` did) counts the invisible ANSI escapes too, which
        # shifts the columns whenever colours are enabled.
        visible_width = len(email)
        if is_system_email(email):
            visible_width += len(" [noreply]")
        padding = " " * max(0, 40 - visible_width)
        print(" {arrow} {email}{pad} {flag} {kid}={key} {src}".format(
            arrow=_c(LIME, "▶"),
            email=_email_with_tag(email),
            pad=padding,
            flag=flag,
            kid=_c(GREEN_DIM, "key_id"),
            key=_c(NEON, r["key_id"] or "?"),
            src=_tag(r["source"]),
        ))
    print()
    return True
334
+
335
+
336
def search_commits_by_author(nickname: str, token: str | None = None):
    """Use /search/commits?q=author:{u} to find commits across all of public GitHub.

    Also extracts well-known git trailers from each commit message body
    (Signed-off-by, Co-authored-by, Reviewed-by, etc.).

    Pages are requested until one fails, comes back empty or short, or
    GITHUB_SEARCH_MAX_PAGES is reached. Each (email, name, role) combination
    is yielded once, compared case-insensitively. The `date` field is always
    the commit's author date, including for committer and trailer rows.

    Yields dicts: {email, name, role, repo, sha, date}.
    """
    import urllib.parse

    # Percent-encode the nickname: it is placed inside the query string, so
    # spaces, '&' or '#' would otherwise break or alter the request.
    quoted_nickname = urllib.parse.quote(nickname, safe="")
    seen: set[tuple[str, str, str]] = set()
    for page in range(1, GITHUB_SEARCH_MAX_PAGES + 1):
        url = GITHUB_SEARCH_COMMITS_URL.format(
            nickname=quoted_nickname, per_page=GITHUB_PER_PAGE, page=page,
        )
        data = _gh_authed(url, token)
        if not data:
            return
        items = data.get("items") or []
        if not items:
            return
        for item in items:
            commit = item.get("commit") or {}
            repo = (item.get("repository") or {}).get("full_name", "")
            sha = item.get("sha", "")
            date = (commit.get("author") or {}).get("date", "")
            message = commit.get("message") or ""
            for role in ("author", "committer"):
                who = commit.get(role) or {}
                email = who.get("email")
                name = who.get("name") or ""
                if not email:
                    continue
                key = (email.lower(), name.lower(), role)
                if key in seen:
                    continue
                seen.add(key)
                yield {"email": email, "name": name, "role": role,
                       "repo": repo, "sha": sha, "date": date}
            # trailers in the commit message body
            for tm in TRAILER_RE.finditer(message):
                t_key = tm.group("key").lower()
                t_name = (tm.group("name") or "").strip()
                t_email = (tm.group("email") or "").strip()
                if not t_email:
                    continue
                # reject malformed names: ':' implies another trailer label was
                # crammed onto the same line; '@' implies a @-mention or stray
                # handle. Real personal names don't contain either.
                if ":" in t_name or "@" in t_name:
                    continue
                key = (t_email.lower(), t_name.lower(), t_key)
                if key in seen:
                    continue
                seen.add(key)
                yield {"email": t_email, "name": t_name, "role": t_key,
                       "repo": repo, "sha": sha, "date": date}
        # A short page is the last one; skip the extra request.
        if len(items) < GITHUB_PER_PAGE:
            return
393
+
394
+
395
def print_search_results(results, ignore_noreply: bool = True) -> None:
    """Pretty-print search_commits_by_author() output grouped by (email, name).

    Service addresses are dropped when *ignore_noreply* is true. Identities
    are listed by descending hit count; each shows at most five repos
    followed by a "+N more" line when there are more.
    """
    by_identity: dict[tuple[str, str], list[dict]] = defaultdict(list)
    for hit in results:
        if ignore_noreply and is_system_email(hit["email"]):
            continue
        by_identity[(hit["email"], hit["name"])].append(hit)

    if not by_identity:
        print(_c(RED, "[!] no public commits found via /search/commits"))
        return

    for header_line in _section("commit search"):
        print(header_line)
    print(" " + _c(GREEN_DIM, "identities found: ") + _c(NEON, str(len(by_identity))))
    print()

    max_shown = 5
    ranked = sorted(by_identity.items(), key=lambda kv: -len(kv[1]))
    for (email, name), hits in ranked:
        repo_names = sorted({h["repo"] for h in hits if h["repo"]})
        role_names = sorted({h["role"] for h in hits})
        print(" {arrow} {name} {brackets} {hits} {roles}".format(
            arrow=_c(LIME, "▶"),
            name=_c(BOLD + NEON, name or "?"),
            brackets=_email_brackets(email),
            hits=_c(LIME, f"×{len(hits)}"),
            roles=_tag(", ".join(role_names)),
        ))
        shown = repo_names[:max_shown]
        overflow = len(repo_names) - max_shown
        for index, repo_name in enumerate(shown):
            # The closing corner is used only when no "+N more" line follows.
            is_last = overflow <= 0 and index == len(shown) - 1
            connector = "└─" if is_last else "├─"
            print(" " + _c(GREEN_DIM, connector) + " "
                  + _c(GREEN_DIM, "repo ") + _c(NEON, repo_name))
        if overflow > 0:
            print(" " + _c(GREEN_DIM, "└─ ")
                  + _c(GREEN_DIM, f"... +{overflow} more repos"))
        print()
432
+
433
+
434
+ # ---------- Filesystem helpers ----------
435
+
436
def find_all_repos_recursively(path: str) -> list[str]:
    """Return every directory under *path* (inclusive) that has a `.git` subdirectory.

    The walk never descends into `.git` itself, but does continue into the
    other subdirectories of a repository.
    """
    roots: list[str] = []
    for here, subdirs, _files in os.walk(path):
        if ".git" not in subdirs:
            continue
        roots.append(here)
        # Prune in place so os.walk skips the .git directory.
        subdirs.remove(".git")
    return roots
444
+
445
+
446
+ # ---------- Git subprocess ----------
447
+
448
def git_log(repo_dir: str) -> str:
    """Run `git log --all` in *repo_dir* with GIT_LOG_FORMAT and return its stdout.

    Returns "" when the `git` binary cannot be started. A non-zero exit
    status is logged at DEBUG and whatever was written to stdout is still
    returned.
    """
    try:
        result = subprocess.run(
            ["git", "log", f"--pretty={GIT_LOG_FORMAT}", "--all"],
            cwd=repo_dir, check=False, capture_output=True, text=True,
            # Decode explicitly instead of relying on the locale: a non-UTF-8
            # locale or a legacy-encoded author name would otherwise raise
            # UnicodeDecodeError and abort the whole scan.
            encoding="utf-8", errors="replace",
        )
    except FileNotFoundError:
        logger.error("'git' binary not found")
        return ""
    if result.returncode != 0:
        logger.debug("git log failed in %s: %s", repo_dir, result.stderr.strip())
    return result.stdout
460
+
461
+
462
+ def _clone_target_dir(url: str) -> str:
463
+ name = url.rstrip("/").split("/")[-1]
464
+ return name[:-4] if name.endswith(".git") else name
465
+
466
+
467
def git_clone(url: str, dest_dir: str) -> str | None:
    """Clone *url* into *dest_dir*/<repo-name>. Returns the cloned path or None.

    *dest_dir* is created if missing. If the target already contains a
    `.git` directory (for example from a previous run) it is reused and
    returned without cloning again. None is returned when `git` is not
    installed or the clone fails (the failure is logged at DEBUG).
    """
    os.makedirs(dest_dir, exist_ok=True)
    target = os.path.join(dest_dir, _clone_target_dir(url))
    # `git clone` refuses a non-empty destination, so a re-run used to report
    # an already-cloned repository as a failure.
    if os.path.isdir(os.path.join(target, ".git")):
        logger.debug("reusing existing clone of %s at %s", url, target)
        return target
    try:
        result = subprocess.run(
            # "--" ends option parsing so a URL starting with "-" cannot be
            # interpreted as a git option.
            ["git", "clone", "--", url, target],
            check=False, capture_output=True, text=True,
            # Tolerate git messages that are not valid in the locale encoding.
            encoding="utf-8", errors="replace",
        )
    except FileNotFoundError:
        logger.error("'git' binary not found")
        return None
    if result.returncode != 0:
        logger.debug("git clone failed for %s: %s", url, result.stderr.strip())
        return None
    return target
483
+
484
+
485
+ def _short_url(url: str, width: int = 50) -> str:
486
+ """Trim URL for progress display: keep owner/repo tail."""
487
+ if len(url) <= width:
488
+ return url
489
+ tail = "/".join(url.rstrip("/").split("/")[-2:])
490
+ return ("…" + tail)[-width:]
491
+
492
+
493
def clone_many(
    urls: list[str],
    dest_dir: str,
    workers: int = CLONE_WORKERS,
) -> dict[str, str | None]:
    """Clone *urls* concurrently. Returns {url: local_path or None}.

    A progress line is written to stderr while the pool runs: redrawn in
    place when stderr is a TTY, otherwise emitted once per change of the
    completed counter. The returned mapping is computed after all workers
    finish by checking each URL's target directory for a `.git` folder.
    """
    total = len(urls)
    if not total:
        return {}

    progress = {"done": 0, "ok": 0, "fail": 0, "current": ""}
    progress_lock = threading.Lock()
    t0 = time.monotonic()
    try:
        is_tty = sys.stderr.isatty()
    except Exception:
        is_tty = False

    # Last "done" value written in non-TTY mode; a dict so render() can mutate it.
    last_reported = {"value": -1}

    def render(final: bool = False) -> None:
        elapsed = time.monotonic() - t0
        if progress["fail"]:
            fail_part = _c(RED, f"fail={progress['fail']}")
        else:
            fail_part = _c(GREEN_DIM, "fail=0")
        pieces = [
            _c(GREEN_DIM, "[*] "),
            _c(LIME, "cloning "),
            _c(NEON, f"{progress['done']}/{total}"),
            " ", _c(GREEN_DIM, f"ok={progress['ok']}"),
            " ", fail_part,
            " ", _c(GREEN_DIM, f"{elapsed:>4.0f}s"),
        ]
        if progress["current"] and not final:
            pieces += [" ", _c(GREEN_DIM, "· "), _c(NEON, progress["current"])]
        line = "".join(pieces)

        if is_tty:
            # \r + clear-to-end-of-line keeps the progress on a single line.
            sys.stderr.write("\r\033[K" + line)
            if final:
                sys.stderr.write("\n")
            sys.stderr.flush()
            return
        # Non-TTY: avoid a flood of identical "0/N" lines while threads pick
        # up their first job. Only emit when the done counter ticks forward
        # (or on the final summary).
        if final or progress["done"] != last_reported["value"]:
            last_reported["value"] = progress["done"]
            sys.stderr.write(line + "\n")

    def worker(url: str) -> None:
        with progress_lock:
            progress["current"] = _short_url(url)
            render()
        cloned_path = git_clone(url, dest_dir)
        with progress_lock:
            progress["done"] += 1
            progress["ok" if cloned_path else "fail"] += 1
            # Don't keep stale "current" once this thread is done; the next
            # worker that picks up a job will overwrite it.
            progress["current"] = ""
            render()

    render()  # initial 0/total
    try:
        with ThreadPoolExecutor(max_workers=max(1, workers)) as pool:
            pending = {pool.submit(worker, url): url for url in urls}
            for future in pending:
                try:
                    future.result()
                except Exception as exc:  # pragma: no cover - defensive
                    logger.debug("clone worker for %s raised: %s", pending[future], exc)
    finally:
        with progress_lock:
            progress["current"] = ""
            render(final=True)

    # Map each URL to its deterministic target path so callers get a stable
    # {url: path|None} contract regardless of completion order.
    results: dict[str, str | None] = {}
    for url in urls:
        target = os.path.join(dest_dir, _clone_target_dir(url))
        has_git_dir = os.path.isdir(os.path.join(target, ".git"))
        results[url] = target if has_git_dir else None
    return results
583
+
584
+
585
+ # ---------- Data classes ----------
586
+
587
def _split_name_email(raw: str) -> tuple[str, str]:
    """Split a raw git identity string into ``(name, email)``.

    The string is matched against ``GIT_NAME_EMAIL_RE``; capture groups 1
    and 2 are returned. When the pattern does not match, the failure is
    logged at error level and a pair of empty strings is returned.
    """
    match = GIT_NAME_EMAIL_RE.match(raw)
    if match:
        return match.group(1), match.group(2)
    logger.error("Could not extract name/email from %r", raw)
    return "", ""
593
+
594
+
595
@dataclass
class Commit:
    """A single parsed log entry: hash plus raw and split identities."""

    hash: str
    # Raw author / committer strings exactly as captured from the log line.
    author: str
    committer: str
    # The same identities split into their name and email parts.
    author_name: str
    author_email: str
    committer_name: str
    committer_email: str

    @property
    def author_committer_same(self) -> bool:
        """Whether author and committer agree on both name and email."""
        author_identity = (self.author_name, self.author_email)
        committer_identity = (self.committer_name, self.committer_email)
        return author_identity == committer_identity

    @classmethod
    def parse(cls, line: str) -> "Commit | None":
        """Build a ``Commit`` from one log line.

        Returns ``None`` (after logging an error) when ``GIT_LOG_LINE_RE``
        does not match the line.
        """
        match = GIT_LOG_LINE_RE.search(line)
        if match is None:
            logger.error("Could not parse commit line %r", line)
            return None
        commit_hash, author, committer = match.groups()
        return cls(
            commit_hash,
            author,
            committer,
            *_split_name_email(author),
            *_split_name_email(committer),
        )

    def __str__(self) -> str:
        """Render the commit as five ``label: value`` lines, newline-terminated."""
        fields = [
            f"Hash: {self.hash}",
            f"Author name: {self.author_name}",
            f"Author email: {self.author_email}",
            f"Committer name: {self.committer_name}",
            f"Committer email: {self.committer_email}",
        ]
        return "\n".join(fields) + "\n"
631
+
632
+
633
+ @dataclass
634
+ class Person:
635
+ key: str
636
+ name: str = ""
637
+ email: str = ""
638
+ as_author: int = 0
639
+ as_committer: int = 0
640
+ also_known: dict[str, "Person"] = field(default_factory=dict)
641
+ github_login: str | None = None
642
+ repo_url: str | None = None
643
+ last_commit_hash: str | None = None
644
+
645
+ def __str__(self) -> str:
646
+ # Headline: ▶ name <email> [noreply]?
647
+ header = " {arrow} {name} {brackets}".format(
648
+ arrow=_c(LIME, "▶"),
649
+ name=_c(BOLD + NEON, self.name or "?"),
650
+ brackets=_email_brackets(self.email),
651
+ )
652
+ rows: list[tuple[str, str]] = []
653
+ if self.as_author:
654
+ rows.append(("author", _c(LIME, f"×{self.as_author}")))
655
+ if self.as_committer:
656
+ rows.append(("committer", _c(LIME, f"×{self.as_committer}")))
657
+ if self.github_login:
658
+ url = f"https://github.com/{self.github_login}"
659
+ rows.append(("github", _c(LIME, url) + " " + _tag("verified", LIME)))
660
+ for alias in self.also_known.values():
661
+ alias_text = f"{alias.name} {_email_brackets(alias.email)}"
662
+ rows.append(("alias", alias_text))
663
+
664
+ lines = [header]
665
+ for i, (label, value) in enumerate(rows):
666
+ branch = "└─" if i == len(rows) - 1 else "├─"
667
+ lines.append(
668
+ " " + _c(GREEN_DIM, branch) + " "
669
+ + _c(GREEN_DIM, f"{label:<10}") + " " + value
670
+ )
671
+ return "\n".join(lines)
672
+
673
+
674
+ # ---------- Analyst ----------
675
+
676
class GitAnalyst:
    """Collects commits from repositories and correlates the identities in them.

    Attributes:
        repos_dir: Directory that remote repositories are cloned into.
        commits: Every commit parsed so far, across all added repositories.
        persons: Identities keyed by the raw author/committer string.
        name_to_emails: Every email seen for each name.
        repos: Directories of the repositories added so far.
        same_emails_persons: Groups of names sharing an identical email set,
            keyed by the comma-joined sorted names; each value is
            ``(sorted names, emails)``.
    """

    def __init__(self, repos_dir: str = DEFAULT_REPOS_DIR) -> None:
        self.repos_dir = repos_dir
        self.commits: list[Commit] = []
        self.persons: dict[str, Person] = {}
        self.name_to_emails: dict[str, set[str]] = defaultdict(set)
        self.repos: list[str] = []
        self.same_emails_persons: dict[str, tuple[list[str], set[str]]] = {}

    def append(self, source: str, *, cloned_path: str | None = None) -> None:
        """Add one repository and analyse its commits.

        Args:
            source: Repository URL (anything containing ``://``) or a local
                directory path.
            cloned_path: Directory of an already-cloned copy of ``source``;
                when given, no clone is attempted.

        A URL whose clone fails (``git_clone`` returns ``None``) is skipped.
        """
        if cloned_path is not None:
            repo_dir = cloned_path
        elif "://" in source:
            repo_dir = git_clone(source, self.repos_dir)
            if repo_dir is None:
                return
        else:
            repo_dir = source

        self.repos.append(repo_dir)
        log_output = git_log(repo_dir)
        # Lines that fail to parse yield None and are dropped.
        new_commits = [
            c for c in (Commit.parse(line) for line in log_output.splitlines() if line)
            if c is not None
        ]
        self.commits.extend(new_commits)
        self._analyze(new_commits, source)

    @property
    def sorted_persons(self) -> list[tuple[str, Person]]:
        """``(key, person)`` pairs in ascending order of total commit count."""
        return sorted(
            self.persons.items(),
            key=lambda item: item[1].as_author + item[1].as_committer,
        )

    def resolve_persons(self) -> None:
        """Fill in ``github_login`` for every identity that can be looked up.

        Identities whose email is in ``SYSTEM_EMAILS``, or that lack a
        repository URL or commit hash, are not looked up. A lookup that
        raises is logged at debug level and skipped, so one failure does not
        discard the logins of the remaining identities.
        """
        targets = [
            p for p in self.persons.values()
            if p.email not in SYSTEM_EMAILS and p.repo_url and p.last_commit_hash
        ]
        if not targets:
            return
        with ThreadPoolExecutor(max_workers=RESOLVE_WORKERS) as pool:
            futures = {
                pool.submit(resolve_github_username, p.repo_url, p.last_commit_hash): p
                for p in targets
            }
            for fut, person in futures.items():
                try:
                    login = fut.result()
                except Exception as exc:
                    # Future.result() re-raises whatever the worker raised;
                    # keep going so the other identities still get resolved.
                    logger.debug("GitHub lookup for %s raised: %s", person.key, exc)
                    continue
                if login:
                    person.github_login = login

    def _upsert(
        self, key: str, name: str, email: str, repo_url: str, commit_hash: str,
    ) -> Person:
        """Create or refresh the ``Person`` stored under ``key`` and return it.

        Name, email, repository URL and commit hash are overwritten on every
        call, so they reflect the most recently processed commit.
        """
        person = self.persons.get(key) or Person(key=key)
        person.name = name
        person.email = email
        person.repo_url = repo_url
        person.last_commit_hash = commit_hash
        self.persons[key] = person
        return person

    def _analyze(self, new_commits: Iterable[Commit], repo_url: str) -> None:
        """Update persons, aliases and name/email correlations from commits."""
        for commit in new_commits:
            author = self._upsert(
                commit.author, commit.author_name, commit.author_email,
                repo_url, commit.hash,
            )
            author.as_author += 1

            committer = self._upsert(
                commit.committer, commit.committer_name, commit.committer_email,
                repo_url, commit.hash,
            )
            committer.as_committer += 1

            # Differing author and committer are cross-linked as aliases.
            if not commit.author_committer_same:
                author.also_known[commit.committer] = committer
                committer.also_known[commit.author] = author

            self.name_to_emails[commit.author_name].add(commit.author_email)
            self.name_to_emails[commit.committer_name].add(commit.committer_email)

        # Group names that share the exact same set of emails — these are
        # treated as the same person. O(n) instead of the previous O(n²).
        emails_to_names: dict[frozenset[str], list[str]] = defaultdict(list)
        for name, emails in self.name_to_emails.items():
            emails_to_names[frozenset(emails)].append(name)
        self.same_emails_persons = {
            ",".join(sorted(names)): (sorted(names), set(emails))
            for emails, names in emails_to_names.items()
            if len(names) > 1
        }

    def __str__(self) -> str:
        """Render the report: stats, optional correlation section, identities."""
        parts: list[str] = []

        # 1. Stats — top-level summary of what was scanned and what was found.
        parts.extend(_section("stats"))
        for label, value in (
            ("repos", len(self.repos)),
            ("commits", len(self.commits)),
            ("persons", len(self.persons)),
        ):
            # Dot leader padded so that label plus dots spans 16 characters.
            dots = "." * (16 - len(label))
            parts.append(" " + _c(GREEN_DIM, label) + " "
                         + _c(GREY, dots) + " " + _c(NEON, str(value)))
        parts.append("")
        parts.append(" " + _c(GREEN_DIM, "targets"))
        for i, repo in enumerate(self.repos):
            branch = "└─" if i == len(self.repos) - 1 else "├─"
            parts.append(" " + _c(GREEN_DIM, branch) + " " + _c(NEON, repo))

        # 2. Correlation — shared names with multiple emails + same-person clusters.
        matching: list[str] = []
        for name, emails in self.name_to_emails.items():
            if len(emails) <= 1:
                continue
            sorted_emails = sorted(emails)
            block = [
                " {bang} {name} {arrow} {n} emails".format(
                    bang=_c(RED, "[!]"),
                    name=_c(BOLD + NEON, name),
                    arrow=_c(GREEN_DIM, "→"),
                    n=_c(LIME, str(len(sorted_emails))),
                )
            ]
            for i, e in enumerate(sorted_emails):
                branch = "└─" if i == len(sorted_emails) - 1 else "├─"
                block.append(" " + _c(GREEN_DIM, branch) + " "
                             + _email_with_tag(e))
            matching.append("\n".join(block))

        same_person: list[str] = []
        for names, _emails in self.same_emails_persons.values():
            joined = _c(BOLD + NEON, (" " + _c(GREEN_DIM, "≡") + " ").join(names))
            same_person.append(
                " " + _c(RED, "[!]") + " " + _c(GREEN_DIM, "same person:") + " "
                + joined
            )

        # The correlation header is only emitted when there is something under it.
        if matching or same_person:
            parts.extend(_section("correlation"))
            if matching:
                parts.append("\n\n".join(matching))
                parts.append("")
            if same_person:
                parts.extend(same_person)
                parts.append("")

        # 3. Identities — per-person breakdown.
        parts.extend(_section("identities"))
        for _, person in self.sorted_persons:
            parts.append(str(person))
            parts.append("")

        return "\n".join(parts)
834
+
835
+
836
+ # ---------- CLI ----------
837
+
838
def _parse_args() -> argparse.Namespace:
    """Build the gitcolombo command-line parser and parse the process arguments."""
    parser = argparse.ArgumentParser(
        prog="gitcolombo",
        description="Extract accounts' information from git repo and make some researches.",
    )
    switch = {"action": "store_true"}
    # (flag names, add_argument keyword settings), registered in this order.
    options = (
        (("-d", "--dir"), {"help": "directory with git project(s)"}),
        (("-u", "--url"), {"help": "url of git repo"}),
        (("--github",), {
            **switch,
            "help": "try to extract extended info from GitHub",
        }),
        (("--nickname",), {
            "type": str,
            "help": "download repos from GitHub by nickname",
        }),
        (("--search",), {
            "type": str,
            "metavar": "USERNAME",
            "help": "API-only path: query /users/{u}/gpg_keys + /search/commits "
                    "for emails (no cloning, ~1000 commit results max)",
        }),
        (("--no-ignore-noreply",), {
            **switch,
            "help": "do not filter service noreply addresses from --search results",
        }),
        (("-r", "--recursive"), {
            **switch,
            "help": "recursive directory processing",
        }),
        (("--repos-dir",), {
            "default": DEFAULT_REPOS_DIR,
            "help": f"directory to clone remote repositories into (default: {DEFAULT_REPOS_DIR})",
        }),
        (("--clone-workers",), {
            "type": int,
            "default": CLONE_WORKERS,
            "help": f"parallel git-clone workers (default: {CLONE_WORKERS})",
        }),
        (("--include-forks",), {
            **switch,
            "help": "include forked repositories (default: skipped — forks add upstream "
                    "history that is not the target user's work)",
        }),
        (("--debug",), {**switch, "help": "print debug information"}),
        (("--no-color",), {
            **switch,
            "help": "disable ANSI colors (also honored via NO_COLOR env var or non-TTY stdout)",
        }),
    )
    for names, settings in options:
        parser.add_argument(*names, **settings)
    return parser.parse_args()
885
+
886
+
887
+ def _collect_sources(args: argparse.Namespace) -> list[str]:
888
+ sources: list[str] = []
889
+ if args.url:
890
+ sources.append(args.url)
891
+ if args.dir:
892
+ sources.append(args.dir.rstrip("/"))
893
+ if args.recursive:
894
+ sources.extend(find_all_repos_recursively(args.dir))
895
+ if args.nickname:
896
+ count = get_public_repos_count(args.nickname)
897
+ if count:
898
+ logger.info("found %d public repos for %s", count, args.nickname)
899
+ sources.extend(get_github_repos(
900
+ args.nickname, repos_count=count,
901
+ include_forks=args.include_forks,
902
+ ))
903
+ return sources
904
+
905
+
906
def main() -> None:
    """CLI entry point: parse arguments, gather repositories, print the report.

    With ``--search`` only the API helpers are queried and their results
    printed. Otherwise every collected source is cloned (URLs) or read in
    place (local paths), analysed, and summarised on stdout.
    """
    args = _parse_args()
    _setup_colors(force_off=args.no_color)
    log_level = logging.DEBUG if args.debug else logging.INFO
    log_format = _c(GREEN_DIM, "[*] ") + _c(LIME, "%(levelname)s") + " %(message)s"
    logging.basicConfig(level=log_level, format=log_format)

    print(_c(NEON, BANNER), flush=True)

    # API-only mode: no cloning, return as soon as the results are printed.
    if args.search:
        token = os.environ.get("GITHUB_TOKEN")
        ignore = not args.no_ignore_noreply
        gpg = list(get_gpg_keys_emails(args.search, token=token))
        had_gpg = print_gpg_results(gpg, ignore_noreply=ignore)
        results = list(search_commits_by_author(args.search, token=token))
        print_search_results(results, ignore_noreply=ignore)
        if not (had_gpg or results):
            print("No emails found via /gpg_keys or /search/commits.")
        return

    sources = _collect_sources(args)
    if not sources:
        print("Run me with git repo link or path!")
        return

    analyst = GitAnalyst(repos_dir=args.repos_dir)

    # Anything containing "://" is a remote to clone; the rest are local paths.
    url_sources: list[str] = []
    local_sources: list[str] = []
    for candidate in sources:
        bucket = url_sources if "://" in candidate else local_sources
        bucket.append(candidate)

    cloned: dict[str, str | None] = {}
    if url_sources:
        logger.info(
            "cloning %d repo(s) into %s with %d workers",
            len(url_sources), args.repos_dir, args.clone_workers,
        )
        cloned = clone_many(url_sources, args.repos_dir, workers=args.clone_workers)
        failed_count = sum(1 for path in cloned.values() if path is None)
        if failed_count:
            logger.warning("%d clone(s) failed (see --debug for reasons)", failed_count)

    cloned_ok = [(url, path) for url, path in cloned.items() if path]
    to_analyze = len(local_sources) + len(cloned_ok)
    if to_analyze:
        logger.info("analyzing %d repo(s)...", to_analyze)
    for local in local_sources:
        analyst.append(local)
    for url, path in cloned_ok:
        analyst.append(url, cloned_path=path)

    if analyst.persons:
        logger.info("resolving GitHub usernames for %d identities...",
                    len(analyst.persons))
        analyst.resolve_persons()

    if not analyst.repos:
        print("Run me with git repo link or path!")
    else:
        print(analyst)
@@ -0,0 +1,7 @@
1
#!/usr/bin/env python3
"""Gitcolombo entrypoint — enables `python -m gitcolombo ...`."""

# Import from the package itself rather than from ".__init__": naming
# "__init__" explicitly makes Python load the package initialiser a second
# time, as a separate module called "gitcolombo.__init__".
from . import main

if __name__ == "__main__":
    main()
@@ -0,0 +1,3 @@
1
+ """Gitcolombo version file"""
2
+
3
+ __version__ = "0.3.0"
@@ -0,0 +1,47 @@
1
+ [build-system]
2
+ requires = ["poetry-core"]
3
+ build-backend = "poetry.core.masonry.api"
4
+
5
+ [tool.poetry]
6
+ name = "gitcolombo"
7
+ version = "0.3.0"
8
+ description = "🕵️ OSINT tool to extract identities (names, emails, GitHub logins) from git repositories and the GitHub API."
9
+ authors = ["Soxoj <soxoj@protonmail.com>"]
10
+ readme = "README.md"
11
+ license = "MIT"
12
+ homepage = "https://pypi.org/project/gitcolombo"
13
+ repository = "https://github.com/soxoj/gitcolombo"
14
+ keywords = ["osint", "git", "github", "email", "investigation", "recon", "doxing"]
15
+ classifiers = [
16
+ "Development Status :: 4 - Beta",
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: Python :: 3.10",
19
+ "Programming Language :: Python :: 3.11",
20
+ "Programming Language :: Python :: 3.12",
21
+ "Programming Language :: Python :: 3.13",
22
+ "Programming Language :: Python :: 3.14",
23
+ "Intended Audience :: Information Technology",
24
+ "Operating System :: OS Independent",
25
+ "License :: OSI Approved :: MIT License",
26
+ "Natural Language :: English",
27
+ "Topic :: Security",
28
+ "Topic :: Software Development :: Version Control :: Git"
29
+ ]
30
+ include = ["gitcolombo/*.py"]
31
+
32
+ [tool.poetry.urls]
33
+ "Bug Tracker" = "https://github.com/soxoj/gitcolombo/issues"
34
+ "Web version" = "https://gitcolombo.soxoj.com"
35
+
36
+ [tool.poetry.dependencies]
37
+ # Stdlib-only by design — git binary is the only external requirement.
38
+ python = "^3.10"
39
+
40
+ [tool.poetry.group.dev.dependencies]
41
+ # poetry install --with dev
42
+ pytest = ">=8.0,<10.0"
43
+ coverage = "^7.0"
44
+
45
+ [tool.poetry.scripts]
46
+ # Installed by `pip install gitcolombo`. Run with `gitcolombo --help`.
47
+ gitcolombo = "gitcolombo:main"