gitcolombo 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gitcolombo/__init__.py +965 -0
- gitcolombo/__main__.py +7 -0
- gitcolombo/__version__.py +3 -0
- gitcolombo-0.3.0.dist-info/METADATA +151 -0
- gitcolombo-0.3.0.dist-info/RECORD +8 -0
- gitcolombo-0.3.0.dist-info/WHEEL +4 -0
- gitcolombo-0.3.0.dist-info/entry_points.txt +3 -0
- gitcolombo-0.3.0.dist-info/licenses/LICENSE +21 -0
gitcolombo/__init__.py
ADDED
|
@@ -0,0 +1,965 @@
|
|
|
1
|
+
"""Gitcolombo — OSINT tool: extract account info from git repositories.
|
|
2
|
+
|
|
3
|
+
Walks one or more git repositories and aggregates per-person stats
|
|
4
|
+
(name, email, author/committer counts, alternate identities) and detects
|
|
5
|
+
identity overlaps via shared emails or shared names. Optionally resolves
|
|
6
|
+
GitHub logins by scraping commit pages.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
__title__ = "Gitcolombo"
|
|
11
|
+
__author__ = "Soxoj"
|
|
12
|
+
__author_email__ = "soxoj@protonmail.com"
|
|
13
|
+
__license__ = "MIT"
|
|
14
|
+
|
|
15
|
+
from .__version__ import __version__
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import json
|
|
19
|
+
import logging
|
|
20
|
+
import os
|
|
21
|
+
import re
|
|
22
|
+
import subprocess
|
|
23
|
+
import sys
|
|
24
|
+
import threading
|
|
25
|
+
import time
|
|
26
|
+
import urllib.error
|
|
27
|
+
import urllib.request
|
|
28
|
+
from collections import defaultdict
|
|
29
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
30
|
+
from dataclasses import dataclass, field
|
|
31
|
+
from typing import Iterable
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
DELIMITER = "-" * 15
|
|
35
|
+
|
|
36
|
+
# git log --pretty format: hash;"author_name author_email";"committer_name committer_email"
|
|
37
|
+
GIT_LOG_FORMAT = r'%H;"%an %ae";"%cn %ce"'
|
|
38
|
+
GIT_LOG_LINE_RE = re.compile(r'(\w+);"(.*?)";"(.*?)"')
|
|
39
|
+
GIT_NAME_EMAIL_RE = re.compile(r"^(.*?)\s+(\S+)$")
|
|
40
|
+
GITHUB_COMMIT_AUTHOR_RE = re.compile(r'<a href=".+?commits\?author=(.+?)"')
|
|
41
|
+
|
|
42
|
+
GITHUB_USER_URL = "https://api.github.com/users/{nickname}"
|
|
43
|
+
GITHUB_REPOS_URL = (
|
|
44
|
+
"https://api.github.com/users/{nickname}/repos?per_page={per_page}&page={page}"
|
|
45
|
+
)
|
|
46
|
+
GITHUB_PER_PAGE = 100
|
|
47
|
+
|
|
48
|
+
HTTP_TIMEOUT = 15
|
|
49
|
+
HTTP_USER_AGENT = f"gitcolombo/{__version__}"
|
|
50
|
+
RESOLVE_WORKERS = 8
|
|
51
|
+
CLONE_WORKERS = 8
|
|
52
|
+
DEFAULT_REPOS_DIR = "repos"
|
|
53
|
+
|
|
54
|
+
GITHUB_GPG_KEYS_URL = "https://api.github.com/users/{nickname}/gpg_keys"
|
|
55
|
+
GITHUB_SEARCH_COMMITS_URL = (
|
|
56
|
+
"https://api.github.com/search/commits?q=author:{nickname}"
|
|
57
|
+
"&per_page={per_page}&page={page}"
|
|
58
|
+
)
|
|
59
|
+
GITHUB_SEARCH_MAX_PAGES = 10 # /search/* caps results at 1000
|
|
60
|
+
|
|
61
|
+
# Well-known git trailer keys (DCO sign-off, GitHub co-authorship, kernel reviews).
|
|
62
|
+
# A real email in any of these is a strong identity signal: trailers are
|
|
63
|
+
# typically added intentionally by tooling (`git commit -s`, GitHub UI's
|
|
64
|
+
# "Co-authored-by", patch-review workflows) rather than being auto-generated.
|
|
65
|
+
# A trailer is a single line. Only horizontal whitespace ([ \t]) is allowed
# between the key, the name and the address, and neither the name nor the
# address may contain a line break: with re.MULTILINE a bare `\s` would let a
# match start on e.g. an empty "Cc:" line and swallow an unrelated "<email>"
# from a following line. A trailing "\r" is tolerated for CRLF messages.
TRAILER_RE = re.compile(
    r"^(?P<key>Signed-off-by|Co-authored-by|Reviewed-by|Tested-by|"
    r"Reported-by|Acked-by|Suggested-by|Cc):[ \t]+"
    r"(?P<name>[^<\r\n]+?)[ \t]+<(?P<email>[^>\r\n]+)>[ \t]*\r?$",
    re.MULTILINE | re.IGNORECASE,
)
|
|
71
|
+
|
|
72
|
+
SYSTEM_EMAILS = frozenset({"noreply@github.com"})
|
|
73
|
+
|
|
74
|
+
logger = logging.getLogger("gitcolombo")
|
|
75
|
+
|
|
76
|
+
# Service noreply addresses from any vendor (github, anthropic, gitlab, ...)
|
|
77
|
+
# plus GitHub's user-private `{id}+{login}@users.noreply.github.com` pattern.
|
|
78
|
+
SYSTEM_EMAIL_RE = re.compile(
|
|
79
|
+
r'(^(?:noreply|no-reply|donotreply|do-not-reply)@|@users\.noreply\.github\.com$)',
|
|
80
|
+
re.IGNORECASE,
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def is_system_email(email):
    """Tell whether *email* looks like a service / no-reply address.

    Returns ``False`` for ``None`` or an empty string; otherwise ``True`` when
    ``SYSTEM_EMAIL_RE`` matches somewhere in the address.
    """
    if not email:
        return False
    return SYSTEM_EMAIL_RE.search(email) is not None
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
# ---------- Terminal styling ----------
|
|
89
|
+
|
|
90
|
+
# ANSI 256-color palette, picked to mirror the web UI's green-on-black look.
|
|
91
|
+
NEON = "\033[38;5;46m" # primary bright green
|
|
92
|
+
LIME = "\033[38;5;82m" # highlight (slightly lighter)
|
|
93
|
+
GREEN_DIM = "\033[38;5;34m" # secondary green
|
|
94
|
+
GREY = "\033[38;5;240m" # faint borders / dot-leaders
|
|
95
|
+
RED = "\033[38;5;196m" # warnings / noreply tags
|
|
96
|
+
BOLD = "\033[1m"
|
|
97
|
+
RESET = "\033[0m"
|
|
98
|
+
|
|
99
|
+
BANNER = r"""
|
|
100
|
+
░██████╗░██╗████████╗░█████╗░░█████╗░██╗░░░░░░█████╗░███╗░░░███╗██████╗░░█████╗░
|
|
101
|
+
██╔════╝░██║╚══██╔══╝██╔══██╗██╔══██╗██║░░░░░██╔══██╗████╗░████║██╔══██╗██╔══██╗
|
|
102
|
+
██║░░██╗░██║░░░██║░░░██║░░╚═╝██║░░██║██║░░░░░██║░░██║██╔████╔██║██████╦╝██║░░██║
|
|
103
|
+
██║░░╚██╗██║░░░██║░░░██║░░██╗██║░░██║██║░░░░░██║░░██║██║╚██╔╝██║██╔══██╗██║░░██║
|
|
104
|
+
╚██████╔╝██║░░░██║░░░╚█████╔╝╚█████╔╝███████╗╚█████╔╝██║░╚═╝░██║██████╦╝╚█████╔╝
|
|
105
|
+
░╚═════╝░╚═╝░░░╚═╝░░░░╚════╝░░╚════╝░╚══════╝░╚════╝░╚═╝░░░░░╚═╝╚═════╝░░╚════╝░
|
|
106
|
+
:: git commit osint ::
|
|
107
|
+
"""
|
|
108
|
+
|
|
109
|
+
_COLOR_ENABLED = False
|
|
110
|
+
RULE_WIDTH = 80
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _setup_colors(force_off: bool) -> None:
    """Decide whether ANSI colours are emitted and store it in ``_COLOR_ENABLED``.

    Colours are disabled when *force_off* is true or the ``NO_COLOR``
    environment variable is set to a non-empty value. Otherwise they are
    enabled only if ``sys.stdout`` reports being a terminal.
    """
    global _COLOR_ENABLED
    wanted = not (force_off or os.environ.get("NO_COLOR"))
    if wanted:
        try:
            wanted = sys.stdout.isatty()
        except Exception:
            # Any failure while probing stdout means "no colours".
            wanted = False
    _COLOR_ENABLED = wanted
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _c(code: str, text: str) -> str:
    """Wrap *text* in the ANSI sequence *code* followed by ``RESET``.

    When colours are disabled (``_COLOR_ENABLED`` is false) *text* is returned
    unchanged.
    """
    if not _COLOR_ENABLED:
        return text
    return f"{code}{text}{RESET}"
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _rule(width: int = RULE_WIDTH) -> str:
    """Return a grey horizontal rule made of *width* box-drawing dashes."""
    dashes = "─" * width
    return _c(GREY, dashes)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _tag(text: str, color: str = GREEN_DIM) -> str:
    """Return *text* enclosed in square brackets, coloured with *color*."""
    bracketed = f"[{text}]"
    return _c(color, bracketed)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _email_with_tag(email: str) -> str:
    """Return the bare *email*, followed by a red ``[noreply]`` tag for service addresses."""
    pieces = [_c(NEON, email)]
    if is_system_email(email):
        pieces.append(_tag("noreply", RED))
    return " ".join(pieces)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _email_brackets(email: str) -> str:
    """Return ``<email>``; service addresses get a ``[noreply]`` tag after the closing bracket."""
    bracketed = "".join((_c(GREEN_DIM, "<"), _c(NEON, email), _c(GREEN_DIM, ">")))
    if not is_system_email(email):
        return bracketed
    return bracketed + " " + _tag("noreply", RED)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _section(title: str) -> list[str]:
    """Return the lines of a section header: blank, rule, ``[ title ]``, rule, blank."""
    heading = _c(GREEN_DIM, f"[ {title} ]")
    return ["", _rule(), heading, _rule(), ""]
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
# ---------- HTTP helpers ----------
|
|
157
|
+
|
|
158
|
+
def _http_get(url: str) -> bytes | None:
    """Fetch *url* with the tool's User-Agent and return the raw response body.

    Returns ``None`` (after a DEBUG log entry) when the request or the body
    read fails, so callers never have to handle transport errors.
    """
    # Function-scope import keeps the file's import block untouched;
    # http.client is already loaded by urllib.request.
    from http.client import HTTPException

    req = urllib.request.Request(url, headers={"User-Agent": HTTP_USER_AGENT})
    try:
        with urllib.request.urlopen(req, timeout=HTTP_TIMEOUT) as resp:
            return resp.read()
    except (OSError, HTTPException) as exc:
        # OSError is the base of URLError/HTTPError and TimeoutError (the
        # previously caught set) and also covers connection resets raised
        # while reading the body; HTTPException covers truncated responses
        # (IncompleteRead). Neither is wrapped in URLError by resp.read().
        logger.debug("GET %s failed: %s", url, exc)
        return None
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _http_get_json(url: str):
    """GET *url* and decode the body as UTF-8 JSON.

    Returns the decoded object, or ``None`` when the request failed or the
    body is not valid UTF-8 JSON (the latter is logged at DEBUG).
    """
    raw = _http_get(url)
    if raw is None:
        return None
    try:
        text = raw.decode("utf-8")
        return json.loads(text)
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        logger.debug("Bad JSON from %s: %s", url, exc)
    return None
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
# ---------- GitHub API ----------
|
|
180
|
+
|
|
181
|
+
def _gh_authed(url: str, token: str | None):
    """GET *url* from the GitHub API and return the decoded JSON payload.

    Sends the GitHub JSON ``Accept`` header and, when *token* is truthy, an
    ``Authorization: Bearer`` header (for higher rate limits).

    Returns ``None`` when the request fails or the body is not valid UTF-8
    JSON; both cases are logged at DEBUG.
    """
    # Function-scope import keeps the file's import block untouched;
    # http.client is already loaded by urllib.request.
    from http.client import HTTPException

    headers = {
        "Accept": "application/vnd.github+json",
        "User-Agent": HTTP_USER_AGENT,
    }
    if token:
        headers["Authorization"] = "Bearer " + token
    req = urllib.request.Request(url, headers=headers)
    try:
        with urllib.request.urlopen(req, timeout=HTTP_TIMEOUT) as resp:
            payload = resp.read()
    except (OSError, HTTPException) as exc:
        # OSError is the base of URLError/HTTPError and TimeoutError and also
        # covers connection resets during the body read; HTTPException covers
        # truncated responses (IncompleteRead).
        logger.debug("GET %s failed: %s", url, exc)
        return None
    try:
        return json.loads(payload.decode("utf-8"))
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        logger.debug("Bad JSON from %s: %s", url, exc)
        return None
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def get_public_repos_count(nickname: str) -> int:
    """Return the ``public_repos`` value of *nickname*'s GitHub profile.

    Returns 0 when the profile could not be fetched or the field is absent.
    """
    profile = _http_get_json(GITHUB_USER_URL.format(nickname=nickname))
    return int(profile.get("public_repos", 0)) if profile else 0
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def get_github_repos(
    nickname: str, repos_count: int, include_forks: bool = False,
) -> set[str]:
    """Collect the ``html_url`` of every repo listed for *nickname*.

    *repos_count* determines how many listing pages are requested. Forks are
    left out unless *include_forks* is true. A page that yields no data is
    counted as failed and skipped with a WARNING. A one-line summary (seen /
    forks / kept / failed pages) is logged at INFO.
    """
    if repos_count <= 0:
        return set()

    # Ceiling division: number of pages needed to cover repos_count entries.
    page_total = -(-repos_count // GITHUB_PER_PAGE)
    kept: set[str] = set()
    seen = forks_skipped = failed_pages = 0

    for page_no in range(1, page_total + 1):
        listing_url = GITHUB_REPOS_URL.format(
            nickname=nickname, per_page=GITHUB_PER_PAGE, page=page_no,
        )
        listing = _http_get_json(listing_url)
        if not listing:
            failed_pages += 1
            logger.warning(
                "repos listing page %d/%d returned no data (rate limit? "
                "try GITHUB_TOKEN env var)", page_no, page_total,
            )
            continue
        for entry in listing:
            seen += 1
            if not entry.get("fork") or include_forks:
                kept.add(entry["html_url"])
            else:
                forks_skipped += 1

    failure_note = f", {failed_pages} page(s) failed" if failed_pages else ""
    fork_verb = "kept" if include_forks else "skipped"
    logger.info(
        "listing: %d seen, %d forks %s, %d kept%s",
        seen, forks_skipped, fork_verb, len(kept), failure_note,
    )
    return kept
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def resolve_github_username(repo_url: str, commit_hash: str) -> str | None:
    """Scrape the commit's GitHub page for the author's login.

    Only ``https://github.com/`` repository URLs are handled. Returns the
    first ``commits?author=<login>`` link target found on the page, or
    ``None`` when the URL is not a GitHub one, the page could not be fetched,
    or no such link is present.
    """
    if not repo_url.startswith("https://github.com/"):
        return None
    base = repo_url.rstrip("/")
    html = _http_get(f"{base}/commit/{commit_hash}")
    if html is None:
        return None
    found = GITHUB_COMMIT_AUTHOR_RE.search(html.decode("utf-8", errors="replace"))
    if found is None:
        return None
    return found.group(1)
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def get_gpg_keys_emails(nickname: str, token: str | None = None):
    """Yield the emails attached to *nickname*'s uploaded GPG keys.

    Data comes from ``/users/{u}/gpg_keys``. For every key, the key itself is
    inspected first (source ``"primary"``), then each of its ``subkeys``
    (source ``"subkey"``). Entries flagged ``revoked`` are skipped, and each
    address is yielded once only (compared case-insensitively).

    Yields dicts: {email, verified, key_id, created_at, source}.
    """
    keys = _gh_authed(GITHUB_GPG_KEYS_URL.format(nickname=nickname), token)
    if not keys:
        return

    emitted: set[str] = set()
    for key in keys:
        for source in ("primary", "subkey"):
            # Resolved lazily so subkeys are only looked up after the primary
            # key's emails have been yielded.
            holders = (key,) if source == "primary" else (key.get("subkeys") or [])
            for holder in holders:
                if not holder or holder.get("revoked"):
                    continue
                key_id = holder.get("key_id", "")
                created_at = holder.get("created_at", "")
                for uid in (holder.get("emails") or []):
                    address = uid.get("email")
                    if not address:
                        continue
                    folded = address.lower()
                    if folded in emitted:
                        continue
                    emitted.add(folded)
                    yield {
                        "email": address,
                        "verified": bool(uid.get("verified")),
                        "key_id": key_id,
                        "created_at": created_at,
                        "source": source,
                    }
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def print_gpg_results(results, ignore_noreply: bool = True) -> bool:
    """Pretty-print get_gpg_keys_emails() output.

    Rows whose email is a service address are dropped when *ignore_noreply*
    is true. Verified addresses are listed first, then by email.

    Returns True if anything was printed, False when no rows remain.
    """
    rows = [
        r for r in results
        if not (ignore_noreply and is_system_email(r["email"]))
    ]
    if not rows:
        return False
    for line in _section("pgp key uids"):
        print(line)
    print(" " + _c(GREEN_DIM, "source: /users/{u}/gpg_keys (user-uploaded)"))
    print()
    rows.sort(key=lambda r: (not r["verified"], r["email"]))
    for r in rows:
        flag_color = LIME if r["verified"] else GREEN_DIM
        flag = _tag("verified" if r["verified"] else "unverified", flag_color)
        # Pad the email column by its *visible* width. Applying a `:40`
        # format spec to the coloured string would count the invisible ANSI
        # escape bytes and misalign the column whenever colours are on.
        visible_width = len(r["email"])
        if is_system_email(r["email"]):
            visible_width += len(" [noreply]")
        padding = " " * max(0, 40 - visible_width)
        print(" {arrow} {email}{pad} {flag} {kid}={key} {src}".format(
            arrow=_c(LIME, "▶"),
            email=_email_with_tag(r["email"]),
            pad=padding,
            flag=flag,
            kid=_c(GREEN_DIM, "key_id"),
            key=_c(NEON, r["key_id"] or "?"),
            src=_tag(r["source"]),
        ))
    print()
    return True
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
def search_commits_by_author(nickname: str, token: str | None = None):
    """Walk ``/search/commits?q=author:{u}`` and yield the identities found.

    For each returned commit the author and committer are considered first,
    then every well-known trailer (Signed-off-by, Co-authored-by, ...) found
    in the commit message. Each distinct (email, name, role) combination —
    compared case-insensitively — is yielded once. Paging stops at
    ``GITHUB_SEARCH_MAX_PAGES``, on a failed/empty page, or on a short page.

    Yields dicts: {email, name, role, repo, sha, date}.
    """

    def candidates(commit):
        # Author/committer identities straight from the commit metadata.
        for role in ("author", "committer"):
            who = commit.get(role) or {}
            address = who.get("email")
            if address:
                yield address, who.get("name") or "", role
        # Identities named in trailers of the commit message body.
        for hit in TRAILER_RE.finditer(commit.get("message") or ""):
            t_email = (hit.group("email") or "").strip()
            t_name = (hit.group("name") or "").strip()
            if not t_email:
                continue
            # reject malformed names: ':' implies another trailer label was
            # crammed onto the same line; '@' implies a @-mention or stray
            # handle. Real personal names don't contain either.
            if ":" in t_name or "@" in t_name:
                continue
            yield t_email, t_name, hit.group("key").lower()

    emitted: set[tuple[str, str, str]] = set()
    for page_no in range(1, GITHUB_SEARCH_MAX_PAGES + 1):
        data = _gh_authed(
            GITHUB_SEARCH_COMMITS_URL.format(
                nickname=nickname, per_page=GITHUB_PER_PAGE, page=page_no,
            ),
            token,
        )
        if not data:
            return
        items = data.get("items") or []
        if not items:
            return
        for item in items:
            commit = item.get("commit") or {}
            repo = (item.get("repository") or {}).get("full_name", "")
            sha = item.get("sha", "")
            date = (commit.get("author") or {}).get("date", "")
            for address, who_name, role in candidates(commit):
                fingerprint = (address.lower(), who_name.lower(), role)
                if fingerprint in emitted:
                    continue
                emitted.add(fingerprint)
                yield {"email": address, "name": who_name, "role": role,
                       "repo": repo, "sha": sha, "date": date}
        # A short page means there is nothing left to fetch.
        if len(items) < GITHUB_PER_PAGE:
            return
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def print_search_results(results, ignore_noreply: bool = True) -> None:
    """Pretty-print search_commits_by_author() output grouped by (email, name).

    Service addresses are dropped when *ignore_noreply* is true. Groups are
    printed largest first; each shows its roles and up to five repositories,
    with a "+N more repos" line when there are more.
    """
    groups: dict[tuple[str, str], list[dict]] = {}
    for row in results:
        if ignore_noreply and is_system_email(row["email"]):
            continue
        groups.setdefault((row["email"], row["name"]), []).append(row)

    if not groups:
        print(_c(RED, "[!] no public commits found via /search/commits"))
        return

    for line in _section("commit search"):
        print(line)
    print(" " + _c(GREEN_DIM, "identities found: ") + _c(NEON, str(len(groups))))
    print()

    # reverse=True keeps the sort stable, so ties stay in discovery order.
    by_size = sorted(groups.items(), key=lambda kv: len(kv[1]), reverse=True)
    for (email, name), rows in by_size:
        repos = sorted({r["repo"] for r in rows if r["repo"]})
        roles = sorted({r["role"] for r in rows})
        print(" {arrow} {name} {brackets} {hits} {roles}".format(
            arrow=_c(LIME, "▶"),
            name=_c(BOLD + NEON, name or "?"),
            brackets=_email_brackets(email),
            hits=_c(LIME, f"×{len(rows)}"),
            roles=_tag(", ".join(roles)),
        ))
        shown = repos[:5]
        overflow = len(repos) - len(shown)
        for idx, repo in enumerate(shown):
            # The closing branch is used only when no "+N more" line follows.
            closes_tree = overflow == 0 and idx == len(shown) - 1
            branch = "└─" if closes_tree else "├─"
            print(" " + _c(GREEN_DIM, branch) + " "
                  + _c(GREEN_DIM, "repo ") + _c(NEON, repo))
        if overflow > 0:
            print(" " + _c(GREEN_DIM, "└─ ")
                  + _c(GREEN_DIM, f"... +{overflow} more repos"))
        print()
|
|
432
|
+
|
|
433
|
+
|
|
434
|
+
# ---------- Filesystem helpers ----------
|
|
435
|
+
|
|
436
|
+
def find_all_repos_recursively(path: str) -> list[str]:
    """List every directory under *path* that has a ``.git`` subdirectory.

    The ``.git`` directories themselves are not descended into; the rest of
    each repository is still walked.
    """
    found: list[str] = []
    for root, subdirs, _files in os.walk(path):
        if ".git" not in subdirs:
            continue
        found.append(root)
        # Prune in place so os.walk skips the .git internals.
        subdirs.remove(".git")
    return found
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
# ---------- Git subprocess ----------
|
|
447
|
+
|
|
448
|
+
def git_log(repo_dir: str) -> str:
    """Run ``git log --all`` in *repo_dir* and return its stdout.

    Output uses ``GIT_LOG_FORMAT``. Returns an empty string when *repo_dir*
    is not a directory or the ``git`` binary is missing. A non-zero exit
    status is logged at DEBUG and whatever stdout was produced is returned.
    """
    # Check first: subprocess raises FileNotFoundError for a missing cwd too,
    # which would otherwise be misreported as a missing git binary.
    if not os.path.isdir(repo_dir):
        logger.error("repository directory %r does not exist", repo_dir)
        return ""
    try:
        result = subprocess.run(
            ["git", "log", f"--pretty={GIT_LOG_FORMAT}", "--all"],
            cwd=repo_dir, check=False, capture_output=True,
            # Decode explicitly and leniently: author names in old commits
            # are not always valid UTF-8, and the locale codec may differ.
            text=True, encoding="utf-8", errors="replace",
        )
    except FileNotFoundError:
        logger.error("'git' binary not found")
        return ""
    if result.returncode != 0:
        logger.debug("git log failed in %s: %s", repo_dir, result.stderr.strip())
    return result.stdout
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
def _clone_target_dir(url: str) -> str:
    """Derive the local directory name for *url*.

    This is the last ``/``-separated path segment (trailing slashes ignored)
    with a ``.git`` suffix removed.
    """
    _, _, leaf = url.rstrip("/").rpartition("/")
    if leaf.endswith(".git"):
        leaf = leaf[: -len(".git")]
    return leaf
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
def git_clone(url: str, dest_dir: str) -> str | None:
    """Clone *url* into *dest_dir*/<repo-name>. Returns the cloned path or None.

    *dest_dir* is created if needed. ``None`` is returned when the ``git``
    binary is missing (logged at ERROR) or the clone exits non-zero (logged
    at DEBUG with git's stderr).
    """
    os.makedirs(dest_dir, exist_ok=True)
    target = os.path.join(dest_dir, _clone_target_dir(url))
    try:
        result = subprocess.run(
            # "--" ends option parsing, so a URL that starts with "-" can
            # never be interpreted by git as a command-line option.
            ["git", "clone", "--", url, target],
            check=False, capture_output=True,
            # Lenient decoding: stderr is only used for a debug message and
            # must not raise if it contains undecodable bytes.
            text=True, errors="replace",
        )
    except FileNotFoundError:
        logger.error("'git' binary not found")
        return None
    if result.returncode != 0:
        logger.debug("git clone failed for %s: %s", url, result.stderr.strip())
        return None
    return target
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
def _short_url(url: str, width: int = 50) -> str:
    """Shorten *url* for the progress line.

    URLs of at most *width* characters are returned as-is. Longer ones are
    reduced to "…" plus their last two path segments, and that string is
    cut to its final *width* characters.
    """
    if len(url) <= width:
        return url
    segments = url.rstrip("/").split("/")
    shortened = "…" + "/".join(segments[-2:])
    return shortened[-width:]
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
def clone_many(
    urls: list[str],
    dest_dir: str,
    workers: int = CLONE_WORKERS,
) -> dict[str, str | None]:
    """Clone *urls* concurrently. Returns {url: local_path or None}.

    Each URL is cloned with git_clone() on a thread pool of at least one and
    at most *workers* threads. The returned path for a URL is its expected
    target directory if that directory contains a ``.git`` subdirectory once
    all workers have finished, otherwise None.

    Prints a live progress line to stderr (overwritten on TTY, line-per-tick
    otherwise) so the user can see what's happening during long clone batches.
    """
    total = len(urls)
    if total == 0:
        return {}

    results: dict[str, str | None] = {}
    # Shared progress counters; mutated by workers while holding `lock`.
    state = {"done": 0, "ok": 0, "fail": 0, "current": ""}
    lock = threading.Lock()
    started = time.monotonic()
    is_tty = False
    try:
        is_tty = sys.stderr.isatty()
    except Exception:
        # Any failure while probing stderr is treated as "not a terminal".
        pass

    # Last `done` value printed in non-TTY mode (dict so render() can mutate it).
    last_done = {"value": -1}

    def render(final: bool = False) -> None:
        # Build and emit one progress line from the current `state`.
        elapsed = time.monotonic() - started
        fail_chunk = _c(RED, f"fail={state['fail']}") if state["fail"] else \
            _c(GREEN_DIM, "fail=0")
        line = (
            _c(GREEN_DIM, "[*] ")
            + _c(LIME, "cloning ")
            + _c(NEON, f"{state['done']}/{total}")
            + " " + _c(GREEN_DIM, f"ok={state['ok']}")
            + " " + fail_chunk
            + " " + _c(GREEN_DIM, f"{elapsed:>4.0f}s")
        )
        # The URL being cloned is shown only on intermediate updates.
        if state["current"] and not final:
            line += " " + _c(GREEN_DIM, "· ") + _c(NEON, state["current"])
        if is_tty:
            # \r + clear-to-end-of-line keeps the progress on a single line.
            sys.stderr.write("\r\033[K" + line)
            if final:
                sys.stderr.write("\n")
            sys.stderr.flush()
        else:
            # Non-TTY: avoid a flood of identical "0/N" lines while threads
            # pick up their first job. Only emit when the done counter ticks
            # forward (or on the final summary).
            if final or state["done"] != last_done["value"]:
                last_done["value"] = state["done"]
                sys.stderr.write(line + "\n")

    def worker(url: str) -> None:
        # Announce the URL, clone it outside the lock, then record the outcome.
        with lock:
            state["current"] = _short_url(url)
            render()
        path = git_clone(url, dest_dir)
        with lock:
            state["done"] += 1
            if path:
                state["ok"] += 1
            else:
                state["fail"] += 1
            # Don't keep stale "current" once this thread is done; the next
            # worker that picks up a job will overwrite it.
            state["current"] = ""
            render()

    render()  # initial 0/total
    try:
        with ThreadPoolExecutor(max_workers=max(1, workers)) as pool:
            futures = {pool.submit(worker, url): url for url in urls}
            for fut in futures:
                try:
                    fut.result()
                except Exception as exc:  # pragma: no cover - defensive
                    logger.debug("clone worker for %s raised: %s", futures[fut], exc)
    finally:
        # Always finish with a summary line, even if the pool block raised.
        with lock:
            state["current"] = ""
            render(final=True)

    # Map each URL to its deterministic target path so callers get a stable
    # {url: path|None} contract regardless of completion order.
    for url in urls:
        target = os.path.join(dest_dir, _clone_target_dir(url))
        results[url] = target if os.path.isdir(os.path.join(target, ".git")) else None
    return results
|
|
583
|
+
|
|
584
|
+
|
|
585
|
+
# ---------- Data classes ----------
|
|
586
|
+
|
|
587
|
+
def _split_name_email(raw: str) -> tuple[str, str]:
    """Split a ``"name email"`` string into ``(name, email)``.

    The email is the last whitespace-free token; everything before the
    whitespace preceding it is the name. When *raw* does not have that shape
    an error is logged and ``("", "")`` is returned.
    """
    parsed = GIT_NAME_EMAIL_RE.match(raw)
    if parsed is not None:
        name, email = parsed.groups()
        return name, email
    logger.error("Could not extract name/email from %r", raw)
    return "", ""
|
|
593
|
+
|
|
594
|
+
|
|
595
|
+
@dataclass
class Commit:
    """One record of the ``git log`` output produced with ``GIT_LOG_FORMAT``."""

    hash: str
    # Raw "name email" strings exactly as captured from the log line.
    author: str
    committer: str
    # The same identities split into their name and email parts.
    author_name: str
    author_email: str
    committer_name: str
    committer_email: str

    @property
    def author_committer_same(self) -> bool:
        """True when author and committer share both name and email."""
        authored_by = (self.author_name, self.author_email)
        committed_by = (self.committer_name, self.committer_email)
        return authored_by == committed_by

    @classmethod
    def parse(cls, line: str) -> "Commit | None":
        """Build a Commit from one log line; log an error and return None if it does not match."""
        found = GIT_LOG_LINE_RE.search(line)
        if found is None:
            logger.error("Could not parse commit line %r", line)
            return None
        commit_hash, author, committer = found.groups()
        author_name, author_email = _split_name_email(author)
        committer_name, committer_email = _split_name_email(committer)
        return cls(
            commit_hash, author, committer,
            author_name, author_email, committer_name, committer_email,
        )

    def __str__(self) -> str:
        """Return a five-line ``label: value`` summary, each line newline-terminated."""
        labelled = (
            ("Hash", self.hash),
            ("Author name", self.author_name),
            ("Author email", self.author_email),
            ("Committer name", self.committer_name),
            ("Committer email", self.committer_email),
        )
        return "".join(f"{label}: {value}\n" for label, value in labelled)
|
|
631
|
+
|
|
632
|
+
|
|
633
|
+
@dataclass
class Person:
    """Aggregated record for one identity, with counters and linked aliases."""

    key: str
    name: str = ""
    email: str = ""
    # Number of commits seen with this identity in each role.
    as_author: int = 0
    as_committer: int = 0
    # Other Person records linked to this one, stored by their key.
    also_known: dict[str, "Person"] = field(default_factory=dict)
    github_login: str | None = None
    repo_url: str | None = None
    last_commit_hash: str | None = None

    def __str__(self) -> str:
        """Render a headline (▶ name <email>) followed by a tree of detail rows.

        Rows appear in this order, each only when applicable: author count,
        committer count, GitHub profile URL, then one row per alias.
        """
        out = [
            " {arrow} {name} {brackets}".format(
                arrow=_c(LIME, "▶"),
                name=_c(BOLD + NEON, self.name or "?"),
                brackets=_email_brackets(self.email),
            )
        ]

        details: list[tuple[str, str]] = []
        if self.as_author:
            details.append(("author", _c(LIME, f"×{self.as_author}")))
        if self.as_committer:
            details.append(("committer", _c(LIME, f"×{self.as_committer}")))
        if self.github_login:
            profile = f"https://github.com/{self.github_login}"
            details.append(("github", _c(LIME, profile) + " " + _tag("verified", LIME)))
        details.extend(
            ("alias", f"{alias.name} {_email_brackets(alias.email)}")
            for alias in self.also_known.values()
        )

        final_index = len(details) - 1
        for index, (label, value) in enumerate(details):
            # The last row closes the tree with a corner connector.
            connector = "└─" if index == final_index else "├─"
            out.append(
                " " + _c(GREEN_DIM, connector) + " "
                + _c(GREEN_DIM, f"{label:<10}") + " " + value
            )
        return "\n".join(out)
|
|
672
|
+
|
|
673
|
+
|
|
674
|
+
# ---------- Analyst ----------
|
|
675
|
+
|
|
676
|
+
class GitAnalyst:
|
|
677
|
+
def __init__(self, repos_dir: str = DEFAULT_REPOS_DIR) -> None:
|
|
678
|
+
self.repos_dir = repos_dir
|
|
679
|
+
self.commits: list[Commit] = []
|
|
680
|
+
self.persons: dict[str, Person] = {}
|
|
681
|
+
self.name_to_emails: dict[str, set[str]] = defaultdict(set)
|
|
682
|
+
self.repos: list[str] = []
|
|
683
|
+
self.same_emails_persons: dict[str, tuple[list[str], set[str]]] = {}
|
|
684
|
+
|
|
685
|
+
def append(self, source: str, *, cloned_path: str | None = None) -> None:
|
|
686
|
+
if cloned_path is not None:
|
|
687
|
+
repo_dir = cloned_path
|
|
688
|
+
elif "://" in source:
|
|
689
|
+
repo_dir = git_clone(source, self.repos_dir)
|
|
690
|
+
if repo_dir is None:
|
|
691
|
+
return
|
|
692
|
+
else:
|
|
693
|
+
repo_dir = source
|
|
694
|
+
|
|
695
|
+
self.repos.append(repo_dir)
|
|
696
|
+
log_output = git_log(repo_dir)
|
|
697
|
+
new_commits = [
|
|
698
|
+
c for c in (Commit.parse(line) for line in log_output.splitlines() if line)
|
|
699
|
+
if c is not None
|
|
700
|
+
]
|
|
701
|
+
self.commits.extend(new_commits)
|
|
702
|
+
self._analyze(new_commits, source)
|
|
703
|
+
|
|
704
|
+
@property
|
|
705
|
+
def sorted_persons(self) -> list[tuple[str, Person]]:
|
|
706
|
+
return sorted(
|
|
707
|
+
self.persons.items(),
|
|
708
|
+
key=lambda item: item[1].as_author + item[1].as_committer,
|
|
709
|
+
)
|
|
710
|
+
|
|
711
|
+
def resolve_persons(self) -> None:
|
|
712
|
+
targets = [
|
|
713
|
+
p for p in self.persons.values()
|
|
714
|
+
if p.email not in SYSTEM_EMAILS and p.repo_url and p.last_commit_hash
|
|
715
|
+
]
|
|
716
|
+
if not targets:
|
|
717
|
+
return
|
|
718
|
+
with ThreadPoolExecutor(max_workers=RESOLVE_WORKERS) as pool:
|
|
719
|
+
futures = {
|
|
720
|
+
pool.submit(resolve_github_username, p.repo_url, p.last_commit_hash): p
|
|
721
|
+
for p in targets
|
|
722
|
+
}
|
|
723
|
+
for fut, person in futures.items():
|
|
724
|
+
login = fut.result()
|
|
725
|
+
if login:
|
|
726
|
+
person.github_login = login
|
|
727
|
+
|
|
728
|
+
def _upsert(
|
|
729
|
+
self, key: str, name: str, email: str, repo_url: str, commit_hash: str,
|
|
730
|
+
) -> Person:
|
|
731
|
+
person = self.persons.get(key) or Person(key=key)
|
|
732
|
+
person.name = name
|
|
733
|
+
person.email = email
|
|
734
|
+
person.repo_url = repo_url
|
|
735
|
+
person.last_commit_hash = commit_hash
|
|
736
|
+
self.persons[key] = person
|
|
737
|
+
return person
|
|
738
|
+
|
|
739
|
+
def _analyze(self, new_commits: Iterable[Commit], repo_url: str) -> None:
|
|
740
|
+
for commit in new_commits:
|
|
741
|
+
author = self._upsert(
|
|
742
|
+
commit.author, commit.author_name, commit.author_email,
|
|
743
|
+
repo_url, commit.hash,
|
|
744
|
+
)
|
|
745
|
+
author.as_author += 1
|
|
746
|
+
|
|
747
|
+
committer = self._upsert(
|
|
748
|
+
commit.committer, commit.committer_name, commit.committer_email,
|
|
749
|
+
repo_url, commit.hash,
|
|
750
|
+
)
|
|
751
|
+
committer.as_committer += 1
|
|
752
|
+
|
|
753
|
+
if not commit.author_committer_same:
|
|
754
|
+
author.also_known[commit.committer] = committer
|
|
755
|
+
committer.also_known[commit.author] = author
|
|
756
|
+
|
|
757
|
+
self.name_to_emails[commit.author_name].add(commit.author_email)
|
|
758
|
+
self.name_to_emails[commit.committer_name].add(commit.committer_email)
|
|
759
|
+
|
|
760
|
+
# Group names that share the exact same set of emails — these are
|
|
761
|
+
# treated as the same person. O(n) instead of the previous O(n²).
|
|
762
|
+
emails_to_names: dict[frozenset[str], list[str]] = defaultdict(list)
|
|
763
|
+
for name, emails in self.name_to_emails.items():
|
|
764
|
+
emails_to_names[frozenset(emails)].append(name)
|
|
765
|
+
self.same_emails_persons = {
|
|
766
|
+
",".join(sorted(names)): (sorted(names), set(emails))
|
|
767
|
+
for emails, names in emails_to_names.items()
|
|
768
|
+
if len(names) > 1
|
|
769
|
+
}
|
|
770
|
+
|
|
771
|
+
def __str__(self) -> str:
    """Render the report as one newline-joined string.

    The report has three sections, emitted in this order:

    1. ``stats`` - counts of repos, commits and persons, then the list
       of scanned targets drawn as a tree.
    2. ``correlation`` - emitted only when there is something to show:
       names seen with more than one email, and groups of names from
       ``self.same_emails_persons``.
    3. ``identities`` - ``str(person)`` for each entry of
       ``self.sorted_persons``, each followed by an empty line.
    """

    def _branch(index: int, size: int) -> str:
        # The final tree entry gets the closing corner glyph.
        return "└─" if index == size - 1 else "├─"

    lines: list[str] = []

    # --- 1. stats -------------------------------------------------------
    lines += _section("stats")
    counted = {
        "repos": self.repos,
        "commits": self.commits,
        "persons": self.persons,
    }
    for title, items in counted.items():
        filler = "." * (16 - len(title))
        lines.append(
            f" {_c(GREEN_DIM, title)} {_c(GREY, filler)} {_c(NEON, str(len(items)))}"
        )
    lines.append("")
    lines.append(" " + _c(GREEN_DIM, "targets"))
    for pos, target in enumerate(self.repos):
        glyph = _branch(pos, len(self.repos))
        lines.append(" " + _c(GREEN_DIM, glyph) + " " + _c(NEON, target))

    # --- 2. correlation -------------------------------------------------
    # Names that were seen with two or more distinct emails.
    multi_email: list[str] = []
    for who, addresses in self.name_to_emails.items():
        if len(addresses) < 2:
            continue
        ordered = sorted(addresses)
        rows = [
            " {bang} {name} {arrow} {n} emails".format(
                bang=_c(RED, "[!]"),
                name=_c(BOLD + NEON, who),
                arrow=_c(GREEN_DIM, "→"),
                n=_c(LIME, str(len(ordered))),
            )
        ]
        rows.extend(
            " " + _c(GREEN_DIM, _branch(k, len(ordered))) + " " + _email_with_tag(addr)
            for k, addr in enumerate(ordered)
        )
        multi_email.append("\n".join(rows))

    # Groups of names recorded in same_emails_persons.
    clusters: list[str] = [
        " " + _c(RED, "[!]") + " " + _c(GREEN_DIM, "same person:") + " "
        + _c(BOLD + NEON, (" " + _c(GREEN_DIM, "≡") + " ").join(aliases))
        for aliases, _unused in self.same_emails_persons.values()
    ]

    if multi_email or clusters:
        lines += _section("correlation")
    if multi_email:
        lines += ["\n\n".join(multi_email), ""]
    if clusters:
        lines += [*clusters, ""]

    # --- 3. identities --------------------------------------------------
    lines += _section("identities")
    for _, individual in self.sorted_persons:
        lines += [str(individual), ""]

    return "\n".join(lines)
|
|
834
|
+
|
|
835
|
+
|
|
836
|
+
# ---------- CLI ----------
|
|
837
|
+
|
|
838
|
+
def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Build the command-line parser and parse the arguments.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is what the original
            zero-argument call did; passing a list allows the parser to
            be driven programmatically.

    Returns:
        The parsed namespace. Attributes: ``dir``, ``url``, ``github``,
        ``nickname``, ``search``, ``no_ignore_noreply``, ``recursive``,
        ``repos_dir``, ``clone_workers``, ``include_forks``, ``debug``,
        ``no_color``.
    """
    parser = argparse.ArgumentParser(
        prog="gitcolombo",
        description="Extract accounts' information from git repo and make some researches.",
    )

    # Input selection.
    parser.add_argument("-d", "--dir", help="directory with git project(s)")
    parser.add_argument("-u", "--url", help="url of git repo")
    parser.add_argument(
        "--github", action="store_true",
        help="try to extract extended info from GitHub",
    )
    parser.add_argument(
        "--nickname", type=str,
        help="download repos from GitHub by nickname",
    )
    parser.add_argument(
        "--search", type=str, metavar="USERNAME",
        help="API-only path: query /users/{u}/gpg_keys + /search/commits "
             "for emails (no cloning, ~1000 commit results max)",
    )
    parser.add_argument(
        "--no-ignore-noreply", action="store_true",
        help="do not filter service noreply addresses from --search results",
    )
    parser.add_argument(
        "-r", "--recursive", action="store_true",
        help="recursive directory processing",
    )

    # Cloning behaviour.
    parser.add_argument(
        "--repos-dir", default=DEFAULT_REPOS_DIR,
        help=f"directory to clone remote repositories into (default: {DEFAULT_REPOS_DIR})",
    )
    parser.add_argument(
        "--clone-workers", type=int, default=CLONE_WORKERS,
        help=f"parallel git-clone workers (default: {CLONE_WORKERS})",
    )
    parser.add_argument(
        "--include-forks", action="store_true",
        help="include forked repositories (default: skipped — forks add upstream "
             "history that is not the target user's work)",
    )

    # Output behaviour.
    parser.add_argument("--debug", action="store_true", help="print debug information")
    parser.add_argument(
        "--no-color", action="store_true",
        help="disable ANSI colors (also honored via NO_COLOR env var or non-TTY stdout)",
    )
    return parser.parse_args(argv)
|
|
885
|
+
|
|
886
|
+
|
|
887
|
+
def _collect_sources(args: argparse.Namespace) -> list[str]:
    """Gather every repository source requested on the command line.

    Sources are appended in this order: the ``--url`` value, the ``--dir``
    value (plus whatever the recursive finder returns when ``--recursive``
    is set), then the repositories listed for ``--nickname``.

    Args:
        args: Parsed arguments; reads ``url``, ``dir``, ``recursive``,
            ``nickname`` and ``include_forks``.

    Returns:
        The collected sources; empty when no input option was given.
    """
    sources: list[str] = []

    if args.url:
        sources.append(args.url)

    if args.dir:
        # Trailing slashes are dropped, but a path made only of slashes
        # (the filesystem root) must not collapse to an empty string.
        sources.append(args.dir.rstrip("/") or "/")
        # A recursive walk only makes sense with a directory, so it lives
        # under the directory check and is never handed ``None``.
        if args.recursive:
            sources.extend(find_all_repos_recursively(args.dir))

    if args.nickname:
        count = get_public_repos_count(args.nickname)
        if count:
            logger.info("found %d public repos for %s", count, args.nickname)
            sources.extend(get_github_repos(
                args.nickname, repos_count=count,
                include_forks=args.include_forks,
            ))

    return sources
|
|
904
|
+
|
|
905
|
+
|
|
906
|
+
def main() -> None:
    """Command-line entry point.

    Parses the arguments, configures colors and logging, prints the
    banner, then runs one of two modes:

    * ``--search``: queries GPG-key emails and commit-search results for
      the given username (using ``GITHUB_TOKEN`` from the environment
      when set), prints them, and returns without touching any
      repository.
    * otherwise: collects sources, clones the ones containing ``://``,
      feeds local paths and successful clones to a ``GitAnalyst``,
      resolves persons, and prints the analyst's report.
    """
    options = _parse_args()
    _setup_colors(force_off=options.no_color)

    verbosity = logging.INFO
    if options.debug:
        verbosity = logging.DEBUG
    log_format = _c(GREEN_DIM, "[*] ") + _c(LIME, "%(levelname)s") + " %(message)s"
    logging.basicConfig(level=verbosity, format=log_format)

    print(_c(NEON, BANNER), flush=True)

    # ---- API-only mode -------------------------------------------------
    username = options.search
    if username:
        api_token = os.environ.get("GITHUB_TOKEN")
        drop_noreply = not options.no_ignore_noreply

        key_emails = list(get_gpg_keys_emails(username, token=api_token))
        gpg_hit = print_gpg_results(key_emails, ignore_noreply=drop_noreply)

        commit_hits = list(search_commits_by_author(username, token=api_token))
        print_search_results(commit_hits, ignore_noreply=drop_noreply)

        if not (gpg_hit or commit_hits):
            print("No emails found via /gpg_keys or /search/commits.")
        return

    # ---- repository mode -----------------------------------------------
    targets = _collect_sources(options)
    if not targets:
        print("Run me with git repo link or path!")
        return

    analyst = GitAnalyst(repos_dir=options.repos_dir)

    # Anything containing "://" is cloned first; the rest is used as-is.
    remote: list[str] = []
    local: list[str] = []
    for target in targets:
        bucket = remote if "://" in target else local
        bucket.append(target)

    checkouts: dict[str, str | None] = {}
    if remote:
        logger.info(
            "cloning %d repo(s) into %s with %d workers",
            len(remote), options.repos_dir, options.clone_workers,
        )
        checkouts = clone_many(remote, options.repos_dir, workers=options.clone_workers)
        # A ``None`` path marks a clone that did not succeed.
        failures = sum(1 for location in checkouts.values() if location is None)
        if failures:
            logger.warning("%d clone(s) failed (see --debug for reasons)", failures)

    usable = {origin: location for origin, location in checkouts.items() if location}
    pending = len(local) + len(usable)
    if pending:
        logger.info("analyzing %d repo(s)...", pending)

    for location in local:
        analyst.append(location)
    for origin, location in usable.items():
        analyst.append(origin, cloned_path=location)

    if analyst.persons:
        logger.info("resolving GitHub usernames for %d identities...",
                    len(analyst.persons))
        analyst.resolve_persons()

    if not analyst.repos:
        print("Run me with git repo link or path!")
        return
    print(analyst)
|
gitcolombo/__main__.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gitcolombo
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: 🕵️ OSINT tool to extract identities (names, emails, GitHub logins) from git repositories and the GitHub API.
|
|
5
|
+
License: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Keywords: osint,git,github,email,investigation,recon,doxing
|
|
8
|
+
Author: Soxoj
|
|
9
|
+
Author-email: soxoj@protonmail.com
|
|
10
|
+
Requires-Python: >=3.10,<4.0
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Information Technology
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Natural Language :: English
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
22
|
+
Classifier: Topic :: Security
|
|
23
|
+
Classifier: Topic :: Software Development :: Version Control :: Git
|
|
24
|
+
Project-URL: Bug Tracker, https://github.com/soxoj/gitcolombo/issues
|
|
25
|
+
Project-URL: Homepage, https://pypi.org/project/gitcolombo
|
|
26
|
+
Project-URL: Repository, https://github.com/soxoj/gitcolombo
|
|
27
|
+
Project-URL: Web version, https://gitcolombo.soxoj.com
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# Gitcolombo
|
|
31
|
+
|
|
32
|
+
<p align="center">
|
|
33
|
+
<img src="https://telegra.ph/file/0730b125282266989e861.png" alt="Gitcolombo" width="320">
|
|
34
|
+
</p>
|
|
35
|
+
|
|
36
|
+
OSINT tool that extracts identities — names, emails, and links between
|
|
37
|
+
seemingly unrelated accounts — from git repositories and GitHub.
|
|
38
|
+
|
|
39
|
+
- **Python CLI** (`gitcolombo.py`) — clones repos, walks `git log`, and can
|
|
40
|
+
call the GitHub API for richer signals.
|
|
41
|
+
- **Web version** (`gitcolombo.html`) — a single static HTML file; open it
|
|
42
|
+
in a browser and query the GitHub API directly, no install.
|
|
43
|
+
|
|
44
|
+
For the full breakdown of where each email/name comes from
|
|
45
|
+
(PGP keys, public events, commit search, commit-message trailers, etc.)
|
|
46
|
+
see **[docs.md](./docs.md)**.
|
|
47
|
+
|
|
48
|
+
## Web version
|
|
49
|
+
|
|
50
|
+
Hosted at **<https://gitcolombo.soxoj.com>** — or open `gitcolombo.html`
|
|
51
|
+
locally. A single static HTML file that queries the GitHub API straight
|
|
52
|
+
from your browser; no install, no backend.
|
|
53
|
+
|
|
54
|
+
<p align="center">
|
|
55
|
+
<img src="gitcolombo.png" alt="Gitcolombo web version" width="640">
|
|
56
|
+
</p>
|
|
57
|
+
|
|
58
|
+
## Install
|
|
59
|
+
|
|
60
|
+
Requires Python 3.10+ and a working `git` binary. No third-party
|
|
61
|
+
Python dependencies.
|
|
62
|
+
|
|
63
|
+
```sh
|
|
64
|
+
pip install gitcolombo
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Or from source:
|
|
68
|
+
|
|
69
|
+
```sh
|
|
70
|
+
git clone https://github.com/Soxoj/gitcolombo
|
|
71
|
+
cd gitcolombo
|
|
72
|
+
pip install -e .
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Usage
|
|
76
|
+
|
|
77
|
+
```sh
|
|
78
|
+
# from any git URL
|
|
79
|
+
gitcolombo -u https://github.com/Soxoj/maigret
|
|
80
|
+
|
|
81
|
+
# from a local directory, recursively
|
|
82
|
+
gitcolombo -d ./maigret -r
|
|
83
|
+
|
|
84
|
+
# clone and scan every public repo of a GitHub user/org
|
|
85
|
+
gitcolombo --nickname octocat
|
|
86
|
+
|
|
87
|
+
# API-only: find emails for a GitHub username without cloning
|
|
88
|
+
gitcolombo --search Soxoj
|
|
89
|
+
|
|
90
|
+
# change where remote repos get cloned (default: ./repos)
|
|
91
|
+
gitcolombo -u https://github.com/Soxoj/maigret --repos-dir ./clones
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
`python -m gitcolombo` works equivalently if you'd rather not put the
|
|
95
|
+
script on `$PATH`.
|
|
96
|
+
|
|
97
|
+
Remote repositories are cloned into `./repos/` by default; override
|
|
98
|
+
with `--repos-dir`. For batch cloning from GitLab and Bitbucket groups
|
|
99
|
+
use [ghorg](https://github.com/gabrie30/ghorg).
|
|
100
|
+
|
|
101
|
+
## Output
|
|
102
|
+
|
|
103
|
+
- Per-person details: name, email, author/committer counts, and other
|
|
104
|
+
identities that may belong to the same person.
|
|
105
|
+
- Emails that share a name.
|
|
106
|
+
- Different names tied to the same email.
|
|
107
|
+
- General statistics across the scanned repos.
|
|
108
|
+
|
|
109
|
+
## Why it works
|
|
110
|
+
|
|
111
|
+
Developers often commit with one identity (e.g. work account), then
|
|
112
|
+
switch to another (e.g. personal account) and run `git commit --amend`,
|
|
113
|
+
forgetting that this rewrites the *committer* but leaves the original
|
|
114
|
+
*author* in place. The two roles drift apart, and that mismatch is
|
|
115
|
+
exactly what gitcolombo correlates.
|
|
116
|
+
|
|
117
|
+
Short explainer on author vs. committer:
|
|
118
|
+
<https://stackoverflow.com/questions/18750808/difference-between-author-and-committer-in-git>
|
|
119
|
+
|
|
120
|
+
## Testing
|
|
121
|
+
|
|
122
|
+
Stdlib-only test suite — no third-party dependencies. From the repo root
|
|
123
|
+
(after `pip install -e .`):
|
|
124
|
+
|
|
125
|
+
```sh
|
|
126
|
+
python3 -m unittest test_gitcolombo -v
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
The end-to-end test creates a real git repository in a temp directory,
|
|
130
|
+
so a working `git` binary is required (the test is skipped if `git` is
|
|
131
|
+
missing).
|
|
132
|
+
|
|
133
|
+
Tests run on every push and pull request via GitHub Actions
|
|
134
|
+
(`.github/workflows/tests.yml`) across Python 3.10–3.13.
|
|
135
|
+
|
|
136
|
+
## Further reading
|
|
137
|
+
|
|
138
|
+
- [docs.md](./docs.md) — extraction methods, ranking, filters, rate limits
|
|
139
|
+
- [RUS] <https://telegra.ph/Gitcolombo---OSINT-v-GitHub-03-02>
|
|
140
|
+
|
|
141
|
+
## Roadmap
|
|
142
|
+
|
|
143
|
+
- [x] Total statistics for repos in a directory
|
|
144
|
+
- [x] GitHub support: clone all repos from account/group
|
|
145
|
+
- [x] GitHub support: extract links to accounts from commit info
|
|
146
|
+
- [x] GitHub support: API pagination
|
|
147
|
+
- [x] Exclude "system" accounts (e.g. `noreply@github.com`, `@users.noreply.github.com`)
|
|
148
|
+
- [ ] Reverse mapping email → names (currently only name → emails)
|
|
149
|
+
- [ ] Probabilistic graph links based on shared names/emails and Levenshtein distance
|
|
150
|
+
- [ ] Other popular git platforms: GitLab, Bitbucket
|
|
151
|
+
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
gitcolombo/__init__.py,sha256=aJfzEQZl6JtjwpFYn1_j48lEsComsoWsZQ5EJGYVj3k,35250
|
|
2
|
+
gitcolombo/__main__.py,sha256=HNsAvF58v8vDKd30QQiy6unLZMV_rLhtd-f2ayu8cGY,158
|
|
3
|
+
gitcolombo/__version__.py,sha256=32_Z-_9Iv9cGj0Jhz1gbCYOfM2XVxLAzJVTA7xlIJLw,53
|
|
4
|
+
gitcolombo-0.3.0.dist-info/METADATA,sha256=aKeHSm3nLLOrCO3iNvIScLP03fIye1ChLt9yCLuGQj4,5076
|
|
5
|
+
gitcolombo-0.3.0.dist-info/WHEEL,sha256=eY7nduwzv-ldUxpzbRlxwvC693Hg6PX8bWDjEHjZ_dk,88
|
|
6
|
+
gitcolombo-0.3.0.dist-info/entry_points.txt,sha256=6oV6eGFTLs6-nlbgkveiv160yOvl7XpEyCuF2yuRIjs,46
|
|
7
|
+
gitcolombo-0.3.0.dist-info/licenses/LICENSE,sha256=l0jCeclcWOZMyeU46McXrp3OZrvtHWnn0-d-Qx1-WC0,1067
|
|
8
|
+
gitcolombo-0.3.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2020-2026 Soxoj
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|