praiser 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- praiser/__init__.py +7 -0
- praiser/__main__.py +6 -0
- praiser/cache.py +60 -0
- praiser/cli.py +285 -0
- praiser/config.py +86 -0
- praiser/crossforge.py +188 -0
- praiser/data/__init__.py +1 -0
- praiser/data/known_projects.json +95 -0
- praiser/discovery.py +218 -0
- praiser/extractors/__init__.py +54 -0
- praiser/extractors/authors.py +107 -0
- praiser/extractors/base.py +106 -0
- praiser/extractors/codeowners.py +133 -0
- praiser/extractors/contributors.py +89 -0
- praiser/extractors/curated.py +41 -0
- praiser/extractors/enhancement_proposals.py +254 -0
- praiser/extractors/governance.py +127 -0
- praiser/extractors/llm_founders.py +50 -0
- praiser/extractors/maintainers.py +157 -0
- praiser/extractors/manifests.py +184 -0
- praiser/extractors/ownership.py +29 -0
- praiser/extractors/packages.py +66 -0
- praiser/extractors/subcomponents.py +76 -0
- praiser/extractors/web_roles.py +131 -0
- praiser/extractors/wikidata.py +95 -0
- praiser/forge/__init__.py +35 -0
- praiser/forge/_http.py +86 -0
- praiser/forge/base.py +265 -0
- praiser/forge/bitbucket.py +171 -0
- praiser/forge/cgit.py +95 -0
- praiser/forge/gitea.py +179 -0
- praiser/forge/gitee.py +175 -0
- praiser/forge/github.py +309 -0
- praiser/forge/gitlab.py +197 -0
- praiser/github_client.py +462 -0
- praiser/identity.py +25 -0
- praiser/llm.py +247 -0
- praiser/models.py +253 -0
- praiser/pipeline.py +313 -0
- praiser/popularity.py +109 -0
- praiser/progress.py +67 -0
- praiser/registries.py +260 -0
- praiser/registry.py +311 -0
- praiser/render.py +219 -0
- praiser-0.1.0.dist-info/METADATA +350 -0
- praiser-0.1.0.dist-info/RECORD +50 -0
- praiser-0.1.0.dist-info/WHEEL +5 -0
- praiser-0.1.0.dist-info/entry_points.txt +2 -0
- praiser-0.1.0.dist-info/licenses/LICENSE +28 -0
- praiser-0.1.0.dist-info/top_level.txt +1 -0
praiser/__init__.py
ADDED
praiser/__main__.py
ADDED
praiser/cache.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Dead-simple file-based JSON cache keyed by a request hash.
|
|
2
|
+
|
|
3
|
+
Re-runs and LLM steps reuse cached payloads instead of re-fetching. Values
|
|
4
|
+
must be JSON-serialisable. A ``None`` value is stored faithfully and returned
|
|
5
|
+
as ``None`` on hit, so callers distinguish hit-with-None from miss via
|
|
6
|
+
``has()`` or the ``default`` sentinel.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import hashlib
|
|
10
|
+
import json
|
|
11
|
+
import os
|
|
12
|
+
import threading
|
|
13
|
+
import time
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
_MISS = object()
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Cache:
|
|
21
|
+
def __init__(self, directory: Path | str, ttl: float | None = None) -> None:
|
|
22
|
+
self.dir = Path(directory)
|
|
23
|
+
self.ttl = ttl # seconds; None = never expire
|
|
24
|
+
self.dir.mkdir(parents=True, exist_ok=True)
|
|
25
|
+
|
|
26
|
+
@staticmethod
|
|
27
|
+
def key(*parts: Any) -> str:
|
|
28
|
+
blob = json.dumps(parts, sort_keys=True, default=str)
|
|
29
|
+
return hashlib.sha256(blob.encode("utf-8")).hexdigest()
|
|
30
|
+
|
|
31
|
+
def _path(self, key: str) -> Path:
|
|
32
|
+
return self.dir / f"{key}.json"
|
|
33
|
+
|
|
34
|
+
def has(self, key: str) -> bool:
|
|
35
|
+
return self.get(key, default=_MISS) is not _MISS
|
|
36
|
+
|
|
37
|
+
def get(self, key: str, default: Any = None) -> Any:
|
|
38
|
+
path = self._path(key)
|
|
39
|
+
if not path.exists():
|
|
40
|
+
return default
|
|
41
|
+
try:
|
|
42
|
+
with path.open(encoding="utf-8") as fh:
|
|
43
|
+
record = json.load(fh)
|
|
44
|
+
except (json.JSONDecodeError, OSError):
|
|
45
|
+
return default
|
|
46
|
+
if self.ttl is not None and time.time() - record.get("ts", 0) > self.ttl:
|
|
47
|
+
return default
|
|
48
|
+
return record.get("value")
|
|
49
|
+
|
|
50
|
+
def set(self, key: str, value: Any) -> None:
|
|
51
|
+
path = self._path(key)
|
|
52
|
+
# Unique temp name per writer so concurrent writes (threads) to the same
|
|
53
|
+
# key don't clobber each other's temp file; the final replace is atomic.
|
|
54
|
+
tmp = path.with_suffix(f".{os.getpid()}.{threading.get_ident()}.tmp")
|
|
55
|
+
try:
|
|
56
|
+
with tmp.open("w", encoding="utf-8") as fh:
|
|
57
|
+
json.dump({"ts": time.time(), "value": value}, fh)
|
|
58
|
+
tmp.replace(path)
|
|
59
|
+
finally:
|
|
60
|
+
tmp.unlink(missing_ok=True)
|
praiser/cli.py
ADDED
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
"""Command-line entry point: ``praiser <username> [...]``."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
from . import __version__
|
|
8
|
+
from . import llm as _llm
|
|
9
|
+
from .config import Config, resolve_token
|
|
10
|
+
from .github_client import RateLimitError
|
|
11
|
+
from .pipeline import _humanize, run
|
|
12
|
+
from .render import render, render_highlights
|
|
13
|
+
|
|
14
|
+
# Setup instructions printed when an LLM feature needs Anthropic credentials.
ANTHROPIC_HELP = " ".join((
    "Use EITHER an API key — https://console.anthropic.com/settings/keys, then",
    "`export ANTHROPIC_API_KEY=<key>` (pay-as-you-go) — OR your Claude",
    "subscription: run `claude setup-token` and",
    "`export CLAUDE_CODE_OAUTH_TOKEN=<token>`. Also install the extra:",
    "pip install 'praiser[llm]'.",
))
|
|
22
|
+
|
|
23
|
+
# Printed wherever a GitHub token would help. Public-data discovery works with
# a scope-less token; `repo` + `read:org` additionally reach private repos and
# resolve org/team membership.
TOKEN_HELP = " ".join((
    "Get a token at https://github.com/settings/tokens (classic: no scopes",
    "needed for public data; add 'repo' and 'read:org' for private/org access;",
    "fine-grained: read-only 'Contents' + 'Members'), then run",
    "`export GITHUB_TOKEN=<token>` or pass --token. Or just `gh auth login`.",
))
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _token_hint(token_source: str) -> str:
|
|
34
|
+
"""A leading-newline hint about tokens, tailored to where ours came from."""
|
|
35
|
+
if token_source == "none":
|
|
36
|
+
return (
|
|
37
|
+
"\nA token raises the limit from ~60 to 5,000 requests/hour. "
|
|
38
|
+
+ TOKEN_HELP
|
|
39
|
+
)
|
|
40
|
+
if token_source == "gh":
|
|
41
|
+
return (
|
|
42
|
+
"\nYou're authenticated via the gh CLI (already 5,000 requests/hour), "
|
|
43
|
+
"so a different token won't raise the limit — just wait and re-run. "
|
|
44
|
+
"To use an explicit token instead, set GITHUB_TOKEN: " + TOKEN_HELP
|
|
45
|
+
)
|
|
46
|
+
# flag / env: the user already supplied a token; 5,000/hr is the ceiling.
|
|
47
|
+
return ""
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def build_parser() -> argparse.ArgumentParser:
    """Construct the ``praiser`` command-line parser.

    Options are registered in the order they should appear in ``--help``.
    """
    parser = argparse.ArgumentParser(
        prog="praiser",
        description=(
            "Record the popular projects a user maintains, steers, or "
            "authors standards for (contributors excluded). Scans GitHub "
            "by default, or Codeberg / GitLab via --forge."
        ),
    )
    add = parser.add_argument
    toggle = argparse.BooleanOptionalAction  # generates --x / --no-x pairs

    # --- who and where -----------------------------------------------------
    add("username", help="login to investigate (on the chosen --forge)")
    add(
        "--forge",
        choices=["github", "codeberg", "gitlab", "gitee", "bitbucket", "cgit"],
        default="github",
        help=(
            "code host to scan (default: github); 'codeberg' uses "
            "the Gitea/Forgejo API, 'gitlab' the GitLab API, 'gitee' "
            "the Gitee API, 'bitbucket' the Bitbucket API, 'cgit' an "
            "API-less cgit host (e.g. kernel.org, Savannah) via "
            "--forge-url + --add-repo"
        ),
    )
    add(
        "--forge-url",
        default=None,
        metavar="URL",
        help=(
            "base URL of a self-hosted instance for --forge "
            "gitlab|codeberg (e.g. https://gitlab.gnome.org or a "
            "private Gitea); default: the public host"
        ),
    )
    add(
        "--forge-name",
        default=None,
        metavar="LABEL",
        help=(
            "short label for the --forge-url instance (default: the "
            "forge's own name)"
        ),
    )
    add(
        "--cross-forge",
        action="store_true",
        help=(
            "follow verified cross-links on the user's profile to "
            "their accounts on other forges (bidirectional only) and "
            "scan them all into one merged record"
        ),
    )
    add(
        "--also-forge",
        action="append",
        default=[],
        metavar="FORGE:LOGIN",
        dest="also_forge",
        help=(
            "also scan this identity on another forge (repeatable), "
            "e.g. --also-forge gitlab:johnsmith; merged into one record"
        ),
    )

    # --- filtering and output shape ----------------------------------------
    add(
        "--min-stars",
        type=int,
        default=50,
        help=(
            "popularity threshold (default: 50); high-signal roles "
            "and registry overrides survive regardless"
        ),
    )
    add(
        "--format",
        choices=["md", "json"],
        default=None,
        dest="fmt",
        help=(
            "emit the full report as md or json (default output is "
            "the highlights summary)"
        ),
    )
    add(
        "--highlights",
        nargs="?",
        type=int,
        const=8,
        default=None,
        metavar="N",
        help=(
            "top-N highlights, one line each (this is the default "
            "view; N defaults to 8)"
        ),
    )

    # --- credentials and storage -------------------------------------------
    add(
        "--token",
        default=None,
        help="GitHub token (or set GITHUB_TOKEN / GH_TOKEN)",
    )
    add(
        "--cache-dir",
        default=None,
        help="cache directory (default: ~/.cache/praiser)",
    )
    add(
        "--registry",
        default=None,
        dest="registry_path",
        help=(
            "known-projects JSON file, merged over the seed "
            "(default: ~/.local/share/praiser/known_projects.json)"
        ),
    )
    add(
        "--save-registry",
        action=toggle,
        default=True,
        help=(
            "persist observed popularity and web-discovered role "
            "sources to the registry (default: on)"
        ),
    )

    # --- optional discovery sources ----------------------------------------
    add(
        "--no-llm",
        action="store_true",
        help=(
            "disable all Claude features (prose fallback + role "
            "discovery)"
        ),
    )
    add(
        "--discover-roles",
        action=toggle,
        default=True,
        help=(
            "for popular repos, use Claude + web search to find "
            "official team/governance pages AND to identify the "
            "project's founder(s)/creator(s) (default: on when LLM "
            "credentials are available; needs the llm extra + an API "
            "key or Claude subscription)"
        ),
    )
    add(
        "--wikidata",
        action=toggle,
        default=True,
        help=(
            "derive creator/founder/developer roles for popular "
            "projects from Wikidata (handle-matched; default: on)"
        ),
    )
    add(
        "--package-registries",
        action=toggle,
        default=True,
        help=(
            "also look up the user on PyPI, npm and crates.io to "
            "credit packages they maintain/author and surface the "
            "repos those ship from (default: on)"
        ),
    )
    add(
        "--add-repo",
        action="append",
        default=[],
        metavar="OWNER/REPO[:PATH]",
        dest="extra_repos",
        help=(
            "also scan this repo even if discovery missed it "
            "(repeatable); append :PATH to credit a subcomponent "
            "(e.g. numpy/numpy:numpy/f2py). Role detected automatically"
        ),
    )
    add(
        "--include-private",
        action="store_true",
        help="also scan private repos (default: skip them)",
    )

    # --- run-time tuning and reporting -------------------------------------
    add(
        "--contributor-pages",
        type=int,
        default=2,
        metavar="N",
        help=(
            "contributors API pages to fetch per repo, 100 each "
            "(default: 2; lower = faster cold runs, may miss "
            "deep-ranked contributors)"
        ),
    )
    add(
        "-j", "--jobs",
        type=int,
        default=8,
        metavar="N",
        help="candidates scanned concurrently (default: 8)",
    )
    add(
        "-o", "--output",
        default=None,
        help="write output to a file instead of stdout",
    )
    add(
        "-v", "--verbose",
        action="store_true",
        help="detailed per-repo logging to stderr",
    )
    add(
        "-q", "--quiet",
        action="store_true",
        help="suppress the live progress display",
    )
    add(
        "--version",
        action="version",
        version=f"praiser {__version__}",
    )
    return parser
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the praiser CLI and return the process exit status.

    Args:
        argv: command-line arguments without the program name. ``None`` lets
            argparse read ``sys.argv[1:]``.

    Returns:
        ``0`` on success (including a rate-limited, partial result), ``1``
        when the run or writing the output file failed, ``130`` when the user
        interrupted the run with Ctrl-C.

    Raises:
        SystemExit: raised by argparse for ``--help``, ``--version`` and
            invalid arguments.
    """
    args = build_parser().parse_args(argv)

    if args.forge == "github":
        token, token_source = resolve_token(args.token)
        if not token:
            print(
                "warning: no GitHub token found; discovery and rate limits will be "
                "severely restricted (~60 requests/hour).\n" + TOKEN_HELP,
                file=sys.stderr,
            )
    else:
        # Every non-GitHub forge: --token wins, then the forge's own
        # environment variable(s). cgit has no token variable at all.
        token_envs = {
            "codeberg": ("CODEBERG_TOKEN", "FORGEJO_TOKEN"),
            "gitlab": ("GITLAB_TOKEN",),
            "gitee": ("GITEE_TOKEN",),
            "bitbucket": ("BITBUCKET_TOKEN",),
        }.get(args.forge, ())
        env_token = next((v for e in token_envs if (v := os.environ.get(e))), None)
        token = args.token or env_token
        token_source = "flag" if args.token else ("env" if token else "none")
        if args.forge == "bitbucket" and not token:
            print(
                "warning: Bitbucket's anonymous rate limit is very low (~60 "
                "requests/hour) — a multi-repo scan will be throttled. Set "
                "BITBUCKET_TOKEN (an app password or access token) or pass "
                "--token for a usable scan.",
                file=sys.stderr,
            )

    # Role discovery is on by default; only nag about missing creds/conflicts
    # when the user EXPLICITLY asked for it (default-on degrades silently).
    argv_tokens = argv if argv is not None else sys.argv[1:]
    if args.discover_roles and "--discover-roles" in argv_tokens:
        if args.no_llm:
            print("warning: --discover-roles needs the LLM, but --no-llm was "
                  "given; role discovery is off.", file=sys.stderr)
        else:
            reason = _llm.availability()
            if reason:
                print(f"warning: --discover-roles is on but {reason}; role "
                      f"discovery is off.\n{ANTHROPIC_HELP}", file=sys.stderr)

    # Split --add-repo values: "owner/repo" or "owner/repo:subpath".
    extra_repos: list[str] = []
    extra_subcomponents: dict[str, list[str]] = {}
    for item in args.extra_repos:
        repo, sep, path = item.partition(":")
        if "/" not in repo:
            # Tell the user instead of silently dropping what they asked for.
            print(f"warning: ignoring --add-repo {item!r}: expected "
                  "OWNER/REPO[:PATH].", file=sys.stderr)
            continue
        extra_repos.append(repo)
        if sep and path:
            extra_subcomponents.setdefault(repo, []).append(path)

    # Highlights is the default view; --format md|json switches to the full report.
    highlights = args.highlights
    if highlights is None and args.fmt is None:
        highlights = 8

    if args.forge_url and args.forge == "github":
        print("warning: --forge-url is ignored for --forge github (github.com "
              "is the only GitHub host praiser supports).", file=sys.stderr)

    config = Config(
        username=args.username,
        forge=args.forge,
        forge_url=args.forge_url,
        forge_name=args.forge_name,
        cross_forge=args.cross_forge,
        also_forge=args.also_forge,
        token=token,
        min_stars=args.min_stars,
        fmt=args.fmt or "md",
        highlights=highlights,
        cache_dir=args.cache_dir,
        use_llm=not args.no_llm,
        registry_path=args.registry_path,
        save_registry=args.save_registry,
        verbose=args.verbose,
        quiet=args.quiet,
        include_private=args.include_private,
        contributor_pages=args.contributor_pages,
        jobs=args.jobs,
        discover_roles=args.discover_roles,
        use_wikidata=args.wikidata,
        use_package_registries=args.package_registries,
        extra_repos=extra_repos,
        extra_subcomponents=extra_subcomponents,
    )

    try:
        result = run(config)
    except KeyboardInterrupt:
        # Cancelled by the user (Ctrl-C): exit quietly, no stack trace. The
        # cache keeps whatever already succeeded, so a re-run resumes.
        print("\ncancelled (partial work is cached; re-run to continue).",
              file=sys.stderr)
        return 130
    except RateLimitError as exc:
        print(
            "error: GitHub rate limit reached before discovery could run; "
            f"wait {_humanize(exc.reset_in)} for it to reset."
            + _token_hint(token_source),
            file=sys.stderr,
        )
        return 1
    except Exception as exc:
        # Top-level boundary: report any other failure as a one-line error.
        print(f"error: {exc}", file=sys.stderr)
        return 1

    if result.partial_reset_in is not None:
        print(
            "warning: GitHub rate limit reached during the run — results are "
            "PARTIAL (some repos were not fully scanned). Wait "
            f"{_humanize(result.partial_reset_in)} for the limit to reset, then "
            "re-run to finish; the cache preserves what already succeeded."
            + _token_hint(token_source),
            file=sys.stderr,
        )

    if config.highlights is not None:
        output = render_highlights(
            config.username, result.records, config.highlights, result.secondary
        )
    else:
        output = render(
            config.username, result.records, config.fmt, result.secondary
        )
    if args.output:
        try:
            with open(args.output, "w", encoding="utf-8") as fh:
                fh.write(output + "\n")
        except OSError as exc:
            # An unwritable path is a user-level error, not a crash.
            print(f"error: cannot write {args.output}: {exc}", file=sys.stderr)
            return 1
        print(f"wrote {args.output}", file=sys.stderr)
    else:
        print(output)
    return 0
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
if __name__ == "__main__":
|
|
285
|
+
raise SystemExit(main())
|
praiser/config.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""Runtime configuration: token, thresholds, paths."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import shutil
|
|
5
|
+
import subprocess
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def default_cache_dir() -> Path:
    """Return ``$XDG_CACHE_HOME/praiser``, or ``~/.cache/praiser`` when the
    variable is unset or empty."""
    xdg_cache = os.environ.get("XDG_CACHE_HOME")
    if not xdg_cache:
        xdg_cache = os.path.expanduser("~/.cache")
    return Path(xdg_cache, "praiser")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def default_registry_path() -> Path:
    """Return ``$XDG_DATA_HOME/praiser/known_projects.json``, or the same file
    under ``~/.local/share`` when the variable is unset or empty."""
    # Deliberately a data directory, not the cache: the cache is safe to wipe,
    # whereas learned role sources and popularity are meant to persist.
    xdg_data = os.environ.get("XDG_DATA_HOME")
    if not xdg_data:
        xdg_data = os.path.expanduser("~/.local/share")
    return Path(xdg_data, "praiser", "known_projects.json")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def resolve_token(explicit: str | None) -> tuple[str | None, str]:
|
|
23
|
+
"""Return (token, source). source is one of: flag, env, gh, none."""
|
|
24
|
+
if explicit:
|
|
25
|
+
return explicit, "flag"
|
|
26
|
+
for var in ("GITHUB_TOKEN", "GH_TOKEN"):
|
|
27
|
+
val = os.environ.get(var)
|
|
28
|
+
if val:
|
|
29
|
+
return val, "env"
|
|
30
|
+
gh = _gh_cli_token()
|
|
31
|
+
if gh:
|
|
32
|
+
return gh, "gh"
|
|
33
|
+
return None, "none"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _gh_cli_token() -> str | None:
|
|
37
|
+
"""Fall back to the GitHub CLI's token if `gh` is installed and logged in."""
|
|
38
|
+
if not shutil.which("gh"):
|
|
39
|
+
return None
|
|
40
|
+
try:
|
|
41
|
+
out = subprocess.run(
|
|
42
|
+
["gh", "auth", "token"],
|
|
43
|
+
capture_output=True, text=True, timeout=5,
|
|
44
|
+
)
|
|
45
|
+
except Exception:
|
|
46
|
+
return None
|
|
47
|
+
token = out.stdout.strip()
|
|
48
|
+
return token or None
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
|
|
52
|
+
class Config:
|
|
53
|
+
username: str
|
|
54
|
+
forge: str = "github" # "github" | "codeberg" | "gitlab" | ...
|
|
55
|
+
forge_url: str | None = None # base URL of a self-hosted instance
|
|
56
|
+
forge_name: str | None = None # short label for that instance
|
|
57
|
+
cross_forge: bool = False # follow verified profile links to other forges
|
|
58
|
+
also_forge: list[str] = field(default_factory=list) # extra "forge:login" ids
|
|
59
|
+
token: str | None = None
|
|
60
|
+
min_stars: int = 50
|
|
61
|
+
fmt: str = "md" # "md" | "json"
|
|
62
|
+
highlights: int | None = None # if set, print only the top-N highlights
|
|
63
|
+
cache_dir: Path | None = None
|
|
64
|
+
use_llm: bool = True
|
|
65
|
+
registry_path: Path | None = None # user known-projects file (defaults below)
|
|
66
|
+
save_registry: bool = True # persist learned popularity + role sources
|
|
67
|
+
verbose: bool = False
|
|
68
|
+
quiet: bool = False # suppress the default progress display
|
|
69
|
+
include_private: bool = False # scan private repos too (default: skip)
|
|
70
|
+
contributor_pages: int = 2 # contributors API pages (100 each)
|
|
71
|
+
jobs: int = 8 # concurrent candidates during attribution
|
|
72
|
+
discover_roles: bool = True # find role pages via LLM + web search
|
|
73
|
+
use_wikidata: bool = True # derive creator/developer roles via Wikidata
|
|
74
|
+
use_package_registries: bool = True # discover roles via PyPI/npm/crates.io
|
|
75
|
+
extra_repos: list[str] = field(default_factory=list) # user-supplied owner/repo
|
|
76
|
+
# user-supplied subcomponents: owner/repo -> [paths]
|
|
77
|
+
extra_subcomponents: dict[str, list[str]] = field(default_factory=dict)
|
|
78
|
+
|
|
79
|
+
def __post_init__(self) -> None:
|
|
80
|
+
if self.cache_dir is None:
|
|
81
|
+
self.cache_dir = default_cache_dir()
|
|
82
|
+
self.cache_dir = Path(self.cache_dir)
|
|
83
|
+
self.registry_path = (
|
|
84
|
+
default_registry_path() if self.registry_path is None
|
|
85
|
+
else Path(self.registry_path)
|
|
86
|
+
)
|
praiser/crossforge.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
"""Cross-forge identity resolution (issues #18, #25).
|
|
2
|
+
|
|
3
|
+
From a single anchor account, discover the person's accounts on *other* forges,
|
|
4
|
+
confirming each in one of two false-merge-resistant ways:
|
|
5
|
+
|
|
6
|
+
* **Bidirectional profile links** (#18): the candidate's profile links back to an
|
|
7
|
+
already-confirmed account. A false merge would need two different people to
|
|
8
|
+
link to each other.
|
|
9
|
+
* **Personal-site hub** (#25): people often list their accounts on a personal
|
|
10
|
+
site rather than in a forge bio. When a confirmed profile links to a non-forge
|
|
11
|
+
URL, we fetch that page (one hop, cached) and, if it's an *owned* hub (it also
|
|
12
|
+
links back to a confirmed account), accept the other forge accounts it lists —
|
|
13
|
+
provided the candidate shares the handle or display name. A link-farm guard
|
|
14
|
+
skips hubs that reference many distinct accounts.
|
|
15
|
+
|
|
16
|
+
Under-merge (someone who didn't cross-link at all) is safe; over-merge is
|
|
17
|
+
refused. The traversal is forge-agnostic (operates on ``Forge.profile_links`` +
|
|
18
|
+
a URL parser + ``Forge.get_url`` for the hub), so it's unit-testable with fakes.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
import re
|
|
22
|
+
|
|
23
|
+
from .forge import Forge
|
|
24
|
+
from .forge._http import extract_urls
|
|
25
|
+
from .models import FORGE_WEB_HOSTS, Identity
|
|
26
|
+
|
|
27
|
+
# Link-farm guard: a personal-site hub that references more distinct forge
# accounts than this is treated as a directory, not one person's identity page.
_MAX_HUB_ACCOUNTS = 6

# Web host with the scheme stripped, lower-cased -> forge name.
_HOST_FORGE = {
    web_host.split("://", 1)[1].lower(): forge_name
    for forge_name, web_host in FORGE_WEB_HOSTS.items()
}

# Values of the first path segment that never name a user profile.
_NON_PROFILE = {
    "-", "about", "apps", "dashboard", "explore", "features", "help",
    "marketplace", "orgs", "pricing", "settings", "sponsors", "topics", "users",
}

# A profile URL is host + exactly ONE path segment (the login), optionally
# followed by a trailing slash and a query string or fragment.
_PROFILE_RE = re.compile(r"https?://([^/\s]+)/([^/?#\s]+)/?(?:[?#].*)?$")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def parse_profile_url(url: str) -> tuple[str, str] | None:
    """Parse a forge profile URL into ``(forge_name, login)``.

    Returns ``None`` for an empty value, a URL on an unknown host, or a URL
    that is not a profile. Only a single path segment counts as a profile,
    so repo/group links such as ``github.com/owner/repo`` are rejected, as
    are reserved segments (``orgs``, ``settings``, …) and any segment
    containing a dot (``*.git``, files).
    """
    if not url:
        return None

    match = _PROFILE_RE.match(url.strip())
    if match is None:
        return None

    # Hosts are compared lower-cased and without a leading "www.".
    host = match.group(1).lower().removeprefix("www.")
    login = match.group(2)

    forge_name = _HOST_FORGE.get(host)
    if forge_name is None:
        return None

    is_profile = bool(login) and login.lower() not in _NON_PROFILE and "." not in login
    return (forge_name, login) if is_profile else None
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def resolve_cross_forge(
    anchor: Forge,
    anchor_login: str,
    forge_factory,
    *,
    max_accounts: int = 8,
) -> tuple[Identity, list[tuple[str, str]]]:
    """Resolve the person's accounts across forges from an anchor account.

    Starting from ``anchor_login`` on ``anchor``, walk breadth-first over the
    links on each confirmed profile. A linked account is confirmed when its
    own profile links back to an already-confirmed account, or when it is
    listed on a personal-site hub that itself links back to a confirmed
    account and the candidate shares a confirmed login or display name.

    Args:
        anchor: forge hosting the starting account.
        anchor_login: login of the starting account on ``anchor``.
        forge_factory: ``forge_factory(forge_name) -> Forge | None``; builds a
            forge for a name, ``None`` if unsupported.
        max_accounts: hard upper bound on confirmed accounts, anchor included.
            No account is confirmed once the bound is reached.

    Returns:
        The merged ``Identity`` and the confirmed ``[(forge_name, login), …]``
        (anchor included), for the executor to scan.
    """
    forges: dict[str, Forge] = {anchor.name: anchor}
    # Keyed by (forge, lower-cased login) for case-insensitive dedup; the
    # value keeps the login's original spelling.
    confirmed: dict[tuple[str, str], str] = {
        (anchor.name, anchor_login.lower()): anchor_login
    }
    names: set[str] = set()
    visited: set[tuple[str, str]] = set()
    visited_hubs: set[str] = set()
    queue: list[tuple[Forge, str]] = []

    def at_capacity() -> bool:
        return len(confirmed) >= max_accounts

    def links_of(forge: Forge, login: str) -> list[str]:
        # A throttled/flaky forge contributes no links rather than aborting the
        # whole resolution (best-effort, like discovery).
        try:
            return forge.profile_links(login)
        except Exception:
            return []

    def note_name(forge: Forge, login: str) -> None:
        # Record the account's display name, when the forge can supply one.
        try:
            user = forge.resolve_user(login)
        except Exception:
            return
        if user and user.name:
            names.add(user.name)

    def get_forge(fname: str) -> Forge | None:
        # Build each forge at most once per resolution.
        if fname not in forges:
            cforge = forge_factory(fname)
            if cforge is None:
                return None
            forges[fname] = cforge
        return forges[fname]

    def confirm(cforge: Forge, clogin: str) -> None:
        confirmed[(cforge.name, clogin.lower())] = clogin
        note_name(cforge, clogin)
        queue.append((cforge, clogin))  # its own links get explored later

    def name_matches(cforge: Forge, clogin: str) -> bool:
        # True when the candidate's display name equals (ignoring case and
        # surrounding whitespace) a display name already collected.
        try:
            user = cforge.resolve_user(clogin)
        except Exception:
            return False
        lowered = {n.strip().lower() for n in names}
        return bool(user and user.name and user.name.strip().lower() in lowered)

    note_name(anchor, anchor_login)
    queue.append((anchor, anchor_login))
    while queue and not at_capacity():
        forge, login = queue.pop(0)
        if (forge.name, login.lower()) in visited:
            continue
        visited.add((forge.name, login.lower()))

        # Sort this profile's links into forge profiles and other web pages
        # (potential personal-site hubs).
        forge_targets: list[tuple[str, str]] = []
        hub_urls: list[str] = []
        for url in links_of(forge, login):
            parsed = parse_profile_url(url)
            if parsed is not None:
                forge_targets.append(parsed)
            elif url.strip().lower().startswith(("http://", "https://")):
                hub_urls.append(url.strip())

        # 1. Direct bidirectional: the candidate's profile links back.
        for fname, clogin in forge_targets:
            if at_capacity():
                break
            if (fname, clogin.lower()) in confirmed:
                continue
            cforge = get_forge(fname)
            if cforge is None:
                continue
            back = {
                (p[0], p[1].lower())
                for u2 in links_of(cforge, clogin)
                if (p := parse_profile_url(u2))
            }
            if set(confirmed) & back:
                confirm(cforge, clogin)

        # 2. Personal-site hub (#25): fetch the site, and if it's an owned hub
        # (links back to a confirmed account), accept the accounts it lists that
        # share the handle or display name.
        for hub in hub_urls:
            if at_capacity():
                break
            if hub in visited_hubs:
                continue
            visited_hubs.add(hub)
            try:
                page = forge.get_url(hub)
            except Exception:
                page = None
            hub_accounts = {
                p for u2 in extract_urls(page or "") if (p := parse_profile_url(u2))
            }
            if not hub_accounts or len(hub_accounts) > _MAX_HUB_ACCOUNTS:
                continue  # empty, or a link-farm — not a personal identity hub
            hub_keys = {(f, ll.lower()) for f, ll in hub_accounts}
            if not (set(confirmed) & hub_keys):
                continue  # not reached-from + links-back: not the person's own hub
            confirmed_logins = {cl for (_, cl) in confirmed}
            # Sorted so that, when the cap cuts the list short, the same
            # accounts are chosen on every run (set order is not stable).
            for fname, clogin in sorted(hub_accounts):
                if at_capacity():
                    break
                if (fname, clogin.lower()) in confirmed:
                    continue
                cforge = get_forge(fname)
                if cforge is None:
                    continue
                if clogin.lower() in confirmed_logins or name_matches(cforge, clogin):
                    confirm(cforge, clogin)

    identity = Identity(
        primary_login=anchor_login,
        logins=set(confirmed.values()),
        names=names,
    )
    ids = [(fname, login) for (fname, _), login in confirmed.items()]
    return identity, ids
|
praiser/data/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Marks praiser.data as a package so importlib.resources can read bundled files.
|