praiser 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. praiser/__init__.py +7 -0
  2. praiser/__main__.py +6 -0
  3. praiser/cache.py +60 -0
  4. praiser/cli.py +285 -0
  5. praiser/config.py +86 -0
  6. praiser/crossforge.py +188 -0
  7. praiser/data/__init__.py +1 -0
  8. praiser/data/known_projects.json +95 -0
  9. praiser/discovery.py +218 -0
  10. praiser/extractors/__init__.py +54 -0
  11. praiser/extractors/authors.py +107 -0
  12. praiser/extractors/base.py +106 -0
  13. praiser/extractors/codeowners.py +133 -0
  14. praiser/extractors/contributors.py +89 -0
  15. praiser/extractors/curated.py +41 -0
  16. praiser/extractors/enhancement_proposals.py +254 -0
  17. praiser/extractors/governance.py +127 -0
  18. praiser/extractors/llm_founders.py +50 -0
  19. praiser/extractors/maintainers.py +157 -0
  20. praiser/extractors/manifests.py +184 -0
  21. praiser/extractors/ownership.py +29 -0
  22. praiser/extractors/packages.py +66 -0
  23. praiser/extractors/subcomponents.py +76 -0
  24. praiser/extractors/web_roles.py +131 -0
  25. praiser/extractors/wikidata.py +95 -0
  26. praiser/forge/__init__.py +35 -0
  27. praiser/forge/_http.py +86 -0
  28. praiser/forge/base.py +265 -0
  29. praiser/forge/bitbucket.py +171 -0
  30. praiser/forge/cgit.py +95 -0
  31. praiser/forge/gitea.py +179 -0
  32. praiser/forge/gitee.py +175 -0
  33. praiser/forge/github.py +309 -0
  34. praiser/forge/gitlab.py +197 -0
  35. praiser/github_client.py +462 -0
  36. praiser/identity.py +25 -0
  37. praiser/llm.py +247 -0
  38. praiser/models.py +253 -0
  39. praiser/pipeline.py +313 -0
  40. praiser/popularity.py +109 -0
  41. praiser/progress.py +67 -0
  42. praiser/registries.py +260 -0
  43. praiser/registry.py +311 -0
  44. praiser/render.py +219 -0
  45. praiser-0.1.0.dist-info/METADATA +350 -0
  46. praiser-0.1.0.dist-info/RECORD +50 -0
  47. praiser-0.1.0.dist-info/WHEEL +5 -0
  48. praiser-0.1.0.dist-info/entry_points.txt +2 -0
  49. praiser-0.1.0.dist-info/licenses/LICENSE +28 -0
  50. praiser-0.1.0.dist-info/top_level.txt +1 -0
praiser/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """praiser — record the popular projects a GitHub user has an elevated role in.
2
+
3
+ Elevated roles are maintainer / code owner / steering-council member /
4
+ standards author. Plain contributors are intentionally excluded.
5
+ """
6
+
7
+ __version__ = "0.1.0"
praiser/__main__.py ADDED
@@ -0,0 +1,6 @@
1
+ """Allow `python -m praiser`."""
2
+
3
+ from praiser.cli import main
4
+
5
+ if __name__ == "__main__":
6
+ raise SystemExit(main())
praiser/cache.py ADDED
@@ -0,0 +1,60 @@
1
+ """Dead-simple file-based JSON cache keyed by a request hash.
2
+
3
+ Re-runs and LLM steps reuse cached payloads instead of re-fetching. Values
4
+ must be JSON-serialisable. A ``None`` value is stored faithfully and returned
5
+ as ``None`` on hit, so callers distinguish hit-with-None from miss via
6
+ ``has()`` or the ``default`` sentinel.
7
+ """
8
+
9
+ import hashlib
10
+ import json
11
+ import os
12
+ import threading
13
+ import time
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+ _MISS = object()
18
+
19
+
20
+ class Cache:
21
+ def __init__(self, directory: Path | str, ttl: float | None = None) -> None:
22
+ self.dir = Path(directory)
23
+ self.ttl = ttl # seconds; None = never expire
24
+ self.dir.mkdir(parents=True, exist_ok=True)
25
+
26
+ @staticmethod
27
+ def key(*parts: Any) -> str:
28
+ blob = json.dumps(parts, sort_keys=True, default=str)
29
+ return hashlib.sha256(blob.encode("utf-8")).hexdigest()
30
+
31
+ def _path(self, key: str) -> Path:
32
+ return self.dir / f"{key}.json"
33
+
34
+ def has(self, key: str) -> bool:
35
+ return self.get(key, default=_MISS) is not _MISS
36
+
37
+ def get(self, key: str, default: Any = None) -> Any:
38
+ path = self._path(key)
39
+ if not path.exists():
40
+ return default
41
+ try:
42
+ with path.open(encoding="utf-8") as fh:
43
+ record = json.load(fh)
44
+ except (json.JSONDecodeError, OSError):
45
+ return default
46
+ if self.ttl is not None and time.time() - record.get("ts", 0) > self.ttl:
47
+ return default
48
+ return record.get("value")
49
+
50
+ def set(self, key: str, value: Any) -> None:
51
+ path = self._path(key)
52
+ # Unique temp name per writer so concurrent writes (threads) to the same
53
+ # key don't clobber each other's temp file; the final replace is atomic.
54
+ tmp = path.with_suffix(f".{os.getpid()}.{threading.get_ident()}.tmp")
55
+ try:
56
+ with tmp.open("w", encoding="utf-8") as fh:
57
+ json.dump({"ts": time.time(), "value": value}, fh)
58
+ tmp.replace(path)
59
+ finally:
60
+ tmp.unlink(missing_ok=True)
praiser/cli.py ADDED
@@ -0,0 +1,285 @@
1
+ """Command-line entry point: ``praiser <username> [...]``."""
2
+
3
+ import argparse
4
+ import os
5
+ import sys
6
+
7
+ from . import __version__
8
+ from . import llm as _llm
9
+ from .config import Config, resolve_token
10
+ from .github_client import RateLimitError
11
+ from .pipeline import _humanize, run
12
+ from .render import render, render_highlights
13
+
14
# Setup guidance appended to the warning main() prints when --discover-roles
# was requested explicitly but the LLM is unavailable. It covers both
# credential routes (API key or Claude subscription token) and the optional
# `llm` extra.
ANTHROPIC_HELP = (
    "Use EITHER an API key — https://console.anthropic.com/settings/keys, then "
    "`export ANTHROPIC_API_KEY=<key>` (pay-as-you-go) — OR your Claude "
    "subscription: run `claude setup-token` and "
    "`export CLAUDE_CODE_OAUTH_TOKEN=<token>`. Also install the extra: "
    "pip install 'praiser[llm]'."
)

# Shown whenever a token would help: in the "no GitHub token" warning and, via
# _token_hint(), after rate-limit messages. Public-data discovery needs no
# scopes; add `repo` + `read:org` to reach private repos and resolve org/team
# membership.
TOKEN_HELP = (
    "Get a token at https://github.com/settings/tokens (classic: no scopes "
    "needed for public data; add 'repo' and 'read:org' for private/org access; "
    "fine-grained: read-only 'Contents' + 'Members'), then run "
    "`export GITHUB_TOKEN=<token>` or pass --token. Or just `gh auth login`."
)
31
+
32
+
33
def _token_hint(token_source: str) -> str:
    """Return token advice (starting with a newline) for a rate-limit message.

    ``token_source`` is where the active token came from. ``"none"`` and
    ``"gh"`` each get tailored advice followed by ``TOKEN_HELP``; any other
    source (``"flag"`` / ``"env"``) yields an empty string, because the user
    already supplied a token and 5,000 requests/hour is the ceiling.
    """
    if token_source == "gh":
        lead = (
            "\nYou're authenticated via the gh CLI (already 5,000 requests/hour), "
            "so a different token won't raise the limit — just wait and re-run. "
            "To use an explicit token instead, set GITHUB_TOKEN: "
        )
    elif token_source == "none":
        lead = "\nA token raises the limit from ~60 to 5,000 requests/hour. "
    else:
        return ""
    return lead + TOKEN_HELP
48
+
49
+
50
def build_parser() -> argparse.ArgumentParser:
    """Build the ``praiser`` argument parser.

    Options are registered in the order they should appear in ``--help``:
    forge selection, identity merging, filtering and output, credentials and
    storage, feature toggles, scan scope and concurrency, then logging and
    ``--version``.
    """
    parser = argparse.ArgumentParser(
        prog="praiser",
        description="Record the popular projects a user maintains, steers, or "
                    "authors standards for (contributors excluded). Scans GitHub "
                    "by default, or Codeberg / GitLab via --forge.",
    )
    add = parser.add_argument
    # Generates the paired --flag / --no-flag forms.
    on_off = argparse.BooleanOptionalAction

    add("username", help="login to investigate (on the chosen --forge)")

    # --- which code host to scan ---
    add(
        "--forge",
        choices=["github", "codeberg", "gitlab", "gitee", "bitbucket", "cgit"],
        default="github",
        help="code host to scan (default: github); 'codeberg' uses "
             "the Gitea/Forgejo API, 'gitlab' the GitLab API, 'gitee' "
             "the Gitee API, 'bitbucket' the Bitbucket API, 'cgit' an "
             "API-less cgit host (e.g. kernel.org, Savannah) via "
             "--forge-url + --add-repo",
    )
    add(
        "--forge-url",
        default=None,
        metavar="URL",
        help="base URL of a self-hosted instance for --forge "
             "gitlab|codeberg (e.g. https://gitlab.gnome.org or a "
             "private Gitea); default: the public host",
    )
    add(
        "--forge-name",
        default=None,
        metavar="LABEL",
        help="short label for the --forge-url instance (default: the "
             "forge's own name)",
    )

    # --- merging identities across forges ---
    add(
        "--cross-forge",
        action="store_true",
        help="follow verified cross-links on the user's profile to "
             "their accounts on other forges (bidirectional only) and "
             "scan them all into one merged record",
    )
    add(
        "--also-forge",
        action="append",
        default=[],
        metavar="FORGE:LOGIN",
        dest="also_forge",
        help="also scan this identity on another forge (repeatable), "
             "e.g. --also-forge gitlab:johnsmith; merged into one record",
    )

    # --- filtering and output shape ---
    add(
        "--min-stars",
        type=int,
        default=50,
        help="popularity threshold (default: 50); high-signal roles "
             "and registry overrides survive regardless",
    )
    add(
        "--format",
        choices=["md", "json"],
        default=None,
        dest="fmt",
        help="emit the full report as md or json (default output is "
             "the highlights summary)",
    )
    add(
        "--highlights",
        nargs="?",
        type=int,
        const=8,
        default=None,
        metavar="N",
        help="top-N highlights, one line each (this is the default "
             "view; N defaults to 8)",
    )

    # --- credentials and on-disk state ---
    add(
        "--token",
        default=None,
        help="GitHub token (or set GITHUB_TOKEN / GH_TOKEN)",
    )
    add(
        "--cache-dir",
        default=None,
        help="cache directory (default: ~/.cache/praiser)",
    )
    add(
        "--registry",
        default=None,
        dest="registry_path",
        help="known-projects JSON file, merged over the seed "
             "(default: ~/.local/share/praiser/known_projects.json)",
    )
    add(
        "--save-registry",
        action=on_off,
        default=True,
        help="persist observed popularity and web-discovered role "
             "sources to the registry (default: on)",
    )

    # --- optional discovery features ---
    add(
        "--no-llm",
        action="store_true",
        help="disable all Claude features (prose fallback + role "
             "discovery)",
    )
    add(
        "--discover-roles",
        action=on_off,
        default=True,
        help="for popular repos, use Claude + web search to find "
             "official team/governance pages AND to identify the "
             "project's founder(s)/creator(s) (default: on when LLM "
             "credentials are available; needs the llm extra + an API "
             "key or Claude subscription)",
    )
    add(
        "--wikidata",
        action=on_off,
        default=True,
        help="derive creator/founder/developer roles for popular "
             "projects from Wikidata (handle-matched; default: on)",
    )
    add(
        "--package-registries",
        action=on_off,
        default=True,
        help="also look up the user on PyPI, npm and crates.io to "
             "credit packages they maintain/author and surface the "
             "repos those ship from (default: on)",
    )

    # --- scan scope and concurrency ---
    add(
        "--add-repo",
        action="append",
        default=[],
        metavar="OWNER/REPO[:PATH]",
        dest="extra_repos",
        help="also scan this repo even if discovery missed it "
             "(repeatable); append :PATH to credit a subcomponent "
             "(e.g. numpy/numpy:numpy/f2py). Role detected automatically",
    )
    add(
        "--include-private",
        action="store_true",
        help="also scan private repos (default: skip them)",
    )
    add(
        "--contributor-pages",
        type=int,
        default=2,
        metavar="N",
        help="contributors API pages to fetch per repo, 100 each "
             "(default: 2; lower = faster cold runs, may miss "
             "deep-ranked contributors)",
    )
    add(
        "-j", "--jobs",
        type=int,
        default=8,
        metavar="N",
        help="candidates scanned concurrently (default: 8)",
    )

    # --- output destination, logging, version ---
    add(
        "-o", "--output",
        default=None,
        help="write output to a file instead of stdout",
    )
    add(
        "-v", "--verbose",
        action="store_true",
        help="detailed per-repo logging to stderr",
    )
    add(
        "-q", "--quiet",
        action="store_true",
        help="suppress the live progress display",
    )
    add(
        "--version",
        action="version",
        version=f"praiser {__version__}",
    )
    return parser
145
+
146
+
147
def main(argv: list[str] | None = None) -> int:
    """Run the CLI: parse arguments, scan, and print or write the report.

    Args:
        argv: arguments without the program name; ``None`` means
            ``sys.argv[1:]``.

    Returns:
        Process exit status: 0 on success; 1 when the run fails (including a
        rate limit hit before discovery, or an output file that cannot be
        written); 130 when interrupted with Ctrl-C.

    All warnings and errors go to stderr; only the report goes to stdout.
    """
    args = build_parser().parse_args(argv)

    if args.forge == "github":
        token, token_source = resolve_token(args.token)
        if not token:
            print(
                "warning: no GitHub token found; discovery and rate limits will be "
                "severely restricted (~60 requests/hour).\n" + TOKEN_HELP,
                file=sys.stderr,
            )
    else:  # any non-GitHub forge — public data works unauthenticated
        # Forges missing from this table (cgit) have no token variable.
        token_envs = {
            "codeberg": ("CODEBERG_TOKEN", "FORGEJO_TOKEN"),
            "gitlab": ("GITLAB_TOKEN",),
            "gitee": ("GITEE_TOKEN",),
            "bitbucket": ("BITBUCKET_TOKEN",),
        }.get(args.forge, ())
        # First non-empty variable wins.
        env_token = next((v for e in token_envs if (v := os.environ.get(e))), None)
        token = args.token or env_token
        token_source = "flag" if args.token else ("env" if token else "none")
        if args.forge == "bitbucket" and not token:
            print(
                "warning: Bitbucket's anonymous rate limit is very low (~60 "
                "requests/hour) — a multi-repo scan will be throttled. Set "
                "BITBUCKET_TOKEN (an app password or access token) or pass "
                "--token for a usable scan.",
                file=sys.stderr,
            )

    # Role discovery is on by default; only nag about missing creds/conflicts
    # when the user EXPLICITLY asked for it (default-on degrades silently).
    argv_tokens = argv if argv is not None else sys.argv[1:]
    if args.discover_roles and "--discover-roles" in argv_tokens:
        if args.no_llm:
            print("warning: --discover-roles needs the LLM, but --no-llm was "
                  "given; role discovery is off.", file=sys.stderr)
        else:
            reason = _llm.availability()
            if reason:
                print(f"warning: --discover-roles is on but {reason}; role "
                      f"discovery is off.\n{ANTHROPIC_HELP}", file=sys.stderr)

    # Split --add-repo values: "owner/repo" or "owner/repo:subpath".
    extra_repos: list[str] = []
    extra_subcomponents: dict[str, list[str]] = {}
    for item in args.extra_repos:
        repo, sep, path = item.partition(":")
        if "/" not in repo:
            # Still skipped, but say so: a silently dropped repo looks exactly
            # like "scanned and found nothing".
            print(f"warning: ignoring --add-repo {item!r}: expected "
                  "OWNER/REPO[:PATH].", file=sys.stderr)
            continue
        extra_repos.append(repo)
        if sep and path:
            extra_subcomponents.setdefault(repo, []).append(path)

    # Highlights is the default view; --format md|json switches to the full report.
    highlights = args.highlights
    if highlights is None and args.fmt is None:
        highlights = 8

    if args.forge_url and args.forge == "github":
        print("warning: --forge-url is ignored for --forge github (github.com "
              "is the only GitHub host praiser supports).", file=sys.stderr)

    config = Config(
        username=args.username,
        forge=args.forge,
        forge_url=args.forge_url,
        forge_name=args.forge_name,
        cross_forge=args.cross_forge,
        also_forge=args.also_forge,
        token=token,
        min_stars=args.min_stars,
        fmt=args.fmt or "md",
        highlights=highlights,
        cache_dir=args.cache_dir,
        use_llm=not args.no_llm,
        registry_path=args.registry_path,
        save_registry=args.save_registry,
        verbose=args.verbose,
        quiet=args.quiet,
        include_private=args.include_private,
        contributor_pages=args.contributor_pages,
        jobs=args.jobs,
        discover_roles=args.discover_roles,
        use_wikidata=args.wikidata,
        use_package_registries=args.package_registries,
        extra_repos=extra_repos,
        extra_subcomponents=extra_subcomponents,
    )

    try:
        result = run(config)
    except KeyboardInterrupt:
        # Cancelled by the user (Ctrl-C): exit quietly, no stack trace. The
        # cache keeps whatever already succeeded, so a re-run resumes.
        print("\ncancelled (partial work is cached; re-run to continue).",
              file=sys.stderr)
        return 130
    except RateLimitError as exc:
        print(
            "error: GitHub rate limit reached before discovery could run; "
            f"wait {_humanize(exc.reset_in)} for it to reset."
            + _token_hint(token_source),
            file=sys.stderr,
        )
        return 1
    except Exception as exc:
        # Top-level boundary: report any failure as one line, not a traceback.
        print(f"error: {exc}", file=sys.stderr)
        return 1

    if result.partial_reset_in is not None:
        print(
            "warning: GitHub rate limit reached during the run — results are "
            "PARTIAL (some repos were not fully scanned). Wait "
            f"{_humanize(result.partial_reset_in)} for the limit to reset, then "
            "re-run to finish; the cache preserves what already succeeded."
            + _token_hint(token_source),
            file=sys.stderr,
        )

    if config.highlights is not None:
        output = render_highlights(
            config.username, result.records, config.highlights, result.secondary
        )
    else:
        output = render(
            config.username, result.records, config.fmt, result.secondary
        )
    if args.output:
        try:
            with open(args.output, "w", encoding="utf-8") as fh:
                fh.write(output + "\n")
        except OSError as exc:
            # Same one-line reporting as a failed run (bad path, permissions…).
            print(f"error: {exc}", file=sys.stderr)
            return 1
        print(f"wrote {args.output}", file=sys.stderr)
    else:
        print(output)
    return 0
282
+
283
+
284
+ if __name__ == "__main__":
285
+ raise SystemExit(main())
praiser/config.py ADDED
@@ -0,0 +1,86 @@
1
+ """Runtime configuration: token, thresholds, paths."""
2
+
3
+ import os
4
+ import shutil
5
+ import subprocess
6
+ from dataclasses import dataclass, field
7
+ from pathlib import Path
8
+
9
+
10
def default_cache_dir() -> Path:
    """Return ``praiser``'s cache directory.

    Uses ``$XDG_CACHE_HOME`` when it is set and non-empty, otherwise
    ``~/.cache``.
    """
    xdg_cache = os.environ.get("XDG_CACHE_HOME")
    root = xdg_cache if xdg_cache else os.path.expanduser("~/.cache")
    return Path(root, "praiser")
13
+
14
+
15
def default_registry_path() -> Path:
    """Return the default location of the user's known-projects registry.

    Uses ``$XDG_DATA_HOME`` when it is set and non-empty, otherwise
    ``~/.local/share``.
    """
    # Deliberately a data dir rather than the cache: the cache is safe to
    # wipe, while learned role sources and popularity should survive that.
    xdg_data = os.environ.get("XDG_DATA_HOME")
    root = xdg_data if xdg_data else os.path.expanduser("~/.local/share")
    return Path(root, "praiser", "known_projects.json")
20
+
21
+
22
+ def resolve_token(explicit: str | None) -> tuple[str | None, str]:
23
+ """Return (token, source). source is one of: flag, env, gh, none."""
24
+ if explicit:
25
+ return explicit, "flag"
26
+ for var in ("GITHUB_TOKEN", "GH_TOKEN"):
27
+ val = os.environ.get(var)
28
+ if val:
29
+ return val, "env"
30
+ gh = _gh_cli_token()
31
+ if gh:
32
+ return gh, "gh"
33
+ return None, "none"
34
+
35
+
36
def _gh_cli_token() -> str | None:
    """Fall back to the GitHub CLI's token if `gh` is installed and logged in.

    Returns the stripped stdout of ``gh auth token``, or ``None`` when ``gh``
    is not on PATH, cannot be run, does not finish within 5 seconds, exits
    with a non-zero status, or prints nothing.
    """
    if not shutil.which("gh"):
        return None
    try:
        out = subprocess.run(
            ["gh", "auth", "token"],
            capture_output=True, text=True, timeout=5,
        )
    except Exception:
        # Best-effort fallback: any failure to run gh just means "no token".
        return None
    # A failed invocation must not have its stdout mistaken for a token.
    if out.returncode != 0:
        return None
    return out.stdout.strip() or None
49
+
50
+
51
+ @dataclass
52
+ class Config:
53
+ username: str
54
+ forge: str = "github" # "github" | "codeberg" | "gitlab" | ...
55
+ forge_url: str | None = None # base URL of a self-hosted instance
56
+ forge_name: str | None = None # short label for that instance
57
+ cross_forge: bool = False # follow verified profile links to other forges
58
+ also_forge: list[str] = field(default_factory=list) # extra "forge:login" ids
59
+ token: str | None = None
60
+ min_stars: int = 50
61
+ fmt: str = "md" # "md" | "json"
62
+ highlights: int | None = None # if set, print only the top-N highlights
63
+ cache_dir: Path | None = None
64
+ use_llm: bool = True
65
+ registry_path: Path | None = None # user known-projects file (defaults below)
66
+ save_registry: bool = True # persist learned popularity + role sources
67
+ verbose: bool = False
68
+ quiet: bool = False # suppress the default progress display
69
+ include_private: bool = False # scan private repos too (default: skip)
70
+ contributor_pages: int = 2 # contributors API pages (100 each)
71
+ jobs: int = 8 # concurrent candidates during attribution
72
+ discover_roles: bool = True # find role pages via LLM + web search
73
+ use_wikidata: bool = True # derive creator/developer roles via Wikidata
74
+ use_package_registries: bool = True # discover roles via PyPI/npm/crates.io
75
+ extra_repos: list[str] = field(default_factory=list) # user-supplied owner/repo
76
+ # user-supplied subcomponents: owner/repo -> [paths]
77
+ extra_subcomponents: dict[str, list[str]] = field(default_factory=dict)
78
+
79
+ def __post_init__(self) -> None:
80
+ if self.cache_dir is None:
81
+ self.cache_dir = default_cache_dir()
82
+ self.cache_dir = Path(self.cache_dir)
83
+ self.registry_path = (
84
+ default_registry_path() if self.registry_path is None
85
+ else Path(self.registry_path)
86
+ )
praiser/crossforge.py ADDED
@@ -0,0 +1,188 @@
1
+ """Cross-forge identity resolution (issues #18, #25).
2
+
3
+ From a single anchor account, discover the person's accounts on *other* forges,
4
+ confirming each in one of two false-merge-resistant ways:
5
+
6
+ * **Bidirectional profile links** (#18): the candidate's profile links back to an
7
+ already-confirmed account. A false merge would need two different people to
8
+ link to each other.
9
+ * **Personal-site hub** (#25): people often list their accounts on a personal
10
+ site rather than in a forge bio. When a confirmed profile links to a non-forge
11
+ URL, we fetch that page (one hop, cached) and, if it's an *owned* hub (it also
12
+ links back to a confirmed account), accept the other forge accounts it lists —
13
+ provided the candidate shares the handle or display name. A link-farm guard
14
+ skips hubs that reference many distinct accounts.
15
+
16
+ Under-merge (someone who didn't cross-link at all) is safe; over-merge is
17
+ refused. The traversal is forge-agnostic (operates on ``Forge.profile_links`` +
18
+ a URL parser + ``Forge.get_url`` for the hub), so it's unit-testable with fakes.
19
+ """
20
+
21
+ import re
22
+
23
+ from .forge import Forge
24
+ from .forge._http import extract_urls
25
+ from .models import FORGE_WEB_HOSTS, Identity
26
+
27
# A personal-site hub linking to more distinct forge accounts than this is
# probably a directory/link-farm, not one person's identity page — skip it.
_MAX_HUB_ACCOUNTS = 6

# Lower-cased host (scheme stripped) -> forge name, inverted from
# FORGE_WEB_HOSTS. Lookups in parse_profile_url() drop a leading "www." first;
# presumably the FORGE_WEB_HOSTS values carry no "www." — verify in models.
_HOST_FORGE = {
    host.split("://", 1)[1].lower(): name for name, host in FORGE_WEB_HOSTS.items()
}
# First-path-segment values that are never a user profile (compared
# lower-cased).
_NON_PROFILE = {
    "sponsors", "orgs", "users", "topics", "features", "about", "settings",
    "marketplace", "apps", "explore", "help", "pricing", "-", "dashboard",
}
# A profile URL is host + exactly ONE path segment (the login), optionally
# followed by a trailing slash and a query string or fragment.
# Group 1 is the host, group 2 the login.
_PROFILE_RE = re.compile(r"https?://([^/\s]+)/([^/?#\s]+)/?(?:[?#].*)?$")
42
+
43
+
44
+ def parse_profile_url(url: str) -> tuple[str, str] | None:
45
+ """``(forge_name, login)`` for a profile URL on a known host, else None.
46
+
47
+ Only single-segment paths match (a profile), so repo/group links like
48
+ ``github.com/owner/repo`` are ignored.
49
+ """
50
+ if not url:
51
+ return None
52
+ m = _PROFILE_RE.match(url.strip())
53
+ if not m:
54
+ return None
55
+ host, seg = m.group(1).lower(), m.group(2)
56
+ if host.startswith("www."):
57
+ host = host[4:]
58
+ forge = _HOST_FORGE.get(host)
59
+ if forge is None:
60
+ return None
61
+ if not seg or seg.lower() in _NON_PROFILE or "." in seg: # skip *.git, files
62
+ return None
63
+ return (forge, seg)
64
+
65
+
66
def resolve_cross_forge(
    anchor: Forge,
    anchor_login: str,
    forge_factory,
    *,
    max_accounts: int = 8,
) -> tuple[Identity, list[tuple[str, str]]]:
    """Resolve the person's accounts across forges from an anchor account.

    ``forge_factory(forge_name) -> Forge | None`` builds a forge for a name
    (None if unsupported). Returns the merged ``Identity`` and the confirmed
    ``[(forge_name, login), …]`` (anchor included), for the executor to scan.

    Confirmed accounts are processed first-in, first-out. ``max_accounts`` is
    checked only before each account is processed, so a single pass may
    confirm a few accounts beyond it. Errors raised by a forge while fetching
    links, users or hub pages are swallowed and treated as "no data".
    """
    # Forge instances by name, so each forge is built at most once.
    forges: dict[str, Forge] = {anchor.name: anchor}
    # (forge name, lower-cased login) -> login in the casing it was found with.
    confirmed: dict[tuple[str, str], str] = {(anchor.name, anchor_login.lower()): anchor_login}
    # Display names collected from confirmed accounts.
    names: set[str] = set()
    # Accounts whose links were already expanded, and hub URLs already fetched.
    visited: set[tuple[str, str]] = set()
    visited_hubs: set[str] = set()

    def links_of(forge: Forge, login: str) -> list[str]:
        # A throttled/flaky forge contributes no links rather than aborting the
        # whole resolution (best-effort, like discovery).
        try:
            return forge.profile_links(login)
        except Exception:
            return []

    def note_name(forge: Forge, login: str) -> None:
        # Remember the account's display name, if it can be resolved.
        try:
            user = forge.resolve_user(login)
        except Exception:
            return
        if user and user.name:
            names.add(user.name)

    def get_forge(fname: str) -> Forge | None:
        # Build on first use; an unsupported forge (factory returned None) is
        # not stored, so the factory is asked again next time.
        if fname not in forges:
            cforge = forge_factory(fname)
            if cforge is None:
                return None
            forges[fname] = cforge
        return forges[fname]

    def confirm(cforge: Forge, clogin: str) -> None:
        # Accept the account and queue it so its own links get followed.
        confirmed[(cforge.name, clogin.lower())] = clogin
        note_name(cforge, clogin)
        queue.append((cforge, clogin))

    def name_matches(cforge: Forge, clogin: str) -> bool:
        # True when the candidate's display name equals (ignoring case and
        # surrounding whitespace) one already collected.
        try:
            user = cforge.resolve_user(clogin)
        except Exception:
            return False
        lowered = {n.strip().lower() for n in names}
        return bool(user and user.name and user.name.strip().lower() in lowered)

    note_name(anchor, anchor_login)
    queue: list[tuple[Forge, str]] = [(anchor, anchor_login)]
    while queue and len(confirmed) < max_accounts:
        forge, login = queue.pop(0)
        if (forge.name, login.lower()) in visited:
            continue
        visited.add((forge.name, login.lower()))

        # Sort this account's links into forge profiles and other web pages
        # (hub candidates); anything that is not http(s) is dropped.
        forge_targets: list[tuple[str, str]] = []
        hub_urls: list[str] = []
        for url in links_of(forge, login):
            parsed = parse_profile_url(url)
            if parsed is not None:
                forge_targets.append(parsed)
            elif url.strip().lower().startswith(("http://", "https://")):
                hub_urls.append(url.strip())

        # 1. Direct bidirectional: the candidate's profile links back.
        for fname, clogin in forge_targets:
            if (fname, clogin.lower()) in confirmed:
                continue
            cforge = get_forge(fname)
            if cforge is None:
                continue
            # Profiles the candidate links to, keyed the same way as
            # `confirmed`.
            back = {
                (f, ll.lower()) for u2 in links_of(cforge, clogin)
                if (p := parse_profile_url(u2)) for f, ll in [p]
            }
            if set(confirmed) & back:
                confirm(cforge, clogin)

        # 2. Personal-site hub (#25): fetch the site, and if it's an owned hub
        # (links back to a confirmed account), accept the accounts it lists that
        # share the handle or display name.
        for hub in hub_urls:
            if hub in visited_hubs:
                continue
            visited_hubs.add(hub)
            # The page is fetched through the forge that linked to it; a
            # failed fetch is handled like an empty page.
            try:
                page = forge.get_url(hub)
            except Exception:
                page = None
            hub_accounts = {
                p for u2 in extract_urls(page or "") if (p := parse_profile_url(u2))
            }
            if not hub_accounts or len(hub_accounts) > _MAX_HUB_ACCOUNTS:
                continue  # empty, or a link-farm — not a personal identity hub
            hub_keys = {(f, ll.lower()) for f, ll in hub_accounts}
            if not (set(confirmed) & hub_keys):
                continue  # not reached-from + links-back: not the person's own hub
            # Snapshot of the handles known before this hub; accounts
            # confirmed while walking the hub do not extend it.
            confirmed_logins = {cl for (_, cl) in confirmed}
            for fname, clogin in hub_accounts:
                if (fname, clogin.lower()) in confirmed:
                    continue
                cforge = get_forge(fname)
                if cforge is None:
                    continue
                if clogin.lower() in confirmed_logins or name_matches(cforge, clogin):
                    confirm(cforge, clogin)

    identity = Identity(
        primary_login=anchor_login,
        logins=set(confirmed.values()),
        names=names,
    )
    # Pairs in confirmation order (anchor first), with the original casing.
    ids = [(fname, login) for (fname, _), login in confirmed.items()]
    return identity, ids
@@ -0,0 +1 @@
1
+ # Marks praiser.data as a package so importlib.resources can read bundled files.