praiser 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- praiser-0.1.0/LICENSE +28 -0
- praiser-0.1.0/PKG-INFO +350 -0
- praiser-0.1.0/README.md +321 -0
- praiser-0.1.0/praiser/__init__.py +7 -0
- praiser-0.1.0/praiser/__main__.py +6 -0
- praiser-0.1.0/praiser/cache.py +60 -0
- praiser-0.1.0/praiser/cli.py +285 -0
- praiser-0.1.0/praiser/config.py +86 -0
- praiser-0.1.0/praiser/crossforge.py +188 -0
- praiser-0.1.0/praiser/data/__init__.py +1 -0
- praiser-0.1.0/praiser/data/known_projects.json +95 -0
- praiser-0.1.0/praiser/discovery.py +218 -0
- praiser-0.1.0/praiser/extractors/__init__.py +54 -0
- praiser-0.1.0/praiser/extractors/authors.py +107 -0
- praiser-0.1.0/praiser/extractors/base.py +106 -0
- praiser-0.1.0/praiser/extractors/codeowners.py +133 -0
- praiser-0.1.0/praiser/extractors/contributors.py +89 -0
- praiser-0.1.0/praiser/extractors/curated.py +41 -0
- praiser-0.1.0/praiser/extractors/enhancement_proposals.py +254 -0
- praiser-0.1.0/praiser/extractors/governance.py +127 -0
- praiser-0.1.0/praiser/extractors/llm_founders.py +50 -0
- praiser-0.1.0/praiser/extractors/maintainers.py +157 -0
- praiser-0.1.0/praiser/extractors/manifests.py +184 -0
- praiser-0.1.0/praiser/extractors/ownership.py +29 -0
- praiser-0.1.0/praiser/extractors/packages.py +66 -0
- praiser-0.1.0/praiser/extractors/subcomponents.py +76 -0
- praiser-0.1.0/praiser/extractors/web_roles.py +131 -0
- praiser-0.1.0/praiser/extractors/wikidata.py +95 -0
- praiser-0.1.0/praiser/forge/__init__.py +35 -0
- praiser-0.1.0/praiser/forge/_http.py +86 -0
- praiser-0.1.0/praiser/forge/base.py +265 -0
- praiser-0.1.0/praiser/forge/bitbucket.py +171 -0
- praiser-0.1.0/praiser/forge/cgit.py +95 -0
- praiser-0.1.0/praiser/forge/gitea.py +179 -0
- praiser-0.1.0/praiser/forge/gitee.py +175 -0
- praiser-0.1.0/praiser/forge/github.py +309 -0
- praiser-0.1.0/praiser/forge/gitlab.py +197 -0
- praiser-0.1.0/praiser/github_client.py +462 -0
- praiser-0.1.0/praiser/identity.py +25 -0
- praiser-0.1.0/praiser/llm.py +247 -0
- praiser-0.1.0/praiser/models.py +253 -0
- praiser-0.1.0/praiser/pipeline.py +313 -0
- praiser-0.1.0/praiser/popularity.py +109 -0
- praiser-0.1.0/praiser/progress.py +67 -0
- praiser-0.1.0/praiser/registries.py +260 -0
- praiser-0.1.0/praiser/registry.py +311 -0
- praiser-0.1.0/praiser/render.py +219 -0
- praiser-0.1.0/praiser.egg-info/PKG-INFO +350 -0
- praiser-0.1.0/praiser.egg-info/SOURCES.txt +93 -0
- praiser-0.1.0/praiser.egg-info/dependency_links.txt +1 -0
- praiser-0.1.0/praiser.egg-info/entry_points.txt +2 -0
- praiser-0.1.0/praiser.egg-info/requires.txt +12 -0
- praiser-0.1.0/praiser.egg-info/top_level.txt +1 -0
- praiser-0.1.0/pyproject.toml +53 -0
- praiser-0.1.0/setup.cfg +4 -0
- praiser-0.1.0/tests/test_attribute.py +69 -0
- praiser-0.1.0/tests/test_authors.py +65 -0
- praiser-0.1.0/tests/test_bitbucket_forge.py +108 -0
- praiser-0.1.0/tests/test_cgit_forge.py +70 -0
- praiser-0.1.0/tests/test_cli.py +75 -0
- praiser-0.1.0/tests/test_codeowners.py +40 -0
- praiser-0.1.0/tests/test_contributors.py +123 -0
- praiser-0.1.0/tests/test_crossforge.py +153 -0
- praiser-0.1.0/tests/test_curated.py +65 -0
- praiser-0.1.0/tests/test_discovery.py +42 -0
- praiser-0.1.0/tests/test_enhancement_proposals.py +96 -0
- praiser-0.1.0/tests/test_file_fetch.py +63 -0
- praiser-0.1.0/tests/test_forge_base.py +51 -0
- praiser-0.1.0/tests/test_gitea_forge.py +120 -0
- praiser-0.1.0/tests/test_gitee_forge.py +87 -0
- praiser-0.1.0/tests/test_github_client.py +58 -0
- praiser-0.1.0/tests/test_github_forge.py +149 -0
- praiser-0.1.0/tests/test_gitlab_forge.py +116 -0
- praiser-0.1.0/tests/test_governance.py +43 -0
- praiser-0.1.0/tests/test_involvement.py +35 -0
- praiser-0.1.0/tests/test_llm_founders.py +63 -0
- praiser-0.1.0/tests/test_maintainers.py +44 -0
- praiser-0.1.0/tests/test_manifests.py +130 -0
- praiser-0.1.0/tests/test_manual_repos.py +76 -0
- praiser-0.1.0/tests/test_multiforge.py +25 -0
- praiser-0.1.0/tests/test_ownership.py +29 -0
- praiser-0.1.0/tests/test_packages_extractor.py +78 -0
- praiser-0.1.0/tests/test_pipeline_helpers.py +21 -0
- praiser-0.1.0/tests/test_popularity.py +102 -0
- praiser-0.1.0/tests/test_progress.py +56 -0
- praiser-0.1.0/tests/test_registries.py +177 -0
- praiser-0.1.0/tests/test_registry.py +67 -0
- praiser-0.1.0/tests/test_render.py +167 -0
- praiser-0.1.0/tests/test_result_cache.py +76 -0
- praiser-0.1.0/tests/test_role_discovery.py +98 -0
- praiser-0.1.0/tests/test_subcomponents.py +59 -0
- praiser-0.1.0/tests/test_web_cache.py +105 -0
- praiser-0.1.0/tests/test_web_roles.py +89 -0
- praiser-0.1.0/tests/test_web_service.py +134 -0
- praiser-0.1.0/tests/test_wikidata.py +93 -0
praiser-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
BSD 3-Clause License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026, Pearu Peterson
|
|
4
|
+
|
|
5
|
+
Redistribution and use in source and binary forms, with or without
|
|
6
|
+
modification, are permitted provided that the following conditions are met:
|
|
7
|
+
|
|
8
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
|
9
|
+
list of conditions and the following disclaimer.
|
|
10
|
+
|
|
11
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
12
|
+
this list of conditions and the following disclaimer in the documentation
|
|
13
|
+
and/or other materials provided with the distribution.
|
|
14
|
+
|
|
15
|
+
3. Neither the name of the copyright holder nor the names of its contributors
|
|
16
|
+
may be used to endorse or promote products derived from this software
|
|
17
|
+
without specific prior written permission.
|
|
18
|
+
|
|
19
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
20
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
21
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
22
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
23
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
24
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
25
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
26
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
27
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
28
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
praiser-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: praiser
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Record the popular open-source projects where a person holds an elevated role (author, maintainer, steering council, standards author, core contributor), with evidence.
|
|
5
|
+
Author-email: Pearu Peterson <pearu.peterson@gmail.com>
|
|
6
|
+
License-Expression: BSD-3-Clause
|
|
7
|
+
Project-URL: Homepage, https://github.com/openteams-ai/praiser
|
|
8
|
+
Project-URL: Repository, https://github.com/openteams-ai/praiser
|
|
9
|
+
Project-URL: Issues, https://github.com/openteams-ai/praiser/issues
|
|
10
|
+
Keywords: open-source,maintainer,contributions,github,roles
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
15
|
+
Classifier: Environment :: Console
|
|
16
|
+
Classifier: Topic :: Software Development
|
|
17
|
+
Requires-Python: >=3.11
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
License-File: LICENSE
|
|
20
|
+
Provides-Extra: http
|
|
21
|
+
Requires-Dist: httpx>=0.27; extra == "http"
|
|
22
|
+
Provides-Extra: llm
|
|
23
|
+
Requires-Dist: anthropic>=0.40; extra == "llm"
|
|
24
|
+
Provides-Extra: yaml
|
|
25
|
+
Requires-Dist: pyyaml>=6; extra == "yaml"
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
# praiser
|
|
31
|
+
|
|
32
|
+
[![CI](https://github.com/openteams-ai/praiser/actions/workflows/ci.yml/badge.svg)](https://github.com/openteams-ai/praiser/actions/workflows/ci.yml)
|
|
33
|
+
|
|
34
|
+
**🌟 Try the web demo: <https://praiser.streamlit.app/>**
|
|
35
|
+
|
|
36
|
+
Given a username, **praiser** records the popular open-source projects where
|
|
37
|
+
that person holds an **elevated role** — author/creator, maintainer, code owner,
|
|
38
|
+
steering-council member, standards (PEP/RFC) author, or core contributor — with
|
|
39
|
+
a clickable **evidence link** and a **confidence** score for every claim. Plain
|
|
40
|
+
drive-by contributors are intentionally excluded (the record would otherwise be
|
|
41
|
+
enormous and low-signal). By default it prints a compact **highlights** summary;
|
|
42
|
+
`--format md|json` gives the full per-project report.
|
|
43
|
+
|
|
44
|
+
Projects record roles in many different ways — a `CODEOWNERS` file, a
|
|
45
|
+
`MAINTAINERS` list, Kubernetes `OWNERS` YAML, a `GOVERNANCE.md` page, a package
|
|
46
|
+
manifest's author field, a numbered enhancement-proposal series with `Author:`
|
|
47
|
+
headers, a team page on the project's website, the commit history, package
|
|
48
|
+
registries (PyPI/npm/crates), or **Wikidata** creator/developer claims (matched
|
|
49
|
+
by GitHub handle). `praiser` figures out **which convention each project uses**
|
|
50
|
+
rather than assuming one, and corroborates signals that a fork or vendored copy
|
|
51
|
+
could fake.
|
|
52
|
+
|
|
53
|
+
It scans **GitHub** by default, plus **GitLab** (`--forge gitlab`), **Codeberg**
|
|
54
|
+
/ any Gitea/Forgejo host (`--forge codeberg`), **Gitee** (`--forge gitee`),
|
|
55
|
+
**Bitbucket** (`--forge bitbucket`), and API-less **cgit** hosts like kernel.org
|
|
56
|
+
or Savannah (`--forge cgit`) — including **self-hosted instances** with
|
|
57
|
+
`--forge-url` (e.g. `--forge gitlab --forge-url https://gitlab.gnome.org`). The
|
|
58
|
+
pipeline talks to a neutral `Forge` interface, so adding another host is a
|
|
59
|
+
self-contained addition.
|
|
60
|
+
|
|
61
|
+
People use different usernames on different forges, so `--cross-forge` follows
|
|
62
|
+
the links a person publishes on their own profile — and on the personal site
|
|
63
|
+
those profiles point to — to their accounts elsewhere, keeping only links
|
|
64
|
+
confirmed either **bidirectionally** (the other profile links back) or through
|
|
65
|
+
an **owned personal-site hub** (a site reached from, and linking back to, a
|
|
66
|
+
confirmed account, that also lists the other account with a matching
|
|
67
|
+
handle/name), and merges everything into one record. Because the links are owner-published and
|
|
68
|
+
mutually confirmed, it never falsely merges two different people (it may
|
|
69
|
+
under-merge someone who hasn't cross-linked, which is safe). `--also-forge
|
|
70
|
+
FORGE:LOGIN` adds an identity explicitly when you'd rather not rely on links.
|
|
71
|
+
|
|
72
|
+
## Install
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
pip install praiser # core (stdlib only)
|
|
76
|
+
pip install 'praiser[http]' # + httpx (faster, pooled HTTP; recommended)
|
|
77
|
+
pip install 'praiser[http,llm,yaml]' # + Claude fallback + YAML role files
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
The core has **no dependencies** (it runs on the stdlib), so `pip install praiser`
|
|
81
|
+
is enough to get the `praiser` command. The extras add: `http` (httpx, falls back
|
|
82
|
+
to urllib), `llm` (Claude fallback for ambiguous governance prose), `yaml`
|
|
83
|
+
(Kubernetes `OWNERS` / YAML-front-matter proposals).
|
|
84
|
+
|
|
85
|
+
From a checkout, for development:
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
pip install -e '.[http,llm,yaml,dev]' # editable + all extras + tests
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
Requires Python 3.11+ (for the stdlib `tomllib`).
|
|
92
|
+
|
|
93
|
+
## Usage
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
export GITHUB_TOKEN=ghp_... # a PAT; raises rate limits and enables search
|
|
97
|
+
praiser torvalds # default: the highlights summary (below)
|
|
98
|
+
praiser gvanrossum --format md # the full report (Markdown)
|
|
99
|
+
praiser gvanrossum --format json -o gvanrossum.json # full report as JSON
|
|
100
|
+
praiser someuser --no-discover-roles --no-llm # skip the LLM/web features
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
By default `praiser <username>` prints a compact **highlights** summary — the
|
|
104
|
+
top roles, one line each, plus breadth stats. Use `--format md|json` for the
|
|
105
|
+
full per-project report with evidence links, or `--highlights N` to change the
|
|
106
|
+
count.
|
|
107
|
+
|
|
108
|
+
```
|
|
109
|
+
pearu — top 8 highlights:
|
|
110
|
+
- pytorch/pytorch — Maintainer (101k★, conf 0.90)
|
|
111
|
+
- numpy/numpy — Maintainer (32k★, conf 0.90)
|
|
112
|
+
- scipy/scipy — Maintainer (15k★, conf 0.90)
|
|
113
|
+
- heavyai/heavydb — Core contributor (3k★, conf 0.80)
|
|
114
|
+
…
|
|
115
|
+
…plus N more elevated-role project(s); M smaller but widely-used project(s) with a notable role.
|
|
116
|
+
Reach: T project(s) across C communities (distinct orgs).
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
The footer summarises breadth beyond the top roles: the smaller-but-widely-used
|
|
120
|
+
projects where the user also holds a notable role, and the **community reach**
|
|
121
|
+
(distinct organisations) — a proxy for the potential to introduce ideas widely.
|
|
122
|
+
|
|
123
|
+
`praiser <username>` is meant to be sufficient on its own: role auto-discovery
|
|
124
|
+
and registry persistence are **on by default** (auto-discovery activates only
|
|
125
|
+
when LLM credentials are present, and degrades silently otherwise).
|
|
126
|
+
|
|
127
|
+
### GitHub token
|
|
128
|
+
|
|
129
|
+
Without a token GitHub allows only ~60 requests/hour, which is not enough.
|
|
130
|
+
Provide one via `--token`, the `GITHUB_TOKEN`/`GH_TOKEN` env var, or simply be
|
|
131
|
+
logged into the [`gh` CLI](https://cli.github.com) (`gh auth login`) — the tool
|
|
132
|
+
falls back to `gh auth token` automatically.
|
|
133
|
+
|
|
134
|
+
For the optional LLM features (`--discover-roles`, and the governance prose
|
|
135
|
+
fallback) set an Anthropic API key — create one at
|
|
136
|
+
**https://console.anthropic.com/settings/keys**, then
|
|
137
|
+
`export ANTHROPIC_API_KEY=...` and install the extra (`pip install
|
|
138
|
+
'praiser[llm]'`).
|
|
139
|
+
|
|
140
|
+
Create a GitHub Personal Access Token at **https://github.com/settings/tokens**:
|
|
141
|
+
|
|
142
|
+
* *classic* — no scopes are needed for public data; add `repo` (private repos)
|
|
143
|
+
and `read:org` (resolve `@org/team` membership in CODEOWNERS) for full
|
|
144
|
+
coverage;
|
|
145
|
+
* *fine-grained* — read-only **Contents** + **Members** permissions.
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
export GITHUB_TOKEN=ghp_xxxxxxxxxxxxxxxxxxxx
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
### Rate limits & performance
|
|
152
|
+
|
|
153
|
+
A token is capped at 5,000 REST requests/hour and that ceiling can't be raised
|
|
154
|
+
for a PAT (only GitHub Apps / Enterprise Cloud go higher). To stay under it the
|
|
155
|
+
tool:
|
|
156
|
+
|
|
157
|
+
* **fetches file contents via GraphQL in batches** — GraphQL is a *separate*
|
|
158
|
+
5,000-points/hour bucket, so the bulk of the work (reading CODEOWNERS,
|
|
159
|
+
manifests, and possibly hundreds of proposal files) doesn't touch the REST
|
|
160
|
+
limit, and many files come back in one request;
|
|
161
|
+
* **caches every request** so re-runs and resumed runs are nearly free;
|
|
162
|
+
* **drops forks** and only deep-scans plausible candidates.
|
|
163
|
+
|
|
164
|
+
If a run is rate-limited it stops early, tells you how long to wait, and the
|
|
165
|
+
cache preserves what already succeeded — so re-running finishes the job.
|
|
166
|
+
|
|
167
|
+
```
|
|
168
|
+
praiser <username>
|
|
169
|
+
[--forge github|codeberg|gitlab|gitee|bitbucket|cgit] code host (default: github)
|
|
170
|
+
[--forge-url URL] self-hosted instance for --forge gitlab|codeberg|cgit
|
|
171
|
+
[--forge-name LABEL] short label for the --forge-url instance
|
|
172
|
+
[--cross-forge] follow verified profile links to the person's other
|
|
173
|
+
forges and merge into one record
|
|
174
|
+
[--also-forge FORGE:LOGIN] also scan this identity on another forge (repeatable)
|
|
175
|
+
[--min-stars N] popularity threshold (default 50)
|
|
176
|
+
[--highlights [N]] top-N highlights summary (this is the DEFAULT view; N=8)
|
|
177
|
+
[--format md|json] full per-project report instead of the highlights
|
|
178
|
+
[--token TOKEN] or GITHUB_TOKEN / GH_TOKEN
|
|
179
|
+
[--cache-dir DIR] default ~/.cache/praiser
|
|
180
|
+
[--registry FILE] known-projects file (default: ~/.local/share/praiser/)
|
|
181
|
+
[--no-save-registry] don't persist popularity + discovered role sources
|
|
182
|
+
[--no-discover-roles] don't web-search for role pages (default: on w/ LLM)
|
|
183
|
+
[--no-wikidata] don't derive creator/developer roles from Wikidata
|
|
184
|
+
[--no-package-registries] skip PyPI/npm/crates.io lookups (default: on)
|
|
185
|
+
[--no-llm] disable all Claude features
|
|
186
|
+
[--add-repo OWNER/REPO] force-scan a repo discovery missed (repeatable)
|
|
187
|
+
[--include-private] also scan private repos (default: skip them)
|
|
188
|
+
[--contributor-pages N] contributors API pages/repo, 100 each (default: 2)
|
|
189
|
+
[-j N | --jobs N] candidates scanned concurrently (default: 8)
|
|
190
|
+
[-o FILE] write output to a file instead of stdout
|
|
191
|
+
[-v] detailed per-repo logging
|
|
192
|
+
[-q] suppress the live progress display
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
On an interactive terminal the tool shows live progress on stderr
|
|
196
|
+
(`scanning repo 42/107 …`) so you can see it working; output (JSON/Markdown)
|
|
197
|
+
still goes to stdout. Progress is automatically suppressed when stderr is
|
|
198
|
+
redirected, with `-q`, or in `-v` mode (which prints detailed logs instead).
|
|
199
|
+
|
|
200
|
+
JSON is the source of truth; Markdown is a human-readable view. Every claim
|
|
201
|
+
carries an **evidence link** (file/page URL) and a **confidence** score.
|
|
202
|
+
|
|
203
|
+
## Web demo
|
|
204
|
+
|
|
205
|
+
A [Streamlit](https://streamlit.io) web UI wraps the same engine: type a
|
|
206
|
+
username, pick a forge, and get the ranked record with evidence links — with
|
|
207
|
+
instant **view / top-N / min-stars** controls, a live progress bar, and a
|
|
208
|
+
"recent scans" picker. Collected results are shared across sessions via a
|
|
209
|
+
durable cache, so repeat lookups are fast.
|
|
210
|
+
|
|
211
|
+
- **Hosted demo:** <https://praiser.streamlit.app/>
|
|
212
|
+
- **Run locally or deploy your own:** see [`web/README.md`](web/README.md).
|
|
213
|
+
|
|
214
|
+
The web layer is split into a framework-agnostic core (`web/core` — a
|
|
215
|
+
`praise()` service + cache) and the Streamlit frontend (`web/streamlit`), so a
|
|
216
|
+
different frontend (FastAPI, Gradio, …) can reuse the core unchanged.
|
|
217
|
+
|
|
218
|
+
## How it works
|
|
219
|
+
|
|
220
|
+
1. **Identity resolution** — assemble `{logins, names, emails}` from the
|
|
221
|
+
profile. Handle/email matches are high-confidence; name-only matches are weak.
|
|
222
|
+
2. **Discovery (wide net)** — owned repos, org repos, contributed-to repos
|
|
223
|
+
(over-collected on purpose), **commit search** (`author:`, catches old
|
|
224
|
+
involvement the contribution graph has dropped), code search for the handle
|
|
225
|
+
in role files, **name search** in `AUTHORS`/`THANKS`/`CONTRIBUTORS`,
|
|
226
|
+
**package registries** (packages the user maintains on npm/crates.io, whose
|
|
227
|
+
source repos are pulled in — catches projects where the role is "package
|
|
228
|
+
maintainer" rather than "top committer"; `--no-package-registries` to skip),
|
|
229
|
+
and curated registry seeds.
|
|
230
|
+
Forks (which inherit upstream role files) and private repos are dropped
|
|
231
|
+
here — a public "popular projects" record shouldn't surface or leak private
|
|
232
|
+
repos. Use `--include-private` to scan them anyway. If the net still misses a
|
|
233
|
+
project (e.g. a private-dev repo, or one whose history GitHub doesn't
|
|
234
|
+
attribute), name it with `--add-repo OWNER/REPO` — it's force-scanned and
|
|
235
|
+
force-included, with the role still detected automatically.
|
|
236
|
+
3. **Role attribution** — a registry of pluggable [extractors](praiser/extractors)
|
|
237
|
+
(`ownership`, `codeowners`, `maintainers`, `manifests`, `enhancement_proposals`,
|
|
238
|
+
`governance`, `contributors`, `subcomponents`, `authors`, `web_roles`,
|
|
239
|
+
`packages`). The
|
|
240
|
+
`contributors` signal measures size by commits **and** merged-PR count
|
|
241
|
+
(robust to squash/ghstack one-commit-per-PR workflows and unlinked commit
|
|
242
|
+
emails); `subcomponents` credits leading/authoring a *part* of a monorepo via
|
|
243
|
+
commit-path analysis (e.g. f2py in NumPy, sparse tensors in PyTorch) — seeded
|
|
244
|
+
in the registry and extendable with `--add-repo owner/repo:path`. `packages`
|
|
245
|
+
credits **maintainer** of an npm/crates.io package (keyed on the user's login)
|
|
246
|
+
and **author** of a PyPI distribution (matched on the author/maintainer name,
|
|
247
|
+
so a popular package isn't mis-credited to a mere contributor) — only when
|
|
248
|
+
the package itself names the repo as its source, which guards against
|
|
249
|
+
registry-handle collisions.
|
|
250
|
+
(A LOC-diff size axis is intentionally deferred — noisy with generated/vendored
|
|
251
|
+
code and costly to compute — until a need justifies the extra dimension.)
|
|
252
|
+
A repo under the user's
|
|
253
|
+
own account is attributed as **author/creator**, and manifest `authors` vs
|
|
254
|
+
`maintainers` fields map to the author vs maintainer roles — so a user's own
|
|
255
|
+
projects read "Author", not merely "core contributor". Structured files are parsed
|
|
256
|
+
deterministically; ambiguous prose falls back to Claude **only after** a
|
|
257
|
+
keyword/regex pass. `contributors` records a **core-contributor** role for
|
|
258
|
+
substantial committers to popular/widely-used repos (catches historical
|
|
259
|
+
maintainers and authors of major components, e.g. f2py in NumPy). Role-file
|
|
260
|
+
matches (`CODEOWNERS`/`AUTHORS`) are corroborated with **copy-resistant**
|
|
261
|
+
signals — affiliation or being the canonical popular project — so a repo that
|
|
262
|
+
*vendored* an upstream's history and role files isn't a false positive.
|
|
263
|
+
4. **Popularity filter** — `--min-stars`, with an override so high-signal roles
|
|
264
|
+
on smaller-but-notable standards projects survive. Elevated-role projects
|
|
265
|
+
that miss the bar but are **widely used and maintained** (real fork
|
|
266
|
+
engagement + recently pushed) are reported as a secondary group with a count.
|
|
267
|
+
5. **Render** — ranked by popularity × role weight × confidence. Live
|
|
268
|
+
rate-limit dynamics (REST/GraphQL remaining) are shown during the scan.
|
|
269
|
+
|
|
270
|
+
## The known-projects registry
|
|
271
|
+
|
|
272
|
+
[`praiser/data/known_projects.json`](praiser/data/known_projects.json) stores
|
|
273
|
+
popular/important projects together with:
|
|
274
|
+
|
|
275
|
+
* **`role_conventions`** — how that project records roles *in the repo* (which
|
|
276
|
+
extractor + path + header format), so extractors can parse directly instead of
|
|
277
|
+
re-detecting, and curated knowledge is reusable;
|
|
278
|
+
* **`role_sources`** — **authoritative web pages** that list role holders, with
|
|
279
|
+
the role each confers. Many projects record maintainers/steering councils on a
|
|
280
|
+
site, not in a repo file, and the format varies wildly — so you point at the
|
|
281
|
+
exact URL rather than have the tool guess. The `web_roles` extractor fetches
|
|
282
|
+
each page and matches the user by GitHub handle (a `github.com/<handle>` link)
|
|
283
|
+
or full name. Example:
|
|
284
|
+
```json
|
|
285
|
+
"numpy/numpy": {
|
|
286
|
+
"role_sources": [
|
|
287
|
+
{"url": "https://numpy.org/teams/", "role": "maintainer", "label": "NumPy team"},
|
|
288
|
+
{"url": "https://numpy.org/about/", "role": "steering_council", "label": "Steering Council"}
|
|
289
|
+
]
|
|
290
|
+
}
|
|
291
|
+
```
|
|
292
|
+
This is more authoritative than commit-count heuristics — it reflects the
|
|
293
|
+
project's own statement of who holds the role — and it's why a vendored *copy*
|
|
294
|
+
of a project (which carries the upstream's commit history, making the user look
|
|
295
|
+
like a heavy committer) is not mistaken for a real role: role-file and
|
|
296
|
+
contributor signals are trusted only on the user's own/org repos or the
|
|
297
|
+
canonical popular project, never on a small unaffiliated copy.
|
|
298
|
+
* **`popularity`** — cached/curated stars/forks plus `min_stars_override` for
|
|
299
|
+
high-signal-but-small standards projects;
|
|
300
|
+
* **`importance`** — a human label (`critical`, `high`, ...).
|
|
301
|
+
|
|
302
|
+
Point `--registry mine.json` at your own file to extend or override the seed;
|
|
303
|
+
persistence is on by default (`--no-save-registry` disables it), with observed popularity **and any web-discovered role
|
|
304
|
+
sources** (`--discover-roles`) written back — so a one-off discovery becomes
|
|
305
|
+
reusable curated knowledge. Authoritative roles are conservative: high-authority
|
|
306
|
+
roles like steering council require a GitHub-**handle** match on the page (not
|
|
307
|
+
just a name, which is too easily a founder/credit mention).
|
|
308
|
+
|
|
309
|
+
Discovery results are also cached (the web-search call and fetched pages), so
|
|
310
|
+
re-runs don't re-search even with `--no-save-registry`.
|
|
311
|
+
|
|
312
|
+
### Enhancement-proposal generalization
|
|
313
|
+
|
|
314
|
+
PEP / NEP / SPEC / JEP and friends share one shape: a folder of numbered
|
|
315
|
+
documents with an `Author:` (or `:Author:`, or YAML front-matter) header. They
|
|
316
|
+
are handled by **one** extractor parameterized by `(path, header_format)`, which
|
|
317
|
+
also **auto-detects** the pattern when a repo has a directory of numbered
|
|
318
|
+
`*.rst`/`*.md` files with author metadata.
|
|
319
|
+
|
|
320
|
+
## Development
|
|
321
|
+
|
|
322
|
+
```bash
|
|
323
|
+
pip install -e '.[dev]'
|
|
324
|
+
pytest # offline parser tests, no network
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
Each extractor keeps its parsing logic in a pure function (e.g.
|
|
328
|
+
`parse_codeowners`, `parse_proposal_header`, `parse_owners_yaml`) so tests run
|
|
329
|
+
fully offline.
|
|
330
|
+
|
|
331
|
+
### Releasing to PyPI
|
|
332
|
+
|
|
333
|
+
The version is single-sourced from `praiser.__version__`. To cut a release:
|
|
334
|
+
|
|
335
|
+
1. Bump `__version__` in `praiser/__init__.py`; commit.
|
|
336
|
+
2. Create a **GitHub Release** with tag `vX.Y.Z` (matching the version).
|
|
337
|
+
3. The `publish.yml` workflow builds the sdist + wheel and uploads them to PyPI
|
|
338
|
+
via **Trusted Publishing** (OIDC — no API token stored).
|
|
339
|
+
|
|
340
|
+
**One-time PyPI setup** (before the first release): on PyPI, add a *trusted
|
|
341
|
+
publisher* for the project — owner `openteams-ai`, repo `praiser`, workflow
|
|
342
|
+
`publish.yml`, environment `pypi`. (For the very first upload you can instead
|
|
343
|
+
`python -m build && twine upload dist/*` with a PyPI token, then switch to
|
|
344
|
+
trusted publishing.)
|
|
345
|
+
|
|
346
|
+
## Author & license
|
|
347
|
+
|
|
348
|
+
Created by **Pearu Peterson** (pearu.peterson@gmail.com), with assistance from
|
|
349
|
+
**Claude** (Anthropic). Licensed under the **BSD 3-Clause** license — see
|
|
350
|
+
[LICENSE](LICENSE).
|