patchrail 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. patchrail/__init__.py +7 -0
  2. patchrail/__main__.py +7 -0
  3. patchrail/ci/__init__.py +7 -0
  4. patchrail/ci/classify.py +888 -0
  5. patchrail/cli.py +8566 -0
  6. patchrail/funded_issues/__init__.py +138 -0
  7. patchrail/funded_issues/algora_board.py +240 -0
  8. patchrail/funded_issues/blocklist.py +112 -0
  9. patchrail/funded_issues/discovery.py +4091 -0
  10. patchrail/funded_issues/importers.py +316 -0
  11. patchrail/funded_issues/source_noise.py +349 -0
  12. patchrail/funded_issues/store.py +459 -0
  13. patchrail/queue/__init__.py +75 -0
  14. patchrail/queue/server.py +273 -0
  15. patchrail/queue/status.py +756 -0
  16. patchrail/queue/store.py +600 -0
  17. patchrail/reviewer_quick_check.py +650 -0
  18. patchrail/schemas/__init__.py +1 -0
  19. patchrail/schemas/application-dossier.v1.schema.json +305 -0
  20. patchrail/schemas/ci-benchmark.v1.schema.json +174 -0
  21. patchrail/schemas/ci-fixture-check.v1.schema.json +122 -0
  22. patchrail/schemas/ci-pilot-metrics.v1.schema.json +164 -0
  23. patchrail/schemas/ci-pilot-summary.v1.schema.json +146 -0
  24. patchrail/schemas/ci-result.v1.schema.json +133 -0
  25. patchrail/schemas/funded-issues-client-report.v1.schema.json +524 -0
  26. patchrail/schemas/funded-issues-recheck-queue.v1.schema.json +333 -0
  27. patchrail/schemas/funded-issues-recheck-summary.v1.schema.json +136 -0
  28. patchrail/schemas/funded-issues-report.v1.schema.json +836 -0
  29. patchrail/schemas/funded-issues-shortlist.v1.schema.json +953 -0
  30. patchrail/schemas/funded-issues-store-status.v1.schema.json +96 -0
  31. patchrail/schemas/funded-issues-store.v1.schema.json +117 -0
  32. patchrail/schemas/queue-audit-event.v1.schema.json +44 -0
  33. patchrail/schemas/queue-audit-summary.v1.schema.json +169 -0
  34. patchrail/schemas/queue-gate-report.v1.schema.json +158 -0
  35. patchrail/schemas/queue-policy-resolution.v1.schema.json +188 -0
  36. patchrail/schemas/queue-policy-scan.v1.schema.json +175 -0
  37. patchrail/schemas/queue-proposal.v1.schema.json +61 -0
  38. patchrail/schemas/queue-review.v1.schema.json +218 -0
  39. patchrail/schemas/queue-status.v1.schema.json +179 -0
  40. patchrail/schemas/queue-work-item.v1.schema.json +64 -0
  41. patchrail/schemas/reviewer-quick-check-artifacts.v1.schema.json +104 -0
  42. patchrail/web_metrics.py +649 -0
  43. patchrail-0.1.0.dist-info/METADATA +279 -0
  44. patchrail-0.1.0.dist-info/RECORD +47 -0
  45. patchrail-0.1.0.dist-info/WHEEL +4 -0
  46. patchrail-0.1.0.dist-info/entry_points.txt +2 -0
  47. patchrail-0.1.0.dist-info/licenses/LICENSE +202 -0
@@ -0,0 +1,316 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import json
5
+ import re
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ from patchrail.funded_issues.discovery import FundedIssue, funded_issues_payload
10
+
11
+
12
# Provider names accepted by ``import_provider_export`` (compared after
# lower-casing the caller's value).
SUPPORTED_PROVIDERS = ("algora", "github", "openpledge", "polar")

# Currency symbols recognised inside amount strings, mapped to the currency
# code reported when no explicit currency field is present.
_CURRENCY_SYMBOLS = {"$": "USD", "€": "EUR", "£": "GBP", "¥": "JPY"}
# Currency codes recognised inside amount strings.
_CURRENCY_CODES = ("USD", "EUR", "GBP", "JPY", "CAD", "AUD")
# Case-insensitive, whole-word matcher for any of the codes above.
_CURRENCY_CODE_PATTERN = re.compile(r"(?i)\b(" + "|".join(_CURRENCY_CODES) + r")\b")

# Substrings of a lower-cased title that raise the ``ambiguous_scope`` risk flag.
_AMBIGUOUS_SCOPE_TERMS = (
    "architecture",
    "broad",
    "entire",
    "rewrite",
    "unclear",
)
# Lower-cased labels that raise the ``spam_attractive`` risk flag.
_SPAM_ATTRACTIVE_LABELS = ("bounty", "reward", "paid")
# Lower-cased labels that are surfaced as ``label:<name>`` contribution signals.
_CONTRIBUTION_SIGNAL_LABELS = (
    "ci",
    "bug",
    "good first issue",
    "good-first-issue",
    "help wanted",
    "tests",
)
34
+
35
+
36
def import_provider_export(provider: str, source: Path) -> dict[str, Any]:
    """Load a provider's local JSON export and convert it to a funded-issues payload.

    ``provider`` is lower-cased and must be one of ``SUPPORTED_PROVIDERS``;
    otherwise ``ValueError`` is raised before the file is touched. ``source`` is
    read as UTF-8 JSON, its records are extracted and each one is converted to
    a ``FundedIssue``. The result of ``funded_issues_payload`` is returned,
    annotated with where the records came from.
    """
    provider = provider.lower()
    if provider not in SUPPORTED_PROVIDERS:
        raise ValueError(f"unsupported provider: {provider}")

    raw_text = source.read_text(encoding="utf-8")
    records = _extract_records(json.loads(raw_text))

    issues = []
    for position, record in enumerate(records):
        # The position feeds the fallback identifier when a record has no id.
        issues.append(_issue_from_provider_record(provider, record, position))

    provenance = {
        "provider": provider,
        "path": str(source),
        "records_loaded": len(records),
        "local_file_only": True,
    }
    return funded_issues_payload(issues, import_source=provenance)
54
+
55
+
56
+ def _extract_records(payload: Any) -> list[dict[str, Any]]:
57
+ if isinstance(payload, list):
58
+ records = payload
59
+ elif isinstance(payload, dict):
60
+ for key in ("issues", "items", "bounties", "results", "data"):
61
+ value = payload.get(key)
62
+ if isinstance(value, list):
63
+ records = value
64
+ break
65
+ else:
66
+ records = [payload]
67
+ else:
68
+ raise ValueError("provider export must be a JSON object or array")
69
+ if not all(isinstance(record, dict) for record in records):
70
+ raise ValueError("provider export records must be objects")
71
+ return records
72
+
73
+
74
def _issue_from_provider_record(provider: str, raw: dict[str, Any], index: int) -> FundedIssue:
    """Build a ``FundedIssue`` from one raw provider record.

    Each field is resolved by the matching helper in this module. Fallbacks
    visible here: the title defaults to ``"Untitled funded issue"``, the URL
    falls back to the repository slug, ``maintainer_permission`` defaults to
    ``"public_issue_only"``, and a record without a string ``id``/``node_id``/
    ``slug`` receives a hash-based identifier that includes ``index``.
    """
    labels = _labels(raw)
    repository = _repository(raw)
    issue_number = _issue_number(raw)
    amount, currency = _funding(raw)

    title = _first_string(raw, "title", "name", "summary")
    if not title:
        title = "Untitled funded issue"

    url = _first_string(raw, "url", "html_url", "issue_url", "github_url")
    if not url:
        url = repository

    guidelines_url = _first_string(
        raw,
        "contribution_guidelines_url",
        "contributing_url",
        "guidelines_url",
    )
    state = _opportunity_state(raw, labels)

    identifier = _first_string(raw, "id", "node_id", "slug")
    if not identifier:
        identifier = _stable_id(provider, repository, issue_number, title, index)

    permission = raw.get("maintainer_permission") or "public_issue_only"

    return FundedIssue(
        id=str(identifier),
        platform=provider,
        repository=repository,
        issue_number=issue_number,
        title=title,
        url=url,
        funding_amount=amount,
        funding_currency=currency,
        language=_first_string(raw, "language", "primary_language", "repo_language"),
        labels=labels,
        contribution_signals=_contribution_signals(raw, labels, guidelines_url),
        risk_flags=_risk_flags(raw, title, labels, amount, guidelines_url, state),
        maintainer_permission=str(permission),
        contribution_guidelines_url=guidelines_url,
        opportunity_state=state,
    )
113
+
114
+
115
def _repository(raw: dict[str, Any]) -> str:
    """Resolve the ``owner/repo`` slug for a provider record.

    Sources are tried in order:

    1. a string under ``repository``, ``repo``, ``repo_full_name`` or
       ``full_name``;
    2. a nested ``repository`` object (its ``full_name``/``name``, combined
       with its ``owner`` when the name has no slash);
    3. separate owner (``owner``/``org``/``organization``) and repository name
       (``repo_name``/``project``/``name``) strings;
    4. the first URL field that is set, parsed as a GitHub URL.

    Returns ``"unknown/unknown"`` when nothing matches.
    """
    direct = _first_string(raw, "repository", "repo", "repo_full_name", "full_name")
    if direct:
        return direct

    nested = raw.get("repository")
    if isinstance(nested, dict):
        nested_name = _first_string(nested, "full_name", "name")
        if nested_name and "/" in nested_name:
            return nested_name
        nested_owner = nested.get("owner")
        if isinstance(nested_owner, dict):
            owner_name = _first_string(nested_owner, "login", "name")
        else:
            owner_name = str(nested_owner) if nested_owner else None
        if owner_name and nested_name:
            return f"{owner_name}/{nested_name}"

    owner = _first_string(raw, "owner", "org", "organization")
    repo_name = _first_string(raw, "repo_name", "project", "name")
    if owner and repo_name:
        return f"{owner}/{repo_name}"

    url = _first_string(raw, "url", "html_url", "issue_url", "github_url") or ""
    # REST API URLs look like ``api.github.com/repos/<owner>/<repo>/...``; skip
    # the ``repos`` segment so the slug is not reported as ``repos/<owner>``.
    # ``?`` and ``#`` end a path segment so query strings and fragments never
    # leak into the repository name.
    match = re.search(
        r"(?:api\.github\.com/repos|github\.com)/([^/\s?#]+/[^/\s?#]+)", url
    )
    if match:
        return match.group(1).removesuffix(".git")
    return "unknown/unknown"
140
+
141
+
142
+ def _issue_number(raw: dict[str, Any]) -> int | None:
143
+ for key in ("issue_number", "number", "github_issue_number", "issue"):
144
+ value = raw.get(key)
145
+ if isinstance(value, int):
146
+ return value
147
+ if isinstance(value, str) and value.isdigit():
148
+ return int(value)
149
+ url = _first_string(raw, "url", "html_url", "issue_url", "github_url") or ""
150
+ match = re.search(r"/issues/(\d+)", url)
151
+ return int(match.group(1)) if match else None
152
+
153
+
154
def _funding(raw: dict[str, Any]) -> tuple[float | None, str | None]:
    """Extract ``(amount, currency)`` from a provider record.

    Lookup order:

    1. a ``funding`` object, then a ``bounty`` object -- the first one that is
       a dict decides the result, even when it carries no amount;
    2. the flat USD keys ``amount_usd``, ``reward_usd``, ``bounty_usd``,
       ``funding_usd`` (currency is then ``"USD"``);
    3. the flat ``amount`` / ``reward`` / ``bounty_amount`` keys.

    Outside step 2 the currency comes from an explicit ``currency`` /
    ``currency_code`` string, otherwise it is detected from the raw amount.
    """
    for section_key in ("funding", "bounty"):
        section = raw.get(section_key)
        if not isinstance(section, dict):
            continue
        quoted = section.get("amount") or section.get("value") or section.get("usd")
        explicit = _first_string(section, "currency", "currency_code")
        return _numeric(quoted), _normalize_currency(explicit, quoted)

    for usd_key in ("amount_usd", "reward_usd", "bounty_usd", "funding_usd"):
        usd_amount = _numeric(raw.get(usd_key))
        if usd_amount is not None:
            return usd_amount, "USD"

    quoted = raw.get("amount") or raw.get("reward") or raw.get("bounty_amount")
    explicit = _first_string(raw, "currency", "currency_code")
    return _numeric(quoted), _normalize_currency(explicit, quoted)
178
+
179
+
180
+ def _labels(raw: dict[str, Any]) -> list[str]:
181
+ labels = raw.get("labels")
182
+ if not isinstance(labels, list):
183
+ return []
184
+ values = []
185
+ for label in labels:
186
+ if isinstance(label, dict):
187
+ value = _first_string(label, "name", "label")
188
+ else:
189
+ value = str(label)
190
+ if value:
191
+ values.append(value)
192
+ return values
193
+
194
+
195
def _contribution_signals(
    raw: dict[str, Any], labels: list[str], contribution_guidelines_url: str | None
) -> list[str]:
    """Collect the sorted, de-duplicated contribution signals for a record.

    Signals come from the record's own ``contribution_signals`` list, from
    labels listed in ``_CONTRIBUTION_SIGNAL_LABELS`` (as ``label:<name>``),
    from a ``body``/``description`` that mentions a reproduction, and from the
    presence of a contribution-guidelines URL.
    """
    found = set(_string_list(raw.get("contribution_signals")))

    lowered_labels = {label.lower() for label in labels}
    found.update(
        f"label:{name}" for name in lowered_labels if name in _CONTRIBUTION_SIGNAL_LABELS
    )

    text = (_first_string(raw, "body", "description") or "").lower()
    if "reproduction" in text or "steps to reproduce" in text:
        found.add("reproduction included")

    if contribution_guidelines_url:
        found.add("contribution guidelines linked")

    return sorted(found)
209
+
210
+
211
def _risk_flags(
    raw: dict[str, Any],
    title: str,
    labels: list[str],
    amount: float | None,
    contribution_guidelines_url: str | None,
    opportunity_state: str,
) -> list[str]:
    """Derive the sorted, de-duplicated risk flags for a record.

    Starts from the record's own ``risk_flags`` list and adds:

    * ``ambiguous_scope`` -- the title contains an ``_AMBIGUOUS_SCOPE_TERMS`` term;
    * ``spam_attractive`` -- a ``_SPAM_ATTRACTIVE_LABELS`` label is present, or
      the amount is at least 1000 with no contribution guidelines;
    * ``no_contribution_guidelines`` -- no guidelines URL;
    * ``stale_no_maintainer_signal`` / ``closed_or_inactive`` -- for the
      ``stale`` / ``closed`` opportunity states.
    """
    raised = set(_string_list(raw.get("risk_flags")))
    lowered_title = title.lower()
    lowered_labels = {label.lower() for label in labels}
    has_guidelines = bool(contribution_guidelines_url)

    for term in _AMBIGUOUS_SCOPE_TERMS:
        if term in lowered_title:
            raised.add("ambiguous_scope")
            break

    large_unguided = amount is not None and amount >= 1000 and not has_guidelines
    if large_unguided or not lowered_labels.isdisjoint(_SPAM_ATTRACTIVE_LABELS):
        raised.add("spam_attractive")

    if not has_guidelines:
        raised.add("no_contribution_guidelines")

    if opportunity_state == "stale":
        raised.add("stale_no_maintainer_signal")
    elif opportunity_state == "closed":
        raised.add("closed_or_inactive")

    return sorted(raised)
235
+
236
+
237
+ def _opportunity_state(raw: dict[str, Any], labels: list[str]) -> str:
238
+ label_lowers = {label.lower() for label in labels}
239
+ if "stale" in label_lowers:
240
+ return "stale"
241
+ for key in ("opportunity_state", "state", "status", "issue_state"):
242
+ value = raw.get(key)
243
+ if isinstance(value, bool):
244
+ continue
245
+ if isinstance(value, str) and value.strip():
246
+ normalized = value.strip().lower().replace("-", "_").replace(" ", "_")
247
+ if normalized in {"active", "open", "opened", "live", "available"}:
248
+ return "active"
249
+ if normalized in {"closed", "completed", "done", "paid", "resolved", "cancelled"}:
250
+ return "closed"
251
+ if normalized in {"stale", "inactive", "abandoned", "expired"}:
252
+ return "stale"
253
+ for key in ("open", "is_open"):
254
+ value = raw.get(key)
255
+ if isinstance(value, bool):
256
+ return "active" if value else "closed"
257
+ return "unknown"
258
+
259
+
260
+ def _first_string(raw: dict[str, Any], *keys: str) -> str | None:
261
+ for key in keys:
262
+ value = raw.get(key)
263
+ if isinstance(value, str) and value.strip():
264
+ return value.strip()
265
+ return None
266
+
267
+
268
+ def _numeric(value: Any) -> float | None:
269
+ if value is None:
270
+ return None
271
+ if isinstance(value, int | float):
272
+ return float(value)
273
+ if isinstance(value, str):
274
+ cleaned = value
275
+ for symbol in _CURRENCY_SYMBOLS:
276
+ cleaned = cleaned.replace(symbol, "")
277
+ cleaned = _CURRENCY_CODE_PATTERN.sub("", cleaned)
278
+ cleaned = cleaned.replace(",", "").strip()
279
+ try:
280
+ return float(cleaned)
281
+ except ValueError:
282
+ return None
283
+ return None
284
+
285
+
286
+ def _detect_currency(value: Any) -> str | None:
287
+ if not isinstance(value, str):
288
+ return None
289
+ for symbol, code in _CURRENCY_SYMBOLS.items():
290
+ if symbol in value:
291
+ return code
292
+ match = _CURRENCY_CODE_PATTERN.search(value)
293
+ if match:
294
+ return match.group(1).upper()
295
+ return None
296
+
297
+
298
+ def _normalize_currency(explicit: str | None, raw_amount: Any) -> str | None:
299
+ if explicit:
300
+ return explicit.upper()
301
+ return _detect_currency(raw_amount)
302
+
303
+
304
+ def _string_list(value: Any) -> list[str]:
305
+ if not isinstance(value, list):
306
+ return []
307
+ return [str(item) for item in value if str(item).strip()]
308
+
309
+
310
+ def _stable_id(
311
+ provider: str, repository: str, issue_number: int | None, title: str, index: int
312
+ ) -> str:
313
+ digest = hashlib.sha256(f"{repository}:{issue_number}:{title}:{index}".encode()).hexdigest()[
314
+ :12
315
+ ]
316
+ return f"{provider}-{digest}"
@@ -0,0 +1,349 @@
1
+ """Owner-level source-noise heuristic for the read-only funded-issues tracker.
2
+
3
+ The per-issue scoring in :mod:`patchrail.funded_issues.discovery` cannot tell a
4
+ throwaway trap org (a brand-new account spamming near-identical honeypot
5
+ bounties) apart from a credible sponsor running a noisy program -- every issue
6
+ lands at ``risk_level=high`` regardless of *who* posted it. This module adds the
7
+ missing *owner-level* signal: given one owner's already-collected public GitHub
8
+ metadata plus the list of tracker-store entries attributed to that owner, it
9
+ derives a list of ``noise_flags`` and a single ``source_noise`` verdict.
10
+
11
+ It is pure and fully offline -- callers pass metadata that was gathered
12
+ read-only elsewhere; nothing here performs a network call, and (like the rest of
13
+ the tracker) it never writes to any third party.
14
+
15
+ Flags
16
+ -----
17
+ Each owner is screened for the following flags (constants below hold the
18
+ thresholds):
19
+
20
+ * ``new_account`` (strong) -- account younger than
21
+ :data:`NEW_ACCOUNT_MAX_AGE_DAYS` days.
22
+ * ``no_website`` (strong) -- no public website/blog declared. Absent metadata is
23
+ treated as "no website": for a noise screen, an unproven signal is a negative
24
+ one.
25
+ * ``unverifiable_payout`` (strong) -- payout cannot be verified from a primary
26
+ public source. Absent metadata is likewise treated as unverifiable.
27
+ * ``anomalous_volume`` (strong) -- the owner contributes at least
28
+ :data:`ANOMALOUS_MIN_VOLUME` tracked entries and at least
29
+ :data:`ANOMALOUS_DUP_RATIO` of them collapse to one near-identical title
30
+ signature (the honeypot/aggregator template pattern).
31
+ * ``low_repos`` (supporting) -- at most :data:`LOW_REPO_MAX` public repos.
32
+ * ``few_followers`` (supporting) -- at most :data:`FEW_FOLLOWERS_MAX` followers.
33
+
34
+ Verdict criterion
35
+ -----------------
36
+ ``source_noise`` is ``True`` when the owner trips **at least**
37
+ :data:`STRONG_FLAG_THRESHOLD` *strong* flags (the members of
38
+ :data:`STRONG_NOISE_FLAGS`). Supporting flags add colour to a report but never,
39
+ on their own, flip the verdict -- so a legitimate one-repo sponsor with a
40
+ website and verifiable payouts stays clean, while a new website-less org with
41
+ unverifiable payouts and templated volume is flagged.
42
+
43
+ Issue-level manual overrides
44
+ ----------------------------
45
+ The owner-level pass is necessarily coarse: it condemns or clears *every* issue
46
+ from an owner at once. Sometimes a human needs to override a single issue --
47
+ flag one "Test Bounty" from an otherwise legitimate owner, or clear one issue
48
+ from a flagged owner. :func:`apply_source_noise_to_store` accepts a
49
+ ``manual_overrides`` mapping (issue URL -> list of flags) for exactly this. The
50
+ overrides are *issue-level* and always win over the heuristic. Because the
51
+ caller passes them on **every** apply, they survive re-applies of the owner-level
52
+ heuristic that would otherwise reset the entry's ``noise_flags`` -- there is no
53
+ hidden state, just a deterministic re-stamp on each pass.
54
+ """
55
+
56
+ from __future__ import annotations
57
+
58
+ import re
59
+ from datetime import datetime
60
+ from typing import Any
61
+
62
# Schema identifier included in every owner assessment returned by
# ``assess_owner_source_noise``.
SOURCE_NOISE_SCHEMA_VERSION = "patchrail.funded_issues.source_noise.v1"

# Thresholds. Tuned against the 2026-06-10 screening: trap orgs created within
# the last ~4 weeks with a single repo and a couple of followers.
# ``new_account`` is raised when the account age in days is below this value.
NEW_ACCOUNT_MAX_AGE_DAYS = 90
# ``low_repos`` is raised when the public repo count is at most this value.
LOW_REPO_MAX = 1
# ``few_followers`` is raised when the follower count is at most this value.
FEW_FOLLOWERS_MAX = 5
# Minimum number of tracked entries before the volume heuristic applies.
ANOMALOUS_MIN_VOLUME = 5
# Share of an owner's entries that must fall in the largest title cluster for
# ``anomalous_volume`` to be raised.
ANOMALOUS_DUP_RATIO = 0.6
# Number of strong flags at which the ``source_noise`` verdict becomes ``True``.
STRONG_FLAG_THRESHOLD = 2

# Flags that, in sufficient number, flip ``source_noise``. The supporting flags
# (``low_repos`` / ``few_followers``) are deliberately excluded: weak corporate
# signals should never condemn an owner on their own.
STRONG_NOISE_FLAGS = frozenset(
    {"new_account", "no_website", "unverifiable_payout", "anomalous_volume"}
)

# Title tokens are runs of lowercase ASCII letters (titles are lower-cased
# before matching), so digits and punctuation never take part in a signature.
_TITLE_TOKEN_RE = re.compile(r"[a-z]+")
# How many leading title tokens form the near-identical signature. Honeypot and
# aggregator templates share a long fixed prefix; six tokens is enough to cluster
# them while keeping genuinely distinct issues apart.
_SIGNATURE_TOKENS = 6
85
+
86
+
87
+ def _parse_iso(value: str) -> datetime:
88
+ text = str(value).strip()
89
+ if text.endswith("Z"):
90
+ text = text[:-1] + "+00:00"
91
+ return datetime.fromisoformat(text)
92
+
93
+
94
+ def _account_age_days(metadata: dict[str, Any], now: str | None) -> int | None:
95
+ """Resolve account age in days, preferring an explicit ``account_age_days``.
96
+
97
+ Falls back to ``created_at`` differenced against ``now`` when both are
98
+ present and parseable. Returns ``None`` when age cannot be established, in
99
+ which case the ``new_account`` flag is simply not raised.
100
+ """
101
+
102
+ age = metadata.get("account_age_days")
103
+ if age is not None:
104
+ try:
105
+ return int(age)
106
+ except (TypeError, ValueError):
107
+ return None
108
+
109
+ created_at = metadata.get("created_at")
110
+ if created_at and now:
111
+ try:
112
+ return (_parse_iso(now) - _parse_iso(str(created_at))).days
113
+ except ValueError:
114
+ return None
115
+ return None
116
+
117
+
118
def _title_signature(title: Any) -> tuple[str, ...]:
    """Reduce a title to its leading-token signature.

    The title is converted to lowercase text, split into tokens with
    ``_TITLE_TOKEN_RE``, and the first ``_SIGNATURE_TOKENS`` tokens are
    returned as a tuple (empty when the title yields no tokens).
    """
    lowered = str(title).lower()
    leading = _TITLE_TOKEN_RE.findall(lowered)[:_SIGNATURE_TOKENS]
    return tuple(leading)
121
+
122
+
123
+ def _entry_title(entry: dict[str, Any]) -> str:
124
+ issue = entry.get("issue") or {}
125
+ return str(issue.get("title") or "")
126
+
127
+
128
def _is_anomalous_volume(entries: list[dict[str, Any]]) -> bool:
    """Report whether an owner posts many issues with near-identical titles.

    Batches smaller than ``ANOMALOUS_MIN_VOLUME`` are never anomalous. Each
    title is reduced to its leading-token signature; entries whose title gives
    an empty signature are not clustered but still count towards the batch
    size. The batch is anomalous when the largest signature cluster covers at
    least ``ANOMALOUS_DUP_RATIO`` of all entries.
    """
    batch_size = len(entries)
    if batch_size < ANOMALOUS_MIN_VOLUME:
        return False

    cluster_sizes: dict[tuple[str, ...], int] = {}
    signatures = (_title_signature(_entry_title(entry)) for entry in entries)
    for signature in signatures:
        if signature:
            cluster_sizes[signature] = cluster_sizes.get(signature, 0) + 1

    if not cluster_sizes:
        return False
    return max(cluster_sizes.values()) / batch_size >= ANOMALOUS_DUP_RATIO
149
+
150
+
151
+ _REPOS_URL_OWNER_RE = re.compile(r"/repos/([^/]+)/")
152
+
153
+
154
+ def _entry_owner(entry: dict[str, Any]) -> str:
155
+ """Derive the owning account for a store entry.
156
+
157
+ Prefers an explicit ``issue.owner``, then the ``/repos/<owner>/`` segment of
158
+ ``issue.url`` (the canonical GitHub API reference, always present in stores
159
+ built by discovery), and finally ``issue.repository`` — which appears both
160
+ as ``owner/repo`` and as the API-derived ``repos/<owner>`` form.
161
+ """
162
+
163
+ issue = entry.get("issue") or {}
164
+ owner = issue.get("owner")
165
+ if owner:
166
+ return str(owner)
167
+ match = _REPOS_URL_OWNER_RE.search(str(issue.get("url") or ""))
168
+ if match:
169
+ return match.group(1)
170
+ repository = str(issue.get("repository") or "")
171
+ segments = [part for part in repository.split("/") if part]
172
+ if len(segments) >= 2 and segments[0] == "repos":
173
+ return segments[1]
174
+ if segments:
175
+ return segments[0]
176
+ return repository
177
+
178
+
179
def assess_owner_source_noise(
    owner_metadata: dict[str, Any],
    entries: list[dict[str, Any]],
    *,
    now: str | None = None,
) -> dict[str, Any]:
    """Assess a single owner for source noise using already-collected signals.

    ``owner_metadata`` holds the owner's public signals: ``account_age_days``
    or ``created_at``, ``public_repos``, ``followers``, ``has_website`` and
    ``payout_verifiable``. A falsy mapping is treated as empty, which raises
    ``no_website`` and ``unverifiable_payout``. ``entries`` are the tracker
    store entries attributed to the owner and feed the volume heuristic.
    ``now`` is the reference timestamp used when age must be derived from
    ``created_at``.

    The returned mapping contains ``schema_version``, the sorted
    ``noise_flags``, the ``strong_flags`` subset, ``strong_flag_count``, the
    ``source_noise`` verdict (true once at least ``STRONG_FLAG_THRESHOLD``
    strong flags are raised) and ``tracked_entries``.
    """
    metadata = owner_metadata or {}
    raised: list[str] = []

    age_days = _account_age_days(metadata, now)
    if not (age_days is None or age_days >= NEW_ACCOUNT_MAX_AGE_DAYS):
        raised.append("new_account")

    # Supporting signals: only evaluated when the count is actually known.
    count_checks = (
        ("public_repos", LOW_REPO_MAX, "low_repos"),
        ("followers", FEW_FOLLOWERS_MAX, "few_followers"),
    )
    for key, ceiling, flag in count_checks:
        observed = metadata.get(key)
        if observed is not None and int(observed) <= ceiling:
            raised.append(flag)

    # Absent or falsy evidence counts against the owner for these two signals.
    evidence_checks = (
        ("has_website", "no_website"),
        ("payout_verifiable", "unverifiable_payout"),
    )
    for key, flag in evidence_checks:
        if not metadata.get(key, False):
            raised.append(flag)

    if _is_anomalous_volume(entries):
        raised.append("anomalous_volume")

    noise_flags = sorted(raised)
    strong_flags = sorted(STRONG_NOISE_FLAGS.intersection(noise_flags))
    strong_count = len(strong_flags)
    return {
        "schema_version": SOURCE_NOISE_SCHEMA_VERSION,
        "noise_flags": noise_flags,
        "strong_flags": strong_flags,
        "strong_flag_count": strong_count,
        "source_noise": strong_count >= STRONG_FLAG_THRESHOLD,
        "tracked_entries": len(entries),
    }
232
+
233
+
234
def entries_by_owner(store: dict[str, Any]) -> dict[str, list[dict[str, Any]]]:
    """Bucket the store's entries by their derived owner.

    The buckets hold the store's own entry objects (not copies), in the order
    the entries appear under ``store["entries"]``. A store without an
    ``entries`` mapping yields an empty result.
    """
    buckets: dict[str, list[dict[str, Any]]] = {}
    for entry in store.get("entries", {}).values():
        owner = _entry_owner(entry)
        if owner not in buckets:
            buckets[owner] = []
        buckets[owner].append(entry)
    return buckets
241
+
242
+
243
+ def _validate_manual_overrides(
244
+ manual_overrides: dict[str, list[str]] | None,
245
+ ) -> dict[str, list[str]]:
246
+ """Return validated overrides, raising ``ValueError`` on malformed flags.
247
+
248
+ Every value must be a list whose members are all non-empty strings. A
249
+ non-string or empty/blank string is a caller bug, not noise data, so it is
250
+ rejected loudly rather than silently coerced.
251
+ """
252
+
253
+ overrides = manual_overrides or {}
254
+ for url, flags in overrides.items():
255
+ if not isinstance(flags, list):
256
+ raise ValueError(
257
+ f"manual_overrides[{url!r}] must be a list of flag strings, "
258
+ f"got {type(flags).__name__}"
259
+ )
260
+ for flag in flags:
261
+ if not isinstance(flag, str) or not flag.strip():
262
+ raise ValueError(
263
+ f"manual_overrides[{url!r}] contains an invalid flag "
264
+ f"{flag!r}: flags must be non-empty strings"
265
+ )
266
+ return overrides
267
+
268
+
269
def apply_source_noise_to_store(
    store: dict[str, Any],
    owner_metadata: dict[str, dict[str, Any]] | None = None,
    *,
    now: str | None = None,
    manual_overrides: dict[str, list[str]] | None = None,
) -> dict[str, Any]:
    """Write each owner's noise verdict onto the store's entries, in place.

    The store's entries are grouped by owner and every owner is assessed once
    with ``owner_metadata[owner]`` (an empty mapping when the owner is not
    listed). Entries of a flagged owner receive a copy of the owner's
    ``noise_flags``; entries of a clean owner are reset to ``[]``, so a
    re-run with refreshed metadata can clear an earlier verdict.

    ``manual_overrides`` maps an exact ``store["entries"]`` key to a list of
    flags and is applied after the owner pass, taking precedence over it:

    * a non-empty list is merged with the entry's current flags
      (de-duplicated and sorted), flagging it even under a clean owner;
    * an empty list forces the entry clean, even under a flagged owner.

    Overrides are validated first; a malformed one raises ``ValueError``.
    Nothing is remembered between calls, so overrides must be passed on every
    apply to stay in effect.

    Returns a summary with the counters ``owners_assessed``,
    ``owners_flagged``, ``entries_flagged``, ``entries_cleared``,
    ``entries_manual_noise`` and ``entries_manual_clean``, plus the lists
    ``owners_without_metadata`` and ``manual_urls_not_in_store``.
    """
    known_metadata = owner_metadata or {}
    overrides = _validate_manual_overrides(manual_overrides)
    summary: dict[str, Any] = {
        "owners_assessed": 0,
        "owners_flagged": 0,
        "owners_without_metadata": [],
        "entries_flagged": 0,
        "entries_cleared": 0,
        "entries_manual_noise": 0,
        "entries_manual_clean": 0,
        "manual_urls_not_in_store": [],
    }
    entries = store.get("entries", {})

    # Pass 1: owner-level verdicts.
    for owner, owned in entries_by_owner(store).items():
        if owner not in known_metadata:
            # Missing metadata is scored as negative signals, so report it:
            # many names here mean owners were screened without being looked up.
            summary["owners_without_metadata"].append(owner)
        assessment = assess_owner_source_noise(
            known_metadata.get(owner, {}), owned, now=now
        )
        summary["owners_assessed"] += 1

        if assessment["source_noise"]:
            summary["owners_flagged"] += 1
            for entry in owned:
                # Each entry gets its own list so later edits stay independent.
                entry["noise_flags"] = list(assessment["noise_flags"])
            summary["entries_flagged"] += len(owned)
        else:
            for entry in owned:
                entry["noise_flags"] = []
            summary["entries_cleared"] += len(owned)

    # Pass 2: issue-level overrides, looked up by store key so they land
    # regardless of the owner verdict.
    for url, manual_flags in overrides.items():
        entry = entries.get(url)
        if entry is None:
            summary["manual_urls_not_in_store"].append(url)
        elif manual_flags:
            current = entry.get("noise_flags") or []
            entry["noise_flags"] = sorted(set(current) | set(manual_flags))
            summary["entries_manual_noise"] += 1
        else:
            entry["noise_flags"] = []
            summary["entries_manual_clean"] += 1
    return summary