patchrail 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- patchrail/__init__.py +7 -0
- patchrail/__main__.py +7 -0
- patchrail/ci/__init__.py +7 -0
- patchrail/ci/classify.py +888 -0
- patchrail/cli.py +8566 -0
- patchrail/funded_issues/__init__.py +138 -0
- patchrail/funded_issues/algora_board.py +240 -0
- patchrail/funded_issues/blocklist.py +112 -0
- patchrail/funded_issues/discovery.py +4091 -0
- patchrail/funded_issues/importers.py +316 -0
- patchrail/funded_issues/source_noise.py +349 -0
- patchrail/funded_issues/store.py +459 -0
- patchrail/queue/__init__.py +75 -0
- patchrail/queue/server.py +273 -0
- patchrail/queue/status.py +756 -0
- patchrail/queue/store.py +600 -0
- patchrail/reviewer_quick_check.py +650 -0
- patchrail/schemas/__init__.py +1 -0
- patchrail/schemas/application-dossier.v1.schema.json +305 -0
- patchrail/schemas/ci-benchmark.v1.schema.json +174 -0
- patchrail/schemas/ci-fixture-check.v1.schema.json +122 -0
- patchrail/schemas/ci-pilot-metrics.v1.schema.json +164 -0
- patchrail/schemas/ci-pilot-summary.v1.schema.json +146 -0
- patchrail/schemas/ci-result.v1.schema.json +133 -0
- patchrail/schemas/funded-issues-client-report.v1.schema.json +524 -0
- patchrail/schemas/funded-issues-recheck-queue.v1.schema.json +333 -0
- patchrail/schemas/funded-issues-recheck-summary.v1.schema.json +136 -0
- patchrail/schemas/funded-issues-report.v1.schema.json +836 -0
- patchrail/schemas/funded-issues-shortlist.v1.schema.json +953 -0
- patchrail/schemas/funded-issues-store-status.v1.schema.json +96 -0
- patchrail/schemas/funded-issues-store.v1.schema.json +117 -0
- patchrail/schemas/queue-audit-event.v1.schema.json +44 -0
- patchrail/schemas/queue-audit-summary.v1.schema.json +169 -0
- patchrail/schemas/queue-gate-report.v1.schema.json +158 -0
- patchrail/schemas/queue-policy-resolution.v1.schema.json +188 -0
- patchrail/schemas/queue-policy-scan.v1.schema.json +175 -0
- patchrail/schemas/queue-proposal.v1.schema.json +61 -0
- patchrail/schemas/queue-review.v1.schema.json +218 -0
- patchrail/schemas/queue-status.v1.schema.json +179 -0
- patchrail/schemas/queue-work-item.v1.schema.json +64 -0
- patchrail/schemas/reviewer-quick-check-artifacts.v1.schema.json +104 -0
- patchrail/web_metrics.py +649 -0
- patchrail-0.1.0.dist-info/METADATA +279 -0
- patchrail-0.1.0.dist-info/RECORD +47 -0
- patchrail-0.1.0.dist-info/WHEEL +4 -0
- patchrail-0.1.0.dist-info/entry_points.txt +2 -0
- patchrail-0.1.0.dist-info/licenses/LICENSE +202 -0
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import json
|
|
5
|
+
import re
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from patchrail.funded_issues.discovery import FundedIssue, funded_issues_payload
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
SUPPORTED_PROVIDERS = ("algora", "github", "openpledge", "polar")
|
|
13
|
+
|
|
14
|
+
_CURRENCY_SYMBOLS = {"$": "USD", "€": "EUR", "£": "GBP", "¥": "JPY"}
|
|
15
|
+
_CURRENCY_CODES = ("USD", "EUR", "GBP", "JPY", "CAD", "AUD")
|
|
16
|
+
_CURRENCY_CODE_PATTERN = re.compile(r"(?i)\b(" + "|".join(_CURRENCY_CODES) + r")\b")
|
|
17
|
+
|
|
18
|
+
_AMBIGUOUS_SCOPE_TERMS = (
|
|
19
|
+
"architecture",
|
|
20
|
+
"broad",
|
|
21
|
+
"entire",
|
|
22
|
+
"rewrite",
|
|
23
|
+
"unclear",
|
|
24
|
+
)
|
|
25
|
+
_SPAM_ATTRACTIVE_LABELS = ("bounty", "reward", "paid")
|
|
26
|
+
_CONTRIBUTION_SIGNAL_LABELS = (
|
|
27
|
+
"ci",
|
|
28
|
+
"bug",
|
|
29
|
+
"good first issue",
|
|
30
|
+
"good-first-issue",
|
|
31
|
+
"help wanted",
|
|
32
|
+
"tests",
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def import_provider_export(provider: str, source: Path) -> dict[str, Any]:
    """Convert a local provider export file into a funded-issues payload.

    The provider name is lower-cased and must be one of
    ``SUPPORTED_PROVIDERS``; otherwise ``ValueError`` is raised. ``source`` is
    read as UTF-8 JSON, its records are extracted and normalized one by one,
    and the resulting issues are handed to ``funded_issues_payload`` together
    with an ``import_source`` description of where they came from.
    """
    provider = provider.lower()
    if provider not in SUPPORTED_PROVIDERS:
        raise ValueError(f"unsupported provider: {provider}")

    document = json.loads(source.read_text(encoding="utf-8"))
    records = _extract_records(document)

    issues = []
    for position, record in enumerate(records):
        # The position feeds the fallback identifier of records without an id.
        issues.append(_issue_from_provider_record(provider, record, position))

    import_source = {
        "provider": provider,
        "path": str(source),
        "records_loaded": len(records),
        "local_file_only": True,
    }
    return funded_issues_payload(issues, import_source=import_source)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _extract_records(payload: Any) -> list[dict[str, Any]]:
|
|
57
|
+
if isinstance(payload, list):
|
|
58
|
+
records = payload
|
|
59
|
+
elif isinstance(payload, dict):
|
|
60
|
+
for key in ("issues", "items", "bounties", "results", "data"):
|
|
61
|
+
value = payload.get(key)
|
|
62
|
+
if isinstance(value, list):
|
|
63
|
+
records = value
|
|
64
|
+
break
|
|
65
|
+
else:
|
|
66
|
+
records = [payload]
|
|
67
|
+
else:
|
|
68
|
+
raise ValueError("provider export must be a JSON object or array")
|
|
69
|
+
if not all(isinstance(record, dict) for record in records):
|
|
70
|
+
raise ValueError("provider export records must be objects")
|
|
71
|
+
return records
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _issue_from_provider_record(provider: str, raw: dict[str, Any], index: int) -> FundedIssue:
    """Normalize one raw provider record into a ``FundedIssue``.

    Each field is resolved by the matching helper in this module. Fallbacks
    applied here: a missing title becomes ``"Untitled funded issue"``, a
    missing URL falls back to the repository string, a missing identifier is
    derived with ``_stable_id`` (which is why ``index`` is passed in), and a
    missing ``maintainer_permission`` becomes ``"public_issue_only"``.
    """
    repository = _repository(raw)
    issue_number = _issue_number(raw)
    labels = _labels(raw)
    amount, currency = _funding(raw)

    title = _first_string(raw, "title", "name", "summary")
    if not title:
        title = "Untitled funded issue"

    url = _first_string(raw, "url", "html_url", "issue_url", "github_url")
    if not url:
        url = repository

    guidelines_url = _first_string(
        raw, "contribution_guidelines_url", "contributing_url", "guidelines_url"
    )
    state = _opportunity_state(raw, labels)

    # Prefer the provider's own identifier; otherwise derive a stable one.
    identifier = _first_string(raw, "id", "node_id", "slug")
    if not identifier:
        identifier = _stable_id(provider, repository, issue_number, title, index)

    permission = raw.get("maintainer_permission") or "public_issue_only"

    return FundedIssue(
        id=str(identifier),
        platform=provider,
        repository=repository,
        issue_number=issue_number,
        title=title,
        url=url,
        funding_amount=amount,
        funding_currency=currency,
        language=_first_string(raw, "language", "primary_language", "repo_language"),
        labels=labels,
        contribution_signals=_contribution_signals(raw, labels, guidelines_url),
        risk_flags=_risk_flags(raw, title, labels, amount, guidelines_url, state),
        maintainer_permission=str(permission),
        contribution_guidelines_url=guidelines_url,
        opportunity_state=state,
    )
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _repository(raw: dict[str, Any]) -> str:
    """Resolve the ``owner/repo`` slug for a provider record.

    Sources are tried in order:

    1. a plain string under ``repository``, ``repo``, ``repo_full_name`` or
       ``full_name``;
    2. a nested ``repository`` object (its ``full_name``/``name`` plus its
       ``owner``, which may itself be an object or a scalar);
    3. separate top-level owner and repository-name fields;
    4. the path of a GitHub URL found in the record.

    Returns ``"unknown/unknown"`` when none of them yields a value.
    """
    direct = _first_string(raw, "repository", "repo", "repo_full_name", "full_name")
    if direct:
        return direct

    nested = raw.get("repository")
    if isinstance(nested, dict):
        nested_name = _first_string(nested, "full_name", "name")
        owner = nested.get("owner")
        if nested_name and "/" in nested_name:
            # Already a full slug; the owner does not need resolving.
            return nested_name
        if isinstance(owner, dict):
            owner_name = _first_string(owner, "login", "name")
        else:
            owner_name = str(owner) if owner else None
        if owner_name and nested_name:
            return f"{owner_name}/{nested_name}"

    owner = _first_string(raw, "owner", "org", "organization")
    repo_name = _first_string(raw, "repo_name", "project", "name")
    if owner and repo_name:
        return f"{owner}/{repo_name}"

    url = _first_string(raw, "url", "html_url", "issue_url", "github_url") or ""
    # REST API URLs look like api.github.com/repos/<owner>/<repo>/...; skip the
    # "repos" segment there so the slug is not mis-read as "repos/<owner>".
    # "?" is excluded as well so a query string never leaks into the repo name.
    match = re.search(
        r"(?:api\.github\.com/repos|github\.com)/([^/\s?#]+/[^/\s?#]+)", url
    )
    if match:
        return match.group(1).removesuffix(".git")
    return "unknown/unknown"
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _issue_number(raw: dict[str, Any]) -> int | None:
|
|
143
|
+
for key in ("issue_number", "number", "github_issue_number", "issue"):
|
|
144
|
+
value = raw.get(key)
|
|
145
|
+
if isinstance(value, int):
|
|
146
|
+
return value
|
|
147
|
+
if isinstance(value, str) and value.isdigit():
|
|
148
|
+
return int(value)
|
|
149
|
+
url = _first_string(raw, "url", "html_url", "issue_url", "github_url") or ""
|
|
150
|
+
match = re.search(r"/issues/(\d+)", url)
|
|
151
|
+
return int(match.group(1)) if match else None
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _funding(raw: dict[str, Any]) -> tuple[float | None, str | None]:
    """Extract ``(amount, currency)`` from a provider record.

    Resolution order: a nested ``funding`` object, then a nested ``bounty``
    object, then the explicit ``*_usd`` fields (always reported as ``"USD"``),
    and finally the flat ``amount``/``reward``/``bounty_amount`` fields. The
    first nested object that is present decides the result, even when it holds
    no usable amount.
    """
    for container_key in ("funding", "bounty"):
        container = raw.get(container_key)
        if not isinstance(container, dict):
            continue
        stated = container.get("amount") or container.get("value") or container.get("usd")
        declared = _first_string(container, "currency", "currency_code")
        return _numeric(stated), _normalize_currency(declared, stated)

    for usd_key in ("amount_usd", "reward_usd", "bounty_usd", "funding_usd"):
        usd_amount = _numeric(raw.get(usd_key))
        if usd_amount is not None:
            return usd_amount, "USD"

    stated = raw.get("amount") or raw.get("reward") or raw.get("bounty_amount")
    declared = _first_string(raw, "currency", "currency_code")
    return _numeric(stated), _normalize_currency(declared, stated)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _labels(raw: dict[str, Any]) -> list[str]:
|
|
181
|
+
labels = raw.get("labels")
|
|
182
|
+
if not isinstance(labels, list):
|
|
183
|
+
return []
|
|
184
|
+
values = []
|
|
185
|
+
for label in labels:
|
|
186
|
+
if isinstance(label, dict):
|
|
187
|
+
value = _first_string(label, "name", "label")
|
|
188
|
+
else:
|
|
189
|
+
value = str(label)
|
|
190
|
+
if value:
|
|
191
|
+
values.append(value)
|
|
192
|
+
return values
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _contribution_signals(
    raw: dict[str, Any], labels: list[str], contribution_guidelines_url: str | None
) -> list[str]:
    """Collect the sorted, de-duplicated contribution signals for a record.

    Starts from the record's own ``contribution_signals`` list and adds
    ``label:<name>`` for every label listed in ``_CONTRIBUTION_SIGNAL_LABELS``,
    ``"reproduction included"`` when the body/description mentions a
    reproduction, and ``"contribution guidelines linked"`` when a guidelines
    URL is known.
    """
    signals = _string_list(raw.get("contribution_signals"))

    lowered_labels = {label.lower() for label in labels}
    signals.extend(
        f"label:{name}" for name in lowered_labels if name in _CONTRIBUTION_SIGNAL_LABELS
    )

    body_text = (_first_string(raw, "body", "description") or "").lower()
    if any(marker in body_text for marker in ("reproduction", "steps to reproduce")):
        signals.append("reproduction included")

    if contribution_guidelines_url:
        signals.append("contribution guidelines linked")

    unique_signals = list(set(signals))
    unique_signals.sort()
    return unique_signals
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _risk_flags(
    raw: dict[str, Any],
    title: str,
    labels: list[str],
    amount: float | None,
    contribution_guidelines_url: str | None,
    opportunity_state: str,
) -> list[str]:
    """Derive the sorted, de-duplicated risk flags for a record.

    The record's own ``risk_flags`` list is kept and extended with:

    * ``ambiguous_scope`` -- the title contains an ``_AMBIGUOUS_SCOPE_TERMS``
      term (substring match on the lower-cased title);
    * ``spam_attractive`` -- a ``_SPAM_ATTRACTIVE_LABELS`` label is present, or
      the amount is at least 1000 with no contribution guidelines;
    * ``no_contribution_guidelines`` -- no guidelines URL is known;
    * ``stale_no_maintainer_signal`` / ``closed_or_inactive`` -- mirroring the
      ``stale`` / ``closed`` opportunity states.
    """
    flags = _string_list(raw.get("risk_flags"))
    lowered_title = title.lower()
    lowered_labels = {label.lower() for label in labels}
    lacks_guidelines = not contribution_guidelines_url

    for term in _AMBIGUOUS_SCOPE_TERMS:
        if term in lowered_title:
            flags.append("ambiguous_scope")
            break

    bait_label = any(name in lowered_labels for name in _SPAM_ATTRACTIVE_LABELS)
    large_unguided = amount is not None and amount >= 1000 and lacks_guidelines
    if bait_label or large_unguided:
        flags.append("spam_attractive")

    if lacks_guidelines:
        flags.append("no_contribution_guidelines")

    if opportunity_state == "stale":
        flags.append("stale_no_maintainer_signal")
    elif opportunity_state == "closed":
        flags.append("closed_or_inactive")

    unique_flags = list(set(flags))
    unique_flags.sort()
    return unique_flags
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def _opportunity_state(raw: dict[str, Any], labels: list[str]) -> str:
|
|
238
|
+
label_lowers = {label.lower() for label in labels}
|
|
239
|
+
if "stale" in label_lowers:
|
|
240
|
+
return "stale"
|
|
241
|
+
for key in ("opportunity_state", "state", "status", "issue_state"):
|
|
242
|
+
value = raw.get(key)
|
|
243
|
+
if isinstance(value, bool):
|
|
244
|
+
continue
|
|
245
|
+
if isinstance(value, str) and value.strip():
|
|
246
|
+
normalized = value.strip().lower().replace("-", "_").replace(" ", "_")
|
|
247
|
+
if normalized in {"active", "open", "opened", "live", "available"}:
|
|
248
|
+
return "active"
|
|
249
|
+
if normalized in {"closed", "completed", "done", "paid", "resolved", "cancelled"}:
|
|
250
|
+
return "closed"
|
|
251
|
+
if normalized in {"stale", "inactive", "abandoned", "expired"}:
|
|
252
|
+
return "stale"
|
|
253
|
+
for key in ("open", "is_open"):
|
|
254
|
+
value = raw.get(key)
|
|
255
|
+
if isinstance(value, bool):
|
|
256
|
+
return "active" if value else "closed"
|
|
257
|
+
return "unknown"
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def _first_string(raw: dict[str, Any], *keys: str) -> str | None:
|
|
261
|
+
for key in keys:
|
|
262
|
+
value = raw.get(key)
|
|
263
|
+
if isinstance(value, str) and value.strip():
|
|
264
|
+
return value.strip()
|
|
265
|
+
return None
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def _numeric(value: Any) -> float | None:
|
|
269
|
+
if value is None:
|
|
270
|
+
return None
|
|
271
|
+
if isinstance(value, int | float):
|
|
272
|
+
return float(value)
|
|
273
|
+
if isinstance(value, str):
|
|
274
|
+
cleaned = value
|
|
275
|
+
for symbol in _CURRENCY_SYMBOLS:
|
|
276
|
+
cleaned = cleaned.replace(symbol, "")
|
|
277
|
+
cleaned = _CURRENCY_CODE_PATTERN.sub("", cleaned)
|
|
278
|
+
cleaned = cleaned.replace(",", "").strip()
|
|
279
|
+
try:
|
|
280
|
+
return float(cleaned)
|
|
281
|
+
except ValueError:
|
|
282
|
+
return None
|
|
283
|
+
return None
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def _detect_currency(value: Any) -> str | None:
|
|
287
|
+
if not isinstance(value, str):
|
|
288
|
+
return None
|
|
289
|
+
for symbol, code in _CURRENCY_SYMBOLS.items():
|
|
290
|
+
if symbol in value:
|
|
291
|
+
return code
|
|
292
|
+
match = _CURRENCY_CODE_PATTERN.search(value)
|
|
293
|
+
if match:
|
|
294
|
+
return match.group(1).upper()
|
|
295
|
+
return None
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def _normalize_currency(explicit: str | None, raw_amount: Any) -> str | None:
|
|
299
|
+
if explicit:
|
|
300
|
+
return explicit.upper()
|
|
301
|
+
return _detect_currency(raw_amount)
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def _string_list(value: Any) -> list[str]:
|
|
305
|
+
if not isinstance(value, list):
|
|
306
|
+
return []
|
|
307
|
+
return [str(item) for item in value if str(item).strip()]
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def _stable_id(
|
|
311
|
+
provider: str, repository: str, issue_number: int | None, title: str, index: int
|
|
312
|
+
) -> str:
|
|
313
|
+
digest = hashlib.sha256(f"{repository}:{issue_number}:{title}:{index}".encode()).hexdigest()[
|
|
314
|
+
:12
|
|
315
|
+
]
|
|
316
|
+
return f"{provider}-{digest}"
|
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
"""Owner-level source-noise heuristic for the read-only funded-issues tracker.
|
|
2
|
+
|
|
3
|
+
The per-issue scoring in :mod:`patchrail.funded_issues.discovery` cannot tell a
|
|
4
|
+
throwaway trap org (a brand-new account spamming near-identical honeypot
|
|
5
|
+
bounties) apart from a credible sponsor running a noisy program -- every issue
|
|
6
|
+
lands at ``risk_level=high`` regardless of *who* posted it. This module adds the
|
|
7
|
+
missing *owner-level* signal: given one owner's already-collected public GitHub
|
|
8
|
+
metadata plus the list of tracker-store entries attributed to that owner, it
|
|
9
|
+
derives a list of ``noise_flags`` and a single ``source_noise`` verdict.
|
|
10
|
+
|
|
11
|
+
It is pure and fully offline -- callers pass metadata that was gathered
|
|
12
|
+
read-only elsewhere; nothing here performs a network call, and (like the rest of
|
|
13
|
+
the tracker) it never writes to any third party.
|
|
14
|
+
|
|
15
|
+
Flags
|
|
16
|
+
-----
|
|
17
|
+
Each owner is screened for the following flags (constants below hold the
|
|
18
|
+
thresholds):
|
|
19
|
+
|
|
20
|
+
* ``new_account`` (strong) -- account younger than
|
|
21
|
+
:data:`NEW_ACCOUNT_MAX_AGE_DAYS` days.
|
|
22
|
+
* ``no_website`` (strong) -- no public website/blog declared. Absent metadata is
|
|
23
|
+
treated as "no website": for a noise screen, an unproven signal is a negative
|
|
24
|
+
one.
|
|
25
|
+
* ``unverifiable_payout`` (strong) -- payout cannot be verified from a primary
|
|
26
|
+
public source. Absent metadata is likewise treated as unverifiable.
|
|
27
|
+
* ``anomalous_volume`` (strong) -- the owner contributes at least
|
|
28
|
+
:data:`ANOMALOUS_MIN_VOLUME` tracked entries and at least
|
|
29
|
+
:data:`ANOMALOUS_DUP_RATIO` of them collapse to one near-identical title
|
|
30
|
+
signature (the honeypot/aggregator template pattern).
|
|
31
|
+
* ``low_repos`` (supporting) -- at most :data:`LOW_REPO_MAX` public repos.
|
|
32
|
+
* ``few_followers`` (supporting) -- at most :data:`FEW_FOLLOWERS_MAX` followers.
|
|
33
|
+
|
|
34
|
+
Verdict criterion
|
|
35
|
+
-----------------
|
|
36
|
+
``source_noise`` is ``True`` when the owner trips **at least**
|
|
37
|
+
:data:`STRONG_FLAG_THRESHOLD` *strong* flags (the members of
|
|
38
|
+
:data:`STRONG_NOISE_FLAGS`). Supporting flags add colour to a report but never,
|
|
39
|
+
on their own, flip the verdict -- so a legitimate one-repo sponsor with a
|
|
40
|
+
website and verifiable payouts stays clean, while a new website-less org with
|
|
41
|
+
unverifiable payouts and templated volume is flagged.
|
|
42
|
+
|
|
43
|
+
Issue-level manual overrides
|
|
44
|
+
----------------------------
|
|
45
|
+
The owner-level pass is necessarily coarse: it condemns or clears *every* issue
|
|
46
|
+
from an owner at once. Sometimes a human needs to override a single issue --
|
|
47
|
+
flag one "Test Bounty" from an otherwise legitimate owner, or clear one issue
|
|
48
|
+
from a flagged owner. :func:`apply_source_noise_to_store` accepts a
|
|
49
|
+
``manual_overrides`` mapping (issue URL -> list of flags) for exactly this. The
|
|
50
|
+
overrides are *issue-level* and always win over the heuristic. Because the
|
|
51
|
+
caller passes them on **every** apply, they survive re-applies of the owner-level
|
|
52
|
+
heuristic that would otherwise reset the entry's ``noise_flags`` -- there is no
|
|
53
|
+
hidden state, just a deterministic re-stamp on each pass.
|
|
54
|
+
"""
|
|
55
|
+
|
|
56
|
+
from __future__ import annotations
|
|
57
|
+
|
|
58
|
+
import re
|
|
59
|
+
from datetime import datetime
|
|
60
|
+
from typing import Any
|
|
61
|
+
|
|
62
|
+
SOURCE_NOISE_SCHEMA_VERSION = "patchrail.funded_issues.source_noise.v1"
|
|
63
|
+
|
|
64
|
+
# Thresholds. Tuned against the 2026-06-10 screening: trap orgs created within
|
|
65
|
+
# the last ~4 weeks with a single repo and a couple of followers.
|
|
66
|
+
NEW_ACCOUNT_MAX_AGE_DAYS = 90
|
|
67
|
+
LOW_REPO_MAX = 1
|
|
68
|
+
FEW_FOLLOWERS_MAX = 5
|
|
69
|
+
ANOMALOUS_MIN_VOLUME = 5
|
|
70
|
+
ANOMALOUS_DUP_RATIO = 0.6
|
|
71
|
+
STRONG_FLAG_THRESHOLD = 2
|
|
72
|
+
|
|
73
|
+
# Flags that, in sufficient number, flip ``source_noise``. The supporting flags
|
|
74
|
+
# (``low_repos`` / ``few_followers``) are deliberately excluded: weak corporate
|
|
75
|
+
# signals should never condemn an owner on their own.
|
|
76
|
+
STRONG_NOISE_FLAGS = frozenset(
|
|
77
|
+
{"new_account", "no_website", "unverifiable_payout", "anomalous_volume"}
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
_TITLE_TOKEN_RE = re.compile(r"[a-z]+")
|
|
81
|
+
# How many leading title tokens form the near-identical signature. Honeypot and
|
|
82
|
+
# aggregator templates share a long fixed prefix; six tokens is enough to cluster
|
|
83
|
+
# them while keeping genuinely distinct issues apart.
|
|
84
|
+
_SIGNATURE_TOKENS = 6
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _parse_iso(value: str) -> datetime:
|
|
88
|
+
text = str(value).strip()
|
|
89
|
+
if text.endswith("Z"):
|
|
90
|
+
text = text[:-1] + "+00:00"
|
|
91
|
+
return datetime.fromisoformat(text)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _account_age_days(metadata: dict[str, Any], now: str | None) -> int | None:
|
|
95
|
+
"""Resolve account age in days, preferring an explicit ``account_age_days``.
|
|
96
|
+
|
|
97
|
+
Falls back to ``created_at`` differenced against ``now`` when both are
|
|
98
|
+
present and parseable. Returns ``None`` when age cannot be established, in
|
|
99
|
+
which case the ``new_account`` flag is simply not raised.
|
|
100
|
+
"""
|
|
101
|
+
|
|
102
|
+
age = metadata.get("account_age_days")
|
|
103
|
+
if age is not None:
|
|
104
|
+
try:
|
|
105
|
+
return int(age)
|
|
106
|
+
except (TypeError, ValueError):
|
|
107
|
+
return None
|
|
108
|
+
|
|
109
|
+
created_at = metadata.get("created_at")
|
|
110
|
+
if created_at and now:
|
|
111
|
+
try:
|
|
112
|
+
return (_parse_iso(now) - _parse_iso(str(created_at))).days
|
|
113
|
+
except ValueError:
|
|
114
|
+
return None
|
|
115
|
+
return None
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _title_signature(title: Any) -> tuple[str, ...]:
    """Reduce a title to its leading-token signature.

    The title is stringified and lower-cased, split into tokens with
    ``_TITLE_TOKEN_RE``, and truncated to the first ``_SIGNATURE_TOKENS``
    tokens.
    """
    words = _TITLE_TOKEN_RE.findall(str(title).lower())
    del words[_SIGNATURE_TOKENS:]
    return tuple(words)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _entry_title(entry: dict[str, Any]) -> str:
|
|
124
|
+
issue = entry.get("issue") or {}
|
|
125
|
+
return str(issue.get("title") or "")
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _is_anomalous_volume(entries: list[dict[str, Any]]) -> bool:
    """Report whether an owner posts many near-identical issue titles.

    Batches smaller than :data:`ANOMALOUS_MIN_VOLUME` are never anomalous.
    Otherwise every title is reduced to its signature, and the batch is
    anomalous when the most common non-empty signature accounts for at least
    :data:`ANOMALOUS_DUP_RATIO` of *all* entries (entries with an empty
    signature still count towards the total).
    """
    batch_size = len(entries)
    if batch_size < ANOMALOUS_MIN_VOLUME:
        return False

    tally: dict[tuple[str, ...], int] = {}
    signatures = (_title_signature(_entry_title(entry)) for entry in entries)
    for signature in signatures:
        if signature:
            tally[signature] = tally.get(signature, 0) + 1

    if not tally:
        return False
    return max(tally.values()) / batch_size >= ANOMALOUS_DUP_RATIO
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
_REPOS_URL_OWNER_RE = re.compile(r"/repos/([^/]+)/")
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _entry_owner(entry: dict[str, Any]) -> str:
|
|
155
|
+
"""Derive the owning account for a store entry.
|
|
156
|
+
|
|
157
|
+
Prefers an explicit ``issue.owner``, then the ``/repos/<owner>/`` segment of
|
|
158
|
+
``issue.url`` (the canonical GitHub API reference, always present in stores
|
|
159
|
+
built by discovery), and finally ``issue.repository`` — which appears both
|
|
160
|
+
as ``owner/repo`` and as the API-derived ``repos/<owner>`` form.
|
|
161
|
+
"""
|
|
162
|
+
|
|
163
|
+
issue = entry.get("issue") or {}
|
|
164
|
+
owner = issue.get("owner")
|
|
165
|
+
if owner:
|
|
166
|
+
return str(owner)
|
|
167
|
+
match = _REPOS_URL_OWNER_RE.search(str(issue.get("url") or ""))
|
|
168
|
+
if match:
|
|
169
|
+
return match.group(1)
|
|
170
|
+
repository = str(issue.get("repository") or "")
|
|
171
|
+
segments = [part for part in repository.split("/") if part]
|
|
172
|
+
if len(segments) >= 2 and segments[0] == "repos":
|
|
173
|
+
return segments[1]
|
|
174
|
+
if segments:
|
|
175
|
+
return segments[0]
|
|
176
|
+
return repository
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def assess_owner_source_noise(
    owner_metadata: dict[str, Any],
    entries: list[dict[str, Any]],
    *,
    now: str | None = None,
) -> dict[str, Any]:
    """Screen a single owner for source noise using offline public signals.

    ``owner_metadata`` carries the owner's public signals
    (``account_age_days`` or ``created_at``, ``public_repos``, ``followers``,
    ``has_website``, ``payout_verifiable``); a missing ``has_website`` or
    ``payout_verifiable`` counts as a negative signal. ``entries`` are the
    tracker store entries attributed to the owner and feed the volume
    heuristic. ``now`` is the reference timestamp used when the age has to be
    derived from ``created_at``.

    The result holds the sorted ``noise_flags``, the ``strong_flags`` subset,
    ``strong_flag_count``, the ``source_noise`` verdict (at least
    :data:`STRONG_FLAG_THRESHOLD` strong flags) and ``tracked_entries``.
    """
    metadata = owner_metadata or {}
    age = _account_age_days(metadata, now)
    public_repos = metadata.get("public_repos")
    followers = metadata.get("followers")

    # One row per flag; the rows are evaluated top to bottom.
    tripped = {
        "new_account": age is not None and age < NEW_ACCOUNT_MAX_AGE_DAYS,
        "low_repos": public_repos is not None and int(public_repos) <= LOW_REPO_MAX,
        "few_followers": followers is not None and int(followers) <= FEW_FOLLOWERS_MAX,
        "no_website": not metadata.get("has_website", False),
        "unverifiable_payout": not metadata.get("payout_verifiable", False),
        "anomalous_volume": _is_anomalous_volume(entries),
    }

    noise_flags = sorted(name for name, hit in tripped.items() if hit)
    strong_flags = [name for name in noise_flags if name in STRONG_NOISE_FLAGS]
    strong_count = len(strong_flags)
    return {
        "schema_version": SOURCE_NOISE_SCHEMA_VERSION,
        "noise_flags": noise_flags,
        "strong_flags": strong_flags,
        "strong_flag_count": strong_count,
        "source_noise": strong_count >= STRONG_FLAG_THRESHOLD,
        "tracked_entries": len(entries),
    }
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def entries_by_owner(store: dict[str, Any]) -> dict[str, list[dict[str, Any]]]:
    """Bucket the store's entries by derived owner.

    The returned lists hold the very same entry objects as the store, so
    mutating an entry through the grouping mutates the store.
    """
    grouped: dict[str, list[dict[str, Any]]] = {}
    for entry in store.get("entries", {}).values():
        owner = _entry_owner(entry)
        if owner not in grouped:
            grouped[owner] = []
        grouped[owner].append(entry)
    return grouped
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _validate_manual_overrides(
|
|
244
|
+
manual_overrides: dict[str, list[str]] | None,
|
|
245
|
+
) -> dict[str, list[str]]:
|
|
246
|
+
"""Return validated overrides, raising ``ValueError`` on malformed flags.
|
|
247
|
+
|
|
248
|
+
Every value must be a list whose members are all non-empty strings. A
|
|
249
|
+
non-string or empty/blank string is a caller bug, not noise data, so it is
|
|
250
|
+
rejected loudly rather than silently coerced.
|
|
251
|
+
"""
|
|
252
|
+
|
|
253
|
+
overrides = manual_overrides or {}
|
|
254
|
+
for url, flags in overrides.items():
|
|
255
|
+
if not isinstance(flags, list):
|
|
256
|
+
raise ValueError(
|
|
257
|
+
f"manual_overrides[{url!r}] must be a list of flag strings, "
|
|
258
|
+
f"got {type(flags).__name__}"
|
|
259
|
+
)
|
|
260
|
+
for flag in flags:
|
|
261
|
+
if not isinstance(flag, str) or not flag.strip():
|
|
262
|
+
raise ValueError(
|
|
263
|
+
f"manual_overrides[{url!r}] contains an invalid flag "
|
|
264
|
+
f"{flag!r}: flags must be non-empty strings"
|
|
265
|
+
)
|
|
266
|
+
return overrides
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def apply_source_noise_to_store(
    store: dict[str, Any],
    owner_metadata: dict[str, dict[str, Any]] | None = None,
    *,
    now: str | None = None,
    manual_overrides: dict[str, list[str]] | None = None,
) -> dict[str, Any]:
    """Write the owner-level ``source_noise`` verdict onto store entries in place.

    Pass 1 (owner level): entries are grouped by owner and each owner is
    assessed once with :func:`assess_owner_source_noise`, using
    ``owner_metadata[owner]`` or an empty mapping when the owner is unknown.
    Entries of a flagged owner receive a copy of the owner's flag list;
    entries of a clean owner are reset to ``[]``, so a re-apply with refreshed
    metadata can clear an owner that was flagged before.

    Pass 2 (issue level): ``manual_overrides`` maps an exact
    ``store["entries"]`` URL key to a list of flags and always wins over
    pass 1:

    * a non-empty list is merged with the entry's current flags,
      de-duplicated and sorted -- even when the owner is clean;
    * an empty list forces the entry clean, even when the owner is flagged.

    Overrides are not stored anywhere; they take effect only because the
    caller supplies them on every apply. Malformed overrides raise
    :class:`ValueError`. Returns a summary of the pass.
    """
    known_metadata = owner_metadata or {}
    overrides = _validate_manual_overrides(manual_overrides)

    owners_assessed = 0
    owners_flagged = 0
    owners_without_metadata: list[str] = []
    entries_flagged = 0
    entries_cleared = 0
    entries_manual_noise = 0
    entries_manual_clean = 0
    manual_urls_not_in_store: list[str] = []

    entries = store.get("entries", {})

    for owner, owner_entries in entries_by_owner(store).items():
        if owner not in known_metadata:
            # Missing metadata is scored as negative signals, so report the
            # owner: a long list means owners were screened without ever
            # being looked up.
            owners_without_metadata.append(owner)
        assessment = assess_owner_source_noise(
            known_metadata.get(owner, {}), owner_entries, now=now
        )
        owners_assessed += 1

        if assessment["source_noise"]:
            owners_flagged += 1
            for entry in owner_entries:
                # Each entry gets its own copy of the flag list.
                entry["noise_flags"] = list(assessment["noise_flags"])
            entries_flagged += len(owner_entries)
        else:
            for entry in owner_entries:
                entry["noise_flags"] = []
            entries_cleared += len(owner_entries)

    # Second pass over URL keys, so an override also lands on the entry of a
    # clean owner.
    for url, manual_flags in overrides.items():
        entry = entries.get(url)
        if entry is None:
            manual_urls_not_in_store.append(url)
        elif manual_flags:
            current_flags = entry.get("noise_flags") or []
            entry["noise_flags"] = sorted({*current_flags, *manual_flags})
            entries_manual_noise += 1
        else:
            entry["noise_flags"] = []
            entries_manual_clean += 1

    return {
        "owners_assessed": owners_assessed,
        "owners_flagged": owners_flagged,
        "owners_without_metadata": owners_without_metadata,
        "entries_flagged": entries_flagged,
        "entries_cleared": entries_cleared,
        "entries_manual_noise": entries_manual_noise,
        "entries_manual_clean": entries_manual_clean,
        "manual_urls_not_in_store": manual_urls_not_in_store,
    }
|