pmkit 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pmkit/__init__.py +8 -0
- pmkit/backlog.py +409 -0
- pmkit/cli.py +723 -0
- pmkit/connectors/__init__.py +35 -0
- pmkit/connectors/base.py +67 -0
- pmkit/connectors/changelog.py +37 -0
- pmkit/connectors/github.py +49 -0
- pmkit/connectors/hn.py +42 -0
- pmkit/connectors/reddit.py +42 -0
- pmkit/connectors/web.py +44 -0
- pmkit/connectors/x.py +50 -0
- pmkit/dedup.py +64 -0
- pmkit/discover.py +83 -0
- pmkit/dogfood/__init__.py +7 -0
- pmkit/dogfood/file_gaps.py +52 -0
- pmkit/dogfood/install.py +111 -0
- pmkit/dogfood/mcp.py +73 -0
- pmkit/dogfood/report.py +157 -0
- pmkit/dogfood/sample.py +32 -0
- pmkit/dogfood/ui.py +106 -0
- pmkit/killtest.py +31 -0
- pmkit/launch/__init__.py +15 -0
- pmkit/launch/collateral.py +159 -0
- pmkit/launch/drafts.py +53 -0
- pmkit/launch/listen.py +88 -0
- pmkit/launch/plan.py +82 -0
- pmkit/launch/policy.py +153 -0
- pmkit/launch/store.py +260 -0
- pmkit/rice.py +54 -0
- pmkit-0.1.1.dist-info/METADATA +29 -0
- pmkit-0.1.1.dist-info/RECORD +33 -0
- pmkit-0.1.1.dist-info/WHEEL +4 -0
- pmkit-0.1.1.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Discovery connectors registry.
|
|
2
|
+
|
|
3
|
+
Each connector fetches signals from one OSS source and returns normalized candidate
|
|
4
|
+
dicts with provenance. Connectors degrade gracefully: a missing key or a source error
|
|
5
|
+
skips that source rather than aborting the run.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from .github import GitHubConnector
|
|
11
|
+
from .hn import HNConnector
|
|
12
|
+
from .reddit import RedditConnector
|
|
13
|
+
from .web import WebConnector
|
|
14
|
+
from .x import XConnector
|
|
15
|
+
|
|
16
|
+
# Order matters only for display; discovery runs all available connectors.
|
|
17
|
+
REGISTRY = [
|
|
18
|
+
GitHubConnector(),
|
|
19
|
+
HNConnector(),
|
|
20
|
+
RedditConnector(),
|
|
21
|
+
WebConnector(),
|
|
22
|
+
XConnector(),
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get_connectors(names=None):
    """Return connector instances from ``REGISTRY``, optionally filtered by name.

    Args:
        names: Iterable of connector ``name`` values to keep, or a single name
            given as a string. Any falsy value (``None``, empty) selects every
            registered connector.

    Returns:
        A new list of connector instances in registry order (the order of
        ``names`` does not matter).

    Raises:
        ValueError: If a requested name matches no registered connector.
    """
    if not names:
        return list(REGISTRY)
    # A bare string would otherwise be exploded into single characters by set(),
    # turning get_connectors("hn") into a lookup for "h" and "n".
    wanted = {names} if isinstance(names, str) else set(names)
    unknown = wanted - {c.name for c in REGISTRY}
    if unknown:
        raise ValueError(f"unknown source(s): {sorted(unknown)}")
    return [c for c in REGISTRY if c.name in wanted]
|
pmkit/connectors/base.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Connector framework: config, HTTP helper, and the candidate shape.
|
|
2
|
+
|
|
3
|
+
A connector fetches signals from one OSS source and returns *candidate* dicts. The HTTP
|
|
4
|
+
fetch is always separated from a pure ``parse_*`` function so parsing is unit-testable
|
|
5
|
+
without the network. Connectors never raise out of discovery: a missing key or a source
|
|
6
|
+
error is reported as a skip, not a crash (graceful degradation, R3/U3).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import os
|
|
13
|
+
import urllib.error
|
|
14
|
+
import urllib.request
|
|
15
|
+
from dataclasses import dataclass
|
|
16
|
+
from typing import Optional
|
|
17
|
+
|
|
18
|
+
DEFAULT_UA = "pmkit/0.1 (+https://github.com/dkedar7/pm-system)"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ConnectorError(Exception):
    """Failure of a single source (auth, network, bad target).

    Discovery catches this and records the source as skipped.
    """
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class Config:
    """Environment-derived runtime settings; every credential is optional."""

    github_token: Optional[str] = None
    brave_key: Optional[str] = None
    x_bearer: Optional[str] = None
    user_agent: str = DEFAULT_UA
    timeout: float = 15.0
    # Candidates whose engagement is below this are flagged low-confidence.
    min_engagement: int = 2

    @classmethod
    def from_env(cls) -> "Config":
        """Build a config from environment variables.

        ``GITHUB_TOKEN`` takes precedence over ``GH_TOKEN`` and
        ``X_BEARER_TOKEN`` over ``TWITTER_BEARER_TOKEN``; an unset or empty
        variable falls through to the next one. Fields not listed here keep
        their defaults.
        """
        env = os.environ
        github = env.get("GITHUB_TOKEN") or env.get("GH_TOKEN")
        brave = env.get("BRAVE_API_KEY")
        x_token = env.get("X_BEARER_TOKEN") or env.get("TWITTER_BEARER_TOKEN")
        return cls(github_token=github, brave_key=brave, x_bearer=x_token)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def candidate(title: str, problem: str, source_type: str, url: str,
              engagement: int = 0, created_at: Optional[str] = None) -> dict:
    """Assemble the normalized candidate dict carrying a single provenance entry.

    ``title`` and ``problem`` are whitespace-stripped and capped at 200 and
    1000 characters; a falsy value becomes ``""``. A falsy ``engagement``
    becomes ``0``. The ``source`` sub-dict records where the signal came from.
    """
    clean_title = (title or "").strip()
    clean_problem = (problem or "").strip()
    provenance = {"type": source_type, "url": url, "created_at": created_at}
    return {
        "title": clean_title[:200],
        "problem": clean_problem[:1000],
        "engagement": int(engagement or 0),
        "source": provenance,
    }
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def http_get_json(url: str, headers: Optional[dict] = None, timeout: float = 15.0):
    """GET ``url`` and return its body parsed as JSON.

    Args:
        url: Absolute URL to fetch.
        headers: Extra request headers; they override the default User-Agent
            when they carry one.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The decoded JSON value.

    Raises:
        ConnectorError: On any failure — HTTP error status, network/socket
            error or timeout, truncated response, malformed URL, or a body
            that is not UTF-8 encoded JSON.
    """
    import http.client  # local import: only needed to name HTTPException below

    try:
        # Request() raises ValueError for a malformed URL, so it is built inside
        # the guarded region as well.
        req = urllib.request.Request(url, headers={"User-Agent": DEFAULT_UA, **(headers or {})})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as e:
        # Must precede the OSError clause: HTTPError is a URLError/OSError subclass.
        raise ConnectorError(f"HTTP {e.code} for {url}") from e
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        raise ConnectorError(f"bad JSON from {url}: {e}") from e
    except (OSError, http.client.HTTPException) as e:
        # OSError covers URLError, TimeoutError/socket.timeout and connection
        # resets; HTTPException covers truncated or malformed responses.
        raise ConnectorError(f"network error for {url}: {e}") from e
    except ValueError as e:
        raise ConnectorError(f"bad URL {url!r}: {e}") from e
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Changelog / releases utility.
|
|
2
|
+
|
|
3
|
+
Recent releases are not opportunities, so this is not a candidate-producing connector.
|
|
4
|
+
It supplies "recently shipped" context that the already-solved kill-test (U4) uses to
|
|
5
|
+
refute candidates a maintainer has just addressed. Zero-config (GitHub releases API).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from .base import Config, ConnectorError, http_get_json
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def recent_releases(target: str, cfg: Config, limit: int = 10) -> list[dict]:
    """Fetch the most recent GitHub releases for an ``owner/repo`` target.

    Args:
        target: Repository in ``owner/repo`` form.
        cfg: Runtime config; ``github_token`` (if set) is sent as a bearer
            token and ``timeout`` bounds the request.
        limit: Number of releases to request, capped at 100.

    Returns:
        Release dicts as produced by ``parse_releases``.

    Raises:
        ConnectorError: If ``target`` has no ``/`` or the request fails.
    """
    if "/" not in target:
        raise ConnectorError(f"changelog needs an owner/repo target, got {target!r}")
    owner, repo = target.split("/", 1)
    # Same page-size cap the GitHub issues connector applies to per_page.
    per_page = min(limit, 100)
    url = f"https://api.github.com/repos/{owner}/{repo}/releases?per_page={per_page}"
    headers = {"Accept": "application/vnd.github+json"}
    if cfg.github_token:
        headers["Authorization"] = f"Bearer {cfg.github_token}"
    data = http_get_json(url, headers, cfg.timeout)
    return parse_releases(data)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def parse_releases(data: list) -> list[dict]:
    """Pure parser: GitHub releases JSON -> trimmed release dicts.

    Each result carries ``name`` (falling back to the tag when the name is
    empty), ``tag``, ``published_at``, ``body`` (capped at 2000 characters)
    and ``url``. A falsy ``data`` yields an empty list.
    """

    def _trim(release: dict) -> dict:
        notes = release.get("body") or ""
        return {
            "name": release.get("name") or release.get("tag_name", ""),
            "tag": release.get("tag_name", ""),
            "published_at": release.get("published_at"),
            "body": notes[:2000],
            "url": release.get("html_url", ""),
        }

    return [_trim(release) for release in (data or [])]
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""GitHub issues connector. Zero-config (better with GITHUB_TOKEN for rate limits)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .base import Config, ConnectorError, candidate, http_get_json
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class GitHubConnector:
    """Candidate source backed by a repository's open GitHub issues."""

    name = "github"

    def available(self, cfg: Config) -> tuple[bool, str]:
        """Report the source as usable; a token is optional."""
        note = "public API (set GITHUB_TOKEN to raise rate limits)"
        return True, note

    def fetch(self, target: str, cfg: Config, limit: int = 25) -> list[dict]:
        """Fetch open issues for an ``owner/repo`` target and parse them.

        Raises ``ConnectorError`` when ``target`` contains no ``/``; request
        failures surface from ``http_get_json``.
        """
        if "/" not in target:
            raise ConnectorError(f"github needs an owner/repo target, got {target!r}")
        owner, _, repo = target.partition("/")
        # sort=comments (desc) biases toward discussed issues; avoids the URL-encoding
        # pitfalls of the reactions-+1 token while still surfacing high-signal pain.
        page_size = min(limit, 100)
        endpoint = f"https://api.github.com/repos/{owner}/{repo}/issues"
        query = f"?state=open&sort=comments&direction=desc&per_page={page_size}"
        request_headers = {"Accept": "application/vnd.github+json"}
        if cfg.github_token:
            request_headers["Authorization"] = f"Bearer {cfg.github_token}"
        payload = http_get_json(endpoint + query, request_headers, cfg.timeout)
        return parse_issues(payload, target)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def parse_issues(data: list, target: str) -> list[dict]:
    """Pure parser: GitHub issues JSON -> candidates. Skips pull requests.

    Engagement is the reaction total plus the comment count; missing or null
    counters count as zero. When an issue has no body, its title is used as
    the problem text.

    Args:
        data: Decoded response of the issues endpoint (a list of issue dicts);
            a falsy value yields an empty list.
        target: The ``owner/repo`` the issues belong to (currently unused).

    Returns:
        One candidate dict per issue that is not a pull request.
    """
    out: list[dict] = []
    for it in data or []:
        if "pull_request" in it:  # the issues endpoint also returns PRs
            continue
        reactions = (it.get("reactions") or {}).get("total_count") or 0
        comments = it.get("comments") or 0
        title = it.get("title") or ""
        body = (it.get("body") or "")[:1000]
        # Discovery builds its exact-dedup key from the problem text alone, so
        # leaving every body-less issue with "" would merge them into one item.
        problem = body if body.strip() else title
        out.append(
            candidate(
                title=title,
                problem=problem,
                source_type="github",
                url=it.get("html_url", ""),
                engagement=int(reactions) + int(comments),
                created_at=it.get("created_at"),
            )
        )
    return out
|
pmkit/connectors/hn.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""Hacker News connector via the Algolia search API. Zero-config."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import urllib.parse
|
|
6
|
+
|
|
7
|
+
from .base import Config, candidate, http_get_json
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class HNConnector:
    """Candidate source backed by Hacker News stories (Algolia search)."""

    name = "hn"

    def available(self, cfg: Config) -> tuple[bool, str]:
        """Report the source as usable; no key is involved."""
        return (True, "Algolia HN search (no key)")

    def fetch(self, target: str, cfg: Config, limit: int = 25) -> list[dict]:
        """Search stories for the last ``/``-separated segment of ``target``."""
        # Only the repo/project name is searched, not the owner prefix.
        project = target.rsplit("/", 1)[-1]
        encoded = urllib.parse.quote(project)
        endpoint = "https://hn.algolia.com/api/v1/search"
        url = f"{endpoint}?query={encoded}&tags=story&hitsPerPage={limit}"
        payload = http_get_json(url, {}, cfg.timeout)
        return parse_hn(payload, project)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def parse_hn(data: dict, query: str) -> list[dict]:
    """Pure parser: Algolia HN search JSON -> candidates.

    Hits without a title are skipped. Engagement is points plus comment
    count, with missing or null counters treated as zero. The candidate URL is
    the story's external URL, else its Hacker News item page, else ``""`` when
    the hit has no ``objectID`` either.

    Args:
        data: Decoded search response; a falsy value yields an empty list.
        query: The search term, echoed into each candidate's problem text.

    Returns:
        One candidate dict per titled hit.
    """
    out: list[dict] = []
    for hit in (data or {}).get("hits") or []:
        title = hit.get("title") or hit.get("story_title") or ""
        if not title:
            continue
        object_id = hit.get("objectID")
        # Never fabricate ".../item?id=None": an empty URL lets discovery flag
        # the candidate as low-confidence instead.
        fallback = f"https://news.ycombinator.com/item?id={object_id}" if object_id else ""
        url = hit.get("url") or fallback
        # `or 0` guards null counters so one hit cannot fail the whole source.
        engagement = int(hit.get("points") or 0) + int(hit.get("num_comments") or 0)
        out.append(
            candidate(
                title=title,
                # The title is part of the problem text on purpose: discovery keys
                # exact dedup on the problem alone, so a constant string here
                # would merge every story into a single backlog item.
                problem=f"{title} (discussed on Hacker News re: {query})",
                source_type="hn",
                url=url,
                engagement=engagement,
                created_at=hit.get("created_at"),
            )
        )
    return out
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""Reddit connector via the public search JSON. Keyless but rate-limited; degrades."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import urllib.parse
|
|
6
|
+
|
|
7
|
+
from .base import Config, candidate, http_get_json
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class RedditConnector:
    """Candidate source backed by Reddit's public search JSON."""

    name = "reddit"

    def available(self, cfg: Config) -> tuple[bool, str]:
        """Report the source as usable; no key is involved."""
        return (True, "public search JSON (best-effort; may rate-limit)")

    def fetch(self, target: str, cfg: Config, limit: int = 25) -> list[dict]:
        """Search the past year's top posts for the last segment of ``target``."""
        project = target.rsplit("/", 1)[-1]
        encoded = urllib.parse.quote(project)
        endpoint = "https://www.reddit.com/search.json"
        url = f"{endpoint}?q={encoded}&sort=top&t=year&limit={limit}"
        payload = http_get_json(url, {}, cfg.timeout)
        return parse_reddit(payload, project)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def parse_reddit(data: dict, query: str) -> list[dict]:
    """Pure parser: Reddit search JSON -> candidates.

    Posts without a title are skipped. Engagement is score plus comment count,
    with missing or null counters treated as zero. The URL is the post's
    permalink on reddit.com when present, else the post's ``url`` field.

    Args:
        data: Decoded search response; a falsy value or missing/null
            ``data``/``children`` sections yield an empty list.
        query: The search term, echoed into the fallback problem text.

    Returns:
        One candidate dict per titled post.
    """
    out: list[dict] = []
    listing = (data or {}).get("data") or {}
    for child in listing.get("children") or []:
        d = child.get("data") or {}
        title = d.get("title") or ""
        if not title:
            continue
        permalink = d.get("permalink") or ""
        # Posts without selftext get a title-specific fallback: discovery keys
        # exact dedup on the problem text alone, so a constant fallback would
        # merge all such posts into a single backlog item.
        problem = d.get("selftext") or f"{title} (Reddit discussion re: {query})"
        out.append(
            candidate(
                title=title,
                problem=problem[:1000],
                source_type="reddit",
                url=f"https://www.reddit.com{permalink}" if permalink else d.get("url", ""),
                engagement=int(d.get("score") or 0) + int(d.get("num_comments") or 0),
                created_at=str(d.get("created_utc", "")),
            )
        )
    return out
|
pmkit/connectors/web.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Web search connector via Brave Search. Requires BRAVE_API_KEY; skipped otherwise."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import urllib.parse
|
|
6
|
+
|
|
7
|
+
from .base import Config, ConnectorError, candidate, http_get_json
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class WebConnector:
    """Candidate source backed by Brave web search; needs ``cfg.brave_key``."""

    name = "web"

    def available(self, cfg: Config) -> tuple[bool, str]:
        """Usable only when a Brave API key is configured."""
        if not cfg.brave_key:
            return False, "no BRAVE_API_KEY"
        return True, "Brave Search"

    def fetch(self, target: str, cfg: Config, limit: int = 20) -> list[dict]:
        """Search the web for pain signals about the last segment of ``target``.

        Raises ``ConnectorError`` when no key is configured. At most 20 results
        are requested.
        """
        if not cfg.brave_key:
            raise ConnectorError("no BRAVE_API_KEY")
        project = target.split("/")[-1]
        search = f"{project} issues OR limitations OR feature request"
        encoded = urllib.parse.quote(search)
        count = min(limit, 20)
        url = f"https://api.search.brave.com/res/v1/web/search?q={encoded}&count={count}"
        request_headers = {
            "Accept": "application/json",
            "X-Subscription-Token": cfg.brave_key,
        }
        payload = http_get_json(url, request_headers, cfg.timeout)
        return parse_brave(payload, target)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def parse_brave(data: dict, target: str) -> list[dict]:
    """Pure parser: Brave web-search JSON -> candidates.

    Results without a title are skipped. When a result has no description,
    its title is used as the problem text. Engagement is always zero.

    Args:
        data: Decoded search response; a falsy value or a missing/null ``web``
            or ``results`` section yields an empty list.
        target: The target being researched (currently unused).

    Returns:
        One candidate dict per titled result.
    """
    out: list[dict] = []
    web = (data or {}).get("web") or {}
    for r in web.get("results") or []:
        title = r.get("title") or ""
        if not title:
            continue
        description = (r.get("description") or "")[:1000]
        # Discovery builds its exact-dedup key from the problem text alone, so
        # an empty description must not leave every such result with "".
        problem = description if description.strip() else title
        out.append(
            candidate(
                title=title,
                problem=problem,
                source_type="web",
                url=r.get("url", ""),
                engagement=0,  # web results carry no engagement signal
            )
        )
    return out
|
pmkit/connectors/x.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""X (Twitter) connector via API v2 recent search. Requires X_BEARER_TOKEN; skipped otherwise."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import urllib.parse
|
|
6
|
+
|
|
7
|
+
from .base import Config, ConnectorError, candidate, http_get_json
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class XConnector:
    """Candidate source backed by X recent search; needs ``cfg.x_bearer``."""

    name = "x"

    def available(self, cfg: Config) -> tuple[bool, str]:
        """Usable only when a bearer token is configured."""
        if not cfg.x_bearer:
            return False, "no X_BEARER_TOKEN"
        return True, "X API v2 recent search"

    def fetch(self, target: str, cfg: Config, limit: int = 25) -> list[dict]:
        """Search recent English, non-retweet posts about the last segment of ``target``.

        Raises ``ConnectorError`` when no bearer token is configured. The page
        size is ``limit`` clamped into the 10-100 range.
        """
        if not cfg.x_bearer:
            raise ConnectorError("no X_BEARER_TOKEN")
        project = target.split("/")[-1]
        search = f"{project} (bug OR feature OR wish) -is:retweet lang:en"
        page_size = max(limit, 10)
        if page_size > 100:
            page_size = 100
        params = (
            f"query={urllib.parse.quote(search)}",
            f"max_results={page_size}",
            "tweet.fields=public_metrics,created_at",
        )
        url = "https://api.twitter.com/2/tweets/search/recent?" + "&".join(params)
        request_headers = {"Authorization": f"Bearer {cfg.x_bearer}"}
        payload = http_get_json(url, request_headers, cfg.timeout)
        return parse_x(payload, target)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def parse_x(data: dict, target: str) -> list[dict]:
    """Pure parser: X recent-search JSON -> candidates.

    Posts without text are skipped. Engagement is likes plus retweets, with
    missing or null metrics treated as zero. The first 120 characters of the
    text serve as the title. The URL is empty when the post has no ``id``.

    Args:
        data: Decoded search response; a falsy value or a missing/null
            ``data`` list yields an empty list.
        target: The target being researched (currently unused).

    Returns:
        One candidate dict per post that has text.
    """
    out: list[dict] = []
    for t in (data or {}).get("data") or []:
        text = t.get("text") or ""
        if not text:
            continue
        metrics = t.get("public_metrics") or {}
        engagement = int(metrics.get("like_count") or 0) + int(metrics.get("retweet_count") or 0)
        tweet_id = t.get("id")
        # Never fabricate ".../status/None": an empty URL lets discovery flag
        # the candidate as low-confidence instead.
        url = f"https://twitter.com/i/web/status/{tweet_id}" if tweet_id else ""
        out.append(
            candidate(
                title=text[:120],
                problem=text,
                source_type="x",
                url=url,
                engagement=engagement,
                created_at=t.get("created_at"),
            )
        )
    return out
|
pmkit/dedup.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Near-duplicate detection for discovered candidates.
|
|
2
|
+
|
|
3
|
+
The backlog handles *exact* dedup via the normalized (target, dedup_key). This module adds
|
|
4
|
+
*near*-duplicate matching so two differently-worded reports of the same problem on the same
|
|
5
|
+
target collapse into one item (evidence accrues rather than duplicating). Pure functions,
|
|
6
|
+
no I/O — fully unit-testable.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
from typing import Optional
|
|
13
|
+
|
|
14
|
+
_TOKEN_RE = re.compile(r"[a-z0-9]+")
|
|
15
|
+
|
|
16
|
+
# A small stopword set keeps generic words from inflating similarity.
|
|
17
|
+
_STOP = {
|
|
18
|
+
"the", "a", "an", "to", "of", "and", "or", "in", "on", "for", "is", "are",
|
|
19
|
+
"be", "with", "no", "not", "it", "this", "that", "when", "how", "i", "we",
|
|
20
|
+
"you", "my", "should", "would", "could", "can", "have", "has", "add", "support",
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
DEFAULT_THRESHOLD = 0.6
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def token_set(text: str) -> set[str]:
    """Return the distinct significant tokens of ``text``.

    The text is lowercased and split into ``[a-z0-9]+`` runs; stopwords and
    tokens of one or two characters are dropped. A falsy ``text`` gives an
    empty set.
    """
    lowered = (text or "").lower()
    kept: set[str] = set()
    for token in _TOKEN_RE.findall(lowered):
        if token in _STOP or len(token) <= 2:
            continue
        kept.add(token)
    return kept
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def jaccard(a: set[str], b: set[str]) -> float:
    """Return |a & b| / |a | b|, or 0.0 when both sets are empty."""
    if not (a or b):
        return 0.0
    shared = a & b
    combined = a | b
    if not combined:
        return 0.0
    return len(shared) / len(combined)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def similarity(text_a: str, text_b: str) -> float:
    """Score two texts by the Jaccard overlap of their token sets."""
    tokens_a = token_set(text_a)
    tokens_b = token_set(text_b)
    return jaccard(tokens_a, tokens_b)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _candidate_text(item: dict) -> str:
|
|
44
|
+
return f"{item.get('title', '')} {item.get('problem', '')}"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def find_near_duplicate(
    cand: dict,
    existing: list[dict],
    threshold: float = DEFAULT_THRESHOLD,
) -> Optional[dict]:
    """Pick the existing item most similar to ``cand``, if any reaches ``threshold``.

    ``cand`` and every entry of ``existing`` are dicts with 'title' and
    'problem'. A score equal to the threshold qualifies; among equally scored
    items the one latest in ``existing`` is returned. Returns None when
    nothing qualifies.
    """
    needle = _candidate_text(cand)
    winner: Optional[dict] = None
    bar = threshold
    for entry in existing:
        score = similarity(needle, _candidate_text(entry))
        if score < bar:
            continue
        winner, bar = entry, score
    return winner
|
pmkit/discover.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""Discovery orchestration: run connectors, dedup, and write candidates to the backlog.
|
|
2
|
+
|
|
3
|
+
Connectors are injectable so the orchestration is testable without the network. Each
|
|
4
|
+
candidate is matched against the existing backlog (near-duplicate by similarity, exact by
|
|
5
|
+
the backlog's own key) — a match attaches evidence, a miss creates a new item. Candidates
|
|
6
|
+
with weak engagement or no source are flagged low-confidence rather than dropped (R3).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Optional
|
|
12
|
+
|
|
13
|
+
from .backlog import Backlog, make_dedup_key
|
|
14
|
+
from .connectors import get_connectors
|
|
15
|
+
from .connectors.base import Config, ConnectorError
|
|
16
|
+
from .dedup import DEFAULT_THRESHOLD, find_near_duplicate
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def run_discovery(
    backlog: Backlog,
    target: str,
    connectors: Optional[list] = None,
    cfg: Optional[Config] = None,
    limit: int = 25,
    near_threshold: float = DEFAULT_THRESHOLD,
) -> dict:
    """Run the connectors for ``target`` and fold their candidates into ``backlog``.

    Each candidate is matched against the backlog by exact key first, then by
    near-duplicate similarity against the items known for this target. A match
    attaches the candidate's source as evidence; a miss creates a new item.
    A connector that is unavailable or fails is recorded as skipped and the
    run continues with the next one.

    Args:
        backlog: Backlog to read existing items from and write results to.
        target: Target the discovery runs for (passed to every connector).
        connectors: Connector instances to run; ``None`` uses the registry.
        cfg: Runtime config; ``None`` loads it from the environment.
        limit: Per-connector fetch limit.
        near_threshold: Similarity at or above which a candidate is merged
            into an existing item.

    Returns:
        A summary dict with ``target``, the counters ``fetched``, ``new``,
        ``merged`` and ``low_confidence``, ``by_source`` (connector name ->
        number of candidates fetched) and ``skipped`` (a list of
        ``{"source", "reason"}`` dicts).
    """
    cfg = cfg or Config.from_env()
    connectors = connectors if connectors is not None else get_connectors()

    summary = {
        "target": target,
        "fetched": 0,
        "new": 0,
        "merged": 0,
        "low_confidence": 0,
        "by_source": {},
        "skipped": [],
    }

    # Seed with existing items for this target so re-runs dedup across runs (R10).
    current = [it for it in backlog.list() if it["target"] == target]

    for conn in connectors:
        try:
            # The availability check sits inside the guard as well: a bug there
            # should skip this source, not abort the run.
            ok, reason = conn.available(cfg)
            if not ok:
                summary["skipped"].append({"source": conn.name, "reason": reason})
                continue
            cands = conn.fetch(target, cfg, limit)
        except ConnectorError as e:
            summary["skipped"].append({"source": conn.name, "reason": str(e)})
            continue
        except Exception as e:  # defensive: a connector bug must not abort the run
            summary["skipped"].append({"source": conn.name, "reason": f"unexpected: {e}"})
            continue

        summary["by_source"][conn.name] = len(cands)
        for cand in cands:
            summary["fetched"] += 1
            low_conf = cand["engagement"] < cfg.min_engagement or not cand["source"].get("url")
            # Match exact-key first (the same dedup add_candidate would do internally), then
            # near-duplicate — so a candidate that add_candidate would merge is counted as
            # 'merged', not 'new'.
            key = make_dedup_key(target, cand["problem"])
            dup = backlog.find_existing(target, key) or find_near_duplicate(cand, current, near_threshold)
            if dup is not None:
                backlog.attach_evidence(dup["id"], [cand["source"]])
                summary["merged"] += 1
                continue
            opp_id = backlog.add_candidate(
                target=target,
                title=cand["title"],
                problem=cand["problem"],
                sources=[cand["source"]],
                low_confidence=low_conf,
            )
            summary["new"] += 1
            if low_conf:
                summary["low_confidence"] += 1
            # Make this candidate visible to later ones in the same run.
            current.append({"id": opp_id, "title": cand["title"], "problem": cand["problem"]})

    return summary
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"""pm-dogfood deterministic helpers.
|
|
2
|
+
|
|
3
|
+
The pm-dogfood *skill* owns inference and judgment (infer the usage scenario from a
|
|
4
|
+
product's docs, decide what counts as a gap, confirm reproducibility). This package holds
|
|
5
|
+
the mechanical, unit-testable pieces it calls: the clean-room install runner, the UI and
|
|
6
|
+
MCP drivers, the report/parity builder, and the confirmed+deduped backlog filer.
|
|
7
|
+
"""
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""File confirmed dogfood gaps back to the opportunity backlog (deduped).
|
|
2
|
+
|
|
3
|
+
Closes the funnel loop: a reproducible doc-vs-reality gap becomes a new opportunity.
|
|
4
|
+
Only *confirmed* gaps are filed (the skill marks the ones that reproduced on re-run);
|
|
5
|
+
flaky/unconfirmed gaps stay report-only. Filing reuses the backlog's exact-key dedup, so
|
|
6
|
+
re-running pm-dogfood doesn't pile up duplicates. Each filed item carries a
|
|
7
|
+
``source=dogfood`` provenance entry.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from typing import Iterable
|
|
13
|
+
|
|
14
|
+
from ..backlog import Backlog, make_dedup_key
|
|
15
|
+
from .report import Finding
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def file_gaps(
    backlog: Backlog,
    target: str,
    gaps: Iterable[Finding],
    confirmed: set[str],
    *,
    source: str = "dogfood",
) -> dict:
    """File confirmed gaps into the backlog and report what happened to each.

    ``confirmed`` holds the titles of the gaps that reproduced; any gap whose
    title is not in it is left report-only. A confirmed gap counts as filed
    when adding it made the backlog grow, and as deduped otherwise.

    Returns a dict with three lists: ``filed`` and ``deduped`` hold the ids
    returned by the backlog, ``skipped`` holds the unconfirmed gap titles.
    """
    outcome: dict = {"filed": [], "deduped": [], "skipped": []}
    for gap in gaps:
        if gap.title not in confirmed:
            outcome["skipped"].append(gap.title)
            continue
        size_before = len(backlog.list())
        opp_id = backlog.add_candidate(
            target=target,
            title=f"[dogfood] {gap.title}",
            problem=f"{gap.claim} -> observed: {gap.observed}".strip(),
            # The key combines the gap's title with its observed output rather
            # than the claim text, which is constant per interface — so
            # distinct gaps do not collapse into one.
            dedup_key=make_dedup_key(target, f"{gap.title} {gap.observed}"),
            sources=[{"type": source, "url": "", "excerpt": (gap.observed or "")[:200]}],
        )
        # A backlog that did not grow means the item already existed.
        bucket = "filed" if len(backlog.list()) > size_before else "deduped"
        outcome[bucket].append(opp_id)
    return outcome
|