pmkit 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,35 @@
1
+ """Discovery connectors registry.
2
+
3
+ Each connector fetches signals from one OSS source and returns normalized candidate
4
+ dicts with provenance. Connectors degrade gracefully: a missing key or a source error
5
+ skips that source rather than aborting the run.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from .github import GitHubConnector
11
+ from .hn import HNConnector
12
+ from .reddit import RedditConnector
13
+ from .web import WebConnector
14
+ from .x import XConnector
15
+
16
# Registry order only affects how sources are displayed; discovery runs every
# connector that reports itself available.
REGISTRY = [
    GitHubConnector(),
    HNConnector(),
    RedditConnector(),
    WebConnector(),
    XConnector(),
]


def get_connectors(names=None):
    """Return registered connector instances, optionally restricted to ``names``.

    With no (or empty) ``names`` a shallow copy of the full registry is returned.
    Otherwise the matching connectors are returned in registry order.

    Raises:
        ValueError: if any requested name does not match a registered connector.
    """
    if not names:
        return list(REGISTRY)

    requested = set(names)
    known = {conn.name for conn in REGISTRY}
    missing = requested - known
    if missing:
        raise ValueError(f"unknown source(s): {sorted(missing)}")
    return [conn for conn in REGISTRY if conn.name in requested]
@@ -0,0 +1,67 @@
1
+ """Connector framework: config, HTTP helper, and the candidate shape.
2
+
3
+ A connector fetches signals from one OSS source and returns *candidate* dicts. The HTTP
4
+ fetch is always separated from a pure ``parse_*`` function so parsing is unit-testable
5
+ without the network. Connectors never raise out of discovery: a missing key or a source
6
+ error is reported as a skip, not a crash (graceful degradation, R3/U3).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ import os
13
+ import urllib.error
14
+ import urllib.request
15
+ from dataclasses import dataclass
16
+ from typing import Optional
17
+
18
# User-Agent sent with outgoing requests unless a caller overrides it.
DEFAULT_UA = "pmkit/0.1 (+https://github.com/dkedar7/pm-system)"


class ConnectorError(Exception):
    """Raised for a source-level failure such as auth, network, or a bad target.

    Discovery catches this and records the source as skipped.
    """
23
+
24
+
25
@dataclass
class Config:
    """Runtime settings for connectors; every credential is optional."""

    github_token: Optional[str] = None
    brave_key: Optional[str] = None
    x_bearer: Optional[str] = None
    user_agent: str = DEFAULT_UA
    timeout: float = 15.0
    # Candidates whose engagement falls below this are flagged low-confidence.
    min_engagement: int = 2

    @classmethod
    def from_env(cls) -> "Config":
        """Build a Config from environment variables.

        ``GITHUB_TOKEN`` falls back to ``GH_TOKEN`` and ``X_BEARER_TOKEN`` falls back to
        ``TWITTER_BEARER_TOKEN``; the remaining fields keep their defaults.
        """
        env = os.environ
        github = env.get("GITHUB_TOKEN") or env.get("GH_TOKEN")
        brave = env.get("BRAVE_API_KEY")
        x_token = env.get("X_BEARER_TOKEN") or env.get("TWITTER_BEARER_TOKEN")
        return cls(github_token=github, brave_key=brave, x_bearer=x_token)
43
+
44
+
45
def candidate(title: str, problem: str, source_type: str, url: str,
              engagement: int = 0, created_at: Optional[str] = None) -> dict:
    """Assemble a normalized candidate dict carrying a single provenance entry.

    ``title`` and ``problem`` are stripped and capped at 200 and 1000 characters;
    falsy values become empty strings and a falsy ``engagement`` becomes 0.
    """
    clean_title = (title or "").strip()[:200]
    clean_problem = (problem or "").strip()[:1000]
    score = int(engagement or 0)
    provenance = {"type": source_type, "url": url, "created_at": created_at}
    return {
        "title": clean_title,
        "problem": clean_problem,
        "engagement": score,
        "source": provenance,
    }
54
+
55
+
56
def http_get_json(url: str, headers: Optional[dict] = None, timeout: float = 15.0):
    """GET ``url`` and return the parsed JSON body.

    Args:
        url: URL to fetch.
        headers: Extra request headers; they are merged over the default ``User-Agent``.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        Whatever ``json.loads`` produces for the response body.

    Raises:
        ConnectorError: on an HTTP error status, on any network/transport failure
            (including errors raised while reading the body), or when the body is
            not valid UTF-8 JSON.
    """
    import http.client  # local import: only needed for the transport-error handler

    request = urllib.request.Request(url, headers={"User-Agent": DEFAULT_UA, **(headers or {})})
    try:
        with urllib.request.urlopen(request, timeout=timeout) as resp:
            return json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as e:
        # Must precede the OSError handler: HTTPError is a URLError, which is an OSError.
        raise ConnectorError(f"HTTP {e.code} for {url}") from e
    except (OSError, http.client.HTTPException) as e:
        # OSError covers URLError, TimeoutError and connection resets during read();
        # HTTPException covers truncated or malformed responses (e.g. IncompleteRead).
        raise ConnectorError(f"network error for {url}: {e}") from e
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # A body that is not UTF-8 fails in decode() before json.loads ever runs.
        raise ConnectorError(f"bad JSON from {url}: {e}") from e
@@ -0,0 +1,37 @@
1
+ """Changelog / releases utility.
2
+
3
+ Recent releases are not opportunities, so this is not a candidate-producing connector.
4
+ It supplies "recently shipped" context that the already-solved kill-test (U4) uses to
5
+ refute candidates a maintainer has just addressed. Zero-config (GitHub releases API).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from .base import Config, ConnectorError, http_get_json
11
+
12
+
13
def recent_releases(target: str, cfg: Config, limit: int = 10) -> list[dict]:
    """Fetch recent GitHub releases for an ``owner/repo`` target.

    Requests ``limit`` releases per page from the GitHub releases API, sending a bearer
    token when ``cfg.github_token`` is set, and returns the parsed release dicts.

    Raises:
        ConnectorError: if ``target`` contains no ``/``, or if the HTTP helper fails.
    """
    if "/" not in target:
        raise ConnectorError(f"changelog needs an owner/repo target, got {target!r}")
    owner, repo = target.split("/", 1)
    endpoint = f"https://api.github.com/repos/{owner}/{repo}/releases?per_page={limit}"

    request_headers = {"Accept": "application/vnd.github+json"}
    token = cfg.github_token
    if token:
        request_headers["Authorization"] = f"Bearer {token}"

    payload = http_get_json(endpoint, request_headers, cfg.timeout)
    return parse_releases(payload)
23
+
24
+
25
def parse_releases(data: list) -> list[dict]:
    """Pure parser: GitHub releases JSON -> trimmed release dicts.

    Each result has ``name`` (falling back to the tag name), ``tag``, ``published_at``,
    ``body`` (capped at 2000 characters) and ``url``. A falsy ``data`` yields ``[]``.
    """
    return [
        {
            "name": rel.get("name") or rel.get("tag_name", ""),
            "tag": rel.get("tag_name", ""),
            "published_at": rel.get("published_at"),
            "body": (rel.get("body") or "")[:2000],
            "url": rel.get("html_url", ""),
        }
        for rel in data or []
    ]
@@ -0,0 +1,49 @@
1
+ """GitHub issues connector. Zero-config (better with GITHUB_TOKEN for rate limits)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .base import Config, ConnectorError, candidate, http_get_json
6
+
7
+
8
class GitHubConnector:
    """Connector that turns a repository's open GitHub issues into candidates."""

    name = "github"

    def available(self, cfg: Config) -> tuple[bool, str]:
        """Always available: the public API works without a token."""
        return (True, "public API (set GITHUB_TOKEN to raise rate limits)")

    def fetch(self, target: str, cfg: Config, limit: int = 25) -> list[dict]:
        """Fetch open issues for an ``owner/repo`` target and parse them.

        Raises:
            ConnectorError: if ``target`` contains no ``/``, or if the HTTP helper fails.
        """
        if "/" not in target:
            raise ConnectorError(f"github needs an owner/repo target, got {target!r}")
        owner, repo = target.split("/", 1)

        # Sorting by comment count (descending) favours heavily discussed issues and
        # sidesteps the URL-encoding pitfalls of the reactions-+1 sort token.
        page_size = min(limit, 100)
        params = f"state=open&sort=comments&direction=desc&per_page={page_size}"
        endpoint = f"https://api.github.com/repos/{owner}/{repo}/issues?{params}"

        request_headers = {"Accept": "application/vnd.github+json"}
        token = cfg.github_token
        if token:
            request_headers["Authorization"] = f"Bearer {token}"

        payload = http_get_json(endpoint, request_headers, cfg.timeout)
        return parse_issues(payload, target)
29
+
30
+
31
def parse_issues(data: list, target: str) -> list[dict]:
    """Pure parser: GitHub issues JSON -> candidates, with pull requests filtered out.

    Engagement is the reaction total plus the comment count.
    """
    # The issues endpoint also returns pull requests; they carry a "pull_request" key.
    issues = (entry for entry in data or [] if "pull_request" not in entry)

    results: list[dict] = []
    for issue in issues:
        reaction_info = issue.get("reactions") or {}
        reaction_total = reaction_info.get("total_count", 0)
        score = int(reaction_total) + int(issue.get("comments", 0))
        results.append(
            candidate(
                title=issue.get("title", ""),
                problem=(issue.get("body") or "")[:1000],
                source_type="github",
                url=issue.get("html_url", ""),
                engagement=score,
                created_at=issue.get("created_at"),
            )
        )
    return results
pmkit/connectors/hn.py ADDED
@@ -0,0 +1,42 @@
1
+ """Hacker News connector via the Algolia search API. Zero-config."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import urllib.parse
6
+
7
+ from .base import Config, candidate, http_get_json
8
+
9
+
10
class HNConnector:
    """Connector that searches Hacker News stories through the Algolia API."""

    name = "hn"

    def available(self, cfg: Config) -> tuple[bool, str]:
        """Always available: the search API needs no key."""
        return (True, "Algolia HN search (no key)")

    def fetch(self, target: str, cfg: Config, limit: int = 25) -> list[dict]:
        """Search stories for the last path segment of ``target`` and parse the hits."""
        project = target.rsplit("/", 1)[-1]  # repo/project name
        encoded = urllib.parse.quote(project)
        endpoint = (
            "https://hn.algolia.com/api/v1/search"
            f"?query={encoded}&tags=story&hitsPerPage={limit}"
        )
        payload = http_get_json(endpoint, {}, cfg.timeout)
        return parse_hn(payload, project)
22
+
23
+
24
def parse_hn(data: dict, query: str) -> list[dict]:
    """Pure parser: Algolia HN search JSON -> candidates.

    Hits without a title are skipped. The candidate URL is the hit's own ``url``,
    falling back to the Hacker News item page when an ``objectID`` is present, and to
    an empty string otherwise. Engagement is points plus comment count, with null or
    missing values counted as 0.

    Args:
        data: Decoded search response; a falsy value or a null/missing ``hits`` yields ``[]``.
        query: The search term, used only in the generated problem text.
    """
    out: list[dict] = []
    for hit in (data or {}).get("hits") or []:
        title = hit.get("title") or hit.get("story_title") or ""
        if not title:
            continue
        object_id = hit.get("objectID")
        # Never fabricate ".../item?id=None": an empty URL lets discovery flag the
        # candidate as low-confidence instead of storing a dead link.
        fallback = f"https://news.ycombinator.com/item?id={object_id}" if object_id else ""
        url = hit.get("url") or fallback
        # ``or 0`` tolerates explicit nulls, which int() would reject and which would
        # otherwise cause the whole source to be skipped.
        engagement = int(hit.get("points") or 0) + int(hit.get("num_comments") or 0)
        out.append(
            candidate(
                title=title,
                problem=f"Discussed on Hacker News re: {query}",
                source_type="hn",
                url=url,
                engagement=engagement,
                created_at=hit.get("created_at"),
            )
        )
    return out
@@ -0,0 +1,42 @@
1
+ """Reddit connector via the public search JSON. Keyless but rate-limited; degrades."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import urllib.parse
6
+
7
+ from .base import Config, candidate, http_get_json
8
+
9
+
10
class RedditConnector:
    """Connector that queries Reddit's public search JSON endpoint."""

    name = "reddit"

    def available(self, cfg: Config) -> tuple[bool, str]:
        """Always reported as available; the endpoint is keyless but best-effort."""
        return (True, "public search JSON (best-effort; may rate-limit)")

    def fetch(self, target: str, cfg: Config, limit: int = 25) -> list[dict]:
        """Search the past year's top posts for the last path segment of ``target``."""
        project = target.rsplit("/", 1)[-1]
        encoded = urllib.parse.quote(project)
        endpoint = (
            "https://www.reddit.com/search.json"
            f"?q={encoded}&sort=top&t=year&limit={limit}"
        )
        payload = http_get_json(endpoint, {}, cfg.timeout)
        return parse_reddit(payload, project)
22
+
23
+
24
def parse_reddit(data: dict, query: str) -> list[dict]:
    """Pure parser: Reddit search listing JSON -> candidates.

    Posts without a title are skipped. The problem text is the post's selftext (capped
    at 1000 characters) or a generic note naming ``query``; the URL prefers the
    permalink over the post's own ``url``; engagement is score plus comment count.
    """
    listing = (data or {}).get("data", {})
    results: list[dict] = []
    for child in listing.get("children", []):
        post = child.get("data", {})
        title = post.get("title", "")
        if not title:
            continue
        permalink = post.get("permalink", "")
        body = (post.get("selftext") or f"Reddit discussion re: {query}")[:1000]
        if permalink:
            link = f"https://www.reddit.com{permalink}"
        else:
            link = post.get("url", "")
        score = int(post.get("score", 0)) + int(post.get("num_comments", 0))
        results.append(
            candidate(
                title=title,
                problem=body,
                source_type="reddit",
                url=link,
                engagement=score,
                created_at=str(post.get("created_utc", "")),
            )
        )
    return results
@@ -0,0 +1,44 @@
1
+ """Web search connector via Brave Search. Requires BRAVE_API_KEY; skipped otherwise."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import urllib.parse
6
+
7
+ from .base import Config, ConnectorError, candidate, http_get_json
8
+
9
+
10
class WebConnector:
    """Connector that runs a Brave Search web query; needs ``cfg.brave_key``."""

    name = "web"

    def available(self, cfg: Config) -> tuple[bool, str]:
        """Report whether a Brave API key is configured."""
        return (True, "Brave Search") if cfg.brave_key else (False, "no BRAVE_API_KEY")

    def fetch(self, target: str, cfg: Config, limit: int = 20) -> list[dict]:
        """Search the web for issue/limitation chatter about the target project.

        At most 20 results are requested.

        Raises:
            ConnectorError: if no Brave API key is configured, or if the HTTP helper fails.
        """
        if not cfg.brave_key:
            raise ConnectorError("no BRAVE_API_KEY")
        project = target.split("/")[-1]
        terms = urllib.parse.quote(f"{project} issues OR limitations OR feature request")
        count = min(limit, 20)
        endpoint = f"https://api.search.brave.com/res/v1/web/search?q={terms}&count={count}"
        request_headers = {
            "Accept": "application/json",
            "X-Subscription-Token": cfg.brave_key,
        }
        payload = http_get_json(endpoint, request_headers, cfg.timeout)
        return parse_brave(payload, target)
27
+
28
+
29
def parse_brave(data: dict, target: str) -> list[dict]:
    """Pure parser: Brave web-search JSON -> candidates.

    Results without a title are skipped. Web results carry no engagement signal, so
    every candidate is created with engagement 0. ``target`` is not used here.
    """
    hits = (data or {}).get("web", {}).get("results", [])
    return [
        candidate(
            title=hit.get("title", ""),
            problem=(hit.get("description") or "")[:1000],
            source_type="web",
            url=hit.get("url", ""),
            engagement=0,
        )
        for hit in hits
        if hit.get("title", "")
    ]
pmkit/connectors/x.py ADDED
@@ -0,0 +1,50 @@
1
+ """X (Twitter) connector via API v2 recent search. Requires X_BEARER_TOKEN; skipped otherwise."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import urllib.parse
6
+
7
+ from .base import Config, ConnectorError, candidate, http_get_json
8
+
9
+
10
class XConnector:
    """Connector for the X (Twitter) API v2 recent search; needs ``cfg.x_bearer``."""

    name = "x"

    def available(self, cfg: Config) -> tuple[bool, str]:
        """Report whether a bearer token is configured."""
        return (True, "X API v2 recent search") if cfg.x_bearer else (False, "no X_BEARER_TOKEN")

    def fetch(self, target: str, cfg: Config, limit: int = 25) -> list[dict]:
        """Search recent English, non-retweet posts mentioning the target project.

        ``max_results`` is clamped into the range 10..100.

        Raises:
            ConnectorError: if no bearer token is configured, or if the HTTP helper fails.
        """
        if not cfg.x_bearer:
            raise ConnectorError("no X_BEARER_TOKEN")
        project = target.split("/")[-1]
        search = urllib.parse.quote(f"{project} (bug OR feature OR wish) -is:retweet lang:en")
        page_size = min(max(limit, 10), 100)
        endpoint = (
            "https://api.twitter.com/2/tweets/search/recent"
            f"?query={search}&max_results={page_size}"
            "&tweet.fields=public_metrics,created_at"
        )
        request_headers = {"Authorization": f"Bearer {cfg.x_bearer}"}
        payload = http_get_json(endpoint, request_headers, cfg.timeout)
        return parse_x(payload, target)
30
+
31
+
32
def parse_x(data: dict, target: str) -> list[dict]:
    """Pure parser: X recent-search JSON -> candidates.

    Posts with empty text are skipped. The title is the first 120 characters of the
    text; engagement is likes plus retweets. ``target`` is not used here.
    """
    results: list[dict] = []
    for tweet in (data or {}).get("data", []):
        text = tweet.get("text", "")
        if not text:
            continue
        counts = tweet.get("public_metrics", {})
        likes = int(counts.get("like_count", 0))
        retweets = int(counts.get("retweet_count", 0))
        results.append(
            candidate(
                title=text[:120],
                problem=text,
                source_type="x",
                url=f"https://twitter.com/i/web/status/{tweet.get('id')}",
                engagement=likes + retweets,
                created_at=tweet.get("created_at"),
            )
        )
    return results
pmkit/dedup.py ADDED
@@ -0,0 +1,64 @@
1
+ """Near-duplicate detection for discovered candidates.
2
+
3
+ The backlog handles *exact* dedup via the normalized (target, dedup_key). This module adds
4
+ *near*-duplicate matching so two differently-worded reports of the same problem on the same
5
+ target collapse into one item (evidence accrues rather than duplicating). Pure functions,
6
+ no I/O — fully unit-testable.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import re
12
+ from typing import Optional
13
+
14
# Tokens are maximal runs of lowercase ASCII letters and digits.
_TOKEN_RE = re.compile(r"[a-z0-9]+")

# Generic words that would otherwise inflate similarity between unrelated reports.
_STOP = {
    "the", "a", "an", "to", "of", "and", "or", "in", "on", "for", "is", "are",
    "be", "with", "no", "not", "it", "this", "that", "when", "how", "i", "we",
    "you", "my", "should", "would", "could", "can", "have", "has", "add", "support",
}

# Minimum similarity at which two candidates are treated as near-duplicates.
DEFAULT_THRESHOLD = 0.6


def token_set(text: str) -> set[str]:
    """Return the distinct lowercase tokens of ``text`` that carry signal.

    Stopwords and tokens of two characters or fewer are dropped; a falsy ``text``
    yields an empty set.
    """
    tokens: set[str] = set()
    for tok in _TOKEN_RE.findall((text or "").lower()):
        if len(tok) > 2 and tok not in _STOP:
            tokens.add(tok)
    return tokens
28
+
29
+
30
def jaccard(a: set[str], b: set[str]) -> float:
    """Return the Jaccard index |a & b| / |a | b|, or 0.0 when both sets are empty."""
    if not (a or b):
        return 0.0
    shared = a & b
    combined = a | b
    if not combined:
        return 0.0
    return len(shared) / len(combined)
36
+
37
+
38
def similarity(text_a: str, text_b: str) -> float:
    """Return the Jaccard similarity of the token sets of two texts.

    Callers pass the combined title+problem text of each candidate.
    """
    tokens_a = token_set(text_a)
    tokens_b = token_set(text_b)
    return jaccard(tokens_a, tokens_b)
41
+
42
+
43
def _candidate_text(item: dict) -> str:
    """Join an item's title and problem (each defaulting to '') with a single space."""
    return "{} {}".format(item.get("title", ""), item.get("problem", ""))


def find_near_duplicate(
    cand: dict,
    existing: list[dict],
    threshold: float = DEFAULT_THRESHOLD,
) -> Optional[dict]:
    """Return the existing item most similar to ``cand``, or None if none qualifies.

    An item qualifies when its similarity to ``cand`` is at least ``threshold``. When
    several items tie on the best score, the one appearing last in ``existing`` wins.
    ``cand`` and every ``existing`` item are dicts with 'title' and 'problem'.
    """
    wanted_text = _candidate_text(cand)
    winner: Optional[dict] = None
    bar = threshold  # rises to the best score seen so far
    for entry in existing:
        score = similarity(wanted_text, _candidate_text(entry))
        if score < bar:
            continue
        winner, bar = entry, score
    return winner
pmkit/discover.py ADDED
@@ -0,0 +1,83 @@
1
+ """Discovery orchestration: run connectors, dedup, and write candidates to the backlog.
2
+
3
+ Connectors are injectable so the orchestration is testable without the network. Each
4
+ candidate is matched against the existing backlog (near-duplicate by similarity, exact by
5
+ the backlog's own key) — a match attaches evidence, a miss creates a new item. Candidates
6
+ with weak engagement or no source are flagged low-confidence rather than dropped (R3).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Optional
12
+
13
+ from .backlog import Backlog, make_dedup_key
14
+ from .connectors import get_connectors
15
+ from .connectors.base import Config, ConnectorError
16
+ from .dedup import DEFAULT_THRESHOLD, find_near_duplicate
17
+
18
+
19
def run_discovery(
    backlog: Backlog,
    target: str,
    connectors: Optional[list] = None,
    cfg: Optional[Config] = None,
    limit: int = 25,
    near_threshold: float = DEFAULT_THRESHOLD,
) -> dict:
    """Run every usable connector for ``target`` and file the results in ``backlog``.

    Each fetched candidate is matched against the backlog, by exact dedup key first and
    by near-duplicate similarity second. A match attaches the candidate's source as
    evidence; a miss creates a new backlog item. Connectors that are unavailable or
    whose fetch raises are recorded under ``skipped`` and do not abort the run.

    Args:
        backlog: Backlog to read from and write to.
        target: Target the candidates are discovered for.
        connectors: Connectors to run; defaults to the full registry.
        cfg: Runtime config; defaults to ``Config.from_env()``.
        limit: Per-connector fetch limit.
        near_threshold: Similarity threshold for near-duplicate matching.

    Returns:
        A summary dict with ``target``, the counters ``fetched``, ``new``, ``merged``
        and ``low_confidence``, per-source fetch counts under ``by_source``, and the
        ``skipped`` sources with their reasons.
    """
    if not cfg:
        cfg = Config.from_env()
    if connectors is None:
        connectors = get_connectors()

    per_source: dict = {}
    skipped: list = []
    summary = {
        "target": target,
        "fetched": 0,
        "new": 0,
        "merged": 0,
        "low_confidence": 0,
        "by_source": per_source,
        "skipped": skipped,
    }

    # Start from the items already filed for this target so repeat runs dedup
    # against earlier runs (R10).
    known = [item for item in backlog.list() if item["target"] == target]

    for connector in connectors:
        usable, why = connector.available(cfg)
        if not usable:
            skipped.append({"source": connector.name, "reason": why})
            continue
        try:
            found = connector.fetch(target, cfg, limit)
        except ConnectorError as exc:
            skipped.append({"source": connector.name, "reason": str(exc)})
            continue
        except Exception as exc:  # defensive: a connector bug must not abort the run
            skipped.append({"source": connector.name, "reason": f"unexpected: {exc}"})
            continue

        per_source[connector.name] = len(found)
        for cand in found:
            summary["fetched"] += 1
            weak = cand["engagement"] < cfg.min_engagement or not cand["source"].get("url")

            # Try the exact key first (the same dedup add_candidate performs), then the
            # near-duplicate search, so anything add_candidate would merge is counted
            # as 'merged' rather than 'new'.
            key = make_dedup_key(target, cand["problem"])
            match = backlog.find_existing(target, key)
            if not match:
                match = find_near_duplicate(cand, known, near_threshold)
            if match is not None:
                backlog.attach_evidence(match["id"], [cand["source"]])
                summary["merged"] += 1
                continue

            new_id = backlog.add_candidate(
                target=target,
                title=cand["title"],
                problem=cand["problem"],
                sources=[cand["source"]],
                low_confidence=weak,
            )
            summary["new"] += 1
            if weak:
                summary["low_confidence"] += 1
            # Let later candidates in this same run match against the new item.
            known.append({"id": new_id, "title": cand["title"], "problem": cand["problem"]})

    return summary
@@ -0,0 +1,7 @@
1
+ """pm-dogfood deterministic helpers.
2
+
3
+ The pm-dogfood *skill* owns inference and judgment (infer the usage scenario from a
4
+ product's docs, decide what counts as a gap, confirm reproducibility). This package holds
5
+ the mechanical, unit-testable pieces it calls: the clean-room install runner, the UI and
6
+ MCP drivers, the report/parity builder, and the confirmed+deduped backlog filer.
7
+ """
@@ -0,0 +1,52 @@
1
+ """File confirmed dogfood gaps back to the opportunity backlog (deduped).
2
+
3
+ Closes the funnel loop: a reproducible doc-vs-reality gap becomes a new opportunity.
4
+ Only *confirmed* gaps are filed (the skill marks the ones that reproduced on re-run);
5
+ flaky/unconfirmed gaps stay report-only. Filing reuses the backlog's exact-key dedup, so
6
+ re-running pm-dogfood doesn't pile up duplicates. Each filed item carries a
7
+ ``source=dogfood`` provenance entry.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from typing import Iterable
13
+
14
+ from ..backlog import Backlog, make_dedup_key
15
+ from .report import Finding
16
+
17
+
18
def file_gaps(
    backlog: Backlog,
    target: str,
    gaps: Iterable[Finding],
    confirmed: set[str],
    *,
    source: str = "dogfood",
) -> dict:
    """File confirmed gaps as backlog opportunities; leave unconfirmed ones report-only.

    Args:
        backlog: Backlog to file into.
        target: Target the gaps belong to.
        gaps: Findings to consider.
        confirmed: Titles of the gaps that reproduced; any other gap is skipped.
        source: Provenance type recorded on each filed item.

    Returns:
        A dict with three lists: ``filed`` and ``deduped`` hold the ids returned by
        ``add_candidate`` (the backlog grew / did not grow, respectively), and
        ``skipped`` holds the titles of unconfirmed gaps.
    """
    result: dict = {"filed": [], "deduped": [], "skipped": []}
    for gap in gaps:
        if gap.title not in confirmed:
            result["skipped"].append(gap.title)
            continue

        size_before = len(backlog.list())
        opp_id = backlog.add_candidate(
            target=target,
            title=f"[dogfood] {gap.title}",
            problem=f"{gap.claim} -> observed: {gap.observed}".strip(),
            # Key on the gap's title plus its observed output rather than the claim:
            # claim text is constant per interface, so keying on it would collapse
            # distinct gaps into one.
            dedup_key=make_dedup_key(target, f"{gap.title} {gap.observed}"),
            sources=[{"type": source, "url": "", "excerpt": (gap.observed or "")[:200]}],
        )
        # An unchanged backlog size is taken to mean the item was deduplicated.
        grew = len(backlog.list()) > size_before
        result["filed" if grew else "deduped"].append(opp_id)
    return result