pmkit 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,111 @@
1
+ """Clean-room install runner.
2
+
3
+ Runs a product's *documented* install/run commands verbatim in a throwaway working
4
+ directory and reports what actually happens. A failed documented step is a gap (a
5
+ doc-vs-reality finding), never a crash — the run continues so later independent steps
6
+ still get exercised. Commands are expected to be self-contained (e.g. `uvx <pkg> ...`,
7
+ `uv run --no-project --with <pkg> ...`), which create their own ephemeral environments.
8
+
9
+ SECURITY: these command strings are executed verbatim through a shell (`shell=True`) with
10
+ the operator's local privileges — running them IS the point, so they must come from a
11
+ trusted, operator-reviewed source, not directly from untrusted/auto-scraped product docs
12
+ without a human in the loop. To limit blast radius, the run gets a minimal allow-listed
13
+ environment (no inherited API keys/secrets); callers opt specific vars back in via `env`.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import os
19
+ import shutil
20
+ import subprocess
21
+ import tempfile
22
+ from dataclasses import asdict, dataclass, field
23
+ from typing import Optional
24
+
25
# Allow-list of environment variables forwarded into the throwaway run. Anything not
# named here (notably the operator's API keys and secrets) is withheld from the
# third-party documented commands.
_SAFE_ENV_KEYS = {
    "PATH", "PATHEXT", "SYSTEMROOT", "SYSTEMDRIVE", "COMSPEC", "WINDIR",
    "TEMP", "TMP", "TMPDIR", "HOME", "HOMEDRIVE", "HOMEPATH", "USERPROFILE",
    "LANG", "LC_ALL", "LC_CTYPE", "PYTHONUTF8",
}


def _clean_env(extra: Optional[dict]) -> dict:
    """Build the environment mapping handed to a documented command.

    Starts from the allow-listed subset of ``os.environ`` (names are matched
    case-insensitively by upper-casing them) and then layers ``extra`` on top, so a
    caller-supplied value wins over an inherited one.

    Args:
        extra: Variables the caller explicitly opts back in; ``None`` or empty adds nothing.

    Returns:
        A new dict; ``os.environ`` itself is never modified.
    """
    allowed = {}
    for name, value in os.environ.items():
        if name.upper() in _SAFE_ENV_KEYS:
            allowed[name] = value
    if extra:
        allowed.update(extra)
    return allowed
38
+
39
+
40
@dataclass
class StepResult:
    """Outcome of running one documented command.

    Field meanings below describe how ``_run_one`` in this module populates them.
    """

    command: str  # the command string exactly as it was run
    ok: bool  # True when the process exited with status 0
    exit_code: Optional[int]  # process return code; None on timeout or when the command could not be run
    output: str  # captured output text (stdout followed by stderr, tail-truncated by the runner)
    gap: bool  # True when the step counts as a doc-vs-reality gap
    reason: str = ""  # short failure description; empty on success

    def to_dict(self) -> dict:
        """Return the fields as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(self)
51
+
52
+
53
@dataclass
class InstallReport:
    """Ordered per-command results of one documented-install run."""

    steps: list[StepResult] = field(default_factory=list)

    @property
    def all_ok(self) -> bool:
        """True only when at least one step ran and every step succeeded.

        An empty report is deliberately *not* OK: nothing was exercised.
        """
        if not self.steps:
            return False
        return all(step.ok for step in self.steps)

    @property
    def gaps(self) -> list[StepResult]:
        """The steps flagged as gaps, in run order."""
        flagged = []
        for step in self.steps:
            if step.gap:
                flagged.append(step)
        return flagged

    def to_dict(self) -> dict:
        """Return a plain-dict summary: the overall verdict plus every step as a dict."""
        verdict = self.all_ok
        serialized = [step.to_dict() for step in self.steps]
        return {"all_ok": verdict, "steps": serialized}
67
+
68
+
69
def run_documented_install(
    commands: list[str],
    *,
    workdir: Optional[str] = None,
    timeout: float = 120.0,
    env: Optional[dict] = None,
) -> InstallReport:
    """Run each documented command in a throwaway dir; return a per-step report.

    A non-zero exit, timeout, or spawn failure marks that step as a gap and the run
    proceeds to the next command rather than raising.

    Args:
        commands: Command strings to run, in order. A single bare string is treated as
            one command rather than being iterated character by character.
        workdir: Directory to run in. When omitted (``None`` or empty), a temporary
            directory is created for the run and removed afterwards; a caller-supplied
            directory is never deleted.
        timeout: Per-command timeout in seconds.
        env: Extra environment variables layered on top of the allow-listed base.

    Returns:
        An ``InstallReport`` with one ``StepResult`` per command.
    """
    if isinstance(commands, str):
        # Iterating a str would execute every character as its own shell command.
        commands = [commands]
    report = InstallReport()
    # Own (and later delete) the directory exactly when we are the ones creating it.
    # Deriving this from `workdir is None` leaked the temp dir for workdir="".
    cleanup = not workdir
    if cleanup:
        workdir = tempfile.mkdtemp(prefix="pmkit-dogfood-")
    run_env = _clean_env(env)
    try:
        for cmd in commands:
            report.steps.append(_run_one(cmd, workdir, timeout, run_env))
    finally:
        if cleanup:
            shutil.rmtree(workdir, ignore_errors=True)
    return report
92
+
93
+
94
def _as_text(chunk) -> str:
    """Coerce captured process output (``str``, ``bytes`` or ``None``) to ``str``."""
    if chunk is None:
        return ""
    if isinstance(chunk, bytes):
        return chunk.decode("utf-8", "replace")
    return chunk


def _run_one(cmd: str, cwd: str, timeout: float, env: dict) -> StepResult:
    """Run one command through the shell and describe what happened; never raises.

    Args:
        cmd: Command string, executed verbatim with ``shell=True``.
        cwd: Working directory for the command.
        timeout: Seconds to wait before the command is abandoned.
        env: Complete environment mapping for the child process.

    Returns:
        A ``StepResult``. Exit status 0 is a pass; a non-zero exit, a timeout, or a
        failure to spawn is reported as a gap. Output is the last 4000 characters of
        stdout followed by stderr.
    """
    try:
        p = subprocess.run(
            cmd, shell=True, cwd=cwd, env=env,
            capture_output=True, text=True, timeout=timeout,
            # Undecodable output must not turn a command that did run into a
            # "could not run" report via UnicodeDecodeError.
            errors="replace",
        )
    except subprocess.TimeoutExpired as exc:
        # TimeoutExpired carries bytes whenever output was captured, even with
        # text=True, and either stream may be None. Normalize each stream on its own:
        # concatenating first raised TypeError (bytes + str) and escaped this handler.
        partial = _as_text(exc.stdout) + _as_text(exc.stderr)
        return StepResult(cmd, False, None, partial[-4000:], gap=True,
                          reason=f"timeout after {timeout}s")
    except Exception as e:  # spawn failure (bad command, missing interpreter, ...)
        return StepResult(cmd, False, None, str(e), gap=True, reason=f"could not run: {e}")
    output = (_as_text(p.stdout) + _as_text(p.stderr))[-4000:]
    ok = p.returncode == 0
    return StepResult(cmd, ok, p.returncode, output, gap=not ok,
                      reason="" if ok else f"exit {p.returncode}")
pmkit/dogfood/mcp.py ADDED
@@ -0,0 +1,73 @@
1
+ """Agent/MCP driver — connects to an MCP server as a real client (FastMCP `Client`).
2
+
3
+ The call-plan validation is pure and unit-tested. The live handshake lazily imports the
4
+ FastMCP client and is gated on availability (FastMCP is an optional `pmkit[dogfood]` dep,
5
+ not a core one), so the suite runs without it. The driver launches the documented server
6
+ command over stdio and calls the documented tools — exercising the real wire, not the engine.
7
+
8
+ SECURITY: the server launch command is executed verbatim with the operator's local
9
+ privileges (same trust assumption as the install runner) — it must come from a trusted,
10
+ operator-reviewed source, not an untrusted/auto-scraped doc without a human in the loop.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import asyncio
16
+ from typing import Any
17
+
18
+
19
def plan_calls(calls: list[dict]) -> list[dict]:
    """Validate + normalize tool calls into a plan. Pure.

    Args:
        calls: One mapping per call with a non-empty ``tool`` and an optional ``args``
            mapping (defaults to ``{}``). Any other keys are dropped.

    Returns:
        A new list of ``{"tool": ..., "args": {...}}`` dicts, in input order. Each
        ``args`` is a shallow copy, so later edits to the caller's dict do not leak
        into the plan.

    Raises:
        ValueError: If an entry is not a mapping, lacks ``tool``, or has non-mapping ``args``.
    """
    plan: list[dict] = []
    for i, c in enumerate(calls):
        if not isinstance(c, dict):
            # Previously surfaced as AttributeError from `.get` instead of a
            # validation error naming the offending entry.
            raise ValueError(f"call {i}: must be an object")
        tool = c.get("tool")
        if not tool:
            raise ValueError(f"call {i}: missing 'tool'")
        args = c.get("args", {})
        if not isinstance(args, dict):
            raise ValueError(f"call {i}: 'args' must be an object")
        plan.append({"tool": tool, "args": dict(args)})
    return plan
31
+
32
+
33
def mcp_client_available() -> bool:
    """Report whether the optional FastMCP client can be imported.

    Any exception raised by the import (not only ``ImportError``) counts as unavailable.
    """
    try:
        from fastmcp import Client  # noqa: F401
    except Exception:
        available = False
    else:
        available = True
    return available
39
+
40
+
41
def drive_mcp(server_cmd: list[str], calls: list[dict], *, timeout: float = 30.0) -> list[dict]:
    """Launch the documented server over stdio and call its tools. Raises if FastMCP absent.

    Args:
        server_cmd: Launch command as ``[executable, *args]``.
        calls: Tool calls to make, validated by ``plan_calls``.
        timeout: Seconds allowed for the whole session (connect plus all calls).

    Returns:
        One observation dict per attempted call, plus a ``connect`` failure entry when
        the session times out or cannot be established.

    Raises:
        ValueError: If ``server_cmd`` is empty or a bare string, or the call plan is invalid.
        RuntimeError: If the FastMCP client is not installed.
    """
    # Reject an unusable command up front. An empty list used to surface as an opaque
    # IndexError from inside the event loop, and a bare string was silently split into
    # characters (first character as the executable).
    if isinstance(server_cmd, (str, bytes)) or not server_cmd:
        raise ValueError("server_cmd must be a non-empty list: [executable, *args]")
    plan = plan_calls(calls)
    if not mcp_client_available():
        raise RuntimeError("FastMCP client not available — install pmkit[dogfood] (fastmcp)")
    return asyncio.run(_drive_async(server_cmd, plan, timeout))
47
+
48
+
49
async def _drive_async(server_cmd: list[str], plan: list[dict], timeout: float) -> list[dict]:
    """Run the call plan against a stdio-launched server and collect observations.

    Args:
        server_cmd: Launch command; element 0 is the executable, the rest are its args.
        plan: Normalized calls, each a dict with ``tool`` and ``args``.
        timeout: Seconds allowed for the whole session (connect plus every call).

    Returns:
        A list of observation dicts with ``step``, ``ok`` and ``observed`` keys.
    """
    # Imported here so the module can be imported without the optional FastMCP dependency.
    from fastmcp import Client
    from fastmcp.client.transports import StdioTransport

    obs: list[dict] = []
    transport = StdioTransport(command=server_cmd[0], args=list(server_cmd[1:]))

    async def _run() -> None:
        async with Client(transport) as client:
            for call in plan:
                try:
                    res = await client.call_tool(call["tool"], call["args"])
                    # Prefer the result's `data` attribute when it has one; cap the text at 500 chars.
                    obs.append({"step": f"{call['tool']}({call['args']})", "ok": True,
                                "observed": str(getattr(res, "data", res))[:500]})
                except Exception as e:
                    # A failing tool call is recorded and the remaining calls still run.
                    obs.append({"step": call["tool"], "ok": False,
                                "observed": f"{type(e).__name__}: {e}"})

    try:
        # The timeout wraps the entire session, so it can also fire mid-plan; observations
        # already appended to `obs` are kept and a "connect" failure entry is added after them.
        await asyncio.wait_for(_run(), timeout)
    except asyncio.TimeoutError:
        obs.append({"step": "connect", "ok": False, "observed": f"timeout after {timeout}s"})
    except Exception as e:
        # Anything escaping the session (e.g. the server failing to start) becomes an
        # observation rather than propagating to the caller.
        obs.append({"step": "connect", "ok": False, "observed": f"{type(e).__name__}: {e}"})
    return obs
@@ -0,0 +1,157 @@
1
+ """Dogfood findings model, parity check, and report rendering.
2
+
3
+ Pure functions over the drivers' result shapes. An observation is a dict:
4
+ ``{"step": str, "ok": bool, "observed": Any, "claim": str (optional)}``. The report
5
+ normalizes install + UI + MCP observations into per-interface pass/fail findings, adds
6
+ parity findings (UI vs MCP must agree), and renders agent/human-readable markdown.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from dataclasses import asdict, dataclass, field
12
+ from typing import Any, Optional
13
+
14
+
15
@dataclass
class Finding:
    """One pass/fail finding about a single interface of the product under test."""

    interface: str  # install | ui | mcp | parity
    title: str  # short label for the step or check
    status: str  # pass | fail
    gap: bool  # True when this finding is a gap to report
    claim: str = ""  # what was expected to hold (may be empty)
    observed: str = ""  # what was actually seen (may be empty)

    def to_dict(self) -> dict:
        """Return the fields as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(self)
26
+
27
+
28
@dataclass
class DogfoodReport:
    """All findings gathered for one target, with summary helpers."""

    target: str
    findings: list[Finding] = field(default_factory=list)

    @property
    def gaps(self) -> list[Finding]:
        """The findings flagged as gaps, in their original order."""
        flagged = []
        for finding in self.findings:
            if finding.gap:
                flagged.append(finding)
        return flagged

    def passed(self) -> bool:
        """True when no finding is a gap (an empty report passes)."""
        return len(self.gaps) == 0

    def per_interface(self) -> dict[str, dict[str, int]]:
        """Tally pass/fail counts per interface; any status other than 'pass' counts as fail."""
        tally: dict[str, dict[str, int]] = {}
        for finding in self.findings:
            if finding.interface not in tally:
                tally[finding.interface] = {"pass": 0, "fail": 0}
            bucket = "pass" if finding.status == "pass" else "fail"
            tally[finding.interface][bucket] += 1
        return tally

    def to_dict(self) -> dict:
        """Return a plain-dict form of the report (target, verdict, tallies, findings)."""
        payload = {"target": self.target}
        payload["passed"] = self.passed()
        payload["per_interface"] = self.per_interface()
        payload["findings"] = [finding.to_dict() for finding in self.findings]
        return payload
54
+
55
+
56
def from_install(install_report) -> list[Finding]:
    """Convert an install report's steps into ``install`` findings, one per step.

    The ``observed`` text is the step's reason when it has one; otherwise ``"ok"`` for
    a passing step, or the first 200 characters of its output for a failing one.
    """

    def _observed(step) -> str:
        if step.reason:
            return step.reason
        if step.ok:
            return "ok"
        return (step.output or "")[:200]

    return [
        Finding(
            interface="install",
            title=step.command,
            status="pass" if step.ok else "fail",
            gap=step.gap,
            claim="documented install step succeeds",
            observed=_observed(step),
        )
        for step in install_report.steps
    ]
68
+
69
+
70
def _observations(interface: str, obs: Optional[list[dict]]) -> list[Finding]:
    """Turn driver observation dicts into findings for ``interface``.

    An observation without an ``ok`` key is treated as passing; ``step``, ``claim``
    and ``observed`` default to the empty string and are stringified.
    """
    if not obs:
        return []
    findings: list[Finding] = []
    for entry in obs:
        passed = entry.get("ok", True)
        status = "pass" if passed else "fail"
        findings.append(Finding(
            interface=interface,
            title=str(entry.get("step", "")),
            status=status,
            gap=not passed,
            claim=str(entry.get("claim", "")),
            observed=str(entry.get("observed", "")),
        ))
    return findings
83
+
84
+
85
def parity_check(ui_state: dict, mcp_state: dict) -> list[Finding]:
    """Compare the two surfaces' end states on the keys they share.

    Outcomes:
    - both states empty: no findings (there was nothing to compare);
    - no shared keys but at least one state non-empty: a single "not checkable" gap,
      so disjoint states never read as a clean pass;
    - shared keys that all match (values compared by their ``str()`` form): one pass;
    - otherwise: one gap per disagreeing key, ordered by the key's string form.
    """
    common = set(ui_state).intersection(mcp_state)

    if not common:
        if not (ui_state or mcp_state):
            return []
        return [Finding(
            interface="parity",
            title="parity not checkable: surfaces share no state keys",
            status="fail",
            gap=True,
            claim="UI and MCP surfaces expose comparable state",
            observed=f"ui keys={sorted(map(str, ui_state))} mcp keys={sorted(map(str, mcp_state))}",
        )]

    mismatches: list[Finding] = []
    for key in sorted(common, key=str):
        ui_value = ui_state[key]
        mcp_value = mcp_state[key]
        if str(ui_value) == str(mcp_value):
            continue
        mismatches.append(Finding(
            interface="parity",
            title=f"surfaces disagree on {key!r}",
            status="fail",
            gap=True,
            claim="UI and MCP surfaces agree (parity)",
            observed=f"ui={ui_value!r} mcp={mcp_value!r}",
        ))

    if mismatches:
        return mismatches
    return [Finding("parity", "UI/MCP parity holds", "pass", gap=False,
                    claim="UI and MCP surfaces agree", observed=f"{len(common)} keys match")]
118
+
119
+
120
def build_report(
    target: str,
    *,
    install=None,
    ui: Optional[list[dict]] = None,
    mcp: Optional[list[dict]] = None,
    ui_state: Optional[dict] = None,
    mcp_state: Optional[dict] = None,
) -> DogfoodReport:
    """Assemble a ``DogfoodReport`` from whichever driver results were supplied.

    Findings are ordered install, then UI, then MCP, then parity. The parity check
    runs only when both end states are given (an empty dict still counts as given).
    """
    collected: list[Finding] = []
    if install is not None:
        collected.extend(from_install(install))
    for interface, observations in (("ui", ui), ("mcp", mcp)):
        collected.extend(_observations(interface, observations))
    have_both_states = ui_state is not None and mcp_state is not None
    if have_both_states:
        collected.extend(parity_check(ui_state, mcp_state))
    return DogfoodReport(target, collected)
137
+
138
+
139
def render_markdown(report: DogfoodReport) -> str:
    """Render the report as markdown: overall result, per-interface tallies, then gaps.

    Each gap is a bullet with optional ``claim`` / ``observed`` sub-bullets; an empty
    gap list renders as ``- none``. Lines are joined with ``\\n`` and there is no
    trailing newline.
    """
    title = f"# Dogfood report: {report.target}"
    verdict = "PASS" if report.passed() else "GAPS FOUND"
    out = [title, "", f"**Result:** {verdict}", "", "## Per-interface"]
    out.extend(
        f"- {iface}: {c['pass']} pass, {c['fail']} fail"
        for iface, c in report.per_interface().items()
    )

    gaps = report.gaps
    out += ["", f"## Gaps ({len(gaps)})"]
    if not gaps:
        out.append("- none")
    for g in gaps:
        out.append(f"- [{g.interface}] {g.title}")
        if g.claim:
            out.append(f" - claim: {g.claim}")
        if g.observed:
            out.append(f" - observed: {g.observed}")
    return "\n".join(out)
@@ -0,0 +1,32 @@
1
+ """Sample-app synthesis.
2
+
3
+ When a product's documented scenario needs an app to act on (streamlit-mcp serves a
4
+ Streamlit app), pm-dogfood synthesizes a minimal representative one rather than requiring
5
+ the operator to supply it. Kept tiny and deterministic so the same scenario reruns cleanly.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ SAMPLE_STREAMLIT_APP = '''import streamlit as st
11
+
12
+ if "saves" not in st.session_state:
13
+ st.session_state.saves = 0
14
+
15
+ name = st.text_input("Name", value="world", key="name")
16
+ if st.button("Save", key="save"):
17
+ st.session_state.saves += 1
18
+
19
+ st.markdown(f"Hello, {name}!")
20
+ st.markdown(f"saves = {st.session_state.saves}")
21
+ '''
22
+
23
+
24
def sample_streamlit_source() -> str:
    """Return the sample Streamlit app's source text (the ``SAMPLE_STREAMLIT_APP`` constant)."""
    return SAMPLE_STREAMLIT_APP
26
+
27
+
28
def synth_streamlit_app(path: str) -> str:
    """Write the sample Streamlit app to ``path`` and return the path.

    The file is opened in ``"w"`` mode with UTF-8 encoding, so an existing file at
    ``path`` is overwritten. The parent directory is not created here.
    """
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(SAMPLE_STREAMLIT_APP)
    return path
pmkit/dogfood/ui.py ADDED
@@ -0,0 +1,106 @@
1
+ """Human/UI driver — drives a rendered app in a real browser via Playwright.
2
+
3
+ The step-translation (scenario -> action plan) is pure and unit-tested. The live browser
4
+ pass lazily imports Playwright and is gated on availability, so pmkit stays stdlib-only and
5
+ the test suite runs without a browser. Live selectors are best-effort for Streamlit (label
6
+ + role) and are validated by the integration run (U6), not unit tests.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Any, Optional
12
+
13
SUPPORTED_ACTIONS = ("set", "click", "read")


def translate_steps(steps: list[dict]) -> list[dict]:
    """Validate + normalize inferred scenario steps into a UI action plan. Pure.

    Args:
        steps: One mapping per step with an ``action`` from ``SUPPORTED_ACTIONS``.
            ``set`` and ``click`` need a non-empty ``target``; ``set`` also needs a
            ``value`` key (any value, including an empty string). ``read`` may omit
            ``target`` to mean the whole page.

    Returns:
        A new list of ``{"action", "target", "value"}`` dicts, in input order; missing
        ``target`` / ``value`` become ``None``.

    Raises:
        ValueError: If a step is not a mapping, names an unknown action, or lacks a
            required target/value.
    """
    plan: list[dict] = []
    for i, s in enumerate(steps):
        if not isinstance(s, dict):
            # Previously surfaced as AttributeError from `.get` instead of a
            # validation error naming the offending step.
            raise ValueError(f"step {i}: must be an object")
        action = s.get("action")
        if action not in SUPPORTED_ACTIONS:
            raise ValueError(f"step {i}: unknown action {action!r} (expected {SUPPORTED_ACTIONS})")
        target = s.get("target")
        if action in ("set", "click") and not target:
            raise ValueError(f"step {i}: '{action}' needs a target")
        if action == "set" and "value" not in s:
            raise ValueError(f"step {i}: 'set' needs a value")
        plan.append({"action": action, "target": target, "value": s.get("value")})
    return plan
30
+
31
+
32
def playwright_available() -> bool:
    """Report whether Playwright's sync API can be imported."""
    # The sync API is imported rather than just the top-level package: it pulls in
    # greenlet's compiled extension, so a shallow `import playwright` can succeed while
    # the browser runtime cannot actually start (e.g. a missing VC++ runtime DLL on
    # Windows). Importing sync_playwright makes the gate reflect launchability.
    try:
        from playwright.sync_api import sync_playwright  # noqa: F401
    except Exception:
        importable = False
    else:
        importable = True
    return importable
42
+
43
+
44
def drive_ui(url: str, steps: list[dict], *, timeout_ms: int = 10000) -> list[dict]:
    """Drive the app's rendered UI in a real browser. Raises if Playwright is absent.

    Args:
        url: Address of the running app to open.
        steps: Scenario steps, validated by ``translate_steps`` before any browser work.
        timeout_ms: Timeout passed to the initial page navigation (``page.goto``).

    Returns:
        One observation dict per executed step. If navigation fails, the list holds a
        single failed ``goto`` observation and no steps are run.

    Raises:
        ValueError: If ``steps`` fails validation.
        RuntimeError: If Playwright's sync API cannot be imported.
    """
    plan = translate_steps(steps)
    if not playwright_available():
        raise RuntimeError("Playwright not installed — run `pip install 'pmkit[dogfood]'` "
                           "then `playwright install chromium`")
    # Imported here so the module can be imported without the optional Playwright dependency.
    from playwright.sync_api import sync_playwright

    obs: list[dict] = []
    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            page = browser.new_page()
            try:
                page.goto(url, timeout=timeout_ms)
            except Exception as e:
                # a failed connect is a gap, not a crash (and must not skip browser.close)
                obs.append({"step": f"goto {url}", "ok": False,
                            "observed": f"{type(e).__name__}: {e}"})
                return obs
            for step in plan:
                obs.append(_run_step(page, step))
        finally:
            # Runs on the early return above as well as on normal completion.
            browser.close()
    return obs
69
+
70
+
71
def _settle(page: Any) -> None:
    """Give a reactive frontend (e.g. Streamlit) time to finish its rerun before a read.

    Streamlit reruns asynchronously over a websocket: the interaction call returns at
    once but the updated DOM is painted slightly later, so an immediate read would see
    the pre-rerun page. This waits for the network to go idle — best-effort, because
    the websocket may never fully quiesce — and then allows a short fixed paint budget.
    Cheap and generic across frameworks.
    """
    idle_budget_ms = 3000
    paint_budget_ms = 400
    try:
        page.wait_for_load_state("networkidle", timeout=idle_budget_ms)
    except Exception:
        # Never reaching network-idle is expected here; fall through to the paint wait.
        pass
    page.wait_for_timeout(paint_budget_ms)
84
+
85
+
86
def _run_step(page: Any, step: dict) -> dict:
    """Execute one planned UI action and return its observation; action errors never raise.

    Args:
        page: The browser page to act on.
        step: A plan entry with ``action``, ``target`` and ``value`` keys.

    Returns:
        A dict with ``step``, ``ok`` and ``observed``. A failing action yields
        ``ok=False`` with the exception's type and message as ``observed``.
    """
    action = step["action"]
    target = step["target"]
    value = step["value"]
    try:
        if action == "click":
            button = page.get_by_role("button", name=target)
            button.click()
            _settle(page)
            return {"step": f"click {target}", "ok": True, "observed": "clicked"}

        if action == "set":
            field_loc = page.get_by_label(target)
            field_loc.fill(str(value))
            # Streamlit (and many reactive inputs) only commit a typed value on
            # Enter/blur, not on a programmatic fill — without this the rerun never
            # fires and the new value is silently dropped.
            field_loc.press("Enter")
            _settle(page)
            return {"step": f"set {target}={value}", "ok": True, "observed": "set"}

        # read: a specific element when a target is given, otherwise the whole page body
        if target:
            text = page.get_by_text(target).inner_text()
        else:
            text = page.inner_text("body")
        return {"step": f"read {target or 'page'}", "ok": True, "observed": text[:500]}
    except Exception as e:
        return {"step": f"{action} {target}", "ok": False, "observed": f"{type(e).__name__}: {e}"}
pmkit/killtest.py ADDED
@@ -0,0 +1,31 @@
1
+ """The kill-test survival rule — the single, tested source of truth.
2
+
3
+ A blunt "prune on >= N refutes" majority let a vendor-already-shipped idea survive on a
4
+ 2-of-4 split (the Dash->MCP case). The fix: the **already-solved** axis is *dispositive* —
5
+ a confident already-solved refutation prunes a candidate regardless of how the other axes
6
+ voted (if the thing already exists, nothing else matters). Otherwise the strict majority
7
+ applies. The pm-run orchestrator calls this via `pmkit backlog killtest --decide` rather
8
+ than re-implementing the rule, so there's no JS/Python drift.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
SOLVED_AXIS = "already-solved"
SOLVED_CONFIDENCE = 0.7  # an already-solved refute at/above this is a hard kill


def decide_survival(verdicts: list[dict], prune_at: int = 3) -> tuple[bool, str]:
    """Return (survived, reason) for a candidate given its per-axis kill-test verdicts.

    Each verdict is a dict with at least ``verdict`` ('refute'|'survive') and ``axis``;
    ``confidence`` (0..1) is optional and defaults to 1.0 — both when the key is
    missing and when it is present but ``None``.

    Rule: a refute on the already-solved axis at or above ``SOLVED_CONFIDENCE`` prunes
    the candidate regardless of the other axes. Otherwise the candidate is pruned only
    when the number of refutes reaches ``prune_at``.

    Args:
        verdicts: Per-axis verdict dicts.
        prune_at: Number of refutes at which the majority rule prunes.

    Returns:
        ``(False, reason)`` when pruned, ``(True, reason)`` when the candidate survives.
        The dispositive-prune reason is capped at 200 characters.
    """
    refutes = [v for v in verdicts if v.get("verdict") == "refute"]
    for v in refutes:
        if v.get("axis") != SOLVED_AXIS:
            continue
        confidence = v.get("confidence")
        if confidence is None:
            # An explicit null means "not provided"; float(None) used to raise TypeError.
            confidence = 1.0
        if float(confidence) >= SOLVED_CONFIDENCE:
            # str() keeps a non-string reason from crashing on .strip().
            reason = str(v.get("reason") or "").strip()
            return False, f"pruned: already-solved is dispositive ({reason})"[:200]
    n = len(refutes)
    if n >= prune_at:
        return False, f"pruned: majority refute ({n}/{len(verdicts)})"
    return True, f"survived ({n} refute(s); need {prune_at}, no dispositive already-solved)"
@@ -0,0 +1,15 @@
1
+ """pm-launch — the funnel's launch/amplify stage (deterministic core).
2
+
3
+ This package holds the *logistics* layer of launching a shipped product: the launch-state
4
+ ledger and mod-policy cache (``store``), moderator-policy verdicts (``policy``), the
5
+ listen/feedback loop (``listen``), the emit-only launch plan (``plan``), Tier-A collateral
6
+ capture (``collateral``), and draft-starting-point storage (``drafts``).
7
+
8
+ Hard boundaries, enforced here rather than left to convention:
9
+ - Nothing in this package posts to any channel — ever (the human gate lives in the skill).
10
+ - Drafts are *starting-points*; there is no "final"/"postable" state in the data model.
11
+ - The launch plan is emit-only: it renders an artifact and creates no cron side-effects.
12
+
13
+ Judgment (which channels, reading prose rules, writing copy, slop critique) lives in the
14
+ ``agents/pm-launch-*`` personas; this package is the rule/ledger half.
15
+ """