pmkit 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pmkit/__init__.py +8 -0
- pmkit/backlog.py +409 -0
- pmkit/cli.py +723 -0
- pmkit/connectors/__init__.py +35 -0
- pmkit/connectors/base.py +67 -0
- pmkit/connectors/changelog.py +37 -0
- pmkit/connectors/github.py +49 -0
- pmkit/connectors/hn.py +42 -0
- pmkit/connectors/reddit.py +42 -0
- pmkit/connectors/web.py +44 -0
- pmkit/connectors/x.py +50 -0
- pmkit/dedup.py +64 -0
- pmkit/discover.py +83 -0
- pmkit/dogfood/__init__.py +7 -0
- pmkit/dogfood/file_gaps.py +52 -0
- pmkit/dogfood/install.py +111 -0
- pmkit/dogfood/mcp.py +73 -0
- pmkit/dogfood/report.py +157 -0
- pmkit/dogfood/sample.py +32 -0
- pmkit/dogfood/ui.py +106 -0
- pmkit/killtest.py +31 -0
- pmkit/launch/__init__.py +15 -0
- pmkit/launch/collateral.py +159 -0
- pmkit/launch/drafts.py +53 -0
- pmkit/launch/listen.py +88 -0
- pmkit/launch/plan.py +82 -0
- pmkit/launch/policy.py +153 -0
- pmkit/launch/store.py +260 -0
- pmkit/rice.py +54 -0
- pmkit-0.1.1.dist-info/METADATA +29 -0
- pmkit-0.1.1.dist-info/RECORD +33 -0
- pmkit-0.1.1.dist-info/WHEEL +4 -0
- pmkit-0.1.1.dist-info/entry_points.txt +2 -0
pmkit/dogfood/install.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""Clean-room install runner.
|
|
2
|
+
|
|
3
|
+
Runs a product's *documented* install/run commands verbatim in a throwaway working
|
|
4
|
+
directory and reports what actually happens. A failed documented step is a gap (a
|
|
5
|
+
doc-vs-reality finding), never a crash — the run continues so later independent steps
|
|
6
|
+
still get exercised. Commands are expected to be self-contained (e.g. `uvx <pkg> ...`,
|
|
7
|
+
`uv run --no-project --with <pkg> ...`), which create their own ephemeral environments.
|
|
8
|
+
|
|
9
|
+
SECURITY: these command strings are executed verbatim through a shell (`shell=True`) with
|
|
10
|
+
the operator's local privileges — running them IS the point, so they must come from a
|
|
11
|
+
trusted, operator-reviewed source, not directly from untrusted/auto-scraped product docs
|
|
12
|
+
without a human in the loop. To limit blast radius, the run gets a minimal allow-listed
|
|
13
|
+
environment (no inherited API keys/secrets); callers opt specific vars back in via `env`.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import os
|
|
19
|
+
import shutil
|
|
20
|
+
import subprocess
|
|
21
|
+
import tempfile
|
|
22
|
+
from dataclasses import asdict, dataclass, field
|
|
23
|
+
from typing import Optional
|
|
24
|
+
|
|
25
|
+
# Only these env vars are passed into the throwaway run; everything else (the operator's
|
|
26
|
+
# API keys and secrets) is withheld from the third-party documented commands.
|
|
27
|
+
_SAFE_ENV_KEYS = {
    # executable / shell lookup
    "PATH", "PATHEXT", "COMSPEC",
    # Windows system locations
    "SYSTEMROOT", "SYSTEMDRIVE", "WINDIR",
    # scratch space
    "TEMP", "TMP", "TMPDIR",
    # home directory
    "HOME", "HOMEDRIVE", "HOMEPATH", "USERPROFILE",
    # locale / text encoding
    "LANG", "LC_ALL", "LC_CTYPE", "PYTHONUTF8",
}


def _clean_env(extra: Optional[dict]) -> dict:
    """Build the environment handed to the documented commands.

    Only variables whose upper-cased name is in ``_SAFE_ENV_KEYS`` are copied
    from ``os.environ``; ``extra`` (if given) is then layered on top, so the
    caller can opt specific variables back in or override an inherited one.
    """
    allowed: dict = {}
    for name, value in os.environ.items():
        # Compare upper-cased so differently-cased names still match the set.
        if name.upper() in _SAFE_ENV_KEYS:
            allowed[name] = value
    if extra:
        allowed.update(extra)
    return allowed
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class StepResult:
    """Outcome of running a single documented command."""

    command: str              # the command string that was run
    ok: bool                  # True when the command exited with status 0
    exit_code: Optional[int]  # None when no exit status was obtained (timeout / spawn failure)
    output: str               # captured output text kept for the report
    gap: bool                 # True when this step counts as a doc-vs-reality gap
    reason: str = ""          # short explanation for a failed step; empty on success

    def to_dict(self) -> dict:
        """Return the fields as a plain ``dict`` (via ``dataclasses.asdict``)."""
        as_mapping = asdict(self)
        return as_mapping
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass
class InstallReport:
    """Ordered collection of per-command results for one install run."""

    steps: list[StepResult] = field(default_factory=list)

    @property
    def all_ok(self) -> bool:
        """True only when at least one step ran and every step is ok."""
        if not self.steps:
            # An empty run is not a success: nothing was exercised.
            return False
        for step in self.steps:
            if not step.ok:
                return False
        return True

    @property
    def gaps(self) -> list[StepResult]:
        """The steps flagged as gaps, in run order."""
        flagged = []
        for step in self.steps:
            if step.gap:
                flagged.append(step)
        return flagged

    def to_dict(self) -> dict:
        """Return ``{"all_ok": ..., "steps": [...]}`` with each step as a dict."""
        serialized = [step.to_dict() for step in self.steps]
        return {"all_ok": self.all_ok, "steps": serialized}
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def run_documented_install(
    commands: list[str],
    *,
    workdir: Optional[str] = None,
    timeout: float = 120.0,
    env: Optional[dict] = None,
) -> InstallReport:
    """Run each documented command in a throwaway dir; return a per-step report.

    A non-zero exit, timeout, or spawn failure marks that step as a gap and the run
    proceeds to the next command rather than raising.

    Args:
        commands: Command strings to run, in order. A single bare string is
            treated as a one-command list.
        workdir: Directory to run in. When omitted (or empty) a temporary
            directory is created for the run and removed afterwards; a
            caller-supplied directory is never removed.
        timeout: Per-command timeout in seconds.
        env: Extra environment variables layered on top of the allow-listed
            environment.

    Returns:
        An ``InstallReport`` with one step result per command.
    """
    if isinstance(commands, str):
        # Iterating a bare string would run every single character as its own
        # shell command; treat it as the one command it obviously is.
        commands = [commands]
    report = InstallReport()
    # We own the directory exactly when we have to create it. Deciding this on
    # truthiness (not ``is None``) keeps an empty-string workdir from creating
    # a temp dir that is then never cleaned up.
    cleanup = not workdir
    if cleanup:
        workdir = tempfile.mkdtemp(prefix="pmkit-dogfood-")
    run_env = _clean_env(env)
    try:
        for cmd in commands:
            report.steps.append(_run_one(cmd, workdir, timeout, run_env))
    finally:
        if cleanup:
            # Best-effort removal: a leftover file must not mask the report.
            shutil.rmtree(workdir, ignore_errors=True)
    return report
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _run_one(cmd: str, cwd: str, timeout: float, env: dict) -> StepResult:
    """Run one command through the shell and describe what happened.

    Never raises for a failing command: a non-zero exit, a timeout, or a failure
    to spawn are all returned as a ``StepResult`` with ``gap=True``.

    Args:
        cmd: Command string, executed verbatim with ``shell=True``.
        cwd: Working directory for the command.
        timeout: Seconds to wait before the command is treated as timed out.
        env: Complete environment for the child process.

    Returns:
        A ``StepResult``; ``output`` holds at most the last 4000 characters of
        stdout followed by stderr.
    """

    def _as_text(stream) -> str:
        # TimeoutExpired carries raw bytes for captured output even when
        # text=True was requested, and None for a stream that produced
        # nothing. Normalise each stream on its own so a bytes/None mix can
        # never reach a str + bytes concatenation.
        if stream is None:
            return ""
        if isinstance(stream, bytes):
            return stream.decode("utf-8", "replace")
        return stream

    try:
        p = subprocess.run(
            cmd, shell=True, cwd=cwd, env=env,
            capture_output=True, text=True, timeout=timeout,
            # Undecodable bytes in the child's output must not turn a command
            # that actually ran into a "could not run" result.
            errors="replace",
        )
    except subprocess.TimeoutExpired as exc:
        partial = _as_text(exc.stdout) + _as_text(exc.stderr)
        return StepResult(cmd, False, None, partial[-4000:], gap=True,
                          reason=f"timeout after {timeout}s")
    except Exception as e:  # spawn failure (bad command, missing interpreter, ...)
        return StepResult(cmd, False, None, str(e), gap=True, reason=f"could not run: {e}")

    # Keep only the tail: the end of the output is where failures show up.
    output = ((p.stdout or "") + (p.stderr or ""))[-4000:]
    ok = p.returncode == 0
    return StepResult(cmd, ok, p.returncode, output, gap=not ok,
                      reason="" if ok else f"exit {p.returncode}")
|
pmkit/dogfood/mcp.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""Agent/MCP driver — connects to an MCP server as a real client (FastMCP `Client`).
|
|
2
|
+
|
|
3
|
+
The call-plan validation is pure and unit-tested. The live handshake lazily imports the
|
|
4
|
+
FastMCP client and is gated on availability (FastMCP is an optional `pmkit[dogfood]` dep,
|
|
5
|
+
not a core one), so the suite runs without it. The driver launches the documented server
|
|
6
|
+
command over stdio and calls the documented tools — exercising the real wire, not the engine.
|
|
7
|
+
|
|
8
|
+
SECURITY: the server launch command is executed verbatim with the operator's local
|
|
9
|
+
privileges (same trust assumption as the install runner) — it must come from a trusted,
|
|
10
|
+
operator-reviewed source, not an untrusted/auto-scraped doc without a human in the loop.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import asyncio
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def plan_calls(calls: list[dict]) -> list[dict]:
    """Validate + normalize tool calls into a plan. Pure.

    Args:
        calls: Entries shaped like ``{"tool": <name>, "args": {...}}``;
            ``args`` is optional and defaults to an empty dict.

    Returns:
        A new list of ``{"tool": ..., "args": ...}`` dicts. Each ``args`` is a
        copy, so editing the plan never touches the caller's input.

    Raises:
        ValueError: If an entry is not a mapping, has no ``tool``, or has an
            ``args`` that is not a dict. The message names the entry index.
    """
    from collections.abc import Mapping

    plan: list[dict] = []
    for i, c in enumerate(calls):
        if not isinstance(c, Mapping):
            # Report malformed entries with the same indexed ValueError as the
            # other validation failures instead of an AttributeError on .get.
            raise ValueError(f"call {i}: must be an object")
        tool = c.get("tool")
        if not tool:
            raise ValueError(f"call {i}: missing 'tool'")
        args = c.get("args", {})
        if not isinstance(args, dict):
            raise ValueError(f"call {i}: 'args' must be an object")
        plan.append({"tool": tool, "args": dict(args)})
    return plan
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def mcp_client_available() -> bool:
    """Report whether ``fastmcp.Client`` can be imported in this environment.

    Any exception raised by the import counts as "not available".
    """
    try:
        from fastmcp import Client  # noqa: F401
    except Exception:
        available = False
    else:
        available = True
    return available
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def drive_mcp(server_cmd: list[str], calls: list[dict], *, timeout: float = 30.0) -> list[dict]:
    """Launch the documented server over stdio and call its tools.

    The calls are validated first, so a malformed plan is rejected before the
    FastMCP availability check. Raises ``RuntimeError`` if FastMCP is absent.
    Returns the list of observation dicts produced by the async driver.
    """
    plan = plan_calls(calls)
    if mcp_client_available():
        return asyncio.run(_drive_async(server_cmd, plan, timeout))
    raise RuntimeError("FastMCP client not available — install pmkit[dogfood] (fastmcp)")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
async def _drive_async(server_cmd: list[str], plan: list[dict], timeout: float) -> list[dict]:
    """Connect to the server over stdio and run every planned tool call.

    ``server_cmd[0]`` is the executable and the remaining items are its
    arguments. Each call yields one observation dict; a failing call is recorded
    and the loop moves on. A timeout or any other error around the whole session
    is appended as a final ``"connect"`` observation instead of being raised.
    """
    from fastmcp import Client
    from fastmcp.client.transports import StdioTransport

    observations: list[dict] = []
    transport = StdioTransport(command=server_cmd[0], args=list(server_cmd[1:]))

    async def _call_all() -> None:
        async with Client(transport) as client:
            for entry in plan:
                try:
                    result = await client.call_tool(entry["tool"], entry["args"])
                    # Prefer the result's ``data`` attribute when it has one.
                    seen = str(getattr(result, "data", result))[:500]
                    record = {"step": f"{entry['tool']}({entry['args']})", "ok": True,
                              "observed": seen}
                except Exception as exc:
                    record = {"step": entry["tool"], "ok": False,
                              "observed": f"{type(exc).__name__}: {exc}"}
                observations.append(record)

    try:
        # One budget covers connecting plus all calls.
        await asyncio.wait_for(_call_all(), timeout)
    except asyncio.TimeoutError:
        observations.append(
            {"step": "connect", "ok": False, "observed": f"timeout after {timeout}s"})
    except Exception as exc:
        observations.append(
            {"step": "connect", "ok": False, "observed": f"{type(exc).__name__}: {exc}"})
    return observations
|
pmkit/dogfood/report.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"""Dogfood findings model, parity check, and report rendering.
|
|
2
|
+
|
|
3
|
+
Pure functions over the drivers' result shapes. An observation is a dict:
|
|
4
|
+
``{"step": str, "ok": bool, "observed": Any, "claim": str (optional)}``. The report
|
|
5
|
+
normalizes install + UI + MCP observations into per-interface pass/fail findings, adds
|
|
6
|
+
parity findings (UI vs MCP must agree), and renders agent/human-readable markdown.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from dataclasses import asdict, dataclass, field
|
|
12
|
+
from typing import Any, Optional
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class Finding:
    """One pass/fail result attributed to a single interface."""

    interface: str      # install | ui | mcp | parity
    title: str          # short label for the step or check
    status: str         # pass | fail
    gap: bool           # True when the finding counts as a gap in the report
    claim: str = ""     # what was expected to hold (optional)
    observed: str = ""  # what was actually seen (optional)

    def to_dict(self) -> dict:
        """Return the fields as a plain ``dict`` (via ``dataclasses.asdict``)."""
        as_mapping = asdict(self)
        return as_mapping
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class DogfoodReport:
    """All findings gathered for one target, with summary helpers."""

    target: str
    findings: list[Finding] = field(default_factory=list)

    @property
    def gaps(self) -> list[Finding]:
        """The findings flagged as gaps, in their original order."""
        flagged = []
        for finding in self.findings:
            if finding.gap:
                flagged.append(finding)
        return flagged

    def passed(self) -> bool:
        """True when no finding is a gap (an empty report passes)."""
        return len(self.gaps) == 0

    def per_interface(self) -> dict[str, dict[str, int]]:
        """Count pass/fail findings per interface.

        Any status other than ``"pass"`` is counted as a fail.
        """
        tally: dict[str, dict[str, int]] = {}
        for finding in self.findings:
            if finding.interface not in tally:
                tally[finding.interface] = {"pass": 0, "fail": 0}
            bucket = "pass" if finding.status == "pass" else "fail"
            tally[finding.interface][bucket] += 1
        return tally

    def to_dict(self) -> dict:
        """Return the report, its summary, and every finding as plain data."""
        serialized = [finding.to_dict() for finding in self.findings]
        return {
            "target": self.target,
            "passed": self.passed(),
            "per_interface": self.per_interface(),
            "findings": serialized,
        }
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def from_install(install_report) -> list[Finding]:
    """Turn each step of an install report into an ``install`` finding.

    The observed text is the step's reason when it has one; otherwise ``"ok"``
    for a successful step, or the first 200 characters of its output.
    """

    def _observed(step) -> str:
        if step.reason:
            return step.reason
        if step.ok:
            return "ok"
        return (step.output or "")[:200]

    return [
        Finding(
            interface="install",
            title=step.command,
            status="pass" if step.ok else "fail",
            gap=step.gap,
            claim="documented install step succeeds",
            observed=_observed(step),
        )
        for step in install_report.steps
    ]
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _observations(interface: str, obs: Optional[list[dict]]) -> list[Finding]:
    """Convert driver observation dicts into findings for ``interface``.

    ``None`` or an empty list yields no findings. An observation without an
    ``"ok"`` key is treated as a pass; missing ``step``/``claim``/``observed``
    become empty strings, and present values are stringified.
    """
    findings: list[Finding] = []
    if not obs:
        return findings
    for entry in obs:
        succeeded = entry.get("ok", True)
        status = "pass" if succeeded else "fail"
        findings.append(Finding(
            interface=interface,
            title=str(entry.get("step", "")),
            status=status,
            gap=not succeeded,
            claim=str(entry.get("claim", "")),
            observed=str(entry.get("observed", "")),
        ))
    return findings
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def parity_check(ui_state: dict, mcp_state: dict) -> list[Finding]:
    """Compare the two surfaces' end states on shared keys; divergence is a gap.

    Outcomes:
    - both states empty: no findings (there was nothing to compare);
    - no key in common but at least one state non-empty: a single failing
      "not checkable" finding, so it cannot read as a clean pass;
    - shared keys that all match: a single passing finding;
    - otherwise: one failing finding per differing key, ordered by ``str(key)``.

    Values are compared by their ``str()`` form.
    """
    shared = set(ui_state) & set(mcp_state)

    if not shared:
        if not (ui_state or mcp_state):
            return []
        ui_keys = sorted(map(str, ui_state))
        mcp_keys = sorted(map(str, mcp_state))
        return [Finding(
            interface="parity",
            title="parity not checkable: surfaces share no state keys",
            status="fail",
            gap=True,
            claim="UI and MCP surfaces expose comparable state",
            observed=f"ui keys={ui_keys} mcp keys={mcp_keys}",
        )]

    diverged: list[Finding] = []
    for key in sorted(shared, key=str):
        ui_value, mcp_value = ui_state[key], mcp_state[key]
        if str(ui_value) == str(mcp_value):
            continue
        diverged.append(Finding(
            interface="parity",
            title=f"surfaces disagree on {key!r}",
            status="fail",
            gap=True,
            claim="UI and MCP surfaces agree (parity)",
            observed=f"ui={ui_value!r} mcp={mcp_value!r}",
        ))

    if diverged:
        return diverged
    return [Finding("parity", "UI/MCP parity holds", "pass", gap=False,
                    claim="UI and MCP surfaces agree", observed=f"{len(shared)} keys match")]
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def build_report(
    target: str,
    *,
    install=None,
    ui: Optional[list[dict]] = None,
    mcp: Optional[list[dict]] = None,
    ui_state: Optional[dict] = None,
    mcp_state: Optional[dict] = None,
) -> DogfoodReport:
    """Assemble a ``DogfoodReport`` from whichever driver results are supplied.

    Findings are added in a fixed order: install, UI, MCP, then parity. The
    parity check only runs when both ``ui_state`` and ``mcp_state`` are given
    (an empty dict counts as given).
    """
    collected: list[Finding] = []
    if install is not None:
        collected.extend(from_install(install))
    collected.extend(_observations("ui", ui))
    collected.extend(_observations("mcp", mcp))
    both_states_given = ui_state is not None and mcp_state is not None
    if both_states_given:
        collected.extend(parity_check(ui_state, mcp_state))
    return DogfoodReport(target, collected)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def render_markdown(report: DogfoodReport) -> str:
    """Render the report as markdown text.

    Layout: a title line, the overall result, a per-interface pass/fail tally,
    then a "Gaps" section listing every gap with its claim and observed text
    when those are non-empty (or ``- none`` when there are no gaps).
    """
    out = [f"# Dogfood report: {report.target}", ""]
    verdict = "PASS" if report.passed() else "GAPS FOUND"
    out += [f"**Result:** {verdict}", "", "## Per-interface"]
    out.extend(
        f"- {iface}: {counts['pass']} pass, {counts['fail']} fail"
        for iface, counts in report.per_interface().items()
    )

    gaps = report.gaps
    out += ["", f"## Gaps ({len(gaps)})"]
    if not gaps:
        out.append("- none")
    for gap in gaps:
        out.append(f"- [{gap.interface}] {gap.title}")
        # Claim / observed lines are optional detail under the gap entry.
        for label, text in (("claim", gap.claim), ("observed", gap.observed)):
            if text:
                out.append(f" - {label}: {text}")
    return "\n".join(out)
|
pmkit/dogfood/sample.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Sample-app synthesis.
|
|
2
|
+
|
|
3
|
+
When a product's documented scenario needs an app to act on (streamlit-mcp serves a
|
|
4
|
+
Streamlit app), pm-dogfood synthesizes a minimal representative one rather than requiring
|
|
5
|
+
the operator to supply it. Kept tiny and deterministic so the same scenario reruns cleanly.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
SAMPLE_STREAMLIT_APP = '''import streamlit as st
|
|
11
|
+
|
|
12
|
+
if "saves" not in st.session_state:
|
|
13
|
+
st.session_state.saves = 0
|
|
14
|
+
|
|
15
|
+
name = st.text_input("Name", value="world", key="name")
|
|
16
|
+
if st.button("Save", key="save"):
|
|
17
|
+
st.session_state.saves += 1
|
|
18
|
+
|
|
19
|
+
st.markdown(f"Hello, {name}!")
|
|
20
|
+
st.markdown(f"saves = {st.session_state.saves}")
|
|
21
|
+
'''
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def sample_streamlit_source() -> str:
    """Return the source text of the built-in sample Streamlit app."""
    source = SAMPLE_STREAMLIT_APP
    return source
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def synth_streamlit_app(path: str) -> str:
    """Write the sample Streamlit app to ``path`` and return the path.

    The file is opened for writing as UTF-8, replacing any existing content.
    """
    with open(path, mode="w", encoding="utf-8") as out:
        out.write(SAMPLE_STREAMLIT_APP)
    return path
|
pmkit/dogfood/ui.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Human/UI driver — drives a rendered app in a real browser via Playwright.
|
|
2
|
+
|
|
3
|
+
The step-translation (scenario -> action plan) is pure and unit-tested. The live browser
|
|
4
|
+
pass lazily imports Playwright and is gated on availability, so pmkit stays stdlib-only and
|
|
5
|
+
the test suite runs without a browser. Live selectors are best-effort for Streamlit (label
|
|
6
|
+
+ role) and are validated by the integration run (U6), not unit tests.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Any, Optional
|
|
12
|
+
|
|
13
|
+
SUPPORTED_ACTIONS = ("set", "click", "read")


def translate_steps(steps: list[dict]) -> list[dict]:
    """Validate + normalize inferred scenario steps into a UI action plan. Pure.

    Args:
        steps: Entries shaped like ``{"action": ..., "target": ..., "value": ...}``.
            ``action`` must be one of ``SUPPORTED_ACTIONS``; ``set`` and
            ``click`` need a target, ``set`` also needs a ``value`` key, and
            ``read`` may omit the target.

    Returns:
        A new list of ``{"action", "target", "value"}`` dicts; keys missing
        from the input are filled with ``None``.

    Raises:
        ValueError: If a step is not a mapping or breaks one of the rules
            above. The message names the step index.
    """
    from collections.abc import Mapping

    plan: list[dict] = []
    for i, s in enumerate(steps):
        if not isinstance(s, Mapping):
            # Report malformed entries with the same indexed ValueError as the
            # other validation failures instead of an AttributeError on .get.
            raise ValueError(f"step {i}: must be an object")
        action = s.get("action")
        if action not in SUPPORTED_ACTIONS:
            raise ValueError(f"step {i}: unknown action {action!r} (expected {SUPPORTED_ACTIONS})")
        target = s.get("target")
        if action in ("set", "click") and not target:
            raise ValueError(f"step {i}: '{action}' needs a target")
        # Key presence, not truthiness: an empty or None value is a legal thing to set.
        if action == "set" and "value" not in s:
            raise ValueError(f"step {i}: 'set' needs a value")
        plan.append({"action": action, "target": target, "value": s.get("value")})
    return plan
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def playwright_available() -> bool:
    """Report whether Playwright's sync API can be imported here.

    Any exception raised by the import counts as "not available".
    """
    # The sync API is imported on purpose rather than the top-level package: it
    # pulls in greenlet's compiled extension, so a shallow `import playwright`
    # can succeed while the browser runtime still cannot start (e.g. a missing
    # VC++ runtime DLL on Windows). Importing sync_playwright makes this gate
    # reflect launchability.
    try:
        from playwright.sync_api import sync_playwright  # noqa: F401
    except Exception:
        importable = False
    else:
        importable = True
    return importable
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def drive_ui(url: str, steps: list[dict], *, timeout_ms: int = 10000) -> list[dict]:
    """Drive the app's rendered UI in a real browser and return observations.

    Steps are validated before anything else; ``RuntimeError`` is raised if
    Playwright is absent. A failed navigation to ``url`` is returned as a single
    failing observation rather than raised, and the browser is closed on every
    path once it has been launched.
    """
    plan = translate_steps(steps)
    if not playwright_available():
        raise RuntimeError("Playwright not installed — run `pip install 'pmkit[dogfood]'` "
                           "then `playwright install chromium`")
    from playwright.sync_api import sync_playwright

    results: list[dict] = []
    with sync_playwright() as pw:
        browser = pw.chromium.launch()
        try:
            page = browser.new_page()
            try:
                page.goto(url, timeout=timeout_ms)
            except Exception as exc:
                # a failed connect is a gap, not a crash (and must not skip browser.close)
                results.append({"step": f"goto {url}", "ok": False,
                                "observed": f"{type(exc).__name__}: {exc}"})
            else:
                for action in plan:
                    results.append(_run_step(page, action))
        finally:
            browser.close()
    return results
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _settle(page: Any) -> None:
    """Give a reactive frontend (e.g. Streamlit) time to finish its rerun.

    Streamlit reruns asynchronously over a websocket: an interaction returns at
    once but the updated DOM is painted slightly later, so reading immediately
    would capture the pre-rerun page. This waits for the network to go idle on
    a best-effort basis (the websocket may never fully idle, so a failure here
    is ignored) and then always waits a short fixed paint budget.
    """
    idle_budget_ms = 3000
    paint_budget_ms = 400
    try:
        page.wait_for_load_state("networkidle", timeout=idle_budget_ms)
    except Exception:
        # Best-effort only: never reaching "networkidle" is expected and fine.
        pass
    page.wait_for_timeout(paint_budget_ms)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _run_step(page: Any, step: dict) -> dict:
    """Execute one planned UI action and return its observation dict.

    ``step`` must carry ``action``, ``target`` and ``value`` keys. Any action
    other than ``"set"`` or ``"click"`` is handled as a read. An exception
    raised while performing the action is returned as a failing observation
    instead of propagating.
    """
    action = step["action"]
    target = step["target"]
    value = step["value"]
    try:
        if action == "set":
            field_locator = page.get_by_label(target)
            field_locator.fill(str(value))
            # Streamlit (and many reactive inputs) only commit a typed value on
            # Enter/blur, not on a programmatic fill — without this the rerun
            # never fires and the new value is silently dropped.
            field_locator.press("Enter")
            _settle(page)
            label, seen = f"set {target}={value}", "set"
        elif action == "click":
            page.get_by_role("button", name=target).click()
            _settle(page)
            label, seen = f"click {target}", "clicked"
        else:
            # read: a specific text node when a target is given, else the whole body
            if target:
                text = page.get_by_text(target).inner_text()
            else:
                text = page.inner_text("body")
            label, seen = f"read {target or 'page'}", text[:500]
        return {"step": label, "ok": True, "observed": seen}
    except Exception as exc:
        return {"step": f"{action} {target}", "ok": False,
                "observed": f"{type(exc).__name__}: {exc}"}
|
pmkit/killtest.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""The kill-test survival rule — the single, tested source of truth.
|
|
2
|
+
|
|
3
|
+
A blunt "prune on >= N refutes" majority let a vendor-already-shipped idea survive on a
|
|
4
|
+
2-of-4 split (the Dash->MCP case). The fix: the **already-solved** axis is *dispositive* —
|
|
5
|
+
a confident already-solved refutation prunes a candidate regardless of how the other axes
|
|
6
|
+
voted (if the thing already exists, nothing else matters). Otherwise the strict majority
|
|
7
|
+
applies. The pm-run orchestrator calls this via `pmkit backlog killtest --decide` rather
|
|
8
|
+
than re-implementing the rule, so there's no JS/Python drift.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
SOLVED_AXIS = "already-solved"
SOLVED_CONFIDENCE = 0.7  # an already-solved refute at/above this is a hard kill


def decide_survival(verdicts: list[dict], prune_at: int = 3) -> tuple[bool, str]:
    """Return (survived, reason) for a candidate given its per-axis kill-test verdicts.

    Each verdict is a dict with at least ``verdict`` ('refute'|'survive') and ``axis``;
    ``confidence`` (0..1) is optional and defaults to 1.0 — both when the key is
    missing and when it is present but ``None`` (e.g. a JSON ``null``).

    Rules, in order:
    1. A refute on the already-solved axis with confidence at or above
       ``SOLVED_CONFIDENCE`` prunes the candidate regardless of the other axes.
    2. Otherwise the candidate is pruned when the number of refutes reaches
       ``prune_at``.
    3. Otherwise it survives.

    The reason for rule 1 is truncated to 200 characters.
    """
    refutes = [v for v in verdicts if v.get("verdict") == "refute"]
    for v in refutes:
        if v.get("axis") != SOLVED_AXIS:
            continue
        confidence = v.get("confidence")
        if confidence is None:
            # An explicit null means "not stated", same as a missing key;
            # float(None) would otherwise raise TypeError.
            confidence = 1.0
        if float(confidence) >= SOLVED_CONFIDENCE:
            reason = (v.get("reason") or "").strip()
            return False, f"pruned: already-solved is dispositive ({reason})"[:200]
    n = len(refutes)
    if n >= prune_at:
        return False, f"pruned: majority refute ({n}/{len(verdicts)})"
    return True, f"survived ({n} refute(s); need {prune_at}, no dispositive already-solved)"
|
pmkit/launch/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""pm-launch — the funnel's launch/amplify stage (deterministic core).
|
|
2
|
+
|
|
3
|
+
This package holds the *logistics* layer of launching a shipped product: the launch-state
|
|
4
|
+
ledger and mod-policy cache (``store``), moderator-policy verdicts (``policy``), the
|
|
5
|
+
listen/feedback loop (``listen``), the emit-only launch plan (``plan``), Tier-A collateral
|
|
6
|
+
capture (``collateral``), and draft-starting-point storage (``drafts``).
|
|
7
|
+
|
|
8
|
+
Hard boundaries, enforced here rather than left to convention:
|
|
9
|
+
- Nothing in this package posts to any channel — ever (the human gate lives in the skill).
|
|
10
|
+
- Drafts are *starting-points*; there is no "final"/"postable" state in the data model.
|
|
11
|
+
- The launch plan is emit-only: it renders an artifact and creates no cron side-effects.
|
|
12
|
+
|
|
13
|
+
Judgment (which channels, reading prose rules, writing copy, slop critique) lives in the
|
|
14
|
+
``agents/pm-launch-*`` personas; this package is the rule/ledger half.
|
|
15
|
+
"""
|