praxis-qa 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. praxis_qa-0.0.1/.gitignore +31 -0
  2. praxis_qa-0.0.1/LICENSE +7 -0
  3. praxis_qa-0.0.1/PKG-INFO +79 -0
  4. praxis_qa-0.0.1/README.md +55 -0
  5. praxis_qa-0.0.1/docs/adr/README.md +38 -0
  6. praxis_qa-0.0.1/docs/phase-2-features/README.md +22 -0
  7. praxis_qa-0.0.1/experiments/exploration_reward/README.md +42 -0
  8. praxis_qa-0.0.1/experiments/multi_writer/README.md +50 -0
  9. praxis_qa-0.0.1/experiments/regression_recall_real/README.md +91 -0
  10. praxis_qa-0.0.1/experiments/ui-mutation/README.md +75 -0
  11. praxis_qa-0.0.1/pyproject.toml +87 -0
  12. praxis_qa-0.0.1/schema/examples/create-welcome-popup.knowledge.yaml +58 -0
  13. praxis_qa-0.0.1/schema/examples/login.knowledge.yaml +82 -0
  14. praxis_qa-0.0.1/schema/knowledge.schema.json +177 -0
  15. praxis_qa-0.0.1/src/praxis/__init__.py +19 -0
  16. praxis_qa-0.0.1/src/praxis/adapters/__init__.py +38 -0
  17. praxis_qa-0.0.1/src/praxis/adapters/browser_use.py +244 -0
  18. praxis_qa-0.0.1/src/praxis/adapters/playwright.py +68 -0
  19. praxis_qa-0.0.1/src/praxis/adapters/spi.py +182 -0
  20. praxis_qa-0.0.1/src/praxis/auth_session.py +341 -0
  21. praxis_qa-0.0.1/src/praxis/cli/__init__.py +14 -0
  22. praxis_qa-0.0.1/src/praxis/cli/claude_brain.py +352 -0
  23. praxis_qa-0.0.1/src/praxis/cli/main.py +1010 -0
  24. praxis_qa-0.0.1/src/praxis/merge/__init__.py +53 -0
  25. praxis_qa-0.0.1/src/praxis/merge/candidates.py +232 -0
  26. praxis_qa-0.0.1/src/praxis/merge/decay.py +495 -0
  27. praxis_qa-0.0.1/src/praxis/merge/projection.py +653 -0
  28. praxis_qa-0.0.1/src/praxis/metrics/__init__.py +42 -0
  29. praxis_qa-0.0.1/src/praxis/metrics/exploration_reward.py +297 -0
  30. praxis_qa-0.0.1/src/praxis/model/__init__.py +79 -0
  31. praxis_qa-0.0.1/src/praxis/model/check.py +155 -0
  32. praxis_qa-0.0.1/src/praxis/model/knowledge.py +419 -0
  33. praxis_qa-0.0.1/src/praxis/model/predicate.py +315 -0
  34. praxis_qa-0.0.1/src/praxis/model/trigger_validator.py +99 -0
  35. praxis_qa-0.0.1/src/praxis/oracle/__init__.py +55 -0
  36. praxis_qa-0.0.1/src/praxis/oracle/trust.py +267 -0
  37. praxis_qa-0.0.1/src/praxis/resources.py +133 -0
  38. praxis_qa-0.0.1/src/praxis/runner/__init__.py +101 -0
  39. praxis_qa-0.0.1/src/praxis/runner/_parallel.py +92 -0
  40. praxis_qa-0.0.1/src/praxis/runner/engine.py +315 -0
  41. praxis_qa-0.0.1/src/praxis/runner/exploration.py +334 -0
  42. praxis_qa-0.0.1/src/praxis/runner/prompts.py +214 -0
  43. praxis_qa-0.0.1/src/praxis/runner/regression.py +844 -0
  44. praxis_qa-0.0.1/src/praxis/runner/report.py +644 -0
  45. praxis_qa-0.0.1/src/praxis/secrets.py +242 -0
  46. praxis_qa-0.0.1/src/praxis/skill_driver.py +224 -0
  47. praxis_qa-0.0.1/src/praxis/skills/__init__.py +15 -0
  48. praxis_qa-0.0.1/src/praxis/skills/praxis-explore/SKILL.md +160 -0
  49. praxis_qa-0.0.1/src/praxis/skills/praxis-regress/SKILL.md +175 -0
  50. praxis_qa-0.0.1/src/praxis/skills/praxis-teach/SKILL.md +246 -0
  51. praxis_qa-0.0.1/src/praxis/store/__init__.py +106 -0
  52. praxis_qa-0.0.1/src/praxis/store/agent_identity.py +80 -0
  53. praxis_qa-0.0.1/src/praxis/store/candidate_files.py +191 -0
  54. praxis_qa-0.0.1/src/praxis/store/events.py +239 -0
  55. praxis_qa-0.0.1/src/praxis/store/file_store.py +465 -0
  56. praxis_qa-0.0.1/src/praxis/teach/__init__.py +68 -0
  57. praxis_qa-0.0.1/src/praxis/teach/session.py +1056 -0
@@ -0,0 +1,31 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ .venv/
5
+ venv/
6
+ .env
7
+ *.egg-info/
8
+ dist/
9
+ build/
10
+ .pytest_cache/
11
+ .ruff_cache/
12
+ .mypy_cache/
13
+
14
+ # Local stores (never commit live memory)
15
+ *.mneme-store/
16
+ .mneme/
17
+ .praxis/
18
+ local-memory/
19
+
20
+ # Secrets + per-machine claude config (never commit API keys / tokens)
21
+ .claude/secrets.env
22
+ .claude/*.local.*
23
+
24
+ # mkdocs-material build output (ADR-0025 docs site; regenerable via `mkdocs build`)
25
+ site/
26
+
27
+ # Generated experiment outputs (reproducible via harness.py / LOCAL_RUN.md)
28
+ experiments/ui-mutation/results.json
29
+ experiments/ui-mutation/results.md
30
+ experiments/ui-mutation/seed_*.knowledge.yaml
31
+ experiments/ui-mutation/recorded_*.py
@@ -0,0 +1,7 @@
1
+ SPDX-License-Identifier: Apache-2.0
2
+
3
+ Recommended license: Apache-2.0 (permissive, patent grant, good for an
4
+ open-core library that seeds adoption while a hosted trust layer is monetized).
5
+
6
+ Replace this file with the full Apache-2.0 text before publishing:
7
+ https://www.apache.org/licenses/LICENSE-2.0.txt
@@ -0,0 +1,79 @@
1
+ Metadata-Version: 2.4
2
+ Name: praxis-qa
3
+ Version: 0.0.1
4
+ Summary: A shared semantic-memory layer for QA agents. Agents store and maintain knowledge about a system under test (goals, recognition signals, success/failure oracles, alternative paths, risks) decoupled from the procedure used to reach it.
5
+ Author: Your Name
6
+ License: Apache-2.0
7
+ License-File: LICENSE
8
+ Keywords: agents,browser-automation,memory,model-based-testing,qa,testing
9
+ Requires-Python: >=3.11
10
+ Requires-Dist: pydantic>=2
11
+ Requires-Dist: pyyaml>=6
12
+ Provides-Extra: browser-use
13
+ Requires-Dist: browser-use; extra == 'browser-use'
14
+ Provides-Extra: dev
15
+ Requires-Dist: jsonschema>=4; extra == 'dev'
16
+ Requires-Dist: mypy>=1.10; extra == 'dev'
17
+ Requires-Dist: pytest>=8; extra == 'dev'
18
+ Requires-Dist: ruff>=0.5; extra == 'dev'
19
+ Provides-Extra: docs
20
+ Requires-Dist: mkdocs-material>=9; extra == 'docs'
21
+ Provides-Extra: live
22
+ Requires-Dist: anthropic>=0.40; extra == 'live'
23
+ Description-Content-Type: text/markdown
24
+
25
+ # Mneme
26
+
27
+ > A shared **semantic-memory layer for QA agents.**
28
+ > Codename — rename freely (`praxis` → your brand) in `pyproject.toml` and `src/`.
29
+
30
+ Most testing tools store **procedures** (click A, fill B, assert C). Mneme stores
31
+ **knowledge about the system under test** — goals, how to recognize states, what
32
+ success and failure actually look like, which alternative paths exist, and which
33
+ risks lurk — and keeps that knowledge **decoupled from the steps** any single run
34
+ happened to use.
35
+
36
+ Agents read the knowledge to attempt a goal, **regenerate their own steps**, and
37
+ write back what they observed. Over time the memory becomes a living model of the
38
+ app, maintained by agents instead of by hand.
39
+
40
+ ```yaml
41
+ goal: A returning user can establish an authenticated session.
42
+ success_signals:
43
+ - a logout action becomes available # behavioral, durable
44
+ - POST /session returns 2xx + session cookie # network, durable
45
+ alternative_paths: [email-password, social-oauth]
46
+ known_risks:
47
+ - captcha (trigger: several consecutive failures)
48
+ - mfa (trigger: account has MFA / new device)
49
+ ```
50
+
51
+ ## Why this is not "another test framework"
52
+ This is **model-based testing reborn**: the discipline that failed historically
53
+ because maintaining the model by hand cost more than the tests it replaced.
54
+ The bet here is that **agents can build and maintain the model themselves**,
55
+ which inverts that economics. The procedure is disposable; the knowledge is the asset.
56
+
57
+ ## What's in this repo
58
+ - `docs/` — the full design: vision, architecture, schema, MVP experiment, risks, roadmap.
59
+ - `docs/adr/` — the load-bearing decisions and why.
60
+ - `schema/` — the language-neutral knowledge schema (JSON Schema) + real examples.
61
+ - `src/praxis/` — package skeleton (model, store, merge, oracle, adapters).
62
+ - `experiments/ui-mutation/` — the one experiment that validates or kills the idea.
63
+
64
+ ## Start here
65
+ 1. `docs/00-product-brief.md` — the one-page pitch.
66
+ 2. `AGENTS.md` — how Claude Code should build this (non-negotiables included).
67
+ 3. `experiments/ui-mutation/README.md` — build this first.
68
+
69
+ ## Non-negotiables (the spine of the design)
70
+ 1. Store **invariants, not coordinates**.
71
+ 2. Every assertion carries **provenance + confidence** (ADR-0004).
72
+ 3. The store is **append-only** (ADR-0001) — no overwrite of knowledge.
73
+ 4. Core stays **runtime-agnostic**; runtime code lives behind adapters (ADR-0003).
74
+ 5. The **oracle is sacred** — a success oracle is believed only via evidence
75
+ diversity (≥2 different signal types) or a human/spec seed, never by counting
76
+ agents; the first oracle is seeded (ADR-0005). Silent poisoning is the way this
77
+ product dies (docs/06).
78
+
79
+ License: Apache-2.0 (recommended).
@@ -0,0 +1,55 @@
1
+ # Mneme
2
+
3
+ > A shared **semantic-memory layer for QA agents.**
4
+ > Codename — rename freely (`praxis` → your brand) in `pyproject.toml` and `src/`.
5
+
6
+ Most testing tools store **procedures** (click A, fill B, assert C). Mneme stores
7
+ **knowledge about the system under test** — goals, how to recognize states, what
8
+ success and failure actually look like, which alternative paths exist, and which
9
+ risks lurk — and keeps that knowledge **decoupled from the steps** any single run
10
+ happened to use.
11
+
12
+ Agents read the knowledge to attempt a goal, **regenerate their own steps**, and
13
+ write back what they observed. Over time the memory becomes a living model of the
14
+ app, maintained by agents instead of by hand.
15
+
16
+ ```yaml
17
+ goal: A returning user can establish an authenticated session.
18
+ success_signals:
19
+ - a logout action becomes available # behavioral, durable
20
+ - POST /session returns 2xx + session cookie # network, durable
21
+ alternative_paths: [email-password, social-oauth]
22
+ known_risks:
23
+ - captcha (trigger: several consecutive failures)
24
+ - mfa (trigger: account has MFA / new device)
25
+ ```
26
+
27
+ ## Why this is not "another test framework"
28
+ This is **model-based testing reborn**: the discipline that failed historically
29
+ because maintaining the model by hand cost more than the tests it replaced.
30
+ The bet here is that **agents can build and maintain the model themselves**,
31
+ which inverts that economics. The procedure is disposable; the knowledge is the asset.
32
+
33
+ ## What's in this repo
34
+ - `docs/` — the full design: vision, architecture, schema, MVP experiment, risks, roadmap.
35
+ - `docs/adr/` — the load-bearing decisions and why.
36
+ - `schema/` — the language-neutral knowledge schema (JSON Schema) + real examples.
37
+ - `src/praxis/` — package skeleton (model, store, merge, oracle, adapters).
38
+ - `experiments/ui-mutation/` — the one experiment that validates or kills the idea.
39
+
40
+ ## Start here
41
+ 1. `docs/00-product-brief.md` — the one-page pitch.
42
+ 2. `AGENTS.md` — how Claude Code should build this (non-negotiables included).
43
+ 3. `experiments/ui-mutation/README.md` — build this first.
44
+
45
+ ## Non-negotiables (the spine of the design)
46
+ 1. Store **invariants, not coordinates**.
47
+ 2. Every assertion carries **provenance + confidence** (ADR-0004).
48
+ 3. The store is **append-only** (ADR-0001) — no overwrite of knowledge.
49
+ 4. Core stays **runtime-agnostic**; runtime code lives behind adapters (ADR-0003).
50
+ 5. The **oracle is sacred** — a success oracle is believed only via evidence
51
+ diversity (≥2 different signal types) or a human/spec seed, never by counting
52
+ agents; the first oracle is seeded (ADR-0005). Silent poisoning is the way this
53
+ product dies (docs/06).
54
+
55
+ License: Apache-2.0 (recommended).
@@ -0,0 +1,38 @@
1
+ # Architecture Decision Records
2
+
3
+ Short, immutable records of *why* a decision was made. Add a new numbered file
4
+ per decision; never rewrite a superseded one (only mark it `Superseded by ADR-XXXX`).
5
+
6
+ | ADR | Decision |
7
+ |-----|----------|
8
+ | 0001 | Append-only event log is the source of truth |
9
+ | 0002 | The knowledge schema is the neutral interop layer (not a wire protocol) |
10
+ | 0003 | Runtime-specific code lives only behind an adapter SPI |
11
+ | 0004 | Provenance + confidence are mandatory on every assertion |
12
+ | 0005 | Oracle trust by evidence diversity; cold-start via seeded oracles |
13
+ | 0006 | Phase-0 status semantics - "uncorroborated" maps to `contested` |
14
+ | 0007 | Phase-0 existential gate cleared (provisionally) - proceed to Phase 1 |
15
+ | 0008 | Type-diversity needs source-independence (Phase-1 oracle hardening) |
16
+ | 0009 | Phase 1 scope, regression-recall falsifier, and the praxis reframe |
17
+ | 0010 | Phase 1 regression-recall gate cleared (provisionally) - proceed to Phase 1.5 |
18
+ | 0011 | Phase 2 scope: five load-bearing items, schema activations, and Phase 1.5 / Phase 3 deferrals (Accepted) |
19
+ | 0012 | Multi-writer concurrency contract: file-per-event store, source_id = agent_identity, day-one adversarial harness (Accepted) |
20
+ | 0013 | Recency decay as projection-time derivation; status flips emit decay events, anchored by observed_app_version (Accepted) |
21
+ | 0014 | E-mode candidate persistence as sibling CandidateEvent type with the same diversity-or-seed promotion rule (Accepted) |
22
+ | 0015 | Exploration reward pre-registered, observability-only in Phase 2, paired with adversarial Goodhart review and random-walk baseline (Accepted) |
23
+ | 0016 | Real-app SUT selection: pre-registered criteria, Conduit recommended (Saleor fallback), new run dir parallel to Phase 1 (Accepted) |
24
+ | 0017 | Additive auth_state projected field (authenticated + scope), adapter-boundary redaction, no tokens/cookies/PII in knowledge (Accepted) |
25
+ | 0018 | Phase 3 scope and the library-plus-git reframe: no SaaS, git is the shared memory, Claude Code is the local brain (Accepted) |
26
+ | 0019 | Brain pluggability and execution surfaces: deterministic vs agentic, local Claude Code skill vs CI API-key agent, teach is skill-only (Accepted) |
27
+ | 0020 | PyPI packaging and distribution: dist name praxis-qa, one universal wheel, schema and skills as package data, stable public API surface (Accepted) |
28
+ | 0021 | The .praxis/ repository convention: git as shared memory, committed knowledge/candidates, gitignored runs and .praxis.secrets, one file per candidate observation (Accepted) |
29
+ | 0022 | The teach operation as a Claude Code skill: human-in-the-loop seed, typed prompts, credentials never persisted, no silent overwrite of a believed goal (Accepted) |
30
+ | 0023 | praxis regress and explore dual surface: console CLI plus skill, aggregate default, OK/REGRESSED/STALE break-vs-drift report, candidate dedup by trigger (Accepted) |
31
+ | 0024 | CI integration by invoking the console commands: Praxis owns no CI machinery, the team owns push/PR/auth, promotion stays a human merge (Accepted) |
32
+ | 0025 | Landing page and docs site: minimal non-engineer story, no analytics/signup/SaaS funnel, mkdocs from docs/, documented example CI workflow (Accepted) |
33
+ | 0026 | Persistent authenticated-session reuse: reuse the saved browser session so 2FA is not needed every run, session is a secret (local file or CI secret, never knowledge), AUTH-EXPIRED is a third verdict (Proposed) |
34
+ | 0027 | Self-contained console test runner driven by a local claude -p brain (subscription, no API key, headless, pytest-style), plus auth-as-subject vs auth-as-precondition: an auth-subject goal performs a real login, a feature goal reuses the session (Accepted) |
35
+ | 0028 | Regress agent confirms every believed success signal in its declared type: align the prompt with the exact-type matcher, keep the matcher and Jaccard floor unchanged, never let "confirm all" become "tick all", seed only reproducible types (Accepted) |
36
+ | 0029 | Agent self-observations cannot self-certify the oracle: per-summary promotion on its own merit (seeded or genuine different-type different-source corroboration), regress does not persist promotable agent observations, INHERENT seed-rides-single-agent boundary preserved (Accepted) |
37
+ | 0030 | Signals as checkable facts with explicit variable slots: a signal value can be a predicate hard on the invariant and tolerant only on declared per-run instance tokens, matched by evaluating the predicate (no Jaccard), additive over the free-text path, never activating deferred states/paths (Accepted) |
38
+ | 0031 | Signals as structured checks for relational and after-action facts: an optional typed `check` (list_count_delta, element_membership) evaluated programmatically over self-reported before/after observation data, the stricter third tier above value_predicate, agent self-reports the baseline (no runner change), never a false PASS, never activating deferred states/paths (Accepted) |
@@ -0,0 +1,22 @@
1
+ # Phase 2 features
2
+
3
+ Phase 1 ended with a verdict: the memory arm beat the steelmanned cold-readme baseline by a wide margin on every pre-registered gate, so the operational-knowledge moat survives. The decision was CONTINUE, with caveats. Phase 2 is the follow-up: take the same machinery off the toy app it was tuned against, let multiple agents write into the shared memory at once without poisoning it, age out knowledge that nobody is re-confirming, persist agent hunches across runs so a human can act on them, and add a single hidden number that says whether exploration is paying for its tokens. Five features ship under six ADRs (ADR-0012 through ADR-0017).
4
+
5
+ Each link below points to a feature doc written for non-engineers. The ADRs they reference live under `docs/adr/`.
6
+
7
+ ## Features
8
+
9
+ 1. [Multi-writer concurrency](01-multi-writer.md) (ADR-0012). Lets many QA agents append to the same shared memory at once, without losing notes and without letting identical agents fake agreement to promote a bad finding.
10
+
11
+ 2. [Recency decay](02-recency-decay.md) (ADR-0013). Marks knowledge "stale" when no agent has re-confirmed it within a pre-registered number of app versions or days, and writes a visible audit record every time something ages out.
12
+
13
+ 3. [Candidate persistence](03-candidate-persistence.md) (ADR-0014). Saves exploring-agent hunches across runs as "contested" so a human can review them, while preventing any single agent from voting itself into the trusted set.
14
+
15
+ 4. [Exploration reward](04-exploration-reward.md) (ADR-0015). A single hidden number per exploration run that scores useful new knowledge per token spent. The agent never sees it, so it cannot game it.
16
+
17
+ 5. [Real-app SUT: Conduit + auth_state](05-real-app-sut.md) (ADR-0016, ADR-0017). Moves the experiment off the in-repo toy app and onto Conduit, a public Medium-clone, and adds a small `auth_state` field to the schema that records login posture without ever storing credentials.
18
+
19
+ ## Reference
20
+
21
+ - Phase 1 verdict: [ADR-0010](../adr/0010-phase-1-regression-recall-verdict.md)
22
+ - Phase 2 scope: [ADR-0011](../adr/0011-phase-2-scope-and-deferrals.md)
@@ -0,0 +1,42 @@
1
+ # Exploration reward (observability-only, ADR-0015)
2
+
3
+ Phase 2 introduces an explicit exploration incentive so concurrent
4
+ writers do not silently converge on the happy path and shrink coverage
5
+ (`docs/05`, `docs/06`, AGENTS.md Phase 2 brief). This directory hosts
6
+ the pre-registered artifacts the ADR demands BEFORE any Phase 2
7
+ experiment may report the reward number.
8
+
9
+ The reward formula is locked verbatim in ADR-0015 sec 1:
10
+
11
+ ```
12
+ reward = (resolved_uncertainties + alpha * new_unique_candidate_risks) / budget_tokens
13
+ ```
14
+
15
+ The implementation lives in `src/praxis/metrics/exploration_reward.py`.
16
+ This directory holds:
17
+
18
+ - `pre_registration.md` - alpha + resolution criterion + canonicalization
19
+ rule, sealed under `praxis_git_sha` at run-start.
20
+ - `goodhart_attacks.md` - the >= 8 named attack vectors + mitigations.
21
+ ADR-0015 sec 4 makes this a hard pre-run gate: no Phase 2 experiment
22
+ may report this reward until this file exists and lands in the same
23
+ commit as the reward instrumentation.
24
+ - `metrics.py` - thin wrapper that consumes a run's projection and
25
+ produces a `RunReward` row. Composes
26
+ `src/praxis/metrics/exploration_reward.py`; does not re-implement the
27
+ formula.
28
+
29
+ Observability-only contract (ADR-0015 sec 2): the reward does NOT feed
30
+ back into agent state, prompt selection, or budget allocation in Phase 2.
31
+ The instant the reward is visible to the optimizer, the canonicalization
32
+ rule stops being defense and becomes attack surface. The Goodhart
33
+ adversarial review (`goodhart_attacks.md`) exists precisely to surface
34
+ attacks before the engineers iterating on E-mode prompts see the
35
+ numbers and adjust toward them.
36
+
37
+ Random-walk baseline (ADR-0015 sec 5): on the first Phase 2 multi-writer
38
+ experiment, a `random_walk` arm runs concurrently with the `memory` arm
39
+ under the same budget on the same SUT. Both arms compute reward via the
40
+ same formula. `random_walk` receives no risks and no uncertainties as
41
+ input. If `memory` does not exceed `random_walk`, the exploration
42
+ incentive has failed and Phase 2 returns to the kill/continue gate.
@@ -0,0 +1,50 @@
1
+ # Multi-writer adversarial harness (ADR-0012)
2
+
3
+ This experiment is the day-one assurance for the multi-writer concurrency
4
+ contract. It ships in the same commit as the multi-writer file_store
5
+ changes per ADR-0012 section 4 and is wired into `bash verify.sh` so the CI
6
+ gate never lets a regression merge silently.
7
+
8
+ ## What each scenario asserts
9
+
10
+ | Scenario | Property under test |
11
+ |----------------------------|-----------------------------------------------------------|
12
+ | `concurrent_same_source` | N writers sharing one `agent_identity` race to append. Zero lost events AND zero false-promote: same-source same-type evidence stays `contested` no matter the count. |
13
+ | `concurrent_diverse_source`| Writers across distinct `agent_identity` values bringing different signal types. Zero lost events AND legitimate diversity-or-seed promotion to `believed`. |
14
+ | `racing_contradiction` | Two distinct sources race on a failure signal with disagreeing `present`. The projection surfaces `contested`, not last-write-wins. |
15
+ | `racing_oscillation` | Alternating presence across writers produces `quarantined` per ADR-0005, derived from the event set (no flag mutated on the underlying events). |
16
+ | `partial_write_failure` | A leftover `*.tmp` from a crashed writer (post tmp-write, pre-rename) is ignored by readers; rename is the commit point. |
17
+
18
+ ## Running
19
+
20
+ Direct:
21
+
22
+ ```
23
+ python experiments/multi_writer/harness.py
24
+ ```
25
+
26
+ Via the verify gate (recommended):
27
+
28
+ ```
29
+ bash verify.sh
30
+ ```
31
+
32
+ The pytest wrapper at `tests/test_multi_writer_harness.py` calls `run_all()`
33
+ so any new scenario added to the harness automatically participates in the
34
+ test suite.
35
+
36
+ ## Why these scenarios live under `experiments/` and not `tests/`
37
+
38
+ ADR-0012 section 4 makes the harness a first-class delivery artifact: ship in
39
+ the same commit, run on every verify, refuse merges that skip it. Mixing
40
+ that with the plain pytest tests under `tests/` would hide the contract.
41
+ The thin pytest wrapper exists so the harness participates in the regular
42
+ test gate, but the scenarios themselves stay here.
43
+
44
+ ## Cross-tenant scope
45
+
46
+ The harness deliberately does NOT include a cross-tenant write scenario
47
+ (ADR-0012 section 3); that lives as a unit test in
48
+ `tests/test_multi_writer.py` because tenancy is a constructor / boundary check, not a contention
49
+ race. Surfacing it in the harness would dilute the "is the contention path
50
+ sound?" question this harness exists to answer.
@@ -0,0 +1,91 @@
1
+ # Phase 2 regression-recall on a real OSS SUT (Conduit)
2
+
3
+ This package is the Phase 2 port of the Phase 1 regression-recall
4
+ experiment off the synthetic `experiments/ui-mutation/testapp.py` and
5
+ onto a real OSS application. The SUT pick (Conduit, the RealWorld
6
+ reference Medium-clone) is sealed by ADR-0016; the additive `auth_state`
7
+ schema field is sealed by ADR-0017.
8
+
9
+ Phase 1 sealed artifacts under `experiments/regression_recall/` are NOT
10
+ edited by this port; this directory is parallel and independent.
11
+
12
+ ## Layout
13
+
14
+ ```
15
+ experiments/regression_recall_real/
16
+ __init__.py # package docstring + goal slate summary
17
+ README.md # this file
18
+ pre_registration.md # sealed-before-run artifact inventory
19
+ manifest.json # SUT identity + goal slate + planted regressions
20
+ manifest.py # typed loader for manifest.json
21
+ knowledge/
22
+ login.knowledge.yaml
23
+ publish_article.knowledge.yaml
24
+ favorite_article.knowledge.yaml
25
+ follow_user.knowledge.yaml
26
+ edit_article.knowledge.yaml
27
+ setup/
28
+ docker-compose.yml # backend + frontend; pinned image tags
29
+ bring_up.sh # idempotent bring-up; --check / --teardown subcommands
30
+ ```
31
+
32
+ ## Goal slate (ADR-0016 sec 4)
33
+
34
+ Five Conduit goals, parallel-but-distinct to the Phase-1 four:
35
+
36
+ - `login` parallels Phase-1 `login`.
37
+ - `publish_article` parallels Phase-1 `checkout` (multi-step mutating flow).
38
+ - `favorite_article` parallels Phase-1 `checkout` idempotency.
39
+ - `follow_user` parallels Phase-1 `admin_access` (mutating flow with
40
+ authentication precondition + a knowledge-visible self-follow trap).
41
+ - `edit_article` parallels Phase-1 `admin_access` directly (auth-scope
42
+ check: only the article author may edit).
43
+
44
+ Each goal's knowledge file activates the Phase-2 additive `auth_state`
45
+ projection (ADR-0017): the agent records that a successful goal leaves
46
+ the session authenticated at `user` scope.
47
+
48
+ ## Bring-up
49
+
50
+ ```
51
+ bash experiments/regression_recall_real/setup/bring_up.sh
52
+ ```
53
+
54
+ ADR-0016 sec 1 caps cold-cache bring-up at 30 minutes wall time on a
55
+ developer laptop. The `--check` subcommand verifies an already-running
56
+ stack idempotently; `--teardown` removes it.
57
+
58
+ The slow bring-up test (`tests/test_conduit_bringup.py`) is GATED behind
59
+ the env var `PRAXIS_RUN_CONDUIT_BRINGUP=1` so `bash verify.sh` stays fast
60
+ by default. To execute the C1 gate explicitly:
61
+
62
+ ```
63
+ PRAXIS_RUN_CONDUIT_BRINGUP=1 python -m pytest tests/test_conduit_bringup.py -q
64
+ ```
65
+
66
+ ## Phase-2 schema delta: `auth_state`
67
+
68
+ ADR-0017 adds `auth_state: {authenticated: bool, scope: string|null}`
69
+ as a projected field on the per-goal knowledge surface. The field is
70
+ defined in `schema/knowledge.schema.json` and mirrored in
71
+ `src/praxis/model/knowledge.py`; the agreement test in
72
+ `tests/test_model_schema_agree.py` catches drift.
73
+
74
+ The field MUST NOT carry tokens, cookies, user/account/session
75
+ identifiers, JWT contents, emails, or tenant/org/workspace scoping
76
+ (ADR-0017 sec 2). The validator in the pydantic model rejects these on
77
+ write; the adapter-boundary check
78
+ (`praxis.adapters.assert_auth_state_observation_safe`) catches
79
+ forbidden field names slipping through textual redaction.
80
+
81
+ ## What this package does NOT include yet
82
+
83
+ Per the Phase-2 plan, the following land as separate ADRs / commits:
84
+
85
+ - Multi-writer adversarial harness (ADR-0012).
86
+ - Recency-decay projection (ADR-0013).
87
+ - E-mode candidate persistence as a sibling event type (ADR-0014).
88
+ - Exploration-reward observability (ADR-0015).
89
+
90
+ Each is its own feature with its own implementation; this package is
91
+ the SUT + schema slice (ADR-0016 + ADR-0017) only.
@@ -0,0 +1,75 @@
1
+ # Experiment: the decisive UI-mutation test
2
+
3
+ KILL or VALIDATE the thesis before building the product. Build this FIRST.
4
+
5
+ ## Claim
6
+ Goal+knowledge step regeneration is (1) cheaper / more reliable than a COLD
7
+ agent, and (2) more robust than a RECORDED script when the UI changes.
8
+
9
+ ## Arms
10
+ - `memory` — agent reads believed knowledge and regenerates its own steps.
11
+ - `cold_agent` — same agent, no memory, figures it out each run.
12
+ - `recorded_script` — a Playwright script captured once (the brittle baseline).
13
+
14
+ ## Setup
15
+ - Runtime: Browser Use. One writer. Flows: login, search, checkout.
16
+ - Minimal Phase-0 schema (`schema/knowledge.schema.json`).
17
+ - **Seed each goal's success oracle (human/spec) before exploring** (ADR-0005).
18
+ - Memory run 1: explore, populate knowledge. Run 2+: achieve the goal USING
19
+ knowledge, regenerating steps.
20
+
21
+ ## Measure (ORDER MATTERS)
22
+ 1. **Existential gate — `memory` vs `cold_agent`:** tokens + wall time + success
23
+ rate, no mutation. If cold wins or ties on cost at equal reliability, STOP.
24
+ 2. **Robustness — after a mutation (`mutate.py`):** `memory` vs `recorded_script`
25
+ recovery rate.
26
+ 3. **Guardrail — oracle correctness:** false-pass / false-fail across all runs.
27
+
28
+ ## Files
29
+ - `metrics.py` — `RunResult` + the gates + report writer.
30
+ - `harness.py` — runs the arms; checks the existential gate FIRST and short-circuits.
31
+ - `mutate.py` — UI mutation injector (rename control, move field, swap
32
+ email→username, insert step). Each mutation changes HOW, never WHETHER.
33
+
34
+ ## Kill criterion
35
+ Stop unless `memory` clears all three: cheaper-or-equal vs cold at equal
36
+ reliability, more robust vs the recorded script, and oracle false-pass below
37
+ brittle-test levels.
38
+
39
+ ## Files (implemented)
40
+ - `metrics.py` — `RunResult` + the three gates + verdict + report writers.
41
+ - `harness.py` — runs the arms, checks the existential gate FIRST, short-circuits.
42
+ - `mutate.py` — the four UI mutations (rename / move / swap / insert).
43
+ - `simapp.py` — a deterministic, in-process stand-in for the SUT + the three arms.
44
+ - `runtimes.py` — wires the arms through the REAL core (store→merge→oracle→adapter).
45
+
46
+ ## How to run
47
+ ```bash
48
+ python experiments/ui-mutation/harness.py # prints the three gates + verdict
49
+ pytest tests/test_experiment_harness.py # asserts the gate machinery
50
+ ```
51
+ Outputs `results.json` (per-run) and `results.md` (summary).
52
+
53
+ ## ⚠️ What these numbers are (and are NOT)
54
+ A *live* existential gate needs Browser Use + an LLM + a real SUT — none of which
55
+ run in CI/sandbox. So the harness runs against `simapp`, a deterministic stand-in.
56
+ Its token magnitudes are **explicit assumptions encoding the thesis premise**
57
+ (recognition via a remembered oracle is cheaper than re-deriving it cold), **not
58
+ measurements**. The sim therefore validates the *machinery* — the gate ordering,
59
+ the metrics, the diversity-or-seed oracle, the mutation flow, and the kill/continue
60
+ logic — end-to-end. It does **not** by itself validate the thesis.
61
+
62
+ What IS real in the sim path: the append-only store, the believed projection, and
63
+ the oracle's diversity-or-seed rule (the actual production code). What is modeled:
64
+ the SUT and the agent's token cost.
65
+
66
+ ## Wiring the live arm (to get empirical numbers)
67
+ 1. `pip install -e ".[dev,browser-use]"` and set an LLM key.
68
+ 2. Stand up a real test app (or a hosted target) with the login/search/checkout
69
+ flows; implement `mutate.apply/reset` against it (proxy / DOM patch / feature
70
+ flag) instead of the in-process `simapp` state.
71
+ 3. Replace `simapp.run_memory/run_cold/run_recorded` with calls that drive
72
+ `BrowserUseAdapter` (memory/cold) and an emitted Playwright script
73
+ (`adapters.playwright.RecordedScript`, recorded baseline), measuring real tokens
74
+ and wall time.
75
+ 4. Re-run `harness.py`. The gate/verdict logic is unchanged; only the runtime swaps.
@@ -0,0 +1,87 @@
1
+ # Distribution name is `praxis-qa` because `praxis` is taken on PyPI
2
+ # (ADR-0020 decision 1). The import package and the CLI command both stay
3
+ # `praxis`: only the published distribution name moves.
4
+ [project]
5
+ name = "praxis-qa"
6
+ version = "0.0.1"
7
+ description = "A shared semantic-memory layer for QA agents. Agents store and maintain knowledge about a system under test (goals, recognition signals, success/failure oracles, alternative paths, risks) decoupled from the procedure used to reach it."
8
+ readme = "README.md"
9
+ requires-python = ">=3.11"
10
+ license = { text = "Apache-2.0" }
11
+ authors = [{ name = "Your Name" }]
12
+ keywords = ["qa", "testing", "agents", "memory", "model-based-testing", "browser-automation"]
13
+
14
+ dependencies = [
15
+ "pydantic>=2", # typed knowledge model + validation against the schema
16
+ "pyyaml>=6", # human-editable knowledge files
17
+ ]
18
+
19
+ [project.optional-dependencies]
20
+ dev = ["pytest>=8", "ruff>=0.5", "mypy>=1.10", "jsonschema>=4"]
21
+ # Adapters are extras so the core stays runtime-agnostic (see ADR-0003).
22
+ browser-use = ["browser-use"]
23
+ # Live regression-recall experiment executor (Anthropic API + HTTP probe tool).
24
+ # Optional - the core + tests run without it; only
25
+ # `experiments/regression_recall/exec_anthropic.py` needs the SDK.
26
+ live = ["anthropic>=0.40"]
27
+ # Docs-only dev tool: the mkdocs-material static-site generator that renders the
28
+ # `docs/` tree into the landing page and docs site (ADR-0025). It is NOT a core
29
+ # dependency and NOT a runtime adapter; the base install stays pydantic + pyyaml
30
+ # only (ADR-0020 decision 3). Install with `pip install -e ".[docs]"`.
31
+ docs = ["mkdocs-material>=9"]
32
+ # Extras for the stagehand / playwright adapters will be declared as they are implemented.
33
+
34
+ [project.scripts]
35
+ praxis = "praxis.cli:main"
36
+
37
+ [build-system]
38
+ requires = ["hatchling"]
39
+ build-backend = "hatchling.build"
40
+
41
+ [tool.hatch.build.targets.wheel]
42
+ packages = ["src/praxis"]
43
+ # Ship the JSON schema as package data inside the wheel (ADR-0020 decision 6).
44
+ # The schema lives once at the repo root (`schema/knowledge.schema.json`) and is
45
+ # force-included into the `praxis` package so an installed wheel resolves the
46
+ # SAME bytes the source tree tests against, with no second copy to drift.
47
+ # `src/praxis/skills/` is a regular sub-package and ships via `packages` above
48
+ # (ADR-0020 decision 7); the non-`.py` SKILL.md files under it are carried by
49
+ # the artifacts rule below.
50
+ [tool.hatch.build.targets.wheel.force-include]
51
+ "schema/knowledge.schema.json" = "praxis/_resources/knowledge.schema.json"
52
+
53
+ [tool.hatch.build]
54
+ # SKILL.md (and any other non-Python skill assets) are not .py files, so name
55
+ # them explicitly as build artifacts to keep them in the wheel and the sdist.
56
+ artifacts = ["src/praxis/skills/**/*.md"]
57
+
58
+ [tool.hatch.build.targets.sdist]
59
+ # The sdist already carries `schema/` from the repo root; keep the skills tree
60
+ # and the schema in it so `pip install` from an sdist matches the wheel.
61
+ include = ["src/praxis", "schema", "README.md", "pyproject.toml"]
62
+ artifacts = ["src/praxis/skills/**/*.md"]
63
+
64
+ [tool.ruff]
65
+ target-version = "py311"
66
+ line-length = 100
67
+
68
+ [tool.ruff.lint.per-file-ignores]
69
+ # The experiment harness must bootstrap sys.path (src layout + hyphenated package
70
+ # dir name) before it can import `praxis`, so its imports are intentionally not at
71
+ # the very top of the file.
72
+ "experiments/ui-mutation/harness.py" = ["E402"]
73
+
74
+ [tool.pytest.ini_options]
75
+ testpaths = ["tests"]
76
+
77
+ [tool.mypy]
78
+ python_version = "3.11"
79
+ packages = ["praxis"]
80
+ mypy_path = "src"
81
+ warn_unused_ignores = true
82
+
83
+ # yaml/jsonschema ship no type stubs (dev-only / optional); browser-use is an
84
+ # optional extra. Ignore missing stubs rather than pulling stub packages into deps.
85
+ [[tool.mypy.overrides]]
86
+ module = ["yaml", "jsonschema", "browser_use"]
87
+ ignore_missing_imports = true
@@ -0,0 +1,58 @@
1
+ # Worked example for ADR-0030: signals as checkable facts with variable slots.
2
+ #
3
+ # The live failure this fixes: this goal has four believed success signals. On
4
+ # a real run the agent confirmed all four facts IN their declared type, but
5
+ # reported them with CONCRETE per-run instance tokens (the real campaign id,
6
+ # the full hostname, the real campaign name) while a free-text seed used
7
+ # ABSTRACT placeholders. Per-type Jaccard fell below 0.5 on three of four, so
8
+ # the genuinely passing goal read UNCERTAIN -> a false REGRESSED.
9
+ #
10
+ # Here three of the four signals carry a structured `value_predicate`: the text
11
+ # OUTSIDE a `{slot}` is the INVARIANT (matched exactly after case-folding and
12
+ # whitespace normalization) and each `{slot}` / `{slot:numeric}` is a per-run
13
+ # instance token the matcher tolerates on presence/shape only. The seed stores
14
+ # only the ABSTRACT slot, never a concrete id (ADR-0017 / ADR-0030 posture).
15
+ # The behavioral signal stays free-text to show the two paths coexist (decision
16
+ # 4): a signal with no `value_predicate` is matched the old Jaccard way.
17
+ schema_version: "0"
18
+ goal_id: create-welcome-popup
19
+ goal: A user can create a welcome popup and land in its editor with the new campaign listed.
20
+
21
+ target:
22
+ app: digioh
23
+ environment: prod
24
+
25
+ success_signals:
26
+ # Free-text (no predicate): matched the legacy Jaccard way (ADR-0028).
27
+ - type: behavioral
28
+ value: a welcome popup is created and appears in the campaign list
29
+ confidence: 1.0
30
+ status: believed
31
+ provenance: { source_type: human, source_id: pablo-seed, last_verified: "2026-06-08T00:00:00Z", observation_count: 1 }
32
+ # Structured: the route prefix is the invariant; the numeric campaign id is
33
+ # the per-run slot. A non-numeric segment is itself a regression.
34
+ - type: url
35
+ value: the editor route for the just-created campaign
36
+ value_predicate: the route matches /Box/Editor/{campaign_id:numeric}
37
+ confidence: 1.0
38
+ status: believed
39
+ provenance: { source_type: human, source_id: pablo-seed, last_verified: "2026-06-08T00:00:00Z", observation_count: 1 }
40
+ # Structured: the substring `Created Campaign` is the invariant; the id varies.
41
+ - type: text
42
+ value: a banner names the created campaign
43
+ value_predicate: a banner whose text contains Created Campaign {campaign_id}
44
+ confidence: 1.0
45
+ status: believed
46
+ provenance: { source_type: human, source_id: pablo-seed, last_verified: "2026-06-08T00:00:00Z", observation_count: 1 }
47
+ # Structured: method, host, path, `returns 2xx`, and the structural phrase
48
+ # `contains a row whose id equals` are hard invariants; only the campaign id is the slot.
49
+ - type: network
50
+ value: the create call returns 2xx and the campaign list shows the new row
51
+ value_predicate: GET account.digioh.com/ returns 2xx and the campaign list contains a row whose id equals {campaign_id}
52
+ confidence: 1.0
53
+ status: believed
54
+ provenance: { source_type: human, source_id: pablo-seed, last_verified: "2026-06-08T00:00:00Z", observation_count: 1 }
55
+
56
+ meta:
57
+ created_at: "2026-06-08T00:00:00Z"
58
+ updated_at: "2026-06-08T00:00:00Z"