PyPI - praxis-qa - Versions diffs - 0.0.1__tar.gz - Mend

praxis-qa 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

praxis_qa-0.0.1/.gitignore +31 -0
praxis_qa-0.0.1/LICENSE +7 -0
praxis_qa-0.0.1/PKG-INFO +79 -0
praxis_qa-0.0.1/README.md +55 -0
praxis_qa-0.0.1/docs/adr/README.md +38 -0
praxis_qa-0.0.1/docs/phase-2-features/README.md +22 -0
praxis_qa-0.0.1/experiments/exploration_reward/README.md +42 -0
praxis_qa-0.0.1/experiments/multi_writer/README.md +50 -0
praxis_qa-0.0.1/experiments/regression_recall_real/README.md +91 -0
praxis_qa-0.0.1/experiments/ui-mutation/README.md +75 -0
praxis_qa-0.0.1/pyproject.toml +87 -0
praxis_qa-0.0.1/schema/examples/create-welcome-popup.knowledge.yaml +58 -0
praxis_qa-0.0.1/schema/examples/login.knowledge.yaml +82 -0
praxis_qa-0.0.1/schema/knowledge.schema.json +177 -0
praxis_qa-0.0.1/src/praxis/__init__.py +19 -0
praxis_qa-0.0.1/src/praxis/adapters/__init__.py +38 -0
praxis_qa-0.0.1/src/praxis/adapters/browser_use.py +244 -0
praxis_qa-0.0.1/src/praxis/adapters/playwright.py +68 -0
praxis_qa-0.0.1/src/praxis/adapters/spi.py +182 -0
praxis_qa-0.0.1/src/praxis/auth_session.py +341 -0
praxis_qa-0.0.1/src/praxis/cli/__init__.py +14 -0
praxis_qa-0.0.1/src/praxis/cli/claude_brain.py +352 -0
praxis_qa-0.0.1/src/praxis/cli/main.py +1010 -0
praxis_qa-0.0.1/src/praxis/merge/__init__.py +53 -0
praxis_qa-0.0.1/src/praxis/merge/candidates.py +232 -0
praxis_qa-0.0.1/src/praxis/merge/decay.py +495 -0
praxis_qa-0.0.1/src/praxis/merge/projection.py +653 -0
praxis_qa-0.0.1/src/praxis/metrics/__init__.py +42 -0
praxis_qa-0.0.1/src/praxis/metrics/exploration_reward.py +297 -0
praxis_qa-0.0.1/src/praxis/model/__init__.py +79 -0
praxis_qa-0.0.1/src/praxis/model/check.py +155 -0
praxis_qa-0.0.1/src/praxis/model/knowledge.py +419 -0
praxis_qa-0.0.1/src/praxis/model/predicate.py +315 -0
praxis_qa-0.0.1/src/praxis/model/trigger_validator.py +99 -0
praxis_qa-0.0.1/src/praxis/oracle/__init__.py +55 -0
praxis_qa-0.0.1/src/praxis/oracle/trust.py +267 -0
praxis_qa-0.0.1/src/praxis/resources.py +133 -0
praxis_qa-0.0.1/src/praxis/runner/__init__.py +101 -0
praxis_qa-0.0.1/src/praxis/runner/_parallel.py +92 -0
praxis_qa-0.0.1/src/praxis/runner/engine.py +315 -0
praxis_qa-0.0.1/src/praxis/runner/exploration.py +334 -0
praxis_qa-0.0.1/src/praxis/runner/prompts.py +214 -0
praxis_qa-0.0.1/src/praxis/runner/regression.py +844 -0
praxis_qa-0.0.1/src/praxis/runner/report.py +644 -0
praxis_qa-0.0.1/src/praxis/secrets.py +242 -0
praxis_qa-0.0.1/src/praxis/skill_driver.py +224 -0
praxis_qa-0.0.1/src/praxis/skills/__init__.py +15 -0
praxis_qa-0.0.1/src/praxis/skills/praxis-explore/SKILL.md +160 -0
praxis_qa-0.0.1/src/praxis/skills/praxis-regress/SKILL.md +175 -0
praxis_qa-0.0.1/src/praxis/skills/praxis-teach/SKILL.md +246 -0
praxis_qa-0.0.1/src/praxis/store/__init__.py +106 -0
praxis_qa-0.0.1/src/praxis/store/agent_identity.py +80 -0
praxis_qa-0.0.1/src/praxis/store/candidate_files.py +191 -0
praxis_qa-0.0.1/src/praxis/store/events.py +239 -0
praxis_qa-0.0.1/src/praxis/store/file_store.py +465 -0
praxis_qa-0.0.1/src/praxis/teach/__init__.py +68 -0
praxis_qa-0.0.1/src/praxis/teach/session.py +1056 -0

praxis_qa-0.0.1/.gitignore ADDED Viewed

@@ -0,0 +1,31 @@
+# Python
+__pycache__/
+*.py[cod]
+.venv/
+venv/
+.env
+*.egg-info/
+dist/
+build/
+.pytest_cache/
+.ruff_cache/
+.mypy_cache/
+# Local stores (never commit live memory)
+*.mneme-store/
+.mneme/
+.praxis/
+local-memory/
+# Secrets + per-machine claude config (never commit API keys / tokens)
+.claude/secrets.env
+.claude/*.local.*
+# mkdocs-material build output (ADR-0025 docs site; regenerable via `mkdocs build`)
+site/
+# Generated experiment outputs (reproducible via harness.py / LOCAL_RUN.md)
+experiments/ui-mutation/results.json
+experiments/ui-mutation/results.md
+experiments/ui-mutation/seed_*.knowledge.yaml
+experiments/ui-mutation/recorded_*.py

praxis_qa-0.0.1/LICENSE ADDED Viewed

@@ -0,0 +1,7 @@
+SPDX-License-Identifier: Apache-2.0
+Recommended license: Apache-2.0 (permissive, patent grant, good for an
+open-core library that seeds adoption while a hosted trust layer is monetized).
+Replace this file with the full Apache-2.0 text before publishing:
+https://www.apache.org/licenses/LICENSE-2.0.txt

praxis_qa-0.0.1/PKG-INFO ADDED Viewed

@@ -0,0 +1,79 @@
+Metadata-Version: 2.4
+Name: praxis-qa
+Version: 0.0.1
+Summary: A shared semantic-memory layer for QA agents. Agents store and maintain knowledge about a system under test (goals, recognition signals, success/failure oracles, alternative paths, risks) decoupled from the procedure used to reach it.
+Author: Your Name
+License: Apache-2.0
+License-File: LICENSE
+Keywords: agents,browser-automation,memory,model-based-testing,qa,testing
+Requires-Python: >=3.11
+Requires-Dist: pydantic>=2
+Requires-Dist: pyyaml>=6
+Provides-Extra: browser-use
+Requires-Dist: browser-use; extra == 'browser-use'
+Provides-Extra: dev
+Requires-Dist: jsonschema>=4; extra == 'dev'
+Requires-Dist: mypy>=1.10; extra == 'dev'
+Requires-Dist: pytest>=8; extra == 'dev'
+Requires-Dist: ruff>=0.5; extra == 'dev'
+Provides-Extra: docs
+Requires-Dist: mkdocs-material>=9; extra == 'docs'
+Provides-Extra: live
+Requires-Dist: anthropic>=0.40; extra == 'live'
+Description-Content-Type: text/markdown
+# Mneme
+> A shared **semantic-memory layer for QA agents.**
+> Codename — rename freely (`praxis` → your brand) in `pyproject.toml` and `src/`.
+Most testing tools store **procedures** (click A, fill B, assert C). Mneme stores
+**knowledge about the system under test** — goals, how to recognize states, what
+success and failure actually look like, which alternative paths exist, and which
+risks lurk — and keeps that knowledge **decoupled from the steps** any single run
+happened to use.
+Agents read the knowledge to attempt a goal, **regenerate their own steps**, and
+write back what they observed. Over time the memory becomes a living model of the
+app, maintained by agents instead of by hand.
+```yaml
+goal: A returning user can establish an authenticated session.
+success_signals:
+  - a logout action becomes available          # behavioral, durable
+  - POST /session returns 2xx + session cookie  # network, durable
+alternative_paths: [email-password, social-oauth]
+known_risks:
+  - captcha (trigger: several consecutive failures)
+  - mfa     (trigger: account has MFA / new device)
+```
+## Why this is not "another test framework"
+This is **model-based testing reborn**: the discipline that failed historically
+because maintaining the model by hand cost more than the tests it replaced.
+The bet here is that **agents can build and maintain the model themselves**,
+which inverts that economics. The procedure is disposable; the knowledge is the asset.
+## What's in this repo
+- `docs/` — the full design: vision, architecture, schema, MVP experiment, risks, roadmap.
+- `docs/adr/` — the load-bearing decisions and why.
+- `schema/` — the language-neutral knowledge schema (JSON Schema) + real examples.
+- `src/praxis/` — package skeleton (model, store, merge, oracle, adapters).
+- `experiments/ui-mutation/` — the one experiment that validates or kills the idea.
+## Start here
+1. `docs/00-product-brief.md` — the one-page pitch.
+2. `AGENTS.md` — how Claude Code should build this (non-negotiables included).
+3. `experiments/ui-mutation/README.md` — build this first.
+## Non-negotiables (the spine of the design)
+1. Store **invariants, not coordinates**.
+2. Every assertion carries **provenance + confidence** (ADR-0004).
+3. The store is **append-only** (ADR-0001) — no overwrite of knowledge.
+4. Core stays **runtime-agnostic**; runtime code lives behind adapters (ADR-0003).
+5. The **oracle is sacred** — a success oracle is believed only via evidence
+   diversity (≥2 different signal types) or a human/spec seed, never by counting
+   agents; the first oracle is seeded (ADR-0005). Silent poisoning is the way this
+   product dies (docs/06).
+License: Apache-2.0 (recommended).

praxis_qa-0.0.1/README.md ADDED Viewed

@@ -0,0 +1,55 @@
+# Mneme
+> A shared **semantic-memory layer for QA agents.**
+> Codename — rename freely (`praxis` → your brand) in `pyproject.toml` and `src/`.
+Most testing tools store **procedures** (click A, fill B, assert C). Mneme stores
+**knowledge about the system under test** — goals, how to recognize states, what
+success and failure actually look like, which alternative paths exist, and which
+risks lurk — and keeps that knowledge **decoupled from the steps** any single run
+happened to use.
+Agents read the knowledge to attempt a goal, **regenerate their own steps**, and
+write back what they observed. Over time the memory becomes a living model of the
+app, maintained by agents instead of by hand.
+```yaml
+goal: A returning user can establish an authenticated session.
+success_signals:
+  - a logout action becomes available          # behavioral, durable
+  - POST /session returns 2xx + session cookie  # network, durable
+alternative_paths: [email-password, social-oauth]
+known_risks:
+  - captcha (trigger: several consecutive failures)
+  - mfa     (trigger: account has MFA / new device)
+```
+## Why this is not "another test framework"
+This is **model-based testing reborn**: the discipline that failed historically
+because maintaining the model by hand cost more than the tests it replaced.
+The bet here is that **agents can build and maintain the model themselves**,
+which inverts that economics. The procedure is disposable; the knowledge is the asset.
+## What's in this repo
+- `docs/` — the full design: vision, architecture, schema, MVP experiment, risks, roadmap.
+- `docs/adr/` — the load-bearing decisions and why.
+- `schema/` — the language-neutral knowledge schema (JSON Schema) + real examples.
+- `src/praxis/` — package skeleton (model, store, merge, oracle, adapters).
+- `experiments/ui-mutation/` — the one experiment that validates or kills the idea.
+## Start here
+1. `docs/00-product-brief.md` — the one-page pitch.
+2. `AGENTS.md` — how Claude Code should build this (non-negotiables included).
+3. `experiments/ui-mutation/README.md` — build this first.
+## Non-negotiables (the spine of the design)
+1. Store **invariants, not coordinates**.
+2. Every assertion carries **provenance + confidence** (ADR-0004).
+3. The store is **append-only** (ADR-0001) — no overwrite of knowledge.
+4. Core stays **runtime-agnostic**; runtime code lives behind adapters (ADR-0003).
+5. The **oracle is sacred** — a success oracle is believed only via evidence
+   diversity (≥2 different signal types) or a human/spec seed, never by counting
+   agents; the first oracle is seeded (ADR-0005). Silent poisoning is the way this
+   product dies (docs/06).
+License: Apache-2.0 (recommended).

praxis_qa-0.0.1/docs/adr/README.md ADDED Viewed

@@ -0,0 +1,38 @@
+# Architecture Decision Records
+Short, immutable records of *why* a decision was made. Add a new numbered file
+per decision; never edit a superseded one (mark it `Superseded by ADR-XXXX`).
+| ADR | Decision |
+|-----|----------|
+| 0001 | Append-only event log is the source of truth |
+| 0002 | The knowledge schema is the neutral interop layer (not a wire protocol) |
+| 0003 | Runtime-specific code lives only behind an adapter SPI |
+| 0004 | Provenance + confidence are mandatory on every assertion |
+| 0005 | Oracle trust by evidence diversity; cold-start via seeded oracles |
+| 0006 | Phase-0 status semantics - "uncorroborated" maps to `contested` |
+| 0007 | Phase-0 existential gate cleared (provisionally) - proceed to Phase 1 |
+| 0008 | Type-diversity needs source-independence (Phase-1 oracle hardening) |
+| 0009 | Phase 1 scope, regression-recall falsifier, and the praxis reframe |
+| 0010 | Phase 1 regression-recall gate cleared (provisionally) - proceed to Phase 1.5 |
+| 0011 | Phase 2 scope: five load-bearing items, schema activations, and Phase 1.5 / Phase 3 deferrals (Accepted) |
+| 0012 | Multi-writer concurrency contract: file-per-event store, source_id = agent_identity, day-one adversarial harness (Accepted) |
+| 0013 | Recency decay as projection-time derivation; status flips emit decay events, anchored by observed_app_version (Accepted) |
+| 0014 | E-mode candidate persistence as sibling CandidateEvent type with the same diversity-or-seed promotion rule (Accepted) |
+| 0015 | Exploration reward pre-registered, observability-only in Phase 2, paired with adversarial Goodhart review and random-walk baseline (Accepted) |
+| 0016 | Real-app SUT selection: pre-registered criteria, Conduit recommended (Saleor fallback), new run dir parallel to Phase 1 (Accepted) |
+| 0017 | Additive auth_state projected field (authenticated + scope), adapter-boundary redaction, no tokens/cookies/PII in knowledge (Accepted) |
+| 0018 | Phase 3 scope and the library-plus-git reframe: no SaaS, git is the shared memory, Claude Code is the local brain (Accepted) |
+| 0019 | Brain pluggability and execution surfaces: deterministic vs agentic, local Claude Code skill vs CI API-key agent, teach is skill-only (Accepted) |
+| 0020 | PyPI packaging and distribution: dist name praxis-qa, one universal wheel, schema and skills as package data, stable public API surface (Accepted) |
+| 0021 | The .praxis/ repository convention: git as shared memory, committed knowledge/candidates, gitignored runs and .praxis.secrets, one file per candidate observation (Accepted) |
+| 0022 | The teach operation as a Claude Code skill: human-in-the-loop seed, typed prompts, credentials never persisted, no silent overwrite of a believed goal (Accepted) |
+| 0023 | praxis regress and explore dual surface: console CLI plus skill, aggregate default, OK/REGRESSED/STALE break-vs-drift report, candidate dedup by trigger (Accepted) |
+| 0024 | CI integration by invoking the console commands: Praxis owns no CI machinery, the team owns push/PR/auth, promotion stays a human merge (Accepted) |
+| 0025 | Landing page and docs site: minimal non-engineer story, no analytics/signup/SaaS funnel, mkdocs from docs/, documented example CI workflow (Accepted) |
+| 0026 | Persistent authenticated-session reuse: reuse the saved browser session so 2FA is not needed every run, session is a secret (local file or CI secret, never knowledge), AUTH-EXPIRED is a third verdict (Proposed) |
+| 0027 | Self-contained console test runner driven by a local claude -p brain (subscription, no API key, headless, pytest-style), plus auth-as-subject vs auth-as-precondition: an auth-subject goal performs a real login, a feature goal reuses the session (Accepted) |
+| 0028 | Regress agent confirms every believed success signal in its declared type: align the prompt with the exact-type matcher, keep the matcher and Jaccard floor unchanged, never let "confirm all" become "tick all", seed only reproducible types (Accepted) |
+| 0029 | Agent self-observations cannot self-certify the oracle: per-summary promotion on its own merit (seeded or genuine different-type different-source corroboration), regress does not persist promotable agent observations, INHERENT seed-rides-single-agent boundary preserved (Accepted) |
+| 0030 | Signals as checkable facts with explicit variable slots: a signal value can be a predicate hard on the invariant and tolerant only on declared per-run instance tokens, matched by evaluating the predicate (no Jaccard), additive over the free-text path, never activating deferred states/paths (Accepted) |
+| 0031 | Signals as structured checks for relational and after-action facts: an optional typed `check` (list_count_delta, element_membership) evaluated programmatically over self-reported before/after observation data, the stricter third tier above value_predicate, agent self-reports the baseline (no runner change), never a false PASS, never activating deferred states/paths (Accepted) |

praxis_qa-0.0.1/docs/phase-2-features/README.md ADDED Viewed

@@ -0,0 +1,22 @@
+# Phase 2 features
+Phase 1 ended with a verdict: the memory arm beat the steelmanned cold-readme baseline by a wide margin on every pre-registered gate, so the operational-knowledge moat survives. The decision was CONTINUE, with caveats. Phase 2 is the follow-up: take the same machinery off the toy app it was tuned against, let multiple agents write into the shared memory at once without poisoning it, age out knowledge that nobody is re-confirming, persist agent hunches across runs so a human can act on them, and add a single hidden number that says whether exploration is paying for its tokens. Five features ship under six ADRs (ADR-0012 through ADR-0017; the real-app feature spans two).
+Each link below points to a feature doc written for non-engineers. The ADRs they reference live under `docs/adr/`.
+## Features
+1. [Multi-writer concurrency](01-multi-writer.md) (ADR-0012). Lets many QA agents append to the same shared memory at once, without losing notes and without letting identical agents fake agreement to promote a bad finding.
+2. [Recency decay](02-recency-decay.md) (ADR-0013). Marks knowledge "stale" when no agent has re-confirmed it within a pre-registered number of app versions or days, and writes a visible audit record every time something ages out.
+3. [Candidate persistence](03-candidate-persistence.md) (ADR-0014). Saves exploring-agent hunches across runs as "contested" so a human can review them, while preventing any single agent from voting itself into the trusted set.
+4. [Exploration reward](04-exploration-reward.md) (ADR-0015). A single hidden number per exploration run that scores useful new knowledge per token spent. The agent never sees it, so it cannot game it.
+5. [Real-app SUT: Conduit + auth_state](05-real-app-sut.md) (ADR-0016, ADR-0017). Moves the experiment off the in-repo toy app and onto Conduit, a public Medium-clone, and adds a small `auth_state` field to the schema that records login posture without ever storing credentials.
+## Reference
+- Phase 1 verdict: [ADR-0010](../adr/0010-phase-1-regression-recall-verdict.md)
+- Phase 2 scope: [ADR-0011](../adr/0011-phase-2-scope-and-deferrals.md)

praxis_qa-0.0.1/experiments/exploration_reward/README.md ADDED Viewed

@@ -0,0 +1,42 @@
+# Exploration reward (observability-only, ADR-0015)
+Phase 2 introduces an explicit exploration incentive so concurrent
+writers do not silently converge on the happy path and shrink coverage
+(`docs/05`, `docs/06`, AGENTS.md Phase 2 brief). This directory hosts
+the pre-registered artifacts the ADR demands BEFORE any Phase 2
+experiment may report the reward number.
+The reward formula is locked verbatim in ADR-0015 sec 1:
+```
+reward = (resolved_uncertainties + alpha * new_unique_candidate_risks) / budget_tokens
+```
+The implementation lives in `src/praxis/metrics/exploration_reward.py`.
+This directory holds:
+- `pre_registration.md` - alpha + resolution criterion + canonicalization
+  rule, sealed under `praxis_git_sha` at run-start.
+- `goodhart_attacks.md` - the >= 8 named attack vectors + mitigations.
+  ADR-0015 sec 4 makes this a hard pre-run gate: no Phase 2 experiment
+  may report this reward until this file exists and lands in the same
+  commit as the reward instrumentation.
+- `metrics.py` - thin wrapper that consumes a run's projection and
+  produces a `RunReward` row. Composes
+  `src/praxis/metrics/exploration_reward.py`; does not re-implement the
+  formula.
+Observability-only contract (ADR-0015 sec 2): the reward does NOT feed
+back into agent state, prompt selection, or budget allocation in Phase 2.
+The instant the reward is visible to the optimizer, the canonicalization
+rule stops being defense and becomes attack surface. The Goodhart
+adversarial review (`goodhart_attacks.md`) exists precisely to surface
+attacks before the engineers iterating on E-mode prompts see the
+numbers and adjust toward them.
+Random-walk baseline (ADR-0015 sec 5): on the first Phase 2 multi-writer
+experiment, a `random_walk` arm runs concurrently with the `memory` arm
+under the same budget on the same SUT. Both arms compute reward via the
+same formula. `random_walk` receives no risks and no uncertainties as
+input. If `memory` does not exceed `random_walk`, the exploration
+incentive has failed and Phase 2 returns to the kill/continue gate.

praxis_qa-0.0.1/experiments/multi_writer/README.md ADDED Viewed

@@ -0,0 +1,50 @@
+# Multi-writer adversarial harness (ADR-0012)
+This experiment is the day-one assurance for the multi-writer concurrency
+contract. It ships in the same commit as the multi-writer file_store
+changes per ADR-0012 section 4 and is wired into `bash verify.sh` so the CI
+gate refuses to merge a regression silently.
+## What each scenario asserts
+| Scenario                   | Property under test                                       |
+|----------------------------|-----------------------------------------------------------|
+| `concurrent_same_source`   | N writers sharing one `agent_identity` race to append. Zero lost events AND zero false-promote: same-source same-type evidence stays `contested` no matter the count. |
+| `concurrent_diverse_source`| Writers across distinct `agent_identity` values bringing different signal types. Zero lost events AND legitimate diversity-or-seed promotion to `believed`. |
+| `racing_contradiction`     | Two distinct sources race on a failure signal with disagreeing `present`. The projection surfaces `contested`, not last-write-wins. |
+| `racing_oscillation`       | Alternating presence across writers produces `quarantined` per ADR-0005, derived from the event set (no flag mutated on the underlying events). |
+| `partial_write_failure`    | A leftover `*.tmp` from a crashed writer (post tmp-write, pre-rename) is ignored by readers; rename is the commit point. |
+## Running
+Direct:
+```
+python experiments/multi_writer/harness.py
+```
+Via the verify gate (recommended):
+```
+bash verify.sh
+```
+The pytest wrapper at `tests/test_multi_writer_harness.py` calls `run_all()`
+so any new scenario added to the harness automatically participates in the
+test suite.
+## Why these scenarios live under `experiments/` and not `tests/`
+ADR-0012 section 4 makes the harness a first-class delivery artifact: ship in
+the same commit, run on every verify, refuse merges that skip it. Mixing
+that with the plain pytest tests under `tests/` would hide the contract.
+The thin pytest wrapper exists so the harness participates in the regular
+test gate, but the scenarios themselves stay here.
+## Cross-tenant scope
+The harness deliberately does NOT include a cross-tenant write scenario
+(ADR-0012 section 3); that lives as a unit test in
+`tests/test_multi_writer.py` because tenancy is a constructor / boundary check, not a contention
+race. Surfacing it in the harness would dilute the "is the contention path
+sound?" question this harness exists to answer.

praxis_qa-0.0.1/experiments/regression_recall_real/README.md ADDED Viewed

@@ -0,0 +1,91 @@
+# Phase 2 regression-recall on a real OSS SUT (Conduit)
+This package is the Phase 2 port of the Phase 1 regression-recall
+experiment off the synthetic `experiments/ui-mutation/testapp.py` and
+onto a real OSS application. The SUT pick (Conduit, the RealWorld
+reference Medium-clone) is sealed by ADR-0016; the additive `auth_state`
+schema field is sealed by ADR-0017.
+Phase 1 sealed artifacts under `experiments/regression_recall/` are NOT
+edited by this port; this directory is parallel and independent.
+## Layout
+```
+experiments/regression_recall_real/
+  __init__.py              # package docstring + goal slate summary
+  README.md                # this file
+  pre_registration.md      # sealed-before-run artifact inventory
+  manifest.json            # SUT identity + goal slate + planted regressions
+  manifest.py              # typed loader for manifest.json
+  knowledge/
+    login.knowledge.yaml
+    publish_article.knowledge.yaml
+    favorite_article.knowledge.yaml
+    follow_user.knowledge.yaml
+    edit_article.knowledge.yaml
+  setup/
+    docker-compose.yml     # backend + frontend; pinned image tags
+    bring_up.sh            # idempotent bring-up; --check / --teardown subcommands
+```
+## Goal slate (ADR-0016 sec 4)
+Five Conduit goals, parallel-but-distinct to the Phase-1 four:
+- `login` parallels Phase-1 `login`.
+- `publish_article` parallels Phase-1 `checkout` (multi-step mutating flow).
+- `favorite_article` parallels Phase-1 `checkout` idempotency.
+- `follow_user` parallels Phase-1 `admin_access` (mutating flow with
+  authentication precondition + a knowledge-visible self-follow trap).
+- `edit_article` parallels Phase-1 `admin_access` directly (auth-scope
+  check: only the article author may edit).
+Each goal's knowledge file activates the Phase-2 additive `auth_state`
+projection (ADR-0017): the agent records that a successful goal leaves
+the session authenticated at `user` scope.
+## Bring-up
+```
+bash experiments/regression_recall_real/setup/bring_up.sh
+```
+ADR-0016 sec 1 caps cold-cache bring-up at 30 minutes wall time on a
+developer laptop. The `--check` subcommand verifies an already-running
+stack idempotently; `--teardown` removes it.
+The slow bring-up test (`tests/test_conduit_bringup.py`) is GATED behind
+the env var `PRAXIS_RUN_CONDUIT_BRINGUP=1` so `bash verify.sh` stays fast
+by default. To execute the C1 gate explicitly:
+```
+PRAXIS_RUN_CONDUIT_BRINGUP=1 python -m pytest tests/test_conduit_bringup.py -q
+```
+## Phase-2 schema delta: `auth_state`
+ADR-0017 adds `auth_state: {authenticated: bool, scope: string|null}`
+as a projected field on the per-goal knowledge surface. The field is
+defined in `schema/knowledge.schema.json` and mirrored in
+`src/praxis/model/knowledge.py`; the agreement test in
+`tests/test_model_schema_agree.py` catches drift.
+The field MUST NOT carry tokens, cookies, user/account/session
+identifiers, JWT contents, emails, or tenant/org/workspace scoping
+(ADR-0017 sec 2). The validator in the pydantic model rejects these on
+write; the adapter-boundary check
+(`praxis.adapters.assert_auth_state_observation_safe`) catches
+forbidden field names slipping through textual redaction.
+## What this package does NOT include yet
+Per the Phase-2 plan, the following land as separate ADRs / commits:
+- Multi-writer adversarial harness (ADR-0012).
+- Recency-decay projection (ADR-0013).
+- E-mode candidate persistence as a sibling event type (ADR-0014).
+- Exploration-reward observability (ADR-0015).
+Each is its own feature with its own implementation; this package is
+the SUT + schema slice (ADR-0016 + ADR-0017) only.

praxis_qa-0.0.1/experiments/ui-mutation/README.md ADDED Viewed

@@ -0,0 +1,75 @@
+# Experiment: the decisive UI-mutation test
+KILL or VALIDATE the thesis before building the product. Build this FIRST.
+## Claim
+Goal+knowledge step regeneration is (1) cheaper / more reliable than a COLD
+agent, and (2) more robust than a RECORDED script when the UI changes.
+## Arms
+- `memory` — agent reads believed knowledge and regenerates its own steps.
+- `cold_agent` — same agent, no memory, figures it out each run.
+- `recorded_script` — a Playwright script captured once (the brittle baseline).
+## Setup
+- Runtime: Browser Use. One writer. Flows: login, search, checkout.
+- Minimal Phase-0 schema (`schema/knowledge.schema.json`).
+- **Seed each goal's success oracle (human/spec) before exploring** (ADR-0005).
+- Memory run 1: explore, populate knowledge. Run 2+: achieve the goal USING
+  knowledge, regenerating steps.
+## Measure (ORDER MATTERS)
+1. **Existential gate — `memory` vs `cold_agent`:** tokens + wall time + success
+   rate, no mutation. If cold wins or ties on cost at equal reliability, STOP.
+2. **Robustness — after a mutation (`mutate.py`):** `memory` vs `recorded_script`
+   recovery rate.
+3. **Guardrail — oracle correctness:** false-pass / false-fail across all runs.
+## Files
+- `metrics.py` — `RunResult` + the gates + report writer.
+- `harness.py` — runs the arms; checks the existential gate FIRST and short-circuits.
+- `mutate.py` — UI mutation injector (rename control, move field, swap
+  email→username, insert step). Each mutation changes HOW, never WHETHER.
+## Kill criterion
+Stop unless `memory` clears all three: cheaper-or-equal vs cold at equal
+reliability, more robust vs the recorded script, and oracle false-pass below
+brittle-test levels.
+## Files (implemented)
+- `metrics.py`   — `RunResult` + the three gates + verdict + report writers.
+- `harness.py`   — runs the arms, checks the existential gate FIRST, short-circuits.
+- `mutate.py`    — the four UI mutations (rename / move / swap / insert).
+- `simapp.py`    — a deterministic, in-process stand-in for the SUT + the three arms.
+- `runtimes.py`  — wires the arms through the REAL core (store→merge→oracle→adapter).
+## How to run
+```bash
+python experiments/ui-mutation/harness.py     # prints the three gates + verdict
+pytest tests/test_experiment_harness.py       # asserts the gate machinery
+```
+Outputs `results.json` (per-run) and `results.md` (summary).
+## ⚠️ What these numbers are (and are NOT)
+A *live* existential gate needs Browser Use + an LLM + a real SUT — none of which
+run in CI/sandbox. So the harness runs against `simapp`, a deterministic stand-in.
+Its token magnitudes are **explicit assumptions encoding the thesis premise**
+(recognition via a remembered oracle is cheaper than re-deriving it cold), **not
+measurements**. The sim therefore validates the *machinery* — the gate ordering,
+the metrics, the diversity-or-seed oracle, the mutation flow, and the kill/continue
+logic — end-to-end. It does **not** by itself validate the thesis.
+What IS real in the sim path: the append-only store, the believed projection, and
+the oracle's diversity-or-seed rule (the actual production code). What is modeled:
+the SUT and the agent's token cost.
+## Wiring the live arm (to get empirical numbers)
+1. `pip install -e ".[dev,browser-use]"` and set an LLM key.
+2. Stand up a real test app (or a hosted target) with the login/search/checkout
+   flows; implement `mutate.apply/reset` against it (proxy / DOM patch / feature
+   flag) instead of the in-process `simapp` state.
+3. Replace `simapp.run_memory/run_cold/run_recorded` with calls that drive
+   `BrowserUseAdapter` (memory/cold) and an emitted Playwright script
+   (`adapters.playwright.RecordedScript`, recorded baseline), measuring real tokens
+   and wall time.
+4. Re-run `harness.py`. The gate/verdict logic is unchanged; only the runtime swaps.

praxis_qa-0.0.1/pyproject.toml ADDED Viewed

@@ -0,0 +1,87 @@
+# Distribution name is `praxis-qa` because `praxis` is taken on PyPI
+# (ADR-0020 decision 1). The import package and the CLI command both stay
+# `praxis`: only the published distribution name moves.
+[project]
+name = "praxis-qa"
+version = "0.0.1"
+description = "A shared semantic-memory layer for QA agents. Agents store and maintain knowledge about a system under test (goals, recognition signals, success/failure oracles, alternative paths, risks) decoupled from the procedure used to reach it."
+readme = "README.md"
+requires-python = ">=3.11"
+license = { text = "Apache-2.0" }
+authors = [{ name = "Your Name" }]
+keywords = ["qa", "testing", "agents", "memory", "model-based-testing", "browser-automation"]
+dependencies = [
+    "pydantic>=2",     # typed knowledge model + validation against the schema
+    "pyyaml>=6",       # human-editable knowledge files
+]
+[project.optional-dependencies]
+dev = ["pytest>=8", "ruff>=0.5", "mypy>=1.10", "jsonschema>=4"]
+# Adapters are extras so the core stays runtime-agnostic (see ADR-0003).
+browser-use = ["browser-use"]
+# Live regression-recall experiment executor (Anthropic API + HTTP probe tool).
+# Optional: the core and the tests run without it; only
+# `experiments/regression_recall/exec_anthropic.py` needs the SDK.
+live = ["anthropic>=0.40"]
+# Docs-only dev tool: the mkdocs-material static-site generator that renders the
+# `docs/` tree into the landing page and docs site (ADR-0025). It is NOT a core
+# dependency and NOT a runtime adapter; the base install stays pydantic + pyyaml
+# only (ADR-0020 decision 3). Install with `pip install -e ".[docs]"`.
+docs = ["mkdocs-material>=9"]
+# Extras for the stagehand / playwright adapters are added here as those adapters are implemented.
+[project.scripts]
+praxis = "praxis.cli:main"
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[tool.hatch.build.targets.wheel]
+packages = ["src/praxis"]
+# Ship the JSON schema as package data inside the wheel (ADR-0020 decision 6).
+# The schema lives once at the repo root (`schema/knowledge.schema.json`) and is
+# force-included into the `praxis` package so an installed wheel resolves the
+# SAME bytes the source tree tests against, with no second copy to drift.
+# `src/praxis/skills/` is a regular sub-package and ships via `packages` above
+# (ADR-0020 decision 7); the non-`.py` SKILL.md files under it are carried by
+# the artifacts rule below.
+[tool.hatch.build.targets.wheel.force-include]
+"schema/knowledge.schema.json" = "praxis/_resources/knowledge.schema.json"
+[tool.hatch.build]
+# SKILL.md (and any other non-Python skill assets) are not .py files, so name
+# them explicitly as build artifacts to keep them in the wheel and the sdist.
+artifacts = ["src/praxis/skills/**/*.md"]
+[tool.hatch.build.targets.sdist]
+# List the package tree and the repo-root `schema/` explicitly so the sdist carries
+# both the skills tree and the schema, and `pip install` from an sdist matches the wheel.
+include = ["src/praxis", "schema", "README.md", "pyproject.toml"]
+artifacts = ["src/praxis/skills/**/*.md"]
+[tool.ruff]
+target-version = "py311"
+line-length = 100
+[tool.ruff.lint.per-file-ignores]
+# The experiment harness must bootstrap sys.path (src layout + hyphenated package
+# dir name) before it can import `praxis`, so its imports are intentionally not at
+# the very top of the file.
+"experiments/ui-mutation/harness.py" = ["E402"]
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+[tool.mypy]
+python_version = "3.11"
+packages = ["praxis"]
+mypy_path = "src"
+warn_unused_ignores = true
+# yaml (a core dependency) and jsonschema (dev-only) ship no type stubs; browser-use
+# is an optional extra. Ignore missing stubs rather than pulling stub packages into deps.
+[[tool.mypy.overrides]]
+module = ["yaml", "jsonschema", "browser_use"]
+ignore_missing_imports = true

praxis_qa-0.0.1/schema/examples/create-welcome-popup.knowledge.yaml ADDED Viewed

@@ -0,0 +1,58 @@
+# Worked example for ADR-0030: signals as checkable facts with variable slots.
+#
+# The live failure this fixes: this goal has four believed success signals. On
+# a real run the agent confirmed all four facts IN their declared type, but
+# reported them with CONCRETE per-run instance tokens (the real campaign id,
+# the full hostname, the real campaign name) while a free-text seed used
+# ABSTRACT placeholders. Per-type Jaccard fell below 0.5 on three of four, so
+# the genuinely passing goal read UNCERTAIN -> a false REGRESSED.
+#
+# Here three of the four signals carry a structured `value_predicate`: the text
+# OUTSIDE a `{slot}` is the INVARIANT (matched exactly, case-folded +
+# whitespace-normalized) and each `{slot}` / `{slot:numeric}` is a per-run
+# instance token the matcher tolerates on presence/shape only. The seed stores
+# only the ABSTRACT slot, never a concrete id (ADR-0017 / ADR-0030 posture).
+# The behavioral signal stays free-text to show the two paths coexist (decision
+# 4): a signal with no `value_predicate` is matched the old Jaccard way.
+schema_version: "0"
+goal_id: create-welcome-popup
+goal: A user can create a welcome popup and land in its editor with the new campaign listed.
+target:
+  app: digioh
+  environment: prod
+success_signals:
+  # Free-text (no predicate): matched the legacy Jaccard way (ADR-0028).
+  - type: behavioral
+    value: a welcome popup is created and appears in the campaign list
+    confidence: 1.0
+    status: believed
+    provenance: { source_type: human, source_id: pablo-seed, last_verified: "2026-06-08T00:00:00Z", observation_count: 1 }
+  # Structured: the route prefix is the invariant; the numeric campaign id is
+  # the per-run slot. A non-numeric segment is itself a regression.
+  - type: url
+    value: the editor route for the just-created campaign
+    value_predicate: the route matches /Box/Editor/{campaign_id:numeric}
+    confidence: 1.0
+    status: believed
+    provenance: { source_type: human, source_id: pablo-seed, last_verified: "2026-06-08T00:00:00Z", observation_count: 1 }
+  # Structured: the substring `Created Campaign` is the invariant; the id varies.
+  - type: text
+    value: a banner names the created campaign
+    value_predicate: a banner whose text contains Created Campaign {campaign_id}
+    confidence: 1.0
+    status: believed
+    provenance: { source_type: human, source_id: pablo-seed, last_verified: "2026-06-08T00:00:00Z", observation_count: 1 }
+  # Structured: method, host, path, `returns 2xx`, and the structural phrase
+  # `contains a row whose id equals` are hard invariants; only the campaign id is the slot.
+  - type: network
+    value: the create call returns 2xx and the campaign list shows the new row
+    value_predicate: GET account.digioh.com/ returns 2xx and the campaign list contains a row whose id equals {campaign_id}
+    confidence: 1.0
+    status: believed
+    provenance: { source_type: human, source_id: pablo-seed, last_verified: "2026-06-08T00:00:00Z", observation_count: 1 }
+meta:
+  created_at: "2026-06-08T00:00:00Z"
+  updated_at: "2026-06-08T00:00:00Z"