cognit 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognit-0.1.0/.github/workflows/ci.yml +43 -0
- cognit-0.1.0/.github/workflows/release.yml +27 -0
- cognit-0.1.0/.gitignore +34 -0
- cognit-0.1.0/.python-version +1 -0
- cognit-0.1.0/CHANGELOG.md +55 -0
- cognit-0.1.0/INTENTS.md +262 -0
- cognit-0.1.0/LICENSE +21 -0
- cognit-0.1.0/PKG-INFO +13 -0
- cognit-0.1.0/PLAN-V1.md +2787 -0
- cognit-0.1.0/README.md +189 -0
- cognit-0.1.0/UI-REDESIGN-PLAN.md +1493 -0
- cognit-0.1.0/UI-REDESIGN.md +180 -0
- cognit-0.1.0/docs/img/cognit-questions.png +0 -0
- cognit-0.1.0/docs/img/cognit-results.png +0 -0
- cognit-0.1.0/docs/superpowers/plans/2026-05-22-claude-agent-sdk-engine.md +959 -0
- cognit-0.1.0/docs/superpowers/specs/2026-05-22-claude-agent-sdk-engine-design.md +186 -0
- cognit-0.1.0/docs/superpowers/specs/2026-05-23-agentic-outline-generation-design.md +78 -0
- cognit-0.1.0/pyproject.toml +48 -0
- cognit-0.1.0/scripts/dev_generate.py +30 -0
- cognit-0.1.0/scripts/screenshot.py +161 -0
- cognit-0.1.0/src/cognit/__init__.py +1 -0
- cognit-0.1.0/src/cognit/__main__.py +2 -0
- cognit-0.1.0/src/cognit/cli/__init__.py +36 -0
- cognit-0.1.0/src/cognit/cli/take.py +246 -0
- cognit-0.1.0/src/cognit/cli/version.py +1 -0
- cognit-0.1.0/src/cognit/comment/__init__.py +0 -0
- cognit-0.1.0/src/cognit/comment/parse.py +51 -0
- cognit-0.1.0/src/cognit/comment/render.py +123 -0
- cognit-0.1.0/src/cognit/engine/__init__.py +0 -0
- cognit-0.1.0/src/cognit/engine/_mermaid_docker/Dockerfile +20 -0
- cognit-0.1.0/src/cognit/engine/_mermaid_docker/README.md +28 -0
- cognit-0.1.0/src/cognit/engine/_mermaid_docker/__init__.py +0 -0
- cognit-0.1.0/src/cognit/engine/_mermaid_docker/validate.mjs +32 -0
- cognit-0.1.0/src/cognit/engine/generate.py +147 -0
- cognit-0.1.0/src/cognit/engine/grade.py +53 -0
- cognit-0.1.0/src/cognit/engine/llm.py +26 -0
- cognit-0.1.0/src/cognit/engine/llm_anthropic.py +268 -0
- cognit-0.1.0/src/cognit/engine/llm_claude_agent.py +216 -0
- cognit-0.1.0/src/cognit/engine/llm_fake.py +52 -0
- cognit-0.1.0/src/cognit/engine/mermaid.py +243 -0
- cognit-0.1.0/src/cognit/engine/models.py +138 -0
- cognit-0.1.0/src/cognit/engine/prompts/__init__.py +0 -0
- cognit-0.1.0/src/cognit/engine/prompts/generate.txt +15 -0
- cognit-0.1.0/src/cognit/engine/prompts/grade_open.txt +13 -0
- cognit-0.1.0/src/cognit/engine/prompts/mermaid.txt +15 -0
- cognit-0.1.0/src/cognit/engine/prompts/system_generate.txt +39 -0
- cognit-0.1.0/src/cognit/engine/prompts/system_grade.txt +29 -0
- cognit-0.1.0/src/cognit/engine/prompts/system_mermaid.txt +32 -0
- cognit-0.1.0/src/cognit/ghio/__init__.py +0 -0
- cognit-0.1.0/src/cognit/ghio/diff.py +78 -0
- cognit-0.1.0/src/cognit/ghio/pr.py +86 -0
- cognit-0.1.0/src/cognit/py.typed +0 -0
- cognit-0.1.0/src/cognit/server/__init__.py +0 -0
- cognit-0.1.0/src/cognit/server/app.py +106 -0
- cognit-0.1.0/src/cognit/server/assets/index.html +50 -0
- cognit-0.1.0/src/cognit/server/assets/mermaid.min.js +2024 -0
- cognit-0.1.0/src/cognit/server/assets/quiz.js +593 -0
- cognit-0.1.0/src/cognit/server/assets/styles.css +739 -0
- cognit-0.1.0/tests/__init__.py +0 -0
- cognit-0.1.0/tests/cli/__init__.py +0 -0
- cognit-0.1.0/tests/cli/test_root.py +15 -0
- cognit-0.1.0/tests/cli/test_take.py +262 -0
- cognit-0.1.0/tests/cli/test_take_select.py +26 -0
- cognit-0.1.0/tests/comment/__init__.py +0 -0
- cognit-0.1.0/tests/comment/test_render.py +36 -0
- cognit-0.1.0/tests/comment/test_roundtrip.py +59 -0
- cognit-0.1.0/tests/conftest.py +102 -0
- cognit-0.1.0/tests/engine/__init__.py +0 -0
- cognit-0.1.0/tests/engine/test_generate.py +235 -0
- cognit-0.1.0/tests/engine/test_grade.py +57 -0
- cognit-0.1.0/tests/engine/test_llm_anthropic.py +312 -0
- cognit-0.1.0/tests/engine/test_llm_claude_agent.py +255 -0
- cognit-0.1.0/tests/engine/test_llm_fake.py +48 -0
- cognit-0.1.0/tests/engine/test_mermaid.py +219 -0
- cognit-0.1.0/tests/engine/test_models.py +90 -0
- cognit-0.1.0/tests/fixtures/diffs/small_refactor.patch +12 -0
- cognit-0.1.0/tests/ghio/__init__.py +0 -0
- cognit-0.1.0/tests/ghio/test_diff.py +47 -0
- cognit-0.1.0/tests/ghio/test_pr.py +66 -0
- cognit-0.1.0/tests/server/__init__.py +0 -0
- cognit-0.1.0/tests/server/test_app.py +306 -0
- cognit-0.1.0/tests/server/test_submit_with_claude_agent.py +82 -0
- cognit-0.1.0/tests/server/test_ui_flow.py +163 -0
- cognit-0.1.0/tests/test_smoke.py +5 -0
- cognit-0.1.0/uv.lock +1290 -0
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
name: ci
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
test:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v4
|
|
13
|
+
- uses: astral-sh/setup-uv@v3
|
|
14
|
+
with:
|
|
15
|
+
enable-cache: true
|
|
16
|
+
- name: Install dependencies
|
|
17
|
+
run: uv sync --all-extras --dev
|
|
18
|
+
- name: Lint
|
|
19
|
+
run: uv run ruff check
|
|
20
|
+
- name: Format check
|
|
21
|
+
run: uv run ruff format --check
|
|
22
|
+
- name: Type check
|
|
23
|
+
run: uv run mypy src/
|
|
24
|
+
- name: Install Playwright browser
|
|
25
|
+
run: uv run playwright install --with-deps chromium
|
|
26
|
+
- name: Test
|
|
27
|
+
run: uv run pytest -v
|
|
28
|
+
|
|
29
|
+
cli-smoke:
|
|
30
|
+
runs-on: ubuntu-latest
|
|
31
|
+
needs: test
|
|
32
|
+
steps:
|
|
33
|
+
- uses: actions/checkout@v4
|
|
34
|
+
- uses: astral-sh/setup-uv@v3
|
|
35
|
+
- name: Install mermaid-cli
|
|
36
|
+
run: npm install -g @mermaid-js/mermaid-cli@10
|
|
37
|
+
- name: Install cognit from source
|
|
38
|
+
run: uv tool install --from . cognit
|
|
39
|
+
- name: Verify CLI works
|
|
40
|
+
run: |
|
|
41
|
+
cognit --version
|
|
42
|
+
cognit --help
|
|
43
|
+
cognit take --help
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
name: release
|
|
2
|
+
on:
|
|
3
|
+
push:
|
|
4
|
+
tags: ["v*"]
|
|
5
|
+
|
|
6
|
+
permissions:
|
|
7
|
+
contents: write
|
|
8
|
+
id-token: write # PyPI trusted publishing
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
build-and-publish:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
environment: pypi
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
- uses: astral-sh/setup-uv@v3
|
|
17
|
+
- name: Install dependencies
|
|
18
|
+
run: uv sync
|
|
19
|
+
- name: Build distributions
|
|
20
|
+
run: uv build
|
|
21
|
+
- name: Publish to PyPI
|
|
22
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
23
|
+
- name: Create GitHub Release
|
|
24
|
+
uses: softprops/action-gh-release@v2
|
|
25
|
+
with:
|
|
26
|
+
files: dist/*
|
|
27
|
+
generate_release_notes: true
|
cognit-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Python-generated files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[oc]
|
|
4
|
+
*.pyc
|
|
5
|
+
build/
|
|
6
|
+
dist/
|
|
7
|
+
wheels/
|
|
8
|
+
*.egg-info
|
|
9
|
+
*.egg-info/
|
|
10
|
+
|
|
11
|
+
# Virtual environments
|
|
12
|
+
.venv/
|
|
13
|
+
|
|
14
|
+
# Tool caches
|
|
15
|
+
.ruff_cache/
|
|
16
|
+
.mypy_cache/
|
|
17
|
+
.pytest_cache/
|
|
18
|
+
|
|
19
|
+
# IDE and OS files
|
|
20
|
+
.idea/
|
|
21
|
+
.vscode/
|
|
22
|
+
*.swp
|
|
23
|
+
*.swo
|
|
24
|
+
*~
|
|
25
|
+
.DS_Store
|
|
26
|
+
|
|
27
|
+
# Environment files
|
|
28
|
+
.env
|
|
29
|
+
|
|
30
|
+
# Claude Code per-project state (sessions, agent runs, settings)
|
|
31
|
+
.claude/
|
|
32
|
+
|
|
33
|
+
# Node tooling (mermaid-cli installs here when used locally without -g)
|
|
34
|
+
node_modules/
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
4
|
+
|
|
5
|
+
## [Unreleased]
|
|
6
|
+
|
|
7
|
+
### Changed
|
|
8
|
+
- **Renamed `quizz` → `cognit`.** Package, CLI command, env var (`QUIZZ_LOG_LEVEL` → `COGNIT_LOG_LEVEL`), PR comment markers, cache dir (`$TMPDIR/quizz/` → `$TMPDIR/cognit/`), and MCP server name all move to `cognit`. The generic noun "quiz" (question artifact, `Quiz` model, `quiz.js`) is unchanged. Pre-1.0, nothing was published under the old name.
|
|
9
|
+
- **In-memory-only quiz storage.** `cognit take` no longer posts the quiz as a PR comment. The quiz is generated in memory and cached locally at `$TMPDIR/cognit/<sha1(pr_url)[:16]>.json`. Re-running `cognit take` against the same PR (e.g. after closing the browser) reuses the cached quiz instead of regenerating, so a closed-tab recovery doesn't pay another LLM bill. The PR thread now carries **at most one comment per take session**, only if the author clicks Publish — and that comment is self-contained (question prompts + author answers inlined via `render_results_inlined`). Reviewers no longer see an answer key in plaintext on the PR.
|
|
10
|
+
- `/publish` now requires a prior `/submit` (returns 400 otherwise) because the inlined comment needs the cached answers.
|
|
11
|
+
- Older PRs may still have legacy `<!-- cognit:quiz v1 -->` comments; they're dormant and harmless. New runs ignore them.
|
|
12
|
+
- **BREAKING — CLI collapsed to a single `cognit take` command.** `take` now auto-generates the quiz if none is cached yet (calls the LLM with the diff; per "In-memory-only quiz storage" above, nothing is posted to the PR at this stage), then opens the browser and grades in-session as before. The author runs one command instead of three. New flags on `take`: `--min-diff-lines` (default 50) and `--max-diff-lines` (default 2000), inherited from the old `cognit generate`. The engine layer (`engine/generate.py`, `engine/grade.py`) is unchanged — a future webhook or GitHub App can still call it directly.
|
|
13
|
+
- **UI redesign**: `cognit take` now uses a GitHub-native design, replacing the editorial (paper/serif/narrow) UI. Spec in `UI-REDESIGN.md`.
|
|
14
|
+
|
|
15
|
+
### Added
|
|
16
|
+
- `/publish` endpoint now returns `comment_url` so the UI can deep-link to the posted comment.
|
|
17
|
+
- Playwright integration tests in `tests/server/test_ui_flow.py` driving the question → results → published flow.
|
|
18
|
+
|
|
19
|
+
### Fixed
|
|
20
|
+
- **Quiz generation no longer ships giant vendored files into the prompt.** `fetch_diff_and_files` now skips a denylist of vendored/minified/lockfile/binary paths (`*.min.js`, `*.min.css`, `*.lock`, `package-lock.json`, `pnpm-lock.yaml`, images, fonts, PDFs) when inlining changed-file contents. The diff still lists them as touched, but their full text is omitted — fixing context-window blowups on PRs that touch large files like the 3.2 MB vendored `mermaid.min.js`. (A fuller fix — having the agent fetch only what it needs — is designed in `docs/superpowers/specs/2026-05-23-agentic-outline-generation-design.md`.)
|
|
21
|
+
- **Mermaid validator no longer silently skips when `mmdc` is missing.** Validation now layers: native `mmdc` if on PATH → dockerised parse-only image (lazily built from `src/cognit/engine/_mermaid_docker/` on first use, ~200 MB without Chromium) → Python regex backstop. The backstop catches the common LLM failure modes (missing diagram header, grossly unbalanced brackets, **`[/text]` parallelogram-shape traps** when labels contain URL-like paths — the bug that surfaced in the PR #4 smoke against this very repo). The artisan system prompt also explicitly forbids leading `/` or `\` in node labels now, so generation retries clean output instead of relying on validation to catch it.
|
|
22
|
+
- **Debug logging via `COGNIT_LOG_LEVEL`.** Surfaces which validator layer is being used per diagram, cache hits/misses, docker image build state, etc. Default level is WARNING (quiet); set `COGNIT_LOG_LEVEL=DEBUG` to trace internal decisions.
|
|
23
|
+
- **OAuth token rotation no longer 401s mid-session.** `AnthropicLLM` previously cached the Claude Code OAuth token at `__init__` and never re-read `~/.claude/.credentials.json`. A long-running `cognit take` (auto-generate → user answers → submit) would 401 on the grading call when `claude` rotated the token in between. The client now retries once on `AuthenticationError` with a freshly-read token, recovering automatically. API-key auth is unaffected — a 401 there is a real configuration problem and still bubbles up immediately.
|
|
24
|
+
- XSS hardening: quiz JSON injected into inline `<script>` is `</`-escaped; PR URL substituted into `href=` attributes and the JS global is properly HTML/JSON escaped.
|
|
25
|
+
- Mermaid `securityLevel` changed from `"loose"` to `"strict"` — prevents HTML rendering inside LLM-generated node labels.
|
|
26
|
+
- Inline backtick-code rendering in prompts (regression from the rewrite).
|
|
27
|
+
- Submit button is disabled until all questions are answered.
|
|
28
|
+
- Keyboard navigation + ARIA on MCQ / TF / Diagram options (a11y regression from the rewrite).
|
|
29
|
+
|
|
30
|
+
### Removed
|
|
31
|
+
- **BREAKING — `cognit generate` and `cognit grade` CLI commands.** Their behaviour is absorbed into `cognit take`: generation runs automatically when no cached quiz exists for the PR; grading runs in-session via the local FastAPI server. Tests for these CLI surfaces were deleted; engine-level tests for generation and grading remain intact.
|
|
32
|
+
- **GitHub Actions wrappers** (`actions/cognit-generate`, `actions/cognit-grade`) and the matching `.github/examples/` workflows. They were prototyped end-to-end but hit two compounding issues with GitHub Models (strict schema rejection + malformed free-form output from `gpt-4o-mini`). v1 ships as **local CLI only**; the auto-trigger Action ambition is now **dropped, not deferred** — the CLI no longer exposes a separate `generate` entrypoint to wrap.
|
|
33
|
+
|
|
34
|
+
## [0.1.0] — 2026-05-17
|
|
35
|
+
|
|
36
|
+
### Added
|
|
37
|
+
|
|
38
|
+
- **`cognit generate` CLI**: reads diff via `gh pr diff`, calls an LLM (Anthropic via tool use by default; GitHub Models as alternate), validates mermaid diagrams via `mmdc --parse`, posts a quiz comment to the PR.
|
|
39
|
+
- **`cognit take` CLI**: auto-detects the PR for the current branch, opens a local FastAPI server with a polished browser UI (mermaid diagrams rendered client-side via `mermaid.js` UMD bundle), posts an answers comment back to the PR, and polls for the results comment.
|
|
40
|
+
- **`cognit grade` CLI**: locates quiz + answers comments, LLM-grades the open question, posts a results comment with per-question feedback.
|
|
41
|
+
- **Four question types**: multiple choice, mermaid-diagram selection (auto-relabeled to neutral A/B/C/D to prevent answer leak), open (LLM-graded), true/false.
|
|
42
|
+
- **Anthropic LLM adapter** via tool use (guaranteed-schema output). Auth resolution: explicit `api_key` → `ANTHROPIC_API_KEY` env var → Claude Code OAuth at `~/.claude/.credentials.json`.
|
|
43
|
+
- **GitHub Models LLM adapter** (OpenAI-compatible) as an alternate provider.
|
|
44
|
+
- **Engine module** (`cognit.engine`): GitHub-agnostic schema + generation + grading logic.
|
|
45
|
+
- **Comment serialization** (`cognit.comment`): lossless markdown ↔ Pydantic roundtrip with embedded JSON state.
|
|
46
|
+
- Mermaid validation pipeline: `mmdc --parse` with up-to-2 retries; drop mermaid Q + add replacement MCQ on terminal failure.
|
|
47
|
+
- PR-level escape hatch: `quiz: skip` in PR body suppresses generation.
|
|
48
|
+
|
|
49
|
+
### Known limitations
|
|
50
|
+
|
|
51
|
+
- Fork PRs not supported in v1 (`gh pr comment` requires write access).
|
|
52
|
+
- Single LLM provider per invocation — fleet-of-LLMs deferred to v2.
|
|
53
|
+
- No team-specific knowledge injection (Skills) — deferred to v2.
|
|
54
|
+
- No GitHub App / Marketplace App — deferred to v2.
|
|
55
|
+
- No CI auto-trigger — local CLI only. See "Removed" above.
|
cognit-0.1.0/INTENTS.md
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
# PR Author Quiz — Design
|
|
2
|
+
|
|
3
|
+
A voluntary tool that quizzes the **PR author** on their own pull request to surface the gap between what they think the code does and what it actually does — before they merge.
|
|
4
|
+
|
|
5
|
+
## The problem
|
|
6
|
+
|
|
7
|
+
Developers increasingly rely on AI tools to write and review code, which creates a comprehension gap — code gets merged that nobody on the team fully understands. The risk isn't bad code per se; it's **false confidence in code that looks reasonable but does something subtly different from what the developer expects**.
|
|
8
|
+
|
|
9
|
+
## The philosophy
|
|
10
|
+
|
|
11
|
+
- **Opt-in, not enforced.** Like CI checks, linters, or pre-commit hooks. Developers choose to enable it because it makes them better and protects them from their own blind spots. It assumes good intent.
|
|
12
|
+
- **Failing the quiz doesn't block the merge.** The author can ignore it. The goal is to surface the gap between their mental model and the code's actual behavior — so they don't merge code that doesn't match what they think it does.
|
|
13
|
+
- **The quiz is the diagnostic; the explanation is the medicine.** The "aha" moment when a developer answers wrong and realizes the code does something they didn't expect is the entire point.
|
|
14
|
+
- **North star: maximize the utility of human attention.** Let LLMs do the heavy lifting of probing understanding so the limited human time spent on a PR is spent on what genuinely needs a human mind.
|
|
15
|
+
|
|
16
|
+
## Why this exists (vs. what's out there)
|
|
17
|
+
|
|
18
|
+
- Existing comparable tools (`dkamm/pr-quiz`, Gater) target the **reviewer**, not the author. Reviewer-side gating is downstream of the real problem: people open PRs they don't fully understand, especially when AI wrote most of the code.
|
|
19
|
+
- A teaching loop is more valuable than a pass/fail gate.
|
|
20
|
+
- Voluntary use removes the entire blocking/override/branch-protection ceremony. The author opts in by running the CLI. If they don't, the PR still merges — the cost of skipping is forgone learning, not a procedural roadblock.
|
|
21
|
+
|
|
22
|
+
## Design principles for the MVP
|
|
23
|
+
|
|
24
|
+
- **Engine is portable.** The quiz generator and grader live in a standalone module (`engine/`) with no GitHub API calls inside. The CLI commands are thin wrappers that call into the engine and handle GitHub-specific I/O at the edges. This keeps the door open to a v2 GitHub Action / GitHub App that reuses the same engine.
|
|
25
|
+
- **No external storage.** No databases, no state branches, no workflow artifacts crossing runs. The quiz lives in memory plus an ephemeral local cache (`$TMPDIR/cognit/`); the only thing that persists on the PR thread is the opt-in results comment.
|
|
26
|
+
- **One LLM, one provider in v1.** Multi-LLM orchestration is deferred (see Future vision).
|
|
27
|
+
- **No team-specific knowledge injection in v1.** Skills integration is deferred (see Future vision).
|
|
28
|
+
- **Local CLI is the only surface in v1.** GitHub Action auto-trigger was prototyped and then deliberately dropped — see Future vision.
|
|
29
|
+
|
|
30
|
+
## Architecture
|
|
31
|
+
|
|
32
|
+
One CLI command. The PR thread carries only the (opt-in) results comment — the quiz itself never touches it.
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
┌────────────────────────────────────┐
|
|
36
|
+
│ `cognit take` (CLI, local) │
|
|
37
|
+
│ │
|
|
38
|
+
│ 1. detect PR from current branch │
|
|
39
|
+
│ 2. cache hit? │
|
|
40
|
+
│ yes → load Quiz from │ ──── reads ──► $TMPDIR/cognit/<sha1>.json
|
|
41
|
+
│ $TMPDIR/cognit/... │ ◄─── writes ──
|
|
42
|
+
│ no → fetch diff, call LLM, │
|
|
43
|
+
│ write Quiz to cache │
|
|
44
|
+
│ 3. open browser UI │
|
|
45
|
+
│ 4. grade everything in-session │
|
|
46
|
+
│ (det. + LLM open-Q grading) │
|
|
47
|
+
│ 5. show results inline │
|
|
48
|
+
└─────────────┬──────────────────────┘
|
|
49
|
+
│
|
|
50
|
+
│ 6. user clicks "Publish results"
|
|
51
|
+
│ (opt-in — nothing posted on submit)
|
|
52
|
+
▼
|
|
53
|
+
POST /publish ──────────────────────────► ┌──────────────────────────────┐
|
|
54
|
+
│ PR comment (results) │
|
|
55
|
+
│ ─ total + per-Q score │
|
|
56
|
+
│ ─ question prompts inlined │
|
|
57
|
+
│ ─ your answers inlined │
|
|
58
|
+
│ ─ open-Q LLM feedback │
|
|
59
|
+
└──────────────────────────────┘
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
The PR thread carries **at most one comment** per take session, and only if the author chose to publish. The quiz itself lives in memory + an ephemeral local cache (`$TMPDIR/cognit/`) — no quiz comment, no answer key visible on the PR by default. The results comment is self-contained (question prompts + author answers inlined) so reviewers reading the PR don't need a separate quiz comment to cross-reference.
|
|
63
|
+
|
|
64
|
+
## Components
|
|
65
|
+
|
|
66
|
+
### `cognit take` CLI
|
|
67
|
+
|
|
68
|
+
- Auth: uses local `gh auth login` for PR I/O. LLM auth via Claude Code OAuth (`~/.claude/.credentials.json`) or `ANTHROPIC_API_KEY`. Anthropic is the only supported provider in v1.
|
|
69
|
+
- Invocation: `cognit take [--pr <url-or-number>] [--model <name>] [--min-diff-lines N] [--max-diff-lines N]`. Auto-detects PR from current branch.
|
|
70
|
+
- Steps:
|
|
71
|
+
1. Detect the PR for the current branch via `gh pr view --json url`. (Skipped when `--pr` is passed.)
|
|
72
|
+
2. **Get the Quiz** — local cache first, else generate fresh:
|
|
73
|
+
- Cache path: `$TMPDIR/cognit/<sha1(pr_url)[:16]>.json`. If present, deserialize and skip to step 3.
|
|
74
|
+
- Otherwise: fetch PR title, body via `gh pr view`; diff via `gh pr diff`; touched-file contents via `git show HEAD:<path>`. Skip if diff < `--min-diff-lines` (default 50), > `--max-diff-lines` (default 2000), or if PR body contains `quiz: skip`.
|
|
75
|
+
- Call the LLM with a prompt that asks it to **decide both the count and the type-mix** based on diff size and complexity. A typo fix gets 2–3 probes; a 500-line refactor with new abstractions might warrant 8 or more. Question types: `mcq` (facts/invariants), `mermaid` (control or data flow — generated as 1 correct + 3 plausible-but-wrong variants in uniform style), `open` (LLM-graded against a rubric), `tf` (subtle behavioral claims).
|
|
76
|
+
- Validate mermaid diagrams in layers — native `mmdc` (`@mermaid-js/mermaid-cli`) if on PATH → dockerised parse-only image → Python regex backstop — so validation is never silently skipped; retry per-question on failure (max 2 retries); drop mermaid Q as last resort.
|
|
77
|
+
- Post-process mermaid options to neutral A/B/C/D labels (prevents accidental answer leak from semantic labels like `correct`/`wrong_1`).
|
|
78
|
+
- Write the Quiz JSON to the cache. **The quiz is NOT posted to the PR.**
|
|
79
|
+
3. Spin up a local HTTP server (FastAPI + uvicorn, 127.0.0.1 only, random unused port), open the URL in the default browser. The Quiz is held in the server's closure.
|
|
80
|
+
4. Browser renders the quiz with `mermaid.js` (real diagram rendering, not GitHub's markdown view), real form controls for MCQ, a textarea for the open question.
|
|
81
|
+
5. On submit (POST `/submit`):
|
|
82
|
+
- Cache the submitted `Answers` in the server's closure (so `/publish` can render an inlined results comment).
|
|
83
|
+
- Grade MCQ + mermaid + T/F deterministically against the answer key.
|
|
84
|
+
- LLM-grade the open question in-session.
|
|
85
|
+
- Return the full `Results` JSON to the browser. **Nothing is posted to the PR yet.**
|
|
86
|
+
- Browser renders the result panel inline: total score, per-question breakdown, open-question feedback in a blockquote.
|
|
87
|
+
6. On clicking "Publish results to PR" (POST `/publish`):
|
|
88
|
+
- Server renders a **self-contained results comment** via `render_results_inlined(quiz, answers, results)` — each question's prompt, the author's answer, the score, and any feedback are inlined.
|
|
89
|
+
- Posts via `gh pr comment`. Confirms via status text in the UI.
|
|
90
|
+
- Requires that `/submit` ran first in this session; otherwise responds 400.
|
|
91
|
+
- Stays alive until the user closes the browser tab or hits Ctrl-C.
|
|
92
|
+
|
|
93
|
+
**One command, one diagnostic.** `cognit generate` and `cognit grade` existed in earlier versions as separate CLI commands; both were collapsed into `take` once it became clear that the only happy-path use case is the author running a single command after opening their PR. The engine layer (`engine/generate.py`, `engine/grade.py`) is still standalone — a future GitHub App or webhook receiver can call into it without going through the CLI.
|
|
94
|
+
|
|
95
|
+
**No quiz on the PR thread.** Earlier versions posted the quiz as a PR comment with marker `<!-- cognit:quiz v1 -->` and used the thread as canonical state. After the collapse to one command, that storage stopped being load-bearing — and it carried real costs (answer key visible in plaintext, noise for reviewers, an extra artifact unrelated to code review). The in-memory + ephemeral-cache design keeps recovery working (close the tab, re-run, same quiz, no LLM re-bill) without putting anything on the PR until the author opts in to publish.
|
|
96
|
+
|
|
97
|
+
**Publishing is opt-in.** Solo devs can practice in private without leaving a trail; users who want a record click the button.
|
|
98
|
+
|
|
99
|
+
**Who can run `cognit take`:** anyone with `gh` access to the repo. The CLI doesn't gate by PR-author identity; downstream consumers (you, the human) decide who runs the local CLI.
|
|
100
|
+
|
|
101
|
+
## Quiz comment format
|
|
102
|
+
|
|
103
|
+
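**Legacy format, kept for reference.** Current versions no longer post the quiz to the PR (see "No quiz on the PR thread" above); older PRs may still carry comments in this shape. Markdown for humans, JSON code block for the CLI. The answer key was in plaintext on purpose — this was a voluntary self-quiz, and scrolling past your own answer key to cheat is a choice the author makes against their own learning.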
|
|
104
|
+
|
|
105
|
+
```markdown
|
|
106
|
+
<!-- cognit:quiz v1 -->
|
|
107
|
+
## Quiz on your PR
|
|
108
|
+
|
|
109
|
+
Take it in your terminal: `cognit take` (or `cognit take <this PR URL>`).
|
|
110
|
+
Or scroll down and answer in your head — see what you got wrong at the bottom.
|
|
111
|
+
|
|
112
|
+
### Question 1 — Multiple choice
|
|
113
|
+
Which assertion best describes the new caching strategy?
|
|
114
|
+
- A) Per-request, in-memory, no eviction
|
|
115
|
+
- B) Per-user, Redis-backed, TTL 5min
|
|
116
|
+
- C) Per-tenant, in-memory, LRU 1000 entries
|
|
117
|
+
- D) Per-request, Redis-backed, no TTL
|
|
118
|
+
|
|
119
|
+
### Question 2 — Pick the matching diagram
|
|
120
|
+
Which mermaid diagram best represents the new auth flow?
|
|
121
|
+
|
|
122
|
+
#### Option A
|
|
123
|
+
\`\`\`mermaid
|
|
124
|
+
flowchart LR
|
|
125
|
+
Client --> Gateway
|
|
126
|
+
Gateway --> Auth
|
|
127
|
+
Auth --> DB
|
|
128
|
+
\`\`\`
|
|
129
|
+
|
|
130
|
+
#### Option B
|
|
131
|
+
\`\`\`mermaid
|
|
132
|
+
flowchart LR
|
|
133
|
+
Client --> Auth
|
|
134
|
+
Auth --> Gateway
|
|
135
|
+
Gateway --> DB
|
|
136
|
+
\`\`\`
|
|
137
|
+
|
|
138
|
+
... (Options C and D)
|
|
139
|
+
|
|
140
|
+
### Question 3 — Open
|
|
141
|
+
Explain why you chose `RWMutex` over `Mutex` in `cache.go:42`.
|
|
142
|
+
|
|
143
|
+
(continue for questions 4 and 5)
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
<details>
|
|
147
|
+
<summary>Quiz state (used by the CLI — don't edit)</summary>
|
|
148
|
+
|
|
149
|
+
\`\`\`json
|
|
150
|
+
{
|
|
151
|
+
"version": "1",
|
|
152
|
+
"pr_number": 42,
|
|
153
|
+
"questions": [
|
|
154
|
+
{"id": "q1", "type": "mcq", "answer": "C", "options": ["A","B","C","D"]},
|
|
155
|
+
{"id": "q2", "type": "mermaid", "answer": "B", "options": ["A","B","C","D"]},
|
|
156
|
+
{"id": "q3", "type": "open", "rubric": "Must mention concurrent reads, write contention, ..."},
|
|
157
|
+
{"id": "q4", "type": "tf", "answer": true},
|
|
158
|
+
{"id": "q5", "type": "mcq", "answer": "B", "options": ["A","B","C","D"]}
|
|
159
|
+
]
|
|
160
|
+
}
|
|
161
|
+
\`\`\`
|
|
162
|
+
</details>
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
## Configuration
|
|
166
|
+
|
|
167
|
+
Sensible defaults for everything. Tunable knobs (entries written as `--flag` are options on `cognit take`):
|
|
168
|
+
|
|
169
|
+
| Knob | Default |
|
|
170
|
+
|---|---|
|
|
171
|
+
| LLM provider | Anthropic only (Claude SDK). API key (`ANTHROPIC_API_KEY`) or Claude Code OAuth (`~/.claude/.credentials.json`) — auto-detected. |
|
|
172
|
+
| `--model` | `claude-sonnet-4-6` |
|
|
173
|
+
| `--min-diff-lines` | 50 (skip tiny PRs) |
|
|
174
|
+
| `--max-diff-lines` | 2000 (skip huge PRs) |
|
|
175
|
+
| `excludes` | `*-lock.*`, `*.lock`, `*.map`, `*.pb.*`, `*_pb2.py`, `*.generated.*`, `*.auto.*`, `dist/**`, `build/**` |
|
|
176
|
+
| question count | **LLM-decided.** Prompt instructs the model to pick the count and type-mix based on diff complexity. Typical range 2–10. |
|
|
177
|
+
| `context-strategy` | `diff + pr-body + touched-files-full` |
|
|
178
|
+
|
|
179
|
+
PR-level escape hatches: `quiz: skip` in PR description suppresses generation entirely.
|
|
180
|
+
|
|
181
|
+
## Error handling
|
|
182
|
+
|
|
183
|
+
| Failure | Behavior |
|
|
184
|
+
|---|---|
|
|
185
|
+
| Mermaid syntax invalid in any candidate | Retry per-question generation up to 2 times. If still invalid, drop the mermaid question. |
|
|
186
|
+
| Diff smaller than `--min-diff-lines` | `take` prints `"diff is N lines (< min) — skipping."` and exits zero. No PR comment. |
|
|
187
|
+
| Diff larger than `--max-diff-lines` | `take` prints `"diff is N lines (> max) — skipping."` and exits zero. No PR comment. |
|
|
188
|
+
| `quiz: skip` in PR body | `take` prints `"quiz: skip in PR body — skipping."` and exits zero. No PR comment. |
|
|
189
|
+
| LLM call fails (network, rate limit, validation) | CLI catches `AnthropicAPIError`/`ValidationError` and exits 1 with a friendly message. |
|
|
190
|
+
| `--show-results` with no results comment | Print `"no results comment found on this PR."` and exit 1. |
|
|
191
|
+
|
|
192
|
+
## Testing strategy
|
|
193
|
+
|
|
194
|
+
- **Unit tests** for the question generator: fixture diffs → assert valid JSON schema, valid mermaid, mermaid label neutralization.
|
|
195
|
+
- **Unit tests** for grading: fixture quiz JSON + fixture answers → assert correct deterministic + LLM-graded scoring.
|
|
196
|
+
- **Unit tests** for the LLM adapter using `respx` to mock `api.anthropic.com`. Covers both stages of generation (`generate_quiz_outline` and `generate_mermaid_set`) and `grade_open`, plus assertions on system-prompt + `cache_control` shape.
|
|
197
|
+
- **Unit tests** for the FastAPI server using `TestClient` (covers `/`, `/static/*`, `/submit`, `/publish`).
|
|
198
|
+
- **Playwright end-to-end** (manual, ad hoc): drive the browser through fill → submit → publish against the live PR; screenshot every state. The headless-Chrome `--screenshot` flag is a faster alternative for visual smoke.
|
|
199
|
+
- **CI** (`.github/workflows/ci.yml`): on every push to `main` and on every pull request, run `ruff check`, `ruff format --check`, `mypy --strict`, `pytest`, and a CLI install smoke (`cognit --help` etc.).
|
|
200
|
+
|
|
201
|
+
## Non-goals (for v1)
|
|
202
|
+
|
|
203
|
+
- No merge blocking. No Check Runs. No branch protection integration. (Opt-in philosophy — the discipline is taking the quiz, not being forced through it.)
|
|
204
|
+
- **No GitHub Action auto-trigger.** Composite Actions wrapping `cognit generate --post` were prototyped end-to-end and dropped — not deferred. The collapse to a single `cognit take` command means there's no separate generation entrypoint to wrap. A future automation surface would call the engine layer directly (webhook receiver, GitHub App).
|
|
205
|
+
- No GitHub App / Marketplace listing. No hosted infrastructure. No SaaS.
|
|
206
|
+
- No multi-LLM orchestration. Single provider — Anthropic (Claude SDK).
|
|
207
|
+
- No team-specific knowledge injection (Skills). Single generic prompt for now.
|
|
208
|
+
- No team enforcement. No "did the author pass the quiz" reporting up to managers.
|
|
209
|
+
- No GitLab / Bitbucket support. GitHub only.
|
|
210
|
+
- No support for fork-PRs (`gh` operations require write access to the PR; non-author contributors can take the quiz locally but can't post results back to a PR they don't own).
|
|
211
|
+
|
|
212
|
+
## Future vision (v2 and beyond — explicitly deferred but preserved)
|
|
213
|
+
|
|
214
|
+
The Android-session vision points beyond v1. These are real product ambitions, not feature creep — captured here so we don't lose them while shipping a focused MVP.
|
|
215
|
+
|
|
216
|
+
### Fleet of LLMs
|
|
217
|
+
A generation orchestrator that fans out to multiple providers (OpenAI, Anthropic, Gemini, GitHub Models, local models), deduplicates similar questions, and picks a balanced set. **Why it matters:** diversity of perspectives, harder for authors to learn to pattern-match the questions, surfaces a wider range of comprehension gaps. **What it adds:** a generation orchestrator module in the engine, per-provider adapters, deduplication logic, more API keys to manage.
|
|
218
|
+
|
|
219
|
+
### Skills integration (team knowledge injection)
|
|
220
|
+
A `.cognit/skills/` directory of markdown files in the repo, loaded into the generation prompt. Teams describe their codebase's invariants, conventions, and architectural choices; the quiz generator uses them to ask questions that reflect the team's reality rather than generic code-comprehension probes. **Why it matters:** this is the real differentiator vs. Gater — questions that know what's idiomatic for *this* codebase, not what's idiomatic in general. **What it adds:** a Skills loader, prompt-engineering work to weave Skills into the generation context, possibly a `cognit skills validate` CLI command.
|
|
221
|
+
|
|
222
|
+
### GitHub App graduation
|
|
223
|
+
A Marketplace-installable GitHub App that wraps the same engine. Webhook receiver, hosted backend (Cloudflare Workers + D1 most likely), OAuth user identity, hosted SPA quiz UI reusing the same JS as the local CLI's browser view. **Why it matters:** teams that don't want a workflow file in every repo, or want centralized config across many repos. **What it adds:** ~3–4 weeks of plumbing (webhook handler, OAuth flow, DB schema, hosting, Marketplace listing). The engine itself stays the same — that's the whole point of the v1 design principles.
|
|
224
|
+
|
|
225
|
+
### Other future possibilities (not committed, just named)
|
|
226
|
+
- **Richer question types:** sequence-of-events ordering, "what does this return?" with input fixtures, design-intent questions tagged separately from generic open questions.
|
|
227
|
+
- **Learning history:** opt-in record of which kinds of questions a developer tends to miss, so quizzes adapt over time (would require persistence, breaks the "ephemeral" rule — would graduate to the GitHub App).
|
|
228
|
+
- **Reviewer-side mode:** the original Gater / dkamm framing — quizzing the reviewer too. Different audience, but the same engine can serve it.
|
|
229
|
+
- **IDE integration:** quiz appears inline in VSCode / JetBrains rather than in a browser. Cool but a long way from MVP.
|
|
230
|
+
|
|
231
|
+
## Open items
|
|
232
|
+
|
|
233
|
+
- **CLI distribution.** Go binary via `go install`, Homebrew, or a GitHub-hosted releases page? Probably all three eventually, but the MVP picks one — likely `go install` for simplicity, given the audience is developers who already have a Go toolchain or are willing to install one.
|
|
234
|
+
- **Mermaid distractor quality.** The "all four in uniform style" prompt may still leak the answer through subtle cues (LLMs often draw the "right" one more confidently). We may need an explicit style spec in the prompt, a post-hoc rewrite pass to normalize the diagrams, or both.
|
|
235
|
+
- **Open-question rubric quality.** Generated rubrics are only as good as the LLM's understanding of the diff. Rubric-quality regression tests on a curated set of diffs are probably needed.
|
|
236
|
+
|
|
237
|
+
---
|
|
238
|
+
|
|
239
|
+
## Research appendix
|
|
240
|
+
|
|
241
|
+
### Competitive landscape (May 2026)
|
|
242
|
+
|
|
243
|
+
Two existing tools cover adjacent ground; neither does what we're building:
|
|
244
|
+
|
|
245
|
+
- **`dkamm/pr-quiz`** — open source GitHub Action, MIT, 208 stars. Targets the *reviewer*, MCQ only, blocks merge via tunneled web UI (ngrok). Single human commit (June 2025), v0.1.0 release (July 2025), only dependabot activity since. Effectively dormant.
|
|
246
|
+
- **Gater (`usegater.app`)** — closed-source commercial. Targets the *reviewer*. Quiz lives in a Chrome extension over the GitHub PR page. Free personal tier, $20/mo Pro for 5/10/15 seats. Active marketing, no public technical blog.
|
|
247
|
+
|
|
248
|
+
Both quiz the reviewer of AI-generated code. **We quiz the author of any code, including their own.** No competing tool uses mermaid-diagram-selection as a question type.
|
|
249
|
+
|
|
250
|
+
### Feasibility study summary (May 2026)
|
|
251
|
+
|
|
252
|
+
Three architectures were investigated in parallel by independent agents:
|
|
253
|
+
|
|
254
|
+
| | A — Inline PR | B — GH Pages | C — GitHub App + UI |
|
|
255
|
+
|---|---|---|---|
|
|
256
|
+
| Verdict | YELLOW | YELLOW | GREEN |
|
|
257
|
+
| Effort | ~1–2 wks | ~3–4× A | ~6 wks |
|
|
258
|
+
| External infra | None | Tiny proxy needed | Full backend |
|
|
259
|
+
|
|
260
|
+
All three are buildable; B is dominated (more work than A, worse UX than C). C is the SaaS shape that Gater already occupies — we chose not to compete there.
|
|
261
|
+
|
|
262
|
+
**The architecture above is none of A/B/C** — it's a hybrid that emerged after the user pointed out the whole tool can be voluntary. By dropping the merge-blocking ceremony, we drop the need for Check Runs, branch protection, overrides, and the state-across-workflows complexity that made A awkward. What's left is two thin Actions plus a CLI that uses `gh` for auth and posting. No external hosting, no auth complexity, no SaaS.
|
cognit-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Jonas Brami
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
cognit-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cognit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Voluntary PR-author comprehension quiz tool
|
|
5
|
+
License-File: LICENSE
|
|
6
|
+
Requires-Python: >=3.12
|
|
7
|
+
Requires-Dist: anthropic>=0.102.0
|
|
8
|
+
Requires-Dist: claude-agent-sdk>=0.1.44
|
|
9
|
+
Requires-Dist: fastapi>=0.136.1
|
|
10
|
+
Requires-Dist: httpx>=0.27
|
|
11
|
+
Requires-Dist: pydantic>=2.7
|
|
12
|
+
Requires-Dist: typer>=0.12
|
|
13
|
+
Requires-Dist: uvicorn>=0.47.0
|