refactorika 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- refactorika-0.2.0/.env.example +9 -0
- refactorika-0.2.0/.gitignore +24 -0
- refactorika-0.2.0/CLAUDE.md +99 -0
- refactorika-0.2.0/LICENSE +21 -0
- refactorika-0.2.0/Makefile +70 -0
- refactorika-0.2.0/PKG-INFO +541 -0
- refactorika-0.2.0/README.md +488 -0
- refactorika-0.2.0/demo_repo/billing.py +33 -0
- refactorika-0.2.0/demo_repo/orders.py +41 -0
- refactorika-0.2.0/demo_repo/pyrightconfig.json +5 -0
- refactorika-0.2.0/demo_repo/test_orders.py +28 -0
- refactorika-0.2.0/docs/01-problem-statement.md +32 -0
- refactorika-0.2.0/docs/02-scope.md +56 -0
- refactorika-0.2.0/docs/03-tech-stack.md +48 -0
- refactorika-0.2.0/docs/04-architecture.md +139 -0
- refactorika-0.2.0/docs/05-redis-iris.md +116 -0
- refactorika-0.2.0/docs/11-benchmarks-and-eval.md +114 -0
- refactorika-0.2.0/docs/12-benchmark-display-spec.md +284 -0
- refactorika-0.2.0/docs/12-harness-benchmark.md +71 -0
- refactorika-0.2.0/docs/13-full-system-benchmark.md +126 -0
- refactorika-0.2.0/docs/13-v3-roadmap.md +172 -0
- refactorika-0.2.0/docs/14-benchmark-case-catalog-and-stress-plan.md +446 -0
- refactorika-0.2.0/docs/15-four-arm-agent-benchmark-contract.md +40 -0
- refactorika-0.2.0/docs/v2-worklog.md +123 -0
- refactorika-0.2.0/docs/v2_spec.md +416 -0
- refactorika-0.2.0/docs/v3-worklog.md +33 -0
- refactorika-0.2.0/docs/v3_spec.md +145 -0
- refactorika-0.2.0/eval/PLAN_agentic_mcp_arm.md +627 -0
- refactorika-0.2.0/eval/README.md +127 -0
- refactorika-0.2.0/eval/__init__.py +2 -0
- refactorika-0.2.0/eval/agents/__init__.py +44 -0
- refactorika-0.2.0/eval/agents/campaign.py +220 -0
- refactorika-0.2.0/eval/agents/driver.py +651 -0
- refactorika-0.2.0/eval/agents/harness_tools.py +215 -0
- refactorika-0.2.0/eval/agents/loop.py +291 -0
- refactorika-0.2.0/eval/agents/metrics.py +197 -0
- refactorika-0.2.0/eval/agents/prompts.py +105 -0
- refactorika-0.2.0/eval/agents/providers.py +266 -0
- refactorika-0.2.0/eval/agents/schema.py +91 -0
- refactorika-0.2.0/eval/agents/tools.py +308 -0
- refactorika-0.2.0/eval/fetch_benchmarks.sh +34 -0
- refactorika-0.2.0/eval/full_system_bench.py +1745 -0
- refactorika-0.2.0/eval/full_system_cases/__init__.py +40 -0
- refactorika-0.2.0/eval/full_system_cases/behavior.py +328 -0
- refactorika-0.2.0/eval/full_system_cases/multifile.py +244 -0
- refactorika-0.2.0/eval/full_system_cases/recovery.py +229 -0
- refactorika-0.2.0/eval/full_system_cases/scale.py +327 -0
- refactorika-0.2.0/eval/full_system_cases/stress.py +485 -0
- refactorika-0.2.0/eval/full_system_cases/stress_contracts_extra.py +375 -0
- refactorika-0.2.0/eval/full_system_cases/stress_semantics_extra.py +490 -0
- refactorika-0.2.0/eval/full_system_cases/stress_systems_extra.py +561 -0
- refactorika-0.2.0/eval/harness_bench.py +663 -0
- refactorika-0.2.0/eval/harness_tasks.py +206 -0
- refactorika-0.2.0/eval/requirements.txt +12 -0
- refactorika-0.2.0/eval/run_eval.py +132 -0
- refactorika-0.2.0/eval/run_eval.sh +64 -0
- refactorika-0.2.0/pyproject.toml +68 -0
- refactorika-0.2.0/refactorika/__init__.py +3 -0
- refactorika-0.2.0/refactorika/agents/__init__.py +0 -0
- refactorika-0.2.0/refactorika/agents/base.py +23 -0
- refactorika-0.2.0/refactorika/agents/complexity_agent.py +28 -0
- refactorika-0.2.0/refactorika/agents/dead_code_agent.py +23 -0
- refactorika-0.2.0/refactorika/agents/duplicate_agent.py +27 -0
- refactorika-0.2.0/refactorika/agents/import_agent.py +15 -0
- refactorika-0.2.0/refactorika/agents/orchestrator.py +82 -0
- refactorika-0.2.0/refactorika/analysis/__init__.py +0 -0
- refactorika-0.2.0/refactorika/analysis/audit.py +86 -0
- refactorika-0.2.0/refactorika/analysis/call_graph.py +411 -0
- refactorika-0.2.0/refactorika/analysis/dead_code.py +248 -0
- refactorika-0.2.0/refactorika/analysis/duplicates.py +337 -0
- refactorika-0.2.0/refactorika/analysis/embeddings.py +164 -0
- refactorika-0.2.0/refactorika/analysis/parser.py +129 -0
- refactorika-0.2.0/refactorika/analysis/related.py +159 -0
- refactorika-0.2.0/refactorika/cli.py +382 -0
- refactorika-0.2.0/refactorika/core/__init__.py +1 -0
- refactorika-0.2.0/refactorika/core/analyze.py +137 -0
- refactorika-0.2.0/refactorika/core/apply.py +161 -0
- refactorika-0.2.0/refactorika/core/gates.py +126 -0
- refactorika-0.2.0/refactorika/core/schema.py +275 -0
- refactorika-0.2.0/refactorika/core/storage.py +157 -0
- refactorika-0.2.0/refactorika/dashboard.py +165 -0
- refactorika-0.2.0/refactorika/docs_gen.py +286 -0
- refactorika-0.2.0/refactorika/harness.py +266 -0
- refactorika-0.2.0/refactorika/languages/__init__.py +18 -0
- refactorika-0.2.0/refactorika/languages/base.py +45 -0
- refactorika-0.2.0/refactorika/languages/generic_adapter.py +18 -0
- refactorika-0.2.0/refactorika/languages/python_adapter.py +49 -0
- refactorika-0.2.0/refactorika/languages/registry.py +29 -0
- refactorika-0.2.0/refactorika/mcp_server.py +193 -0
- refactorika-0.2.0/refactorika/memory/__init__.py +0 -0
- refactorika-0.2.0/refactorika/memory/agent_memory.py +116 -0
- refactorika-0.2.0/refactorika/memory/context.py +113 -0
- refactorika-0.2.0/refactorika/memory/vector_index.py +325 -0
- refactorika-0.2.0/refactorika/observability.py +152 -0
- refactorika-0.2.0/refactorika/transforms/__init__.py +0 -0
- refactorika-0.2.0/refactorika/transforms/dead.py +94 -0
- refactorika-0.2.0/refactorika/transforms/imports.py +95 -0
- refactorika-0.2.0/scripts/backfill_sentry.py +368 -0
- refactorika-0.2.0/scripts/demo.py +234 -0
- refactorika-0.2.0/scripts/populate_sentry.py +210 -0
- refactorika-0.2.0/scripts/replay_benchmark_sentry.py +235 -0
- refactorika-0.2.0/scripts/replay_refactorbench_sentry.py +308 -0
- refactorika-0.2.0/scripts/warmup.sh +137 -0
- refactorika-0.2.0/tests/__init__.py +0 -0
- refactorika-0.2.0/tests/conftest.py +26 -0
- refactorika-0.2.0/tests/test_agent_campaign.py +160 -0
- refactorika-0.2.0/tests/test_agent_driver.py +390 -0
- refactorika-0.2.0/tests/test_agent_harness_tools.py +51 -0
- refactorika-0.2.0/tests/test_agent_loop.py +202 -0
- refactorika-0.2.0/tests/test_agent_memory.py +88 -0
- refactorika-0.2.0/tests/test_agent_metrics.py +172 -0
- refactorika-0.2.0/tests/test_agent_providers.py +129 -0
- refactorika-0.2.0/tests/test_agent_schema.py +72 -0
- refactorika-0.2.0/tests/test_agent_tools.py +118 -0
- refactorika-0.2.0/tests/test_apply_multi.py +86 -0
- refactorika-0.2.0/tests/test_audit.py +62 -0
- refactorika-0.2.0/tests/test_call_graph.py +154 -0
- refactorika-0.2.0/tests/test_confirm.py +69 -0
- refactorika-0.2.0/tests/test_core.py +86 -0
- refactorika-0.2.0/tests/test_dashboard.py +118 -0
- refactorika-0.2.0/tests/test_dead_code.py +143 -0
- refactorika-0.2.0/tests/test_docs_gen.py +193 -0
- refactorika-0.2.0/tests/test_duplicates.py +178 -0
- refactorika-0.2.0/tests/test_full_system_behavior_cases.py +63 -0
- refactorika-0.2.0/tests/test_full_system_bench.py +413 -0
- refactorika-0.2.0/tests/test_full_system_case_registry.py +37 -0
- refactorika-0.2.0/tests/test_full_system_multifile_cases.py +36 -0
- refactorika-0.2.0/tests/test_full_system_recovery_cases.py +82 -0
- refactorika-0.2.0/tests/test_gates.py +39 -0
- refactorika-0.2.0/tests/test_harness.py +92 -0
- refactorika-0.2.0/tests/test_harness_tasks.py +77 -0
- refactorika-0.2.0/tests/test_hybrid_live.py +89 -0
- refactorika-0.2.0/tests/test_observability.py +115 -0
- refactorika-0.2.0/tests/test_plan.py +61 -0
- refactorika-0.2.0/tests/test_related.py +71 -0
- refactorika-0.2.0/tests/test_scale_cases.py +110 -0
- refactorika-0.2.0/tests/test_storage_plan.py +43 -0
- refactorika-0.2.0/tests/test_stress_cases.py +63 -0
- refactorika-0.2.0/tests/test_vector_index.py +105 -0
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# Optional — Refactorika falls back to local JSON if unset or unreachable.
|
|
2
|
+
REDIS_URL=redis://localhost:6379/0
|
|
3
|
+
# Where the local-JSON fallback (edit log + analysis cache) lives.
|
|
4
|
+
REFACTORIKA_STATE=.refactorika/state.json
|
|
5
|
+
|
|
6
|
+
# Optional errors-only telemetry. Disabled when SENTRY_DSN is unset.
|
|
7
|
+
SENTRY_DSN=
|
|
8
|
+
SENTRY_ENVIRONMENT=development
|
|
9
|
+
SENTRY_RELEASE=
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
.venv/
|
|
6
|
+
venv/
|
|
7
|
+
.pytest_cache/
|
|
8
|
+
.ruff_cache/
|
|
9
|
+
|
|
10
|
+
# Refactorika state + secrets
|
|
11
|
+
.refactorika/
|
|
12
|
+
.env
|
|
13
|
+
.claude/worktrees/
|
|
14
|
+
|
|
15
|
+
# Demo fixture's own git
|
|
16
|
+
demo_repo/.git/
|
|
17
|
+
|
|
18
|
+
# Eval: benchmark data + run artifacts (fetched/generated, never committed)
|
|
19
|
+
eval/external/
|
|
20
|
+
eval/results/
|
|
21
|
+
eval/.venv/
|
|
22
|
+
|
|
23
|
+
# OS / editor
|
|
24
|
+
.DS_Store
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# CLAUDE.md — Refactorika (Hackathon Project Memory)
|
|
2
|
+
|
|
3
|
+
> Self-contained context every Claude Code session and subagent inherits. Everything needed to act is **here** —
|
|
4
|
+
> `docs/` adds detail but you should never need to read it to make a correct move. Keep this short, current, ruthlessly relevant.
|
|
5
|
+
|
|
6
|
+
## What we're building
|
|
7
|
+
- **Product:** **Refactorika** — an **agent harness delivered as an MCP server**. Claude is the reasoning agent; Refactorika gives it three things it can't get alone: structure-aware analysis, a verification gate stack that proves every mutation safe, and Redis Iris cross-session memory. Written in Python, targets Python.
|
|
8
|
+
- **One-liner:** *Make safe structural change as frictionless as running a linter — point at a codebase, state the intent, get clean, reorganized, **proven-safe** code back, plus living docs of why it looks that way.*
|
|
9
|
+
- **The problem it kills:** Python repos rot four ways — **bad organization** (god-files, scattered/dup imports, bloated call sites), **rising complexity** (long functions, deep nesting), **context/doc rot** (the *why* evaporates as people leave), and **duplicate/dead code** (the same logic five ways; functions nothing reaches). Linters say *what's wrong*, not *how to restructure*; chat AI suggests fixes but is disconnected from the filesystem and has **no memory**. Refactorika runs as an MCP tool, so Claude reads, analyzes, applies, verifies, and *remembers* — without leaving the conversation.
|
|
10
|
+
- **The trust angle:** a mutation must change *shape, not behavior*. Every edit — including duplicate merges and dead-code deletions — passes gates (parse → `ruff` → `pyright` → `pytest`) before commit. The pitch is **"the agent restructured it, but nothing landed unverified."**
|
|
11
|
+
- **The memory angle:** knowledge *compounds*. Redis Iris (AST cache · vector index · agent memory · context retriever) makes the second run smarter than the first and keeps the *why* alive across sessions. See `docs/05-redis-iris.md`.
|
|
12
|
+
- **Target user:** a dev with a small/medium/legacy/AI-slop Python project who wants mechanical cleanup done *safely* — not by hand, not by trusting an agent blind.
|
|
13
|
+
|
|
14
|
+
## Two tool classes (everything is one or the other)
|
|
15
|
+
- **Advisory (read-only — finds + explains):** `analyze_file` · `find_duplicates` · `find_related` · `find_dead_code` · `generate_docs` · `get_context_map` · `audit_repo`/`get_plan`/`confirm_plan` (v3) · `get_log`. Surface ranked opportunities + memory; feed Claude's next proposal. `find_related` = impact check: hybrid-search the repo for semantically-similar code (+ call-graph dependents) before changing a file, so you don't fix one copy and miss the others.
|
|
16
|
+
- **Verified mutation (gated — single atomic entrypoint):** `apply_and_verify(path, new_content, refactor_kind)`. Every structural edit goes through it — `refactor_kind` includes `consolidate_duplicate` / `remove_dead_code`, so "find dead code" becomes "**safely remove** it, proven by your tests."
|
|
17
|
+
|
|
18
|
+
## The core flow (golden path — must always work)
|
|
19
|
+
`analyze → propose → apply → verify → commit`
|
|
20
|
+
1. **Analyze** a file/repo with an advisory tool (organization · complexity · duplicates · dead code · context).
|
|
21
|
+
2. **Propose** a concrete edit — Claude writes the new file contents.
|
|
22
|
+
3. **Apply** via `apply_and_verify` (the working tree is never left dirty).
|
|
23
|
+
4. **Verify** through the gate stack; roll back atomically on any failure.
|
|
24
|
+
5. **Commit** only verified edits; log the `EditRecord`; update agent memory.
|
|
25
|
+
|
|
26
|
+
## The 30-second magic moment (the demo)
|
|
27
|
+
Run Refactorika on a curated messy 1–2 file repo → watch a god-function get **split + nesting flattened live** → a planted behavior-breaking "clean-looking" edit gets **caught by the `pytest` gate after `pyright` passes, rolled back, and re-proposed** → final diff is smaller, flatter, type-clean, green. The whole product is *visible verification* — render the gate log, the catch, the rollback. Invisible checking scores zero.
|
|
28
|
+
|
|
29
|
+
## Shipped slice (the trust spine — keep it green)
|
|
30
|
+
Vertical slice on a **2-file curated repo**, end-to-end: `analyze → propose → apply_and_verify → commit/rollback`. This verified-refactor loop is **shipped** and is the foundation everything else hangs off — keep it green while broadening.
|
|
31
|
+
|
|
32
|
+
## What's IN scope — the fences we do not cross
|
|
33
|
+
Target: **small-to-medium Python codebases** — single-package or small multi-file/multi-package repos, structure shallow enough to reason about statically. The four capabilities ship as one harness, sequenced by Build order.
|
|
34
|
+
- **Organization (verified mutation):** split large files into modules · reorder + dedupe imports (stdlib → third-party → local) · extract helpers from bloated call sites.
|
|
35
|
+
- **Complexity (verified mutation):** break long functions into named units · flatten deep nesting (guard clauses) · replace repeated blocks with extracted parameterized functions.
|
|
36
|
+
- **Duplicate/dead code (advisory → verified mutation):** `find_duplicates` (structural fingerprint + semantic vector search) · `find_dead_code` (call-graph reachability + confidence). Never auto-delete — surface, then consolidate/remove through `apply_and_verify`.
|
|
37
|
+
- **Context/docs (advisory + memory):** `generate_docs` emits/self-updates `.refactorika/context/<module>.md` · persisted to Redis Iris agent memory so knowledge compounds across sessions.
|
|
38
|
+
|
|
39
|
+
## What's OUT — park it, don't drift
|
|
40
|
+
- Multi-language (JS/TS/Go/…) — **Python only**.
|
|
41
|
+
- Large-scale architectural rewrites (monolith → microservices).
|
|
42
|
+
- **Any mutation that alters runtime behavior or public API** — preserve behavior, full stop (the invariant; proven by `pytest`).
|
|
43
|
+
- Test generation / coverage work (we *run* your tests as the safety net; we don't write them).
|
|
44
|
+
- Dependency management / `pyproject.toml` edits.
|
|
45
|
+
- *(Exploratory, not now: large deep-hierarchy monorepos, framework-aware refactors for Django/FastAPI, more languages.)*
|
|
46
|
+
|
|
47
|
+
## Stack
|
|
48
|
+
- **Language:** Python 3.11+ (harness **and** target).
|
|
49
|
+
- **MCP:** `mcp` Python SDK (`FastMCP`) — exposes capabilities as tools Claude invokes inline.
|
|
50
|
+
- **Parse/analyze:** `tree-sitter` + `tree-sitter-python` — boundaries, import blocks, nesting depth, normalized AST fingerprints, the symbol graph for dead-code reachability.
|
|
51
|
+
- **Type gate:** `pyright` — reject only *new* type errors vs. pre-edit baseline (like lint; absolute "must be type-perfect" over-rejects correct code).
|
|
52
|
+
- **Lint/format gate:** `ruff` — normalize formatting, reject only *new* violations vs. pre-edit baseline.
|
|
53
|
+
- **Behavior gate:** `pytest` — type-clean ≠ behavior-preserving; catches silent regressions; *proves* dead-code/dup removals are safe.
|
|
54
|
+
- **Duplicate/dead-code analysis:** structural AST fingerprint (precise clones) **+** hybrid search — embeddings (`text-embedding-3-small` via OpenAI primary; `sentence-transformers` keyless fallback) fused with BM25 via Redis `FT.HYBRID`. Call-graph reachability for dead code.
|
|
55
|
+
- **Memory/state — Redis Iris via RedisVL (primary, JSON fallback):** four components — LangCache/AST-keyed cache · **Hybrid Search Index** (per-fn vector + BM25 + tags, `FT.HYBRID` RRF-fused — strictly better than pure cosine on code) · Agent Memory (cross-session context + refactor history) · Context Retriever (tag/num filters + hybrid retrieval). Hybrid needs Redis 8.4+ Query Engine — **as run: local Docker `redis:8` (8.8) on `:6380`, `--restart=always`** (Cloud/Stack also work); **degrades to brute-force vector / `.refactorika/` files** otherwise. Full detail: `docs/05-redis-iris.md`.
|
|
56
|
+
|
|
57
|
+
## Architecture — one core, thin shells
|
|
58
|
+
- **Interface-agnostic core library** (`refactorika/core/` + `analysis/` + `memory/`) holds all logic: analysis, gate stack, transforms, Iris memory. Reads/writes state itself so every shell sees the same thing. Canonical package is top-level **`refactorika/`** — the old `src/refactorika/` skeleton is abandoned, do not add to it.
|
|
59
|
+
- **Primary shell: MCP server** (`refactorika/mcp_server.py`) — thin wrapper. **Advisory tools:** `analyze_file · find_duplicates · find_related · find_dead_code · generate_docs · get_context_map · audit_repo/get_plan/confirm_plan (v3) · get_log`. **Verified mutation:** `apply_and_verify(path, new_content, refactor_kind)`. Claude proposes/drives; Refactorika verifies + remembers. **Freeze tool signatures + the `EditRecord` schema before parallel work** — that frozen interface IS the contract.
|
|
60
|
+
- **Per-edit log schema (freeze this):**
|
|
61
|
+
`{ file, refactor_kind, checks: { parse, lint, typecheck, tests }, retries, status, failure_reason, diff }`
|
|
62
|
+
where `status ∈ { committed, rolled-back, skipped-needs-human }`. **Skipped gates recorded explicitly (`null`), never silently passed** (honest coverage).
|
|
63
|
+
|
|
64
|
+
## Verification gates — cheapest-first, short-circuit on fail
|
|
65
|
+
1. **Parse** — `tree-sitter-python` must parse the edited file; reject malformed edits before spending anything.
|
|
66
|
+
2. **Lint/format** — `ruff check` + `ruff format --check` on touched files; reject only *new* violations.
|
|
67
|
+
3. **Type** — `pyright`; reject only *new* type errors vs. the pre-edit baseline; fail → roll back. No edit is committed that introduces a type error.
|
|
68
|
+
4. **Behavior** — `pytest` over tests covering touched files. Type-clean ≠ correct. Roll back on fail; record a **skip** where no test covers the file (never silent-pass).
|
|
69
|
+
5. **Re-propose loop** — bounded retries; surface the failure reason back to the agent.
|
|
70
|
+
6. **Escalation** — retries exhausted → mark `skipped-needs-human`, revert to last good state, flag it, continue. **Never force-commit.**
|
|
71
|
+
7. **Log** — append the structured record (powers the demo dashboard).
|
|
72
|
+
|
|
73
|
+
## Operating principles (hackathon — optimize for the demo)
|
|
74
|
+
- **Golden path first, always.** One repo, one flow, end-to-end. Green by halfway, kept green. 2-file slice before breadth.
|
|
75
|
+
- **Make the action visible.** Render the checking — gate log, caught regression, rollback, re-propose, `skipped-needs-human`. The product *is* visible verification.
|
|
76
|
+
- **Fake what we can't build.** Demo repo is **curated**: known messy structure, a planted behavior-breaking edit, **explicit return annotations** (tree-sitter sees syntax, not inferred types). Ground truth known → honest before/after.
|
|
77
|
+
- **Reliability over code quality.** Fewer moving parts. Hardcoded fallback for every external call (Redis → local JSON; `pyright`/`pytest` unavailable → skip-and-record, never silent-pass).
|
|
78
|
+
- **Stay in scope.** Out-of-scope temptations go to `## Parked`, not into the build.
|
|
79
|
+
- **Small diffs, frequent commits.** Checkpoint after each working increment.
|
|
80
|
+
|
|
81
|
+
## Parallel-build (skeleton-first, beat the serial dependency chain)
|
|
82
|
+
**Hour 0 (all together):** freeze MCP tool signatures + per-edit log schema; pin the demo-repo `pyright` config + `pytest` command. Then one dev ships a **skeleton where the whole golden path runs on mock data** (the running mock IS the contract); everyone else replaces stubs with real impls against the frozen interface. File-editing agents use `isolation: worktree`; read-only exploration runs free.
|
|
83
|
+
- **Dev 1 — Skeleton/integration:** MCP server with all tools stubbed → full demo runs day one · core module · curated demo repo · dashboard. Critical path.
|
|
84
|
+
- **Dev 2 — Analysis:** structure detection (file size, import order/dupes, function length, nesting depth) · opportunity ranking · **duplicate detection** (structural fingerprint + semantic embeddings) · **call-graph reachability** for dead code.
|
|
85
|
+
- **Dev 3 — Transforms:** the actual edits (split / reorder / extract / flatten · consolidate-duplicate · remove-dead-code) · diff generation · **`generate_docs`** context emission.
|
|
86
|
+
- **Dev 4 — Verify + memory:** the gate stack (parse→ruff→pyright→pytest, re-propose, escalation) · **Redis Iris** (AST cache · vector index · agent memory · context retriever) + local-file fallback.
|
|
87
|
+
|
|
88
|
+
## Build order (value-per-hour)
|
|
89
|
+
1. **Verified-refactor loop** *(shipped)* — 2-file slice, one refactor kind end-to-end, gate stack green. Trust spine. (Gate landing order: **parse + `pyright`** → **`pytest`** → **`ruff`**. Storage started as local JSON; Redis is now primary, with JSON as the fallback.)
|
|
90
|
+
2. **Duplicate detection** — highest demo impact; reuses tree-sitter AST. Add structural fingerprint + Redis vector index; consolidation rides the existing gate stack.
|
|
91
|
+
3. **Dead-code analysis + verified removal** — call-graph reachability; parallel to the embedding pipeline; removal rides the gate stack.
|
|
92
|
+
4. **Cross-session memory + living docs** — promote storage to full Redis Iris (agent memory + context retriever); `generate_docs` builds on retrievable prior context.
|
|
93
|
+
|
|
94
|
+
## Environment
|
|
95
|
+
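- Keys in `.env` (never commit; gitignored); `.env.example` lists what's needed: `REDIS_URL` (primary, falls back to local JSON), `REFACTORIKA_STATE` (where the local-JSON fallback lives), and the optional `SENTRY_*` telemetry settings. Optional `OPENAI_API_KEY` (embeddings — else local `sentence-transformers`) is also set in `.env` when used, though `.env.example` does not list it yet. `.worktreeinclude` copies env files into each worktree.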
|
|
96
|
+
|
|
97
|
+
## Parked (tempting, explicitly NOT now)
|
|
98
|
+
- Multi-language · architectural rewrites · behavior/API changes · test generation · dependency/`pyproject.toml` edits.
|
|
99
|
+
- Large deep-hierarchy monorepos · framework-aware (Django/FastAPI) refactors · per-team private embedding models.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Anikathapar
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
.PHONY: help setup fetch eval eval-no-fetch benchmark benchmark-agent \
|
|
2
|
+
benchmark-full-calibrate benchmark-full-agent test clean-eval
|
|
3
|
+
|
|
4
|
+
help: ## Show available targets
|
|
5
|
+
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) \
|
|
6
|
+
| sort \
|
|
7
|
+
| awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-16s\033[0m %s\n", $$1, $$2}'
|
|
8
|
+
|
|
9
|
+
setup: ## Create eval venv and install dependencies
|
|
10
|
+
bash eval/run_eval.sh --setup
|
|
11
|
+
|
|
12
|
+
fetch: ## Fetch benchmark data into eval/external/ (gitignored)
|
|
13
|
+
bash eval/fetch_benchmarks.sh
|
|
14
|
+
|
|
15
|
+
eval: ## Full evaluation: setup -> fetch benchmarks -> run
|
|
16
|
+
bash eval/run_eval.sh
|
|
17
|
+
|
|
18
|
+
eval-no-fetch: ## Run evaluation using already-fetched benchmark data
|
|
19
|
+
bash eval/run_eval.sh --no-fetch
|
|
20
|
+
|
|
21
|
+
benchmark: ## Calibrate the shared-patch verification ablation
|
|
22
|
+
@test -x eval/.venv/bin/python || bash eval/run_eval.sh --setup
|
|
23
|
+
PATH="$(CURDIR)/eval/.venv/bin:$$PATH" eval/.venv/bin/python -m eval.harness_bench --calibrate-only
|
|
24
|
+
|
|
25
|
+
benchmark-agent: ## Run the shared-patch verification ablation
|
|
26
|
+
@test -x eval/.venv/bin/python || bash eval/run_eval.sh --setup
|
|
27
|
+
PATH="$(CURDIR)/eval/.venv/bin:$$PATH" eval/.venv/bin/python -m eval.harness_bench \
|
|
28
|
+
--provider "$${PROVIDER:-anthropic}" \
|
|
29
|
+
--model "$${MODEL:-claude-sonnet-4-5-20250929}" \
|
|
30
|
+
--base-url "$${BASE_URL:-http://localhost:11434/v1}" --trials "$${TRIALS:-3}" \
|
|
31
|
+
--input-cost-per-mtok "$${INPUT_COST_PER_MTOK:-0}" \
|
|
32
|
+
--output-cost-per-mtok "$${OUTPUT_COST_PER_MTOK:-0}"
|
|
33
|
+
|
|
34
|
+
benchmark-full-calibrate: ## Validate all full-system case baselines
|
|
35
|
+
@test -x eval/.venv/bin/python || bash eval/run_eval.sh --setup
|
|
36
|
+
PATH="$(CURDIR)/eval/.venv/bin:$$PATH" eval/.venv/bin/python \
|
|
37
|
+
-m eval.full_system_bench --calibrate-only
|
|
38
|
+
|
|
39
|
+
benchmark-full-agent: ## Run independent harness OFF-vs-ON full-system agents
|
|
40
|
+
@test -x eval/.venv/bin/python || bash eval/run_eval.sh --setup
|
|
41
|
+
PATH="$(CURDIR)/eval/.venv/bin:$$PATH" eval/.venv/bin/python \
|
|
42
|
+
-m eval.full_system_bench \
|
|
43
|
+
--provider "$${PROVIDER:-anthropic}" \
|
|
44
|
+
--model "$${MODEL:-claude-sonnet-4-5-20250929}" \
|
|
45
|
+
--base-url "$${BASE_URL:-http://localhost:11434/v1}" \
|
|
46
|
+
--trials "$${TRIALS:-3}" --max-retries "$${MAX_RETRIES:-2}" \
|
|
47
|
+
--request-timeout "$${REQUEST_TIMEOUT:-180}" \
|
|
48
|
+
--agent-timeout "$${AGENT_TIMEOUT:-900}" \
|
|
49
|
+
--shell-timeout "$${SHELL_TIMEOUT:-30}" \
|
|
50
|
+
--gate-timeout "$${GATE_TIMEOUT:-180}" \
|
|
51
|
+
--parallel-fallback-delay "$${PARALLEL_FALLBACK_DELAY:-2}" \
|
|
52
|
+
--agentic-model "$${AGENTIC_MODEL:-$${MODEL:-claude-sonnet-4-5-20250929}}" \
|
|
53
|
+
--agentic-max-iter "$${AGENTIC_MAX_ITER:-20}" \
|
|
54
|
+
--agentic-mcp-model "$${AGENTIC_MCP_MODEL:-$${MODEL:-claude-sonnet-4-5-20250929}}" \
|
|
55
|
+
--agentic-mcp-max-iter "$${AGENTIC_MCP_MAX_ITER:-20}" \
|
|
56
|
+
--input-cost-per-mtok "$${INPUT_COST_PER_MTOK:-0}" \
|
|
57
|
+
--output-cost-per-mtok "$${OUTPUT_COST_PER_MTOK:-0}" \
|
|
58
|
+
--cache-read-cost-per-mtok "$${CACHE_READ_COST_PER_MTOK:-0}" \
|
|
59
|
+
--cache-write-cost-per-mtok "$${CACHE_WRITE_COST_PER_MTOK:-0}" \
|
|
60
|
+
$${AGENTIC:+--agentic} \
|
|
61
|
+
$${AGENTIC_MCP:+--agentic-mcp} \
|
|
62
|
+
$${PARALLEL_ARMS:+--parallel-arms} \
|
|
63
|
+
$${BASELINE:+--baseline "$${BASELINE}"}
|
|
64
|
+
|
|
65
|
+
test: ## Run harness and benchmark unit tests
|
|
66
|
+
@test -x eval/.venv/bin/python || bash eval/run_eval.sh --setup
|
|
67
|
+
PATH="$(CURDIR)/eval/.venv/bin:$$PATH" eval/.venv/bin/python -m pytest -v tests
|
|
68
|
+
|
|
69
|
+
clean-eval: ## Remove the eval venv (keeps fetched benchmark data)
|
|
70
|
+
rm -rf eval/.venv
|