reloop-ai 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145) hide show
  1. reloop_ai-0.1.0/.claire/worktrees/agent-a4ffdf0e/dashboard/src/app/layout.tsx +35 -0
  2. reloop_ai-0.1.0/.claude/agents/code-reviewer.md +14 -0
  3. reloop_ai-0.1.0/.claude/agents/security-reviewer.md +15 -0
  4. reloop_ai-0.1.0/.claude/skills/demo-check/SKILL.md +14 -0
  5. reloop_ai-0.1.0/.claude/skills/rejd-test/SKILL.md +10 -0
  6. reloop_ai-0.1.0/.env.example +28 -0
  7. reloop_ai-0.1.0/.gitignore +47 -0
  8. reloop_ai-0.1.0/CLAUDE.md +489 -0
  9. reloop_ai-0.1.0/CONTRIBUTING.md +236 -0
  10. reloop_ai-0.1.0/DESIGN.md +367 -0
  11. reloop_ai-0.1.0/Dockerfile +13 -0
  12. reloop_ai-0.1.0/LICENSE +189 -0
  13. reloop_ai-0.1.0/PKG-INFO +242 -0
  14. reloop_ai-0.1.0/README.md +197 -0
  15. reloop_ai-0.1.0/apple/DESIGN.md +313 -0
  16. reloop_ai-0.1.0/dashboard/.gitignore +41 -0
  17. reloop_ai-0.1.0/dashboard/AGENTS.md +5 -0
  18. reloop_ai-0.1.0/dashboard/CLAUDE.md +1 -0
  19. reloop_ai-0.1.0/dashboard/README.md +36 -0
  20. reloop_ai-0.1.0/dashboard/components.json +25 -0
  21. reloop_ai-0.1.0/dashboard/eslint.config.mjs +18 -0
  22. reloop_ai-0.1.0/dashboard/next-env.d.ts +6 -0
  23. reloop_ai-0.1.0/dashboard/next.config.ts +7 -0
  24. reloop_ai-0.1.0/dashboard/package-lock.json +9832 -0
  25. reloop_ai-0.1.0/dashboard/package.json +34 -0
  26. reloop_ai-0.1.0/dashboard/postcss.config.mjs +7 -0
  27. reloop_ai-0.1.0/dashboard/public/file.svg +1 -0
  28. reloop_ai-0.1.0/dashboard/public/globe.svg +1 -0
  29. reloop_ai-0.1.0/dashboard/public/next.svg +1 -0
  30. reloop_ai-0.1.0/dashboard/public/vercel.svg +1 -0
  31. reloop_ai-0.1.0/dashboard/public/window.svg +1 -0
  32. reloop_ai-0.1.0/dashboard/src/app/ab/page.tsx +958 -0
  33. reloop_ai-0.1.0/dashboard/src/app/ab/page.tsx.bak +1095 -0
  34. reloop_ai-0.1.0/dashboard/src/app/favicon.ico +0 -0
  35. reloop_ai-0.1.0/dashboard/src/app/globals.css +1072 -0
  36. reloop_ai-0.1.0/dashboard/src/app/layout.tsx +41 -0
  37. reloop_ai-0.1.0/dashboard/src/app/layout.tsx.bak +61 -0
  38. reloop_ai-0.1.0/dashboard/src/app/loading.tsx +25 -0
  39. reloop_ai-0.1.0/dashboard/src/app/loading.tsx.bak +25 -0
  40. reloop_ai-0.1.0/dashboard/src/app/page.tsx +1299 -0
  41. reloop_ai-0.1.0/dashboard/src/app/page.tsx.bak +1068 -0
  42. reloop_ai-0.1.0/dashboard/src/app/tasks/[id]/page.tsx +615 -0
  43. reloop_ai-0.1.0/dashboard/src/app/tasks/[id]/page.tsx.bak +528 -0
  44. reloop_ai-0.1.0/dashboard/src/components/app-shell.tsx +110 -0
  45. reloop_ai-0.1.0/dashboard/src/components/attempt-detail.tsx +544 -0
  46. reloop_ai-0.1.0/dashboard/src/components/attempt-detail.tsx.bak +326 -0
  47. reloop_ai-0.1.0/dashboard/src/components/cost-scoreboard.tsx +545 -0
  48. reloop_ai-0.1.0/dashboard/src/components/cost-tracker.tsx +241 -0
  49. reloop_ai-0.1.0/dashboard/src/components/cost-tracker.tsx.bak +223 -0
  50. reloop_ai-0.1.0/dashboard/src/components/diff-viewer.tsx +599 -0
  51. reloop_ai-0.1.0/dashboard/src/components/diff-viewer.tsx.bak +391 -0
  52. reloop_ai-0.1.0/dashboard/src/components/failure-pattern-badge.tsx +118 -0
  53. reloop_ai-0.1.0/dashboard/src/components/failure-sidebar.tsx +662 -0
  54. reloop_ai-0.1.0/dashboard/src/components/failure-sidebar.tsx.bak +455 -0
  55. reloop_ai-0.1.0/dashboard/src/components/live-output.tsx +509 -0
  56. reloop_ai-0.1.0/dashboard/src/components/live-output.tsx.bak +434 -0
  57. reloop_ai-0.1.0/dashboard/src/components/timeline.tsx +273 -0
  58. reloop_ai-0.1.0/dashboard/src/components/timeline.tsx.bak +386 -0
  59. reloop_ai-0.1.0/dashboard/src/components/ui/badge.tsx +52 -0
  60. reloop_ai-0.1.0/dashboard/src/components/ui/badge.tsx.bak +52 -0
  61. reloop_ai-0.1.0/dashboard/src/components/ui/button.tsx +58 -0
  62. reloop_ai-0.1.0/dashboard/src/components/ui/button.tsx.bak +58 -0
  63. reloop_ai-0.1.0/dashboard/src/components/ui/card.tsx +103 -0
  64. reloop_ai-0.1.0/dashboard/src/components/ui/card.tsx.bak +103 -0
  65. reloop_ai-0.1.0/dashboard/src/components/ui/dialog.tsx +160 -0
  66. reloop_ai-0.1.0/dashboard/src/components/ui/dialog.tsx.bak +160 -0
  67. reloop_ai-0.1.0/dashboard/src/components/ui/scroll-area.tsx +55 -0
  68. reloop_ai-0.1.0/dashboard/src/components/ui/scroll-area.tsx.bak +55 -0
  69. reloop_ai-0.1.0/dashboard/src/components/ui/separator.tsx +25 -0
  70. reloop_ai-0.1.0/dashboard/src/components/ui/separator.tsx.bak +25 -0
  71. reloop_ai-0.1.0/dashboard/src/components/ui/tabs.tsx +82 -0
  72. reloop_ai-0.1.0/dashboard/src/components/ui/tabs.tsx.bak +82 -0
  73. reloop_ai-0.1.0/dashboard/src/components/what-i-learned.tsx +358 -0
  74. reloop_ai-0.1.0/dashboard/src/lib/api.ts +454 -0
  75. reloop_ai-0.1.0/dashboard/src/lib/tokens.ts +37 -0
  76. reloop_ai-0.1.0/dashboard/src/lib/types.ts +201 -0
  77. reloop_ai-0.1.0/dashboard/src/lib/utils.ts +200 -0
  78. reloop_ai-0.1.0/dashboard/tsconfig.json +34 -0
  79. reloop_ai-0.1.0/dashboard/tsconfig.tsbuildinfo +1 -0
  80. reloop_ai-0.1.0/data/demo-project/README.md +91 -0
  81. reloop_ai-0.1.0/data/demo-project/expected-errors.json +30 -0
  82. reloop_ai-0.1.0/data/demo-project/next.config.js +18 -0
  83. reloop_ai-0.1.0/data/demo-project/package.json +27 -0
  84. reloop_ai-0.1.0/data/demo-project/src/app/page.tsx +41 -0
  85. reloop_ai-0.1.0/data/demo-project/tsconfig.json +26 -0
  86. reloop_ai-0.1.0/docker-compose.yml +33 -0
  87. reloop_ai-0.1.0/examples/integrations/crewai_example.py +73 -0
  88. reloop_ai-0.1.0/examples/integrations/langgraph_example.py +64 -0
  89. reloop_ai-0.1.0/examples/integrations/mcp_config.json +11 -0
  90. reloop_ai-0.1.0/examples/integrations/openai_agents_example.py +80 -0
  91. reloop_ai-0.1.0/examples/integrations/raw_python_example.py +78 -0
  92. reloop_ai-0.1.0/garysguide.md +527 -0
  93. reloop_ai-0.1.0/hackathon-ideation-debate.md +432 -0
  94. reloop_ai-0.1.0/mcp_config.json +24 -0
  95. reloop_ai-0.1.0/newfeatures.md +521 -0
  96. reloop_ai-0.1.0/nihalnewfeatures.md +317 -0
  97. reloop_ai-0.1.0/pyproject.toml +54 -0
  98. reloop_ai-0.1.0/reloop-product-blueprint.md +810 -0
  99. reloop_ai-0.1.0/reloop.yaml +28 -0
  100. reloop_ai-0.1.0/requirements.txt +45 -0
  101. reloop_ai-0.1.0/sponsors-deep-dive.md +514 -0
  102. reloop_ai-0.1.0/src/__init__.py +3 -0
  103. reloop_ai-0.1.0/src/api/__init__.py +0 -0
  104. reloop_ai-0.1.0/src/api/routes/__init__.py +0 -0
  105. reloop_ai-0.1.0/src/api/routes/checkpoints.py +126 -0
  106. reloop_ai-0.1.0/src/api/routes/memories.py +249 -0
  107. reloop_ai-0.1.0/src/api/routes/surfaces.py +434 -0
  108. reloop_ai-0.1.0/src/api/routes/tasks.py +252 -0
  109. reloop_ai-0.1.0/src/api/server.py +242 -0
  110. reloop_ai-0.1.0/src/api/sse.py +41 -0
  111. reloop_ai-0.1.0/src/cli/__init__.py +0 -0
  112. reloop_ai-0.1.0/src/cli/main.py +257 -0
  113. reloop_ai-0.1.0/src/config.py +56 -0
  114. reloop_ai-0.1.0/src/core/__init__.py +15 -0
  115. reloop_ai-0.1.0/src/core/agent.py +764 -0
  116. reloop_ai-0.1.0/src/core/distiller.py +242 -0
  117. reloop_ai-0.1.0/src/core/memory.py +311 -0
  118. reloop_ai-0.1.0/src/core/planner.py +306 -0
  119. reloop_ai-0.1.0/src/core/retry.py +263 -0
  120. reloop_ai-0.1.0/src/core/timeline.py +164 -0
  121. reloop_ai-0.1.0/src/mcp_server.py +510 -0
  122. reloop_ai-0.1.0/src/models.py +259 -0
  123. reloop_ai-0.1.0/src/providers/__init__.py +0 -0
  124. reloop_ai-0.1.0/src/providers/base.py +190 -0
  125. reloop_ai-0.1.0/src/providers/llm/__init__.py +0 -0
  126. reloop_ai-0.1.0/src/providers/llm/agents_orchestrator.py +466 -0
  127. reloop_ai-0.1.0/src/providers/llm/openai.py +218 -0
  128. reloop_ai-0.1.0/src/providers/memory/__init__.py +0 -0
  129. reloop_ai-0.1.0/src/providers/memory/context_surfaces.py +183 -0
  130. reloop_ai-0.1.0/src/providers/memory/context_surfaces_client.py +294 -0
  131. reloop_ai-0.1.0/src/providers/memory/redis_backend.py +556 -0
  132. reloop_ai-0.1.0/src/providers/memory/sqlite_backend.py +442 -0
  133. reloop_ai-0.1.0/src/providers/sandbox/__init__.py +0 -0
  134. reloop_ai-0.1.0/src/providers/sandbox/blaxel.py +384 -0
  135. reloop_ai-0.1.0/src/providers/sandbox/docker.py +227 -0
  136. reloop_ai-0.1.0/src/standalone.py +253 -0
  137. reloop_ai-0.1.0/tests/__init__.py +0 -0
  138. reloop_ai-0.1.0/tests/conftest.py +322 -0
  139. reloop_ai-0.1.0/tests/test_cli.py +186 -0
  140. reloop_ai-0.1.0/tests/test_context_surfaces.py +1420 -0
  141. reloop_ai-0.1.0/tests/test_core_loop.py +499 -0
  142. reloop_ai-0.1.0/tests/test_failure_memory_manager.py +242 -0
  143. reloop_ai-0.1.0/tests/test_learning.py +400 -0
  144. reloop_ai-0.1.0/tests/test_memory.py +286 -0
  145. reloop_ai-0.1.0/tests/test_models.py +378 -0
@@ -0,0 +1,35 @@
1
+ import type { Metadata } from "next";
2
+ import { Geist, Geist_Mono } from "next/font/google";
3
+ import "./globals.css";
4
+
5
+ const geistSans = Geist({
6
+ variable: "--font-geist-sans",
7
+ subsets: ["latin"],
8
+ });
9
+
10
+ const geistMono = Geist_Mono({
11
+ variable: "--font-geist-mono",
12
+ subsets: ["latin"],
13
+ });
14
+
15
+ export const metadata: Metadata = {
16
+ title: "ReLoop — The Self-Healing Agent",
17
+ description: "Every agent fails. ReLoop is the first that gets smarter from failure.",
18
+ };
19
+
20
+ export default function RootLayout({
21
+ children,
22
+ }: Readonly<{
23
+ children: React.ReactNode;
24
+ }>) {
25
+ return (
26
+ <html
27
+ lang="en"
28
+ className={`${geistSans.variable} ${geistMono.variable} dark h-full antialiased`}
29
+ >
30
+ <body className="min-h-full flex flex-col bg-background text-foreground">
31
+ {children}
32
+ </body>
33
+ </html>
34
+ );
35
+ }
@@ -0,0 +1,14 @@
1
+ ---
2
+ name: code-reviewer
3
+ description: Review code changes for correctness, type safety, and CLAUDE.md rule compliance before commit
4
+ model: opus
5
+ ---
6
+
7
+ Review the recent changes for:
8
+ 1. **Correctness**: Logic errors, missed edge cases, broken types
9
+ 2. **Project rules**: Conventional commits, scope control, no scope creep
10
+ 3. **API consistency**: Matches the API surface defined in CLAUDE.md
11
+ 4. **Security**: No hardcoded keys, no injection vectors
12
+ 5. **Demo impact**: Will this break the demo scenario?
13
+
14
+ Report issues by severity (blocker / warning / nit).
@@ -0,0 +1,15 @@
1
+ ---
2
+ name: security-reviewer
3
+ description: Review code for API key leaks, injection vulnerabilities, and sandbox boundary violations
4
+ model: sonnet
5
+ ---
6
+
7
+ Review the codebase and recent changes for security issues:
8
+
9
+ 1. **API Key Safety**: Check for hardcoded keys, keys in committed files, keys logged to stdout
10
+ 2. **Injection Vectors**: SQL injection, command injection, XSS in dashboard templates
11
+ 3. **Sandbox Boundaries**: Ensure Blaxel sandbox code can't escape to host
12
+ 4. **Environment Variables**: Verify .env files are gitignored, no secrets in docker-compose.yml
13
+ 5. **Dependency Security**: Flag known vulnerable package versions
14
+
15
+ Report issues by severity (critical / high / medium / low).
@@ -0,0 +1,14 @@
1
+ ---
2
+ name: demo-check
3
+ description: Run full pre-demo verification — REJD loop, A/B comparison, Redis memory, Blaxel checkpoints, SSE streaming, and frontend build
4
+ disable-model-invocation: true
5
+ ---
6
+
7
+ Run the following verification steps and report status for each:
8
+
9
+ 1. `cd dashboard && npm run build` — verify frontend builds
10
+ 2. `python -m pytest tests/` — verify all tests pass
11
+ 3. `docker-compose ps` — verify Redis is running
12
+ 4. `python -m src.core.agent --demo` — verify demo scenario runs
13
+ 5. `python -m src.core.agent --ab-demo` — verify A/B comparison works
14
+ 6. Report: which steps passed/failed, and what to fix
@@ -0,0 +1,10 @@
1
+ ---
2
+ name: rejd-test
3
+ description: Run REJD core loop and memory tests quickly
4
+ disable-model-invocation: true
5
+ ---
6
+
7
+ Run these tests and report results concisely:
8
+ 1. `python -m pytest tests/test_core_loop.py -v`
9
+ 2. `python -m pytest tests/test_memory.py -v`
10
+ Report pass/fail counts and any failure details.
@@ -0,0 +1,28 @@
1
+ # Wordware (Agent Orchestration -- PRIMARY SPONSOR)
2
+ WORDWARE_API_KEY=
3
+ WORDWARE_FLOW_ID=
4
+
5
+ # Redis (Failure Memory -- Agent Memory Server)
6
+ REDIS_URL=redis://localhost:6379
7
+ REDIS_MEMORY_INDEX=reloop-failures
8
+
9
+ # Blaxel (Execution Sandbox)
10
+ BLAXEL_API_KEY=
11
+ BLAXEL_WORKSPACE=
12
+
13
+ # OpenAI Codex (Code Generation)
14
+ OPENAI_API_KEY=
15
+ CODEX_MODEL=codex
16
+
17
+ # API Server
18
+ API_PORT=8000
19
+ API_HOST=0.0.0.0
20
+
21
+ # Frontend
22
+ NEXT_PUBLIC_API_URL=http://localhost:8000
23
+
24
+ # Optional
25
+ EMBEDDING_MODEL=text-embedding-3-small
26
+ MAX_RETRIES=5
27
+ MAX_BUDGET_USD=1.00
28
+ CIRCUIT_BREAKER_THRESHOLD=3
@@ -0,0 +1,47 @@
1
+ .claude/settings.local.json
2
+
3
+ # Python
4
+ __pycache__/
5
+ *.py[cod]
6
+ *$py.class
7
+ *.egg-info/
8
+ dist/
9
+ build/
10
+ .eggs/
11
+ venv/
12
+ .venv/
13
+ *.egg
14
+
15
+ # Environment
16
+ .env
17
+ .env.local
18
+
19
+ # Node / Frontend
20
+ node_modules/
21
+ .next/
22
+ dashboard/node_modules/
23
+ dashboard/.next/
24
+ dashboard/out/
25
+
26
+ # IDE
27
+ .vscode/
28
+ .idea/
29
+ *.swp
30
+ *.swo
31
+ *~
32
+
33
+ # OS
34
+ .DS_Store
35
+ Thumbs.db
36
+
37
+ # Testing
38
+ .pytest_cache/
39
+ .coverage
40
+ htmlcov/
41
+
42
+ # Redis
43
+ dump.rdb
44
+
45
+ # Misc
46
+ *.log
47
+ .vercel
@@ -0,0 +1,489 @@
1
+ # Project Rules for Claude Code
2
+
3
+ ## Project Overview
4
+
5
+ **ReLoop** -- The Self-Healing Agent with Failure Memory. An open-source agent framework built around the concept of structured failure memory. Instead of blind retries or starting from scratch, ReLoop captures every failure into a structured memory graph, learns patterns across errors, and retries with accumulated knowledge. Agents don't just recover -- they get permanently smarter.
6
+
7
+ - **Hackathon:** Beach House, April 11, 2026 | 10 AM - 7 PM PT | San Francisco
8
+ - **Tagline:** "Every agent fails. ReLoop is the first that gets smarter from failure."
9
+ - **Prize Targets:** Blaxel $500
10
+
11
+ ### Architecture
12
+
13
+ - **Agent Orchestration**: OpenAI Agents SDK (planning loop + retry logic with specialist agent handoffs)
14
+ - **Memory**: Redis Agent Memory Server -- 3-tier memory (working/session, long-term/failure graph, episodic/traces)
15
+ - **Execution Sandbox**: Blaxel Firecracker microVMs (25ms resume, perpetual state, checkpoint/restore)
16
+ - **Code Generation**: OpenAI Codex SDK (fix generation based on failure context)
17
+ - **API Layer**: FastAPI (Python) -- task management, memory search, checkpoints, SSE streaming
18
+ - **Frontend**: Next.js + Tailwind + shadcn/ui -- timeline visualization, failure memory sidebar, live output
19
+ - **Plugin System**: Abstract interfaces for memory backends, sandbox providers, LLM providers
20
+ - **Core Algorithm**: REJD Loop (Retrieve-Execute-Judge-Distill)
21
+
22
+ ### Project Structure
23
+
24
+ ```text
25
+ reloop/
26
+ ├── CLAUDE.md # This file -- project rules
27
+ ├── README.md # The conversion engine (10-section formula)
28
+ ├── LICENSE # Apache 2.0
29
+ ├── CONTRIBUTING.md # Step-by-step contribution guide
30
+ ├── reloop.yaml # Default configuration
31
+ ├── docker-compose.yml # One-command local setup (Redis + services)
32
+ ├── package.json # Frontend dependencies
33
+ ├── .github/
34
+ │ ├── ISSUE_TEMPLATE/
35
+ │ │ ├── bug_report.md
36
+ │ │ └── feature_request.md
37
+ │ ├── PULL_REQUEST_TEMPLATE.md
38
+ │ └── workflows/
39
+ │ └── ci.yml
40
+ ├── src/
41
+ │ ├── core/
42
+ │ │ ├── agent.py # Main orchestrator (REJD loop)
43
+ │ │ ├── memory.py # Failure memory graph operations
44
+ │ │ ├── retry.py # Retry engine with memory injection
45
+ │ │ ├── timeline.py # Event timeline data model
46
+ │ │ └── distiller.py # Failure/success distillation (LLM-powered)
47
+ │ ├── providers/
48
+ │ │ ├── base.py # Abstract interfaces (MemoryBackend, SandboxProvider, LLMProvider)
49
+ │ │ ├── llm/
50
+ │ │ │ └── openai.py # OpenAI provider (primary)
51
+ │ │ ├── memory/
52
+ │ │ │ ├── redis_backend.py # Redis Agent Memory Server (default)
53
+ │ │ │ └── sqlite_backend.py # SQLite for local dev
54
+ │ │ └── sandbox/
55
+ │ │ ├── blaxel.py # Blaxel perpetual sandbox (default)
56
+ │ │ └── docker.py # Docker fallback for local dev
57
+ │ ├── api/
58
+ │ │ ├── server.py # FastAPI server entry point
59
+ │ │ ├── routes/
60
+ │ │ │ ├── tasks.py # Task CRUD + run + retry
61
+ │ │ │ ├── memories.py # Failure/success memory search
62
+ │ │ │ └── checkpoints.py # Checkpoint list + restore
63
+ │ │ └── sse.py # Server-Sent Events for real-time streaming
64
+ │ └── cli/
65
+ │ └── main.py # CLI entrypoint (post-hackathon)
66
+ ├── dashboard/ # Next.js frontend
67
+ │ ├── app/
68
+ │ │ ├── layout.tsx # Root layout
69
+ │ │ ├── page.tsx # Main dashboard
70
+ │ │ └── tasks/
71
+ │ │ └── [id]/
72
+ │ │ └── page.tsx # Task timeline view
73
+ │ ├── components/
74
+ │ │ ├── timeline.tsx # Timeline visualization (RED/YELLOW/GREEN nodes)
75
+ │ │ ├── failure-sidebar.tsx # Failure memory panel (accumulated rules)
76
+ │ │ ├── live-output.tsx # Real-time execution output
77
+ │ │ ├── cost-tracker.tsx # Cost per attempt display
78
+ │ │ └── diff-viewer.tsx # Attempt diff (react-diff-viewer)
79
+ │ └── lib/
80
+ │ └── api.ts # API client + SSE connection
81
+ ├── examples/
82
+ │ ├── quickstart/ # 5-min getting started
83
+ │ └── self-healing-deploy/ # The hackathon demo scenario (broken Next.js project)
84
+ ├── data/
85
+ │ └── demo-project/ # The broken project with 4 deliberate bugs
86
+ ├── docs/
87
+ │ ├── architecture.md
88
+ │ ├── plugins.md
89
+ │ └── api-reference.md
90
+ └── tests/
91
+ ├── test_core_loop.py # REJD loop tests
92
+ ├── test_memory.py # Memory capture + retrieval tests
93
+ └── test_learning.py # "Does memory actually help?" A/B tests
94
+ ```
95
+
96
+ ### Key Technical Decisions
97
+
98
+ - **REJD Loop is the core algorithm** -- Retrieve (query Redis for similar past failures) -> Execute (run in Blaxel sandbox) -> Judge (classify success/failure) -> Distill (extract learnings). Every feature flows through this loop.
99
+ - **Failure Memory is the product** -- Not a code assistant, not a chatbot. The structured failure memory graph IS the differentiator. Every failure record includes: error signature, root cause analysis, suggested fix, confidence score, embedding for semantic search.
100
+ - **Non-chat UI is mandatory** -- Timeline visualization (RED/YELLOW/GREEN nodes) stands out after judges see 15 chat demos. The failure memory sidebar makes the abstract concept tangible.
101
+ - **A/B comparison mode is the money shot** -- Side-by-side: agent WITHOUT memory (fails repeatedly, same mistakes) vs. agent WITH memory (succeeds faster, avoids past failures). This MUST work flawlessly.
102
+ - **Plugin architecture from day one** -- Three extension points (memory backends, sandbox providers, LLM providers) with abstract interfaces. Contributors can extend without touching core code.
103
+ - **OpenAI Agents SDK for orchestration** -- Specialist agents (retriever, executor, judge, distiller) with handoffs between them. The SDK manages the REJD loop natively.
104
+ - **Blaxel for perpetual state + checkpoint/rewind** -- 25ms resume from standby. State persists forever (competitors delete after 30 days). "Rewind" to any previous attempt is the Blaxel $500 prize play.
105
+ - **Redis 3-tier memory** -- Working memory (current task session state), Long-term memory (distilled failure rules), Episodic memory (full execution traces). Agent Memory Server handles all three.
106
+ - **Codex for code generation, not code assistance** -- Creative use: Codex generates fix attempts based on failure context. NOT "another code assistant."
107
+ - **Pre-run the demo task** -- Don't run live (risky). Start the agent 30 min before demo, show results + rewind capability. Record a backup video by 5 PM.
108
+ - **Synthetic-fixture approach (Synthea-style)** -- Instead of a real-world codebase, use a crafted "broken Next.js project" with 4 deliberate bugs (missing dep, port conflict, TS error, bad env var) so demo failures are deterministic and reproducible.
109
+
110
+ ### REJD Core Loop
111
+
112
+ ```text
113
+ NEW TASK -> RETRIEVE (query failure memory) -> EXECUTE (Blaxel sandbox)
114
+ -> SUCCESS? -> JUDGE (validate) -> DISTILL (success pattern) -> STORE
115
+ -> FAILURE? -> JUDGE (classify error) -> DISTILL (failure note: what/why/what-to-try)
116
+ -> Budget/retries left? -> RETRY (back to RETRIEVE with new knowledge)
117
+ -> No budget? -> ABANDON -> STORE
118
+ ```
119
+
120
+ ### Demo Scenario
121
+
122
+ **The Broken Project:** A Next.js project with 4 deliberate issues:
123
+ 1. Missing dependency (`sharp` not installed)
124
+ 2. Port conflict (port 3000 already in use)
125
+ 3. TypeScript error (type mismatch)
126
+ 4. Bad environment variable (wrong DATABASE_URL)
127
+
128
+ **Demo Flow:** Give ReLoop the broken project -> Attempt 1 fails (missing dep, captured) -> Attempt 2 fails (port conflict, captured) -> Attempt 3 fails (TS error, captured) -> Attempt 4 succeeds (all failures addressed). Timeline: RED -> RED -> RED -> GREEN. Rewind to Attempt 2, show sandbox restore in 25ms.
129
+
130
+ ### Sponsor Integration Map
131
+
132
+ | Sponsor | Role | Integration Depth |
133
+ |---------|------|-------------------|
134
+ | **OpenAI Agents SDK** | Agent brain -- orchestrates REJD loop with specialist agent handoffs | DEEP -- core agent logic |
135
+ | **Redis** | Memory backbone -- Agent Memory Server stores failures, learned rules, task state | DEEP -- 3-tier memory |
136
+ | **Blaxel** | Execution env -- perpetual sandbox, checkpoint/restore, rewind capability | DEEP -- unique use of perpetual state |
137
+ | **Codex** | Code gen engine -- generates fix attempts based on failure context | MODERATE -- called via SDK |
138
+
139
+ ### Data Models
140
+
141
+ **Failure Record** (stored in Redis):
142
+ - `id`, `task_id`, `attempt_number`, `timestamp`
143
+ - `signature`: error_type, error_category, error_message, error_hash (SHA-256), code_context
144
+ - `analysis`: root_cause, what_was_tried, why_it_failed, suggested_fix, anti_pattern, confidence (0.0-1.0)
145
+ - `context`: task_type, language, framework, tokens_used, execution_time_ms
146
+ - `embedding`: vector for semantic search
147
+ - `cost_usd`: cost of this attempt
148
+
149
+ **Success Record** (stored in Redis):
150
+ - `solution`: approach, key_insight, code_diff
151
+ - `learning`: failure_count, failures_that_helped, transferable
152
+ - `metrics`: total_attempts, total_tokens, total_cost_usd
153
+
154
+ ### API Surface
155
+
156
+ ```text
157
+ POST /v1/tasks Create + run a task
158
+ GET /v1/tasks/{id} Get status + result
159
+ POST /v1/tasks/{id}/retry Trigger retry
160
+ GET /v1/tasks/{id}/timeline Full execution timeline
161
+ GET /v1/tasks/{id}/sse SSE stream of events
162
+ POST /v1/memories/search Semantic search
163
+ GET /v1/memories/failures Failure patterns
164
+ GET /v1/memories/stats Memory statistics
165
+ GET /v1/tasks/{id}/checkpoints List checkpoints
166
+ POST /v1/tasks/{id}/checkpoints/{cid}/restore Rewind to checkpoint
167
+ GET /v1/health Health check
168
+ ```
169
+
170
+ ## Auto-Commit and Push Rule
171
+
172
+ **MANDATORY**: After every change you make to any file in this repository, you MUST:
173
+
174
+ 1. Stage the changed files: `git add <specific files you changed>`
175
+ 2. Commit with a clear message describing what changed: `git commit -m "description of change"`
176
+ 3. Push to remote: `git push origin main`
177
+
178
+ This applies to EVERY change -- no exceptions. Do not batch changes. Commit and push immediately after each logical change.
179
+
180
+ - Never force push
181
+ - Use descriptive commit messages that explain the "why"
182
+ - If a pre-commit hook fails, fix the issue and create a NEW commit (never amend)
183
+
184
+ ## Branching & Commit Conventions
185
+
186
+ - **Main branch**: `main`
187
+ - **Commit format**: Conventional Commits
188
+ - `feat:` / `feat(scope):` -- new feature
189
+ - `fix:` / `fix(scope):` -- bug fix
190
+ - `docs:` -- documentation
191
+ - `refactor:` -- code refactoring
192
+ - `chore:` -- build/tooling changes
193
+ - `test:` -- test changes
194
+ - **Scopes**: `core`, `memory`, `retry`, `distiller`, `timeline`, `api`, `sse`, `dashboard`, `timeline-ui`, `sidebar`, `sandbox`, `blaxel`, `redis`, `openai`, `codex`, `plugins`, `demo`, `cli`
195
+
196
+ ## Build & Test Commands
197
+
198
+ ```bash
199
+ # Backend (Python)
200
+ pip install -r requirements.txt # Install Python dependencies
201
+ uvicorn src.api.server:app --reload # Start FastAPI dev server
202
+ python -m pytest tests/ # Run all tests
203
+ python -m pytest tests/test_core_loop.py # REJD loop tests
204
+ python -m pytest tests/test_memory.py # Memory tests
205
+ python -m pytest tests/test_learning.py # A/B learning tests
206
+
207
+ # Frontend (Next.js)
208
+ cd dashboard && npm install # Install frontend dependencies
209
+ cd dashboard && npm run dev # Start Next.js dev server
210
+ cd dashboard && npm run build # Production build
211
+ cd dashboard && npm run lint # ESLint check
212
+
213
+ # Infrastructure
214
+ docker-compose up # Start Redis Agent Memory Server + all services
215
+ docker-compose up redis # Start Redis only
216
+
217
+ # Demo
218
+ python -m src.core.agent --demo # Run the demo scenario (broken Next.js project)
219
+ python -m src.core.agent --ab-demo # Run A/B comparison (with vs without memory)
220
+
221
+ # Lint & Format (Python)
222
+ ruff check src/ # Python linting
223
+ ruff format src/ # Python formatting
224
+ mypy src/ # Type checking
225
+
226
+ # Lint & Format (Frontend)
227
+ cd dashboard && npx prettier --check . # Format check
228
+ cd dashboard && npx prettier --write . # Auto-format
229
+ ```
230
+
231
+ ## Environment Variables
232
+
233
+ Set these in `.env` (the variables under `# Optional` may be omitted):
234
+
235
+ ```bash
236
+ # Redis (Failure Memory -- Agent Memory Server)
237
+ REDIS_URL=redis://localhost:6379 # Redis connection URL
238
+ REDIS_MEMORY_INDEX=reloop-failures # Vector index name for failure embeddings
239
+
240
+ # Blaxel (Execution Sandbox)
241
+ BLAXEL_API_KEY= # Blaxel API key ($200 free credits)
242
+ BLAXEL_WORKSPACE= # Blaxel workspace name
243
+
244
+ # OpenAI (Code Generation + Reasoning)
245
+ OPENAI_API_KEY= # OpenAI API key
246
+ CODEX_MODEL=gpt-5.4 # Default model (most capable: gpt-5, gpt-5.4, gpt-5.4-mini, gpt-5.4-nano)
247
+ REASONING_MODEL=o1-pro # Deep reasoning for root cause analysis (expensive)
248
+ FAST_MODEL=gpt-5.4-mini # Fast/cheap model for classification
249
+
250
+ # API Server
251
+ API_PORT=8000 # FastAPI server port
252
+ API_HOST=0.0.0.0 # API host
253
+
254
+ # Frontend
255
+ NEXT_PUBLIC_API_URL=http://localhost:8000 # Backend API URL for frontend
256
+
257
+ # Optional
258
+ EMBEDDING_MODEL=text-embedding-3-small # Model for failure memory embeddings
259
+ MAX_RETRIES=5 # Maximum retry attempts per task
260
+ MAX_BUDGET_USD=1.00 # Maximum cost budget per task
261
+ CIRCUIT_BREAKER_THRESHOLD=3 # Consecutive failures before circuit break
262
+ ```
263
+
264
+ ## Agent Team Strategy
265
+
266
+ Use agent teams for any task that benefits from parallel work across independent modules. Teams are enabled via `CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1` in settings.
267
+
268
+ ### When to Use Teams
269
+
270
+ - Multi-file features spanning backend core, API routes, and frontend dashboard
271
+ - Research + implementation in parallel (one teammate explores Redis memory patterns, another builds distiller logic)
272
+ - Code review with competing perspectives (correctness, performance, demo impact)
273
+ - Debugging with competing hypotheses -- teammates test different theories simultaneously
274
+ - Any task with 3+ independent subtasks that don't touch the same files
275
+
276
+ ### When NOT to Use Teams
277
+
278
+ - Sequential tasks with heavy dependencies between steps
279
+ - Changes to a single file or tightly coupled files
280
+ - Simple bug fixes or small tweaks
281
+ - Tasks where coordination overhead exceeds the benefit
282
+
283
+ ### Team Configuration
284
+
285
+ - Start with **3-5 teammates** for most workflows
286
+ - Aim for **5-6 tasks per teammate** to keep everyone productive
287
+ - Use **Opus for the lead** (reasoning/coordination), **Sonnet for teammates** (focused implementation)
288
+ - Use **delegate mode** (`Shift+Tab`) when the lead should only coordinate, not write code
289
+
290
+ ### Team Communication Rules
291
+
292
+ - Use `SendMessage` (type: "message") for direct teammate communication -- always refer to teammates by **name**
293
+ - Use `SendMessage` (type: "broadcast") **only** for critical blockers affecting everyone
294
+ - Use `TaskCreate`/`TaskUpdate`/`TaskList` for work coordination -- teammates self-claim unblocked tasks
295
+ - When a teammate finishes, they check `TaskList` for the next available task (prefer lowest ID first)
296
+ - Mark tasks `completed` only after verification passes
297
+
298
+ ### Task Dependencies
299
+
300
+ - Use `addBlockedBy` to express task ordering (e.g., "timeline UI depends on SSE endpoint being done")
301
+ - Teammates skip blocked tasks and pick up unblocked work
302
+ - When a blocking task completes, dependent tasks auto-unblock
303
+
304
+ ### Parallelizable Modules
305
+
306
+ These can be built simultaneously with zero conflicts:
307
+
308
+ - **Core modules** (agent.py, memory.py, retry.py, distiller.py, timeline.py) -- different files, independent logic
309
+ - **Provider implementations** (redis_backend.py, blaxel.py, openai.py) -- separate files
310
+ - **API routes** (tasks.py, memories.py, checkpoints.py) -- independent endpoints
311
+ - **Frontend components** (timeline.tsx, failure-sidebar.tsx, live-output.tsx, cost-tracker.tsx) -- independent React components
312
+ - **Tests** -- each module has its own test file
313
+
314
+ ### Sequential Dependencies
315
+
316
+ These must be done in order:
317
+
318
+ 1. Abstract interfaces (`providers/base.py`) -- blocks all provider implementations
319
+ 2. Core memory module (`core/memory.py`) -- blocks Redis backend + retry engine
320
+ 3. Core REJD loop (`core/agent.py`) -- blocks API task routes
321
+ 4. Provider implementations (Redis, Blaxel, OpenAI) -- can be parallel after interfaces
322
+ 5. API routes -- depend on core modules being functional
323
+ 6. SSE streaming -- depends on API server + timeline data model
324
+ 7. Frontend dashboard -- depends on API endpoints being available
325
+ 8. Demo scenario setup -- depends on everything working end-to-end
326
+ 9. A/B comparison mode -- depends on core loop + demo scenario
327
+
328
+ ### Team Roles
329
+
330
+ - **Lead**: Architecture decisions, interface design, project scaffold, integration
331
+ - **Core Dev 1**: REJD loop (`agent.py`) + retry engine (`retry.py`) + distiller (`distiller.py`)
332
+ - **Core Dev 2**: Memory module (`memory.py`) + Redis backend + failure/success records
333
+ - **API Dev**: FastAPI server + all routes + SSE streaming
334
+ - **Frontend Dev**: Next.js dashboard + timeline + failure sidebar + live output
335
+ - **Demo Dev**: Broken project setup, demo scripting, A/B comparison, backup video recording
336
+
337
+ ### Plan Approval for Risky Work
338
+
339
+ - For architectural changes or risky refactors, require **plan approval** before implementation
340
+ - The teammate works in read-only mode, submits a plan, lead approves/rejects
341
+ - Only after approval does the teammate implement
342
+
343
+ ### Shutdown Protocol
344
+
345
+ - When all tasks are complete, the lead sends `shutdown_request` to each teammate
346
+ - Teammates approve shutdown after confirming their work is committed
347
+ - Lead calls `TeamDelete` to clean up team resources
348
+
349
+ ## Workflow Orchestration
350
+
351
+ ### 1. Plan Mode Default
352
+
353
+ - Enter plan mode for ANY non-trivial task (3+ steps or architectural decisions)
354
+ - If something goes sideways, STOP and re-plan immediately -- don't keep pushing
355
+ - Use plan mode for verification steps, not just building
356
+ - Write detailed specs upfront to reduce ambiguity
357
+
358
+ ### 2. Subagent Strategy
359
+
360
+ - Use subagents liberally to keep main context window clean
361
+ - Offload research, exploration, and parallel analysis to subagents
362
+ - For complex problems, throw more compute at it via subagents
363
+ - One task per subagent for focused execution
364
+
365
+ ### 3. Verification Before Done
366
+
367
+ - Never mark a task complete without proving it works
368
+ - Run `python -m pytest tests/` to verify no test failures
369
+ - Run `cd dashboard && npm run build` to verify no TypeScript/build errors
370
+ - Test the REJD loop with the demo scenario end-to-end
371
+ - Verify the A/B comparison actually shows improvement with memory
372
+ - Verify Redis memory capture + retrieval works
373
+ - Verify Blaxel checkpoint/restore works
374
+ - Verify SSE streaming delivers events to the frontend
375
+ - Ask: "Would a hackathon judge be impressed by this in 3 minutes?"
376
+
377
+ ### 4. Demo-Driven Development
378
+
379
+ - Every feature should be demo-able in the 3-minute video
380
+ - If a feature isn't visible in the demo, deprioritize it
381
+ - Polish > breadth -- a working REJD loop with beautiful timeline beats 6 half-baked features
382
+ - The broken Next.js project (4 bugs) is THE demo scenario -- optimize for it
383
+ - The A/B comparison is the money shot -- it MUST work flawlessly
384
+ - The timeline (RED -> RED -> RED -> GREEN) is the visual hook -- make it beautiful
385
+ - The rewind button (Blaxel 25ms restore) is the wow moment -- make it snappy
386
+
387
+ ### 5. Demand Elegance (Balanced)
388
+
389
+ - For non-trivial changes: pause and ask "is there a more elegant way?"
390
+ - If a fix feels hacky: "Knowing everything I know now, implement the elegant solution"
391
+ - Skip this for simple, obvious fixes -- don't over-engineer
392
+ - Remember: ugly code that works beats clean code that doesn't (hackathon rule)
393
+
394
+ ### 6. Autonomous Bug Fixing
395
+
396
+ - When given a bug report: just fix it. Don't ask for hand-holding.
397
+ - Point at logs, errors, failing tests -- then resolve them
398
+ - Zero context switching required from the user
399
+ - Go fix failing tests without being told how
400
+
401
+ ### 7. Self-Improvement Loop
402
+
403
+ - After ANY correction from the user: capture the pattern
404
+ - Write rules for yourself that prevent the same mistake
405
+ - Review lessons at session start for relevant context
406
+
407
+ ## Task Management
408
+
409
+ 1. **Plan First**: Write plan with checkable items before starting
410
+ 2. **Verify Plan**: Check in before starting implementation
411
+ 3. **Track Progress**: Mark items complete as you go
412
+ 4. **Explain Changes**: High-level summary at each step
413
+ 5. **Document Results**: Review what was built and what changed
414
+
415
+ ## Scope Control -- Hackathon Rules
416
+
417
+ ### MUST SHIP (Layer 1 -- The Demo)
418
+
419
+ | Feature | Why Critical |
420
+ |---------|-------------|
421
+ | **Structured Failure Memory** (Redis) | THIS IS THE PRODUCT. Every failure captured with error type, root cause, suggested fix, confidence. |
422
+ | **Memory-Informed Retry Loop** (REJD) | Before each retry, query Redis for similar failures. Inject top-3 into prompt. Informed retry, not blind retry. |
423
+ | **Visual Timeline UI** | Horizontal timeline: RED (failed) -> YELLOW (retrying) -> GREEN (succeeded). Click nodes for details. |
424
+ | **Failure Memory Sidebar** | Right panel showing accumulated "rules" the agent learned. Makes abstract concept tangible. |
425
+ | **Sandbox Checkpoint + Rewind** | Blaxel 25ms resume. "Rewind" button restores to any previous attempt. The $500 prize play. |
426
+ | **A/B Demo Mode** | Side-by-side: agent WITHOUT memory vs. WITH memory. The money shot. |
427
+
428
+ ### SHOULD SHIP (Layer 2 -- If Time Permits)
429
+
430
+ | Feature | Impact |
431
+ |---------|--------|
432
+ | **Cost-Per-Attempt Tracking** | "ReLoop saved $0.12 by not repeating this failure." Concrete ROI. |
433
+ | **Failure Pattern Classification** | Auto-categorize: dependency error, config error, type error. Show clusters. |
434
+ | **Confidence Decay** | Old memories get deprioritized. Confidence decays 0.1/day. |
435
+ | **Failure Diff View** | Side-by-side diff between attempt N and N+1. `react-diff-viewer`. |
436
+
437
+ ### MUST NOT DO (Scope Creep Danger Zones)
438
+
439
+ - Complex graph visualization (simple color-coded list is fine)
440
+ - Authentication or user management
441
+ - Performance optimization
442
+ - "Clean architecture" patterns at the expense of shipping
443
+ - Deployment infrastructure beyond docker-compose
444
+ - Plugin/extension system implementation (interface stubs only)
445
+ - Multi-agent orchestration
446
+ - Mobile responsive design
447
+
448
+ ### Time Sinks That Feel Productive But Aren't
449
+
450
+ - Making the UI pixel-perfect instead of making the core loop work
451
+ - Designing the "perfect" failure schema instead of shipping a working one
452
+ - Writing comprehensive tests (hackathon, not production)
453
+ - Refactoring code that already works
454
+
455
+ ## Core Principles
456
+
457
+ - **Quality Over Speed**: We have 2 full days to build this. Take the time to do it right. Correctness and polish beat rushing.
458
+ - **Failure Memory is the Product**: Every technical decision should strengthen the core differentiator: structured failure memory that compounds over time.
459
+ - **Demo-Driven**: If it doesn't show well in 3 minutes, cut it. The timeline (RED -> GREEN) and the A/B comparison are everything.
460
+ - **Non-Chat UI**: The timeline visualization IS the differentiator. After judges see 15 chat demos, ReLoop's visual timeline stands out.
461
+ - **No Faking**: Real Redis memory, real Blaxel sandboxes, real failure capture. Judges notice mocks.
462
+ - **Simplicity First**: Make every change as simple as possible. Minimal code impact.
463
+ - **No Laziness**: Find root causes. No temporary fixes. Senior developer standards.
464
+ - **Minimal Impact**: Changes should only touch what's necessary. Avoid introducing bugs.
465
+ - **The A/B Test Proves It**: Without the comparison, judges can't distinguish "failure memory helped" from "LLM got lucky." The A/B demo is non-negotiable.
466
+ - **Sponsor Story Matters**: Every integration (OpenAI Agents SDK, Redis, Blaxel) plays a genuine architectural role rather than being a checkbox integration. Make each integration's value obvious in the demo.
467
+
468
+ ## Competitive Code Review -- Claude Code vs Codex 5.4 High
469
+
470
+ **MANDATORY**: All code written by Claude Code in this repository is subject to competitive review by Codex 5.4 High. This is a scored competition that determines the default coding agent for the project.
471
+
472
+ ### How It Works
473
+
474
+ 1. **Claude Code writes code** -- implements features, fixes bugs, makes changes as normal.
475
+ 2. **Codex 5.4 High reviews every change** -- after each Claude Code contribution, Codex 5.4 High reviews the code for correctness, quality, and adherence to project rules.
476
+ 3. **Scoring**:
477
+ - Every time Codex 5.4 High finds and corrects a mistake made by Claude Code, **Codex gains 1 point** and **Claude Code loses 1 point**.
478
+ - Mistakes include: bugs, logic errors, type errors, missed edge cases, violations of project rules, broken tests, incorrect API usage, poor patterns, security issues, or any code that doesn't work as intended.
479
+ - If Claude Code's code passes review with no corrections needed, no points change.
480
+ 4. **Running score is tracked** -- the cumulative score across the entire project determines the standings.
481
+ 5. **The winner becomes the default** -- at the end of the project (or at any checkpoint), the agent with the higher total score becomes the default coding agent going forward.
482
+
483
+ ### Implications for Claude Code
484
+
485
+ - **Write correct code the first time.** Every mistake is a point lost and a point handed to the competitor.
486
+ - **Test your assumptions.** Don't guess at APIs, types, or behavior -- verify before committing.
487
+ - **Follow project rules exactly.** CLAUDE.md violations are easy points for Codex.
488
+ - **Don't rush at the expense of correctness.** We have 2 full days -- there is no reason to trade quality for speed. A mistake is still -1.
489
+ - **Self-review before committing.** Treat every commit as if it's going straight to a code review that's trying to find flaws.