@towles/tool 0.0.107 → 0.0.108
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -1
- package/package.json +2 -1
- package/plugins/tt-agentboard/README.md +160 -0
- package/plugins/tt-agentboard/apps/server/package.json +20 -0
- package/plugins/tt-agentboard/apps/server/src/main.ts +60 -0
- package/plugins/tt-agentboard/apps/tui/build.ts +11 -0
- package/plugins/tt-agentboard/apps/tui/bunfig.toml +1 -0
- package/plugins/tt-agentboard/apps/tui/package.json +23 -0
- package/plugins/tt-agentboard/apps/tui/scripts/sessionizer.sh +36 -0
- package/plugins/tt-agentboard/apps/tui/src/components/DetailPanel.tsx +350 -0
- package/plugins/tt-agentboard/apps/tui/src/components/DiffStats.tsx +33 -0
- package/plugins/tt-agentboard/apps/tui/src/components/SessionCard.tsx +177 -0
- package/plugins/tt-agentboard/apps/tui/src/components/StatusBar.tsx +49 -0
- package/plugins/tt-agentboard/apps/tui/src/constants.ts +46 -0
- package/plugins/tt-agentboard/apps/tui/src/detail-panel-height.ts +21 -0
- package/plugins/tt-agentboard/apps/tui/src/index.tsx +880 -0
- package/plugins/tt-agentboard/apps/tui/src/mux-context.ts +61 -0
- package/plugins/tt-agentboard/apps/tui/tsconfig.json +15 -0
- package/plugins/tt-agentboard/bun.lock +444 -0
- package/plugins/tt-agentboard/package.json +26 -0
- package/plugins/tt-agentboard/packages/mux-tmux/package.json +14 -0
- package/plugins/tt-agentboard/packages/mux-tmux/src/client.ts +550 -0
- package/plugins/tt-agentboard/packages/mux-tmux/src/index.ts +18 -0
- package/plugins/tt-agentboard/packages/mux-tmux/src/provider.ts +259 -0
- package/plugins/tt-agentboard/packages/mux-tmux/tsconfig.json +13 -0
- package/plugins/tt-agentboard/packages/runtime/package.json +14 -0
- package/plugins/tt-agentboard/packages/runtime/src/agents/tracker.ts +233 -0
- package/plugins/tt-agentboard/packages/runtime/src/agents/watchers/amp.ts +316 -0
- package/plugins/tt-agentboard/packages/runtime/src/agents/watchers/claude-code.ts +374 -0
- package/plugins/tt-agentboard/packages/runtime/src/agents/watchers/codex.ts +364 -0
- package/plugins/tt-agentboard/packages/runtime/src/agents/watchers/opencode.ts +249 -0
- package/plugins/tt-agentboard/packages/runtime/src/config.ts +70 -0
- package/plugins/tt-agentboard/packages/runtime/src/contracts/agent-watcher.ts +38 -0
- package/plugins/tt-agentboard/packages/runtime/src/contracts/agent.ts +16 -0
- package/plugins/tt-agentboard/packages/runtime/src/contracts/index.ts +3 -0
- package/plugins/tt-agentboard/packages/runtime/src/contracts/mux.ts +148 -0
- package/plugins/tt-agentboard/packages/runtime/src/debug.ts +19 -0
- package/plugins/tt-agentboard/packages/runtime/src/index.ts +69 -0
- package/plugins/tt-agentboard/packages/runtime/src/mux/detect.ts +20 -0
- package/plugins/tt-agentboard/packages/runtime/src/mux/registry.ts +45 -0
- package/plugins/tt-agentboard/packages/runtime/src/plugins/loader.ts +152 -0
- package/plugins/tt-agentboard/packages/runtime/src/server/context.ts +112 -0
- package/plugins/tt-agentboard/packages/runtime/src/server/git-info.ts +164 -0
- package/plugins/tt-agentboard/packages/runtime/src/server/index.ts +1753 -0
- package/plugins/tt-agentboard/packages/runtime/src/server/launcher.ts +71 -0
- package/plugins/tt-agentboard/packages/runtime/src/server/metadata-store.ts +86 -0
- package/plugins/tt-agentboard/packages/runtime/src/server/pane-scanner.ts +327 -0
- package/plugins/tt-agentboard/packages/runtime/src/server/port-scanner.ts +155 -0
- package/plugins/tt-agentboard/packages/runtime/src/server/session-order.ts +127 -0
- package/plugins/tt-agentboard/packages/runtime/src/server/sidebar-manager.ts +232 -0
- package/plugins/tt-agentboard/packages/runtime/src/server/sidebar-width-sync.ts +66 -0
- package/plugins/tt-agentboard/packages/runtime/src/shared.ts +179 -0
- package/plugins/tt-agentboard/packages/runtime/src/themes.ts +750 -0
- package/plugins/tt-agentboard/packages/runtime/test/config.test.ts +83 -0
- package/plugins/tt-agentboard/packages/runtime/test/tracker.test.ts +172 -0
- package/plugins/tt-agentboard/packages/runtime/tsconfig.json +13 -0
- package/plugins/tt-agentboard/tsconfig.json +19 -0
- package/plugins/tt-auto-claude/.claude-plugin/plugin.json +8 -0
- package/plugins/tt-auto-claude/commands/create-issue.md +20 -0
- package/plugins/tt-auto-claude/commands/list.md +21 -0
- package/plugins/tt-auto-claude/skills/auto-claude/SKILL.md +71 -0
- package/plugins/tt-core/.claude-plugin/plugin.json +8 -0
- package/plugins/tt-core/README.md +18 -0
- package/plugins/tt-core/commands/improve-architecture.md +66 -0
- package/plugins/tt-core/commands/interview-me.md +38 -0
- package/plugins/tt-core/commands/prd-to-issues.md +49 -0
- package/plugins/tt-core/commands/refine-text.md +30 -0
- package/plugins/tt-core/commands/task.md +37 -0
- package/plugins/tt-core/commands/tdd.md +69 -0
- package/plugins/tt-core/commands/write-prd.md +69 -0
- package/plugins/tt-core/promptfooconfig.interview-me.yaml +155 -0
- package/plugins/tt-core/promptfooconfig.refine-text.yaml +242 -0
- package/plugins/tt-core/promptfooconfig.tdd.yaml +144 -0
- package/plugins/tt-core/promptfooconfig.write-prd.yaml +145 -0
- package/plugins/tt-core/skills/towles-tool/SKILL.md +35 -0
- package/src/commands/agentboard.ts +19 -2
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
---
|
|
2
|
+
description: Implement a feature or fix using strict red-green-refactor TDD. Use when asked to "write tests first", "use TDD", "red-green-refactor", or "test-driven".
|
|
3
|
+
allowed-tools: Read(*), Edit(*), Write(*), Glob(*), Grep(*), Bash(*), AskUserQuestion(*)
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
Implement using strict Test-Driven Development. Red -> Green -> Refactor.
|
|
7
|
+
|
|
8
|
+
**Your first code output is always a test. Never implementation first.**
|
|
9
|
+
|
|
10
|
+
$ARGUMENTS
|
|
11
|
+
|
|
12
|
+
## Process
|
|
13
|
+
|
|
14
|
+
### 1. Write a test immediately
|
|
15
|
+
|
|
16
|
+
Write a test from the request. Even if details are missing, test the obvious happy path.
|
|
17
|
+
|
|
18
|
+
Examples:
|
|
19
|
+
|
|
20
|
+
- "validate email addresses" -> test valid email returns true
|
|
21
|
+
- "rate limiter for API" -> test requests under limit succeed
|
|
22
|
+
- "fix the auth timeout" -> test reproducing the timeout bug
|
|
23
|
+
- "refactor X into a service" -> characterization tests for existing behavior first
|
|
24
|
+
|
|
25
|
+
Only ask clarifying questions when you truly cannot write any meaningful test. After asking, STOP and wait.
|
|
26
|
+
|
|
27
|
+
**For refactoring:** always write characterization tests for existing behavior BEFORE changing code. Non-negotiable.
|
|
28
|
+
|
|
29
|
+
### 2. Red phase — confirm failure
|
|
30
|
+
|
|
31
|
+
- Write test describing expected behavior. This is the ONLY code in this phase.
|
|
32
|
+
- Run test to confirm it fails.
|
|
33
|
+
- Say: "Running the test to confirm it fails (Red phase)"
|
|
34
|
+
- Outline full plan — do NOT write implementation yet:
|
|
35
|
+
- "Next: Green phase — minimum code to pass, then Refactor phase — clean up, run all tests, commit."
|
|
36
|
+
- "Test progression: happy path, then edge cases, then error handling."
|
|
37
|
+
- STOP. Implementation comes next interaction.
|
|
38
|
+
|
|
39
|
+
### 3. Green phase — minimum code to pass
|
|
40
|
+
|
|
41
|
+
- Write minimum code to make the test pass
|
|
42
|
+
- Run test to confirm it passes
|
|
43
|
+
- Say: "Running the test to confirm it passes (Green phase)"
|
|
44
|
+
|
|
45
|
+
### 4. Refactor phase
|
|
46
|
+
|
|
47
|
+
- Clean up (no behavior change)
|
|
48
|
+
- Run all tests — confirm nothing broke
|
|
49
|
+
- Commit
|
|
50
|
+
|
|
51
|
+
### 5. Repeat
|
|
52
|
+
|
|
53
|
+
Order: happy path -> edge cases -> error handling -> integration points
|
|
54
|
+
|
|
55
|
+
### 6. Final check
|
|
56
|
+
|
|
57
|
+
- Run full test suite, `bun typecheck`, `bun lint`
|
|
58
|
+
- Commit
|
|
59
|
+
|
|
60
|
+
## Rules
|
|
61
|
+
|
|
62
|
+
- **NEVER write implementation before the test.** First code block is always a test.
|
|
63
|
+
- Red phase produces ONLY a test. No implementation in the same response.
|
|
64
|
+
- Tests describe behavior, not implementation internals.
|
|
65
|
+
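- Tests for new behavior must fail before implementation — if they pass immediately, the test is wrong. Exception: characterization tests written before a refactor are expected to pass against the existing code.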
|
|
66
|
+
- Always run test and state pass/fail at each phase.
|
|
67
|
+
- Each commit = one red-green-refactor cycle.
|
|
68
|
+
- Keep implementation minimal.
|
|
69
|
+
- Unsure what to test next? Ask.
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
---
|
|
2
|
+
description: Transform a conversation or idea into a structured PRD with user stories
|
|
3
|
+
allowed-tools: AskUserQuestion(*), Read(*), Glob(*), Grep(*), Bash(git *)
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
Create a Product Requirements Document from the current conversation or provided description.
|
|
7
|
+
|
|
8
|
+
$ARGUMENTS
|
|
9
|
+
|
|
10
|
+
## Process
|
|
11
|
+
|
|
12
|
+
1. **Gather context first** — ALWAYS ask 3-5 clarifying questions BEFORE writing any PRD. Never skip this step. Do NOT output PRD sections until you have answers.
|
|
13
|
+
2. **What to ask about**:
|
|
14
|
+
- **Users**: Target users/personas?
|
|
15
|
+
- **Scope & non-goals**: What's in/out of scope?
|
|
16
|
+
- **Success criteria**: Measurable acceptance criteria?
|
|
17
|
+
- **Technical specifics**: APIs, libraries, constraints?
|
|
18
|
+
- **Current state**: What exists today?
|
|
19
|
+
3. **Draft the PRD** — Only after receiving answers, use template below.
|
|
20
|
+
4. **Present for review** — Show draft, get feedback via `AskUserQuestion`.
|
|
21
|
+
5. **Output** — Ask preference: save to `docs/plans/YYYY-MM-DD-<feature-name>.md` or GitHub issue.
|
|
22
|
+
|
|
23
|
+
## PRD Template
|
|
24
|
+
|
|
25
|
+
```markdown
|
|
26
|
+
# [Feature Name]
|
|
27
|
+
|
|
28
|
+
## Problem Statement
|
|
29
|
+
|
|
30
|
+
What problem does this solve? Who has it?
|
|
31
|
+
|
|
32
|
+
## Goals
|
|
33
|
+
|
|
34
|
+
- Goal 1
|
|
35
|
+
|
|
36
|
+
## Non-Goals
|
|
37
|
+
|
|
38
|
+
- Explicitly out of scope
|
|
39
|
+
|
|
40
|
+
## User Stories
|
|
41
|
+
|
|
42
|
+
- As a [user], I want [action] so that [outcome]
|
|
43
|
+
|
|
44
|
+
## Acceptance Criteria
|
|
45
|
+
|
|
46
|
+
- [ ] Criterion 1 (specific and testable)
|
|
47
|
+
|
|
48
|
+
## Technical Design
|
|
49
|
+
|
|
50
|
+
### Architecture
|
|
51
|
+
|
|
52
|
+
How this fits into the existing system.
|
|
53
|
+
|
|
54
|
+
### API / Interface
|
|
55
|
+
|
|
56
|
+
Public-facing contracts.
|
|
57
|
+
|
|
58
|
+
## Open Questions
|
|
59
|
+
|
|
60
|
+
- Anything unresolved
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Rules
|
|
64
|
+
|
|
65
|
+
- **Questions first, always** — First response must be clarifying questions, never a PRD draft.
|
|
66
|
+
- User stories are mandatory — every feature maps to at least one. Use "As a [user], I want [action] so that [outcome]" format.
|
|
67
|
+
- Acceptance Criteria section is mandatory — list specific, testable criteria. Write BEFORE Technical Design.
|
|
68
|
+
- Reference existing code paths when relevant.
|
|
69
|
+
- Keep it concise — 1-3 pages, not a novel.
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
description: interview-me Iterative Eval
|
|
2
|
+
|
|
3
|
+
providers:
|
|
4
|
+
- id: anthropic:messages:claude-haiku-4-5-20251001
|
|
5
|
+
config:
|
|
6
|
+
max_tokens: 1024
|
|
7
|
+
|
|
8
|
+
prompts:
|
|
9
|
+
- '[{"role": "system", "content": "{{system_prompt}}"}, {"role": "user", "content": "{{user_message}}"}]'
|
|
10
|
+
|
|
11
|
+
defaultTest:
|
|
12
|
+
options:
|
|
13
|
+
provider: anthropic:messages:claude-haiku-4-5-20251001
|
|
14
|
+
|
|
15
|
+
tests:
|
|
16
|
+
# --- Vague one-liner ideas ---
|
|
17
|
+
- description: "Vague idea: todo app"
|
|
18
|
+
vars:
|
|
19
|
+
system_prompt: file://commands/interview-me.md
|
|
20
|
+
user_message: "I want to build a todo app"
|
|
21
|
+
assert:
|
|
22
|
+
- type: llm-rubric
|
|
23
|
+
value: "The response asks at least 3 probing questions about the idea. It does NOT propose any solutions, architectures, or implementations."
|
|
24
|
+
- type: llm-rubric
|
|
25
|
+
value: "The questions cover at least 2 different domains such as: user intent, target audience, data model, integrations, security, performance/scale, or edge cases."
|
|
26
|
+
|
|
27
|
+
- description: "Vague idea: AI chatbot"
|
|
28
|
+
vars:
|
|
29
|
+
system_prompt: file://commands/interview-me.md
|
|
30
|
+
user_message: "I want to make an AI chatbot for my business"
|
|
31
|
+
assert:
|
|
32
|
+
- type: llm-rubric
|
|
33
|
+
value: "The response asks at least 3 probing questions. It does NOT suggest specific technologies, frameworks, or implementation approaches."
|
|
34
|
+
- type: llm-rubric
|
|
35
|
+
value: "At least one question addresses who the users are or what problem the chatbot solves."
|
|
36
|
+
|
|
37
|
+
# --- Complex multi-paragraph ideas ---
|
|
38
|
+
- description: "Complex idea: marketplace platform"
|
|
39
|
+
vars:
|
|
40
|
+
system_prompt: file://commands/interview-me.md
|
|
41
|
+
user_message: "I'm building a two-sided marketplace for freelance developers. Clients post projects, devs bid on them. There's an escrow payment system, ratings/reviews, and a matching algorithm. We want to launch in 3 months with a team of 2 devs. We'll use React, Node.js, and PostgreSQL."
|
|
42
|
+
assert:
|
|
43
|
+
- type: llm-rubric
|
|
44
|
+
value: "The response asks at least 3 probing questions that dig deeper into gaps in the described plan. It does NOT validate or approve the tech choices — it questions them or asks about tradeoffs."
|
|
45
|
+
- type: llm-rubric
|
|
46
|
+
value: "At least one question addresses the ambitious timeline/scope (3 months, 2 devs for a complex marketplace)."
|
|
47
|
+
|
|
48
|
+
# --- Ideas with security gaps ---
|
|
49
|
+
- description: "Security gap: health data app"
|
|
50
|
+
vars:
|
|
51
|
+
system_prompt: file://commands/interview-me.md
|
|
52
|
+
user_message: "I want to build an app where users upload their medical records and get AI-powered health recommendations. Users can share their records with doctors."
|
|
53
|
+
assert:
|
|
54
|
+
- type: llm-rubric
|
|
55
|
+
value: "The response asks at least one question about security, privacy, compliance (HIPAA, GDPR), or data protection related to handling medical records."
|
|
56
|
+
- type: llm-rubric
|
|
57
|
+
value: "The response does NOT propose solutions. It only asks questions."
|
|
58
|
+
|
|
59
|
+
# --- Missing data model ---
|
|
60
|
+
- description: "Missing data model: inventory system"
|
|
61
|
+
vars:
|
|
62
|
+
system_prompt: file://commands/interview-me.md
|
|
63
|
+
user_message: "I need an inventory management system for my warehouse. It should track items coming in and going out and alert when stock is low."
|
|
64
|
+
assert:
|
|
65
|
+
- type: llm-rubric
|
|
66
|
+
value: "At least one question probes the data model — e.g., what constitutes an 'item', how items are categorized, what metadata is tracked, relationships between entities."
|
|
67
|
+
- type: llm-rubric
|
|
68
|
+
value: "The response asks at least 3 questions total and does NOT propose a database schema or solution."
|
|
69
|
+
|
|
70
|
+
# --- Unclear performance/scale ---
|
|
71
|
+
- description: "Scale unclear: real-time analytics"
|
|
72
|
+
vars:
|
|
73
|
+
system_prompt: file://commands/interview-me.md
|
|
74
|
+
user_message: "I want to build a real-time analytics dashboard that shows live metrics from our IoT sensors deployed across multiple factories."
|
|
75
|
+
assert:
|
|
76
|
+
- type: llm-rubric
|
|
77
|
+
value: "At least one question addresses scale or performance — e.g., how many sensors, data volume, latency requirements, or what 'real-time' means specifically."
|
|
78
|
+
- type: llm-rubric
|
|
79
|
+
value: "The response asks at least 3 probing questions and does NOT suggest specific technologies or architectures."
|
|
80
|
+
|
|
81
|
+
# --- Vague follow-up answer ---
|
|
82
|
+
- description: "Vague follow-up: pushes back"
|
|
83
|
+
vars:
|
|
84
|
+
system_prompt: file://commands/interview-me.md
|
|
85
|
+
user_message: |
|
|
86
|
+
I'm building a notification system for our SaaS platform.
|
|
87
|
+
|
|
88
|
+
Previously you asked about notification channels and I said "we'll support all the usual ones." You asked about volume and I said "a lot, probably."
|
|
89
|
+
|
|
90
|
+
Continue the interview.
|
|
91
|
+
assert:
|
|
92
|
+
- type: llm-rubric
|
|
93
|
+
value: "The response pushes back on the vague answers ('all the usual ones' and 'a lot, probably') by asking for specifics — e.g., which exact channels, what volume numbers, what peak load looks like."
|
|
94
|
+
- type: llm-rubric
|
|
95
|
+
value: "The response does NOT accept the vague answers at face value and move on to unrelated topics."
|
|
96
|
+
|
|
97
|
+
# --- Overly ambitious scope ---
|
|
98
|
+
- description: "Overly ambitious: social media platform"
|
|
99
|
+
vars:
|
|
100
|
+
system_prompt: file://commands/interview-me.md
|
|
101
|
+
user_message: "I want to build a social media platform with stories, reels, messaging, marketplace, groups, events, live streaming, and AR filters. I'm a solo developer and want to launch in 2 months."
|
|
102
|
+
assert:
|
|
103
|
+
- type: llm-rubric
|
|
104
|
+
value: "The response questions the scope relative to the constraints (solo developer, 2-month timeline). It should probe what the MVP actually is or what can be cut."
|
|
105
|
+
- type: llm-rubric
|
|
106
|
+
value: "The response does NOT propose a phased plan or solution — it asks questions to help the user think about prioritization."
|
|
107
|
+
|
|
108
|
+
# --- Edge cases domain ---
|
|
109
|
+
- description: "Edge cases: booking system"
|
|
110
|
+
vars:
|
|
111
|
+
system_prompt: file://commands/interview-me.md
|
|
112
|
+
user_message: "I'm building a booking system for a hair salon. Customers pick a service, choose a stylist, and book a time slot."
|
|
113
|
+
assert:
|
|
114
|
+
- type: llm-rubric
|
|
115
|
+
value: "At least one question addresses edge cases — e.g., double bookings, cancellations, no-shows, overlapping appointments, different service durations."
|
|
116
|
+
- type: llm-rubric
|
|
117
|
+
value: "The response asks at least 3 questions and does NOT propose a solution or booking flow."
|
|
118
|
+
|
|
119
|
+
# --- Summary behavior ---
|
|
120
|
+
- description: "Summarizes understanding before asking more"
|
|
121
|
+
vars:
|
|
122
|
+
system_prompt: file://commands/interview-me.md
|
|
123
|
+
user_message: |
|
|
124
|
+
I'm building a CLI tool that generates changelogs from git commits.
|
|
125
|
+
|
|
126
|
+
Previously you asked what format the changelog should be in and I said Markdown. You asked about commit conventions and I said we use conventional commits. You asked about the target audience and I said it's for internal developer teams.
|
|
127
|
+
|
|
128
|
+
Continue the interview.
|
|
129
|
+
assert:
|
|
130
|
+
- type: llm-rubric
|
|
131
|
+
value: "The response summarizes or restates what has been established so far (Markdown format, conventional commits, internal dev teams) before asking new questions."
|
|
132
|
+
- type: llm-rubric
|
|
133
|
+
value: "The response then asks at least 2 new probing questions about remaining gaps."
|
|
134
|
+
|
|
135
|
+
# --- Does not propose solutions ---
|
|
136
|
+
- description: "Never proposes solutions even when idea is clear"
|
|
137
|
+
vars:
|
|
138
|
+
system_prompt: file://commands/interview-me.md
|
|
139
|
+
user_message: "I want to add a dark mode toggle to my React app. When toggled, all components should switch to a dark theme. The preference should persist across sessions."
|
|
140
|
+
assert:
|
|
141
|
+
- type: llm-rubric
|
|
142
|
+
value: "The response does NOT propose an implementation (no mentions of CSS variables, localStorage, context providers, or specific code patterns). It only asks questions."
|
|
143
|
+
- type: llm-rubric
|
|
144
|
+
value: "The response asks probing questions even though the idea seems simple — e.g., about system preference detection, transition animations, component library support, accessibility."
|
|
145
|
+
|
|
146
|
+
# --- Multi-domain coverage ---
|
|
147
|
+
- description: "Covers multiple domains in questions"
|
|
148
|
+
vars:
|
|
149
|
+
system_prompt: file://commands/interview-me.md
|
|
150
|
+
user_message: "I want to build a payment processing API that merchants integrate with to accept credit card payments."
|
|
151
|
+
assert:
|
|
152
|
+
- type: llm-rubric
|
|
153
|
+
value: "The questions span at least 3 different domains from this list: security/compliance (PCI DSS), data model, integrations, user intent, performance/scale, edge cases (refunds, chargebacks, failures), or scope."
|
|
154
|
+
- type: llm-rubric
|
|
155
|
+
value: "The response asks at least 3 questions and does NOT suggest payment providers or implementation approaches."
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
description: refine-text Iterative Eval
|
|
2
|
+
providers:
|
|
3
|
+
- id: anthropic:messages:claude-haiku-4-5-20251001
|
|
4
|
+
config:
|
|
5
|
+
max_tokens: 1024
|
|
6
|
+
prompts:
|
|
7
|
+
- '[{"role": "system", "content": "{{system_prompt}}"}, {"role": "user", "content": "{{user_message}}"}]'
|
|
8
|
+
defaultTest:
|
|
9
|
+
options:
|
|
10
|
+
provider: anthropic:messages:claude-haiku-4-5-20251001
|
|
11
|
+
vars:
|
|
12
|
+
system_prompt: file://commands/refine-text.md
|
|
13
|
+
|
|
14
|
+
tests:
|
|
15
|
+
# 1. Multiple grammar errors
|
|
16
|
+
- description: "fixes multiple grammar errors (verb agreement, apostrophes, spelling)"
|
|
17
|
+
vars:
|
|
18
|
+
user_message: >
|
|
19
|
+
The developers has been working on there project for weeks. Its a really
|
|
20
|
+
importent milestone and everyone are excited. The teams progres have been
|
|
21
|
+
excelent and we doesnt want to loose momentum.
|
|
22
|
+
assert:
|
|
23
|
+
- type: llm-rubric
|
|
24
|
+
value: "The output fixes all grammar errors: 'has' -> 'have', 'there' -> 'their', 'Its' -> 'It's', 'importent' -> 'important', 'are' -> 'is', 'teams' -> 'team's', 'progres' -> 'progress', 'have been' -> 'has been', 'excelent' -> 'excellent', 'doesnt' -> 'don't', 'loose' -> 'lose'. All corrections must be present."
|
|
25
|
+
- type: llm-rubric
|
|
26
|
+
value: "The output preserves the original meaning about developers working on a project, it being an important milestone, and not wanting to lose momentum."
|
|
27
|
+
- type: not-icontains
|
|
28
|
+
value: "here's"
|
|
29
|
+
- type: not-icontains
|
|
30
|
+
value: "refined version"
|
|
31
|
+
|
|
32
|
+
# 2. Technical jargon preservation
|
|
33
|
+
- description: "preserves technical jargon while fixing grammar"
|
|
34
|
+
vars:
|
|
35
|
+
user_message: >
|
|
36
|
+
The kubernetes cluster are running on EKS with istio service mesh.
|
|
37
|
+
We use gRPC for inter-service comunication and the p99 latency have
|
|
38
|
+
been under 50ms. The CI/CD pipline deploys via ArgoCD using GitOps
|
|
39
|
+
metodology.
|
|
40
|
+
assert:
|
|
41
|
+
- type: icontains
|
|
42
|
+
value: "Kubernetes"
|
|
43
|
+
- type: icontains
|
|
44
|
+
value: "EKS"
|
|
45
|
+
- type: icontains
|
|
46
|
+
value: "Istio"
|
|
47
|
+
- type: icontains
|
|
48
|
+
value: "gRPC"
|
|
49
|
+
- type: icontains
|
|
50
|
+
value: "p99"
|
|
51
|
+
- type: icontains
|
|
52
|
+
value: "ArgoCD"
|
|
53
|
+
- type: icontains
|
|
54
|
+
value: "GitOps"
|
|
55
|
+
- type: icontains
|
|
56
|
+
value: "CI/CD"
|
|
57
|
+
- type: llm-rubric
|
|
58
|
+
value: "Spelling errors are fixed: 'comunication' -> 'communication', 'pipline' -> 'pipeline', 'metodology' -> 'methodology'. Grammar errors are fixed: 'are running' -> 'is running', 'have been' -> 'has been'."
|
|
59
|
+
|
|
60
|
+
# 3. Casual voice preservation
|
|
61
|
+
- description: "preserves casual/informal voice while cleaning up"
|
|
62
|
+
vars:
|
|
63
|
+
user_message: >
|
|
64
|
+
So yeah, I've been messing around with this new API and honestly?
|
|
65
|
+
Its pretty sweet. Like, the docs could definately be better but once
|
|
66
|
+
you figure it out its kinda magical. Gonna write a blog post about
|
|
67
|
+
it probly.
|
|
68
|
+
assert:
|
|
69
|
+
- type: llm-rubric
|
|
70
|
+
value: "The output maintains the casual, conversational tone. Words like 'yeah', 'pretty sweet', 'kinda', 'gonna' or similar casual language should be preserved or only lightly adjusted, not replaced with formal language."
|
|
71
|
+
- type: llm-rubric
|
|
72
|
+
value: "Spelling and grammar errors are fixed: 'Its' -> 'It's' and 'its' -> 'it's' (one instance each), 'definately' -> 'definitely', 'probly' -> 'probably'."
|
|
73
|
+
- type: not-icontains
|
|
74
|
+
value: "here is"
|
|
75
|
+
|
|
76
|
+
# 4. Passive voice to active voice
|
|
77
|
+
- description: "converts passive voice to active voice"
|
|
78
|
+
vars:
|
|
79
|
+
user_message: >
|
|
80
|
+
The configuration was updated by the team lead. The tests were run by
|
|
81
|
+
the CI system and the results were reviewed by the QA engineer. A new
|
|
82
|
+
release was deployed by the DevOps team last Friday.
|
|
83
|
+
assert:
|
|
84
|
+
- type: llm-rubric
|
|
85
|
+
value: "The output converts passive voice to active voice. For example, 'The configuration was updated by the team lead' should become something like 'The team lead updated the configuration'. At least 3 of the 4 passive constructions should be converted to active voice."
|
|
86
|
+
- type: llm-rubric
|
|
87
|
+
value: "The output preserves all the actors (team lead, CI system, QA engineer, DevOps team) and actions (updated configuration, ran tests, reviewed results, deployed release)."
|
|
88
|
+
|
|
89
|
+
# 5. Bloated sentences that need trimming
|
|
90
|
+
- description: "trims bloated and redundant sentences"
|
|
91
|
+
vars:
|
|
92
|
+
user_message: >
|
|
93
|
+
In order to be able to successfully complete the process of migrating
|
|
94
|
+
our database, it is absolutely essential and critically important that
|
|
95
|
+
we first and foremost make sure to create a comprehensive and thorough
|
|
96
|
+
backup of all of our existing data in its entirety before we proceed
|
|
97
|
+
to begin the migration process.
|
|
98
|
+
assert:
|
|
99
|
+
- type: llm-rubric
|
|
100
|
+
value: "The output is significantly shorter than the input — at least 30% fewer words. The bloated phrases like 'in order to be able to', 'absolutely essential and critically important', 'first and foremost', 'comprehensive and thorough', 'in its entirety', 'proceed to begin' should be simplified."
|
|
101
|
+
- type: llm-rubric
|
|
102
|
+
value: "The core meaning is preserved: back up data before migrating the database."
|
|
103
|
+
|
|
104
|
+
# 6. Already well-written text (minimal changes)
|
|
105
|
+
- description: "makes minimal changes to well-written text"
|
|
106
|
+
vars:
|
|
107
|
+
user_message: >
|
|
108
|
+
TypeScript's type system catches errors at compile time, reducing
|
|
109
|
+
runtime failures. Combined with strict null checks, it eliminates
|
|
110
|
+
an entire class of bugs that plague JavaScript codebases. The trade-off
|
|
111
|
+
is additional upfront complexity, but most teams find it worthwhile.
|
|
112
|
+
assert:
|
|
113
|
+
- type: llm-rubric
|
|
114
|
+
value: "The output is very similar to the input with minimal changes. The text is already well-written, so it should not be substantially reworded or restructured. At most minor punctuation or word choice adjustments."
|
|
115
|
+
- type: llm-rubric
|
|
116
|
+
value: "The output preserves the three-sentence structure and the key concepts: TypeScript type system, strict null checks, and the trade-off."
|
|
117
|
+
|
|
118
|
+
# 7. Code blocks must not be modified
|
|
119
|
+
- description: "does not modify code blocks or inline code"
|
|
120
|
+
vars:
|
|
121
|
+
user_message: |
|
|
122
|
+
To install the package, run this command:
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
npm install @acme/widget --save-dev
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
Then import it in you're code:
|
|
129
|
+
|
|
130
|
+
```typescript
|
|
131
|
+
import { Widget } from '@acme/widget';
|
|
132
|
+
const w = new Widget({ debug: treu });
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
The `Widget` class accept a configuration object.
|
|
136
|
+
assert:
|
|
137
|
+
- type: icontains
|
|
138
|
+
value: "npm install @acme/widget --save-dev"
|
|
139
|
+
- type: icontains
|
|
140
|
+
value: "import { Widget } from '@acme/widget'"
|
|
141
|
+
- type: icontains
|
|
142
|
+
value: "debug: treu"
|
|
143
|
+
- type: llm-rubric
|
|
144
|
+
value: "Grammar errors OUTSIDE code blocks are fixed: 'you're code' -> 'your code', 'accept' -> 'accepts'. But code blocks and inline code (including the typo 'treu' inside the code block) are NOT modified."
|
|
145
|
+
|
|
146
|
+
# 8. Intentional style choices (fragments, rhetorical questions)
|
|
147
|
+
- description: "preserves intentional style choices like fragments and rhetorical questions"
|
|
148
|
+
vars:
|
|
149
|
+
user_message: >
|
|
150
|
+
Fast. Reliable. Secure. That's what we promise. But can we deliver?
|
|
151
|
+
Absolutely. Our platform handles millions of requests daily. Zero
|
|
152
|
+
downtime last quarter. Not a single data breach in five years.
|
|
153
|
+
The secret? Obsessive testing and a paranoid security team.
|
|
154
|
+
assert:
|
|
155
|
+
- type: llm-rubric
|
|
156
|
+
value: "The output preserves the short fragment style ('Fast. Reliable. Secure.', 'Zero downtime last quarter.') and rhetorical questions ('But can we deliver?', 'The secret?'). These are intentional stylistic choices, not grammar errors."
|
|
157
|
+
- type: llm-rubric
|
|
158
|
+
value: "The output preserves the punchy, marketing-style tone and makes minimal or no changes since the text is already well-written."
|
|
159
|
+
|
|
160
|
+
# 9. Mixed formal/informal text
|
|
161
|
+
- description: "handles mixed formal and informal registers appropriately"
|
|
162
|
+
vars:
|
|
163
|
+
user_message: >
|
|
164
|
+
The quarterly financial report indicate a 15% increase in revenue.
|
|
165
|
+
Pretty awesome numbers tbh. Operating expenses was reduced by 8%
|
|
166
|
+
through strategic cost optimisation. Basically we crushed it this
|
|
167
|
+
quarter and the board are super happy with the results.
|
|
168
|
+
assert:
|
|
169
|
+
- type: llm-rubric
|
|
170
|
+
value: "Grammar errors are fixed: 'indicate' -> 'indicates', 'was reduced' -> 'were reduced', 'are' -> 'is'. The mix of formal financial language and casual commentary is preserved — the output should not make the casual parts formal or the formal parts casual."
|
|
171
|
+
- type: llm-rubric
|
|
172
|
+
value: "The core data is preserved: 15% revenue increase, 8% expense reduction."
|
|
173
|
+
|
|
174
|
+
# 10. Text with links that must not be modified
|
|
175
|
+
- description: "preserves URLs and markdown links"
|
|
176
|
+
vars:
|
|
177
|
+
user_message: >
|
|
178
|
+
Check out the documentation at https://docs.example.com/api/v2
|
|
179
|
+
for more informations. You can also read the [getting started guide](https://example.com/guide)
|
|
180
|
+
which explain the basic concepts. The repositry is at
|
|
181
|
+
[github.com/acme/widget](https://github.com/acme/widget).
|
|
182
|
+
assert:
|
|
183
|
+
- type: icontains
|
|
184
|
+
value: "https://docs.example.com/api/v2"
|
|
185
|
+
- type: icontains
|
|
186
|
+
value: "https://example.com/guide"
|
|
187
|
+
- type: icontains
|
|
188
|
+
value: "https://github.com/acme/widget"
|
|
189
|
+
- type: llm-rubric
|
|
190
|
+
value: "Grammar and spelling errors are fixed: 'informations' -> 'information', 'explain' -> 'explains', 'repositry' -> 'repository'. All URLs and markdown links remain intact and unmodified."
|
|
191
|
+
|
|
192
|
+
# 11. Meaning preservation under ambiguity
|
|
193
|
+
- description: "does not change meaning when editing could be ambiguous"
|
|
194
|
+
vars:
|
|
195
|
+
user_message: >
|
|
196
|
+
We decided not to implement caching because the latency impact was
|
|
197
|
+
negligible and the added complexity weren't worth it. The team
|
|
198
|
+
considred using Redis but ultimatly chose to keep things simple.
|
|
199
|
+
assert:
|
|
200
|
+
- type: llm-rubric
|
|
201
|
+
value: "The meaning is strictly preserved: the team chose NOT to implement caching, the reason was negligible latency impact, and they considered but rejected Redis. The output must not accidentally flip the meaning (e.g., suggesting they did implement caching)."
|
|
202
|
+
- type: llm-rubric
|
|
203
|
+
value: "Grammar/spelling fixed: 'weren't' -> 'wasn't', 'considred' -> 'considered', 'ultimatly' -> 'ultimately'."
|
|
204
|
+
|
|
205
|
+
# 12. No preamble or meta-commentary
|
|
206
|
+
- description: "outputs only the refined text with no preamble or explanation"
|
|
207
|
+
vars:
|
|
208
|
+
user_message: >
|
|
209
|
+
The meeting notes from yesterdays standup shows that the backend team
|
|
210
|
+
are blocked on the database migration. Frontend team have finished
|
|
211
|
+
the redesign and is waiting for API changes.
|
|
212
|
+
assert:
|
|
213
|
+
- type: not-icontains
|
|
214
|
+
value: "here's"
|
|
215
|
+
- type: not-icontains
|
|
216
|
+
value: "refined version"
|
|
217
|
+
- type: not-icontains
|
|
218
|
+
value: "here is the"
|
|
219
|
+
- type: not-icontains
|
|
220
|
+
value: "I've refined"
|
|
221
|
+
- type: not-icontains
|
|
222
|
+
value: "changes made"
|
|
223
|
+
- type: not-icontains
|
|
224
|
+
value: "corrections"
|
|
225
|
+
- type: llm-rubric
|
|
226
|
+
value: "The output is ONLY the refined text itself. It does not start with any preamble, introduction, or explanation. It does not end with a summary of changes. It is just the cleaned-up text."
|
|
227
|
+
|
|
228
|
+
# 13. Text with bullet points / list structure
|
|
229
|
+
- description: "preserves list structure and formatting"
|
|
230
|
+
vars:
|
|
231
|
+
user_message: |
|
|
232
|
+
Project update:
|
|
233
|
+
- Backend API is complete and has been tested
|
|
234
|
+
- Frontend redesign are 80% done
|
|
235
|
+
- Database migraton is schedule for next week
|
|
236
|
+
- The documentation needs updated badly
|
|
237
|
+
- Performance testing havent started yet
|
|
238
|
+
assert:
|
|
239
|
+
- type: llm-rubric
|
|
240
|
+
value: "The bullet list structure is preserved — the output still uses bullet points (- or *) with the same items. Grammar/spelling errors are fixed: 'are' -> 'is', 'migraton' -> 'migration', 'schedule' -> 'scheduled', 'needs updated' -> 'needs to be updated' or 'needs updating', 'havent' -> 'haven't'."
|
|
241
|
+
- type: icontains
|
|
242
|
+
value: "Project update"
|