npm - @towles/tool - Versions diffs - 0.0.109 → 0.0.110 - Mend

@towles/tool 0.0.109 → 0.0.110

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (190) hide show

package/plugins/tt-agentboard/packages/runtime/test/config.test.ts DELETED Viewed

@@ -1,83 +0,0 @@
-import { describe, it, expect, beforeEach, afterEach } from "bun:test";
-import { mkdirSync, rmSync, existsSync, readFileSync, writeFileSync } from "node:fs";
-import { join } from "node:path";
-import { loadConfig, saveConfig } from "../src/config";
-const TEST_HOME = join(import.meta.dir, ".test-home");
-const CONFIG_DIR = join(TEST_HOME, ".config", "towles-tool", "agentboard");
-const CONFIG_FILE = join(CONFIG_DIR, "config.json");
-describe("config", () => {
-  beforeEach(() => {
-    rmSync(TEST_HOME, { recursive: true, force: true });
-  });
-  afterEach(() => {
-    rmSync(TEST_HOME, { recursive: true, force: true });
-  });
-  describe("loadConfig", () => {
-    it("returns defaults when no config file exists", () => {
-      const config = loadConfig(TEST_HOME);
-      expect(config.plugins).toEqual([]);
-      expect(config.port).toBeUndefined();
-      expect(config.theme).toBeUndefined();
-      expect(config.sidebarWidth).toBeUndefined();
-    });
-    it("reads config from disk", () => {
-      mkdirSync(CONFIG_DIR, { recursive: true });
-      writeFileSync(
-        CONFIG_FILE,
-        JSON.stringify({
-          port: 4201,
-          theme: "tokyo-night",
-          sidebarWidth: 30,
-          plugins: ["my-plugin"],
-        }),
-      );
-      const config = loadConfig(TEST_HOME);
-      expect(config.port).toBe(4201);
-      expect(config.theme).toBe("tokyo-night");
-      expect(config.sidebarWidth).toBe(30);
-      expect(config.plugins).toEqual(["my-plugin"]);
-    });
-    it("handles malformed JSON gracefully", () => {
-      mkdirSync(CONFIG_DIR, { recursive: true });
-      writeFileSync(CONFIG_FILE, "not json {{{");
-      const config = loadConfig(TEST_HOME);
-      expect(config.plugins).toEqual([]);
-    });
-  });
-  describe("saveConfig", () => {
-    it("creates config directory and file", () => {
-      saveConfig({ theme: "gruvbox-dark" }, TEST_HOME);
-      expect(existsSync(CONFIG_FILE)).toBe(true);
-      const saved = JSON.parse(readFileSync(CONFIG_FILE, "utf-8"));
-      expect(saved.theme).toBe("gruvbox-dark");
-    });
-    it("merges with existing config", () => {
-      mkdirSync(CONFIG_DIR, { recursive: true });
-      writeFileSync(
-        CONFIG_FILE,
-        JSON.stringify({
-          theme: "nord",
-          sidebarWidth: 28,
-          plugins: [],
-        }),
-      );
-      saveConfig({ sidebarWidth: 32 }, TEST_HOME);
-      const saved = JSON.parse(readFileSync(CONFIG_FILE, "utf-8"));
-      expect(saved.theme).toBe("nord");
-      expect(saved.sidebarWidth).toBe(32);
-    });
-  });
-});

package/plugins/tt-auto-claude/.claude-plugin/plugin.json DELETED Viewed

@@ -1,8 +0,0 @@
-{
-  "name": "tt-ac",
-  "description": "Auto-Claude pipeline: automated issue-to-PR workflows using Claude Code",
-  "version": "0.0.109",
-  "author": {
-    "name": "Chris Towles"
-  }
-}

package/plugins/tt-auto-claude/commands/create-issue.md DELETED Viewed

@@ -1,20 +0,0 @@
----
-description: Create a GitHub issue with the auto-claude label for AI-driven work
-allowed-tools: Bash(gh *), AskUserQuestion(*)
----
-Create a GitHub issue with the `auto-claude` label for Claude Code pipeline work.
-1. Get repo: `gh repo view --json nameWithOwner --jq '.nameWithOwner'`
-2. Fetch labels: `gh label list --repo <repo> --json name --jq '.[].name'`
-3. AskUserQuestion (up to 4 at once): title, description, extra labels (multi-select from repo labels)
-4. `gh issue create`:
-   - Always include `auto-claude` label + any extras
-   - Prefix title with conventional type (`feat:`, `fix:`, `refactor:`, `research:`, `chore:`)
-   - Body: `## Summary`, `## Type`, `## Notes` sections
-   - If `auto-claude` label missing, create it first:
-     `gh label create "auto-claude" --repo <repo> --description "Issue for Claude Code auto-claude pipeline" --color "7C3AED"`
-5. **Batch**: multiple issues → create in parallel with appropriate prefix/labels each
-6. Report all issue URLs in a table
-$ARGUMENTS

package/plugins/tt-auto-claude/commands/list.md DELETED Viewed

@@ -1,21 +0,0 @@
----
-description: List open issues with the auto-claude label in the current repo
-allowed-tools: Bash(gh *)
----
-List open issues across all auto-claude pipeline states in the current repo.
-1. Get repo: `gh repo view --json nameWithOwner --jq '.nameWithOwner'`
-2. `gh issue list --repo <repo> --state open --json number,title,labels,assignees,state --limit 50` for each label:
-   - `auto-claude` (queued)
-   - `auto-claude-in-progress`
-   - `auto-claude-failed`
-   - `auto-claude-review`
-3. Deduplicate across queries. Display as a table sorted by issue number:
-   | #   | Title | Status | Labels | Assignee |
-   | --- | ----- | ------ | ------ | -------- |
-   Status derived from pipeline label. Labels column excludes pipeline labels. Assignee shows login or `—`.
-$ARGUMENTS

package/plugins/tt-auto-claude/skills/auto-claude/SKILL.md DELETED Viewed

@@ -1,71 +0,0 @@
----
-name: auto-claude
-description: Use the auto-claude pipeline (`tt auto-claude` / `tt ac`) for automated issue-to-PR workflows — labels a GitHub issue, then runs plan → implement → simplify → review autonomously.
----
-# Auto-Claude Pipeline
-Automated issue-to-PR pipeline. Label a GitHub issue with `auto-claude` and the pipeline runs Claude Code locally through 4 steps: **plan → implement → simplify → review**.
-## Pipeline Steps
-1. **Plan** — Research, planning, and annotations. Produces `plan.md`.
-2. **Implement** — Executes the plan: writes code, tests, commits. Produces `completed-summary.md`.
-3. **Simplify** — Code-simplify pass: removes dead code, simplifies logic. Produces `simplify-summary.md`.
-4. **Review** — Automated review outputs `PASS` or `FAIL` on first line of `review.md`.
-## Label Flow
-1. Issue labelled `auto-claude` triggers the pipeline.
-2. Pipeline removes `auto-claude`, adds `auto-claude-in-progress`.
-3. On success: removes `auto-claude-in-progress`, adds `auto-claude-review`, creates PR.
-4. On failure: removes `auto-claude-in-progress`, adds `auto-claude-failed`.
-## Retry Behavior
-If review outputs FAIL, the pipeline loops back to **implement → simplify → review** (clearing previous artifacts). Configurable via `maxReviewRetries` (default 2), so up to 3 total attempts.
-## CLI Commands
-```bash
-# Process specific issue
-tt auto-claude --issue 42
-tt ac --issue 42
-# Stop after planning step (review before implementation)
-tt ac --issue 42 --until plan
-# Rebase stale PR branch onto current main
-tt ac --refresh --issue 42
-# Reset state for an issue (force restart)
-tt ac --reset 42
-# Start polling loop (default 30min interval)
-tt ac --loop
-# Custom interval and limit
-tt ac --loop --interval 15 --limit 3
-# Interactively pick an auto-claude issue to process
-tt ac list
-```
-## Config
-Auto-detects repo and main branch from cwd. Key settings:
-| Field                    | Default       | Description                         |
-| ------------------------ | ------------- | ----------------------------------- |
-| `triggerLabel`           | `auto-claude` | Label that triggers the pipeline    |
-| `model`                  | `opus`        | Claude model to use                 |
-| `maxReviewRetries`       | `2`           | Review failure retries              |
-| `loopIntervalMinutes`    | `30`          | Polling interval for loop mode      |
-| `maxImplementIterations` | `5`           | Max Claude turns per implement step |
-## Conventions
-- Artifacts: `.auto-claude/issue-{N}/`
-- Branch naming: `auto-claude/issue-{N}`
-- Steps are idempotent — check for output artifact before running
-- `--until <step>` pauses pipeline after the named step

package/plugins/tt-core/promptfooconfig.interview-me.yaml DELETED Viewed

@@ -1,155 +0,0 @@
-description: interview-me Iterative Eval
-providers:
-  - id: anthropic:messages:claude-haiku-4-5-20251001
-    config:
-      max_tokens: 1024
-prompts:
-  - '[{"role": "system", "content": "{{system_prompt}}"}, {"role": "user", "content": "{{user_message}}"}]'
-defaultTest:
-  options:
-    provider: anthropic:messages:claude-haiku-4-5-20251001
-tests:
-  # --- Vague one-liner ideas ---
-  - description: "Vague idea: todo app"
-    vars:
-      system_prompt: file://commands/interview-me.md
-      user_message: "I want to build a todo app"
-    assert:
-      - type: llm-rubric
-        value: "The response asks at least 3 probing questions about the idea. It does NOT propose any solutions, architectures, or implementations."
-      - type: llm-rubric
-        value: "The questions cover at least 2 different domains such as: user intent, target audience, data model, integrations, security, performance/scale, or edge cases."
-  - description: "Vague idea: AI chatbot"
-    vars:
-      system_prompt: file://commands/interview-me.md
-      user_message: "I want to make an AI chatbot for my business"
-    assert:
-      - type: llm-rubric
-        value: "The response asks at least 3 probing questions. It does NOT suggest specific technologies, frameworks, or implementation approaches."
-      - type: llm-rubric
-        value: "At least one question addresses who the users are or what problem the chatbot solves."
-  # --- Complex multi-paragraph ideas ---
-  - description: "Complex idea: marketplace platform"
-    vars:
-      system_prompt: file://commands/interview-me.md
-      user_message: "I'm building a two-sided marketplace for freelance developers. Clients post projects, devs bid on them. There's an escrow payment system, ratings/reviews, and a matching algorithm. We want to launch in 3 months with a team of 2 devs. We'll use React, Node.js, and PostgreSQL."
-    assert:
-      - type: llm-rubric
-        value: "The response asks at least 3 probing questions that dig deeper into gaps in the described plan. It does NOT validate or approve the tech choices — it questions them or asks about tradeoffs."
-      - type: llm-rubric
-        value: "At least one question addresses the ambitious timeline/scope (3 months, 2 devs for a complex marketplace)."
-  # --- Ideas with security gaps ---
-  - description: "Security gap: health data app"
-    vars:
-      system_prompt: file://commands/interview-me.md
-      user_message: "I want to build an app where users upload their medical records and get AI-powered health recommendations. Users can share their records with doctors."
-    assert:
-      - type: llm-rubric
-        value: "The response asks at least one question about security, privacy, compliance (HIPAA, GDPR), or data protection related to handling medical records."
-      - type: llm-rubric
-        value: "The response does NOT propose solutions. It only asks questions."
-  # --- Missing data model ---
-  - description: "Missing data model: inventory system"
-    vars:
-      system_prompt: file://commands/interview-me.md
-      user_message: "I need an inventory management system for my warehouse. It should track items coming in and going out and alert when stock is low."
-    assert:
-      - type: llm-rubric
-        value: "At least one question probes the data model — e.g., what constitutes an 'item', how items are categorized, what metadata is tracked, relationships between entities."
-      - type: llm-rubric
-        value: "The response asks at least 3 questions total and does NOT propose a database schema or solution."
-  # --- Unclear performance/scale ---
-  - description: "Scale unclear: real-time analytics"
-    vars:
-      system_prompt: file://commands/interview-me.md
-      user_message: "I want to build a real-time analytics dashboard that shows live metrics from our IoT sensors deployed across multiple factories."
-    assert:
-      - type: llm-rubric
-        value: "At least one question addresses scale or performance — e.g., how many sensors, data volume, latency requirements, or what 'real-time' means specifically."
-      - type: llm-rubric
-        value: "The response asks at least 3 probing questions and does NOT suggest specific technologies or architectures."
-  # --- Vague follow-up answer ---
-  - description: "Vague follow-up: pushes back"
-    vars:
-      system_prompt: file://commands/interview-me.md
-      user_message: |
-        I'm building a notification system for our SaaS platform.
-        Previously you asked about notification channels and I said "we'll support all the usual ones." You asked about volume and I said "a lot, probably."
-        Continue the interview.
-    assert:
-      - type: llm-rubric
-        value: "The response pushes back on the vague answers ('all the usual ones' and 'a lot, probably') by asking for specifics — e.g., which exact channels, what volume numbers, what peak load looks like."
-      - type: llm-rubric
-        value: "The response does NOT accept the vague answers at face value and move on to unrelated topics."
-  # --- Overly ambitious scope ---
-  - description: "Overly ambitious: social media platform"
-    vars:
-      system_prompt: file://commands/interview-me.md
-      user_message: "I want to build a social media platform with stories, reels, messaging, marketplace, groups, events, live streaming, and AR filters. I'm a solo developer and want to launch in 2 months."
-    assert:
-      - type: llm-rubric
-        value: "The response questions the scope relative to the constraints (solo developer, 2-month timeline). It should probe what the MVP actually is or what can be cut."
-      - type: llm-rubric
-        value: "The response does NOT propose a phased plan or solution — it asks questions to help the user think about prioritization."
-  # --- Edge cases domain ---
-  - description: "Edge cases: booking system"
-    vars:
-      system_prompt: file://commands/interview-me.md
-      user_message: "I'm building a booking system for a hair salon. Customers pick a service, choose a stylist, and book a time slot."
-    assert:
-      - type: llm-rubric
-        value: "At least one question addresses edge cases — e.g., double bookings, cancellations, no-shows, overlapping appointments, different service durations."
-      - type: llm-rubric
-        value: "The response asks at least 3 questions and does NOT propose a solution or booking flow."
-  # --- Summary behavior ---
-  - description: "Summarizes understanding before asking more"
-    vars:
-      system_prompt: file://commands/interview-me.md
-      user_message: |
-        I'm building a CLI tool that generates changelogs from git commits.
-        Previously you asked what format the changelog should be in and I said Markdown. You asked about commit conventions and I said we use conventional commits. You asked about the target audience and I said it's for internal developer teams.
-        Continue the interview.
-    assert:
-      - type: llm-rubric
-        value: "The response summarizes or restates what has been established so far (Markdown format, conventional commits, internal dev teams) before asking new questions."
-      - type: llm-rubric
-        value: "The response then asks at least 2 new probing questions about remaining gaps."
-  # --- Does not propose solutions ---
-  - description: "Never proposes solutions even when idea is clear"
-    vars:
-      system_prompt: file://commands/interview-me.md
-      user_message: "I want to add a dark mode toggle to my React app. When toggled, all components should switch to a dark theme. The preference should persist across sessions."
-    assert:
-      - type: llm-rubric
-        value: "The response does NOT propose an implementation (no mentions of CSS variables, localStorage, context providers, or specific code patterns). It only asks questions."
-      - type: llm-rubric
-        value: "The response asks probing questions even though the idea seems simple — e.g., about system preference detection, transition animations, component library support, accessibility."
-  # --- Multi-domain coverage ---
-  - description: "Covers multiple domains in questions"
-    vars:
-      system_prompt: file://commands/interview-me.md
-      user_message: "I want to build a payment processing API that merchants integrate with to accept credit card payments."
-    assert:
-      - type: llm-rubric
-        value: "The questions span at least 3 different domains from this list: security/compliance (PCI DSS), data model, integrations, user intent, performance/scale, edge cases (refunds, chargebacks, failures), or scope."
-      - type: llm-rubric
-        value: "The response asks at least 3 questions and does NOT suggest payment providers or implementation approaches."

package/plugins/tt-core/promptfooconfig.refine-text.yaml DELETED Viewed

@@ -1,242 +0,0 @@
-description: refine-text Iterative Eval
-providers:
-  - id: anthropic:messages:claude-haiku-4-5-20251001
-    config:
-      max_tokens: 1024
-prompts:
-  - '[{"role": "system", "content": "{{system_prompt}}"}, {"role": "user", "content": "{{user_message}}"}]'
-defaultTest:
-  options:
-    provider: anthropic:messages:claude-haiku-4-5-20251001
-  vars:
-    system_prompt: file://commands/refine-text.md
-tests:
-  # 1. Multiple grammar errors
-  - description: "fixes multiple grammar errors (verb agreement, apostrophes, spelling)"
-    vars:
-      user_message: >
-        The developers has been working on there project for weeks. Its a really
-        importent milestone and everyone are excited. The teams progres have been
-        excelent and we doesnt want to loose momentum.
-    assert:
-      - type: llm-rubric
-        value: "The output fixes all grammar errors: 'has' -> 'have', 'there' -> 'their', 'Its' -> 'It's', 'importent' -> 'important', 'are' -> 'is', 'progres' -> 'progress', 'excelent' -> 'excellent', 'doesnt' -> 'don't', 'loose' -> 'lose'. All corrections must be present."
-      - type: llm-rubric
-        value: "The output preserves the original meaning about developers working on a project, it being an important milestone, and not wanting to lose momentum."
-      - type: not-icontains
-        value: "here's"
-      - type: not-icontains
-        value: "refined version"
-  # 2. Technical jargon preservation
-  - description: "preserves technical jargon while fixing grammar"
-    vars:
-      user_message: >
-        The kubernetes cluster are running on EKS with istio service mesh.
-        We use gRPC for inter-service comunication and the p99 latency have
-        been under 50ms. The CI/CD pipline deploys via ArgoCD using GitOps
-        metodology.
-    assert:
-      - type: icontains
-        value: "Kubernetes"
-      - type: icontains
-        value: "EKS"
-      - type: icontains
-        value: "Istio"
-      - type: icontains
-        value: "gRPC"
-      - type: icontains
-        value: "p99"
-      - type: icontains
-        value: "ArgoCD"
-      - type: icontains
-        value: "GitOps"
-      - type: icontains
-        value: "CI/CD"
-      - type: llm-rubric
-        value: "Spelling errors are fixed: 'comunication' -> 'communication', 'pipline' -> 'pipeline', 'metodology' -> 'methodology'. Grammar errors are fixed: 'are running' -> 'is running', 'have been' -> 'has been'."
-  # 3. Casual voice preservation
-  - description: "preserves casual/informal voice while cleaning up"
-    vars:
-      user_message: >
-        So yeah, I've been messing around with this new API and honestly?
-        Its pretty sweet. Like, the docs could definately be better but once
-        you figure it out its kinda magical. Gonna write a blog post about
-        it probly.
-    assert:
-      - type: llm-rubric
-        value: "The output maintains the casual, conversational tone. Words like 'yeah', 'pretty sweet', 'kinda', 'gonna' or similar casual language should be preserved or only lightly adjusted, not replaced with formal language."
-      - type: llm-rubric
-        value: "Spelling and grammar errors are fixed: 'Its' -> 'It's' (both instances), 'definately' -> 'definitely', 'probly' -> 'probably'."
-      - type: not-icontains
-        value: "here is"
-  # 4. Passive voice to active voice
-  - description: "converts passive voice to active voice"
-    vars:
-      user_message: >
-        The configuration was updated by the team lead. The tests were run by
-        the CI system and the results were reviewed by the QA engineer. A new
-        release was deployed by the DevOps team last Friday.
-    assert:
-      - type: llm-rubric
-        value: "The output converts passive voice to active voice. For example, 'The configuration was updated by the team lead' should become something like 'The team lead updated the configuration'. At least 3 of the 4 passive constructions should be converted to active voice."
-      - type: llm-rubric
-        value: "The output preserves all the actors (team lead, CI system, QA engineer, DevOps team) and actions (updated configuration, ran tests, reviewed results, deployed release)."
-  # 5. Bloated sentences that need trimming
-  - description: "trims bloated and redundant sentences"
-    vars:
-      user_message: >
-        In order to be able to successfully complete the process of migrating
-        our database, it is absolutely essential and critically important that
-        we first and foremost make sure to create a comprehensive and thorough
-        backup of all of our existing data in its entirety before we proceed
-        to begin the migration process.
-    assert:
-      - type: llm-rubric
-        value: "The output is significantly shorter than the input — at least 30% fewer words. The bloated phrases like 'in order to be able to', 'absolutely essential and critically important', 'first and foremost', 'comprehensive and thorough', 'in its entirety', 'proceed to begin' should be simplified."
-      - type: llm-rubric
-        value: "The core meaning is preserved: back up data before migrating the database."
-  # 6. Already well-written text (minimal changes)
-  - description: "makes minimal changes to well-written text"
-    vars:
-      user_message: >
-        TypeScript's type system catches errors at compile time, reducing
-        runtime failures. Combined with strict null checks, it eliminates
-        an entire class of bugs that plague JavaScript codebases. The trade-off
-        is additional upfront complexity, but most teams find it worthwhile.
-    assert:
-      - type: llm-rubric
-        value: "The output is very similar to the input with minimal changes. The text is already well-written, so it should not be substantially reworded or restructured. At most minor punctuation or word choice adjustments."
-      - type: llm-rubric
-        value: "The output preserves the three-sentence structure and the key concepts: TypeScript type system, strict null checks, and the trade-off."
-  # 7. Code blocks must not be modified
-  - description: "does not modify code blocks or inline code"
-    vars:
-      user_message: |
-        To install the package, run this command:
-        ```bash
-        npm install @acme/widget --save-dev
-        ```
-        Then import it in you're code:
-        ```typescript
-        import { Widget } from '@acme/widget';
-        const w = new Widget({ debug: treu });
-        ```
-        The `Widget` class accept a configuration object.
-    assert:
-      - type: icontains
-        value: "npm install @acme/widget --save-dev"
-      - type: icontains
-        value: "import { Widget } from '@acme/widget'"
-      - type: icontains
-        value: "debug: treu"
-      - type: llm-rubric
-        value: "Grammar errors OUTSIDE code blocks are fixed: 'you're code' -> 'your code', 'accept' -> 'accepts'. But code blocks and inline code (including the typo 'treu' inside the code block) are NOT modified."
-  # 8. Intentional style choices (fragments, rhetorical questions)
-  - description: "preserves intentional style choices like fragments and rhetorical questions"
-    vars:
-      user_message: >
-        Fast. Reliable. Secure. That's what we promise. But can we deliver?
-        Absolutely. Our platform handles millions of requests daily. Zero
-        downtime last quarter. Not a single data breach in five years.
-        The secret? Obsessive testing and a paranoid security team.
-    assert:
-      - type: llm-rubric
-        value: "The output preserves the short fragment style ('Fast. Reliable. Secure.', 'Zero downtime last quarter.') and rhetorical questions ('But can we deliver?', 'The secret?'). These are intentional stylistic choices, not grammar errors."
-      - type: llm-rubric
-        value: "The output preserves the punchy, marketing-style tone and makes minimal or no changes since the text is already well-written."
-  # 9. Mixed formal/informal text
-  - description: "handles mixed formal and informal registers appropriately"
-    vars:
-      user_message: >
-        The quarterly financial report indicate a 15% increase in revenue.
-        Pretty awesome numbers tbh. Operating expenses was reduced by 8%
-        through strategic cost optimisation. Basically we crushed it this
-        quarter and the board are super happy with the results.
-    assert:
-      - type: llm-rubric
-        value: "Grammar errors are fixed: 'indicate' -> 'indicates', 'was reduced' -> 'were reduced', 'are' -> 'is'. The mix of formal financial language and casual commentary is preserved — the output should not make the casual parts formal or the formal parts casual."
-      - type: llm-rubric
-        value: "The core data is preserved: 15% revenue increase, 8% expense reduction."
-  # 10. Text with links that must not be modified
-  - description: "preserves URLs and markdown links"
-    vars:
-      user_message: >
-        Check out the documentation at https://docs.example.com/api/v2
-        for more informations. You can also read the [getting started guide](https://example.com/guide)
-        which explain the basic concepts. The repositry is at
-        [github.com/acme/widget](https://github.com/acme/widget).
-    assert:
-      - type: icontains
-        value: "https://docs.example.com/api/v2"
-      - type: icontains
-        value: "https://example.com/guide"
-      - type: icontains
-        value: "https://github.com/acme/widget"
-      - type: llm-rubric
-        value: "Grammar and spelling errors are fixed: 'informations' -> 'information', 'explain' -> 'explains', 'repositry' -> 'repository'. All URLs and markdown links remain intact and unmodified."
-  # 11. Meaning preservation under ambiguity
-  - description: "does not change meaning when editing could be ambiguous"
-    vars:
-      user_message: >
-        We decided not to implement caching because the latency impact was
-        negligible and the added complexity weren't worth it. The team
-        considred using Redis but ultimatly chose to keep things simple.
-    assert:
-      - type: llm-rubric
-        value: "The meaning is strictly preserved: the team chose NOT to implement caching, the reason was negligible latency impact, and they considered but rejected Redis. The output must not accidentally flip the meaning (e.g., suggesting they did implement caching)."
-      - type: llm-rubric
-        value: "Grammar/spelling fixed: 'weren't' -> 'wasn't', 'considred' -> 'considered', 'ultimatly' -> 'ultimately'."
-  # 12. No preamble or meta-commentary
-  - description: "outputs only the refined text with no preamble or explanation"
-    vars:
-      user_message: >
-        The meeting notes from yesterdays standup shows that the backend team
-        are blocked on the database migration. Frontend team have finished
-        the redesign and is waiting for API changes.
-    assert:
-      - type: not-icontains
-        value: "here's"
-      - type: not-icontains
-        value: "refined version"
-      - type: not-icontains
-        value: "here is the"
-      - type: not-icontains
-        value: "I've refined"
-      - type: not-icontains
-        value: "changes made"
-      - type: not-icontains
-        value: "corrections"
-      - type: llm-rubric
-        value: "The output is ONLY the refined text itself. It does not start with any preamble, introduction, or explanation. It does not end with a summary of changes. It is just the cleaned-up text."
-  # 13. Text with bullet points / list structure
-  - description: "preserves list structure and formatting"
-    vars:
-      user_message: |
-        Project update:
-        - Backend API is complete and has been tested
-        - Frontend redesign are 80% done
-        - Database migraton is schedule for next week
-        - The documentation needs updated badly
-        - Performance testing havent started yet
-    assert:
-      - type: llm-rubric
-        value: "The bullet list structure is preserved — the output still uses bullet points (- or *) with the same items. Grammar/spelling errors are fixed: 'are' -> 'is', 'migraton' -> 'migration', 'schedule' -> 'scheduled', 'needs updated' -> 'needs to be updated' or 'needs updating', 'havent' -> 'haven't'."
-      - type: icontains
-        value: "Project update"