npm - @plaited/acp-harness - Versions diffs - 0.3.1 → 0.3.2 - Mend

@plaited/acp-harness 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

package/README.md +84 -47
package/bin/cli.ts +1 -1
package/package.json +3 -3
package/src/adapter-check.ts +3 -3
package/src/adapter-scaffold.ts +10 -10
package/src/balance.ts +31 -6
package/src/calibrate.ts +23 -4
package/src/summarize.ts +18 -4
package/src/tests/adapter-scaffold.spec.ts +5 -5
package/src/tests/balance-helpers.spec.ts +279 -0
package/src/tests/calibrate-helpers.spec.ts +226 -0
package/src/tests/capture-helpers.spec.ts +553 -0
package/src/tests/summarize-helpers.spec.ts +339 -0
package/src/tests/trials-calculations.spec.ts +209 -0
package/src/trials.ts +14 -2

package/src/tests/balance-helpers.spec.ts ADDED Viewed

@@ -0,0 +1,279 @@
+import { describe, expect, test } from 'bun:test'
+import { analyzeCategories, findUnderrepresented, generateSuggestions } from '../balance.ts'
+import type { CategoryDistribution, PromptCase } from '../schemas.ts'
+// ============================================================================
+// analyzeCategories
+// ============================================================================
+/**
+ * Specs for `analyzeCategories(prompts, key)`.
+ * Each assertion reads `name`, `count` or `percentage` from the returned entries.
+ */
+describe('analyzeCategories', () => {
+  type Distribution = ReturnType<typeof analyzeCategories>
+  /** Count recorded for the entry called `label`, or undefined when no such entry exists. */
+  const countFor = (entries: Distribution, label: string) => entries.find((entry) => entry.name === label)?.count
+  test('counts prompts by category', () => {
+    const cases: PromptCase[] = [
+      { id: '1', input: 'test', metadata: { category: 'math' } },
+      { id: '2', input: 'test', metadata: { category: 'math' } },
+      { id: '3', input: 'test', metadata: { category: 'code' } },
+    ]
+    const distribution = analyzeCategories(cases, 'category')
+    expect(distribution).toHaveLength(2)
+    expect(countFor(distribution, 'math')).toBe(2)
+    expect(countFor(distribution, 'code')).toBe(1)
+  })
+  test('calculates percentages correctly', () => {
+    const cases: PromptCase[] = [
+      { id: '1', input: 'test', metadata: { category: 'a' } },
+      { id: '2', input: 'test', metadata: { category: 'a' } },
+      { id: '3', input: 'test', metadata: { category: 'b' } },
+      { id: '4', input: 'test', metadata: { category: 'b' } },
+    ]
+    // Two categories with two prompts each: both leading entries sit at 50.
+    const [first, second] = analyzeCategories(cases, 'category')
+    expect(first?.percentage).toBe(50)
+    expect(second?.percentage).toBe(50)
+  })
+  test('sorts by count descending', () => {
+    const cases: PromptCase[] = [
+      { id: '1', input: 'test', metadata: { category: 'small' } },
+      { id: '2', input: 'test', metadata: { category: 'large' } },
+      { id: '3', input: 'test', metadata: { category: 'large' } },
+      { id: '4', input: 'test', metadata: { category: 'large' } },
+      { id: '5', input: 'test', metadata: { category: 'medium' } },
+      { id: '6', input: 'test', metadata: { category: 'medium' } },
+    ]
+    // Counts are large=3, medium=2, small=1.
+    const [top, middle, bottom] = analyzeCategories(cases, 'category')
+    expect(top?.name).toBe('large')
+    expect(middle?.name).toBe('medium')
+    expect(bottom?.name).toBe('small')
+  })
+  test('handles missing metadata as (uncategorized)', () => {
+    const cases: PromptCase[] = [
+      { id: '1', input: 'test', metadata: { category: 'known' } },
+      { id: '2', input: 'test' }, // metadata property absent
+      { id: '3', input: 'test', metadata: {} }, // metadata present but without the key
+    ]
+    const distribution = analyzeCategories(cases, 'category')
+    expect(countFor(distribution, '(uncategorized)')).toBe(2)
+  })
+  test('handles different metadata keys', () => {
+    const cases: PromptCase[] = [
+      { id: '1', input: 'test', metadata: { difficulty: 'easy', category: 'math' } },
+      { id: '2', input: 'test', metadata: { difficulty: 'hard', category: 'math' } },
+      { id: '3', input: 'test', metadata: { difficulty: 'easy', category: 'code' } },
+    ]
+    // The same prompts are grouped twice, once per metadata key.
+    const groupedByDifficulty = analyzeCategories(cases, 'difficulty')
+    const groupedByCategory = analyzeCategories(cases, 'category')
+    expect(countFor(groupedByDifficulty, 'easy')).toBe(2)
+    expect(countFor(groupedByCategory, 'math')).toBe(2)
+  })
+  test('converts non-string metadata values to strings', () => {
+    const cases: PromptCase[] = [
+      { id: '1', input: 'test', metadata: { level: 1 } },
+      { id: '2', input: 'test', metadata: { level: 1 } },
+      { id: '3', input: 'test', metadata: { level: 2 } },
+    ]
+    const distribution = analyzeCategories(cases, 'level')
+    // Numeric levels are looked up by their string form.
+    expect(countFor(distribution, '1')).toBe(2)
+    expect(countFor(distribution, '2')).toBe(1)
+  })
+  test('handles empty prompts array', () => {
+    expect(analyzeCategories([], 'category')).toEqual([])
+  })
+  test('rounds percentages to integers', () => {
+    const cases: PromptCase[] = [
+      { id: '1', input: 'test', metadata: { category: 'a' } },
+      { id: '2', input: 'test', metadata: { category: 'b' } },
+      { id: '3', input: 'test', metadata: { category: 'c' } },
+    ]
+    // Each category holds 1/3 of the prompts (33.33...); only integrality is asserted here.
+    analyzeCategories(cases, 'category').forEach((entry) => {
+      expect(Number.isInteger(entry.percentage)).toBe(true)
+    })
+  })
+})
+// ============================================================================
+// findUnderrepresented
+// ============================================================================
+/**
+ * Specs for `findUnderrepresented(distributions, threshold)`.
+ * Per the worked examples below, a category is reported when its percentage falls
+ * below `threshold`% of the even share (100 / number of categories).
+ */
+describe('findUnderrepresented', () => {
+  test('flags nothing when every category is above the cutoff', () => {
+    const distributions: CategoryDistribution[] = [
+      { name: 'large', count: 50, percentage: 50 },
+      { name: 'medium', count: 30, percentage: 30 },
+      { name: 'small', count: 20, percentage: 20 },
+    ]
+    // Even distribution would be 33.3% each.
+    // With a 50% threshold the cutoff is 16.65%; the smallest category (20%) is above it.
+    const result = findUnderrepresented(distributions, 50)
+    expect(result).toEqual([])
+  })
+  test('returns underrepresented categories for a skewed distribution', () => {
+    const distributions: CategoryDistribution[] = [
+      { name: 'large', count: 80, percentage: 80 },
+      { name: 'small', count: 20, percentage: 20 },
+    ]
+    // Even distribution would be 50% each.
+    // With a 50% threshold the cutoff is 25%; 'small' (20%) is below it.
+    const result = findUnderrepresented(distributions, 50)
+    expect(result).toContain('small')
+    expect(result).not.toContain('large')
+  })
+  test('handles even distribution (no underrepresentation)', () => {
+    const distributions: CategoryDistribution[] = [
+      { name: 'a', count: 25, percentage: 25 },
+      { name: 'b', count: 25, percentage: 25 },
+      { name: 'c', count: 25, percentage: 25 },
+      { name: 'd', count: 25, percentage: 25 },
+    ]
+    const result = findUnderrepresented(distributions, 50)
+    expect(result).toEqual([])
+  })
+  test('handles single category (never underrepresented)', () => {
+    const distributions: CategoryDistribution[] = [{ name: 'only', count: 100, percentage: 100 }]
+    const result = findUnderrepresented(distributions, 50)
+    expect(result).toEqual([])
+  })
+  test('threshold affects sensitivity', () => {
+    const distributions: CategoryDistribution[] = [
+      { name: 'large', count: 70, percentage: 70 },
+      { name: 'small', count: 30, percentage: 30 },
+    ]
+    // Even = 50%. A lower threshold gives a lower cutoff, so fewer categories are flagged.
+    // At 50% threshold the cutoff is 25%: 'small' (30%) is not flagged.
+    const atLowerThreshold = findUnderrepresented(distributions, 50)
+    expect(atLowerThreshold).toEqual([])
+    // At 80% threshold the cutoff is 40%: 'small' (30%) is flagged.
+    const atHigherThreshold = findUnderrepresented(distributions, 80)
+    expect(atHigherThreshold).toContain('small')
+  })
+  test('handles empty distributions', () => {
+    const result = findUnderrepresented([], 50)
+    expect(result).toEqual([])
+  })
+})
+// ============================================================================
+// generateSuggestions
+// ============================================================================
+/**
+ * Specs for `generateSuggestions(distributions, underrepresented, totalCases)`.
+ * The returned suggestions are inspected as strings via substring checks.
+ */
+describe('generateSuggestions', () => {
+  /** True when at least one suggestion satisfies `matches`. */
+  const anySuggestion = (suggestions: readonly string[], matches: (text: string) => boolean) =>
+    suggestions.some(matches)
+  test('suggests adding cases for underrepresented categories', () => {
+    const spread: CategoryDistribution[] = [
+      { name: 'math', count: 80, percentage: 80 },
+      { name: 'code', count: 20, percentage: 20 },
+    ]
+    const advice = generateSuggestions(spread, ['code'], 100)
+    expect(anySuggestion(advice, (text) => text.includes('code'))).toBe(true)
+    expect(anySuggestion(advice, (text) => text.toLowerCase().includes('add'))).toBe(true)
+  })
+  test('warns about dominant category (>50%)', () => {
+    const spread: CategoryDistribution[] = [
+      { name: 'dominant', count: 60, percentage: 60 },
+      { name: 'other', count: 40, percentage: 40 },
+    ]
+    const advice = generateSuggestions(spread, [], 100)
+    // One suggestion must carry both the category name and its share.
+    expect(anySuggestion(advice, (text) => text.includes('dominant') && text.includes('60%'))).toBe(true)
+    expect(anySuggestion(advice, (text) => text.toLowerCase().includes('diversify'))).toBe(true)
+  })
+  test('warns about tiny categories (<3 cases)', () => {
+    const spread: CategoryDistribution[] = [
+      { name: 'large', count: 97, percentage: 97 },
+      { name: 'tiny', count: 2, percentage: 2 },
+      { name: 'also_tiny', count: 1, percentage: 1 },
+    ]
+    const advice = generateSuggestions(spread, [], 100)
+    expect(anySuggestion(advice, (text) => text.includes('tiny') || text.includes('also_tiny'))).toBe(true)
+    expect(anySuggestion(advice, (text) => text.includes('< 3 cases'))).toBe(true)
+  })
+  test('suggests expanding small test sets (<20 cases)', () => {
+    const spread: CategoryDistribution[] = [
+      { name: 'a', count: 5, percentage: 50 },
+      { name: 'b', count: 5, percentage: 50 },
+    ]
+    const advice = generateSuggestions(spread, [], 10)
+    expect(anySuggestion(advice, (text) => text.includes('10 cases') && text.toLowerCase().includes('expand'))).toBe(
+      true,
+    )
+  })
+  test('returns "well-balanced" when no issues found', () => {
+    const spread: CategoryDistribution[] = [
+      { name: 'a', count: 25, percentage: 25 },
+      { name: 'b', count: 25, percentage: 25 },
+      { name: 'c', count: 25, percentage: 25 },
+      { name: 'd', count: 25, percentage: 25 },
+    ]
+    const advice = generateSuggestions(spread, [], 100)
+    expect(anySuggestion(advice, (text) => text.toLowerCase().includes('well-balanced'))).toBe(true)
+  })
+  test('combines multiple suggestions', () => {
+    const spread: CategoryDistribution[] = [
+      { name: 'huge', count: 8, percentage: 80 },
+      { name: 'tiny', count: 2, percentage: 20 },
+    ]
+    const advice = generateSuggestions(spread, ['tiny'], 10)
+    // Several conditions apply at once (underrepresented, dominant, tiny count, small test set);
+    // only a lower bound of two suggestions is asserted.
+    expect(advice.length).toBeGreaterThanOrEqual(2)
+  })
+  test('handles empty distributions', () => {
+    const advice = generateSuggestions([], [], 0)
+    // With zero cases the output should mention "0 cases".
+    expect(anySuggestion(advice, (text) => text.includes('0 cases'))).toBe(true)
+  })
+})

package/src/tests/calibrate-helpers.spec.ts ADDED Viewed

@@ -0,0 +1,226 @@
+import { describe, expect, test } from 'bun:test'
+import { getTrajectorySnippet, sampleArray } from '../calibrate.ts'
+import type { TrajectoryStep } from '../schemas.ts'
+// ============================================================================
+// sampleArray
+// ============================================================================
+/**
+ * Specs for `sampleArray(array, n)`: sample size, non-mutation of the input,
+ * uniqueness, membership in the source array, and run-to-run variation.
+ */
+describe('sampleArray', () => {
+  test('returns n elements from array', () => {
+    const source = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+    expect(sampleArray(source, 3)).toHaveLength(3)
+  })
+  test('returns all elements when n >= array length', () => {
+    const source = [1, 2, 3]
+    const picked = sampleArray(source, 5)
+    expect(picked).toHaveLength(3)
+    // Compare as sets so the order of the sample does not matter.
+    expect(new Set(picked)).toEqual(new Set(source))
+  })
+  test('returns empty array for empty input', () => {
+    expect(sampleArray([], 5)).toEqual([])
+  })
+  test('returns empty array when n is 0', () => {
+    const source = [1, 2, 3]
+    expect(sampleArray(source, 0)).toEqual([])
+  })
+  test('does not modify original array', () => {
+    const source = [1, 2, 3, 4, 5]
+    const snapshot = [...source]
+    sampleArray(source, 3)
+    expect(source).toEqual(snapshot)
+  })
+  test('returns unique elements (no duplicates)', () => {
+    const source = [1, 2, 3, 4, 5]
+    const picked = sampleArray(source, 3)
+    // A set built from the sample keeps its size only when no value repeats.
+    expect(new Set(picked).size).toBe(picked.length)
+  })
+  test('all returned elements exist in original array', () => {
+    const source = ['a', 'b', 'c', 'd', 'e']
+    sampleArray(source, 3).forEach((entry) => {
+      expect(source).toContain(entry)
+    })
+  })
+  test('works with objects', () => {
+    const source = [{ id: 1 }, { id: 2 }, { id: 3 }, { id: 4 }]
+    const picked = sampleArray(source, 2)
+    expect(picked).toHaveLength(2)
+    picked.forEach((entry) => {
+      expect(source).toContainEqual(entry)
+    })
+  })
+  test('produces different results on multiple calls (randomness)', () => {
+    const source = Array.from({ length: 100 }, (_, i) => i)
+    // Draw ten samples of ten and key each one by its joined contents.
+    const signatures = Array.from({ length: 10 }, () => sampleArray(source, 10).join(','))
+    const distinct = new Set<string>(signatures)
+    // Probabilistic check: ten identical draws from 100 elements are extremely unlikely.
+    expect(distinct.size).toBeGreaterThan(1)
+  })
+})
+// ============================================================================
+// getTrajectorySnippet
+// ============================================================================
+/**
+ * Specs for `getTrajectorySnippet(trajectory, maxSteps)`.
+ * With maxSteps = 5, trajectories of at most five steps come back unchanged; longer
+ * ones are expected to yield the first two steps, one middle step and the last two.
+ */
+describe('getTrajectorySnippet', () => {
+  /** Builds a message step whose content and timestamp are derived from `index`. */
+  const createStep = (index: number): TrajectoryStep => ({
+    type: 'message',
+    content: `Step ${index}`,
+    timestamp: index * 100,
+  })
+  /** Builds a trajectory of `length` message steps numbered from 1. */
+  const buildTrajectory = (length: number): TrajectoryStep[] => Array.from({ length }, (_, i) => createStep(i + 1))
+  test('returns full trajectory when length <= maxSteps', () => {
+    const trajectory = buildTrajectory(3)
+    const result = getTrajectorySnippet(trajectory, 5)
+    expect(result).toHaveLength(3)
+    expect(result).toEqual(trajectory)
+  })
+  test('returns maxSteps elements for longer trajectories', () => {
+    const result = getTrajectorySnippet(buildTrajectory(10), 5)
+    expect(result).toHaveLength(5)
+  })
+  test('includes first two steps', () => {
+    const result = getTrajectorySnippet(buildTrajectory(10), 5)
+    expect(result[0]).toEqual(createStep(1))
+    expect(result[1]).toEqual(createStep(2))
+  })
+  test('includes last two steps', () => {
+    const result = getTrajectorySnippet(buildTrajectory(10), 5)
+    expect(result[3]).toEqual(createStep(9))
+    expect(result[4]).toEqual(createStep(10))
+  })
+  test('includes middle step', () => {
+    const result = getTrajectorySnippet(buildTrajectory(10), 5)
+    // Middle of 10 is index 5 (0-indexed), which is Step 6
+    expect(result[2]).toEqual(createStep(6))
+  })
+  test('handles empty trajectory', () => {
+    const result = getTrajectorySnippet([], 5)
+    expect(result).toEqual([])
+  })
+  test('handles single element trajectory', () => {
+    const result = getTrajectorySnippet(buildTrajectory(1), 5)
+    expect(result).toEqual([createStep(1)])
+  })
+  test('handles trajectory exactly at maxSteps boundary', () => {
+    const trajectory = buildTrajectory(5)
+    const result = getTrajectorySnippet(trajectory, 5)
+    expect(result).toHaveLength(5)
+    expect(result).toEqual(trajectory)
+  })
+  test('respects custom maxSteps parameter', () => {
+    const trajectory = buildTrajectory(20)
+    const result3 = getTrajectorySnippet(trajectory, 3)
+    const result7 = getTrajectorySnippet(trajectory, 7)
+    // NOTE(review): the exact snippet length for maxSteps values other than 5 is not
+    // pinned by this spec. A truncated snippet may always hold five steps
+    // (first 2 + middle + last 2) regardless of maxSteps - confirm against calibrate.ts.
+    // These assertions only guarantee the snippet never exceeds the input length.
+    expect(result3.length).toBeLessThanOrEqual(trajectory.length)
+    expect(result7.length).toBeLessThanOrEqual(trajectory.length)
+  })
+  test('works with different step types', () => {
+    const trajectory: TrajectoryStep[] = [
+      { type: 'thought', content: 'Thinking...', timestamp: 0 },
+      { type: 'tool_call', name: 'Read', status: 'completed', timestamp: 100 },
+      { type: 'tool_call', name: 'Write', status: 'completed', timestamp: 200 },
+      { type: 'tool_call', name: 'Bash', status: 'completed', timestamp: 300 },
+      { type: 'tool_call', name: 'Grep', status: 'completed', timestamp: 400 },
+      { type: 'tool_call', name: 'Glob', status: 'completed', timestamp: 500 },
+      { type: 'plan', entries: [{ content: 'Plan', status: 'done' }], timestamp: 600 },
+      { type: 'message', content: 'Done!', timestamp: 700 },
+    ]
+    const result = getTrajectorySnippet(trajectory, 5)
+    expect(result).toHaveLength(5)
+    // First two
+    expect(result[0]?.type).toBe('thought')
+    expect(result[1]?.type).toBe('tool_call')
+    // Last two
+    expect(result[3]?.type).toBe('plan')
+    expect(result[4]?.type).toBe('message')
+  })
+  test('preserves step content when extracting', () => {
+    const trajectory: TrajectoryStep[] = [
+      { type: 'thought', content: 'First thought', timestamp: 0 },
+      { type: 'message', content: 'First message', timestamp: 100 },
+      { type: 'tool_call', name: 'Read', status: 'completed', input: { file_path: '/test.ts' }, timestamp: 200 },
+      { type: 'tool_call', name: 'Write', status: 'completed', timestamp: 300 },
+      { type: 'tool_call', name: 'Bash', status: 'failed', timestamp: 400 },
+      { type: 'message', content: 'Last message', timestamp: 500 },
+    ]
+    const result = getTrajectorySnippet(trajectory, 5)
+    // First step should preserve all properties.
+    // Assert the type up front so a wrong step fails the test instead of skipping the checks;
+    // the `if` below is kept only to narrow the union for the property reads.
+    const firstStep = result[0]
+    expect(firstStep?.type).toBe('thought')
+    if (firstStep?.type === 'thought') {
+      expect(firstStep.content).toBe('First thought')
+      expect(firstStep.timestamp).toBe(0)
+    }
+    // Last step should preserve all properties (same fail-fast type check as above).
+    const lastStep = result[result.length - 1]
+    expect(lastStep?.type).toBe('message')
+    if (lastStep?.type === 'message') {
+      expect(lastStep.content).toBe('Last message')
+    }
+  })
+})