thevoidforge 21.0.11 → 21.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.claude/commands/ai.md +69 -0
- package/dist/.claude/commands/architect.md +121 -0
- package/dist/.claude/commands/assemble.md +201 -0
- package/dist/.claude/commands/assess.md +75 -0
- package/dist/.claude/commands/blueprint.md +135 -0
- package/dist/.claude/commands/build.md +116 -0
- package/dist/.claude/commands/campaign.md +201 -0
- package/dist/.claude/commands/cultivation.md +166 -0
- package/dist/.claude/commands/current.md +128 -0
- package/dist/.claude/commands/dangerroom.md +74 -0
- package/dist/.claude/commands/debrief.md +178 -0
- package/dist/.claude/commands/deploy.md +99 -0
- package/dist/.claude/commands/devops.md +143 -0
- package/dist/.claude/commands/gauntlet.md +140 -0
- package/dist/.claude/commands/git.md +104 -0
- package/dist/.claude/commands/grow.md +146 -0
- package/dist/.claude/commands/imagine.md +126 -0
- package/dist/.claude/commands/portfolio.md +50 -0
- package/dist/.claude/commands/prd.md +113 -0
- package/dist/.claude/commands/qa.md +107 -0
- package/dist/.claude/commands/review.md +151 -0
- package/dist/.claude/commands/security.md +100 -0
- package/dist/.claude/commands/test.md +96 -0
- package/dist/.claude/commands/thumper.md +116 -0
- package/dist/.claude/commands/treasury.md +100 -0
- package/dist/.claude/commands/ux.md +118 -0
- package/dist/.claude/commands/vault.md +189 -0
- package/dist/.claude/commands/void.md +108 -0
- package/dist/CHANGELOG.md +1918 -0
- package/dist/CLAUDE.md +250 -0
- package/dist/HOLOCRON.md +856 -0
- package/dist/VERSION.md +123 -0
- package/dist/docs/NAMING_REGISTRY.md +478 -0
- package/dist/docs/methods/AI_INTELLIGENCE.md +276 -0
- package/dist/docs/methods/ASSEMBLER.md +142 -0
- package/dist/docs/methods/BACKEND_ENGINEER.md +165 -0
- package/dist/docs/methods/BUILD_JOURNAL.md +185 -0
- package/dist/docs/methods/BUILD_PROTOCOL.md +426 -0
- package/dist/docs/methods/CAMPAIGN.md +568 -0
- package/dist/docs/methods/CONTEXT_MANAGEMENT.md +189 -0
- package/dist/docs/methods/DEEP_CURRENT.md +184 -0
- package/dist/docs/methods/DEVOPS_ENGINEER.md +295 -0
- package/dist/docs/methods/FIELD_MEDIC.md +261 -0
- package/dist/docs/methods/FORGE_ARTIST.md +108 -0
- package/dist/docs/methods/FORGE_KEEPER.md +268 -0
- package/dist/docs/methods/GAUNTLET.md +344 -0
- package/dist/docs/methods/GROWTH_STRATEGIST.md +466 -0
- package/dist/docs/methods/HEARTBEAT.md +168 -0
- package/dist/docs/methods/MCP_INTEGRATION.md +139 -0
- package/dist/docs/methods/MUSTER.md +148 -0
- package/dist/docs/methods/PRD_GENERATOR.md +186 -0
- package/dist/docs/methods/PRODUCT_DESIGN_FRONTEND.md +250 -0
- package/dist/docs/methods/QA_ENGINEER.md +337 -0
- package/dist/docs/methods/RELEASE_MANAGER.md +145 -0
- package/dist/docs/methods/SECURITY_AUDITOR.md +320 -0
- package/dist/docs/methods/SUB_AGENTS.md +335 -0
- package/dist/docs/methods/SYSTEMS_ARCHITECT.md +171 -0
- package/dist/docs/methods/TESTING.md +359 -0
- package/dist/docs/methods/THUMPER.md +175 -0
- package/dist/docs/methods/TIME_VAULT.md +120 -0
- package/dist/docs/methods/TREASURY.md +184 -0
- package/dist/docs/methods/TROUBLESHOOTING.md +265 -0
- package/dist/docs/patterns/README.md +52 -0
- package/dist/docs/patterns/ad-billing-adapter.ts +537 -0
- package/dist/docs/patterns/ad-platform-adapter.ts +421 -0
- package/dist/docs/patterns/ai-classifier.ts +195 -0
- package/dist/docs/patterns/ai-eval.ts +272 -0
- package/dist/docs/patterns/ai-orchestrator.ts +341 -0
- package/dist/docs/patterns/ai-router.ts +194 -0
- package/dist/docs/patterns/ai-tool-schema.ts +237 -0
- package/dist/docs/patterns/api-route.ts +241 -0
- package/dist/docs/patterns/backtest-engine.ts +499 -0
- package/dist/docs/patterns/browser-review.ts +292 -0
- package/dist/docs/patterns/combobox.tsx +300 -0
- package/dist/docs/patterns/component.tsx +262 -0
- package/dist/docs/patterns/daemon-process.ts +338 -0
- package/dist/docs/patterns/data-pipeline.ts +297 -0
- package/dist/docs/patterns/database-migration.ts +466 -0
- package/dist/docs/patterns/e2e-test.ts +629 -0
- package/dist/docs/patterns/error-handling.ts +312 -0
- package/dist/docs/patterns/execution-safety.ts +601 -0
- package/dist/docs/patterns/financial-transaction.ts +342 -0
- package/dist/docs/patterns/funding-plan.ts +462 -0
- package/dist/docs/patterns/game-entity.ts +137 -0
- package/dist/docs/patterns/game-loop.ts +113 -0
- package/dist/docs/patterns/game-state.ts +143 -0
- package/dist/docs/patterns/job-queue.ts +225 -0
- package/dist/docs/patterns/kongo-integration.ts +164 -0
- package/dist/docs/patterns/middleware.ts +363 -0
- package/dist/docs/patterns/mobile-screen.tsx +139 -0
- package/dist/docs/patterns/mobile-service.ts +167 -0
- package/dist/docs/patterns/multi-tenant.ts +382 -0
- package/dist/docs/patterns/oauth-token-lifecycle.ts +223 -0
- package/dist/docs/patterns/outbound-rate-limiter.ts +260 -0
- package/dist/docs/patterns/prompt-template.ts +195 -0
- package/dist/docs/patterns/revenue-source-adapter.ts +311 -0
- package/dist/docs/patterns/service.ts +224 -0
- package/dist/docs/patterns/sse-endpoint.ts +118 -0
- package/dist/docs/patterns/stablecoin-adapter.ts +511 -0
- package/dist/docs/patterns/third-party-script.ts +68 -0
- package/dist/scripts/thumper/gom-jabbar.sh +241 -0
- package/dist/scripts/thumper/relay.sh +610 -0
- package/dist/scripts/thumper/scan.sh +359 -0
- package/dist/scripts/thumper/thumper.sh +190 -0
- package/dist/scripts/thumper/water-rings.sh +76 -0
- package/package.json +1 -1
- package/dist/tsconfig.tsbuildinfo +0 -1
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pattern: AI Eval
|
|
3
|
+
*
|
|
4
|
+
* Key principles:
|
|
5
|
+
* - Every AI feature needs a golden dataset — input/expected-output pairs
|
|
6
|
+
* - Automated eval runs catch regressions before they reach production
|
|
7
|
+
* - Compare scores across prompt versions — never ship a prompt that scores lower
|
|
8
|
+
* - Scoring functions are pluggable — exact match, semantic similarity, custom
|
|
9
|
+
* - Eval results are stored, not printed — you need history to detect drift
|
|
10
|
+
*
|
|
11
|
+
* Agents: Batman (testing/validation), Picard (architecture), L (monitoring)
|
|
12
|
+
*
|
|
13
|
+
* Provider note: Eval runs use the same model call patterns from ai-orchestrator.ts.
|
|
14
|
+
* The eval framework itself is provider-agnostic.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
// --- Core types ---

/** A single test case in a golden dataset: one input paired with the
 * output the model is expected to produce for it. */
export interface EvalCase<TInput = string, TExpected = string> {
  id: string // Stable ID for tracking across runs
  input: TInput
  expected: TExpected
  tags?: string[] // e.g., ['edge-case', 'billing', 'multi-language']
}

/** Result of evaluating a single case. */
export interface CaseResult {
  caseId: string
  passed: boolean // True when score >= the suite's pass threshold
  score: number // 0.0 - 1.0
  actual: string // What the model returned
  expected: string // What we wanted
  latencyMs: number // Wall-clock duration of the model call for this case
  error?: string // If the model call failed
}

/** Aggregate result of an eval run (one suite × one prompt version × one model). */
export interface EvalResult {
  runId: string // Unique per run
  promptVersion: string // Prompt version under evaluation
  model: string // Model identifier used for the run
  timestamp: string // ISO-8601 time the run completed
  totalCases: number
  passedCases: number
  averageScore: number // Mean of all case scores, 0.0 - 1.0
  averageLatencyMs: number
  caseResults: CaseResult[]
  tags: Record<string, { count: number; avgScore: number }> // Per-tag breakdown
}

/** Comparison between two eval runs (base vs. candidate prompt version). */
export interface VersionComparison {
  baseVersion: string
  candidateVersion: string
  baseScore: number
  candidateScore: number
  delta: number // Positive = improvement, negative = regression
  regressions: CaseResult[] // Cases that got worse
  improvements: CaseResult[] // Cases that got better
  verdict: 'pass' | 'fail' | 'review' // Based on regression threshold
}
|
|
63
|
+
|
|
64
|
+
// --- Scoring functions ---
|
|
65
|
+
|
|
66
|
+
/** Exact string match (case-insensitive). */
|
|
67
|
+
export function exactMatch(actual: string, expected: string): number {
|
|
68
|
+
return actual.trim().toLowerCase() === expected.trim().toLowerCase() ? 1.0 : 0.0
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
/** Check if expected value is contained in actual output. */
|
|
72
|
+
export function containsMatch(actual: string, expected: string): number {
|
|
73
|
+
return actual.toLowerCase().includes(expected.toLowerCase()) ? 1.0 : 0.0
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
/** JSON field match — compare specific fields in JSON outputs. */
|
|
77
|
+
export function jsonFieldMatch(
|
|
78
|
+
actual: string,
|
|
79
|
+
expected: string,
|
|
80
|
+
fields: string[]
|
|
81
|
+
): number {
|
|
82
|
+
try {
|
|
83
|
+
const actualObj = JSON.parse(actual)
|
|
84
|
+
const expectedObj = JSON.parse(expected)
|
|
85
|
+
let matches = 0
|
|
86
|
+
for (const field of fields) {
|
|
87
|
+
if (actualObj[field] === expectedObj[field]) matches++
|
|
88
|
+
}
|
|
89
|
+
return matches / fields.length
|
|
90
|
+
} catch {
|
|
91
|
+
return 0.0 // Parse failure = score 0
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
// --- EvalSuite ---
|
|
96
|
+
|
|
97
|
+
type ModelRunner = (input: string) => Promise<string>
|
|
98
|
+
type ScoringFunction = (actual: string, expected: string) => number
|
|
99
|
+
|
|
100
|
+
export class EvalSuite<TInput = string> {
|
|
101
|
+
private cases: EvalCase<TInput, string>[] = []
|
|
102
|
+
private scoreFn: ScoringFunction = exactMatch
|
|
103
|
+
private passThreshold = 0.8 // Case passes if score >= this
|
|
104
|
+
|
|
105
|
+
constructor(private name: string) {}
|
|
106
|
+
|
|
107
|
+
/** Add a test case to the suite. */
|
|
108
|
+
addCase(testCase: EvalCase<TInput, string>): this {
|
|
109
|
+
this.cases.push(testCase)
|
|
110
|
+
return this
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
/** Add multiple test cases. */
|
|
114
|
+
addCases(cases: EvalCase<TInput, string>[]): this {
|
|
115
|
+
this.cases.push(...cases)
|
|
116
|
+
return this
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
/** Set the scoring function (default: exactMatch). */
|
|
120
|
+
withScoring(fn: ScoringFunction): this {
|
|
121
|
+
this.scoreFn = fn
|
|
122
|
+
return this
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
/** Set the pass threshold (default: 0.8). */
|
|
126
|
+
withPassThreshold(threshold: number): this {
|
|
127
|
+
this.passThreshold = threshold
|
|
128
|
+
return this
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
/** Run the eval suite against a model runner function. */
|
|
132
|
+
async run(
|
|
133
|
+
runner: ModelRunner,
|
|
134
|
+
promptVersion: string,
|
|
135
|
+
model: string
|
|
136
|
+
): Promise<EvalResult> {
|
|
137
|
+
const runId = `eval-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`
|
|
138
|
+
const caseResults: CaseResult[] = []
|
|
139
|
+
|
|
140
|
+
// Run cases sequentially to avoid rate limits. For large suites,
|
|
141
|
+
// batch with concurrency limit (e.g., p-limit with concurrency 5).
|
|
142
|
+
for (const testCase of this.cases) {
|
|
143
|
+
const start = Date.now()
|
|
144
|
+
let actual = ''
|
|
145
|
+
let error: string | undefined
|
|
146
|
+
|
|
147
|
+
try {
|
|
148
|
+
actual = await runner(testCase.input as string)
|
|
149
|
+
} catch (e) {
|
|
150
|
+
error = e instanceof Error ? e.message : 'Unknown error'
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
const score = error ? 0 : this.scoreFn(actual, testCase.expected)
|
|
154
|
+
|
|
155
|
+
caseResults.push({
|
|
156
|
+
caseId: testCase.id,
|
|
157
|
+
passed: score >= this.passThreshold,
|
|
158
|
+
score,
|
|
159
|
+
actual,
|
|
160
|
+
expected: testCase.expected,
|
|
161
|
+
latencyMs: Date.now() - start,
|
|
162
|
+
error,
|
|
163
|
+
})
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
// Compute per-tag breakdowns
|
|
167
|
+
const tags: Record<string, { count: number; avgScore: number }> = {}
|
|
168
|
+
for (const testCase of this.cases) {
|
|
169
|
+
for (const tag of testCase.tags ?? []) {
|
|
170
|
+
if (!tags[tag]) tags[tag] = { count: 0, avgScore: 0 }
|
|
171
|
+
const result = caseResults.find((r) => r.caseId === testCase.id)
|
|
172
|
+
if (result) {
|
|
173
|
+
tags[tag].count++
|
|
174
|
+
tags[tag].avgScore += result.score
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
for (const tag of Object.values(tags)) {
|
|
179
|
+
tag.avgScore = tag.avgScore / tag.count
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
return {
|
|
183
|
+
runId,
|
|
184
|
+
promptVersion,
|
|
185
|
+
model,
|
|
186
|
+
timestamp: new Date().toISOString(),
|
|
187
|
+
totalCases: this.cases.length,
|
|
188
|
+
passedCases: caseResults.filter((r) => r.passed).length,
|
|
189
|
+
averageScore: caseResults.reduce((sum, r) => sum + r.score, 0) / caseResults.length,
|
|
190
|
+
averageLatencyMs: caseResults.reduce((sum, r) => sum + r.latencyMs, 0) / caseResults.length,
|
|
191
|
+
caseResults,
|
|
192
|
+
tags,
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
// --- Version comparison ---
|
|
198
|
+
|
|
199
|
+
const REGRESSION_THRESHOLD = 0.02 // 2% drop triggers review
|
|
200
|
+
|
|
201
|
+
export function compareVersions(
|
|
202
|
+
base: EvalResult,
|
|
203
|
+
candidate: EvalResult
|
|
204
|
+
): VersionComparison {
|
|
205
|
+
const delta = candidate.averageScore - base.averageScore
|
|
206
|
+
|
|
207
|
+
// Find regressions: cases that scored lower in the candidate
|
|
208
|
+
const regressions: CaseResult[] = []
|
|
209
|
+
const improvements: CaseResult[] = []
|
|
210
|
+
|
|
211
|
+
for (const candidateCase of candidate.caseResults) {
|
|
212
|
+
const baseCase = base.caseResults.find((b) => b.caseId === candidateCase.caseId)
|
|
213
|
+
if (!baseCase) continue
|
|
214
|
+
|
|
215
|
+
if (candidateCase.score < baseCase.score) regressions.push(candidateCase)
|
|
216
|
+
if (candidateCase.score > baseCase.score) improvements.push(candidateCase)
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
let verdict: VersionComparison['verdict'] = 'pass'
|
|
220
|
+
if (delta < -REGRESSION_THRESHOLD) verdict = 'fail'
|
|
221
|
+
else if (regressions.length > 0) verdict = 'review'
|
|
222
|
+
|
|
223
|
+
return {
|
|
224
|
+
baseVersion: base.promptVersion,
|
|
225
|
+
candidateVersion: candidate.promptVersion,
|
|
226
|
+
baseScore: base.averageScore,
|
|
227
|
+
candidateScore: candidate.averageScore,
|
|
228
|
+
delta,
|
|
229
|
+
regressions,
|
|
230
|
+
improvements,
|
|
231
|
+
verdict,
|
|
232
|
+
}
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
// --- Usage example ---
|
|
236
|
+
|
|
237
|
+
// const suite = new EvalSuite('ticket-classifier')
|
|
238
|
+
// .withScoring(jsonFieldMatch)
|
|
239
|
+
// .addCases([
|
|
240
|
+
// { id: 'billing-1', input: 'I was charged twice', expected: '{"label":"billing"}', tags: ['billing'] },
|
|
241
|
+
// { id: 'tech-1', input: 'App crashes on login', expected: '{"label":"technical"}', tags: ['technical'] },
|
|
242
|
+
// ])
|
|
243
|
+
//
|
|
244
|
+
// const baseResult = await suite.run(classifyV1, '2024.01.01', 'claude-sonnet-4-20250514')
|
|
245
|
+
// const candidateResult = await suite.run(classifyV2, '2024.01.15', 'claude-sonnet-4-20250514')
|
|
246
|
+
// const comparison = compareVersions(baseResult, candidateResult)
|
|
247
|
+
//
|
|
248
|
+
// if (comparison.verdict === 'fail') {
|
|
249
|
+
// console.error(`Regression detected: ${comparison.delta.toFixed(3)} score drop`)
|
|
250
|
+
// process.exit(1) // Fail CI
|
|
251
|
+
// }
|
|
252
|
+
|
|
253
|
+
/**
|
|
254
|
+
* Framework adaptations:
|
|
255
|
+
*
|
|
256
|
+
* Express:
|
|
257
|
+
* - Run evals in CI (GitHub Actions) on prompt file changes
|
|
258
|
+
* - Store EvalResult in S3/database for historical comparison
|
|
259
|
+
* - Endpoint to trigger eval: POST /api/admin/eval (admin-only)
|
|
260
|
+
*
|
|
261
|
+
* FastAPI:
|
|
262
|
+
* - Same EvalSuite shape in Python with pytest fixtures
|
|
263
|
+
* - Use pytest-benchmark for latency tracking
|
|
264
|
+
* - Store results in PostgreSQL with SQLAlchemy models
|
|
265
|
+
* - CI: run eval suite in GitHub Actions, compare with previous run
|
|
266
|
+
*
|
|
267
|
+
* Django:
|
|
268
|
+
* - Management command: python manage.py run_eval --suite ticket-classifier
|
|
269
|
+
* - EvalResult and CaseResult as Django models for admin dashboard
|
|
270
|
+
* - Compare versions in admin: side-by-side eval result view
|
|
271
|
+
* - Celery task for large eval suites (100+ cases)
|
|
272
|
+
*/
|
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pattern: AI Orchestrator
|
|
3
|
+
*
|
|
4
|
+
* Three patterns for coordinating AI model calls:
|
|
5
|
+
* 1. Simple completion — single call with structured output
|
|
6
|
+
* 2. Agent loop — model calls tools in a loop until done
|
|
7
|
+
* 3. Circuit breaker — prevent cascading AI failures
|
|
8
|
+
*
|
|
9
|
+
* Key principles:
|
|
10
|
+
* - Always set MAX_ITERATIONS on agent loops — unbounded loops burn tokens
|
|
11
|
+
* - Retry with exponential backoff on transient failures (429, 5xx)
|
|
12
|
+
* - Circuit breaker protects downstream when a provider is degraded
|
|
13
|
+
* - Parse and validate model output with Zod — never trust raw JSON
|
|
14
|
+
* - Log every model call with latency, tokens, and model version
|
|
15
|
+
*
|
|
16
|
+
* Agents: Picard (architecture), Stark (backend), Kenobi (rate limits)
|
|
17
|
+
*
|
|
18
|
+
* Provider note: Primary examples use Anthropic SDK (@anthropic-ai/sdk).
|
|
19
|
+
* OpenAI adaptation is noted inline. The shapes are provider-agnostic.
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
import Anthropic from '@anthropic-ai/sdk'
|
|
23
|
+
import { z } from 'zod'
|
|
24
|
+
|
|
25
|
+
// --- 1. Simple Completion — single call → structured output ---

/** Zod schema the model's JSON output must satisfy; validated in summarize(). */
const SummarySchema = z.object({
  title: z.string(),
  bullets: z.array(z.string()).min(1).max(5), // 1-5 bullet points
  sentiment: z.enum(['positive', 'negative', 'neutral']),
})

/** Validated summary shape, derived from the schema above. */
type Summary = z.infer<typeof SummarySchema>
|
|
34
|
+
|
|
35
|
+
/** Single model call with retry and structured output parsing. */
|
|
36
|
+
export async function executeWithRetry<T>(
|
|
37
|
+
fn: () => Promise<T>,
|
|
38
|
+
maxRetries = 3,
|
|
39
|
+
baseDelayMs = 1000
|
|
40
|
+
): Promise<T> {
|
|
41
|
+
for (let attempt = 0; attempt <= maxRetries; attempt++) {
|
|
42
|
+
try {
|
|
43
|
+
return await fn()
|
|
44
|
+
} catch (error: unknown) {
|
|
45
|
+
const isRetryable =
|
|
46
|
+
error instanceof Error &&
|
|
47
|
+
('status' in error && [429, 500, 502, 503].includes((error as { status: number }).status))
|
|
48
|
+
|
|
49
|
+
if (!isRetryable || attempt === maxRetries) throw error
|
|
50
|
+
|
|
51
|
+
// Exponential backoff with jitter
|
|
52
|
+
const delay = baseDelayMs * 2 ** attempt + Math.random() * baseDelayMs
|
|
53
|
+
await new Promise((r) => setTimeout(r, delay))
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
throw new Error('Unreachable') // TypeScript needs this
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
/**
 * Summarize text via a single model call, returning a schema-validated result.
 *
 * Wraps the provider call in executeWithRetry for transient-failure handling,
 * then parses the model's text output and validates it with Zod.
 *
 * @param client Configured Anthropic client
 * @param text   Source text to summarize
 * @returns Parsed Summary matching SummarySchema
 * @throws Error when the first content block is not text
 * @throws SyntaxError when the model output is not valid JSON
 * @throws ZodError when the JSON does not match SummarySchema
 */
export async function summarize(client: Anthropic, text: string): Promise<Summary> {
  const response = await executeWithRetry(() =>
    client.messages.create({
      model: 'claude-sonnet-4-20250514',
      max_tokens: 512,
      messages: [{ role: 'user', content: `Summarize as JSON: ${text}` }],
      // System prompt enforces output shape
      system: 'Respond with JSON matching: { title, bullets[], sentiment }',
    })
  )

  // Extract text content, parse with Zod — never trust raw model output.
  // NOTE(review): assumes the text lives in the FIRST content block; a
  // response that leads with another block type throws — confirm acceptable.
  const content = response.content[0]
  if (content.type !== 'text') throw new Error('Expected text response')
  const parsed = JSON.parse(content.text)
  return SummarySchema.parse(parsed) // Throws ZodError on invalid shape
}
|
|
76
|
+
|
|
77
|
+
// OpenAI adaptation:
|
|
78
|
+
// const response = await openai.chat.completions.create({
|
|
79
|
+
// model: 'gpt-4o', messages: [...],
|
|
80
|
+
// response_format: { type: 'json_object' }, // OpenAI JSON mode
|
|
81
|
+
// })
|
|
82
|
+
// const parsed = JSON.parse(response.choices[0].message.content)
|
|
83
|
+
|
|
84
|
+
// --- 2. Agent Loop — model calls tools until done ---

const MAX_ITERATIONS = 10 // Hard bound — never remove this

/** Tool description passed to the model (Anthropic tool-use shape). */
interface ToolDefinition {
  name: string
  description: string
  input_schema: Record<string, unknown> // JSON Schema for the tool's input
}

/** Outcome of a completed agent loop. */
interface AgentResult {
  finalResponse: string // Text of the model's final (tool-free) reply
  toolCallCount: number // Total tool invocations across all iterations
  iterations: number // Model round-trips performed (1-based)
}

/**
 * Agent loop: the model decides which tools to call and iterates until it
 * answers with plain text (stop_reason 'end_turn') or MAX_ITERATIONS is hit.
 *
 * Each iteration: call the model, execute any requested tools, then append
 * the assistant's content followed by the tool results to the conversation
 * — that ordering is required by the messages protocol.
 *
 * @param client      Configured Anthropic client
 * @param prompt      Initial user prompt
 * @param tools       Tool definitions exposed to the model
 * @param executeTool Runs a named tool with its input; its errors are fed
 *                    back to the model as is_error tool results, not thrown
 * @throws Error when the loop exceeds MAX_ITERATIONS without finishing
 */
export async function runAgentLoop(
  client: Anthropic,
  prompt: string,
  tools: ToolDefinition[],
  executeTool: (name: string, input: Record<string, unknown>) => Promise<string>
): Promise<AgentResult> {
  const messages: Anthropic.MessageParam[] = [{ role: 'user', content: prompt }]
  let toolCallCount = 0

  for (let i = 0; i < MAX_ITERATIONS; i++) {
    const response = await executeWithRetry(() =>
      client.messages.create({
        model: 'claude-sonnet-4-20250514',
        max_tokens: 4096,
        messages,
        tools: tools as Anthropic.Tool[],
      })
    )

    // If model responds with text only (no tool calls), we're done
    if (response.stop_reason === 'end_turn') {
      const text = response.content.find((c) => c.type === 'text')
      return { finalResponse: text?.text ?? '', toolCallCount, iterations: i + 1 }
    }

    // Process tool calls
    const toolUseBlocks = response.content.filter((c) => c.type === 'tool_use')
    const toolResults: Anthropic.ToolResultBlockParam[] = []

    for (const block of toolUseBlocks) {
      if (block.type !== 'tool_use') continue // Type guard for the narrowed union
      toolCallCount++
      try {
        const result = await executeTool(block.name, block.input as Record<string, unknown>)
        toolResults.push({ type: 'tool_result', tool_use_id: block.id, content: result })
      } catch (error) {
        // Send error back to model — it can recover or try a different approach
        toolResults.push({
          type: 'tool_result',
          tool_use_id: block.id,
          content: `Error: ${error instanceof Error ? error.message : 'Unknown error'}`,
          is_error: true,
        })
      }
    }

    // Feed assistant response + tool results back for next iteration
    messages.push({ role: 'assistant', content: response.content })
    messages.push({ role: 'user', content: toolResults })
  }

  throw new Error(`Agent loop exceeded MAX_ITERATIONS (${MAX_ITERATIONS})`)
}
|
|
154
|
+
|
|
155
|
+
// --- 3. Circuit Breaker — prevent cascading AI failures ---
|
|
156
|
+
|
|
157
|
+
type CircuitState = 'closed' | 'open' | 'half-open'
|
|
158
|
+
|
|
159
|
+
/** Circuit breaker for AI provider calls. Opens after threshold failures. */
|
|
160
|
+
export class CircuitBreaker {
|
|
161
|
+
private state: CircuitState = 'closed'
|
|
162
|
+
private failureCount = 0
|
|
163
|
+
private lastFailureTime = 0
|
|
164
|
+
|
|
165
|
+
constructor(
|
|
166
|
+
private readonly failureThreshold: number = 5,
|
|
167
|
+
private readonly resetTimeoutMs: number = 60_000
|
|
168
|
+
) {}
|
|
169
|
+
|
|
170
|
+
async execute<T>(fn: () => Promise<T>, fallback: () => Promise<T>): Promise<T> {
|
|
171
|
+
if (this.state === 'open') {
|
|
172
|
+
// Check if enough time has passed to try again
|
|
173
|
+
if (Date.now() - this.lastFailureTime > this.resetTimeoutMs) {
|
|
174
|
+
this.state = 'half-open'
|
|
175
|
+
} else {
|
|
176
|
+
return fallback() // Circuit open — use fallback immediately
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
try {
|
|
181
|
+
const result = await fn()
|
|
182
|
+
// Success — reset if we were testing
|
|
183
|
+
if (this.state === 'half-open') this.state = 'closed'
|
|
184
|
+
this.failureCount = 0
|
|
185
|
+
return result
|
|
186
|
+
} catch (error) {
|
|
187
|
+
this.failureCount++
|
|
188
|
+
this.lastFailureTime = Date.now()
|
|
189
|
+
|
|
190
|
+
if (this.failureCount >= this.failureThreshold) {
|
|
191
|
+
this.state = 'open'
|
|
192
|
+
}
|
|
193
|
+
return fallback()
|
|
194
|
+
}
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
getState(): CircuitState {
|
|
198
|
+
return this.state
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
// Usage:
|
|
203
|
+
// const breaker = new CircuitBreaker(5, 60_000)
|
|
204
|
+
// const result = await breaker.execute(
|
|
205
|
+
// () => summarize(anthropicClient, text),
|
|
206
|
+
// () => ruleBased.summarize(text) // Fallback: no AI, just rules
|
|
207
|
+
// )
|
|
208
|
+
|
|
209
|
+
/**
|
|
210
|
+
* Framework adaptations:
|
|
211
|
+
*
|
|
212
|
+
* Express:
|
|
213
|
+
* - Wrap agent loop in an Express route with req.setTimeout() for long-running calls
|
|
214
|
+
* - Circuit breaker as singleton middleware: app.use(aiCircuitBreaker)
|
|
215
|
+
* - Stream partial results via res.write() for SSE (see sse-endpoint.ts)
|
|
216
|
+
*
|
|
217
|
+
* FastAPI:
|
|
218
|
+
* - executeWithRetry → tenacity.retry(wait=wait_exponential(), stop=stop_after_attempt(3))
|
|
219
|
+
* - Agent loop: same shape, use httpx.AsyncClient for provider calls
|
|
220
|
+
* - Circuit breaker: pybreaker library or roll your own with same state machine
|
|
221
|
+
* - Background agent loops: FastAPI BackgroundTasks or Celery
|
|
222
|
+
*
|
|
223
|
+
* Django:
|
|
224
|
+
* - Services layer (services.py) holds orchestration logic — never in views
|
|
225
|
+
* - Circuit breaker state in Django cache (Redis) for multi-process
|
|
226
|
+
* - Agent loops in Celery tasks with soft_time_limit for MAX_ITERATIONS equivalent
|
|
227
|
+
* - Use django-ratelimit on the view to protect upstream AI spend
|
|
228
|
+
*/
|
|
229
|
+
|
|
230
|
+
// --- 4. Multi-Tenant AI — per-org isolation, keys, cost tracking ---
|
|
231
|
+
|
|
232
|
+
/** Per-Tenant Circuit Breakers — scoped by provider+orgId, not just provider.
|
|
233
|
+
* One org's invalid API key must not trip the breaker for all orgs. */
|
|
234
|
+
const tenantBreakers = new Map<string, CircuitBreaker>()
|
|
235
|
+
|
|
236
|
+
function getTenantBreaker(provider: string, orgId: string): CircuitBreaker {
|
|
237
|
+
const key = `${provider}:${orgId}`
|
|
238
|
+
let breaker = tenantBreakers.get(key)
|
|
239
|
+
if (!breaker) {
|
|
240
|
+
breaker = new CircuitBreaker(5, 60_000)
|
|
241
|
+
tenantBreakers.set(key, breaker)
|
|
242
|
+
}
|
|
243
|
+
return breaker
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
// Usage:
|
|
247
|
+
// const breaker = getTenantBreaker('anthropic', org.id)
|
|
248
|
+
// await breaker.execute(() => summarize(client, text), fallback)
|
|
249
|
+
|
|
250
|
+
/** Shared Transport with Per-Tenant Keys — one connection pool, per-org auth.
 * NOTE(review): `sharedTransport` is constructed here but never referenced by
 * `getOrgClient` below — confirm whether it is used elsewhere or is dead. */
const sharedTransport = new Anthropic() // Base client — shared pool

/** Build a client authenticated with a specific org's API key. */
function getOrgClient(orgApiKey: string): Anthropic {
  // NOTE(review): this constructs a NEW Anthropic client per call; the code
  // does not demonstrate any transport/pool sharing with `sharedTransport`
  // above — verify against the SDK before relying on that.
  return new Anthropic({ apiKey: orgApiKey })
  // OpenAI: new OpenAI({ apiKey: orgApiKey })
  // Note: Anthropic SDK creates lightweight client instances.
  // For providers supporting .withOptions(), prefer that to avoid any pool duplication.
}
|
|
261
|
+
|
|
262
|
+
/** API Key Fallback Chain — 3-tier resolution for provider credentials.
|
|
263
|
+
* (1) Org-specific from encrypted store → (2) Default org key → (3) Env var. */
|
|
264
|
+
interface CredentialStore {
|
|
265
|
+
get(orgId: string, provider: string): Promise<string | null>
|
|
266
|
+
getDefault(provider: string): Promise<string | null>
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
async function resolveApiKey(
|
|
270
|
+
orgId: string,
|
|
271
|
+
provider: string,
|
|
272
|
+
store: CredentialStore
|
|
273
|
+
): Promise<string> {
|
|
274
|
+
// Tier 1: org-specific credential
|
|
275
|
+
const orgKey = await store.get(orgId, provider)
|
|
276
|
+
if (orgKey) return orgKey
|
|
277
|
+
|
|
278
|
+
// Tier 2: default org credential (shared across orgs without their own key)
|
|
279
|
+
const defaultKey = await store.getDefault(provider)
|
|
280
|
+
if (defaultKey) return defaultKey
|
|
281
|
+
|
|
282
|
+
// Tier 3: environment variable
|
|
283
|
+
const envMap: Record<string, string> = {
|
|
284
|
+
anthropic: 'ANTHROPIC_API_KEY',
|
|
285
|
+
openai: 'OPENAI_API_KEY',
|
|
286
|
+
}
|
|
287
|
+
const envKey = process.env[envMap[provider] ?? '']
|
|
288
|
+
if (envKey) return envKey
|
|
289
|
+
|
|
290
|
+
throw new Error(`No API key found for provider=${provider}, orgId=${orgId}`)
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
/** Credential Verification Probe — validate key before storing.
|
|
294
|
+
* Makes a lightweight API call (list models) to confirm the key works. */
|
|
295
|
+
async function verifyCredential(provider: string, apiKey: string): Promise<boolean> {
|
|
296
|
+
try {
|
|
297
|
+
if (provider === 'anthropic') {
|
|
298
|
+
const probe = new Anthropic({ apiKey })
|
|
299
|
+
// Minimal call — small max_tokens to burn near-zero quota
|
|
300
|
+
await probe.messages.create({
|
|
301
|
+
model: 'claude-sonnet-4-20250514',
|
|
302
|
+
max_tokens: 1,
|
|
303
|
+
messages: [{ role: 'user', content: 'ping' }],
|
|
304
|
+
})
|
|
305
|
+
}
|
|
306
|
+
// OpenAI: await new OpenAI({ apiKey }).models.list()
|
|
307
|
+
return true
|
|
308
|
+
} catch {
|
|
309
|
+
return false // Invalid key — do not store
|
|
310
|
+
}
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
/** Per-Tenant Cost Attribution — thread orgId through all usage recording. */
|
|
314
|
+
interface AiUsageRecord {
|
|
315
|
+
orgId: string
|
|
316
|
+
provider: string
|
|
317
|
+
model: string
|
|
318
|
+
inputTokens: number
|
|
319
|
+
outputTokens: number
|
|
320
|
+
costCents: number
|
|
321
|
+
timestamp: number
|
|
322
|
+
}
|
|
323
|
+
|
|
324
|
+
interface UsageSink {
|
|
325
|
+
record(entry: AiUsageRecord): void
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
function recordUsage(
|
|
329
|
+
sink: UsageSink,
|
|
330
|
+
orgId: string,
|
|
331
|
+
provider: string,
|
|
332
|
+
model: string,
|
|
333
|
+
inputTokens: number,
|
|
334
|
+
outputTokens: number,
|
|
335
|
+
costCents: number
|
|
336
|
+
): void {
|
|
337
|
+
sink.record({ orgId, provider, model, inputTokens, outputTokens, costCents, timestamp: Date.now() })
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
// Usage:
|
|
341
|
+
// recordUsage(sink, org.id, 'anthropic', 'claude-sonnet-4-20250514', 320, 150, 2)
|