npm - outcome-cli - Versions diffs - 1.0.0 - Mend

outcome-cli 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (113) hide show

package/README.md +261 -0
package/package.json +95 -0
package/src/agents/README.md +139 -0
package/src/agents/adapters/anthropic.adapter.ts +166 -0
package/src/agents/adapters/dalle.adapter.ts +145 -0
package/src/agents/adapters/gemini.adapter.ts +134 -0
package/src/agents/adapters/imagen.adapter.ts +106 -0
package/src/agents/adapters/nano-banana.adapter.ts +129 -0
package/src/agents/adapters/openai.adapter.ts +165 -0
package/src/agents/adapters/veo.adapter.ts +130 -0
package/src/agents/agent.schema.property.test.ts +379 -0
package/src/agents/agent.schema.test.ts +148 -0
package/src/agents/agent.schema.ts +263 -0
package/src/agents/index.ts +60 -0
package/src/agents/registered-agent.schema.ts +356 -0
package/src/agents/registry.ts +97 -0
package/src/agents/tournament-configs.property.test.ts +266 -0
package/src/cli/README.md +145 -0
package/src/cli/commands/define.ts +79 -0
package/src/cli/commands/list.ts +46 -0
package/src/cli/commands/logs.ts +83 -0
package/src/cli/commands/run.ts +416 -0
package/src/cli/commands/verify.ts +110 -0
package/src/cli/index.ts +81 -0
package/src/config/README.md +128 -0
package/src/config/env.ts +262 -0
package/src/config/index.ts +19 -0
package/src/eval/README.md +318 -0
package/src/eval/ai-judge.test.ts +435 -0
package/src/eval/ai-judge.ts +368 -0
package/src/eval/code-validators.ts +414 -0
package/src/eval/evaluateOutcome.property.test.ts +1174 -0
package/src/eval/evaluateOutcome.ts +591 -0
package/src/eval/immigration-validators.ts +122 -0
package/src/eval/index.ts +90 -0
package/src/eval/judge-cache.ts +402 -0
package/src/eval/tournament-validators.property.test.ts +439 -0
package/src/eval/validators.property.test.ts +1118 -0
package/src/eval/validators.ts +1199 -0
package/src/eval/weighted-scorer.ts +285 -0
package/src/index.ts +17 -0
package/src/league/README.md +188 -0
package/src/league/health-check.ts +353 -0
package/src/league/index.ts +93 -0
package/src/league/killAgent.ts +151 -0
package/src/league/league.test.ts +1151 -0
package/src/league/runLeague.ts +843 -0
package/src/league/scoreAgent.ts +175 -0
package/src/modules/omnibridge/__tests__/.gitkeep +1 -0
package/src/modules/omnibridge/__tests__/auth-tunnel.property.test.ts +524 -0
package/src/modules/omnibridge/__tests__/deterministic-logger.property.test.ts +965 -0
package/src/modules/omnibridge/__tests__/ghost-api.property.test.ts +461 -0
package/src/modules/omnibridge/__tests__/omnibridge-integration.test.ts +542 -0
package/src/modules/omnibridge/__tests__/parallel-executor.property.test.ts +671 -0
package/src/modules/omnibridge/__tests__/semantic-normalizer.property.test.ts +521 -0
package/src/modules/omnibridge/__tests__/semantic-normalizer.test.ts +254 -0
package/src/modules/omnibridge/__tests__/session-vault.property.test.ts +367 -0
package/src/modules/omnibridge/__tests__/shadow-session.property.test.ts +523 -0
package/src/modules/omnibridge/__tests__/triangulation-engine.property.test.ts +292 -0
package/src/modules/omnibridge/__tests__/verification-engine.property.test.ts +769 -0
package/src/modules/omnibridge/api/.gitkeep +1 -0
package/src/modules/omnibridge/api/ghost-api.ts +1087 -0
package/src/modules/omnibridge/auth/.gitkeep +1 -0
package/src/modules/omnibridge/auth/auth-tunnel.ts +843 -0
package/src/modules/omnibridge/auth/session-vault.ts +577 -0
package/src/modules/omnibridge/core/.gitkeep +1 -0
package/src/modules/omnibridge/core/semantic-normalizer.ts +702 -0
package/src/modules/omnibridge/core/triangulation-engine.ts +530 -0
package/src/modules/omnibridge/core/types.ts +610 -0
package/src/modules/omnibridge/execution/.gitkeep +1 -0
package/src/modules/omnibridge/execution/deterministic-logger.ts +629 -0
package/src/modules/omnibridge/execution/parallel-executor.ts +542 -0
package/src/modules/omnibridge/execution/shadow-session.ts +794 -0
package/src/modules/omnibridge/index.ts +212 -0
package/src/modules/omnibridge/omnibridge.ts +510 -0
package/src/modules/omnibridge/verification/.gitkeep +1 -0
package/src/modules/omnibridge/verification/verification-engine.ts +783 -0
package/src/outcomes/README.md +75 -0
package/src/outcomes/acquire-pilot-customer.ts +297 -0
package/src/outcomes/code-delivery-outcomes.ts +89 -0
package/src/outcomes/code-outcomes.ts +256 -0
package/src/outcomes/code_review_battle.test.ts +135 -0
package/src/outcomes/code_review_battle.ts +135 -0
package/src/outcomes/cold_email_battle.ts +97 -0
package/src/outcomes/content_creation_battle.ts +160 -0
package/src/outcomes/f1_stem_opt_compliance.ts +61 -0
package/src/outcomes/index.ts +107 -0
package/src/outcomes/lead_gen_battle.test.ts +113 -0
package/src/outcomes/lead_gen_battle.ts +99 -0
package/src/outcomes/outcome.schema.property.test.ts +229 -0
package/src/outcomes/outcome.schema.ts +187 -0
package/src/outcomes/qualified_sales_interest.ts +118 -0
package/src/outcomes/swarm_planner.property.test.ts +370 -0
package/src/outcomes/swarm_planner.ts +96 -0
package/src/outcomes/web_extraction.ts +234 -0
package/src/runtime/README.md +220 -0
package/src/runtime/agentRunner.test.ts +341 -0
package/src/runtime/agentRunner.ts +746 -0
package/src/runtime/claudeAdapter.ts +232 -0
package/src/runtime/costTracker.ts +123 -0
package/src/runtime/index.ts +34 -0
package/src/runtime/modelAdapter.property.test.ts +305 -0
package/src/runtime/modelAdapter.ts +144 -0
package/src/runtime/openaiAdapter.ts +235 -0
package/src/utils/README.md +122 -0
package/src/utils/command-runner.ts +134 -0
package/src/utils/cost-guard.ts +379 -0
package/src/utils/errors.test.ts +290 -0
package/src/utils/errors.ts +442 -0
package/src/utils/index.ts +37 -0
package/src/utils/logger.test.ts +361 -0
package/src/utils/logger.ts +419 -0
package/src/utils/output-parsers.ts +216 -0

package/src/eval/ai-judge.ts ADDED Viewed

@@ -0,0 +1,368 @@
+/**
+ * AI-Powered Evaluation System
+ *
+ * Implements AI judges for subjective bounty criteria evaluation.
+ * Supports GPT-4o and Claude Opus models for fair evaluation of creative tasks.
+ *
+ * @module eval/ai-judge
+ * @see Requirements 10.1, 10.2, 10.3, 10.4, 10.5
+ */
+import Anthropic from '@anthropic-ai/sdk';
+import OpenAI from 'openai';
+import { getJudgeCache, type JudgeCache, type JudgeResult, type JudgeModel } from './judge-cache.js';
+import { createHash } from 'crypto';
+// Re-export types from judge-cache for convenience
+export type { JudgeModel, JudgeResult } from './judge-cache.js';
+/**
+ * Configuration for an AI judge evaluation.
+ *
+ * One config describes a single judging request: which model judges, the
+ * rubric it judges against, and the scale of the score it reports.
+ *
+ * @see Requirements 10.1, 10.5
+ */
+export interface JudgeConfig {
+  /** The AI model to use for evaluation */
+  model: JudgeModel;
+  /**
+   * The rubric describing evaluation criteria.
+   * Embedded verbatim in the judge prompt and hashed, together with the
+   * artifact, into the cache key.
+   */
+  rubric: string;
+  /**
+   * Maximum score that can be awarded.
+   * Parsed scores are clamped to this bound, and it is the divisor used for
+   * the normalized score.
+   */
+  maxScore: number;
+  /** Optional temperature for model responses (the GPT-4o request falls back to 0.3 when omitted) */
+  temperature?: number;
+  /** Optional maximum tokens for response (requests fall back to 1024 when omitted) */
+  maxTokens?: number;
+}
+/**
+ * Internal structure for parsed judge response.
+ *
+ * This is the shape returned by parseJudgeResponse and by the per-model
+ * evaluators; evaluateWithAIJudge copies its fields into the public
+ * JudgeResult.
+ */
+interface ParsedJudgeResponse {
+  score: number; // clamped to the range 0..maxScore during parsing
+  reasoning: string; // the judge's explanation; blank values are rejected during parsing
+  highlights: string[]; // string entries of the judge's "highlights" array, [] when absent
+}
+/**
+ * Failure raised by the AI judge pipeline.
+ *
+ * Carries the judge model the failure is attributed to and, when one was
+ * supplied, the underlying error that triggered it.
+ */
+export class AIJudgeError extends Error {
+  /** Judge model associated with the failure. */
+  public readonly model: JudgeModel;
+  /** Underlying error, if the caller provided one. */
+  public readonly cause?: Error;
+  constructor(message: string, model: JudgeModel, cause?: Error) {
+    super(message);
+    this.model = model;
+    this.cause = cause;
+    this.name = 'AIJudgeError';
+  }
+}
+/**
+ * Derives the cache key for a judge evaluation.
+ *
+ * Serializes `{ artifact, rubric }` with JSON.stringify and returns the
+ * SHA-256 digest of that string.
+ *
+ * @param artifact - The artifact being evaluated
+ * @param rubric - The evaluation rubric
+ * @returns Hex-encoded SHA-256 digest used as the cache key
+ *
+ * @see Requirements 10.4
+ */
+export function hashArtifact(artifact: unknown, rubric: string): string {
+  const hasher = createHash('sha256');
+  hasher.update(JSON.stringify({ artifact, rubric }));
+  return hasher.digest('hex');
+}
+/**
+ * Assembles the prompt sent to the judge model.
+ *
+ * String artifacts are embedded as-is; any other value is pretty-printed as
+ * JSON with two-space indentation. The prompt lists the rubric, the output
+ * under evaluation, the scoring instructions, and the JSON reply format.
+ *
+ * @param artifact - The artifact to evaluate
+ * @param rubric - The evaluation rubric
+ * @param maxScore - Maximum score possible
+ * @returns Formatted prompt string
+ *
+ * @see Requirements 10.2
+ */
+function buildJudgePrompt(artifact: unknown, rubric: string, maxScore: number): string {
+  let renderedArtifact: string;
+  if (typeof artifact === 'string') {
+    renderedArtifact = artifact;
+  } else {
+    renderedArtifact = JSON.stringify(artifact, null, 2);
+  }
+  // Interpolated lines stay template literals so values are stringified
+  // exactly as they would be inside a single multi-line template.
+  const promptLines = [
+    "You are an expert AI judge evaluating an agent's output against a specific rubric.",
+    '## Evaluation Rubric',
+    `${rubric}`,
+    '## Agent Output to Evaluate',
+    `${renderedArtifact}`,
+    '## Instructions',
+    "1. Carefully evaluate the agent's output against each criterion in the rubric",
+    `2. Provide a score from 0 to ${maxScore} based on how well the output meets the criteria`,
+    '3. Explain your reasoning in detail',
+    '4. Highlight any notable strengths or weaknesses',
+    '## Response Format',
+    'Respond with a JSON object in this exact format:',
+    '{',
+    `  "score": <number from 0 to ${maxScore}>,`,
+    '  "reasoning": "<detailed explanation of your evaluation>",',
+    '  "highlights": ["<notable aspect 1>", "<notable aspect 2>", ...]',
+    '}',
+    'Respond ONLY with the JSON object, no additional text.',
+  ];
+  return promptLines.join('\n');
+}
+/**
+ * Parses the AI judge response into a structured format.
+ *
+ * The text from the first `{` through the last `}` of the response is parsed
+ * as JSON, so surrounding prose or code fences are tolerated. The score is
+ * clamped to the range 0..maxScore; non-string highlight entries are dropped.
+ *
+ * @param response - Raw response from the AI model
+ * @param maxScore - Maximum score, used as the upper clamp bound
+ * @param model - Judge model that produced the response; attached to any error thrown
+ * @returns Parsed judge response
+ * @throws AIJudgeError if no JSON object is found, the JSON is malformed, the
+ *   score is missing or not a finite number, or the reasoning is missing or
+ *   blank. The underlying error is attached as `cause`.
+ */
+function parseJudgeResponse(response: string, maxScore: number, model: JudgeModel): ParsedJudgeResponse {
+  try {
+    // Greedy match: take everything between the first '{' and the last '}'
+    const jsonMatch = response.match(/\{[\s\S]*\}/);
+    if (!jsonMatch) {
+      throw new Error('No JSON object found in response');
+    }
+    const parsed = JSON.parse(jsonMatch[0]) as Record<string, unknown>;
+    // Validate required fields. JSON numbers such as 1e999 parse to Infinity;
+    // reject them instead of letting the clamp turn them into a perfect score.
+    if (typeof parsed.score !== 'number' || !Number.isFinite(parsed.score)) {
+      throw new Error('Missing or invalid "score" field');
+    }
+    if (typeof parsed.reasoning !== 'string' || parsed.reasoning.trim() === '') {
+      throw new Error('Missing or invalid "reasoning" field');
+    }
+    // Clamp score to valid range
+    const score = Math.max(0, Math.min(maxScore, parsed.score));
+    // Parse highlights (optional, default to empty array)
+    const highlights = Array.isArray(parsed.highlights)
+      ? parsed.highlights.filter((h): h is string => typeof h === 'string')
+      : [];
+    return {
+      score,
+      reasoning: parsed.reasoning,
+      highlights,
+    };
+  } catch (error) {
+    const errorMessage = error instanceof Error ? error.message : 'Unknown parsing error';
+    // Keep the original failure reachable for callers, as the evaluators do.
+    const cause = error instanceof Error ? error : undefined;
+    throw new AIJudgeError(`Failed to parse judge response: ${errorMessage}`, model, cause);
+  }
+}
+/**
+ * Asks GPT-4o to judge an artifact.
+ *
+ * Reads the API key from OPENAI_API_KEY, sends the judge prompt as a single
+ * user message alongside a fixed system message, and parses the first choice
+ * of the completion. Temperature falls back to 0.3 and max tokens to 1024.
+ *
+ * @param artifact - The artifact to evaluate
+ * @param config - Judge configuration
+ * @returns Parsed judge response
+ * @throws AIJudgeError when the API key is missing, the reply is empty or
+ *   cannot be parsed, or the request itself fails (original error in `cause`)
+ */
+async function evaluateWithGPT4o(
+  artifact: unknown,
+  config: JudgeConfig
+): Promise<ParsedJudgeResponse> {
+  const { OPENAI_API_KEY: apiKey } = process.env;
+  if (!apiKey) {
+    throw new AIJudgeError('OPENAI_API_KEY not configured', 'gpt-4o');
+  }
+  const openai = new OpenAI({ apiKey });
+  const userPrompt = buildJudgePrompt(artifact, config.rubric, config.maxScore);
+  try {
+    const completion = await openai.chat.completions.create({
+      model: 'gpt-4o',
+      messages: [
+        {
+          role: 'system',
+          content: 'You are an expert AI judge. Evaluate outputs fairly and provide detailed reasoning.',
+        },
+        { role: 'user', content: userPrompt },
+      ],
+      temperature: config.temperature ?? 0.3,
+      max_tokens: config.maxTokens ?? 1024,
+    });
+    const replyText = completion.choices[0]?.message?.content;
+    if (!replyText) {
+      throw new AIJudgeError('Empty response from GPT-4o', 'gpt-4o');
+    }
+    return parseJudgeResponse(replyText, config.maxScore, 'gpt-4o');
+  } catch (error) {
+    // Anything that is not already a judge error gets wrapped with its cause.
+    if (!(error instanceof AIJudgeError)) {
+      const cause = error instanceof Error ? error : new Error(String(error));
+      throw new AIJudgeError(`GPT-4o evaluation failed: ${cause.message}`, 'gpt-4o', cause);
+    }
+    throw error;
+  }
+}
+/**
+ * Evaluates an artifact using Claude Opus as the judge.
+ *
+ * Reads the API key from ANTHROPIC_API_KEY, sends the judge prompt as a single
+ * user message with a fixed system prompt, and parses the first content block
+ * of the reply. Max tokens falls back to 1024. Temperature falls back to 0.3,
+ * the same default the GPT-4o judge uses, and is capped at 1 because the
+ * Anthropic Messages API only accepts temperatures up to 1.0.
+ *
+ * @param artifact - The artifact to evaluate
+ * @param config - Judge configuration
+ * @returns Parsed judge response
+ * @throws AIJudgeError when the API key is missing, the reply has no usable
+ *   text block or cannot be parsed, or the request itself fails (original
+ *   error in `cause`)
+ */
+async function evaluateWithClaudeOpus(
+  artifact: unknown,
+  config: JudgeConfig
+): Promise<ParsedJudgeResponse> {
+  const apiKey = process.env.ANTHROPIC_API_KEY;
+  if (!apiKey) {
+    throw new AIJudgeError('ANTHROPIC_API_KEY not configured', 'claude-opus');
+  }
+  const client = new Anthropic({ apiKey });
+  const prompt = buildJudgePrompt(artifact, config.rubric, config.maxScore);
+  // Honor the configured temperature (previously never sent), capped to the API's upper bound.
+  const temperature = Math.min(config.temperature ?? 0.3, 1);
+  try {
+    const response = await client.messages.create({
+      model: 'claude-3-opus-20240229',
+      max_tokens: config.maxTokens ?? 1024,
+      temperature,
+      system: 'You are an expert AI judge. Evaluate outputs fairly and provide detailed reasoning.',
+      messages: [
+        {
+          role: 'user',
+          content: prompt,
+        },
+      ],
+    });
+    const content = response.content[0];
+    // An empty content array yields undefined here; report it as an empty reply.
+    if (!content || content.type !== 'text' || !content.text) {
+      throw new AIJudgeError('Empty response from Claude Opus', 'claude-opus');
+    }
+    return parseJudgeResponse(content.text, config.maxScore, 'claude-opus');
+  } catch (error) {
+    if (error instanceof AIJudgeError) {
+      throw error;
+    }
+    const cause = error instanceof Error ? error : new Error(String(error));
+    throw new AIJudgeError(`Claude Opus evaluation failed: ${cause.message}`, 'claude-opus', cause);
+  }
+}
+/**
+ * Scores an artifact with the configured AI judge, using the cache first.
+ *
+ * Steps:
+ * 1. Look up a previous verdict under the artifact + rubric hash
+ * 2. On a miss, call the evaluator for the configured model
+ * 3. Build the JudgeResult (raw and normalized score, reasoning, highlights)
+ * 4. Store the verdict, minus its `cached` flag, for later calls
+ *
+ * NOTE: the cache key covers only the artifact and the rubric, so a cached
+ * verdict is returned regardless of `config.model` or `config.maxScore`.
+ *
+ * @param artifact - The artifact to evaluate
+ * @param config - Judge configuration including model, rubric, and maxScore
+ * @param cache - Optional cache instance (defaults to global cache)
+ * @returns JudgeResult with score, reasoning, and highlights; `cached` is
+ *   true when the verdict came from the cache
+ *
+ * @example
+ * const result = await evaluateWithAIJudge(
+ *   { message: "Hello world", quality: "high" },
+ *   {
+ *     model: 'gpt-4o',
+ *     rubric: 'Evaluate the message for clarity and professionalism...',
+ *     maxScore: 10
+ *   }
+ * );
+ *
+ * @see Requirements 10.1, 10.2, 10.3, 10.4, 10.5
+ */
+export async function evaluateWithAIJudge(
+  artifact: unknown,
+  config: JudgeConfig,
+  cache?: JudgeCache
+): Promise<JudgeResult> {
+  const store = cache ?? getJudgeCache();
+  const key = hashArtifact(artifact, config.rubric);
+  // Cache hit: hand back the stored verdict, flagged as cached.
+  const hit = await store.get(key);
+  if (hit) {
+    return { ...hit, cached: true };
+  }
+  // Cache miss: dispatch to the evaluator for the requested model.
+  let verdict: ParsedJudgeResponse;
+  if (config.model === 'gpt-4o') {
+    verdict = await evaluateWithGPT4o(artifact, config);
+  } else if (config.model === 'claude-opus') {
+    verdict = await evaluateWithClaudeOpus(artifact, config);
+  } else {
+    const exhaustiveCheck: never = config.model;
+    throw new AIJudgeError(`Unsupported judge model: ${exhaustiveCheck}`, config.model);
+  }
+  const { score, reasoning, highlights } = verdict;
+  const result: JudgeResult = {
+    score,
+    normalizedScore: config.maxScore > 0 ? score / config.maxScore : 0,
+    reasoning,
+    highlights,
+    model: config.model,
+    cached: false,
+    evaluatedAt: new Date().toISOString(),
+  };
+  // Persist a copy that does not carry the 'cached' flag.
+  const toStore: Omit<JudgeResult, 'cached'> & { cached?: boolean } = { ...result };
+  delete toStore.cached;
+  await store.set(key, toStore as JudgeResult);
+  return result;
+}
+/**
+ * Validates a JudgeConfig object.
+ *
+ * Checks, in order: the model is one of the supported judges, the rubric is a
+ * non-blank string, maxScore is a finite number greater than zero, temperature
+ * (when given) is a finite number from 0 to 2, and maxTokens (when given) is
+ * an integer from 1 to 4096. NaN and Infinity are rejected for every numeric
+ * field; plain comparisons alone would let NaN through.
+ *
+ * @param config - Configuration to validate
+ * @returns true if valid
+ * @throws Error describing the first field that fails validation
+ */
+export function validateJudgeConfig(config: JudgeConfig): boolean {
+  const { model, rubric, maxScore, temperature, maxTokens } = config;
+  if (!['gpt-4o', 'claude-opus'].includes(model)) {
+    throw new Error(`Invalid judge model: ${model}`);
+  }
+  if (typeof rubric !== 'string' || rubric.trim() === '') {
+    throw new Error('Rubric must be a non-empty string');
+  }
+  // Number.isFinite is false for NaN, +/-Infinity and non-number values.
+  if (!Number.isFinite(maxScore) || maxScore <= 0) {
+    throw new Error('maxScore must be a positive number');
+  }
+  if (temperature !== undefined && (!Number.isFinite(temperature) || temperature < 0 || temperature > 2)) {
+    throw new Error('temperature must be between 0 and 2');
+  }
+  // Token limits are whole numbers, so fractional values are rejected as well.
+  if (maxTokens !== undefined && (!Number.isInteger(maxTokens) || maxTokens < 1 || maxTokens > 4096)) {
+    throw new Error('maxTokens must be between 1 and 4096');
+  }
+  return true;
+}