npm - outcome-cli - Versions diffs - 1.0.0 - Mend

outcome-cli 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (113) hide show

package/README.md +261 -0
package/package.json +95 -0
package/src/agents/README.md +139 -0
package/src/agents/adapters/anthropic.adapter.ts +166 -0
package/src/agents/adapters/dalle.adapter.ts +145 -0
package/src/agents/adapters/gemini.adapter.ts +134 -0
package/src/agents/adapters/imagen.adapter.ts +106 -0
package/src/agents/adapters/nano-banana.adapter.ts +129 -0
package/src/agents/adapters/openai.adapter.ts +165 -0
package/src/agents/adapters/veo.adapter.ts +130 -0
package/src/agents/agent.schema.property.test.ts +379 -0
package/src/agents/agent.schema.test.ts +148 -0
package/src/agents/agent.schema.ts +263 -0
package/src/agents/index.ts +60 -0
package/src/agents/registered-agent.schema.ts +356 -0
package/src/agents/registry.ts +97 -0
package/src/agents/tournament-configs.property.test.ts +266 -0
package/src/cli/README.md +145 -0
package/src/cli/commands/define.ts +79 -0
package/src/cli/commands/list.ts +46 -0
package/src/cli/commands/logs.ts +83 -0
package/src/cli/commands/run.ts +416 -0
package/src/cli/commands/verify.ts +110 -0
package/src/cli/index.ts +81 -0
package/src/config/README.md +128 -0
package/src/config/env.ts +262 -0
package/src/config/index.ts +19 -0
package/src/eval/README.md +318 -0
package/src/eval/ai-judge.test.ts +435 -0
package/src/eval/ai-judge.ts +368 -0
package/src/eval/code-validators.ts +414 -0
package/src/eval/evaluateOutcome.property.test.ts +1174 -0
package/src/eval/evaluateOutcome.ts +591 -0
package/src/eval/immigration-validators.ts +122 -0
package/src/eval/index.ts +90 -0
package/src/eval/judge-cache.ts +402 -0
package/src/eval/tournament-validators.property.test.ts +439 -0
package/src/eval/validators.property.test.ts +1118 -0
package/src/eval/validators.ts +1199 -0
package/src/eval/weighted-scorer.ts +285 -0
package/src/index.ts +17 -0
package/src/league/README.md +188 -0
package/src/league/health-check.ts +353 -0
package/src/league/index.ts +93 -0
package/src/league/killAgent.ts +151 -0
package/src/league/league.test.ts +1151 -0
package/src/league/runLeague.ts +843 -0
package/src/league/scoreAgent.ts +175 -0
package/src/modules/omnibridge/__tests__/.gitkeep +1 -0
package/src/modules/omnibridge/__tests__/auth-tunnel.property.test.ts +524 -0
package/src/modules/omnibridge/__tests__/deterministic-logger.property.test.ts +965 -0
package/src/modules/omnibridge/__tests__/ghost-api.property.test.ts +461 -0
package/src/modules/omnibridge/__tests__/omnibridge-integration.test.ts +542 -0
package/src/modules/omnibridge/__tests__/parallel-executor.property.test.ts +671 -0
package/src/modules/omnibridge/__tests__/semantic-normalizer.property.test.ts +521 -0
package/src/modules/omnibridge/__tests__/semantic-normalizer.test.ts +254 -0
package/src/modules/omnibridge/__tests__/session-vault.property.test.ts +367 -0
package/src/modules/omnibridge/__tests__/shadow-session.property.test.ts +523 -0
package/src/modules/omnibridge/__tests__/triangulation-engine.property.test.ts +292 -0
package/src/modules/omnibridge/__tests__/verification-engine.property.test.ts +769 -0
package/src/modules/omnibridge/api/.gitkeep +1 -0
package/src/modules/omnibridge/api/ghost-api.ts +1087 -0
package/src/modules/omnibridge/auth/.gitkeep +1 -0
package/src/modules/omnibridge/auth/auth-tunnel.ts +843 -0
package/src/modules/omnibridge/auth/session-vault.ts +577 -0
package/src/modules/omnibridge/core/.gitkeep +1 -0
package/src/modules/omnibridge/core/semantic-normalizer.ts +702 -0
package/src/modules/omnibridge/core/triangulation-engine.ts +530 -0
package/src/modules/omnibridge/core/types.ts +610 -0
package/src/modules/omnibridge/execution/.gitkeep +1 -0
package/src/modules/omnibridge/execution/deterministic-logger.ts +629 -0
package/src/modules/omnibridge/execution/parallel-executor.ts +542 -0
package/src/modules/omnibridge/execution/shadow-session.ts +794 -0
package/src/modules/omnibridge/index.ts +212 -0
package/src/modules/omnibridge/omnibridge.ts +510 -0
package/src/modules/omnibridge/verification/.gitkeep +1 -0
package/src/modules/omnibridge/verification/verification-engine.ts +783 -0
package/src/outcomes/README.md +75 -0
package/src/outcomes/acquire-pilot-customer.ts +297 -0
package/src/outcomes/code-delivery-outcomes.ts +89 -0
package/src/outcomes/code-outcomes.ts +256 -0
package/src/outcomes/code_review_battle.test.ts +135 -0
package/src/outcomes/code_review_battle.ts +135 -0
package/src/outcomes/cold_email_battle.ts +97 -0
package/src/outcomes/content_creation_battle.ts +160 -0
package/src/outcomes/f1_stem_opt_compliance.ts +61 -0
package/src/outcomes/index.ts +107 -0
package/src/outcomes/lead_gen_battle.test.ts +113 -0
package/src/outcomes/lead_gen_battle.ts +99 -0
package/src/outcomes/outcome.schema.property.test.ts +229 -0
package/src/outcomes/outcome.schema.ts +187 -0
package/src/outcomes/qualified_sales_interest.ts +118 -0
package/src/outcomes/swarm_planner.property.test.ts +370 -0
package/src/outcomes/swarm_planner.ts +96 -0
package/src/outcomes/web_extraction.ts +234 -0
package/src/runtime/README.md +220 -0
package/src/runtime/agentRunner.test.ts +341 -0
package/src/runtime/agentRunner.ts +746 -0
package/src/runtime/claudeAdapter.ts +232 -0
package/src/runtime/costTracker.ts +123 -0
package/src/runtime/index.ts +34 -0
package/src/runtime/modelAdapter.property.test.ts +305 -0
package/src/runtime/modelAdapter.ts +144 -0
package/src/runtime/openaiAdapter.ts +235 -0
package/src/utils/README.md +122 -0
package/src/utils/command-runner.ts +134 -0
package/src/utils/cost-guard.ts +379 -0
package/src/utils/errors.test.ts +290 -0
package/src/utils/errors.ts +442 -0
package/src/utils/index.ts +37 -0
package/src/utils/logger.test.ts +361 -0
package/src/utils/logger.ts +419 -0
package/src/utils/output-parsers.ts +216 -0

package/src/eval/code-validators.ts ADDED Viewed

@@ -0,0 +1,414 @@
+/**
+ * Code Validators - SWE-bench style validation for code generation tasks
+ *
+ * Integrates software engineering evaluation into our existing validator system.
+ * These validators enable battles between code-focused agents (like Codeon).
+ *
+ * @module eval/code-validators
+ */
+import type { ValidationResult } from './validators.js';
+/**
+ * Code artifact content for SWE-bench style evaluation.
+ * Extends base artifact pattern to support code generation tasks.
+ *
+ * This is the input shape consumed by `evaluateSWEBenchStyle`, which reads
+ * `code` and `language`. The remaining fields are not read by the validators
+ * visible in this module (presumably consumed elsewhere - verify against callers).
+ */
+export interface CodeArtifactContent {
+  /**
+   * The generated code/patch. Passed as-is to the syntax, structure,
+   * test-case, relevance and quality validators.
+   */
+  code: string;
+  /**
+   * Language of the code (python, typescript, etc.). Used by
+   * `validateCodeStructure` to pick its language-specific patterns.
+   */
+  language: string;
+  /** The issue/problem description that was given */
+  issueDescription: string;
+  /** Optional: File paths that were modified */
+  modifiedFiles?: string[];
+  /** Optional: Explanation of the fix */
+  explanation?: string;
+}
+/**
+ * Test case for code evaluation.
+ *
+ * NOTE: `validateTestCases` in this module is a static heuristic; it only
+ * reads `name` (for its error messages) and never executes the code against
+ * `input` / `expectedOutput`.
+ */
+export interface TestCase {
+  /** Test name/description; quoted in validation error messages */
+  name: string;
+  /** Input to the function/code */
+  input: unknown;
+  /** Expected output */
+  expectedOutput: unknown;
+  /** Optional: Test type (unit, integration, edge-case) */
+  type?: 'unit' | 'integration' | 'edge-case';
+}
+/**
+ * Checks that every required syntax element occurs somewhere in the code.
+ *
+ * The check is a plain, case-sensitive substring search: an element counts as
+ * present when `code.includes(element)` is true.
+ *
+ * @param code - The generated code to inspect
+ * @param requiredElements - Substrings that must all appear in `code`
+ * @returns `{ valid: true, errors: [] }` when nothing is missing; otherwise
+ *   `valid: false` with a single error listing the missing elements
+ *
+ * @example
+ * validateCodeSyntax("def foo(): return 42", ["def", "return"])
+ * // { valid: true, errors: [] }
+ */
+export function validateCodeSyntax(
+  code: string,
+  requiredElements: string[]
+): ValidationResult {
+  // Collect, in input order, every element the code does not contain.
+  const absent = requiredElements.reduce<string[]>((acc, token) => {
+    if (!code.includes(token)) {
+      acc.push(token);
+    }
+    return acc;
+  }, []);
+  return absent.length > 0
+    ? {
+        valid: false,
+        errors: [`Missing required syntax elements: ${absent.join(', ')}`],
+      }
+    : { valid: true, errors: [] };
+}
+/**
+ * Validates that code follows basic structure requirements.
+ * Checks for function/class definitions, imports and non-blank line counts
+ * using regular-expression heuristics (the code is never parsed).
+ *
+ * @param code - The generated code
+ * @param language - Programming language; matched case-insensitively against
+ *   `python`, `typescript` and `javascript`. Any other value falls back to
+ *   the JavaScript patterns.
+ * @param requirements - Structure requirements to check. Omitted fields are
+ *   not checked; `minLines` / `maxLines` count non-blank lines only.
+ * @returns ValidationResult with one error per violated requirement
+ */
+export function validateCodeStructure(
+  code: string,
+  language: string,
+  requirements: {
+    mustHaveFunction?: boolean;
+    mustHaveClass?: boolean;
+    mustHaveImports?: boolean;
+    minLines?: number;
+    maxLines?: number;
+  }
+): ValidationResult {
+  const errors: string[] = [];
+  // Blank / whitespace-only lines do not count towards the line limits.
+  const lines = code.split('\n').filter((line) => line.trim().length > 0);
+  // Language-specific patterns
+  const patterns: Record<string, { function: RegExp; class: RegExp; import: RegExp }> = {
+    python: {
+      function: /def\s+\w+\s*\(/,
+      class: /class\s+\w+/,
+      import: /^(import|from)\s+/m,
+    },
+    typescript: {
+      function: /(function\s+\w+|const\s+\w+\s*=\s*(async\s*)?\(|=>\s*{)/,
+      class: /class\s+\w+/,
+      import: /^import\s+/m,
+    },
+    javascript: {
+      function: /(function\s+\w+|const\s+\w+\s*=\s*(async\s*)?\(|=>\s*{)/,
+      class: /class\s+\w+/,
+      // An ES `import` keyword opening a line, or a CommonJS `require(...)`
+      // call anywhere. Word boundaries keep identifiers such as
+      // `requirements` or `important` from being mistaken for imports.
+      import: /^\s*import\b|\brequire\s*\(/m,
+    },
+  };
+  // Normalise the key and only accept own properties, so inputs like
+  // "Python" resolve correctly and "constructor" cannot pick up a member
+  // inherited from Object.prototype.
+  const languageKey = (language || '').trim().toLowerCase();
+  const langPatterns = Object.prototype.hasOwnProperty.call(patterns, languageKey)
+    ? patterns[languageKey]
+    : patterns.javascript;
+  if (requirements.mustHaveFunction && !langPatterns.function.test(code)) {
+    errors.push(`Code must contain a function definition`);
+  }
+  if (requirements.mustHaveClass && !langPatterns.class.test(code)) {
+    errors.push(`Code must contain a class definition`);
+  }
+  if (requirements.mustHaveImports && !langPatterns.import.test(code)) {
+    errors.push(`Code must contain import statements`);
+  }
+  // Compare against the type rather than truthiness so a limit of 0 is honoured.
+  if (typeof requirements.minLines === 'number' && lines.length < requirements.minLines) {
+    errors.push(`Code too short: ${lines.length} lines, minimum ${requirements.minLines}`);
+  }
+  if (typeof requirements.maxLines === 'number' && lines.length > requirements.maxLines) {
+    errors.push(`Code too long: ${lines.length} lines, maximum ${requirements.maxLines}`);
+  }
+  return {
+    valid: errors.length === 0,
+    errors,
+  };
+}
+/**
+ * Estimates whether code would pass the given test cases.
+ *
+ * NOTE: This is a static heuristic, not an execution. The code is never run
+ * and the `input` / `expectedOutput` of the test cases are not consulted: a
+ * test counts as "passed" when the code contains any of the substrings
+ * `return`, `yield`, `console.log` or `print`. A real SWE-bench style
+ * evaluation would execute the code in a sandbox instead.
+ *
+ * @param code - The generated code
+ * @param testCases - Test cases to validate against. An empty list is
+ *   treated as vacuously passing (pass rate 100%).
+ * @param options - Evaluation options
+ * @param options.requireAllPass - Fail unless every test passes (default true)
+ * @param options.minPassRate - Minimum fraction of passing tests, 0..1
+ *   (default 1.0); only decisive when `requireAllPass` is false
+ * @returns ValidationResult with test pass/fail details
+ */
+export function validateTestCases(
+  code: string,
+  testCases: TestCase[],
+  options: {
+    requireAllPass?: boolean;
+    minPassRate?: number;
+  } = {}
+): ValidationResult {
+  const { requireAllPass = true, minPassRate = 1.0 } = options;
+  // The heuristic depends only on the code, not on the individual test case,
+  // so it is evaluated once rather than once per test.
+  const hasRelevantLogic = ['return', 'yield', 'console.log', 'print'].some(
+    (marker) => code.includes(marker)
+  );
+  const errors: string[] = hasRelevantLogic
+    ? []
+    : testCases.map(
+        (testCase) => `Test "${testCase.name}" may fail - no return/output statement found`
+      );
+  const passedCount = hasRelevantLogic ? testCases.length : 0;
+  // With no test cases nothing can fail, so the pass rate is 100% rather
+  // than 0% (which would reject every call that supplies an empty list).
+  const passRate = testCases.length > 0 ? passedCount / testCases.length : 1;
+  if (requireAllPass && passedCount < testCases.length) {
+    return {
+      valid: false,
+      errors: [`Only ${passedCount}/${testCases.length} tests passed. ${errors.join('; ')}`],
+    };
+  }
+  if (passRate < minPassRate) {
+    return {
+      valid: false,
+      errors: [`Pass rate ${(passRate * 100).toFixed(1)}% below minimum ${(minPassRate * 100).toFixed(1)}%`],
+    };
+  }
+  return { valid: true, errors: [] };
+}
+/**
+ * Heuristically validates that a patch/diff addresses an issue.
+ *
+ * Three independent checks are applied:
+ *  - keyword coverage: how many `issueKeywords` occur (case-insensitively,
+ *    as substrings) anywhere in the patch text;
+ *  - additions: at least one line starting with `+` that is not a `+++ `
+ *    file header;
+ *  - deletions: at least one line starting with `-` that is not a `--- `
+ *    file header.
+ *
+ * @param patch - The generated patch/diff
+ * @param issueKeywords - Keywords from the issue that should be addressed
+ * @param options - Validation options
+ * @param options.minKeywordsCovered - Minimum number of keywords that must
+ *   appear in the patch (default 1)
+ * @param options.mustHaveAdditions - Require at least one added line (default true)
+ * @param options.mustHaveDeletions - Require at least one deleted line (default false)
+ * @returns ValidationResult with one error per failed check
+ */
+export function validatePatchRelevance(
+  patch: string,
+  issueKeywords: string[],
+  options: {
+    minKeywordsCovered?: number;
+    mustHaveAdditions?: boolean;
+    mustHaveDeletions?: boolean;
+  } = {}
+): ValidationResult {
+  const {
+    minKeywordsCovered = 1,
+    mustHaveAdditions = true,
+    mustHaveDeletions = false
+  } = options;
+  const errors: string[] = [];
+  const patchLower = patch.toLowerCase();
+  // Check keyword coverage
+  const coveredKeywords = issueKeywords.filter((kw) =>
+    patchLower.includes(kw.toLowerCase())
+  );
+  if (coveredKeywords.length < minKeywordsCovered) {
+    errors.push(
+      `Patch only addresses ${coveredKeywords.length}/${minKeywordsCovered} required keywords`
+    );
+  }
+  // Check for additions: a line starting with "+" that is not the "+++ <path>"
+  // file header. Excluding only the real header (rather than any second "+")
+  // keeps added lines such as "++i;" and empty added lines at end of input.
+  const hasAdditions = /^\+(?!\+\+[ \t])/m.test(patch);
+  if (mustHaveAdditions && !hasAdditions) {
+    errors.push('Patch must contain code additions');
+  }
+  // Check for deletions: same rule for "-" lines versus the "--- <path>" header.
+  const hasDeletions = /^-(?!--[ \t])/m.test(patch);
+  if (mustHaveDeletions && !hasDeletions) {
+    errors.push('Patch must contain code deletions');
+  }
+  return {
+    valid: errors.length === 0,
+    errors,
+  };
+}
+/**
+ * Validates code quality metrics using text heuristics (the code is not parsed).
+ *
+ * Checks, each enabled by its requirement flag:
+ *  - error handling: the code contains `try`, `catch`, `except`, `throw` or
+ *    `raise` as a whole word;
+ *  - comments: the code contains any of `//`, a block-comment opener, `#`,
+ *    or a Python triple quote (substring match, so e.g. a `#` inside a
+ *    string literal also counts);
+ *  - debug output: the code contains `console.log` or `print(`;
+ *  - complexity: the number of whole-word control-flow keywords
+ *    (`if|else|for|while|switch|case|try|catch`) exceeds `maxComplexity`.
+ *
+ * @param code - The generated code
+ * @param requirements - Quality requirements; omitted fields are not checked.
+ *   `maxComplexity: 0` is a valid limit meaning "no control flow allowed".
+ * @returns ValidationResult with one error per failed check
+ */
+export function validateCodeQuality(
+  code: string,
+  requirements: {
+    mustHaveErrorHandling?: boolean;
+    mustHaveComments?: boolean;
+    maxComplexity?: number;
+    noConsoleLog?: boolean;
+  } = {}
+): ValidationResult {
+  const errors: string[] = [];
+  // Error handling check. Whole-word matching (consistent with the
+  // complexity check below) so identifiers such as "entry", "country" or
+  // "praise" do not masquerade as try/raise.
+  if (requirements.mustHaveErrorHandling) {
+    const hasErrorHandling = /\b(try|catch|except|throw|raise)\b/.test(code);
+    if (!hasErrorHandling) {
+      errors.push('Code should include error handling (try/catch/except)');
+    }
+  }
+  // Comments check
+  if (requirements.mustHaveComments) {
+    const hasComments =
+      code.includes('//') ||
+      code.includes('/*') ||
+      code.includes('#') ||
+      code.includes('"""') ||
+      code.includes("'''");
+    if (!hasComments) {
+      errors.push('Code should include comments or documentation');
+    }
+  }
+  // Console.log check (often unwanted in production code)
+  if (requirements.noConsoleLog) {
+    if (code.includes('console.log') || code.includes('print(')) {
+      errors.push('Code should not contain debug print statements');
+    }
+  }
+  // Complexity check (simplified: count control flow statements).
+  // Tested by type rather than truthiness so that a limit of 0 is enforced.
+  if (typeof requirements.maxComplexity === 'number') {
+    const controlFlowPatterns = /\b(if|else|for|while|switch|case|try|catch)\b/g;
+    const matches = code.match(controlFlowPatterns) || [];
+    if (matches.length > requirements.maxComplexity) {
+      errors.push(
+        `Code complexity ${matches.length} exceeds maximum ${requirements.maxComplexity}`
+      );
+    }
+  }
+  return {
+    valid: errors.length === 0,
+    errors,
+  };
+}
+/**
+ * Combined SWE-bench style evaluation.
+ * Runs the syntax, structure, test-case, relevance and quality validators on
+ * one artifact and folds their pass/fail results into a weighted score.
+ *
+ * A dimension whose configuration is omitted (or, for tests, empty) counts
+ * as passed. Relevance is always evaluated and requires
+ * `min(2, issueKeywords.length)` keywords to appear in the code. Added diff
+ * lines are only demanded when the artifact actually looks like a unified
+ * diff, so plain source code is not failed for lacking `+` lines.
+ *
+ * Weights: syntax 15%, structure 15%, tests 35%, relevance 25%, quality 10%.
+ *
+ * @param artifact - The code artifact to evaluate (`code` and `language` are read)
+ * @param config - Evaluation configuration
+ * @returns ValidationResult plus `score` (0..1, exactly 1 when every
+ *   dimension passes) and a per-dimension `breakdown`; `valid` is true only
+ *   when no validator reported an error
+ */
+export function evaluateSWEBenchStyle(
+  artifact: CodeArtifactContent,
+  config: {
+    issueKeywords: string[];
+    requiredSyntax?: string[];
+    testCases?: TestCase[];
+    structureRequirements?: Parameters<typeof validateCodeStructure>[2];
+    qualityRequirements?: Parameters<typeof validateCodeQuality>[1];
+  }
+): ValidationResult & { score: number; breakdown: Record<string, boolean> } {
+  const breakdown: Record<string, boolean> = {};
+  const allErrors: string[] = [];
+  // Records one dimension's outcome; `null` means "not configured" and
+  // counts as a pass.
+  const record = (dimension: string, result: ValidationResult | null): void => {
+    breakdown[dimension] = result === null ? true : result.valid;
+    if (result !== null && !result.valid) allErrors.push(...result.errors);
+  };
+  // 1. Syntax validation
+  record(
+    'syntax',
+    config.requiredSyntax
+      ? validateCodeSyntax(artifact.code, config.requiredSyntax)
+      : null
+  );
+  // 2. Structure validation
+  record(
+    'structure',
+    config.structureRequirements
+      ? validateCodeStructure(artifact.code, artifact.language, config.structureRequirements)
+      : null
+  );
+  // 3. Test case validation
+  record(
+    'tests',
+    config.testCases && config.testCases.length > 0
+      ? validateTestCases(artifact.code, config.testCases, {
+          requireAllPass: false,
+          minPassRate: 0.7,
+        })
+      : null
+  );
+  // 4. Relevance validation (does it address the issue?)
+  // Only insist on "+" lines when the artifact is a unified diff (it has a
+  // `diff --git` header or an `@@ -N` hunk header); plain source code has no
+  // such lines and would otherwise always fail this dimension.
+  const looksLikeDiff = /^(diff --git |@@ -\d)/m.test(artifact.code);
+  record(
+    'relevance',
+    validatePatchRelevance(artifact.code, config.issueKeywords, {
+      minKeywordsCovered: Math.min(2, config.issueKeywords.length),
+      mustHaveAdditions: looksLikeDiff,
+    })
+  );
+  // 5. Quality validation
+  record(
+    'quality',
+    config.qualityRequirements
+      ? validateCodeQuality(artifact.code, config.qualityRequirements)
+      : null
+  );
+  // Calculate score (weighted average). Weights are whole percentages summed
+  // as integers and divided once, because adding 0.15 + 0.15 + 0.35 + 0.25 +
+  // 0.10 in floating point yields 0.9999999999999999 instead of 1.
+  const weightPercent = { syntax: 15, structure: 15, tests: 35, relevance: 25, quality: 10 };
+  const earnedPercent = Object.entries(breakdown).reduce((acc, [key, passed]) => {
+    return acc + (passed ? weightPercent[key as keyof typeof weightPercent] || 0 : 0);
+  }, 0);
+  const score = earnedPercent / 100;
+  return {
+    valid: allErrors.length === 0,
+    errors: allErrors,
+    score,
+    breakdown,
+  };
+}