npm - @townco/debugger - Versions diffs - 0.1.32 → 0.1.33 - Mend

@townco/debugger 0.1.32 → 0.1.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

package/package.json +14 -14
package/src/App.tsx +1 -0
package/src/analysis/analyzer.ts +1 -2
package/src/analysis/comparison-analyzer.ts +528 -0
package/src/analysis/comparison-schema.ts +151 -0
package/src/analysis/comparison-types.ts +194 -0
package/src/analysis-db.ts +13 -6
package/src/comparison-db.ts +75 -3
package/src/components/AnalyzeAllButton.tsx +6 -2
package/src/components/ComparisonAnalysisDialog.tsx +591 -0
package/src/components/DebuggerHeader.tsx +0 -1
package/src/components/LogList.tsx +9 -0
package/src/components/SessionTraceList.tsx +9 -0
package/src/components/SpanDetailsPanel.tsx +20 -1
package/src/components/SpanTimeline.tsx +31 -4
package/src/components/SpanTree.tsx +10 -1
package/src/components/TurnMetadataPanel.tsx +0 -1
package/src/components/UnifiedTimeline.tsx +25 -35
package/src/components/ui/button.tsx +1 -1
package/src/components/ui/card.tsx +1 -1
package/src/components/ui/checkbox.tsx +1 -0
package/src/components/ui/input.tsx +1 -1
package/src/components/ui/label.tsx +1 -1
package/src/components/ui/select.tsx +1 -1
package/src/components/ui/textarea.tsx +1 -1
package/src/frontend.tsx +2 -0
package/src/lib/metrics.test.ts +2 -0
package/src/lib/turnExtractor.ts +28 -0
package/src/pages/ComparisonView.tsx +586 -92
package/src/pages/FindSessions.tsx +3 -1
package/src/pages/TownHall.tsx +30 -14
package/src/server.ts +177 -7
package/src/types.ts +4 -0
package/styles/globals.css +120 -0
package/tsconfig.json +2 -2

package/src/analysis/comparison-schema.ts ADDED Viewed

@@ -0,0 +1,151 @@
+/**
+ * Zod schemas for session comparison analysis validation
+ */
+import { z } from "zod";
+// Enum schemas
+/** HIGH/MEDIUM/LOW scale shared by the consistency, confidence and priority enums. */
+const THREE_STEP_SCALE = ["HIGH", "MEDIUM", "LOW"] as const;
+/** Three-step consistency scale (not referenced by the other schemas in this file). */
+export const ConsistencyLevelSchema = z.enum(THREE_STEP_SCALE);
+/** Verdict values of the change impact report. */
+export const EffectivenessRatingSchema = z.enum([
+  "IMPROVED",
+  "DEGRADED",
+  "NEUTRAL",
+  "MIXED",
+]);
+/** `confidence` field of both report schemas. */
+export const ConfidenceLevelSchema = z.enum(THREE_STEP_SCALE);
+/** Used for `significance` on behavioral differences and `severity` on unintended effects. */
+export const SignificanceSchema = z.enum(["CRITICAL", "NOTABLE", "MINOR"]);
+/** Verdict values of the reproducibility report. */
+export const ReproducibilityVerdictSchema = z.enum([
+  "STABLE",
+  "UNSTABLE",
+  "PARTIALLY_STABLE",
+]);
+/** `assessment` field of an intended effect. */
+export const EffectAssessmentSchema = z.enum([
+  "ACHIEVED",
+  "PARTIALLY_ACHIEVED",
+  "NOT_ACHIEVED",
+  "OPPOSITE_EFFECT",
+]);
+/** `impact` field of an unintended effect. */
+export const ImpactTypeSchema = z.enum(["POSITIVE", "NEGATIVE", "NEUTRAL"]);
+/** `category` field of a behavioral difference. */
+export const BehavioralCategorySchema = z.enum([
+  "TOOL_USAGE",
+  "RESPONSE_CONTENT",
+  "REASONING_PATH",
+  "ERROR_HANDLING",
+  "PERFORMANCE",
+]);
+/** `priority` field of a recommendation. */
+export const PrioritySchema = z.enum(THREE_STEP_SCALE);
+/** Config dimensions an experiment may vary. */
+export const ComparisonDimensionSchema = z.enum([
+  "model",
+  "system_prompt",
+  "tools",
+]);
+// Component schemas
+/** String schema that requires at least `minLength` characters. */
+const textOfAtLeast = (minLength: number) => z.string().min(minLength);
+/** One behavioral difference: category, observation, supporting evidence, significance. */
+export const BehavioralDifferenceSchema = z.object({
+  category: BehavioralCategorySchema,
+  observation: textOfAtLeast(10),
+  evidence: textOfAtLeast(10),
+  significance: SignificanceSchema,
+});
+/** One intended effect: expected change, observed outcome, evidence, assessment. */
+export const IntendedEffectSchema = z.object({
+  expected_change: textOfAtLeast(5),
+  observed_outcome: textOfAtLeast(10),
+  evidence: textOfAtLeast(10),
+  assessment: EffectAssessmentSchema,
+});
+/** One unintended effect: observation, evidence, impact direction, severity. */
+export const UnintendedEffectSchema = z.object({
+  observation: textOfAtLeast(10),
+  evidence: textOfAtLeast(10),
+  impact: ImpactTypeSchema,
+  severity: SignificanceSchema,
+});
+/** One recommendation; `expected_impact` may be omitted. */
+export const RecommendationSchema = z.object({
+  priority: PrioritySchema,
+  action: textOfAtLeast(10),
+  rationale: textOfAtLeast(10),
+  expected_impact: z.string().optional(),
+});
+/** Numeric deltas plus a textual interpretation of at least 10 characters. */
+export const MetricComparisonSchema = z.object({
+  duration_delta_pct: z.number(),
+  token_delta_pct: z.number(),
+  cost_delta_pct: z.number(),
+  tool_call_delta: z.number(),
+  interpretation: textOfAtLeast(10),
+});
+/** Per-tool call counts for control and variant, with a description of the change. */
+export const ToolUsageChangeSchema = z.object({
+  tool_name: z.string(),
+  control_calls: z.number(),
+  variant_calls: z.number(),
+  pattern_change: z.string(),
+});
+/** Nested `suggested_change` object of a next experiment; `example` may be omitted. */
+const suggestedChange = z.object({
+  dimension: ComparisonDimensionSchema,
+  description: textOfAtLeast(10),
+  example: z.string().optional(),
+});
+/** A proposed follow-up experiment: hypothesis, suggested change, expected outcome. */
+export const NextExperimentSchema = z.object({
+  hypothesis: textOfAtLeast(10),
+  suggested_change: suggestedChange,
+  expected_outcome: textOfAtLeast(10),
+});
+/** Optional from/to pair describing a model switch. */
+const modelChange = z.object({ from: z.string(), to: z.string() }).optional();
+/** Summary of config differences: model switch, prompt flag, tools added/removed. */
+export const ConfigSummarySchema = z.object({
+  model_change: modelChange,
+  system_prompt_changed: z.boolean(),
+  tools_added: z.array(z.string()),
+  tools_removed: z.array(z.string()),
+});
+// Report schemas
+/**
+ * Fields declared identically by both reports. They are spread directly after
+ * `verdict` so each report keeps its original key order.
+ */
+const sharedReportFields = {
+  confidence: ConfidenceLevelSchema,
+  summary: z.string().min(20),
+};
+/** Reproducibility report: verdict, shared fields, differences, metrics, recommendations. */
+export const ReproducibilityReportSchema = z.object({
+  verdict: ReproducibilityVerdictSchema,
+  ...sharedReportFields,
+  behavioral_differences: z.array(BehavioralDifferenceSchema),
+  metric_comparison: MetricComparisonSchema,
+  recommendations: z.array(RecommendationSchema),
+});
+/**
+ * Change impact report: verdict, shared fields, hypothesis assessment,
+ * intended/unintended effects, metrics, tool usage changes, recommendations.
+ */
+export const ChangeImpactReportSchema = z.object({
+  verdict: EffectivenessRatingSchema,
+  ...sharedReportFields,
+  hypothesis_assessment: z.string().min(10),
+  intended_effects: z.array(IntendedEffectSchema),
+  unintended_effects: z.array(UnintendedEffectSchema),
+  metric_comparison: MetricComparisonSchema,
+  tool_usage_changes: z.array(ToolUsageChangeSchema),
+  recommendations: z.array(RecommendationSchema),
+});
+// LLM output schema - what we expect the LLM to generate
+/** Largest number of follow-up experiments accepted in one LLM response. */
+const MAX_NEXT_EXPERIMENTS = 5;
+/**
+ * Validates the LLM-generated part of an analysis: both reports plus a list of
+ * next experiments capped at `MAX_NEXT_EXPERIMENTS` entries.
+ */
+export const LLMComparisonOutputSchema = z.object({
+  reproducibility: ReproducibilityReportSchema,
+  change_impact: ChangeImpactReportSchema,
+  next_experiments: z.array(NextExperimentSchema).max(MAX_NEXT_EXPERIMENTS),
+});
+// Full analysis schema
+/**
+ * Validates a complete analysis: run and session identifiers, the hypothesis,
+ * the compared dimensions and config summary, followed by every field of
+ * `LLMComparisonOutputSchema`.
+ *
+ * The LLM-generated fields (`reproducibility`, `change_impact`,
+ * `next_experiments` with its 5-entry cap) are spread from
+ * `LLMComparisonOutputSchema.shape` rather than declared a second time, so the
+ * two schemas cannot drift apart. The spread comes last to keep the original
+ * key order.
+ */
+export const SessionComparisonAnalysisSchema = z.object({
+  comparison_run_id: z.string(),
+  created_at: z.string(),
+  original_session_id: z.string(),
+  control_session_id: z.string(),
+  variant_session_id: z.string(),
+  hypothesis: z.string(),
+  dimensions_compared: z.array(ComparisonDimensionSchema),
+  config_summary: ConfigSummarySchema,
+  ...LLMComparisonOutputSchema.shape,
+});

package/src/analysis/comparison-types.ts ADDED Viewed

@@ -0,0 +1,194 @@
+/**
+ * Session Comparison Analysis Types
+ *
+ * Types for analyzing comparisons between original, control, and variant sessions
+ * to produce Reproducibility Reports and Change Impact Reports.
+ */
+import type { ComparisonDimension } from "../types";
+// Enum types
+export type ConsistencyLevel = "HIGH" | "MEDIUM" | "LOW"; // not referenced by the interfaces declared below
+export type EffectivenessRating = "IMPROVED" | "DEGRADED" | "NEUTRAL" | "MIXED"; // ChangeImpactReport.verdict
+export type ConfidenceLevel = "HIGH" | "MEDIUM" | "LOW"; // `confidence` on both report interfaces
+export type Significance = "CRITICAL" | "NOTABLE" | "MINOR"; // BehavioralDifference.significance and UnintendedEffect.severity
+export type ReproducibilityVerdict = "STABLE" | "UNSTABLE" | "PARTIALLY_STABLE"; // ReproducibilityReport.verdict
+export type EffectAssessment =
+  | "ACHIEVED"
+  | "PARTIALLY_ACHIEVED"
+  | "NOT_ACHIEVED"
+  | "OPPOSITE_EFFECT"; // IntendedEffect.assessment
+export type ImpactType = "POSITIVE" | "NEGATIVE" | "NEUTRAL"; // UnintendedEffect.impact
+export type BehavioralCategory =
+  | "TOOL_USAGE"
+  | "RESPONSE_CONTENT"
+  | "REASONING_PATH"
+  | "ERROR_HANDLING"
+  | "PERFORMANCE"; // BehavioralDifference.category
+export type Priority = "HIGH" | "MEDIUM" | "LOW"; // Recommendation.priority
+/**
+ * A behavioral difference observed between sessions; collected in ReproducibilityReport.behavioral_differences
+ */
+export interface BehavioralDifference {
+  category: BehavioralCategory;
+  observation: string; // BehavioralDifferenceSchema (comparison-schema.ts) requires at least 10 characters
+  evidence: string; // Specific quotes/citations using [TOOL:name], [MSG:"quote..."] format
+  significance: Significance;
+}
+/**
+ * An intended effect from the config changes; collected in ChangeImpactReport.intended_effects
+ */
+export interface IntendedEffect {
+  expected_change: string; // IntendedEffectSchema (comparison-schema.ts) requires at least 5 characters
+  observed_outcome: string; // IntendedEffectSchema requires at least 10 characters
+  evidence: string; // IntendedEffectSchema requires at least 10 characters
+  assessment: EffectAssessment; // ACHIEVED | PARTIALLY_ACHIEVED | NOT_ACHIEVED | OPPOSITE_EFFECT
+}
+/**
+ * An unintended side effect from the config changes; collected in ChangeImpactReport.unintended_effects
+ */
+export interface UnintendedEffect {
+  observation: string; // UnintendedEffectSchema (comparison-schema.ts) requires at least 10 characters
+  evidence: string; // UnintendedEffectSchema requires at least 10 characters
+  impact: ImpactType; // POSITIVE | NEGATIVE | NEUTRAL
+  severity: Significance; // reuses the Significance scale: CRITICAL | NOTABLE | MINOR
+}
+/**
+ * An actionable recommendation for improving the agent; listed in the `recommendations` of both report interfaces
+ */
+export interface Recommendation {
+  priority: Priority;
+  action: string; // Specific, actionable - e.g., "Add to system prompt: '...'"
+  rationale: string; // RecommendationSchema (comparison-schema.ts) requires at least 10 characters
+  expected_impact?: string | undefined; // optional; matches z.string().optional() in RecommendationSchema
+}
+/**
+ * Metric comparison between sessions; embedded as `metric_comparison` in both report interfaces.
+ * NOTE(review): which session is the baseline (and so the sign of each delta) is not defined here - confirm against the analyzer.
+ */
+export interface MetricComparison {
+  duration_delta_pct: number; // Percentage change
+  token_delta_pct: number; // presumably a percentage change like duration_delta_pct - confirm
+  cost_delta_pct: number; // presumably a percentage change like duration_delta_pct - confirm
+  tool_call_delta: number; // Absolute difference in tool call count
+  interpretation: string; // Human-readable interpretation
+}
+/**
+ * Tool usage change between control and variant; collected in ChangeImpactReport.tool_usage_changes
+ */
+export interface ToolUsageChange {
+  tool_name: string;
+  control_calls: number; // presumably the call count for tool_name in the control session - confirm
+  variant_calls: number; // presumably the call count for tool_name in the variant session - confirm
+  pattern_change: string; // Description of how usage changed
+}
+/**
+ * Suggested next experiment to run; collected in `next_experiments` of the full analysis and of the LLM output
+ */
+export interface NextExperiment {
+  hypothesis: string; // NextExperimentSchema (comparison-schema.ts) requires at least 10 characters
+  suggested_change: {
+    dimension: ComparisonDimension; // declared in ../types; ComparisonDimensionSchema accepts "model" | "system_prompt" | "tools"
+    description: string;
+    example?: string | undefined; // Concrete example for prompt changes
+  };
+  expected_outcome: string; // NextExperimentSchema requires at least 10 characters
+}
+/**
+ * Config summary showing what changed between control and variant; stored as SessionComparisonAnalysis.config_summary
+ */
+export interface ConfigSummary {
+  model_change?: { from: string; to: string } | undefined; // optional; presumably omitted when the model did not change - confirm
+  system_prompt_changed: boolean;
+  tools_added: string[];
+  tools_removed: string[];
+}
+/**
+ * Reproducibility Report - Original vs Control analysis; runtime counterpart is ReproducibilityReportSchema (comparison-schema.ts)
+ */
+export interface ReproducibilityReport {
+  verdict: ReproducibilityVerdict; // STABLE | UNSTABLE | PARTIALLY_STABLE
+  confidence: ConfidenceLevel;
+  summary: string; // ReproducibilityReportSchema requires at least 20 characters
+  behavioral_differences: BehavioralDifference[];
+  metric_comparison: MetricComparison;
+  recommendations: Recommendation[];
+}
+/**
+ * Change Impact Report - Control vs Variant analysis; runtime counterpart is ChangeImpactReportSchema (comparison-schema.ts)
+ */
+export interface ChangeImpactReport {
+  verdict: EffectivenessRating; // IMPROVED | DEGRADED | NEUTRAL | MIXED
+  confidence: ConfidenceLevel;
+  summary: string; // ChangeImpactReportSchema requires at least 20 characters
+  hypothesis_assessment: string; // Did changes achieve the user's hypothesis?
+  intended_effects: IntendedEffect[];
+  unintended_effects: UnintendedEffect[];
+  metric_comparison: MetricComparison;
+  tool_usage_changes: ToolUsageChange[];
+  recommendations: Recommendation[];
+}
+/**
+ * Complete Session Comparison Analysis; runtime counterpart is SessionComparisonAnalysisSchema (comparison-schema.ts)
+ */
+export interface SessionComparisonAnalysis {
+  comparison_run_id: string;
+  created_at: string; // NOTE(review): presumably an ISO-8601 timestamp - the schema only checks that it is a string
+  // Session IDs
+  original_session_id: string;
+  control_session_id: string;
+  variant_session_id: string;
+  // User's hypothesis
+  hypothesis: string;
+  // Config context
+  dimensions_compared: ComparisonDimension[];
+  config_summary: ConfigSummary;
+  // Section 1: Reproducibility Report (Original vs Control)
+  reproducibility: ReproducibilityReport;
+  // Section 2: Change Impact Report (Control vs Variant)
+  change_impact: ChangeImpactReport;
+  // Suggested next experiments (SessionComparisonAnalysisSchema caps the list at 5 entries)
+  next_experiments: NextExperiment[];
+}
+/**
+ * LLM output schema - what the LLM generates (subset of full analysis)
+ *
+ * Holds exactly the three LLM-produced members of SessionComparisonAnalysis.
+ * The report members reference the named report interfaces rather than
+ * repeating their fields inline; the resulting type is structurally identical,
+ * but a field added to a report can no longer be missed here.
+ */
+export interface LLMComparisonOutput {
+  // Section 1: Reproducibility Report (Original vs Control)
+  reproducibility: ReproducibilityReport;
+  // Section 2: Change Impact Report (Control vs Variant)
+  change_impact: ChangeImpactReport;
+  // Suggested next experiments
+  next_experiments: NextExperiment[];
+}

package/src/analysis-db.ts CHANGED Viewed

@@ -163,7 +163,10 @@ export class AnalysisDb {
         .limit(1000) // Should be enough for most use cases
         .toArray();
-      const record = allRecords.find((r: any) => r.session_id === sessionId);
+      const record = allRecords.find(
+        (r: { session_id: string; embedding: number[] }) =>
+          r.session_id === sessionId,
+      );
       if (!record) {
         return null;
@@ -200,10 +203,12 @@ export class AnalysisDb {
         .limit(limit)
         .toArray();
-      return results.map((result: any) => ({
-        session_id: result.session_id,
-        distance: result._distance,
-      }));
+      return results.map(
+        (result: { session_id: string; _distance: number }) => ({
+          session_id: result.session_id,
+          distance: result._distance,
+        }),
+      );
     } catch (error) {
       console.error("Error searching similar sessions:", error);
       return [];
@@ -229,7 +234,9 @@ export class AnalysisDb {
         .limit(1000)
         .toArray();
-      return allRecords.some((r: any) => r.session_id === sessionId);
+      return allRecords.some(
+        (r: { session_id: string }) => r.session_id === sessionId,
+      );
     } catch (error) {
       console.error("Error checking embedding existence:", error);
       return false;

package/src/comparison-db.ts CHANGED Viewed

@@ -8,7 +8,7 @@ import type {
   SessionMetrics,
 } from "./types";
-const SCHEMA_VERSION = 2;
+const SCHEMA_VERSION = 3;
 export class ComparisonDb {
   private db: Database;
@@ -98,6 +98,33 @@ export class ComparisonDb {
       }
     }
+    // Migration: add hypothesis column and comparison_analyses table
+    if (currentVersion < 3) {
+      // Add hypothesis column to comparison_configs
+      const tableInfo = this.db
+        .query<{ name: string }, []>("PRAGMA table_info(comparison_configs)")
+        .all();
+      const hasHypothesisColumn = tableInfo.some(
+        (col) => col.name === "hypothesis",
+      );
+      if (!hasHypothesisColumn) {
+        this.db.run(
+          `ALTER TABLE comparison_configs ADD COLUMN hypothesis TEXT`,
+        );
+      }
+      // Create comparison_analyses table
+      this.db.run(`
+        CREATE TABLE IF NOT EXISTS comparison_analyses (
+          comparison_run_id TEXT PRIMARY KEY,
+          analysis_json TEXT NOT NULL,
+          created_at TEXT NOT NULL,
+          updated_at TEXT NOT NULL
+        )
+      `);
+    }
     // Update schema version
     this.db.run(`PRAGMA user_version = ${SCHEMA_VERSION}`);
   }
@@ -108,14 +135,15 @@ export class ComparisonDb {
     const now = new Date().toISOString();
     this.db.run(
       `
-      INSERT INTO comparison_configs (id, dimensions, control_model, variant_model, variant_system_prompt, variant_tools, created_at, updated_at)
-      VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+      INSERT INTO comparison_configs (id, dimensions, control_model, variant_model, variant_system_prompt, variant_tools, hypothesis, created_at, updated_at)
+      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
       ON CONFLICT(id) DO UPDATE SET
         dimensions = excluded.dimensions,
         control_model = excluded.control_model,
         variant_model = excluded.variant_model,
         variant_system_prompt = excluded.variant_system_prompt,
         variant_tools = excluded.variant_tools,
+        hypothesis = excluded.hypothesis,
         updated_at = excluded.updated_at
       `,
       [
@@ -125,6 +153,7 @@ export class ComparisonDb {
         config.variantModel ?? null,
         config.variantSystemPrompt ?? null,
         config.variantTools ? JSON.stringify(config.variantTools) : null,
+        config.hypothesis ?? null,
         config.createdAt || now,
         now,
       ],
@@ -199,6 +228,7 @@ export class ComparisonDb {
       variantModel: row.variant_model ?? undefined,
       variantSystemPrompt: row.variant_system_prompt ?? undefined,
       variantTools,
+      hypothesis: row.hypothesis ?? undefined,
       createdAt: row.created_at,
       updatedAt: row.updated_at,
     };
@@ -377,4 +407,46 @@ export class ComparisonDb {
       variantResponse: row.variant_response ?? null,
     };
   }
+  // Comparison Analysis methods
+  /**
+   * Inserts or updates the stored analysis for a comparison run.
+   *
+   * The payload is serialized with `JSON.stringify`. A new row gets the current
+   * time in both `created_at` and `updated_at`; when a row for `runId` already
+   * exists, only `analysis_json` and `updated_at` are overwritten, so the
+   * original `created_at` is kept.
+   *
+   * @param runId - Comparison run id (primary key of `comparison_analyses`).
+   * @param analysis - JSON-serializable analysis payload.
+   * @throws TypeError when `analysis` has no JSON representation.
+   */
+  saveComparisonAnalysis(runId: string, analysis: unknown): void {
+    // JSON.stringify returns undefined (not a string) for undefined, functions
+    // and symbols. Fail here with a clear message instead of handing a
+    // non-string to the NOT NULL analysis_json column.
+    const analysisJson: string | undefined = JSON.stringify(analysis);
+    if (analysisJson === undefined) {
+      throw new TypeError(
+        `Comparison analysis for run ${runId} is not JSON-serializable`,
+      );
+    }
+    const now = new Date().toISOString();
+    this.db.run(
+      `
+      INSERT INTO comparison_analyses (comparison_run_id, analysis_json, created_at, updated_at)
+      VALUES (?, ?, ?, ?)
+      ON CONFLICT(comparison_run_id) DO UPDATE SET
+        analysis_json = excluded.analysis_json,
+        updated_at = excluded.updated_at
+      `,
+      [runId, analysisJson, now, now],
+    );
+  }
+  /**
+   * Loads the stored analysis for a comparison run and parses its JSON.
+   *
+   * @param runId - Comparison run id to look up.
+   * @returns The parsed value, or `null` when no row exists or the stored text
+   *   is not valid JSON (the parse error is logged to the console).
+   */
+  getComparisonAnalysis(runId: string): unknown | null {
+    const stored = this.db
+      .query<{ analysis_json: string }, [string]>(
+        `SELECT analysis_json FROM comparison_analyses WHERE comparison_run_id = ?`,
+      )
+      .get(runId);
+    if (stored) {
+      try {
+        return JSON.parse(stored.analysis_json);
+      } catch (parseError) {
+        console.error("Failed to parse comparison analysis JSON:", parseError);
+      }
+    }
+    // Reached when there is no row, or when parsing failed above.
+    return null;
+  }
+  /**
+   * Reports whether an analysis row is stored for the given comparison run.
+   *
+   * @param runId - Comparison run id to look up.
+   * @returns `true` when the row count for `runId` is greater than zero.
+   */
+  hasComparisonAnalysis(runId: string): boolean {
+    const countQuery = this.db.query<{ count: number }, [string]>(
+      `SELECT COUNT(*) as count FROM comparison_analyses WHERE comparison_run_id = ?`,
+    );
+    // A missing result row is treated the same as a count of zero.
+    const matches = countQuery.get(runId)?.count ?? 0;
+    return matches > 0;
+  }
 }

package/src/components/AnalyzeAllButton.tsx CHANGED Viewed

@@ -36,8 +36,12 @@ export function AnalyzeAllButton({ sessionIds, onComplete }: Props) {
       const { results } = await response.json();
       // Count successes and errors
-      const completed = results.filter((r: any) => r.success).length;
-      const errors = results.filter((r: any) => !r.success).length;
+      const completed = results.filter(
+        (r: { success: boolean }) => r.success,
+      ).length;
+      const errors = results.filter(
+        (r: { success: boolean }) => !r.success,
+      ).length;
       setProgress({ completed, total: sessionIds.length, errors });