@townco/debugger 0.1.32 → 0.1.34

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,151 @@
1
+ /**
2
+ * Zod schemas for session comparison analysis validation
3
+ */
4
+
5
+ import { z } from "zod";
6
+
7
// Enum schemas
// Each schema accepts exactly one of the listed string literals and rejects
// every other value.

/** Three-step HIGH / MEDIUM / LOW consistency scale. */
export const ConsistencyLevelSchema = z.enum(["HIGH", "MEDIUM", "LOW"]);

/** Verdict values of the change impact report. */
export const EffectivenessRatingSchema = z.enum([
  "IMPROVED",
  "DEGRADED",
  "NEUTRAL",
  "MIXED",
]);

/** Confidence attached to a report verdict. */
export const ConfidenceLevelSchema = z.enum(["HIGH", "MEDIUM", "LOW"]);

/** Weight of a behavioral difference; also the severity of an unintended effect. */
export const SignificanceSchema = z.enum(["CRITICAL", "NOTABLE", "MINOR"]);

/** Verdict values of the reproducibility report. */
export const ReproducibilityVerdictSchema = z.enum([
  "STABLE",
  "UNSTABLE",
  "PARTIALLY_STABLE",
]);

/** Assessment of whether an intended effect was achieved. */
export const EffectAssessmentSchema = z.enum([
  "ACHIEVED",
  "PARTIALLY_ACHIEVED",
  "NOT_ACHIEVED",
  "OPPOSITE_EFFECT",
]);

/** Direction of an unintended effect's impact. */
export const ImpactTypeSchema = z.enum(["POSITIVE", "NEGATIVE", "NEUTRAL"]);

/** Category a behavioral difference is filed under. */
export const BehavioralCategorySchema = z.enum([
  "TOOL_USAGE",
  "RESPONSE_CONTENT",
  "REASONING_PATH",
  "ERROR_HANDLING",
  "PERFORMANCE",
]);

/** Priority of a recommendation. */
export const PrioritySchema = z.enum(["HIGH", "MEDIUM", "LOW"]);

/** Configuration dimension that a comparison or a suggested experiment varies. */
export const ComparisonDimensionSchema = z.enum([
  "model",
  "system_prompt",
  "tools",
]);
42
+
43
+ // Component schemas
44
/**
 * A single behavioral difference.
 *
 * `observation` and `evidence` must each be at least 10 characters long;
 * `category` and `significance` must be members of their enum schemas.
 */
export const BehavioralDifferenceSchema = z.object({
  category: BehavioralCategorySchema,
  observation: z.string().min(10),
  evidence: z.string().min(10),
  significance: SignificanceSchema,
});
50
+
51
/**
 * An intended effect and how it turned out.
 *
 * `expected_change` needs at least 5 characters; `observed_outcome` and
 * `evidence` need at least 10 each.
 */
export const IntendedEffectSchema = z.object({
  expected_change: z.string().min(5),
  observed_outcome: z.string().min(10),
  evidence: z.string().min(10),
  assessment: EffectAssessmentSchema,
});
57
+
58
/**
 * An unintended effect.
 *
 * `observation` and `evidence` need at least 10 characters each. `severity`
 * reuses the significance enum (CRITICAL / NOTABLE / MINOR).
 */
export const UnintendedEffectSchema = z.object({
  observation: z.string().min(10),
  evidence: z.string().min(10),
  impact: ImpactTypeSchema,
  severity: SignificanceSchema,
});
64
+
65
/**
 * A recommendation.
 *
 * `action` and `rationale` need at least 10 characters each;
 * `expected_impact` may be omitted and has no length constraint.
 */
export const RecommendationSchema = z.object({
  priority: PrioritySchema,
  action: z.string().min(10),
  rationale: z.string().min(10),
  expected_impact: z.string().optional(),
});
71
+
72
/**
 * Metric deltas plus a textual interpretation.
 *
 * The four numeric fields accept any number (no range or integer check);
 * `interpretation` needs at least 10 characters.
 */
export const MetricComparisonSchema = z.object({
  duration_delta_pct: z.number(),
  token_delta_pct: z.number(),
  cost_delta_pct: z.number(),
  tool_call_delta: z.number(),
  interpretation: z.string().min(10),
});
79
+
80
/**
 * Per-tool call counts for the control and variant sides, plus a description
 * of the change.
 *
 * No length or range constraints are applied to any field.
 */
export const ToolUsageChangeSchema = z.object({
  tool_name: z.string(),
  control_calls: z.number(),
  variant_calls: z.number(),
  pattern_change: z.string(),
});
86
+
87
/**
 * A proposed follow-up experiment.
 *
 * `hypothesis`, `suggested_change.description` and `expected_outcome` need at
 * least 10 characters each; `suggested_change.example` may be omitted.
 */
export const NextExperimentSchema = z.object({
  hypothesis: z.string().min(10),
  suggested_change: z.object({
    dimension: ComparisonDimensionSchema,
    description: z.string().min(10),
    example: z.string().optional(),
  }),
  expected_outcome: z.string().min(10),
});
96
+
97
/**
 * Summary of configuration changes.
 *
 * `model_change` may be omitted; when present it must carry both `from` and
 * `to`. The two tool lists are required but may be empty arrays.
 */
export const ConfigSummarySchema = z.object({
  model_change: z
    .object({
      from: z.string(),
      to: z.string(),
    })
    .optional(),
  system_prompt_changed: z.boolean(),
  tools_added: z.array(z.string()),
  tools_removed: z.array(z.string()),
});
108
+
109
+ // Report schemas
110
/**
 * Reproducibility report.
 *
 * `summary` needs at least 20 characters. The two array fields are required
 * but carry no minimum length, so empty arrays validate.
 */
export const ReproducibilityReportSchema = z.object({
  verdict: ReproducibilityVerdictSchema,
  confidence: ConfidenceLevelSchema,
  summary: z.string().min(20),
  behavioral_differences: z.array(BehavioralDifferenceSchema),
  metric_comparison: MetricComparisonSchema,
  recommendations: z.array(RecommendationSchema),
});
118
+
119
/**
 * Change impact report.
 *
 * `summary` needs at least 20 characters and `hypothesis_assessment` at least
 * 10. All array fields are required but may be empty.
 */
export const ChangeImpactReportSchema = z.object({
  verdict: EffectivenessRatingSchema,
  confidence: ConfidenceLevelSchema,
  summary: z.string().min(20),
  hypothesis_assessment: z.string().min(10),
  intended_effects: z.array(IntendedEffectSchema),
  unintended_effects: z.array(UnintendedEffectSchema),
  metric_comparison: MetricComparisonSchema,
  tool_usage_changes: z.array(ToolUsageChangeSchema),
  recommendations: z.array(RecommendationSchema),
});
130
+
131
/**
 * LLM output schema - what we expect the LLM to generate.
 *
 * Holds both reports plus at most five suggested next experiments.
 */
export const LLMComparisonOutputSchema = z.object({
  reproducibility: ReproducibilityReportSchema,
  change_impact: ChangeImpactReportSchema,
  next_experiments: z.array(NextExperimentSchema).max(5),
});
137
+
138
/**
 * Full analysis schema.
 *
 * Consists of run identifiers, the hypothesis and the configuration context,
 * followed by the three LLM-generated fields (`reproducibility`,
 * `change_impact`, `next_experiments`). Those three are taken from
 * `LLMComparisonOutputSchema.shape` rather than re-declared, so the two
 * schemas cannot drift apart. The spread sits last so the key order matches
 * the explicit declaration it replaces.
 *
 * The identifier, timestamp and hypothesis strings carry no format or length
 * constraints.
 */
export const SessionComparisonAnalysisSchema = z.object({
  comparison_run_id: z.string(),
  created_at: z.string(),
  original_session_id: z.string(),
  control_session_id: z.string(),
  variant_session_id: z.string(),
  hypothesis: z.string(),
  dimensions_compared: z.array(ComparisonDimensionSchema),
  config_summary: ConfigSummarySchema,
  ...LLMComparisonOutputSchema.shape,
});
@@ -0,0 +1,194 @@
1
+ /**
2
+ * Session Comparison Analysis Types
3
+ *
4
+ * Types for analyzing comparisons between original, control, and variant sessions
5
+ * to produce Reproducibility Reports and Change Impact Reports.
6
+ */
7
+
8
+ import type { ComparisonDimension } from "../types";
9
+
10
// Enum types
// String-literal unions; each alias lists every value it permits.

/** Three-step consistency scale. */
export type ConsistencyLevel = "HIGH" | "MEDIUM" | "LOW";

/** Verdict of a change impact report. */
export type EffectivenessRating = "IMPROVED" | "DEGRADED" | "NEUTRAL" | "MIXED";

/** Confidence attached to a report verdict. */
export type ConfidenceLevel = "HIGH" | "MEDIUM" | "LOW";

/** Weight of a behavioral difference; also the severity of an unintended effect. */
export type Significance = "CRITICAL" | "NOTABLE" | "MINOR";

/** Verdict of a reproducibility report. */
export type ReproducibilityVerdict = "STABLE" | "UNSTABLE" | "PARTIALLY_STABLE";

/** Assessment of whether an intended effect was achieved. */
export type EffectAssessment =
  | "ACHIEVED"
  | "PARTIALLY_ACHIEVED"
  | "NOT_ACHIEVED"
  | "OPPOSITE_EFFECT";

/** Direction of an unintended effect's impact. */
export type ImpactType = "POSITIVE" | "NEGATIVE" | "NEUTRAL";

/** Category a behavioral difference is filed under. */
export type BehavioralCategory =
  | "TOOL_USAGE"
  | "RESPONSE_CONTENT"
  | "REASONING_PATH"
  | "ERROR_HANDLING"
  | "PERFORMANCE";

/** Priority of a recommendation. */
export type Priority = "HIGH" | "MEDIUM" | "LOW";
29
+
30
/**
 * A behavioral difference observed between sessions.
 */
export interface BehavioralDifference {
  /** Category the difference is filed under. */
  category: BehavioralCategory;
  /** Description of what differed. */
  observation: string;
  /** Specific quotes/citations using [TOOL:name], [MSG:"quote..."] format. */
  evidence: string;
  /** How much the difference matters. */
  significance: Significance;
}
39
+
40
/**
 * An intended effect from the config changes.
 */
export interface IntendedEffect {
  /** The change that was expected. */
  expected_change: string;
  /** What was actually observed. */
  observed_outcome: string;
  /** Supporting evidence for the observed outcome. */
  evidence: string;
  /** Whether, and how far, the expected change was achieved. */
  assessment: EffectAssessment;
}
49
+
50
/**
 * An unintended side effect from the config changes.
 */
export interface UnintendedEffect {
  /** Description of the side effect. */
  observation: string;
  /** Supporting evidence for the observation. */
  evidence: string;
  /** Whether the side effect is positive, negative or neutral. */
  impact: ImpactType;
  /** Severity, expressed on the significance scale. */
  severity: Significance;
}
59
+
60
/**
 * An actionable recommendation for improving the agent.
 */
export interface Recommendation {
  /** Priority of the recommendation. */
  priority: Priority;
  /** Specific, actionable - e.g., "Add to system prompt: '...'". */
  action: string;
  /** Why the action is recommended. */
  rationale: string;
  /** Optional description of the impact the action is expected to have. */
  expected_impact?: string | undefined;
}
69
+
70
/**
 * Metric comparison between sessions.
 */
export interface MetricComparison {
  /** Percentage change in duration. */
  duration_delta_pct: number;
  /** Change in tokens (the `_pct` suffix suggests a percentage). */
  token_delta_pct: number;
  /** Change in cost (the `_pct` suffix suggests a percentage). */
  cost_delta_pct: number;
  /** Absolute difference in tool call count. */
  tool_call_delta: number;
  /** Human-readable interpretation. */
  interpretation: string;
}
80
+
81
/**
 * Tool usage change between control and variant.
 */
export interface ToolUsageChange {
  /** Name of the tool. */
  tool_name: string;
  /** Number of calls on the control side. */
  control_calls: number;
  /** Number of calls on the variant side. */
  variant_calls: number;
  /** Description of how usage changed. */
  pattern_change: string;
}
90
+
91
/**
 * Suggested next experiment to run.
 */
export interface NextExperiment {
  /** Hypothesis the experiment would test. */
  hypothesis: string;
  /** The configuration change to try. */
  suggested_change: {
    /** Which configuration dimension to vary. */
    dimension: ComparisonDimension;
    /** Description of the change. */
    description: string;
    /** Concrete example for prompt changes. */
    example?: string | undefined;
  };
  /** Outcome expected if the hypothesis holds. */
  expected_outcome: string;
}
103
+
104
/**
 * Config summary showing what changed between control and variant.
 */
export interface ConfigSummary {
  /** Model switch, as a from/to pair; optional. */
  model_change?: { from: string; to: string } | undefined;
  /** Whether the system prompt changed. */
  system_prompt_changed: boolean;
  /** Names of tools that were added. */
  tools_added: string[];
  /** Names of tools that were removed. */
  tools_removed: string[];
}
113
+
114
/**
 * Reproducibility Report - Original vs Control analysis.
 */
export interface ReproducibilityReport {
  /** Overall reproducibility verdict. */
  verdict: ReproducibilityVerdict;
  /** Confidence in the verdict. */
  confidence: ConfidenceLevel;
  /** Textual summary of the report. */
  summary: string;
  /** Behavioral differences that were observed. */
  behavioral_differences: BehavioralDifference[];
  /** Metric deltas with their interpretation. */
  metric_comparison: MetricComparison;
  /** Recommendations attached to this report. */
  recommendations: Recommendation[];
}
125
+
126
/**
 * Change Impact Report - Control vs Variant analysis.
 */
export interface ChangeImpactReport {
  /** Overall effectiveness verdict. */
  verdict: EffectivenessRating;
  /** Confidence in the verdict. */
  confidence: ConfidenceLevel;
  /** Textual summary of the report. */
  summary: string;
  /** Did changes achieve the user's hypothesis? */
  hypothesis_assessment: string;
  /** Effects the config changes were meant to produce. */
  intended_effects: IntendedEffect[];
  /** Side effects the config changes were not meant to produce. */
  unintended_effects: UnintendedEffect[];
  /** Metric deltas with their interpretation. */
  metric_comparison: MetricComparison;
  /** Per-tool usage changes. */
  tool_usage_changes: ToolUsageChange[];
  /** Recommendations attached to this report. */
  recommendations: Recommendation[];
}
140
+
141
/**
 * Complete Session Comparison Analysis.
 */
export interface SessionComparisonAnalysis {
  /** Identifier of the comparison run this analysis belongs to. */
  comparison_run_id: string;
  /** Creation timestamp, stored as a string. */
  created_at: string;

  // Session IDs
  original_session_id: string;
  control_session_id: string;
  variant_session_id: string;

  /** User's hypothesis. */
  hypothesis: string;

  // Config context
  dimensions_compared: ComparisonDimension[];
  config_summary: ConfigSummary;

  /** Section 1: Reproducibility Report (Original vs Control). */
  reproducibility: ReproducibilityReport;

  /** Section 2: Change Impact Report (Control vs Variant). */
  change_impact: ChangeImpactReport;

  /** Suggested next experiments. */
  next_experiments: NextExperiment[];
}
169
+
170
/**
 * LLM output schema - what the LLM generates (subset of full analysis).
 *
 * The two report fields reference the named report interfaces instead of
 * repeating their members inline; the resulting shape is structurally
 * identical, and the duplicated member lists can no longer drift apart.
 */
export interface LLMComparisonOutput {
  /** Reproducibility report section. */
  reproducibility: ReproducibilityReport;
  /** Change impact report section. */
  change_impact: ChangeImpactReport;
  /** Suggested next experiments. */
  next_experiments: NextExperiment[];
}
@@ -163,7 +163,10 @@ export class AnalysisDb {
163
163
  .limit(1000) // Should be enough for most use cases
164
164
  .toArray();
165
165
 
166
- const record = allRecords.find((r: any) => r.session_id === sessionId);
166
+ const record = allRecords.find(
167
+ (r: { session_id: string; embedding: number[] }) =>
168
+ r.session_id === sessionId,
169
+ );
167
170
 
168
171
  if (!record) {
169
172
  return null;
@@ -200,10 +203,12 @@ export class AnalysisDb {
200
203
  .limit(limit)
201
204
  .toArray();
202
205
 
203
- return results.map((result: any) => ({
204
- session_id: result.session_id,
205
- distance: result._distance,
206
- }));
206
+ return results.map(
207
+ (result: { session_id: string; _distance: number }) => ({
208
+ session_id: result.session_id,
209
+ distance: result._distance,
210
+ }),
211
+ );
207
212
  } catch (error) {
208
213
  console.error("Error searching similar sessions:", error);
209
214
  return [];
@@ -229,7 +234,9 @@ export class AnalysisDb {
229
234
  .limit(1000)
230
235
  .toArray();
231
236
 
232
- return allRecords.some((r: any) => r.session_id === sessionId);
237
+ return allRecords.some(
238
+ (r: { session_id: string }) => r.session_id === sessionId,
239
+ );
233
240
  } catch (error) {
234
241
  console.error("Error checking embedding existence:", error);
235
242
  return false;
@@ -8,7 +8,7 @@ import type {
8
8
  SessionMetrics,
9
9
  } from "./types";
10
10
 
11
- const SCHEMA_VERSION = 2;
11
+ const SCHEMA_VERSION = 3;
12
12
 
13
13
  export class ComparisonDb {
14
14
  private db: Database;
@@ -98,6 +98,33 @@ export class ComparisonDb {
98
98
  }
99
99
  }
100
100
 
101
+ // Migration: add hypothesis column and comparison_analyses table
102
+ if (currentVersion < 3) {
103
+ // Add hypothesis column to comparison_configs
104
+ const tableInfo = this.db
105
+ .query<{ name: string }, []>("PRAGMA table_info(comparison_configs)")
106
+ .all();
107
+ const hasHypothesisColumn = tableInfo.some(
108
+ (col) => col.name === "hypothesis",
109
+ );
110
+
111
+ if (!hasHypothesisColumn) {
112
+ this.db.run(
113
+ `ALTER TABLE comparison_configs ADD COLUMN hypothesis TEXT`,
114
+ );
115
+ }
116
+
117
+ // Create comparison_analyses table
118
+ this.db.run(`
119
+ CREATE TABLE IF NOT EXISTS comparison_analyses (
120
+ comparison_run_id TEXT PRIMARY KEY,
121
+ analysis_json TEXT NOT NULL,
122
+ created_at TEXT NOT NULL,
123
+ updated_at TEXT NOT NULL
124
+ )
125
+ `);
126
+ }
127
+
101
128
  // Update schema version
102
129
  this.db.run(`PRAGMA user_version = ${SCHEMA_VERSION}`);
103
130
  }
@@ -108,14 +135,15 @@ export class ComparisonDb {
108
135
  const now = new Date().toISOString();
109
136
  this.db.run(
110
137
  `
111
- INSERT INTO comparison_configs (id, dimensions, control_model, variant_model, variant_system_prompt, variant_tools, created_at, updated_at)
112
- VALUES (?, ?, ?, ?, ?, ?, ?, ?)
138
+ INSERT INTO comparison_configs (id, dimensions, control_model, variant_model, variant_system_prompt, variant_tools, hypothesis, created_at, updated_at)
139
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
113
140
  ON CONFLICT(id) DO UPDATE SET
114
141
  dimensions = excluded.dimensions,
115
142
  control_model = excluded.control_model,
116
143
  variant_model = excluded.variant_model,
117
144
  variant_system_prompt = excluded.variant_system_prompt,
118
145
  variant_tools = excluded.variant_tools,
146
+ hypothesis = excluded.hypothesis,
119
147
  updated_at = excluded.updated_at
120
148
  `,
121
149
  [
@@ -125,6 +153,7 @@ export class ComparisonDb {
125
153
  config.variantModel ?? null,
126
154
  config.variantSystemPrompt ?? null,
127
155
  config.variantTools ? JSON.stringify(config.variantTools) : null,
156
+ config.hypothesis ?? null,
128
157
  config.createdAt || now,
129
158
  now,
130
159
  ],
@@ -199,6 +228,7 @@ export class ComparisonDb {
199
228
  variantModel: row.variant_model ?? undefined,
200
229
  variantSystemPrompt: row.variant_system_prompt ?? undefined,
201
230
  variantTools,
231
+ hypothesis: row.hypothesis ?? undefined,
202
232
  createdAt: row.created_at,
203
233
  updatedAt: row.updated_at,
204
234
  };
@@ -377,4 +407,46 @@ export class ComparisonDb {
377
407
  variantResponse: row.variant_response ?? null,
378
408
  };
379
409
  }
410
+
411
+ // Comparison Analysis methods
412
+
413
/**
 * Insert or update the stored analysis for a comparison run.
 *
 * The analysis is serialized with `JSON.stringify` and written to the
 * `comparison_analyses` table keyed by `comparison_run_id`. When a row for
 * the run already exists, only `analysis_json` and `updated_at` are
 * overwritten; the existing `created_at` is kept.
 *
 * @param runId - Comparison run the analysis belongs to.
 * @param analysis - Value to persist; must be JSON-serializable.
 * @throws TypeError if `analysis` has no JSON representation.
 */
saveComparisonAnalysis(runId: string, analysis: unknown): void {
  // JSON.stringify yields undefined (not a string) for undefined, functions
  // and symbols. analysis_json is NOT NULL, so fail here with a clear message
  // instead of handing a non-string value to the database.
  const serialized: string | undefined = JSON.stringify(analysis);
  if (serialized === undefined) {
    throw new TypeError(
      `Comparison analysis for run ${runId} is not JSON-serializable`,
    );
  }

  const timestamp = new Date().toISOString();
  this.db.run(
    `
    INSERT INTO comparison_analyses (comparison_run_id, analysis_json, created_at, updated_at)
    VALUES (?, ?, ?, ?)
    ON CONFLICT(comparison_run_id) DO UPDATE SET
      analysis_json = excluded.analysis_json,
      updated_at = excluded.updated_at
    `,
    [runId, serialized, timestamp, timestamp],
  );
}
426
+
427
/**
 * Load the stored analysis for a comparison run.
 *
 * The stored JSON is parsed but NOT validated, so the result is `unknown`
 * and callers must check its shape before use.
 *
 * @param runId - Comparison run to look up.
 * @returns The parsed analysis, or `null` when no row exists for the run or
 *   the stored text is not valid JSON (the parse error is logged).
 */
getComparisonAnalysis(runId: string): unknown | null {
  const stored = this.db
    .query<{ analysis_json: string }, [string]>(
      `SELECT analysis_json FROM comparison_analyses WHERE comparison_run_id = ?`,
    )
    .get(runId);

  if (!stored) {
    return null;
  }

  try {
    return JSON.parse(stored.analysis_json);
  } catch (error) {
    // A corrupt row is reported and treated the same as a missing one.
    console.error("Failed to parse comparison analysis JSON:", error);
    return null;
  }
}
442
+
443
/**
 * Report whether an analysis row exists for a comparison run.
 *
 * Only checks that a row is present; it does not check that the stored JSON
 * can be parsed.
 *
 * @param runId - Comparison run to look up.
 * @returns `true` when at least one row matches, otherwise `false`.
 */
hasComparisonAnalysis(runId: string): boolean {
  const result = this.db
    .query<{ count: number }, [string]>(
      `SELECT COUNT(*) as count FROM comparison_analyses WHERE comparison_run_id = ?`,
    )
    .get(runId);

  // A missing result row is treated as a count of zero.
  const matches = result?.count ?? 0;
  return matches > 0;
}
380
452
  }
@@ -36,8 +36,12 @@ export function AnalyzeAllButton({ sessionIds, onComplete }: Props) {
36
36
  const { results } = await response.json();
37
37
 
38
38
  // Count successes and errors
39
- const completed = results.filter((r: any) => r.success).length;
40
- const errors = results.filter((r: any) => !r.success).length;
39
+ const completed = results.filter(
40
+ (r: { success: boolean }) => r.success,
41
+ ).length;
42
+ const errors = results.filter(
43
+ (r: { success: boolean }) => !r.success,
44
+ ).length;
41
45
 
42
46
  setProgress({ completed, total: sessionIds.length, errors });
43
47