@townco/debugger 0.1.32 → 0.1.33
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/App.tsx +1 -0
- package/src/analysis/analyzer.ts +1 -2
- package/src/analysis/comparison-analyzer.ts +528 -0
- package/src/analysis/comparison-schema.ts +151 -0
- package/src/analysis/comparison-types.ts +194 -0
- package/src/analysis-db.ts +13 -6
- package/src/comparison-db.ts +75 -3
- package/src/components/AnalyzeAllButton.tsx +6 -2
- package/src/components/ComparisonAnalysisDialog.tsx +591 -0
- package/src/components/DebuggerHeader.tsx +0 -1
- package/src/components/LogList.tsx +9 -0
- package/src/components/SessionTraceList.tsx +9 -0
- package/src/components/SpanDetailsPanel.tsx +20 -1
- package/src/components/SpanTimeline.tsx +31 -4
- package/src/components/SpanTree.tsx +10 -1
- package/src/components/TurnMetadataPanel.tsx +0 -1
- package/src/components/UnifiedTimeline.tsx +25 -35
- package/src/components/ui/button.tsx +1 -1
- package/src/components/ui/card.tsx +1 -1
- package/src/components/ui/checkbox.tsx +1 -0
- package/src/components/ui/input.tsx +1 -1
- package/src/components/ui/label.tsx +1 -1
- package/src/components/ui/select.tsx +1 -1
- package/src/components/ui/textarea.tsx +1 -1
- package/src/frontend.tsx +2 -0
- package/src/lib/metrics.test.ts +2 -0
- package/src/lib/turnExtractor.ts +28 -0
- package/src/pages/ComparisonView.tsx +586 -92
- package/src/pages/FindSessions.tsx +3 -1
- package/src/pages/TownHall.tsx +30 -14
- package/src/server.ts +177 -7
- package/src/types.ts +4 -0
- package/styles/globals.css +120 -0
- package/tsconfig.json +2 -2
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Zod schemas for session comparison analysis validation
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { z } from "zod";
|
|
6
|
+
|
|
7
|
+
// Enum schemas
|
|
8
|
+
export const ConsistencyLevelSchema = z.enum(["HIGH", "MEDIUM", "LOW"]);
|
|
9
|
+
export const EffectivenessRatingSchema = z.enum([
|
|
10
|
+
"IMPROVED",
|
|
11
|
+
"DEGRADED",
|
|
12
|
+
"NEUTRAL",
|
|
13
|
+
"MIXED",
|
|
14
|
+
]);
|
|
15
|
+
export const ConfidenceLevelSchema = z.enum(["HIGH", "MEDIUM", "LOW"]);
|
|
16
|
+
export const SignificanceSchema = z.enum(["CRITICAL", "NOTABLE", "MINOR"]);
|
|
17
|
+
export const ReproducibilityVerdictSchema = z.enum([
|
|
18
|
+
"STABLE",
|
|
19
|
+
"UNSTABLE",
|
|
20
|
+
"PARTIALLY_STABLE",
|
|
21
|
+
]);
|
|
22
|
+
export const EffectAssessmentSchema = z.enum([
|
|
23
|
+
"ACHIEVED",
|
|
24
|
+
"PARTIALLY_ACHIEVED",
|
|
25
|
+
"NOT_ACHIEVED",
|
|
26
|
+
"OPPOSITE_EFFECT",
|
|
27
|
+
]);
|
|
28
|
+
export const ImpactTypeSchema = z.enum(["POSITIVE", "NEGATIVE", "NEUTRAL"]);
|
|
29
|
+
export const BehavioralCategorySchema = z.enum([
|
|
30
|
+
"TOOL_USAGE",
|
|
31
|
+
"RESPONSE_CONTENT",
|
|
32
|
+
"REASONING_PATH",
|
|
33
|
+
"ERROR_HANDLING",
|
|
34
|
+
"PERFORMANCE",
|
|
35
|
+
]);
|
|
36
|
+
export const PrioritySchema = z.enum(["HIGH", "MEDIUM", "LOW"]);
|
|
37
|
+
export const ComparisonDimensionSchema = z.enum([
|
|
38
|
+
"model",
|
|
39
|
+
"system_prompt",
|
|
40
|
+
"tools",
|
|
41
|
+
]);
|
|
42
|
+
|
|
43
|
+
// Component schemas
|
|
44
|
+
export const BehavioralDifferenceSchema = z.object({
|
|
45
|
+
category: BehavioralCategorySchema,
|
|
46
|
+
observation: z.string().min(10),
|
|
47
|
+
evidence: z.string().min(10),
|
|
48
|
+
significance: SignificanceSchema,
|
|
49
|
+
});
|
|
50
|
+
|
|
51
|
+
export const IntendedEffectSchema = z.object({
|
|
52
|
+
expected_change: z.string().min(5),
|
|
53
|
+
observed_outcome: z.string().min(10),
|
|
54
|
+
evidence: z.string().min(10),
|
|
55
|
+
assessment: EffectAssessmentSchema,
|
|
56
|
+
});
|
|
57
|
+
|
|
58
|
+
export const UnintendedEffectSchema = z.object({
|
|
59
|
+
observation: z.string().min(10),
|
|
60
|
+
evidence: z.string().min(10),
|
|
61
|
+
impact: ImpactTypeSchema,
|
|
62
|
+
severity: SignificanceSchema,
|
|
63
|
+
});
|
|
64
|
+
|
|
65
|
+
export const RecommendationSchema = z.object({
|
|
66
|
+
priority: PrioritySchema,
|
|
67
|
+
action: z.string().min(10),
|
|
68
|
+
rationale: z.string().min(10),
|
|
69
|
+
expected_impact: z.string().optional(),
|
|
70
|
+
});
|
|
71
|
+
|
|
72
|
+
export const MetricComparisonSchema = z.object({
|
|
73
|
+
duration_delta_pct: z.number(),
|
|
74
|
+
token_delta_pct: z.number(),
|
|
75
|
+
cost_delta_pct: z.number(),
|
|
76
|
+
tool_call_delta: z.number(),
|
|
77
|
+
interpretation: z.string().min(10),
|
|
78
|
+
});
|
|
79
|
+
|
|
80
|
+
export const ToolUsageChangeSchema = z.object({
|
|
81
|
+
tool_name: z.string(),
|
|
82
|
+
control_calls: z.number(),
|
|
83
|
+
variant_calls: z.number(),
|
|
84
|
+
pattern_change: z.string(),
|
|
85
|
+
});
|
|
86
|
+
|
|
87
|
+
export const NextExperimentSchema = z.object({
|
|
88
|
+
hypothesis: z.string().min(10),
|
|
89
|
+
suggested_change: z.object({
|
|
90
|
+
dimension: ComparisonDimensionSchema,
|
|
91
|
+
description: z.string().min(10),
|
|
92
|
+
example: z.string().optional(),
|
|
93
|
+
}),
|
|
94
|
+
expected_outcome: z.string().min(10),
|
|
95
|
+
});
|
|
96
|
+
|
|
97
|
+
export const ConfigSummarySchema = z.object({
|
|
98
|
+
model_change: z
|
|
99
|
+
.object({
|
|
100
|
+
from: z.string(),
|
|
101
|
+
to: z.string(),
|
|
102
|
+
})
|
|
103
|
+
.optional(),
|
|
104
|
+
system_prompt_changed: z.boolean(),
|
|
105
|
+
tools_added: z.array(z.string()),
|
|
106
|
+
tools_removed: z.array(z.string()),
|
|
107
|
+
});
|
|
108
|
+
|
|
109
|
+
// Report schemas
|
|
110
|
+
export const ReproducibilityReportSchema = z.object({
|
|
111
|
+
verdict: ReproducibilityVerdictSchema,
|
|
112
|
+
confidence: ConfidenceLevelSchema,
|
|
113
|
+
summary: z.string().min(20),
|
|
114
|
+
behavioral_differences: z.array(BehavioralDifferenceSchema),
|
|
115
|
+
metric_comparison: MetricComparisonSchema,
|
|
116
|
+
recommendations: z.array(RecommendationSchema),
|
|
117
|
+
});
|
|
118
|
+
|
|
119
|
+
export const ChangeImpactReportSchema = z.object({
|
|
120
|
+
verdict: EffectivenessRatingSchema,
|
|
121
|
+
confidence: ConfidenceLevelSchema,
|
|
122
|
+
summary: z.string().min(20),
|
|
123
|
+
hypothesis_assessment: z.string().min(10),
|
|
124
|
+
intended_effects: z.array(IntendedEffectSchema),
|
|
125
|
+
unintended_effects: z.array(UnintendedEffectSchema),
|
|
126
|
+
metric_comparison: MetricComparisonSchema,
|
|
127
|
+
tool_usage_changes: z.array(ToolUsageChangeSchema),
|
|
128
|
+
recommendations: z.array(RecommendationSchema),
|
|
129
|
+
});
|
|
130
|
+
|
|
131
|
+
// LLM output schema - what we expect the LLM to generate
|
|
132
|
+
export const LLMComparisonOutputSchema = z.object({
|
|
133
|
+
reproducibility: ReproducibilityReportSchema,
|
|
134
|
+
change_impact: ChangeImpactReportSchema,
|
|
135
|
+
next_experiments: z.array(NextExperimentSchema).max(5),
|
|
136
|
+
});
|
|
137
|
+
|
|
138
|
+
// Full analysis schema
|
|
139
|
+
export const SessionComparisonAnalysisSchema = z.object({
|
|
140
|
+
comparison_run_id: z.string(),
|
|
141
|
+
created_at: z.string(),
|
|
142
|
+
original_session_id: z.string(),
|
|
143
|
+
control_session_id: z.string(),
|
|
144
|
+
variant_session_id: z.string(),
|
|
145
|
+
hypothesis: z.string(),
|
|
146
|
+
dimensions_compared: z.array(ComparisonDimensionSchema),
|
|
147
|
+
config_summary: ConfigSummarySchema,
|
|
148
|
+
reproducibility: ReproducibilityReportSchema,
|
|
149
|
+
change_impact: ChangeImpactReportSchema,
|
|
150
|
+
next_experiments: z.array(NextExperimentSchema).max(5),
|
|
151
|
+
});
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Session Comparison Analysis Types
|
|
3
|
+
*
|
|
4
|
+
* Types for analyzing comparisons between original, control, and variant sessions
|
|
5
|
+
* to produce Reproducibility Reports and Change Impact Reports.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { ComparisonDimension } from "../types";
|
|
9
|
+
|
|
10
|
+
// Enum types
|
|
11
|
+
export type ConsistencyLevel = "HIGH" | "MEDIUM" | "LOW";
|
|
12
|
+
export type EffectivenessRating = "IMPROVED" | "DEGRADED" | "NEUTRAL" | "MIXED";
|
|
13
|
+
export type ConfidenceLevel = "HIGH" | "MEDIUM" | "LOW";
|
|
14
|
+
export type Significance = "CRITICAL" | "NOTABLE" | "MINOR";
|
|
15
|
+
export type ReproducibilityVerdict = "STABLE" | "UNSTABLE" | "PARTIALLY_STABLE";
|
|
16
|
+
export type EffectAssessment =
|
|
17
|
+
| "ACHIEVED"
|
|
18
|
+
| "PARTIALLY_ACHIEVED"
|
|
19
|
+
| "NOT_ACHIEVED"
|
|
20
|
+
| "OPPOSITE_EFFECT";
|
|
21
|
+
export type ImpactType = "POSITIVE" | "NEGATIVE" | "NEUTRAL";
|
|
22
|
+
export type BehavioralCategory =
|
|
23
|
+
| "TOOL_USAGE"
|
|
24
|
+
| "RESPONSE_CONTENT"
|
|
25
|
+
| "REASONING_PATH"
|
|
26
|
+
| "ERROR_HANDLING"
|
|
27
|
+
| "PERFORMANCE";
|
|
28
|
+
export type Priority = "HIGH" | "MEDIUM" | "LOW";
|
|
29
|
+
|
|
30
|
+
/**
|
|
31
|
+
* A behavioral difference observed between sessions
|
|
32
|
+
*/
|
|
33
|
+
export interface BehavioralDifference {
|
|
34
|
+
category: BehavioralCategory;
|
|
35
|
+
observation: string;
|
|
36
|
+
evidence: string; // Specific quotes/citations using [TOOL:name], [MSG:"quote..."] format
|
|
37
|
+
significance: Significance;
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
/**
|
|
41
|
+
* An intended effect from the config changes
|
|
42
|
+
*/
|
|
43
|
+
export interface IntendedEffect {
|
|
44
|
+
expected_change: string;
|
|
45
|
+
observed_outcome: string;
|
|
46
|
+
evidence: string;
|
|
47
|
+
assessment: EffectAssessment;
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
/**
|
|
51
|
+
* An unintended side effect from the config changes
|
|
52
|
+
*/
|
|
53
|
+
export interface UnintendedEffect {
|
|
54
|
+
observation: string;
|
|
55
|
+
evidence: string;
|
|
56
|
+
impact: ImpactType;
|
|
57
|
+
severity: Significance;
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
/**
|
|
61
|
+
* An actionable recommendation for improving the agent
|
|
62
|
+
*/
|
|
63
|
+
export interface Recommendation {
|
|
64
|
+
priority: Priority;
|
|
65
|
+
action: string; // Specific, actionable - e.g., "Add to system prompt: '...'"
|
|
66
|
+
rationale: string;
|
|
67
|
+
expected_impact?: string | undefined;
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
/**
|
|
71
|
+
* Metric comparison between sessions
|
|
72
|
+
*/
|
|
73
|
+
export interface MetricComparison {
|
|
74
|
+
duration_delta_pct: number; // Percentage change
|
|
75
|
+
token_delta_pct: number;
|
|
76
|
+
cost_delta_pct: number;
|
|
77
|
+
tool_call_delta: number; // Absolute difference in tool call count
|
|
78
|
+
interpretation: string; // Human-readable interpretation
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
/**
|
|
82
|
+
* Tool usage change between control and variant
|
|
83
|
+
*/
|
|
84
|
+
export interface ToolUsageChange {
|
|
85
|
+
tool_name: string;
|
|
86
|
+
control_calls: number;
|
|
87
|
+
variant_calls: number;
|
|
88
|
+
pattern_change: string; // Description of how usage changed
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
/**
|
|
92
|
+
* Suggested next experiment to run
|
|
93
|
+
*/
|
|
94
|
+
export interface NextExperiment {
|
|
95
|
+
hypothesis: string;
|
|
96
|
+
suggested_change: {
|
|
97
|
+
dimension: ComparisonDimension;
|
|
98
|
+
description: string;
|
|
99
|
+
example?: string | undefined; // Concrete example for prompt changes
|
|
100
|
+
};
|
|
101
|
+
expected_outcome: string;
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
/**
|
|
105
|
+
* Config summary showing what changed between control and variant
|
|
106
|
+
*/
|
|
107
|
+
export interface ConfigSummary {
|
|
108
|
+
model_change?: { from: string; to: string } | undefined;
|
|
109
|
+
system_prompt_changed: boolean;
|
|
110
|
+
tools_added: string[];
|
|
111
|
+
tools_removed: string[];
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
/**
|
|
115
|
+
* Reproducibility Report - Original vs Control analysis
|
|
116
|
+
*/
|
|
117
|
+
export interface ReproducibilityReport {
|
|
118
|
+
verdict: ReproducibilityVerdict;
|
|
119
|
+
confidence: ConfidenceLevel;
|
|
120
|
+
summary: string;
|
|
121
|
+
behavioral_differences: BehavioralDifference[];
|
|
122
|
+
metric_comparison: MetricComparison;
|
|
123
|
+
recommendations: Recommendation[];
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
/**
|
|
127
|
+
* Change Impact Report - Control vs Variant analysis
|
|
128
|
+
*/
|
|
129
|
+
export interface ChangeImpactReport {
|
|
130
|
+
verdict: EffectivenessRating;
|
|
131
|
+
confidence: ConfidenceLevel;
|
|
132
|
+
summary: string;
|
|
133
|
+
hypothesis_assessment: string; // Did changes achieve the user's hypothesis?
|
|
134
|
+
intended_effects: IntendedEffect[];
|
|
135
|
+
unintended_effects: UnintendedEffect[];
|
|
136
|
+
metric_comparison: MetricComparison;
|
|
137
|
+
tool_usage_changes: ToolUsageChange[];
|
|
138
|
+
recommendations: Recommendation[];
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
/**
|
|
142
|
+
* Complete Session Comparison Analysis
|
|
143
|
+
*/
|
|
144
|
+
export interface SessionComparisonAnalysis {
|
|
145
|
+
comparison_run_id: string;
|
|
146
|
+
created_at: string;
|
|
147
|
+
|
|
148
|
+
// Session IDs
|
|
149
|
+
original_session_id: string;
|
|
150
|
+
control_session_id: string;
|
|
151
|
+
variant_session_id: string;
|
|
152
|
+
|
|
153
|
+
// User's hypothesis
|
|
154
|
+
hypothesis: string;
|
|
155
|
+
|
|
156
|
+
// Config context
|
|
157
|
+
dimensions_compared: ComparisonDimension[];
|
|
158
|
+
config_summary: ConfigSummary;
|
|
159
|
+
|
|
160
|
+
// Section 1: Reproducibility Report (Original vs Control)
|
|
161
|
+
reproducibility: ReproducibilityReport;
|
|
162
|
+
|
|
163
|
+
// Section 2: Change Impact Report (Control vs Variant)
|
|
164
|
+
change_impact: ChangeImpactReport;
|
|
165
|
+
|
|
166
|
+
// Suggested next experiments
|
|
167
|
+
next_experiments: NextExperiment[];
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
/**
|
|
171
|
+
* LLM output schema - what the LLM generates (subset of full analysis)
|
|
172
|
+
*/
|
|
173
|
+
export interface LLMComparisonOutput {
|
|
174
|
+
reproducibility: {
|
|
175
|
+
verdict: ReproducibilityVerdict;
|
|
176
|
+
confidence: ConfidenceLevel;
|
|
177
|
+
summary: string;
|
|
178
|
+
behavioral_differences: BehavioralDifference[];
|
|
179
|
+
metric_comparison: MetricComparison;
|
|
180
|
+
recommendations: Recommendation[];
|
|
181
|
+
};
|
|
182
|
+
change_impact: {
|
|
183
|
+
verdict: EffectivenessRating;
|
|
184
|
+
confidence: ConfidenceLevel;
|
|
185
|
+
summary: string;
|
|
186
|
+
hypothesis_assessment: string;
|
|
187
|
+
intended_effects: IntendedEffect[];
|
|
188
|
+
unintended_effects: UnintendedEffect[];
|
|
189
|
+
metric_comparison: MetricComparison;
|
|
190
|
+
tool_usage_changes: ToolUsageChange[];
|
|
191
|
+
recommendations: Recommendation[];
|
|
192
|
+
};
|
|
193
|
+
next_experiments: NextExperiment[];
|
|
194
|
+
}
|
package/src/analysis-db.ts
CHANGED
|
@@ -163,7 +163,10 @@ export class AnalysisDb {
|
|
|
163
163
|
.limit(1000) // Should be enough for most use cases
|
|
164
164
|
.toArray();
|
|
165
165
|
|
|
166
|
-
const record = allRecords.find(
|
|
166
|
+
const record = allRecords.find(
|
|
167
|
+
(r: { session_id: string; embedding: number[] }) =>
|
|
168
|
+
r.session_id === sessionId,
|
|
169
|
+
);
|
|
167
170
|
|
|
168
171
|
if (!record) {
|
|
169
172
|
return null;
|
|
@@ -200,10 +203,12 @@ export class AnalysisDb {
|
|
|
200
203
|
.limit(limit)
|
|
201
204
|
.toArray();
|
|
202
205
|
|
|
203
|
-
return results.map(
|
|
204
|
-
session_id:
|
|
205
|
-
|
|
206
|
-
|
|
206
|
+
return results.map(
|
|
207
|
+
(result: { session_id: string; _distance: number }) => ({
|
|
208
|
+
session_id: result.session_id,
|
|
209
|
+
distance: result._distance,
|
|
210
|
+
}),
|
|
211
|
+
);
|
|
207
212
|
} catch (error) {
|
|
208
213
|
console.error("Error searching similar sessions:", error);
|
|
209
214
|
return [];
|
|
@@ -229,7 +234,9 @@ export class AnalysisDb {
|
|
|
229
234
|
.limit(1000)
|
|
230
235
|
.toArray();
|
|
231
236
|
|
|
232
|
-
return allRecords.some(
|
|
237
|
+
return allRecords.some(
|
|
238
|
+
(r: { session_id: string }) => r.session_id === sessionId,
|
|
239
|
+
);
|
|
233
240
|
} catch (error) {
|
|
234
241
|
console.error("Error checking embedding existence:", error);
|
|
235
242
|
return false;
|
package/src/comparison-db.ts
CHANGED
|
@@ -8,7 +8,7 @@ import type {
|
|
|
8
8
|
SessionMetrics,
|
|
9
9
|
} from "./types";
|
|
10
10
|
|
|
11
|
-
const SCHEMA_VERSION =
|
|
11
|
+
const SCHEMA_VERSION = 3;
|
|
12
12
|
|
|
13
13
|
export class ComparisonDb {
|
|
14
14
|
private db: Database;
|
|
@@ -98,6 +98,33 @@ export class ComparisonDb {
|
|
|
98
98
|
}
|
|
99
99
|
}
|
|
100
100
|
|
|
101
|
+
// Migration: add hypothesis column and comparison_analyses table
|
|
102
|
+
if (currentVersion < 3) {
|
|
103
|
+
// Add hypothesis column to comparison_configs
|
|
104
|
+
const tableInfo = this.db
|
|
105
|
+
.query<{ name: string }, []>("PRAGMA table_info(comparison_configs)")
|
|
106
|
+
.all();
|
|
107
|
+
const hasHypothesisColumn = tableInfo.some(
|
|
108
|
+
(col) => col.name === "hypothesis",
|
|
109
|
+
);
|
|
110
|
+
|
|
111
|
+
if (!hasHypothesisColumn) {
|
|
112
|
+
this.db.run(
|
|
113
|
+
`ALTER TABLE comparison_configs ADD COLUMN hypothesis TEXT`,
|
|
114
|
+
);
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
// Create comparison_analyses table
|
|
118
|
+
this.db.run(`
|
|
119
|
+
CREATE TABLE IF NOT EXISTS comparison_analyses (
|
|
120
|
+
comparison_run_id TEXT PRIMARY KEY,
|
|
121
|
+
analysis_json TEXT NOT NULL,
|
|
122
|
+
created_at TEXT NOT NULL,
|
|
123
|
+
updated_at TEXT NOT NULL
|
|
124
|
+
)
|
|
125
|
+
`);
|
|
126
|
+
}
|
|
127
|
+
|
|
101
128
|
// Update schema version
|
|
102
129
|
this.db.run(`PRAGMA user_version = ${SCHEMA_VERSION}`);
|
|
103
130
|
}
|
|
@@ -108,14 +135,15 @@ export class ComparisonDb {
|
|
|
108
135
|
const now = new Date().toISOString();
|
|
109
136
|
this.db.run(
|
|
110
137
|
`
|
|
111
|
-
INSERT INTO comparison_configs (id, dimensions, control_model, variant_model, variant_system_prompt, variant_tools, created_at, updated_at)
|
|
112
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
|
|
138
|
+
INSERT INTO comparison_configs (id, dimensions, control_model, variant_model, variant_system_prompt, variant_tools, hypothesis, created_at, updated_at)
|
|
139
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
113
140
|
ON CONFLICT(id) DO UPDATE SET
|
|
114
141
|
dimensions = excluded.dimensions,
|
|
115
142
|
control_model = excluded.control_model,
|
|
116
143
|
variant_model = excluded.variant_model,
|
|
117
144
|
variant_system_prompt = excluded.variant_system_prompt,
|
|
118
145
|
variant_tools = excluded.variant_tools,
|
|
146
|
+
hypothesis = excluded.hypothesis,
|
|
119
147
|
updated_at = excluded.updated_at
|
|
120
148
|
`,
|
|
121
149
|
[
|
|
@@ -125,6 +153,7 @@ export class ComparisonDb {
|
|
|
125
153
|
config.variantModel ?? null,
|
|
126
154
|
config.variantSystemPrompt ?? null,
|
|
127
155
|
config.variantTools ? JSON.stringify(config.variantTools) : null,
|
|
156
|
+
config.hypothesis ?? null,
|
|
128
157
|
config.createdAt || now,
|
|
129
158
|
now,
|
|
130
159
|
],
|
|
@@ -199,6 +228,7 @@ export class ComparisonDb {
|
|
|
199
228
|
variantModel: row.variant_model ?? undefined,
|
|
200
229
|
variantSystemPrompt: row.variant_system_prompt ?? undefined,
|
|
201
230
|
variantTools,
|
|
231
|
+
hypothesis: row.hypothesis ?? undefined,
|
|
202
232
|
createdAt: row.created_at,
|
|
203
233
|
updatedAt: row.updated_at,
|
|
204
234
|
};
|
|
@@ -377,4 +407,46 @@ export class ComparisonDb {
|
|
|
377
407
|
variantResponse: row.variant_response ?? null,
|
|
378
408
|
};
|
|
379
409
|
}
|
|
410
|
+
|
|
411
|
+
// Comparison Analysis methods
|
|
412
|
+
|
|
413
|
+
saveComparisonAnalysis(runId: string, analysis: unknown): void {
|
|
414
|
+
const now = new Date().toISOString();
|
|
415
|
+
this.db.run(
|
|
416
|
+
`
|
|
417
|
+
INSERT INTO comparison_analyses (comparison_run_id, analysis_json, created_at, updated_at)
|
|
418
|
+
VALUES (?, ?, ?, ?)
|
|
419
|
+
ON CONFLICT(comparison_run_id) DO UPDATE SET
|
|
420
|
+
analysis_json = excluded.analysis_json,
|
|
421
|
+
updated_at = excluded.updated_at
|
|
422
|
+
`,
|
|
423
|
+
[runId, JSON.stringify(analysis), now, now],
|
|
424
|
+
);
|
|
425
|
+
}
|
|
426
|
+
|
|
427
|
+
getComparisonAnalysis(runId: string): unknown | null {
|
|
428
|
+
const row = this.db
|
|
429
|
+
.query<{ analysis_json: string }, [string]>(
|
|
430
|
+
`SELECT analysis_json FROM comparison_analyses WHERE comparison_run_id = ?`,
|
|
431
|
+
)
|
|
432
|
+
.get(runId);
|
|
433
|
+
|
|
434
|
+
if (!row) return null;
|
|
435
|
+
try {
|
|
436
|
+
return JSON.parse(row.analysis_json);
|
|
437
|
+
} catch (e) {
|
|
438
|
+
console.error("Failed to parse comparison analysis JSON:", e);
|
|
439
|
+
return null;
|
|
440
|
+
}
|
|
441
|
+
}
|
|
442
|
+
|
|
443
|
+
hasComparisonAnalysis(runId: string): boolean {
|
|
444
|
+
const row = this.db
|
|
445
|
+
.query<{ count: number }, [string]>(
|
|
446
|
+
`SELECT COUNT(*) as count FROM comparison_analyses WHERE comparison_run_id = ?`,
|
|
447
|
+
)
|
|
448
|
+
.get(runId);
|
|
449
|
+
|
|
450
|
+
return (row?.count ?? 0) > 0;
|
|
451
|
+
}
|
|
380
452
|
}
|
|
@@ -36,8 +36,12 @@ export function AnalyzeAllButton({ sessionIds, onComplete }: Props) {
|
|
|
36
36
|
const { results } = await response.json();
|
|
37
37
|
|
|
38
38
|
// Count successes and errors
|
|
39
|
-
const completed = results.filter(
|
|
40
|
-
|
|
39
|
+
const completed = results.filter(
|
|
40
|
+
(r: { success: boolean }) => r.success,
|
|
41
|
+
).length;
|
|
42
|
+
const errors = results.filter(
|
|
43
|
+
(r: { success: boolean }) => !r.success,
|
|
44
|
+
).length;
|
|
41
45
|
|
|
42
46
|
setProgress({ completed, total: sessionIds.length, errors });
|
|
43
47
|
|