@superblocksteam/vite-plugin-file-sync 2.0.67 → 2.0.68-next.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. package/dist/ai-service/agent/tools/apis/analysis.d.ts.map +1 -1
  2. package/dist/ai-service/agent/tools/apis/analysis.js +4 -0
  3. package/dist/ai-service/agent/tools/apis/analysis.js.map +1 -1
  4. package/dist/ai-service/agent/tools/apis/api-executor.d.ts +9 -1
  5. package/dist/ai-service/agent/tools/apis/api-executor.d.ts.map +1 -1
  6. package/dist/ai-service/agent/tools/apis/api-executor.js +4 -1
  7. package/dist/ai-service/agent/tools/apis/api-executor.js.map +1 -1
  8. package/dist/ai-service/agent/tools/apis/api-validation-orchestrator.d.ts +1 -0
  9. package/dist/ai-service/agent/tools/apis/api-validation-orchestrator.d.ts.map +1 -1
  10. package/dist/ai-service/agent/tools/apis/api-validation-orchestrator.js +1 -1
  11. package/dist/ai-service/agent/tools/apis/api-validation-orchestrator.js.map +1 -1
  12. package/dist/ai-service/agent/tools/apis/test-api.d.ts +5 -0
  13. package/dist/ai-service/agent/tools/apis/test-api.d.ts.map +1 -1
  14. package/dist/ai-service/agent/tools/apis/test-api.js +9 -1
  15. package/dist/ai-service/agent/tools/apis/test-api.js.map +1 -1
  16. package/dist/ai-service/agent/tools2/tools/read.d.ts +1 -1
  17. package/dist/ai-service/index.d.ts +23 -2
  18. package/dist/ai-service/index.d.ts.map +1 -1
  19. package/dist/ai-service/index.js +99 -0
  20. package/dist/ai-service/index.js.map +1 -1
  21. package/dist/ai-service/judge/debug-browser.d.ts +8 -0
  22. package/dist/ai-service/judge/debug-browser.d.ts.map +1 -0
  23. package/dist/ai-service/judge/debug-browser.js +79 -0
  24. package/dist/ai-service/judge/debug-browser.js.map +1 -0
  25. package/dist/ai-service/judge/index.d.ts +12 -0
  26. package/dist/ai-service/judge/index.d.ts.map +1 -0
  27. package/dist/ai-service/judge/index.js +11 -0
  28. package/dist/ai-service/judge/index.js.map +1 -0
  29. package/dist/ai-service/judge/integration/mcp-client.d.ts +82 -0
  30. package/dist/ai-service/judge/integration/mcp-client.d.ts.map +1 -0
  31. package/dist/ai-service/judge/integration/mcp-client.js +276 -0
  32. package/dist/ai-service/judge/integration/mcp-client.js.map +1 -0
  33. package/dist/ai-service/judge/integration/playwright-bridge.d.ts +142 -0
  34. package/dist/ai-service/judge/integration/playwright-bridge.d.ts.map +1 -0
  35. package/dist/ai-service/judge/integration/playwright-bridge.js +217 -0
  36. package/dist/ai-service/judge/integration/playwright-bridge.js.map +1 -0
  37. package/dist/ai-service/judge/judge-eval-http.d.ts +3 -0
  38. package/dist/ai-service/judge/judge-eval-http.d.ts.map +1 -0
  39. package/dist/ai-service/judge/judge-eval-http.js +541 -0
  40. package/dist/ai-service/judge/judge-eval-http.js.map +1 -0
  41. package/dist/ai-service/judge/judge-eval-service-runner.d.ts +35 -0
  42. package/dist/ai-service/judge/judge-eval-service-runner.d.ts.map +1 -0
  43. package/dist/ai-service/judge/judge-eval-service-runner.js +124 -0
  44. package/dist/ai-service/judge/judge-eval-service-runner.js.map +1 -0
  45. package/dist/ai-service/judge/judge-executor.d.ts +65 -0
  46. package/dist/ai-service/judge/judge-executor.d.ts.map +1 -0
  47. package/dist/ai-service/judge/judge-executor.js +334 -0
  48. package/dist/ai-service/judge/judge-executor.js.map +1 -0
  49. package/dist/ai-service/judge/judge-service.d.ts +161 -0
  50. package/dist/ai-service/judge/judge-service.d.ts.map +1 -0
  51. package/dist/ai-service/judge/judge-service.js +241 -0
  52. package/dist/ai-service/judge/judge-service.js.map +1 -0
  53. package/dist/ai-service/judge/prompts/evaluation-criteria.d.ts +37 -0
  54. package/dist/ai-service/judge/prompts/evaluation-criteria.d.ts.map +1 -0
  55. package/dist/ai-service/judge/prompts/evaluation-criteria.js +283 -0
  56. package/dist/ai-service/judge/prompts/evaluation-criteria.js.map +1 -0
  57. package/dist/ai-service/judge/prompts/system-prompt.d.ts +30 -0
  58. package/dist/ai-service/judge/prompts/system-prompt.d.ts.map +1 -0
  59. package/dist/ai-service/judge/prompts/system-prompt.js +212 -0
  60. package/dist/ai-service/judge/prompts/system-prompt.js.map +1 -0
  61. package/dist/ai-service/judge/storage/csv-storage.d.ts +99 -0
  62. package/dist/ai-service/judge/storage/csv-storage.d.ts.map +1 -0
  63. package/dist/ai-service/judge/storage/csv-storage.js +274 -0
  64. package/dist/ai-service/judge/storage/csv-storage.js.map +1 -0
  65. package/dist/ai-service/judge/storage/index.d.ts +9 -0
  66. package/dist/ai-service/judge/storage/index.d.ts.map +1 -0
  67. package/dist/ai-service/judge/storage/index.js +7 -0
  68. package/dist/ai-service/judge/storage/index.js.map +1 -0
  69. package/dist/ai-service/judge/storage/interface.d.ts +51 -0
  70. package/dist/ai-service/judge/storage/interface.d.ts.map +1 -0
  71. package/dist/ai-service/judge/storage/interface.js +8 -0
  72. package/dist/ai-service/judge/storage/interface.js.map +1 -0
  73. package/dist/ai-service/judge/storage/types.d.ts +54 -0
  74. package/dist/ai-service/judge/storage/types.d.ts.map +1 -0
  75. package/dist/ai-service/judge/storage/types.js +7 -0
  76. package/dist/ai-service/judge/storage/types.js.map +1 -0
  77. package/dist/ai-service/judge/tools/index.d.ts +22 -0
  78. package/dist/ai-service/judge/tools/index.d.ts.map +1 -0
  79. package/dist/ai-service/judge/tools/index.js +29 -0
  80. package/dist/ai-service/judge/tools/index.js.map +1 -0
  81. package/dist/ai-service/judge/tools/playwright-action.d.ts +18 -0
  82. package/dist/ai-service/judge/tools/playwright-action.d.ts.map +1 -0
  83. package/dist/ai-service/judge/tools/playwright-action.js +171 -0
  84. package/dist/ai-service/judge/tools/playwright-action.js.map +1 -0
  85. package/dist/ai-service/judge/tools/submit-feedback.d.ts +41 -0
  86. package/dist/ai-service/judge/tools/submit-feedback.d.ts.map +1 -0
  87. package/dist/ai-service/judge/tools/submit-feedback.js +150 -0
  88. package/dist/ai-service/judge/tools/submit-feedback.js.map +1 -0
  89. package/dist/ai-service/judge/types.d.ts +169 -0
  90. package/dist/ai-service/judge/types.d.ts.map +1 -0
  91. package/dist/ai-service/judge/types.js +8 -0
  92. package/dist/ai-service/judge/types.js.map +1 -0
  93. package/dist/ai-service/llm/interaction/adapters/vercel.d.ts.map +1 -1
  94. package/dist/ai-service/llm/interaction/adapters/vercel.js.map +1 -1
  95. package/dist/ai-service/llm/interaction/provider.d.ts +10 -9
  96. package/dist/ai-service/llm/interaction/provider.d.ts.map +1 -1
  97. package/dist/ai-service/llmobs/middleware/stream-text.d.ts +8 -8
  98. package/dist/ai-service/llmobs/middleware/stream-text.d.ts.map +1 -1
  99. package/dist/ai-service/llmobs/middleware/stream-text.js.map +1 -1
  100. package/dist/ai-service/llmobs/tracer.d.ts.map +1 -1
  101. package/dist/ai-service/llmobs/tracer.js +2 -1
  102. package/dist/ai-service/llmobs/tracer.js.map +1 -1
  103. package/dist/ai-service/mcp/embedded-playwright-mcp-server.d.ts +53 -0
  104. package/dist/ai-service/mcp/embedded-playwright-mcp-server.d.ts.map +1 -0
  105. package/dist/ai-service/mcp/embedded-playwright-mcp-server.js +541 -0
  106. package/dist/ai-service/mcp/embedded-playwright-mcp-server.js.map +1 -0
  107. package/dist/ai-service/mcp/playwright-server.d.ts +114 -0
  108. package/dist/ai-service/mcp/playwright-server.d.ts.map +1 -0
  109. package/dist/ai-service/mcp/playwright-server.js +109 -0
  110. package/dist/ai-service/mcp/playwright-server.js.map +1 -0
  111. package/dist/server-rpc/client.js +1 -1
  112. package/dist/server-rpc/client.js.map +1 -1
  113. package/dist/socket-manager.d.ts.map +1 -1
  114. package/dist/socket-manager.js +8 -0
  115. package/dist/socket-manager.js.map +1 -1
  116. package/package.json +7 -6
@@ -0,0 +1,161 @@
1
+ /**
2
+ * Judge service for evaluating AI-generated applications.
3
+ *
4
+ * Encapsulates the complete judge evaluation workflow including:
5
+ * - LLM client access for executing evaluations
6
+ * - Storage for persisting evaluation results
7
+ * - MCP server lifecycle management
8
+ * - Executor orchestration
9
+ */
10
+ import type { JudgeEvaluation, EvaluationCriteria, JudgeConfig } from "./types.js";
11
+ import type { Logger } from "../../util/logger.js";
12
+ import type { LLMClient } from "../llm/client.js";
13
+ import type { LLMProvider } from "../llm/types.js";
14
+ import type { JudgeStorage, StoredEvaluation } from "./storage/index.js";
15
+ import type { PlaywrightMcpServerManager } from "../mcp/playwright-server.js";
16
+ import type { LLMProviderConfig } from "@superblocksteam/library-shared/types";
17
+ /**
18
+ * Options for judge evaluation.
19
+ */
20
+ export interface JudgeEvaluationOptions {
21
+ /** Application URL to evaluate; handed to the executor and to the Playwright MCP server when the service starts one */
22
+ appUrl: string;
23
+ /** JWT token for AI Gateway authentication; copied into the judge config and passed to the MCP server on start */
24
+ jwt?: string;
25
+ /** LLM configuration (provider, thinking opts, disabled tools) */
26
+ llmConfig?: LLMProviderConfig;
27
+ /** Optional websocket endpoint to reuse an existing Playwright browser */
28
+ connectWsEndpoint?: string;
29
+ /** Raw storage state data to seed Playwright context */
30
+ storageStateData?: any;
31
+ /** Session storage entries to seed for the app origin */
32
+ sessionStorageData?: {
33
+ origin: string;
34
+ items: Array<{
35
+ name: string;
36
+ value: string;
37
+ }>;
38
+ };
39
+ /** Additional origins storage to seed (e.g., auth domain local/session storage) */
40
+ extraOrigins?: Array<{
41
+ origin: string;
42
+ localStorage: Array<{
43
+ name: string;
44
+ value: string;
45
+ }>;
46
+ sessionStorage?: Array<{
47
+ name: string;
48
+ value: string;
49
+ }>;
50
+ }>;
51
+ /** Optional evaluation criteria; when omitted they are built from the prompt text and `complexity` */
52
+ criteria?: EvaluationCriteria;
53
+ /** Optional judge configuration overrides; `playwrightMcpUrl` and `jwt` are set by the service afterwards and win over values given here */
54
+ config?: Partial<JudgeConfig>;
55
+ /** Complexity level used when auto-generating criteria (treated as "medium" when omitted). NOTE(review): judge-service.js requests the fixed "generateBroadEdit" model task regardless of this value */
56
+ complexity?: "low" | "medium" | "high";
57
+ /** Optional path to Playwright storage state for authentication */
58
+ storageStatePath?: string;
59
+ /** Whether the Playwright MCP browser should run headless (the service defaults this to true when it starts the server) */
60
+ playwrightHeadless?: boolean;
61
+ /** Browser engine to use for Playwright MCP (the service defaults this to "chromium" when it starts the server) */
62
+ playwrightBrowser?: "chromium" | "firefox" | "webkit";
63
+ /** External Playwright MCP server URL to reuse; when set, the service does not start or look up its own server */
64
+ playwrightServerUrl?: string;
65
+ }
66
+ /**
67
+ * Result of a judge evaluation with metadata.
68
+ */
69
+ export interface JudgeEvaluationResult extends JudgeEvaluation {
70
+ /** Prompt ID that was evaluated (the `promptId` argument given to `evaluateApplication`) */
71
+ promptId: string;
72
+ /** Application ID (the `appId` argument given to `evaluateApplication`) */
73
+ appId: string;
74
+ }
75
+ /**
76
+ * Judge service for orchestrating application evaluations.
77
+ *
78
+ * Provides a high-level API for running judge evaluations
79
+ * and managing their lifecycle.
80
+ */
81
+ export declare class JudgeService {
82
+ private llmClient;
83
+ private llmProvider;
84
+ private storage;
85
+ private mcpServerManager;
86
+ private logger;
87
+ constructor(config: {
88
+ llmClient: LLMClient;
89
+ llmProvider: LLMProvider;
90
+ storage: JudgeStorage;
91
+ mcpServerManager: PlaywrightMcpServerManager;
92
+ logger: Logger;
93
+ });
94
+ /**
95
+ * Evaluates an application against a prompt.
96
+ *
97
+ * This is the main entry point for judge evaluations. It:
98
+ * 1. Ensures MCP server is running (skipped when `options.playwrightServerUrl` is set)
99
+ * 2. Generates or uses provided criteria
100
+ * 3. Executes the evaluation via JudgeExecutor
101
+ * 4. Stores the result together with a timestamp and the current git branch and commit SHA
102
+ * 5. Returns the evaluation with `promptId` and `appId` added
103
+ *
104
+ * @param promptId - Unique identifier for the prompt
105
+ * @param prompt - The prompt text that was used to generate the app
106
+ * @param appId - Application identifier
107
+ * @param options - Evaluation options
108
+ * @returns Complete evaluation result
109
+ * @throws Rethrows any error raised while evaluating, after logging it
110
+ */
111
+ evaluateApplication(promptId: string, prompt: string, appId: string, options: JudgeEvaluationOptions): Promise<JudgeEvaluationResult>;
112
+ /**
113
+ * Retrieves stored evaluations.
114
+ *
115
+ * @param promptId - Optional prompt ID to filter by; when omitted, all stored evaluations are requested
116
+ * @returns Array of stored evaluations
117
+ */
118
+ getEvaluations(promptId?: string): Promise<StoredEvaluation[]>;
119
+ /**
120
+ * Retrieves evaluations for a specific branch.
121
+ *
122
+ * @param branchName - Git branch name
123
+ * @returns Array of stored evaluations for the branch
124
+ */
125
+ getEvaluationsByBranch(branchName: string): Promise<StoredEvaluation[]>;
126
+ /**
127
+ * Ensures the MCP server is running.
128
+ *
129
+ * Starts the server if not already running; otherwise returns the running server's URL.
130
+ *
131
+ * @param storageStatePath - Optional path to Playwright storage state for authentication
132
+ * @returns MCP server URL (throws if the server reports running but has no URL)
133
+ */
134
+ private ensureMcpServer;
135
+ /**
136
+ * Gets the current git branch name.
137
+ *
138
+ * @returns Branch name or 'unknown' if git is not available
139
+ */
140
+ private getCurrentBranch;
141
+ /**
142
+ * Gets the current git commit SHA.
143
+ *
144
+ * @returns Commit SHA or 'unknown' if git is not available
145
+ */
146
+ private getCurrentCommitSha;
147
+ /**
148
+ * Maps a complexity level to an LLM task name.
149
+ *
150
+ * @param complexity - Task complexity level
151
+ * @returns "explain" for low, "generateFullAppGen" for high, "generateBroadEdit" otherwise
152
+ */
153
+ private getModelTaskForComplexity;
154
+ /**
155
+ * Cleans up resources used by the judge service.
156
+ *
157
+ * Stops the MCP server if running.
158
+ */
159
+ dispose(): Promise<void>;
160
+ }
161
+ //# sourceMappingURL=judge-service.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"judge-service.d.ts","sourceRoot":"","sources":["../../../src/ai-service/judge/judge-service.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAIH,OAAO,KAAK,EACV,eAAe,EACf,kBAAkB,EAClB,WAAW,EACZ,MAAM,YAAY,CAAC;AACpB,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,sBAAsB,CAAC;AACnD,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAClD,OAAO,KAAK,EAAE,WAAW,EAAW,MAAM,iBAAiB,CAAC;AAC5D,OAAO,KAAK,EAAE,YAAY,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAC;AACzE,OAAO,KAAK,EAAE,0BAA0B,EAAE,MAAM,6BAA6B,CAAC;AAC9E,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,uCAAuC,CAAC;AAE/E;;GAEG;AACH,MAAM,WAAW,sBAAsB;IACrC,kCAAkC;IAClC,MAAM,EAAE,MAAM,CAAC;IAEf,8CAA8C;IAC9C,GAAG,CAAC,EAAE,MAAM,CAAC;IAEb,kEAAkE;IAClE,SAAS,CAAC,EAAE,iBAAiB,CAAC;IAE9B,0EAA0E;IAC1E,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAE3B,wDAAwD;IACxD,gBAAgB,CAAC,EAAE,GAAG,CAAC;IAEvB,yDAAyD;IACzD,kBAAkB,CAAC,EAAE;QACnB,MAAM,EAAE,MAAM,CAAC;QACf,KAAK,EAAE,KAAK,CAAC;YAAE,IAAI,EAAE,MAAM,CAAC;YAAC,KAAK,EAAE,MAAM,CAAA;SAAE,CAAC,CAAC;KAC/C,CAAC;IAEF,mFAAmF;IACnF,YAAY,CAAC,EAAE,KAAK,CAAC;QACnB,MAAM,EAAE,MAAM,CAAC;QACf,YAAY,EAAE,KAAK,CAAC;YAAE,IAAI,EAAE,MAAM,CAAC;YAAC,KAAK,EAAE,MAAM,CAAA;SAAE,CAAC,CAAC;QACrD,cAAc,CAAC,EAAE,KAAK,CAAC;YAAE,IAAI,EAAE,MAAM,CAAC;YAAC,KAAK,EAAE,MAAM,CAAA;SAAE,CAAC,CAAC;KACzD,CAAC,CAAC;IAEH,oEAAoE;IACpE,QAAQ,CAAC,EAAE,kBAAkB,CAAC;IAE9B,6CAA6C;IAC7C,MAAM,CAAC,EAAE,OAAO,CAAC,WAAW,CAAC,CAAC;IAE9B,2CAA2C;IAC3C,UAAU,CAAC,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;IAEvC,mEAAmE;IACnE,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAE1B,6DAA6D;IAC7D,kBAAkB,CAAC,EAAE,OAAO,CAAC;IAE7B,+CAA+C;IAC/C,iBAAiB,CAAC,EAAE,UAAU,GAAG,SAAS,GAAG,QAAQ,CAAC;IAEtD,4EAA4E;IAC5E,mBAAmB,CAAC,EAAE,MAAM,CAAC;CAC9B;AAED;;GAEG;AACH,MAAM,WAAW,qBAAsB,SAAQ,eAAe;IAC5D,mCAAmC;IACnC,QAAQ,EAAE,MAAM,CAAC;IAEjB,qBAAqB;IACrB,KAAK,EAAE,MAAM,CAAC;CACf;AAED;;;;;GAKG;AACH,qBAAa,YAAY;IACvB,OAAO,CAAC,SAAS,CAAY;IAC7B,OAAO,CAAC,WAAW,CAAc;IACjC,OAAO,CAAC,OAAO,CAAe;IAC9B,OAAO,CAAC,gBAAgB,CAA6B;IACrD,OAAO,CAAC,MAAM,CAAS;gBAEX,MAAM,EAAE;QAClB,SAAS,EAAE,SAAS,CAAC;QACrB,WAAW,EAAE,WAAW,CAAC;QACzB,OAAO,EAAE,YAAY,CAAC;QACtB,gBAAgB,EAAE,0B
AA0B,CAAC;QAC7C,MAAM,EAAE,MAAM,CAAC;KAChB;IAQD;;;;;;;;;;;;;;;;OAgBG;IACG,mBAAmB,CACvB,QAAQ,EAAE,MAAM,EAChB,MAAM,EAAE,MAAM,EACd,KAAK,EAAE,MAAM,EACb,OAAO,EAAE,sBAAsB,GAC9B,OAAO,CAAC,qBAAqB,CAAC;IA0GjC;;;;;OAKG;IACG,cAAc,CAAC,QAAQ,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,EAAE,CAAC;IAOpE;;;;;OAKG;IACG,sBAAsB,CAC1B,UAAU,EAAE,MAAM,GACjB,OAAO,CAAC,gBAAgB,EAAE,CAAC;IAI9B;;;;;;;OAOG;YACW,eAAe;IA8C7B;;;;OAIG;YACW,gBAAgB;IAa9B;;;;OAIG;YACW,mBAAmB;IAajC;;;;;OAKG;IACH,OAAO,CAAC,yBAAyB;IAkBjC;;;;OAIG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAO/B"}
@@ -0,0 +1,241 @@
1
+ /**
2
+ * Judge service for evaluating AI-generated applications.
3
+ *
4
+ * Encapsulates the complete judge evaluation workflow including:
5
+ * - LLM client access for executing evaluations
6
+ * - Storage for persisting evaluation results
7
+ * - MCP server lifecycle management
8
+ * - Executor orchestration
9
+ */
10
+ import { JudgeExecutor } from "./judge-executor.js";
11
+ import { buildCriteriaFromPrompts } from "./prompts/system-prompt.js";
12
+ /**
13
+ * Judge service for orchestrating application evaluations.
14
+ *
15
+ * Provides a high-level API for running judge evaluations
16
+ * and managing their lifecycle.
17
+ */
18
export class JudgeService {
    llmClient;
    llmProvider;
    storage;
    mcpServerManager;
    logger;
    /**
     * @param config - Collaborators used by the service: `llmClient`,
     *   `llmProvider`, `storage`, `mcpServerManager` and `logger`.
     */
    constructor(config) {
        this.llmClient = config.llmClient;
        this.llmProvider = config.llmProvider;
        this.storage = config.storage;
        this.mcpServerManager = config.mcpServerManager;
        this.logger = config.logger;
    }
    /**
     * Evaluates an application against a prompt.
     *
     * Steps:
     * 1. Resolves the Playwright MCP URL: `options.playwrightServerUrl` if
     *    given, otherwise the managed server (started on demand).
     * 2. Uses `options.criteria`, or builds criteria from the prompt text.
     * 3. Runs the evaluation through a JudgeExecutor.
     * 4. Stores the result with a timestamp and git branch/commit metadata.
     * 5. Returns the evaluation with `promptId` and `appId` added.
     *
     * @param promptId - Unique identifier for the prompt
     * @param prompt - The prompt text that was used to generate the app
     * @param appId - Application identifier
     * @param options - Evaluation options
     * @returns Complete evaluation result
     * @throws Rethrows any error raised while evaluating, after logging it
     */
    async evaluateApplication(promptId, prompt, appId, options) {
        this.logger.info(`Starting judge evaluation: promptId=${promptId}, appId=${appId}, appUrl=${options.appUrl}`);
        try {
            let mcpUrl = options.playwrightServerUrl;
            if (!mcpUrl) {
                // Forward every seeding option, including session storage and
                // extra origins, so the managed browser gets the caller's state.
                mcpUrl = await this.ensureMcpServer(options.storageStatePath, options.playwrightBrowser, options.playwrightHeadless, options.jwt, options.connectWsEndpoint, options.storageStateData, options.appUrl, options.sessionStorageData, options.extraOrigins);
            }
            // Generate criteria if not provided
            const criteria = options.criteria ||
                buildCriteriaFromPrompts([prompt], options.complexity || "medium");
            // The judge model is fixed to the "generateBroadEdit" task here;
            // options.complexity only affects criteria generation above.
            const model = this.llmProvider.modelForTask("generateBroadEdit");
            // Caller overrides first, then the values the service owns.
            const judgeConfig = {
                ...options.config,
                playwrightMcpUrl: mcpUrl,
                jwt: options.jwt,
            };
            const executor = new JudgeExecutor(this.llmClient, model, this.logger, judgeConfig);
            await executor.initialize();
            try {
                // JudgeExecutor expects a simulation result, so wrap the prompt
                // in a minimal one that is assumed to have succeeded.
                const simulationResult = {
                    promptId,
                    success: true,
                    stepResults: [{ prompt }],
                    tokens: { input: 0, output: 0, total: 0 },
                    duration: 0,
                };
                const evaluation = await executor.evaluateSimulation(simulationResult, criteria, options.appUrl);
                // Git metadata lets stored evaluations be tracked per branch.
                const branchName = await this.getCurrentBranch();
                const commitSha = await this.getCurrentCommitSha();
                const storedEvaluation = {
                    timestamp: new Date().toISOString(),
                    promptId,
                    branchName,
                    commitSha,
                    appId,
                    prompt,
                    evaluation,
                };
                await this.storage.saveEvaluation(storedEvaluation);
                this.logger.info(`Judge evaluation completed and stored: promptId=${promptId}, appId=${appId}, score=${evaluation.overallScore}, passed=${evaluation.passed}`);
                return {
                    ...evaluation,
                    promptId,
                    appId,
                };
            }
            finally {
                // Always release the executor, even when evaluation or storage fails.
                await executor.cleanup();
            }
        }
        catch (error) {
            this.logger.error(`Judge evaluation failed: ${String(error)}`, {
                error: {
                    kind: "JudgeEvaluationError",
                    message: String(error),
                    stack: error instanceof Error ? error.stack : undefined,
                },
            });
            throw error;
        }
    }
    /**
     * Retrieves stored evaluations.
     *
     * @param promptId - Optional prompt ID to filter by; when omitted, all
     *   stored evaluations are requested
     * @returns Array of stored evaluations
     */
    async getEvaluations(promptId) {
        if (promptId) {
            return this.storage.getEvaluationsByPrompt(promptId);
        }
        return this.storage.getEvaluations();
    }
    /**
     * Retrieves evaluations for a specific branch.
     *
     * @param branchName - Git branch name
     * @returns Array of stored evaluations for the branch
     */
    async getEvaluationsByBranch(branchName) {
        return this.storage.getEvaluationsByBranch(branchName);
    }
    /**
     * Ensures the MCP server is running.
     *
     * Starts the managed server with the given options when it is not
     * running. When it is already running, the arguments are not applied and
     * the existing URL is returned.
     *
     * @param storageStatePath - Optional path to Playwright storage state
     * @param browser - Browser engine (defaults to "chromium")
     * @param headless - Whether to run headless (defaults to true)
     * @param jwt - Optional JWT passed to the server on start
     * @param connectWsEndpoint - Optional websocket endpoint of an existing browser
     * @param storageStateData - Optional raw storage state
     * @param appUrl - Optional application URL
     * @param sessionStorageData - Optional session storage seed
     * @param extraOrigins - Optional storage seeds for additional origins
     * @returns MCP server URL
     * @throws Error if the server reports running but exposes no URL
     */
    async ensureMcpServer(storageStatePath, browser = "chromium", headless = true, jwt, connectWsEndpoint, storageStateData, appUrl, sessionStorageData, extraOrigins) {
        if (!this.mcpServerManager.isRunning()) {
            this.logger.info(`Starting Playwright MCP server for judge`, storageStatePath
                ? `with storage state: ${storageStatePath}`
                : "without storage state");
            return await this.mcpServerManager.start({
                browser,
                headless,
                storageStatePath,
                jwt,
                connectWsEndpoint,
                storageStateData,
                appUrl,
                sessionStorageData,
                extraOrigins,
            });
        }
        const url = this.mcpServerManager.getUrl();
        if (!url) {
            throw new Error("MCP server is running but URL is not available");
        }
        return url;
    }
    /**
     * Runs a git command in the current working directory and returns its
     * trimmed output. This is best-effort metadata: any failure (git missing,
     * not a repository) yields "unknown", and git's stderr is discarded so a
     * failed lookup does not print to the host process.
     *
     * @param command - Full git command line to execute
     * @returns Trimmed stdout, or "unknown" on failure or empty output
     */
    async readGitValue(command) {
        try {
            const { execSync } = await import("child_process");
            const output = execSync(command, {
                encoding: "utf-8",
                cwd: process.cwd(),
                stdio: ["ignore", "pipe", "ignore"],
            }).trim();
            return output || "unknown";
        }
        catch {
            return "unknown";
        }
    }
    /**
     * Gets the current git branch name.
     *
     * @returns Branch name or 'unknown' if git is not available
     */
    async getCurrentBranch() {
        return this.readGitValue("git rev-parse --abbrev-ref HEAD");
    }
    /**
     * Gets the current git commit SHA.
     *
     * @returns Commit SHA or 'unknown' if git is not available
     */
    async getCurrentCommitSha() {
        return this.readGitValue("git rev-parse HEAD");
    }
    /**
     * Maps a complexity level to an LLM task name.
     *
     * Not called by evaluateApplication, which requests a fixed task.
     *
     * @param complexity - Task complexity level
     * @returns "explain" for low, "generateFullAppGen" for high,
     *   "generateBroadEdit" for medium or anything else
     */
    getModelTaskForComplexity(complexity) {
        switch (complexity) {
            case "low":
                // Fast evaluation for simple tasks - use explain task (lighter model)
                return "explain";
            case "high":
                // Thorough evaluation for complex tasks - use full app gen task
                return "generateFullAppGen";
            case "medium":
            default:
                // Balanced evaluation - use broad edit task
                return "generateBroadEdit";
        }
    }
    /**
     * Cleans up resources used by the judge service.
     *
     * Stops the MCP server if running.
     */
    async dispose() {
        this.logger.info("Disposing judge service");
        if (this.mcpServerManager.isRunning()) {
            await this.mcpServerManager.stop();
        }
    }
}
241
+ //# sourceMappingURL=judge-service.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"judge-service.js","sourceRoot":"","sources":["../../../src/ai-service/judge/judge-service.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,wBAAwB,EAAE,MAAM,4BAA4B,CAAC;AA8EtE;;;;;GAKG;AACH,MAAM,OAAO,YAAY;IACf,SAAS,CAAY;IACrB,WAAW,CAAc;IACzB,OAAO,CAAe;IACtB,gBAAgB,CAA6B;IAC7C,MAAM,CAAS;IAEvB,YAAY,MAMX;QACC,IAAI,CAAC,SAAS,GAAG,MAAM,CAAC,SAAS,CAAC;QAClC,IAAI,CAAC,WAAW,GAAG,MAAM,CAAC,WAAW,CAAC;QACtC,IAAI,CAAC,OAAO,GAAG,MAAM,CAAC,OAAO,CAAC;QAC9B,IAAI,CAAC,gBAAgB,GAAG,MAAM,CAAC,gBAAgB,CAAC;QAChD,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC;IAC9B,CAAC;IAED;;;;;;;;;;;;;;;;OAgBG;IACH,KAAK,CAAC,mBAAmB,CACvB,QAAgB,EAChB,MAAc,EACd,KAAa,EACb,OAA+B;QAE/B,IAAI,CAAC,MAAM,CAAC,IAAI,CACd,uCAAuC,QAAQ,WAAW,KAAK,YAAY,OAAO,CAAC,MAAM,EAAE,CAC5F,CAAC;QAEF,IAAI,CAAC;YACH,MAAM,kBAAkB,GAAG,CAAC,CAAC,OAAO,CAAC,mBAAmB,CAAC;YACzD,IAAI,MAAM,GAAG,OAAO,CAAC,mBAAmB,CAAC;YACzC,IAAI,CAAC,MAAM,EAAE,CAAC;gBACZ,MAAM,GAAG,MAAM,IAAI,CAAC,eAAe,CACjC,OAAO,CAAC,gBAAgB,EACxB,OAAO,CAAC,iBAAiB,EACzB,OAAO,CAAC,kBAAkB,EAC1B,OAAO,CAAC,GAAG,EACX,OAAO,CAAC,iBAAiB,EACzB,OAAO,CAAC,gBAAgB,EACxB,OAAO,CAAC,MAAM,CACf,CAAC;YACJ,CAAC;YAED,oCAAoC;YACpC,MAAM,QAAQ,GACZ,OAAO,CAAC,QAAQ;gBAChB,wBAAwB,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,CAAC,UAAU,IAAI,QAAQ,CAAC,CAAC;YAErE,iFAAiF;YACjF,MAAM,KAAK,GAAG,IAAI,CAAC,WAAW,CAAC,YAAY,CAAC,mBAAmB,CAAC,CAAC;YAEjE,qBAAqB;YACrB,MAAM,WAAW,GAAyB;gBACxC,GAAG,OAAO,CAAC,MAAM;gBACjB,gBAAgB,EAAE,MAAM;gBACxB,GAAG,EAAE,OAAO,CAAC,GAAG;aACjB,CAAC;YAEF,kBAAkB;YAClB,MAAM,QAAQ,GAAG,IAAI,aAAa,CAChC,IAAI,CAAC,SAAS,EACd,KAAK,EACL,IAAI,CAAC,MAAM,EACX,WAAW,CACZ,CAAC;YAEF,sBAAsB;YACtB,MAAM,QAAQ,CAAC,UAAU,EAAE,CAAC;YAE5B,IAAI,CAAC;gBACH,qBAAqB;gBACrB,sEAAsE;gBACtE,MAAM,gBAAgB,GAAG;oBACvB,QAAQ;oBACR,OAAO,EAAE,IAAI,EAAE,yDAAyD;oBACxE,WAAW,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC;oBACzB,MAAM,EAAE,EAAE,KAAK,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE;oBACzC,QAAQ,EAAE,CAAC;iBACZ,CAAC;gBAEF,MAAM,UAAU,GAAG,MAAM,QAAQ,CAAC,kBAAkB,CAClD,gBAAgB,EAChB,QAAQ,EACR,O
AAO,CAAC,MAAM,CACf,CAAC;gBAEF,wCAAwC;gBACxC,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,gBAAgB,EAAE,CAAC;gBACjD,MAAM,SAAS,GAAG,MAAM,IAAI,CAAC,mBAAmB,EAAE,CAAC;gBAEnD,mBAAmB;gBACnB,MAAM,gBAAgB,GAAqB;oBACzC,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;oBACnC,QAAQ;oBACR,UAAU;oBACV,SAAS;oBACT,KAAK;oBACL,MAAM;oBACN,UAAU;iBACX,CAAC;gBAEF,MAAM,IAAI,CAAC,OAAO,CAAC,cAAc,CAAC,gBAAgB,CAAC,CAAC;gBAEpD,IAAI,CAAC,MAAM,CAAC,IAAI,CACd,mDAAmD,QAAQ,WAAW,KAAK,WAAW,UAAU,CAAC,YAAY,YAAY,UAAU,CAAC,MAAM,EAAE,CAC7I,CAAC;gBAEF,8BAA8B;gBAC9B,OAAO;oBACL,GAAG,UAAU;oBACb,QAAQ;oBACR,KAAK;iBACN,CAAC;YACJ,CAAC;oBAAS,CAAC;gBACT,0BAA0B;gBAC1B,MAAM,QAAQ,CAAC,OAAO,EAAE,CAAC;YAC3B,CAAC;QACH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,4BAA4B,MAAM,CAAC,KAAK,CAAC,EAAE,EAAE;gBAC7D,KAAK,EAAE;oBACL,IAAI,EAAE,sBAAsB;oBAC5B,OAAO,EAAE,MAAM,CAAC,KAAK,CAAC;oBACtB,KAAK,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;iBACxD;aACF,CAAC,CAAC;YACH,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;IAED;;;;;OAKG;IACH,KAAK,CAAC,cAAc,CAAC,QAAiB;QACpC,IAAI,QAAQ,EAAE,CAAC;YACb,OAAO,IAAI,CAAC,OAAO,CAAC,sBAAsB,CAAC,QAAQ,CAAC,CAAC;QACvD,CAAC;QACD,OAAO,IAAI,CAAC,OAAO,CAAC,cAAc,EAAE,CAAC;IACvC,CAAC;IAED;;;;;OAKG;IACH,KAAK,CAAC,sBAAsB,CAC1B,UAAkB;QAElB,OAAO,IAAI,CAAC,OAAO,CAAC,sBAAsB,CAAC,UAAU,CAAC,CAAC;IACzD,CAAC;IAED;;;;;;;OAOG;IACK,KAAK,CAAC,eAAe,CAC3B,gBAAyB,EACzB,UAA6C,UAAU,EACvD,WAAoB,IAAI,EACxB,GAAY,EACZ,iBAA0B,EAC1B,gBAAsB,EACtB,MAAe,EACf,kBAGC,EACD,YAIE;QAEF,IAAI,CAAC,IAAI,CAAC,gBAAgB,CAAC,SAAS,EAAE,EAAE,CAAC;YACvC,IAAI,CAAC,MAAM,CAAC,IAAI,CACd,0CAA0C,EAC1C,gBAAgB;gBACd,CAAC,CAAC,uBAAuB,gBAAgB,EAAE;gBAC3C,CAAC,CAAC,uBAAuB,CAC5B,CAAC;YACF,OAAO,MAAM,IAAI,CAAC,gBAAgB,CAAC,KAAK,CAAC;gBACvC,OAAO;gBACP,QAAQ;gBACR,gBAAgB;gBAChB,GAAG;gBACH,iBAAiB;gBACjB,gBAAgB;gBAChB,MAAM;gBACN,kBAAkB;gBAClB,YAAY;aACb,CAAC,CAAC;QACL,CAAC;QAED,MAAM,GAAG,GAAG,IAAI,CAAC,gBAAgB,CAAC,MAAM,EAAE,CAAC;QAC3C,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,MAAM,IAAI,KAAK,CAAC,gDAAgD,CAAC,CAAC;QACpE,CAAC;QAED,OAAO,GAAG,CAAC;IACb,CAAC;IAED;;;;OAIG;IACK,KAAK,CAAC,gBAAg
B;QAC5B,IAAI,CAAC;YACH,MAAM,EAAE,QAAQ,EAAE,GAAG,MAAM,MAAM,CAAC,eAAe,CAAC,CAAC;YACnD,MAAM,MAAM,GAAG,QAAQ,CAAC,iCAAiC,EAAE;gBACzD,QAAQ,EAAE,OAAO;gBACjB,GAAG,EAAE,OAAO,CAAC,GAAG,EAAE;aACnB,CAAC,CAAC,IAAI,EAAE,CAAC;YACV,OAAO,MAAM,CAAC;QAChB,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,SAAS,CAAC;QACnB,CAAC;IACH,CAAC;IAED;;;;OAIG;IACK,KAAK,CAAC,mBAAmB;QAC/B,IAAI,CAAC;YACH,MAAM,EAAE,QAAQ,EAAE,GAAG,MAAM,MAAM,CAAC,eAAe,CAAC,CAAC;YACnD,MAAM,GAAG,GAAG,QAAQ,CAAC,oBAAoB,EAAE;gBACzC,QAAQ,EAAE,OAAO;gBACjB,GAAG,EAAE,OAAO,CAAC,GAAG,EAAE;aACnB,CAAC,CAAC,IAAI,EAAE,CAAC;YACV,OAAO,GAAG,CAAC;QACb,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,SAAS,CAAC;QACnB,CAAC;IACH,CAAC;IAED;;;;;OAKG;IACK,yBAAyB,CAC/B,UAAqC;QAErC,QAAQ,UAAU,EAAE,CAAC;YACnB,KAAK,KAAK;gBACR,sEAAsE;gBACtE,OAAO,SAAS,CAAC;YACnB,KAAK,QAAQ;gBACX,4CAA4C;gBAC5C,OAAO,mBAAmB,CAAC;YAC7B,KAAK,MAAM;gBACT,gEAAgE;gBAChE,OAAO,oBAAoB,CAAC;YAC9B;gBACE,OAAO,mBAAmB,CAAC;QAC/B,CAAC;IACH,CAAC;IAED;;;;OAIG;IACH,KAAK,CAAC,OAAO;QACX,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,yBAAyB,CAAC,CAAC;QAE5C,IAAI,IAAI,CAAC,gBAAgB,CAAC,SAAS,EAAE,EAAE,CAAC;YACtC,MAAM,IAAI,CAAC,gBAAgB,CAAC,IAAI,EAAE,CAAC;QACrC,CAAC;IACH,CAAC;CACF"}
@@ -0,0 +1,37 @@
1
+ /**
2
+ * Evaluation criteria builder for judge assessments.
3
+ *
4
+ * Provides utilities for creating structured evaluation criteria
5
+ * based on simulation prompts and complexity levels.
6
+ */
7
+ import type { EvaluationCriteria } from "../types.js";
8
+ /**
9
+ * Simulation prompt interface matching the test infrastructure. Declared without `export`, so it is private to this module.
10
+ */
11
+ interface SimulationPrompt {
12
+ id: string;
13
+ name: string;
14
+ description: string;
15
+ prompts: string[];
16
+ complexity: "low" | "medium" | "high";
17
+ }
18
+ /**
19
+ * Builds evaluation criteria from a single simulation prompt object.
20
+ *
21
+ * Analyzes the prompt structure and content to generate
22
+ * appropriate evaluation criteria for the judge. Distinct from `buildCriteriaFromPrompts` (plural), which judge-service.js imports from system-prompt.js.
23
+ *
24
+ * @param prompt - Simulation prompt object (id, name, description, prompts, complexity)
25
+ * @returns Structured evaluation criteria
26
+ */
27
+ export declare function buildCriteriaFromPrompt(prompt: SimulationPrompt): EvaluationCriteria;
28
+ /**
29
+ * Creates minimal criteria for basic testing.
30
+ *
31
+ * Used when specific criteria cannot be extracted.
32
+ *
33
+ * @returns Minimal evaluation criteria
34
+ */
35
+ export declare function createMinimalCriteria(): EvaluationCriteria;
36
+ export {};
37
+ //# sourceMappingURL=evaluation-criteria.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"evaluation-criteria.d.ts","sourceRoot":"","sources":["../../../../src/ai-service/judge/prompts/evaluation-criteria.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AAEtD;;GAEG;AACH,UAAU,gBAAgB;IACxB,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,CAAC;IACpB,OAAO,EAAE,MAAM,EAAE,CAAC;IAClB,UAAU,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;CACvC;AAED;;;;;;;;GAQG;AACH,wBAAgB,uBAAuB,CACrC,MAAM,EAAE,gBAAgB,GACvB,kBAAkB,CAwCpB;AAkTD;;;;;;GAMG;AACH,wBAAgB,qBAAqB,IAAI,kBAAkB,CAS1D"}