task-o-matic-core 0.1.4 → 0.1.5-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +946 -222
- package/dist/index.d.ts +1 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -3
- package/dist/lib/ai-service/prd-operations.d.ts.map +1 -1
- package/dist/lib/ai-service/prd-operations.js +3 -36
- package/dist/lib/benchmark/executor.d.ts +93 -0
- package/dist/lib/benchmark/executor.d.ts.map +1 -0
- package/dist/lib/benchmark/executor.js +395 -0
- package/dist/lib/benchmark/index.d.ts +16 -0
- package/dist/lib/benchmark/index.d.ts.map +1 -0
- package/dist/lib/benchmark/index.js +36 -0
- package/dist/lib/benchmark/metrics-collector.d.ts +84 -0
- package/dist/lib/benchmark/metrics-collector.d.ts.map +1 -0
- package/dist/lib/benchmark/metrics-collector.js +297 -0
- package/dist/lib/benchmark/operations/index.d.ts +70 -0
- package/dist/lib/benchmark/operations/index.d.ts.map +1 -0
- package/dist/lib/benchmark/operations/index.js +298 -0
- package/dist/lib/benchmark/orchestrator.d.ts +88 -0
- package/dist/lib/benchmark/orchestrator.d.ts.map +1 -0
- package/dist/lib/benchmark/orchestrator.js +337 -0
- package/dist/lib/benchmark/store.d.ts +140 -0
- package/dist/lib/benchmark/store.d.ts.map +1 -0
- package/dist/lib/benchmark/store.js +417 -0
- package/dist/lib/benchmark/types.d.ts +243 -60
- package/dist/lib/benchmark/types.d.ts.map +1 -1
- package/dist/lib/benchmark/types.js +7 -0
- package/dist/lib/benchmark/worktree-manager.d.ts +127 -0
- package/dist/lib/benchmark/worktree-manager.d.ts.map +1 -0
- package/dist/lib/benchmark/worktree-manager.js +325 -0
- package/dist/lib/benchmark/worktree-pool.d.ts +97 -0
- package/dist/lib/benchmark/worktree-pool.d.ts.map +1 -0
- package/dist/lib/benchmark/worktree-pool.js +198 -0
- package/dist/lib/executors/opencode-executor.js +5 -5
- package/dist/lib/index.d.ts +0 -5
- package/dist/lib/index.d.ts.map +1 -1
- package/dist/lib/index.js +1 -7
- package/dist/lib/task-execution-core.js +17 -1
- package/dist/lib/task-review.d.ts +7 -0
- package/dist/lib/task-review.d.ts.map +1 -1
- package/dist/lib/task-review.js +30 -10
- package/dist/services/prd.d.ts.map +1 -1
- package/dist/services/prd.js +20 -44
- package/dist/services/tasks.d.ts.map +1 -1
- package/dist/services/tasks.js +12 -54
- package/dist/test/benchmark/metrics.test.d.ts +7 -0
- package/dist/test/benchmark/metrics.test.d.ts.map +1 -0
- package/dist/test/benchmark/metrics.test.js +267 -0
- package/dist/test/benchmark/orchestrator.test.d.ts +12 -0
- package/dist/test/benchmark/orchestrator.test.d.ts.map +1 -0
- package/dist/test/benchmark/orchestrator.test.js +316 -0
- package/dist/test/benchmark/store.test.d.ts +7 -0
- package/dist/test/benchmark/store.test.d.ts.map +1 -0
- package/dist/test/benchmark/store.test.js +356 -0
- package/dist/test/benchmark/worktree.test.d.ts +7 -0
- package/dist/test/benchmark/worktree.test.d.ts.map +1 -0
- package/dist/test/benchmark/worktree.test.js +347 -0
- package/dist/test/lib/task-review.test.d.ts +2 -0
- package/dist/test/lib/task-review.test.d.ts.map +1 -0
- package/dist/test/lib/task-review.test.js +178 -0
- package/dist/test/services/task-service.test.js +31 -8
- package/package.json +2 -2
- package/dist/lib/benchmark/registry.d.ts +0 -11
- package/dist/lib/benchmark/registry.d.ts.map +0 -1
- package/dist/lib/benchmark/registry.js +0 -212
- package/dist/lib/benchmark/runner.d.ts +0 -6
- package/dist/lib/benchmark/runner.d.ts.map +0 -1
- package/dist/lib/benchmark/runner.js +0 -150
- package/dist/lib/benchmark/storage.d.ts +0 -13
- package/dist/lib/benchmark/storage.d.ts.map +0 -1
- package/dist/lib/benchmark/storage.js +0 -100
- package/dist/services/benchmark.d.ts +0 -26
- package/dist/services/benchmark.d.ts.map +0 -1
- package/dist/services/benchmark.js +0 -343
- package/dist/services/workflow-benchmark.d.ts +0 -34
- package/dist/services/workflow-benchmark.d.ts.map +0 -1
- package/dist/services/workflow-benchmark.js +0 -318
package/dist/index.d.ts
CHANGED
|
@@ -15,10 +15,8 @@ export * from "./lib/task-loop-execution";
|
|
|
15
15
|
export * from "./services/tasks";
|
|
16
16
|
export * from "./services/workflow";
|
|
17
17
|
export * from "./services/prd";
|
|
18
|
-
export * from "./services/benchmark";
|
|
19
18
|
export * from "./services/project-analysis";
|
|
20
|
-
export * from "./lib/benchmark
|
|
21
|
-
export * from "./lib/benchmark/types";
|
|
19
|
+
export * from "./lib/benchmark";
|
|
22
20
|
export * from "./utils/ai-service-factory";
|
|
23
21
|
export * from "./utils/task-o-matic-error";
|
|
24
22
|
export * from "./utils/stack-detector";
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,SAAS,CAAC;AACxB,cAAc,cAAc,CAAC;AAC7B,cAAc,cAAc,CAAC;AAC7B,cAAc,0BAA0B,CAAC;AACzC,cAAc,kBAAkB,CAAC;AACjC,cAAc,uBAAuB,CAAC;AACtC,cAAc,gCAAgC,CAAC;AAC/C,cAAc,kCAAkC,CAAC;AACjD,cAAc,iCAAiC,CAAC;AAChD,cAAc,2CAA2C,CAAC;AAC1D,cAAc,kCAAkC,CAAC;AACjD,cAAc,sBAAsB,CAAC;AACrC,cAAc,sBAAsB,CAAC;AACrC,cAAc,2BAA2B,CAAC;AAE1C,cAAc,kBAAkB,CAAC;AACjC,cAAc,qBAAqB,CAAC;AACpC,cAAc,gBAAgB,CAAC;AAC/B,cAAc,
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,SAAS,CAAC;AACxB,cAAc,cAAc,CAAC;AAC7B,cAAc,cAAc,CAAC;AAC7B,cAAc,0BAA0B,CAAC;AACzC,cAAc,kBAAkB,CAAC;AACjC,cAAc,uBAAuB,CAAC;AACtC,cAAc,gCAAgC,CAAC;AAC/C,cAAc,kCAAkC,CAAC;AACjD,cAAc,iCAAiC,CAAC;AAChD,cAAc,2CAA2C,CAAC;AAC1D,cAAc,kCAAkC,CAAC;AACjD,cAAc,sBAAsB,CAAC;AACrC,cAAc,sBAAsB,CAAC;AACrC,cAAc,2BAA2B,CAAC;AAE1C,cAAc,kBAAkB,CAAC;AACjC,cAAc,qBAAqB,CAAC;AACpC,cAAc,gBAAgB,CAAC;AAC/B,cAAc,6BAA6B,CAAC;AAC5C,cAAc,iBAAiB,CAAC;AAEhC,cAAc,4BAA4B,CAAC;AAC3C,cAAc,4BAA4B,CAAC;AAC3C,cAAc,wBAAwB,CAAC;AAEvC,cAAc,aAAa,CAAC;AAO5B,cAAc,+BAA+B,CAAC;AAC9C,cAAc,sBAAsB,CAAC;AAErC,cAAc,WAAW,CAAC"}
|
package/dist/index.js
CHANGED
|
@@ -31,10 +31,8 @@ __exportStar(require("./lib/task-loop-execution"), exports);
|
|
|
31
31
|
__exportStar(require("./services/tasks"), exports);
|
|
32
32
|
__exportStar(require("./services/workflow"), exports);
|
|
33
33
|
__exportStar(require("./services/prd"), exports);
|
|
34
|
-
__exportStar(require("./services/benchmark"), exports);
|
|
35
34
|
__exportStar(require("./services/project-analysis"), exports);
|
|
36
|
-
__exportStar(require("./lib/benchmark
|
|
37
|
-
__exportStar(require("./lib/benchmark/types"), exports);
|
|
35
|
+
__exportStar(require("./lib/benchmark"), exports);
|
|
38
36
|
__exportStar(require("./utils/ai-service-factory"), exports);
|
|
39
37
|
__exportStar(require("./utils/task-o-matic-error"), exports);
|
|
40
38
|
__exportStar(require("./utils/stack-detector"), exports);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"prd-operations.d.ts","sourceRoot":"","sources":["../../../src/lib/ai-service/prd-operations.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"prd-operations.d.ts","sourceRoot":"","sources":["../../../src/lib/ai-service/prd-operations.ts"],"names":[],"mappings":"AACA,OAAO,EACL,QAAQ,EAER,gBAAgB,EAChB,gBAAgB,EAChB,WAAW,EAIX,SAAS,EACV,MAAM,aAAa,CAAC;AAYrB,OAAO,EAAE,cAAc,EAAE,MAAM,mBAAmB,CAAC;AAOnD,qBAAa,aAAc,SAAQ,cAAc;IACzC,QAAQ,CACZ,UAAU,EAAE,MAAM,EAClB,MAAM,CAAC,EAAE,OAAO,CAAC,QAAQ,CAAC,EAC1B,cAAc,CAAC,EAAE,MAAM,EACvB,WAAW,CAAC,EAAE,MAAM,EACpB,gBAAgB,CAAC,EAAE,gBAAgB,EACnC,WAAW,CAAC,EAAE,OAAO,CAAC,WAAW,CAAC,EAClC,gBAAgB,CAAC,EAAE,MAAM,EACzB,qBAAqB,CAAC,EAAE,OAAO,GAC9B,OAAO,CAAC,gBAAgB,CAAC;IA6MtB,SAAS,CACb,UAAU,EAAE,MAAM,EAClB,QAAQ,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,OAAO,CAAC,QAAQ,CAAC,EAC1B,cAAc,CAAC,EAAE,MAAM,EACvB,WAAW,CAAC,EAAE,MAAM,EACpB,gBAAgB,CAAC,EAAE,gBAAgB,EACnC,WAAW,CAAC,EAAE,OAAO,CAAC,WAAW,CAAC,EAClC,gBAAgB,CAAC,EAAE,MAAM,EACzB,qBAAqB,CAAC,EAAE,OAAO,GAC9B,OAAO,CAAC,MAAM,CAAC;IA+GZ,oBAAoB,CACxB,UAAU,EAAE,MAAM,EAClB,MAAM,CAAC,EAAE,OAAO,CAAC,QAAQ,CAAC,EAC1B,cAAc,CAAC,EAAE,MAAM,EACvB,WAAW,CAAC,EAAE,MAAM,EACpB,gBAAgB,CAAC,EAAE,gBAAgB,EACnC,WAAW,CAAC,EAAE,OAAO,CAAC,WAAW,CAAC,EAClC,gBAAgB,CAAC,EAAE,MAAM,EACzB,qBAAqB,CAAC,EAAE,OAAO,GAC9B,OAAO,CAAC,MAAM,EAAE,CAAC;IA2Hd,kBAAkB,CACtB,UAAU,EAAE,MAAM,EAClB,SAAS,EAAE,MAAM,EAAE,EACnB,MAAM,CAAC,EAAE,OAAO,CAAC,QAAQ,CAAC,EAC1B,WAAW,CAAC,EAAE;QACZ,SAAS,CAAC,EAAE,MAAM,CAAC;QACnB,kBAAkB,CAAC,EAAE,MAAM,CAAC;KAC7B,EACD,gBAAgB,CAAC,EAAE,gBAAgB,EACnC,WAAW,CAAC,EAAE,OAAO,CAAC,WAAW,CAAC,GACjC,OAAO,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IA8F5B,WAAW,CACf,WAAW,EAAE,MAAM,EACnB,MAAM,CAAC,EAAE,OAAO,CAAC,QAAQ,CAAC,EAC1B,cAAc,CAAC,EAAE,MAAM,EACvB,WAAW,CAAC,EAAE,MAAM,EACpB,gBAAgB,CAAC,EAAE,gBAAgB,EACnC,WAAW,CAAC,EAAE,OAAO,CAAC,WAAW,CAAC,GACjC,OAAO,CAAC,MAAM,CAAC;IAqBZ,WAAW,CACf,IAAI,EAAE,MAAM,EAAE,EACd,mBAAmB,EAAE,MAAM,EAC3B,MAAM,CAAC,EAAE,OAAO,CAAC,QAAQ,CAAC,EAC1B,cAAc,CAAC,EAAE,MAAM,EACvB,WAAW,CAAC,EAAE,MAAM,EACpB,gBAAgB,CAAC,EAAE,gBAAgB,EACnC,WAAW,CAAC,EAAE,OAAO,CAAC,WAAW,CAAC,GACjC,OAAO,CAAC,MAAM,CAAC;IA6BlB;;OAEG;IACG,YAAY,CAChB,UAAU,EAAE,MAAM,EAClB,WAAW,CAA
C,EAAE,MAAM,EACpB,MAAM,CAAC,EAAE,OAAO,CAAC,QAAQ,CAAC,EAC1B,cAAc,CAAC,EAAE,MAAM,EACvB,WAAW,CAAC,EAAE,MAAM,EACpB,gBAAgB,CAAC,EAAE,gBAAgB,EACnC,WAAW,CAAC,EAAE,OAAO,CAAC,WAAW,CAAC,EAClC,gBAAgB,CAAC,EAAE,MAAM,EACzB,qBAAqB,CAAC,EAAE,OAAO,GAC9B,OAAO,CAAC;QAAE,MAAM,EAAE,SAAS,CAAC;QAAC,SAAS,EAAE,MAAM,CAAA;KAAE,CAAC;IAiJpD;;;OAGG;IACG,uBAAuB,CAC3B,eAAe,EAAE;QACf,WAAW,EAAE,MAAM,CAAC;QACpB,kBAAkB,CAAC,EAAE,MAAM,CAAC;QAC5B,QAAQ,EAAE,MAAM,CAAC;QACjB,SAAS,EAAE,MAAM,CAAC;QAClB,gBAAgB,EAAE,MAAM,CAAC;QACzB,aAAa,EAAE,MAAM,CAAC;QACtB,KAAK,EAAE,MAAM,CAAC;QACd,aAAa,EAAE,MAAM,CAAC;KACvB,EACD,MAAM,CAAC,EAAE,OAAO,CAAC,QAAQ,CAAC,EAC1B,gBAAgB,CAAC,EAAE,gBAAgB,EACnC,WAAW,CAAC,EAAE,OAAO,CAAC,WAAW,CAAC,EAClC,qBAAqB,CAAC,EAAE,OAAO,GAC9B,OAAO,CAAC,MAAM,CAAC;IA0FlB;;OAEG;IACH,OAAO,CAAC,mBAAmB;CAqB5B"}
|
|
@@ -1,37 +1,4 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
-
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
-
if (k2 === undefined) k2 = k;
|
|
4
|
-
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
-
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
-
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
-
}
|
|
8
|
-
Object.defineProperty(o, k2, desc);
|
|
9
|
-
}) : (function(o, m, k, k2) {
|
|
10
|
-
if (k2 === undefined) k2 = k;
|
|
11
|
-
o[k2] = m[k];
|
|
12
|
-
}));
|
|
13
|
-
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
-
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
-
}) : function(o, v) {
|
|
16
|
-
o["default"] = v;
|
|
17
|
-
});
|
|
18
|
-
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
-
var ownKeys = function(o) {
|
|
20
|
-
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
-
var ar = [];
|
|
22
|
-
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
-
return ar;
|
|
24
|
-
};
|
|
25
|
-
return ownKeys(o);
|
|
26
|
-
};
|
|
27
|
-
return function (mod) {
|
|
28
|
-
if (mod && mod.__esModule) return mod;
|
|
29
|
-
var result = {};
|
|
30
|
-
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
-
__setModuleDefault(result, mod);
|
|
32
|
-
return result;
|
|
33
|
-
};
|
|
34
|
-
})();
|
|
35
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
3
|
exports.PRDOperations = void 0;
|
|
37
4
|
const ai_1 = require("ai");
|
|
@@ -40,6 +7,7 @@ const prompts_1 = require("../../prompts");
|
|
|
40
7
|
const filesystem_tools_1 = require("./filesystem-tools");
|
|
41
8
|
const base_operations_1 = require("./base-operations");
|
|
42
9
|
const task_o_matic_error_1 = require("../../utils/task-o-matic-error");
|
|
10
|
+
const prompts_2 = require("../../prompts");
|
|
43
11
|
class PRDOperations extends base_operations_1.BaseOperations {
|
|
44
12
|
async parsePRD(prdContent, config, promptOverride, userMessage, streamingOptions, retryConfig, workingDirectory, enableFilesystemTools) {
|
|
45
13
|
// console.log(
|
|
@@ -321,7 +289,6 @@ Use these tools to understand the current project structure, existing code patte
|
|
|
321
289
|
}
|
|
322
290
|
prompt = promptResult.prompt;
|
|
323
291
|
}
|
|
324
|
-
const { PRD_QUESTION_SYSTEM_PROMPT } = await Promise.resolve().then(() => __importStar(require("../../prompts")));
|
|
325
292
|
let response;
|
|
326
293
|
if (enableFilesystemTools) {
|
|
327
294
|
const model = this.modelProvider.getModel({
|
|
@@ -332,7 +299,7 @@ Use these tools to understand the current project structure, existing code patte
|
|
|
332
299
|
const result = await (0, ai_1.streamText)({
|
|
333
300
|
model,
|
|
334
301
|
tools: allTools,
|
|
335
|
-
system: PRD_QUESTION_SYSTEM_PROMPT +
|
|
302
|
+
system: prompts_2.PRD_QUESTION_SYSTEM_PROMPT +
|
|
336
303
|
`\n\nYou have access to filesystem tools to check existing code/structure if needed.`,
|
|
337
304
|
messages: [{ role: "user", content: userMessage || prompt }],
|
|
338
305
|
maxRetries: 0,
|
|
@@ -360,7 +327,7 @@ Use these tools to understand the current project structure, existing code patte
|
|
|
360
327
|
response = await result.text;
|
|
361
328
|
}
|
|
362
329
|
else {
|
|
363
|
-
response = await this.streamText("", config, PRD_QUESTION_SYSTEM_PROMPT, userMessage || prompt, streamingOptions, { maxAttempts: 1 });
|
|
330
|
+
response = await this.streamText("", config, prompts_2.PRD_QUESTION_SYSTEM_PROMPT, userMessage || prompt, streamingOptions, { maxAttempts: 1 });
|
|
364
331
|
}
|
|
365
332
|
const parseResult = this.jsonParser.parseJSONFromResponse(response);
|
|
366
333
|
if (!parseResult.success) {
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* BenchmarkExecutor - Per-worktree execution logic for benchmarks
|
|
3
|
+
*
|
|
4
|
+
* This class handles executing benchmark operations in isolated worktrees.
|
|
5
|
+
* It supports operations, single task execution, task loops, and full workflows.
|
|
6
|
+
* Each execution captures timing, token metrics, and code changes.
|
|
7
|
+
*/
|
|
8
|
+
import type { Worktree } from "./worktree-manager";
|
|
9
|
+
import { MetricsCollector } from "./metrics-collector";
|
|
10
|
+
import type { BenchmarkModelConfig, BenchmarkModelResult, OperationBenchmarkInput, ExecutionBenchmarkInput, ExecuteLoopBenchmarkInput, WorkflowBenchmarkInput, BenchmarkableOperation } from "./types";
|
|
11
|
+
/**
|
|
12
|
+
* BenchmarkExecutor runs benchmark operations in isolated worktrees
|
|
13
|
+
* and collects comprehensive metrics for comparison.
|
|
14
|
+
*/
|
|
15
|
+
export declare class BenchmarkExecutor {
|
|
16
|
+
private metricsCollector;
|
|
17
|
+
private operationRegistry;
|
|
18
|
+
constructor(metricsCollector?: MetricsCollector, operationRegistry?: Map<string, BenchmarkableOperation>);
|
|
19
|
+
/**
|
|
20
|
+
* Register an operation for benchmarking
|
|
21
|
+
*/
|
|
22
|
+
registerOperation(operation: BenchmarkableOperation): void;
|
|
23
|
+
/**
|
|
24
|
+
* Get a registered operation by ID
|
|
25
|
+
*/
|
|
26
|
+
getOperation(id: string): BenchmarkableOperation | undefined;
|
|
27
|
+
/**
|
|
28
|
+
* List all registered operations
|
|
29
|
+
*/
|
|
30
|
+
listOperations(): BenchmarkableOperation[];
|
|
31
|
+
/**
|
|
32
|
+
* Execute a registered operation in a worktree
|
|
33
|
+
*
|
|
34
|
+
* @param worktree - The worktree to execute in
|
|
35
|
+
* @param model - Model configuration to use
|
|
36
|
+
* @param input - Operation input parameters
|
|
37
|
+
* @param baseCommit - Base commit for metrics comparison
|
|
38
|
+
* @returns Benchmark result with metrics
|
|
39
|
+
*/
|
|
40
|
+
executeOperation(worktree: Worktree, model: BenchmarkModelConfig, input: OperationBenchmarkInput, baseCommit: string): Promise<BenchmarkModelResult>;
|
|
41
|
+
/**
|
|
42
|
+
* Execute a single task in a worktree
|
|
43
|
+
*
|
|
44
|
+
* @param worktree - The worktree to execute in
|
|
45
|
+
* @param model - Model configuration to use
|
|
46
|
+
* @param input - Task execution input
|
|
47
|
+
* @param baseCommit - Base commit for metrics comparison
|
|
48
|
+
* @returns Benchmark result with metrics
|
|
49
|
+
*/
|
|
50
|
+
executeTask(worktree: Worktree, model: BenchmarkModelConfig, input: ExecutionBenchmarkInput, baseCommit: string): Promise<BenchmarkModelResult>;
|
|
51
|
+
/**
|
|
52
|
+
* Execute a task loop in a worktree
|
|
53
|
+
*
|
|
54
|
+
* @param worktree - The worktree to execute in
|
|
55
|
+
* @param model - Model configuration to use
|
|
56
|
+
* @param input - Execute loop input
|
|
57
|
+
* @param baseCommit - Base commit for metrics comparison
|
|
58
|
+
* @returns Benchmark result with metrics
|
|
59
|
+
*/
|
|
60
|
+
executeLoop(worktree: Worktree, model: BenchmarkModelConfig, input: ExecuteLoopBenchmarkInput, baseCommit: string): Promise<BenchmarkModelResult>;
|
|
61
|
+
/**
|
|
62
|
+
* Execute a full workflow in a worktree
|
|
63
|
+
*
|
|
64
|
+
* For workflow benchmarks, we execute a series of steps:
|
|
65
|
+
* 1. Initialize project (if needed)
|
|
66
|
+
* 2. Parse PRD to generate tasks
|
|
67
|
+
* 3. Execute tasks with the given model
|
|
68
|
+
*
|
|
69
|
+
* @param worktree - The worktree to execute in
|
|
70
|
+
* @param model - Model configuration to use
|
|
71
|
+
* @param input - Workflow input
|
|
72
|
+
* @param baseCommit - Base commit for metrics comparison
|
|
73
|
+
* @returns Benchmark result with metrics
|
|
74
|
+
*/
|
|
75
|
+
executeWorkflow(worktree: Worktree, model: BenchmarkModelConfig, input: WorkflowBenchmarkInput, baseCommit: string): Promise<BenchmarkModelResult>;
|
|
76
|
+
/**
|
|
77
|
+
* Build AI options for a model configuration
|
|
78
|
+
*/
|
|
79
|
+
private buildModelAIOptions;
|
|
80
|
+
/**
|
|
81
|
+
* Collect all metrics for a completed execution
|
|
82
|
+
*/
|
|
83
|
+
private collectMetrics;
|
|
84
|
+
/**
|
|
85
|
+
* Create a success result
|
|
86
|
+
*/
|
|
87
|
+
private createSuccessResult;
|
|
88
|
+
/**
|
|
89
|
+
* Create an error result
|
|
90
|
+
*/
|
|
91
|
+
private createErrorResult;
|
|
92
|
+
}
|
|
93
|
+
//# sourceMappingURL=executor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"executor.d.ts","sourceRoot":"","sources":["../../../src/lib/benchmark/executor.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AACnD,OAAO,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AACvD,OAAO,KAAK,EACV,oBAAoB,EACpB,oBAAoB,EAEpB,uBAAuB,EACvB,uBAAuB,EACvB,yBAAyB,EACzB,sBAAsB,EAGtB,sBAAsB,EACvB,MAAM,SAAS,CAAC;AAqBjB;;;GAGG;AACH,qBAAa,iBAAiB;IAC5B,OAAO,CAAC,gBAAgB,CAAmB;IAC3C,OAAO,CAAC,iBAAiB,CAAsC;gBAG7D,gBAAgB,CAAC,EAAE,gBAAgB,EACnC,iBAAiB,CAAC,EAAE,GAAG,CAAC,MAAM,EAAE,sBAAsB,CAAC;IAMzD;;OAEG;IACH,iBAAiB,CAAC,SAAS,EAAE,sBAAsB,GAAG,IAAI;IAI1D;;OAEG;IACH,YAAY,CAAC,EAAE,EAAE,MAAM,GAAG,sBAAsB,GAAG,SAAS;IAI5D;;OAEG;IACH,cAAc,IAAI,sBAAsB,EAAE;IAI1C;;;;;;;;OAQG;IACG,gBAAgB,CACpB,QAAQ,EAAE,QAAQ,EAClB,KAAK,EAAE,oBAAoB,EAC3B,KAAK,EAAE,uBAAuB,EAC9B,UAAU,EAAE,MAAM,GACjB,OAAO,CAAC,oBAAoB,CAAC;IAuFhC;;;;;;;;OAQG;IACG,WAAW,CACf,QAAQ,EAAE,QAAQ,EAClB,KAAK,EAAE,oBAAoB,EAC3B,KAAK,EAAE,uBAAuB,EAC9B,UAAU,EAAE,MAAM,GACjB,OAAO,CAAC,oBAAoB,CAAC;IA0EhC;;;;;;;;OAQG;IACG,WAAW,CACf,QAAQ,EAAE,QAAQ,EAClB,KAAK,EAAE,oBAAoB,EAC3B,KAAK,EAAE,yBAAyB,EAChC,UAAU,EAAE,MAAM,GACjB,OAAO,CAAC,oBAAoB,CAAC;IAmEhC;;;;;;;;;;;;;OAaG;IACG,eAAe,CACnB,QAAQ,EAAE,QAAQ,EAClB,KAAK,EAAE,oBAAoB,EAC3B,KAAK,EAAE,sBAAsB,EAC7B,UAAU,EAAE,MAAM,GACjB,OAAO,CAAC,oBAAoB,CAAC;IA8GhC;;OAEG;IACH,OAAO,CAAC,mBAAmB;IAsB3B;;OAEG;YACW,cAAc;IAkC5B;;OAEG;IACH,OAAO,CAAC,mBAAmB;IAkB3B;;OAEG;IACH,OAAO,CAAC,iBAAiB;CA6B1B"}
|
|
@@ -0,0 +1,395 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* BenchmarkExecutor - Per-worktree execution logic for benchmarks
|
|
4
|
+
*
|
|
5
|
+
* This class handles executing benchmark operations in isolated worktrees.
|
|
6
|
+
* It supports operations, single task execution, task loops, and full workflows.
|
|
7
|
+
* Each execution captures timing, token metrics, and code changes.
|
|
8
|
+
*/
|
|
9
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
10
|
+
exports.BenchmarkExecutor = void 0;
|
|
11
|
+
const metrics_collector_1 = require("./metrics-collector");
|
|
12
|
+
const logger_1 = require("../logger");
|
|
13
|
+
const config_1 = require("../config");
|
|
14
|
+
const task_execution_core_1 = require("../task-execution-core");
|
|
15
|
+
const task_loop_execution_1 = require("../task-loop-execution");
|
|
16
|
+
const workflow_1 = require("../../services/workflow");
|
|
17
|
+
const prd_1 = require("../../services/prd");
|
|
18
|
+
const node_fs_1 = require("node:fs");
|
|
19
|
+
const node_path_1 = require("node:path");
|
|
20
|
+
const task_o_matic_error_1 = require("../../utils/task-o-matic-error");
|
|
21
|
+
/**
|
|
22
|
+
* BenchmarkExecutor runs benchmark operations in isolated worktrees
|
|
23
|
+
* and collects comprehensive metrics for comparison.
|
|
24
|
+
*/
|
|
25
|
+
class BenchmarkExecutor {
|
|
26
|
+
metricsCollector;
|
|
27
|
+
operationRegistry;
|
|
28
|
+
constructor(metricsCollector, operationRegistry) {
|
|
29
|
+
this.metricsCollector = metricsCollector ?? new metrics_collector_1.MetricsCollector();
|
|
30
|
+
this.operationRegistry = operationRegistry ?? new Map();
|
|
31
|
+
}
|
|
32
|
+
/**
|
|
33
|
+
* Register an operation for benchmarking
|
|
34
|
+
*/
|
|
35
|
+
registerOperation(operation) {
|
|
36
|
+
this.operationRegistry.set(operation.id, operation);
|
|
37
|
+
}
|
|
38
|
+
/**
|
|
39
|
+
* Get a registered operation by ID
|
|
40
|
+
*/
|
|
41
|
+
getOperation(id) {
|
|
42
|
+
return this.operationRegistry.get(id);
|
|
43
|
+
}
|
|
44
|
+
/**
|
|
45
|
+
* List all registered operations
|
|
46
|
+
*/
|
|
47
|
+
listOperations() {
|
|
48
|
+
return Array.from(this.operationRegistry.values());
|
|
49
|
+
}
|
|
50
|
+
/**
|
|
51
|
+
* Execute a registered operation in a worktree
|
|
52
|
+
*
|
|
53
|
+
* @param worktree - The worktree to execute in
|
|
54
|
+
* @param model - Model configuration to use
|
|
55
|
+
* @param input - Operation input parameters
|
|
56
|
+
* @param baseCommit - Base commit for metrics comparison
|
|
57
|
+
* @returns Benchmark result with metrics
|
|
58
|
+
*/
|
|
59
|
+
async executeOperation(worktree, model, input, baseCommit) {
|
|
60
|
+
const modelId = `${model.provider}:${model.model}`;
|
|
61
|
+
const startedAt = Date.now();
|
|
62
|
+
let timeToFirstOutput;
|
|
63
|
+
logger_1.logger.info(`Executing operation ${input.operationId} with ${modelId}`);
|
|
64
|
+
try {
|
|
65
|
+
// Get the operation from registry
|
|
66
|
+
const operation = this.operationRegistry.get(input.operationId);
|
|
67
|
+
if (!operation) {
|
|
68
|
+
throw (0, task_o_matic_error_1.createStandardError)(task_o_matic_error_1.TaskOMaticErrorCodes.INVALID_INPUT, `Operation not found: ${input.operationId}`, {
|
|
69
|
+
context: `The requested operation "${input.operationId}" is not registered.`,
|
|
70
|
+
suggestions: ["Check the operation ID", "List available operations"],
|
|
71
|
+
});
|
|
72
|
+
}
|
|
73
|
+
// Validate input
|
|
74
|
+
if (!operation.validateInput(input.params)) {
|
|
75
|
+
throw (0, task_o_matic_error_1.createStandardError)(task_o_matic_error_1.TaskOMaticErrorCodes.INVALID_INPUT, `Invalid input for operation ${input.operationId}`, {
|
|
76
|
+
context: `The provided parameters do not match the requirements for "${input.operationId}".`,
|
|
77
|
+
});
|
|
78
|
+
}
|
|
79
|
+
// Setup working directory to the worktree
|
|
80
|
+
await (0, config_1.setupWorkingDirectory)(worktree.path);
|
|
81
|
+
// Build AI options for this model
|
|
82
|
+
const aiOptions = this.buildModelAIOptions(model);
|
|
83
|
+
// Token tracking
|
|
84
|
+
const tokenTracker = { prompt: 0, completion: 0 };
|
|
85
|
+
// Create streaming options with token tracking
|
|
86
|
+
const streamingOptions = {
|
|
87
|
+
enabled: true,
|
|
88
|
+
onChunk: () => {
|
|
89
|
+
if (!timeToFirstOutput) {
|
|
90
|
+
timeToFirstOutput = Date.now() - startedAt;
|
|
91
|
+
}
|
|
92
|
+
},
|
|
93
|
+
onFinish: (result) => {
|
|
94
|
+
if (result.usage) {
|
|
95
|
+
tokenTracker.prompt += result.usage.promptTokens ?? 0;
|
|
96
|
+
tokenTracker.completion += result.usage.completionTokens ?? 0;
|
|
97
|
+
}
|
|
98
|
+
},
|
|
99
|
+
};
|
|
100
|
+
// Execute the operation
|
|
101
|
+
const output = await operation.execute(input.params, aiOptions, streamingOptions);
|
|
102
|
+
const completedAt = Date.now();
|
|
103
|
+
// Collect metrics
|
|
104
|
+
const metrics = await this.collectMetrics(worktree.path, baseCommit, startedAt, completedAt, timeToFirstOutput, tokenTracker);
|
|
105
|
+
return this.createSuccessResult(worktree, modelId, output, metrics, completedAt - startedAt);
|
|
106
|
+
}
|
|
107
|
+
catch (error) {
|
|
108
|
+
const completedAt = Date.now();
|
|
109
|
+
return this.createErrorResult(worktree, modelId, error, completedAt - startedAt, startedAt, completedAt, timeToFirstOutput);
|
|
110
|
+
}
|
|
111
|
+
}
|
|
112
|
+
/**
|
|
113
|
+
* Execute a single task in a worktree
|
|
114
|
+
*
|
|
115
|
+
* @param worktree - The worktree to execute in
|
|
116
|
+
* @param model - Model configuration to use
|
|
117
|
+
* @param input - Task execution input
|
|
118
|
+
* @param baseCommit - Base commit for metrics comparison
|
|
119
|
+
* @returns Benchmark result with metrics
|
|
120
|
+
*/
|
|
121
|
+
async executeTask(worktree, model, input, baseCommit) {
|
|
122
|
+
const modelId = `${model.provider}:${model.model}`;
|
|
123
|
+
const startedAt = Date.now();
|
|
124
|
+
let timeToFirstOutput;
|
|
125
|
+
logger_1.logger.info(`Executing task ${input.taskId} with ${modelId}`);
|
|
126
|
+
try {
|
|
127
|
+
// Setup working directory to the worktree
|
|
128
|
+
await (0, config_1.setupWorkingDirectory)(worktree.path);
|
|
129
|
+
// Token tracking
|
|
130
|
+
const tokenTracker = { prompt: 0, completion: 0 };
|
|
131
|
+
// Build execution config with model override
|
|
132
|
+
// The executeTaskCore takes taskId and a TaskExecutionConfig object
|
|
133
|
+
const config = {
|
|
134
|
+
tool: "opencode", // Default tool
|
|
135
|
+
enableRetry: (input.maxRetries ?? 1) > 1,
|
|
136
|
+
maxRetries: input.maxRetries ?? 1,
|
|
137
|
+
verificationCommands: input.verificationCommands ?? [],
|
|
138
|
+
executorConfig: {
|
|
139
|
+
model: `${model.provider}:${model.model}`,
|
|
140
|
+
},
|
|
141
|
+
};
|
|
142
|
+
// Execute the task
|
|
143
|
+
const result = await (0, task_execution_core_1.executeTaskCore)(input.taskId, config);
|
|
144
|
+
const completedAt = Date.now();
|
|
145
|
+
if (!timeToFirstOutput) {
|
|
146
|
+
timeToFirstOutput = completedAt - startedAt;
|
|
147
|
+
}
|
|
148
|
+
// Collect metrics
|
|
149
|
+
const metrics = await this.collectMetrics(worktree.path, baseCommit, startedAt, completedAt, timeToFirstOutput, tokenTracker, input.verificationCommands);
|
|
150
|
+
// TaskExecutionResult has success and attempts, extract error from last attempt if failed
|
|
151
|
+
const lastAttempt = result.attempts[result.attempts.length - 1];
|
|
152
|
+
const errorMsg = result.success ? undefined : lastAttempt?.error;
|
|
153
|
+
const status = result.success ? "success" : "failed";
|
|
154
|
+
return {
|
|
155
|
+
modelId,
|
|
156
|
+
worktree,
|
|
157
|
+
status,
|
|
158
|
+
duration: completedAt - startedAt,
|
|
159
|
+
output: result,
|
|
160
|
+
error: errorMsg,
|
|
161
|
+
metrics,
|
|
162
|
+
timestamp: completedAt,
|
|
163
|
+
};
|
|
164
|
+
}
|
|
165
|
+
catch (error) {
|
|
166
|
+
const completedAt = Date.now();
|
|
167
|
+
return this.createErrorResult(worktree, modelId, error, completedAt - startedAt, startedAt, completedAt, timeToFirstOutput);
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
/**
|
|
171
|
+
* Execute a task loop in a worktree
|
|
172
|
+
*
|
|
173
|
+
* @param worktree - The worktree to execute in
|
|
174
|
+
* @param model - Model configuration to use
|
|
175
|
+
* @param input - Execute loop input
|
|
176
|
+
* @param baseCommit - Base commit for metrics comparison
|
|
177
|
+
* @returns Benchmark result with metrics
|
|
178
|
+
*/
|
|
179
|
+
async executeLoop(worktree, model, input, baseCommit) {
|
|
180
|
+
const modelId = `${model.provider}:${model.model}`;
|
|
181
|
+
const startedAt = Date.now();
|
|
182
|
+
let timeToFirstOutput;
|
|
183
|
+
logger_1.logger.info(`Executing task loop with ${modelId}`);
|
|
184
|
+
try {
|
|
185
|
+
// Setup working directory to the worktree
|
|
186
|
+
await (0, config_1.setupWorkingDirectory)(worktree.path);
|
|
187
|
+
// Token tracking
|
|
188
|
+
const tokenTracker = { prompt: 0, completion: 0 };
|
|
189
|
+
// Override model in loop options
|
|
190
|
+
const loopOptions = {
|
|
191
|
+
...input.loopOptions,
|
|
192
|
+
config: {
|
|
193
|
+
...input.loopOptions.config,
|
|
194
|
+
model: `${model.provider}:${model.model}`,
|
|
195
|
+
},
|
|
196
|
+
};
|
|
197
|
+
// Execute the loop
|
|
198
|
+
const result = await (0, task_loop_execution_1.executeTaskLoop)(loopOptions);
|
|
199
|
+
const completedAt = Date.now();
|
|
200
|
+
if (!timeToFirstOutput) {
|
|
201
|
+
timeToFirstOutput = completedAt - startedAt;
|
|
202
|
+
}
|
|
203
|
+
// Collect metrics
|
|
204
|
+
const metrics = await this.collectMetrics(worktree.path, baseCommit, startedAt, completedAt, timeToFirstOutput, tokenTracker, input.loopOptions.config?.verificationCommands);
|
|
205
|
+
const hasFailures = result.failedTasks > 0;
|
|
206
|
+
return {
|
|
207
|
+
modelId,
|
|
208
|
+
worktree,
|
|
209
|
+
status: hasFailures ? "failed" : "success",
|
|
210
|
+
duration: completedAt - startedAt,
|
|
211
|
+
output: result,
|
|
212
|
+
error: hasFailures ? `${result.failedTasks} tasks failed` : undefined,
|
|
213
|
+
metrics,
|
|
214
|
+
timestamp: completedAt,
|
|
215
|
+
};
|
|
216
|
+
}
|
|
217
|
+
catch (error) {
|
|
218
|
+
const completedAt = Date.now();
|
|
219
|
+
return this.createErrorResult(worktree, modelId, error, completedAt - startedAt, startedAt, completedAt, timeToFirstOutput);
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
/**
|
|
223
|
+
* Execute a full workflow in a worktree
|
|
224
|
+
*
|
|
225
|
+
* For workflow benchmarks, we execute a series of steps:
|
|
226
|
+
* 1. Initialize project (if needed)
|
|
227
|
+
* 2. Parse PRD to generate tasks
|
|
228
|
+
* 3. Execute tasks with the given model
|
|
229
|
+
*
|
|
230
|
+
* @param worktree - The worktree to execute in
|
|
231
|
+
* @param model - Model configuration to use
|
|
232
|
+
* @param input - Workflow input
|
|
233
|
+
* @param baseCommit - Base commit for metrics comparison
|
|
234
|
+
* @returns Benchmark result with metrics
|
|
235
|
+
*/
|
|
236
|
+
async executeWorkflow(worktree, model, input, baseCommit) {
|
|
237
|
+
const modelId = `${model.provider}:${model.model}`;
|
|
238
|
+
const startedAt = Date.now();
|
|
239
|
+
let timeToFirstOutput;
|
|
240
|
+
logger_1.logger.info(`Executing workflow with ${modelId}`);
|
|
241
|
+
try {
|
|
242
|
+
// Setup working directory to the worktree
|
|
243
|
+
const projectDir = input.projectDir ?? worktree.path;
|
|
244
|
+
await (0, config_1.setupWorkingDirectory)(projectDir);
|
|
245
|
+
// Token tracking
|
|
246
|
+
const tokenTracker = { prompt: 0, completion: 0 };
|
|
247
|
+
// Create workflow service
|
|
248
|
+
const workflowService = new workflow_1.WorkflowService();
|
|
249
|
+
// Build AI options for the model
|
|
250
|
+
const aiOptions = {
|
|
251
|
+
aiProvider: model.provider,
|
|
252
|
+
aiModel: model.model,
|
|
253
|
+
aiReasoning: model.reasoningTokens?.toString(),
|
|
254
|
+
};
|
|
255
|
+
// Execute workflow steps based on collected responses
|
|
256
|
+
const results = {};
|
|
257
|
+
// Step 1: Initialize project if needed
|
|
258
|
+
if (input.collectedResponses.projectName) {
|
|
259
|
+
results.init = await workflowService.initializeProject({
|
|
260
|
+
projectName: input.collectedResponses.projectName,
|
|
261
|
+
projectDir,
|
|
262
|
+
initMethod: input.collectedResponses.initMethod,
|
|
263
|
+
projectDescription: input.collectedResponses.projectDescription,
|
|
264
|
+
aiOptions,
|
|
265
|
+
stackConfig: input.collectedResponses.stackConfig,
|
|
266
|
+
});
|
|
267
|
+
}
|
|
268
|
+
// Step 2: Parse PRD if content is provided
|
|
269
|
+
if (input.collectedResponses.prdContent || input.collectedResponses.prdFile) {
|
|
270
|
+
const prdService = new prd_1.PRDService();
|
|
271
|
+
if (input.collectedResponses.prdContent) {
|
|
272
|
+
// If we have PRD content, save it first
|
|
273
|
+
const prdDir = (0, node_path_1.join)(projectDir, ".task-o-matic", "prd");
|
|
274
|
+
(0, node_fs_1.mkdirSync)(prdDir, { recursive: true });
|
|
275
|
+
const prdPath = (0, node_path_1.join)(prdDir, "benchmark-prd.md");
|
|
276
|
+
(0, node_fs_1.writeFileSync)(prdPath, input.collectedResponses.prdContent);
|
|
277
|
+
results.prdParse = await prdService.parsePRD({
|
|
278
|
+
file: prdPath,
|
|
279
|
+
aiOptions,
|
|
280
|
+
});
|
|
281
|
+
}
|
|
282
|
+
else if (input.collectedResponses.prdFile) {
|
|
283
|
+
results.prdParse = await prdService.parsePRD({
|
|
284
|
+
file: input.collectedResponses.prdFile,
|
|
285
|
+
aiOptions,
|
|
286
|
+
});
|
|
287
|
+
}
|
|
288
|
+
}
|
|
289
|
+
// Step 3: Execute tasks if requested
|
|
290
|
+
if (input.collectedResponses.generateTasks) {
|
|
291
|
+
const loopOptions = {
|
|
292
|
+
filters: { status: "todo" },
|
|
293
|
+
tool: (input.workflowOptions.executeTool ?? "opencode"),
|
|
294
|
+
config: {
|
|
295
|
+
maxRetries: input.workflowOptions.executeMaxRetries ?? 3,
|
|
296
|
+
verificationCommands: input.workflowOptions.verificationCommands ?? [],
|
|
297
|
+
model: modelId,
|
|
298
|
+
},
|
|
299
|
+
};
|
|
300
|
+
results.execution = await (0, task_loop_execution_1.executeTaskLoop)(loopOptions);
|
|
301
|
+
}
|
|
302
|
+
const completedAt = Date.now();
|
|
303
|
+
if (!timeToFirstOutput) {
|
|
304
|
+
timeToFirstOutput = completedAt - startedAt;
|
|
305
|
+
}
|
|
306
|
+
// Collect metrics
|
|
307
|
+
const verificationCommands = input.workflowOptions.verificationCommands ?? [];
|
|
308
|
+
const metrics = await this.collectMetrics(projectDir, baseCommit, startedAt, completedAt, timeToFirstOutput, tokenTracker, verificationCommands);
|
|
309
|
+
return this.createSuccessResult(worktree, modelId, results, metrics, completedAt - startedAt);
|
|
310
|
+
}
|
|
311
|
+
catch (error) {
|
|
312
|
+
const completedAt = Date.now();
|
|
313
|
+
return this.createErrorResult(worktree, modelId, error, completedAt - startedAt, startedAt, completedAt, timeToFirstOutput);
|
|
314
|
+
}
|
|
315
|
+
}
|
|
316
|
+
/**
|
|
317
|
+
* Build AI options for a model configuration
|
|
318
|
+
*/
|
|
319
|
+
buildModelAIOptions(model) {
|
|
320
|
+
// Get API key from environment based on provider
|
|
321
|
+
const envKeyMap = {
|
|
322
|
+
anthropic: "ANTHROPIC_API_KEY",
|
|
323
|
+
openai: "OPENAI_API_KEY",
|
|
324
|
+
openrouter: "OPENROUTER_API_KEY",
|
|
325
|
+
google: "GOOGLE_API_KEY",
|
|
326
|
+
gemini: "GEMINI_API_KEY",
|
|
327
|
+
zai: "ZAI_API_KEY",
|
|
328
|
+
};
|
|
329
|
+
const envKey = envKeyMap[model.provider] ?? `${model.provider.toUpperCase()}_API_KEY`;
|
|
330
|
+
const apiKey = process.env[envKey];
|
|
331
|
+
return {
|
|
332
|
+
aiProvider: model.provider,
|
|
333
|
+
aiModel: model.model,
|
|
334
|
+
aiKey: apiKey,
|
|
335
|
+
aiReasoning: model.reasoningTokens?.toString(),
|
|
336
|
+
};
|
|
337
|
+
}
|
|
338
|
+
/**
|
|
339
|
+
* Collect all metrics for a completed execution
|
|
340
|
+
*/
|
|
341
|
+
async collectMetrics(worktreePath, baseCommit, startedAt, completedAt, timeToFirstOutput, tokenTracker, verificationCommands) {
|
|
342
|
+
const timing = {
|
|
343
|
+
startedAt,
|
|
344
|
+
completedAt,
|
|
345
|
+
duration: completedAt - startedAt,
|
|
346
|
+
timeToFirstOutput,
|
|
347
|
+
};
|
|
348
|
+
const tokens = tokenTracker.prompt > 0 || tokenTracker.completion > 0
|
|
349
|
+
? {
|
|
350
|
+
prompt: tokenTracker.prompt,
|
|
351
|
+
completion: tokenTracker.completion,
|
|
352
|
+
total: tokenTracker.prompt + tokenTracker.completion,
|
|
353
|
+
}
|
|
354
|
+
: undefined;
|
|
355
|
+
return this.metricsCollector.collectAll(worktreePath, baseCommit, timing, tokens, verificationCommands);
|
|
356
|
+
}
|
|
357
|
+
/**
|
|
358
|
+
* Create a success result
|
|
359
|
+
*/
|
|
360
|
+
createSuccessResult(worktree, modelId, output, metrics, duration) {
|
|
361
|
+
return {
|
|
362
|
+
modelId,
|
|
363
|
+
worktree,
|
|
364
|
+
status: "success",
|
|
365
|
+
duration,
|
|
366
|
+
output,
|
|
367
|
+
metrics,
|
|
368
|
+
timestamp: Date.now(),
|
|
369
|
+
};
|
|
370
|
+
}
|
|
371
|
+
/**
|
|
372
|
+
* Create an error result
|
|
373
|
+
*/
|
|
374
|
+
createErrorResult(worktree, modelId, error, duration, startedAt, completedAt, timeToFirstOutput) {
|
|
375
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
376
|
+
logger_1.logger.error(`Execution failed for ${modelId}: ${errorMessage}`);
|
|
377
|
+
return {
|
|
378
|
+
modelId,
|
|
379
|
+
worktree,
|
|
380
|
+
status: "error",
|
|
381
|
+
duration,
|
|
382
|
+
error: errorMessage,
|
|
383
|
+
metrics: {
|
|
384
|
+
timing: {
|
|
385
|
+
startedAt,
|
|
386
|
+
completedAt,
|
|
387
|
+
duration,
|
|
388
|
+
timeToFirstOutput,
|
|
389
|
+
},
|
|
390
|
+
},
|
|
391
|
+
timestamp: completedAt,
|
|
392
|
+
};
|
|
393
|
+
}
|
|
394
|
+
}
|
|
395
|
+
exports.BenchmarkExecutor = BenchmarkExecutor;
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Benchmark System
|
|
3
|
+
*
|
|
4
|
+
* A unified system for benchmarking AI models across various operations.
|
|
5
|
+
* Supports parallel execution using git worktrees, persistent storage,
|
|
6
|
+
* and comprehensive metrics collection.
|
|
7
|
+
*/
|
|
8
|
+
export * from "./types";
|
|
9
|
+
export * from "./worktree-manager";
|
|
10
|
+
export * from "./worktree-pool";
|
|
11
|
+
export * from "./store";
|
|
12
|
+
export { MetricsCollector, type VerificationOptions } from "./metrics-collector";
|
|
13
|
+
export * from "./executor";
|
|
14
|
+
export * from "./orchestrator";
|
|
15
|
+
export * from "./operations";
|
|
16
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/lib/benchmark/index.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAGH,cAAc,SAAS,CAAC;AACxB,cAAc,oBAAoB,CAAC;AACnC,cAAc,iBAAiB,CAAC;AAChC,cAAc,SAAS,CAAC;AACxB,OAAO,EAAE,gBAAgB,EAAE,KAAK,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;AAGjF,cAAc,YAAY,CAAC;AAC3B,cAAc,gBAAgB,CAAC;AAG/B,cAAc,cAAc,CAAC"}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Benchmark System
|
|
4
|
+
*
|
|
5
|
+
* A unified system for benchmarking AI models across various operations.
|
|
6
|
+
* Supports parallel execution using git worktrees, persistent storage,
|
|
7
|
+
* and comprehensive metrics collection.
|
|
8
|
+
*/
|
|
9
|
+
// Compiler-emitted helper: exposes property `key` of `source` on `target`
// (under `alias` when given). Reuses an already-installed helper if present.
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (target, source, key, alias) {
        var exportedName = alias === undefined ? key : alias;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        var needsGetter;
        if (!descriptor) {
            needsGetter = true;
        }
        else if ("get" in descriptor) {
            // An existing accessor is reused only for modules flagged __esModule.
            needsGetter = !source.__esModule;
        }
        else {
            needsGetter = descriptor.writable || descriptor.configurable;
        }
        if (needsGetter) {
            // Forward reads to the source so later changes stay visible.
            descriptor = { enumerable: true, get: function () { return source[key]; } };
        }
        Object.defineProperty(target, exportedName, descriptor);
    }
    : function (target, source, key, alias) {
        // Fallback for environments without Object.create: plain value copy.
        target[alias === undefined ? key : alias] = source[key];
    });
|
|
20
|
+
// Compiler-emitted helper: re-exports every enumerable property of module `m`
// onto `exports`, except "default" and names `exports` already owns.
var __exportStar = (this && this.__exportStar) || function (m, exports) {
    for (var name in m) {
        if (name === "default") {
            continue;
        }
        if (Object.prototype.hasOwnProperty.call(exports, name)) {
            continue;
        }
        __createBinding(exports, m, name);
    }
};
|
|
23
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
24
|
+
exports.MetricsCollector = void 0;
|
|
25
|
+
// Infrastructure
|
|
26
|
+
__exportStar(require("./types"), exports);
|
|
27
|
+
__exportStar(require("./worktree-manager"), exports);
|
|
28
|
+
__exportStar(require("./worktree-pool"), exports);
|
|
29
|
+
__exportStar(require("./store"), exports);
|
|
30
|
+
var metrics_collector_1 = require("./metrics-collector");
|
|
31
|
+
Object.defineProperty(exports, "MetricsCollector", { enumerable: true, get: function () { return metrics_collector_1.MetricsCollector; } });
|
|
32
|
+
// Execution & Coordination
|
|
33
|
+
__exportStar(require("./executor"), exports);
|
|
34
|
+
__exportStar(require("./orchestrator"), exports);
|
|
35
|
+
// Operations
|
|
36
|
+
__exportStar(require("./operations"), exports);
|