task-o-matic 0.0.8 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +200 -0
- package/dist/commands/benchmark.d.ts.map +1 -1
- package/dist/commands/benchmark.js +342 -0
- package/dist/commands/tasks/execute-loop.d.ts +3 -0
- package/dist/commands/tasks/execute-loop.d.ts.map +1 -0
- package/dist/commands/tasks/execute-loop.js +118 -0
- package/dist/commands/tasks/index.d.ts +1 -0
- package/dist/commands/tasks/index.d.ts.map +1 -1
- package/dist/commands/tasks/index.js +1 -0
- package/dist/commands/tasks.d.ts.map +1 -1
- package/dist/commands/tasks.js +1 -0
- package/dist/commands/workflow.js +39 -0
- package/dist/lib/benchmark/registry.d.ts.map +1 -1
- package/dist/lib/benchmark/registry.js +11 -0
- package/dist/lib/benchmark/types.d.ts +50 -0
- package/dist/lib/benchmark/types.d.ts.map +1 -1
- package/dist/lib/task-loop-execution.d.ts +25 -0
- package/dist/lib/task-loop-execution.d.ts.map +1 -0
- package/dist/lib/task-loop-execution.js +473 -0
- package/dist/services/prd.d.ts.map +1 -1
- package/dist/services/prd.js +36 -1
- package/dist/services/tasks.d.ts.map +1 -1
- package/dist/services/tasks.js +90 -3
- package/dist/services/workflow-benchmark.d.ts +34 -0
- package/dist/services/workflow-benchmark.d.ts.map +1 -0
- package/dist/services/workflow-benchmark.js +317 -0
- package/dist/services/workflow.d.ts.map +1 -1
- package/dist/services/workflow.js +120 -7
- package/dist/test/task-loop-git.test.d.ts +2 -0
- package/dist/test/task-loop-git.test.d.ts.map +1 -0
- package/dist/test/task-loop-git.test.js +62 -0
- package/dist/types/index.d.ts +50 -0
- package/dist/types/index.d.ts.map +1 -1
- package/dist/types/results.d.ts +29 -1
- package/dist/types/results.d.ts.map +1 -1
- package/dist/types/workflow-results.d.ts +27 -0
- package/dist/types/workflow-results.d.ts.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
@@ -17,6 +17,7 @@ AI-powered task management for CLI, TUI, and web applications. Parse PRDs, enhan
 - š **Smart Breakdown**: AI-powered task decomposition into subtasks
 - š **Real-time Streaming**: Watch AI responses generate live with streaming output
 - š **Model Benchmarking**: Compare performance and quality across different AI models
+- š **Workflow Benchmarking**: Test complete workflows across multiple models and compare results
 - š **Single-Project Focus**: Self-contained within each project directory
 - š» **Framework-Agnostic**: Easily integrate into TUI, web apps, or any Node.js project
 
@@ -267,6 +268,11 @@ import type {
   RefinePRDResult,
   GenerateTasksResult,
   SplitTasksResult,
+  // Benchmark types
+  WorkflowBenchmarkInput,
+  WorkflowBenchmarkResult,
+  BenchmarkConfig,
+  BenchmarkResult,
 } from "task-o-matic";
 ```
 
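The hunk above adds benchmark-related type exports to the library's public API. As a rough TypeScript sketch of how a consumer might use them (not taken from the package docs: the exact shape of `BenchmarkConfig` is an assumption inferred from the `{ models, concurrency, delay }` object the CLI builds in `dist/commands/benchmark.js` later in this diff):

```ts
// Sketch only; field names are assumptions based on the CLI code in this diff,
// not a documented schema.
import type { BenchmarkConfig } from "task-o-matic";

const config: BenchmarkConfig = {
  models: [
    { provider: "openai", model: "gpt-4o" },
    { provider: "anthropic", model: "claude-3-5-sonnet" },
  ],
  concurrency: 2, // max parallel workflow runs
  delay: 1000,    // ms between request launches
};
```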
@@ -350,6 +356,9 @@ task-o-matic workflow
 
 # With streaming AI output
 task-o-matic workflow --stream
+
+# Want to test multiple AI models? Try workflow benchmarking:
+task-o-matic benchmark workflow --models "openai:gpt-4o,anthropic:claude-3-5-sonnet"
 ```
 
 **The workflow will guide you through:**
@@ -389,6 +398,7 @@ task-o-matic workflow --stream
 - [AI Integration](docs/ai-integration.md) - AI providers and prompt engineering
 - [Project Initialization](docs/projects.md) - Project setup and bootstrapping
 - [Streaming Output](docs/streaming.md) - Real-time AI streaming capabilities
+- [Model Benchmarking](docs/benchmarking.md) - Compare AI models and workflow performance
 
 ## šÆ Common Workflows
 
@@ -458,6 +468,55 @@ task-o-matic benchmark compare <run-id>
 task-o-matic benchmark show <run-id>
 ```
 
+### Workflow 3b: Complete Workflow Benchmarking
+
+Test entire workflows across multiple AI models and automatically set up your project with the best results.
+
+```bash
+# 1. Basic workflow benchmark with interactive setup
+task-o-matic benchmark workflow \
+  --models "openai:gpt-4o,anthropic:claude-3-5-sonnet,openrouter:qwen/qwen-2.5-72b-instruct" \
+  --concurrency 2 \
+  --delay 1000
+
+# 2. Automated workflow benchmark
+task-o-matic benchmark workflow \
+  --models "openai:gpt-4o,anthropic:claude-3-5-sonnet" \
+  --project-name "my-saas-app" \
+  --project-description "Team collaboration platform with real-time chat" \
+  --init-method ai \
+  --prd-method ai \
+  --auto-accept \
+  --skip-all
+
+# 3. Benchmark with specific workflow options
+task-o-matic benchmark workflow \
+  --models "openai:gpt-4o,anthropic:claude-3-5-sonnet" \
+  --project-name "e-commerce-app" \
+  --init-method custom \
+  --frontend next \
+  --backend hono \
+  --database postgres \
+  --prd-method ai \
+  --prd-description "Modern e-commerce platform with AI recommendations" \
+  --refine-feedback "Focus on scalability and security" \
+  --split-all
+
+# Results include:
+# - Comprehensive comparison table (duration, tasks, PRD size, costs)
+# - Detailed per-model breakdowns with timing and token metrics
+# - Interactive selection to choose the best model
+# - Automatic project setup with selected model's results
+```
+
+**Workflow Benchmark Features:**
+
+- **Two-Phase Execution**: Interactive question collection, then parallel execution
+- **Complete Workflow**: Project init → PRD creation → task generation → task splitting
+- **Comprehensive Metrics**: Performance, cost, quality, and output comparison
+- **Model Selection**: Choose the best performer and auto-setup your project
+- **Identical Conditions**: All models receive the same inputs for fair comparison
+
 ### Workflow 4: Project Bootstrapping
 
 ```bash
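The "Two-Phase Execution" feature added above (answers are collected once, then every model runs the same workflow) boils down to a fan-out scheduler with a concurrency cap and a launch delay. Below is a self-contained TypeScript sketch of that pattern; it is purely illustrative and is not the package's implementation, which lives in `benchmarkService.runBenchmark` shown later in this diff.

```ts
// Illustrative scheduler: run the same answers through every model, with at most
// `concurrency` runs in flight and `delayMs` between launches.
type Answers = Record<string, string>;

async function runAll<T>(
  models: string[],
  answers: Answers,
  runOne: (model: string, answers: Answers) => Promise<T>,
  concurrency = 2,
  delayMs = 1000
): Promise<T[]> {
  const results: T[] = [];
  let next = 0;
  const worker = async (): Promise<void> => {
    while (next < models.length) {
      const i = next++;                                 // claim the next model
      await new Promise((r) => setTimeout(r, delayMs)); // spacing between requests
      results[i] = await runOne(models[i], answers);    // identical inputs per model
    }
  };
  await Promise.all(Array.from({ length: concurrency }, () => worker()));
  return results;
}
```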
@@ -477,6 +536,130 @@ task-o-matic init bootstrap my-app
 task-o-matic tasks create --title "Set up development environment" --ai-enhance --stream
 ```
 
+## š Benchmarking Commands
+
+### Basic Model Benchmarking
+
+Compare different AI models on specific operations:
+
+```bash
+# Benchmark PRD parsing across multiple models
+task-o-matic benchmark run prd-parse \
+  --file requirements.md \
+  --models "openai:gpt-4o,anthropic:claude-3-5-sonnet,openrouter:qwen/qwen-2.5-72b-instruct" \
+  --concurrency 3 \
+  --delay 1000
+
+# Benchmark task splitting
+task-o-matic benchmark run task-breakdown \
+  --task-id <task-id> \
+  --models "openai:gpt-4o,anthropic:claude-3-5-sonnet" \
+  --concurrency 2
+
+# View benchmark results
+task-o-matic benchmark list
+task-o-matic benchmark show <run-id>
+task-o-matic benchmark compare <run-id>
+```
+
+### Complete Workflow Benchmarking
+
+Test entire project workflows across multiple AI models:
+
+```bash
+# Interactive workflow benchmark (recommended)
+task-o-matic benchmark workflow \
+  --models "openai:gpt-4o,anthropic:claude-3-5-sonnet,openrouter:qwen/qwen-2.5-72b-instruct"
+```
+
+**What happens:**
+1. **Phase 1**: You answer workflow questions once (project setup, PRD creation, etc.)
+2. **Phase 2**: All models execute the identical workflow in parallel
+3. **Results**: Comprehensive comparison table with metrics and model selection
+
+**Full automation example:**
+
+```bash
+task-o-matic benchmark workflow \
+  --models "openai:gpt-4o,anthropic:claude-3-5-sonnet" \
+  --project-name "my-saas-platform" \
+  --project-description "Team collaboration platform with real-time messaging" \
+  --init-method ai \
+  --prd-method ai \
+  --auto-accept \
+  --refine-feedback "Add more technical details and security considerations" \
+  --generate-instructions "Focus on MVP features and break into small tasks" \
+  --split-all \
+  --concurrency 2 \
+  --delay 2000
+```
+
+**Output includes:**
+
+```
+š Workflow Benchmark Results
+
+Model | Duration | Tasks | PRD Size | Steps | Cost
+---------------------------------------- | ---------- | ----- | ---------- | ----- | ----------
+openai:gpt-4o | 45234ms | 12 | 2843 chars | 5/5 | $0.023400
+anthropic:claude-3-5-sonnet | 42156ms | 15 | 3021 chars | 5/5 | $0.019800
+
+š Detailed Comparison
+
+[1] openai:gpt-4o
+Duration: 45234ms
+Steps Completed: 5/5
+Init: 2341ms
+PRD Generation: 12456ms
+Task Generation: 8234ms
+Task Splitting: 3421ms
+Tasks Created: 12
+PRD Size: 2843 characters
+Tokens: 4521 (Prompt: 2341, Completion: 2180)
+Cost: $0.023400
+
+šÆ Model Selection
+Would you like to select a model and set up your project with its results? (y/N)
+```
+
+### Benchmark Options
+
+All benchmark commands support:
+
+- `--models <list>`: Comma-separated model list (required)
+- `--concurrency <number>`: Max parallel requests (default: 3)
+- `--delay <ms>`: Delay between requests (default: 1000ms)
+
+**Model format:** `provider:model[:reasoning=<tokens>]`
+
+**Examples:**
+- `openai:gpt-4o`
+- `anthropic:claude-3-5-sonnet`
+- `openrouter:anthropic/claude-3.5-sonnet`
+- `openrouter:openai/o1-preview:reasoning=50000`
+
+### Workflow Benchmark Inheritance
+
+The `benchmark workflow` command supports ALL workflow command options:
+
+```bash
+# All these workflow options work in benchmarks:
+--project-name, --init-method, --project-description
+--frontend, --backend, --database, --auth/--no-auth
+--prd-method, --prd-file, --prd-description, --prd-content
+--refine-feedback, --generate-instructions
+--split-tasks, --split-all, --split-instructions
+--skip-init, --skip-prd, --skip-refine, --skip-generate, --skip-split
+--stream, --auto-accept, --config-file
+```
+
+This allows you to:
+- **Pre-configure workflow steps** via command-line options
+- **Skip interactive questions** for automated benchmarking
+- **Compare identical workflows** across different models
+- **Test specific scenarios** (e.g., only AI vs only custom stack)
+```
+
 ## š§ Environment Variables
 
 ```bash
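The model format documented above, `provider:model[:reasoning=<tokens>]`, is handled in the CLI by a `parseModelString` helper whose body is not included in this diff. The TypeScript sketch below is therefore only an illustration of how such a string could be decomposed; the `provider`, `model`, and `reasoningTokens` field names mirror the properties the CLI code reads.

```ts
// Illustrative parser for "provider:model[:reasoning=<tokens>]"; not the
// package's parseModelString implementation.
interface ModelSpec {
  provider: string;
  model: string;
  reasoningTokens?: number;
}

function parseModelSpec(raw: string): ModelSpec {
  const [provider, ...rest] = raw.trim().split(":");
  // OpenRouter ids keep a slash inside the model segment; only a trailing
  // ":reasoning=<tokens>" suffix is treated specially.
  let model = rest.join(":");
  let reasoningTokens: number | undefined;
  const match = model.match(/:reasoning=(\d+)$/);
  if (match) {
    reasoningTokens = parseInt(match[1], 10);
    model = model.slice(0, -match[0].length);
  }
  return { provider, model, reasoningTokens };
}

// parseModelSpec("openrouter:openai/o1-preview:reasoning=50000")
// -> { provider: "openrouter", model: "openai/o1-preview", reasoningTokens: 50000 }
```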
@@ -508,6 +691,23 @@ AI_TEMPERATURE=0.7
 - **PRD Parsing**: `claude-3.5-sonnet` or `gpt-4`
 - **Task Enhancement**: `claude-3-haiku` or `gpt-3.5-turbo`
 - **Task Breakdown**: `claude-3.5-sonnet` for complex tasks
+- **Workflow Benchmarking**: Test 2-3 models to find optimal performance for your use case
+
+### Choosing the Right Model
+
+Not sure which model to use? Try workflow benchmarking:
+
+```bash
+# Test your specific workflow across multiple models
+task-o-matic benchmark workflow \
+  --models "openai:gpt-4o,anthropic:claude-3-5-sonnet,openrouter:qwen/qwen-2.5-72b-instruct" \
+  --project-description "Your project description here"
+
+# The benchmark will show you:
+# - Performance (speed, tokens, cost)
+# - Quality (tasks created, PRD completeness)
+# - Best model for your specific needs
+```
 
 ## š Storage Structure
 
package/dist/commands/benchmark.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"benchmark.d.ts","sourceRoot":"","sources":["../../src/commands/benchmark.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;
+{"version":3,"file":"benchmark.d.ts","sourceRoot":"","sources":["../../src/commands/benchmark.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAapC,eAAO,MAAM,gBAAgB,SAE5B,CAAC"}
package/dist/commands/benchmark.js
CHANGED
@@ -1,4 +1,37 @@
 "use strict";
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+      desc = { enumerable: true, get: function() { return m[k]; } };
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+    Object.defineProperty(o, "default", { enumerable: true, value: v });
+}) : function(o, v) {
+    o["default"] = v;
+});
+var __importStar = (this && this.__importStar) || (function () {
+    var ownKeys = function(o) {
+        ownKeys = Object.getOwnPropertyNames || function (o) {
+            var ar = [];
+            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+            return ar;
+        };
+        return ownKeys(o);
+    };
+    return function (mod) {
+        if (mod && mod.__esModule) return mod;
+        var result = {};
+        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+        __setModuleDefault(result, mod);
+        return result;
+    };
+})();
 var __importDefault = (this && this.__importDefault) || function (mod) {
     return (mod && mod.__esModule) ? mod : { "default": mod };
 };
@@ -7,6 +40,7 @@ exports.benchmarkCommand = void 0;
 const commander_1 = require("commander");
 const chalk_1 = __importDefault(require("chalk"));
 const benchmark_1 = require("../services/benchmark");
+const workflow_prompts_1 = require("../utils/workflow-prompts");
 exports.benchmarkCommand = new commander_1.Command("benchmark").description("Run and manage AI benchmarks");
 // Helper to parse model string
 // Format: provider:model[:reasoning=<tokens>]
@@ -225,3 +259,311 @@ exports.benchmarkCommand
     }));
     console.table(table);
 });
+exports.benchmarkCommand
+    .command("workflow")
+    .description("Benchmark complete workflow execution across multiple models")
+    .requiredOption("--models <list>", "Comma-separated list of models (provider:model[:reasoning=<tokens>])")
+    .option("--concurrency <number>", "Max concurrent requests", "3")
+    .option("--delay <number>", "Delay between requests in ms", "1000")
+    // Inherit all workflow command options
+    .option("--stream", "Show streaming AI output")
+    .option("--skip-all", "Skip all optional steps (use defaults)")
+    .option("--auto-accept", "Auto-accept all AI suggestions")
+    .option("--config-file <path>", "Load workflow options from JSON file")
+    // Step 1: Initialize
+    .option("--skip-init", "Skip initialization step")
+    .option("--project-name <name>", "Project name")
+    .option("--init-method <method>", "Initialization method: quick, custom, ai")
+    .option("--project-description <desc>", "Project description for AI-assisted init")
+    .option("--frontend <framework>", "Frontend framework")
+    .option("--backend <framework>", "Backend framework")
+    .option("--database <db>", "Database choice")
+    .option("--auth", "Include authentication")
+    .option("--no-auth", "Exclude authentication")
+    .option("--bootstrap", "Bootstrap with Better-T-Stack")
+    .option("--no-bootstrap", "Skip bootstrapping")
+    // Step 2: Define PRD
+    .option("--skip-prd", "Skip PRD definition")
+    .option("--prd-method <method>", "PRD method: upload, manual, ai, skip")
+    .option("--prd-file <path>", "Path to existing PRD file")
+    .option("--prd-description <desc>", "Product description for AI-assisted PRD")
+    .option("--prd-content <content>", "Direct PRD content")
+    // Step 3: Refine PRD
+    .option("--skip-refine", "Skip PRD refinement")
+    .option("--refine-method <method>", "Refinement method: manual, ai, skip")
+    .option("--refine-feedback <feedback>", "Feedback for AI refinement")
+    // Step 4: Generate Tasks
+    .option("--skip-generate", "Skip task generation")
+    .option("--generate-method <method>", "Generation method: standard, ai")
+    .option("--generate-instructions <instructions>", "Custom task generation instructions")
+    // Step 5: Split Tasks
+    .option("--skip-split", "Skip task splitting")
+    .option("--split-tasks <ids>", "Comma-separated task IDs to split")
+    .option("--split-all", "Split all tasks")
+    .option("--split-method <method>", "Split method: interactive, standard, custom")
+    .option("--split-instructions <instructions>", "Custom split instructions")
+    .action(async (options) => {
+    try {
+        await runWorkflowBenchmark(options);
+    }
+    catch (error) {
+        console.error(chalk_1.default.red("Workflow benchmark failed:"), error.message);
+        process.exit(1);
+    }
+});
+/**
+ * Execute workflow benchmark across multiple models
+ */
+async function runWorkflowBenchmark(options) {
+    console.log(chalk_1.default.blue.bold("\nš Task-O-Matic Workflow Benchmark\n"));
+    // Parse models
+    const modelStrings = options.models.split(",");
+    const models = modelStrings.map((s) => parseModelString(s.trim()));
+    const config = {
+        models,
+        concurrency: parseInt(options.concurrency, 10),
+        delay: parseInt(options.delay, 10),
+    };
+    console.log(chalk_1.default.dim(`Models: ${models.length}, Concurrency: ${config.concurrency}, Delay: ${config.delay}ms`));
+    // Phase 1: Collect user responses interactively
+    console.log(chalk_1.default.blue.bold("\nš Phase 1: Collecting Workflow Responses\n"));
+    console.log(chalk_1.default.gray("Please answer the following questions. Your responses will be used for all models."));
+    const collectedResponses = await collectWorkflowResponses(options);
+    // Phase 2: Execute workflow on all models
+    console.log(chalk_1.default.blue.bold("\n⚡ Phase 2: Executing Workflows\n"));
+    console.log(chalk_1.default.gray(`Running workflow on ${models.length} models...\n`));
+    // Prepare workflow input
+    const workflowInput = {
+        collectedResponses,
+        workflowOptions: options,
+        tempDirBase: "/tmp",
+    };
+    // Prepare dashboard
+    console.log(chalk_1.default.bold("Benchmark Progress:"));
+    const modelMap = new Map();
+    const modelStatus = new Map();
+    // Print initial lines and map indices
+    models.forEach((m, i) => {
+        const id = `${m.provider}:${m.model}${m.reasoningTokens ? `:reasoning=${m.reasoningTokens}` : ""}`;
+        modelMap.set(id, i);
+        modelStatus.set(id, "Waiting...");
+        console.log(chalk_1.default.dim(`- ${id}: Waiting...`));
+    });
+    const totalModels = models.length;
+    const run = await benchmark_1.benchmarkService.runBenchmark("workflow-full", workflowInput, config, (event) => {
+        const index = modelMap.get(event.modelId);
+        if (index === undefined)
+            return;
+        // Update status in memory
+        let statusStr = "";
+        if (event.type === "start") {
+            statusStr = chalk_1.default.yellow("Starting...");
+        }
+        else if (event.type === "progress") {
+            statusStr = chalk_1.default.blue("Running workflow...");
+        }
+        else if (event.type === "complete") {
+            statusStr = chalk_1.default.green(`Completed (${event.duration}ms)`);
+        }
+        else if (event.type === "error") {
+            statusStr = chalk_1.default.red(`Failed: ${event.error}`);
+        }
+        modelStatus.set(event.modelId, statusStr);
+        // Update display
+        const up = totalModels - index;
+        process.stdout.write(`\x1B[${up}A`); // Move up
+        process.stdout.write(`\x1B[2K`); // Clear line
+        process.stdout.write(`- ${chalk_1.default.bold(event.modelId)}: ${statusStr}\r`);
+        process.stdout.write(`\x1B[${up}B`); // Move down
+    });
+    console.log(chalk_1.default.green(`\n✅ Workflow benchmark completed! Run ID: ${run.id}`));
+    // Display results
+    await displayWorkflowBenchmarkResults(run);
+    // Optional: Let user select a model for project setup
+    await promptForModelSelection(run, collectedResponses);
+}
+/**
+ * Collect workflow responses from user interactively
+ */
+async function collectWorkflowResponses(options) {
+    // Use provided options or prompt user
+    const getOrPrompt = async (preAnswered, promptFn, skipCondition = false) => {
+        if (skipCondition) {
+            throw new Error("Step skipped");
+        }
+        if (preAnswered !== undefined) {
+            return preAnswered;
+        }
+        return promptFn();
+    };
+    // Project setup questions
+    const projectName = await getOrPrompt(options.projectName, () => (0, workflow_prompts_1.textInputPrompt)("What is the name of your project?", "my-benchmark-project"));
+    const initMethod = await getOrPrompt(options.initMethod, () => (0, workflow_prompts_1.selectPrompt)("How would you like to configure your project stack?", [
+        { name: "Quick start (recommended defaults)", value: "quick" },
+        { name: "Custom configuration", value: "custom" },
+        { name: "AI-assisted (describe your project)", value: "ai" },
+    ]));
+    let projectDescription;
+    if (initMethod === "ai") {
+        projectDescription = await getOrPrompt(options.projectDescription, () => (0, workflow_prompts_1.textInputPrompt)("Describe your project (e.g., 'A SaaS app for team collaboration'):"));
+    }
+    // Stack configuration (for custom method)
+    let stackConfig = {};
+    if (initMethod === "custom") {
+        stackConfig.frontend = await getOrPrompt(options.frontend, () => (0, workflow_prompts_1.selectPrompt)("Frontend framework:", ["next", "react", "vue", "svelte"]));
+        stackConfig.backend = await getOrPrompt(options.backend, () => (0, workflow_prompts_1.selectPrompt)("Backend framework:", ["hono", "express", "fastify", "nestjs"]));
+        stackConfig.database = await getOrPrompt(options.database, () => (0, workflow_prompts_1.selectPrompt)("Database:", ["sqlite", "postgres", "mysql", "mongodb"]));
+        stackConfig.auth = await getOrPrompt(options.auth, () => (0, workflow_prompts_1.confirmPrompt)("Include authentication?", true));
+    }
+    // PRD questions
+    const prdMethod = await getOrPrompt(options.prdMethod, () => (0, workflow_prompts_1.selectPrompt)("How would you like to define your PRD?", [
+        { name: "AI-assisted creation", value: "ai" },
+        { name: "Upload existing file", value: "upload" },
+        { name: "Write manually", value: "manual" },
+        { name: "Skip PRD", value: "skip" },
+    ]));
+    let prdDescription;
+    let prdFile;
+    let prdContent;
+    if (prdMethod === "ai") {
+        prdDescription = await getOrPrompt(options.prdDescription, () => (0, workflow_prompts_1.textInputPrompt)("Describe your product in detail:"));
+    }
+    else if (prdMethod === "upload") {
+        prdFile = await getOrPrompt(options.prdFile, () => (0, workflow_prompts_1.textInputPrompt)("Path to PRD file:"));
+    }
+    else if (prdMethod === "manual") {
+        prdContent = await getOrPrompt(options.prdContent, () => (0, workflow_prompts_1.editorPrompt)("Write your PRD:", "# Product Requirements Document\n\n## Overview\n\n## Features\n\n"));
+    }
+    // Additional workflow questions
+    const refinePrd = !options.skipRefine && prdMethod !== "skip" ?
+        await (0, workflow_prompts_1.confirmPrompt)("Refine PRD with AI feedback?", false) : false;
+    let refineFeedback;
+    if (refinePrd) {
+        refineFeedback = await getOrPrompt(options.refineFeedback, () => (0, workflow_prompts_1.textInputPrompt)("What feedback should be used for PRD refinement?", "Add more technical details and clarify requirements"));
+    }
+    const generateTasks = !options.skipGenerate && prdMethod !== "skip";
+    const customInstructions = options.generateInstructions ||
+        (generateTasks ? await (0, workflow_prompts_1.textInputPrompt)("Custom task generation instructions (optional):", "") : undefined);
+    const splitTasks = !options.skipSplit && generateTasks ?
+        await (0, workflow_prompts_1.confirmPrompt)("Split complex tasks into subtasks?", true) : false;
+    const splitInstructions = splitTasks && options.splitInstructions ?
+        options.splitInstructions :
+        (splitTasks ? await (0, workflow_prompts_1.textInputPrompt)("Custom splitting instructions (optional):", "Break into 2-4 hour chunks") : undefined);
+    return {
+        projectName,
+        initMethod: initMethod,
+        projectDescription,
+        stackConfig,
+        prdMethod: prdMethod,
+        prdContent,
+        prdDescription,
+        prdFile,
+        refinePrd,
+        refineFeedback,
+        generateTasks,
+        customInstructions,
+        splitTasks,
+        splitInstructions,
+    };
+}
+/**
+ * Display workflow benchmark results in a comprehensive format
+ */
+async function displayWorkflowBenchmarkResults(run) {
+    console.log(chalk_1.default.bold("\nš Workflow Benchmark Results\n"));
+    // Summary table
+    console.log(chalk_1.default.bold(`${"Model".padEnd(40)} | ${"Duration".padEnd(10)} | ${"Tasks".padEnd(8)} | ${"PRD Size".padEnd(10)} | ${"Steps".padEnd(8)} | ${"Cost".padEnd(10)}`));
+    console.log("-".repeat(130));
+    run.results.forEach((r) => {
+        const duration = `${r.duration}ms`.padEnd(10);
+        const taskCount = r.output?.stats?.totalTasks || 0;
+        const tasks = `${taskCount}`.padEnd(8);
+        const prdSize = r.output?.stats?.prdSize ? `${r.output.stats.prdSize} chars`.padEnd(10) : "-".padEnd(10);
+        const steps = r.output?.stats ? `${r.output.stats.successfulSteps}/${r.output.stats.totalSteps}`.padEnd(8) : "-".padEnd(8);
+        const cost = r.cost ? `$${r.cost.toFixed(6)}`.padEnd(10) : "-".padEnd(10);
+        console.log(`${r.modelId.padEnd(40)} | ${duration} | ${tasks} | ${prdSize} | ${steps} | ${cost}`);
+        if (r.error) {
+            console.log(chalk_1.default.red(` Error: ${r.error}`));
+        }
+    });
+    // Detailed comparison
+    console.log(chalk_1.default.bold("\nš Detailed Comparison\n"));
+    run.results.forEach((r, index) => {
+        if (r.error)
+            return;
+        console.log(chalk_1.default.cyan(`\n[${index + 1}] ${r.modelId}`));
+        console.log(`Duration: ${r.duration}ms`);
+        if (r.output?.stats) {
+            const stats = r.output.stats;
+            console.log(`Steps Completed: ${stats.successfulSteps}/${stats.totalSteps}`);
+            if (stats.initDuration)
+                console.log(` Init: ${stats.initDuration}ms`);
+            if (stats.prdGenerationDuration)
+                console.log(` PRD Generation: ${stats.prdGenerationDuration}ms`);
+            if (stats.taskGenerationDuration)
+                console.log(` Task Generation: ${stats.taskGenerationDuration}ms`);
+            if (stats.taskSplittingDuration)
+                console.log(` Task Splitting: ${stats.taskSplittingDuration}ms`);
+            console.log(`Tasks Created: ${stats.totalTasks}`);
+            if (stats.tasksWithSubtasks)
+                console.log(`Tasks with Subtasks: ${stats.tasksWithSubtasks}`);
+            if (stats.prdSize)
+                console.log(`PRD Size: ${stats.prdSize} characters`);
+        }
+        if (r.tokenUsage) {
+            console.log(`Tokens: ${r.tokenUsage.total} (Prompt: ${r.tokenUsage.prompt}, Completion: ${r.tokenUsage.completion})`);
+        }
+        if (r.cost) {
+            console.log(`Cost: $${r.cost.toFixed(6)}`);
+        }
+    });
+}
+/**
+ * Allow user to select a model and set up project with its results
+ */
+async function promptForModelSelection(run, responses) {
+    const successfulResults = run.results.filter((r) => !r.error);
+    if (successfulResults.length === 0) {
+        console.log(chalk_1.default.yellow("\n⚠️ No successful results to select from."));
+        return;
+    }
+    if (successfulResults.length === 1) {
+        console.log(chalk_1.default.green(`\n✅ Only one successful result from ${successfulResults[0].modelId}`));
+        return;
+    }
+    console.log(chalk_1.default.blue.bold("\nšÆ Model Selection\n"));
+    const shouldSelect = await (0, workflow_prompts_1.confirmPrompt)("Would you like to select a model and set up your project with its results?", false);
+    if (!shouldSelect) {
+        console.log(chalk_1.default.gray("Benchmark complete. Results have been saved."));
+        return;
+    }
+    const choices = successfulResults.map((r, index) => ({
+        name: `${r.modelId} (${r.duration}ms, ${r.output?.stats?.totalTasks || 0} tasks, $${r.cost?.toFixed(6) || 'unknown'})`,
+        value: index,
+    }));
+    const selectedIndex = await (0, workflow_prompts_1.selectPrompt)("Select the model whose results you want to use for your project:", choices);
+    const selectedResult = successfulResults[selectedIndex];
+    console.log(chalk_1.default.green(`\n✅ Selected: ${selectedResult.modelId}`));
+    console.log(chalk_1.default.gray("Setting up your project with the selected results..."));
+    // Get target directory
+    const targetDir = await (0, workflow_prompts_1.textInputPrompt)("Enter target directory for your project:", `./${responses.projectName}`);
+    try {
+        console.log(chalk_1.default.cyan("\nš§ Applying benchmark results..."));
+        const { workflowBenchmarkService } = await Promise.resolve().then(() => __importStar(require("../services/workflow-benchmark")));
+        const result = await workflowBenchmarkService.applyBenchmarkResult(selectedResult, targetDir, responses);
+        if (result.success) {
+            console.log(chalk_1.default.green(`\n✅ ${result.message}`));
+            console.log(chalk_1.default.cyan("\nNext steps:"));
+            console.log(chalk_1.default.gray(` • Navigate to: cd ${targetDir}`));
+            console.log(chalk_1.default.gray(" • Review your tasks: task-o-matic tasks list"));
+            console.log(chalk_1.default.gray(" • View task tree: task-o-matic tasks tree"));
+            console.log(chalk_1.default.gray(" • Start working: task-o-matic tasks next"));
+        }
+        else {
+            console.log(chalk_1.default.red(`\n❌ ${result.message}`));
+        }
+    }
+    catch (error) {
+        console.log(chalk_1.default.red(`\n❌ Failed to apply results: ${error instanceof Error ? error.message : String(error)}`));
+    }
+}
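The progress dashboard in the hunk above rewrites each model's status line in place using raw ANSI escape sequences (`ESC[nA` cursor up, `ESC[2K` clear line, `ESC[nB` cursor down). The following stripped-down TypeScript sketch shows the same terminal technique in isolation; it is illustrative only, and `updateLine` is not a task-o-matic API.

```ts
// Print one status line per model, then rewrite individual lines in place.
const ids = ["openai:gpt-4o", "anthropic:claude-3-5-sonnet"];
ids.forEach((id) => console.log(`- ${id}: Waiting...`));

function updateLine(index: number, status: string): void {
  const up = ids.length - index;                         // rows between cursor and target line
  process.stdout.write(`\x1B[${up}A`);                   // move up to the target line
  process.stdout.write(`\x1B[2K`);                       // clear it
  process.stdout.write(`- ${ids[index]}: ${status}\r`);  // rewrite it, return carriage
  process.stdout.write(`\x1B[${up}B`);                   // move back down below the block
}

updateLine(0, "Running workflow...");
updateLine(1, "Completed (42156ms)");
```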
package/dist/commands/tasks/execute-loop.d.ts.map
ADDED
@@ -0,0 +1 @@
+{"version":3,"file":"execute-loop.d.ts","sourceRoot":"","sources":["../../../src/commands/tasks/execute-loop.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAiDpC,eAAO,MAAM,kBAAkB,SAoI3B,CAAC"}