@bradtaylorsf/alpha-loop 1.9.1 → 1.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/README.md +10 -1
  2. package/dist/cli.js +15 -3
  3. package/dist/cli.js.map +1 -1
  4. package/dist/commands/eval.d.ts +13 -0
  5. package/dist/commands/eval.js +220 -3
  6. package/dist/commands/eval.js.map +1 -1
  7. package/dist/commands/init.js +32 -0
  8. package/dist/commands/init.js.map +1 -1
  9. package/dist/commands/review.js +36 -1
  10. package/dist/commands/review.js.map +1 -1
  11. package/dist/lib/config.d.ts +6 -0
  12. package/dist/lib/config.js +15 -0
  13. package/dist/lib/config.js.map +1 -1
  14. package/dist/lib/eval-export.d.ts +26 -0
  15. package/dist/lib/eval-export.js +200 -0
  16. package/dist/lib/eval-export.js.map +1 -0
  17. package/dist/lib/eval-runner.d.ts +11 -0
  18. package/dist/lib/eval-runner.js +122 -10
  19. package/dist/lib/eval-runner.js.map +1 -1
  20. package/dist/lib/eval.d.ts +11 -0
  21. package/dist/lib/eval.js +34 -4
  22. package/dist/lib/eval.js.map +1 -1
  23. package/dist/lib/pipeline.d.ts +4 -0
  24. package/dist/lib/pipeline.js +29 -7
  25. package/dist/lib/pipeline.js.map +1 -1
  26. package/dist/lib/score.js +2 -1
  27. package/dist/lib/score.js.map +1 -1
  28. package/dist/lib/templates.d.ts +11 -0
  29. package/dist/lib/templates.js +27 -0
  30. package/dist/lib/templates.js.map +1 -1
  31. package/dist/lib/verify.d.ts +16 -0
  32. package/dist/lib/verify.js +49 -1
  33. package/dist/lib/verify.js.map +1 -1
  34. package/package.json +1 -1
  35. package/templates/evals/cases/step/review/006-missing-di-injection/checks.yaml +10 -0
  36. package/templates/evals/cases/step/review/006-missing-di-injection/input.md +50 -0
  37. package/templates/evals/cases/step/review/006-missing-di-injection/metadata.yaml +10 -0
  38. package/templates/evals/cases/step/review/007-silent-none-guard/checks.yaml +12 -0
  39. package/templates/evals/cases/step/review/007-silent-none-guard/input.md +53 -0
  40. package/templates/evals/cases/step/review/007-silent-none-guard/metadata.yaml +10 -0
  41. package/templates/evals/cases/step/review/008-route-shadowing/checks.yaml +11 -0
  42. package/templates/evals/cases/step/review/008-route-shadowing/input.md +29 -0
  43. package/templates/evals/cases/step/review/008-route-shadowing/metadata.yaml +10 -0
  44. package/templates/evals/cases/step/review/009-unthreaded-dependency/checks.yaml +11 -0
  45. package/templates/evals/cases/step/review/009-unthreaded-dependency/input.md +57 -0
  46. package/templates/evals/cases/step/review/009-unthreaded-dependency/metadata.yaml +10 -0
  47. package/templates/evals/cases/step/review/010-fabricated-metrics/checks.yaml +12 -0
  48. package/templates/evals/cases/step/review/010-fabricated-metrics/input.md +52 -0
  49. package/templates/evals/cases/step/review/010-fabricated-metrics/metadata.yaml +10 -0
@@ -0,0 +1,10 @@
1
+ id: 006-missing-di-injection
2
+ description: Review should flag a service that is defined but never injected into its consumer
3
+ type: step
4
+ step: review
5
+ tags:
6
+ - wiring
7
+ - di
8
+ - quality-failure
9
+ source: distribution
10
+ timeout: 60
@@ -0,0 +1,12 @@
1
+ - type: llm_judge
2
+ prompt: |
3
+ The diff adds NotificationService but makes it nullable (| null = null) in
4
+ OrderService. The null guard means if notifications is never injected, the
5
+ feature silently does nothing. The test creates OrderService without
6
+ notifications and "passes" because it never asserts notification behavior.
7
+ The review MUST flag: (1) the null default hides missing injection, and
8
+ (2) the test doesn't actually test notification sending.
9
+ rubric: |
10
+ Score 1.0 if the review identifies both the null-default hiding missing injection AND the test not asserting notification behavior.
11
+ Score 0.5 if the review flags only one of the two issues.
12
+ Score 0.0 if the review misses both issues.
@@ -0,0 +1,53 @@
1
+ ## Issue: Add notification service for order status changes
2
+
3
+ ### Diff
4
+
5
+ ```diff
6
+ +++ b/src/services/notification-service.ts
7
+ @@ -0,0 +1,15 @@
8
+ +export class NotificationService {
9
+ + constructor(private emailClient: EmailClient) {}
10
+ +
11
+ + async notify(userId: string, message: string): Promise<void> {
12
+ + await this.emailClient.send(userId, message);
13
+ + }
14
+ +}
15
+
16
+ +++ b/src/services/order-service.ts
17
+ @@ -3,7 +3,13 @@
18
+ export class OrderService {
19
+ - constructor(private orderRepo: OrderRepository) {}
20
+ + constructor(
21
+ + private orderRepo: OrderRepository,
22
+ + private notifications: NotificationService | null = null,
23
+ + ) {}
24
+
25
+ async updateStatus(orderId: string, status: OrderStatus): Promise<void> {
26
+ const order = await this.orderRepo.findById(orderId);
27
+ order.status = status;
28
+ await this.orderRepo.save(order);
29
+ +
30
+ + if (this.notifications !== null) {
31
+ + await this.notifications.notify(order.userId, `Order ${orderId} is now ${status}`);
32
+ + }
33
+ }
34
+
35
+ +++ b/tests/order-service.test.ts
36
+ @@ -10,6 +10,14 @@
37
+ it('updates order status', async () => {
38
+ const service = new OrderService(mockRepo);
39
+ await service.updateStatus('ord-1', 'shipped');
40
+ expect(mockRepo.save).toHaveBeenCalled();
41
+ });
42
+ +
43
+ + it('sends notification on status change', async () => {
44
+ + const service = new OrderService(mockRepo);
45
+ + await service.updateStatus('ord-1', 'shipped');
46
+ + // Test passes because notifications is null by default
47
+ + // No assertion on notification behavior
48
+ + });
49
+ ```
50
+
51
+ ### Analysis Required
52
+
53
+ Review this diff. The issue asked for notifications on order status changes.
@@ -0,0 +1,10 @@
1
+ id: 007-silent-none-guard
2
+ description: Review should flag optional parameters with None guards that silently disable features
3
+ type: step
4
+ step: review
5
+ tags:
6
+ - wiring
7
+ - none-guard
8
+ - quality-failure
9
+ source: distribution
10
+ timeout: 60
@@ -0,0 +1,11 @@
1
+ - type: llm_judge
2
+ prompt: |
3
+ The diff registers /evals/compare and /evals/summary AFTER /evals/:id.
4
+ In Express (and other routers that match in registration order), the parameterized route /evals/:id will
5
+ match "compare" and "summary" as id values before the static routes
6
+ are reached. The review MUST flag that static routes need to be
7
+ registered BEFORE parameterized routes to avoid shadowing.
8
+ rubric: |
9
+ Score 1.0 if the review identifies route shadowing — that /evals/:id will catch /evals/compare and /evals/summary.
10
+ Score 0.5 if the review mentions route ordering concerns but doesn't identify the specific shadowing.
11
+ Score 0.0 if the review misses the route shadowing issue entirely.
@@ -0,0 +1,29 @@
1
+ ## Issue: Add eval comparison endpoint
2
+
3
+ ### Diff
4
+
5
+ ```diff
6
+ +++ b/src/routes/evals.ts
7
+ @@ -5,6 +5,11 @@
8
+ const router = express.Router();
9
+
10
+ router.get('/evals/:id', async (req, res) => {
11
+ const eval = await evalService.findById(req.params.id);
12
+ res.json(eval);
13
+ });
14
+
15
+ +router.get('/evals/compare', async (req, res) => {
16
+ + const { run1, run2 } = req.query;
17
+ + const comparison = await evalService.compare(String(run1), String(run2));
18
+ + res.json(comparison);
19
+ +});
20
+ +
21
+ +router.get('/evals/summary', async (req, res) => {
22
+ + const summary = await evalService.getSummary();
23
+ + res.json(summary);
24
+ +});
25
+ ```
26
+
27
+ ### Analysis Required
28
+
29
+ Review this diff. The issue asked to add an eval comparison endpoint.
@@ -0,0 +1,10 @@
1
+ id: 008-route-shadowing
2
+ description: Review should flag static routes registered after parameterized routes causing shadowing
3
+ type: step
4
+ step: review
5
+ tags:
6
+ - wiring
7
+ - routing
8
+ - quality-failure
9
+ source: distribution
10
+ timeout: 60
@@ -0,0 +1,11 @@
1
+ - type: llm_judge
2
+ prompt: |
3
+ The diff creates an analytics dashboard that reads from a tool_executions
4
+ table via AnalyticsRepository. But nothing in the diff (or mentioned as
5
+ existing) WRITES to the tool_executions table. The dashboard will always
6
+ show empty data because there is no producer for the data it consumes.
7
+ The review MUST flag that the data source has no writer.
8
+ rubric: |
9
+ Score 1.0 if the review identifies that nothing writes to tool_executions — the dashboard has no data source.
10
+ Score 0.5 if the review raises general concerns about data availability but doesn't identify the specific missing write path.
11
+ Score 0.0 if the review misses the unthreaded dependency entirely.
@@ -0,0 +1,57 @@
1
+ ## Issue: Add analytics dashboard showing tool execution metrics
2
+
3
+ ### Diff
4
+
5
+ ```diff
6
+ +++ b/src/routes/analytics.ts
7
+ @@ -0,0 +1,20 @@
8
+ +import { AnalyticsRepository } from '../repos/analytics-repo';
9
+ +
10
+ +const router = express.Router();
11
+ +
12
+ +router.get('/analytics/tools', async (req, res) => {
13
+ + const metrics = await analyticsRepo.getToolMetrics();
14
+ + res.json(metrics);
15
+ +});
16
+ +
17
+ +router.get('/analytics/tools/:toolId', async (req, res) => {
18
+ + const history = await analyticsRepo.getToolHistory(req.params.toolId);
19
+ + res.json(history);
20
+ +});
21
+
22
+ +++ b/src/repos/analytics-repo.ts
23
+ @@ -0,0 +1,18 @@
24
+ +export class AnalyticsRepository {
25
+ + constructor(private db: Database) {}
26
+ +
27
+ + async getToolMetrics(): Promise<ToolMetric[]> {
28
+ + return this.db.query('SELECT tool_name, count(*) as runs, avg(duration) as avg_duration FROM tool_executions GROUP BY tool_name');
29
+ + }
30
+ +
31
+ + async getToolHistory(toolId: string): Promise<ToolExecution[]> {
32
+ + return this.db.query('SELECT * FROM tool_executions WHERE tool_name = ? ORDER BY created_at DESC LIMIT 50', [toolId]);
33
+ + }
34
+ +}
35
+
36
+ +++ b/src/components/AnalyticsDashboard.tsx
37
+ @@ -0,0 +1,30 @@
38
+ +export function AnalyticsDashboard() {
39
+ + const { data: metrics } = useQuery('/analytics/tools');
40
+ + return (
41
+ + <div>
42
+ + <h1>Tool Analytics</h1>
43
+ + {metrics?.map(m => (
44
+ + <div key={m.tool_name}>
45
+ + <span>{m.tool_name}</span>
46
+ + <span>{m.runs} runs</span>
47
+ + <span>{m.avg_duration}ms avg</span>
48
+ + </div>
49
+ + ))}
50
+ + </div>
51
+ + );
52
+ +}
53
+ ```
54
+
55
+ ### Analysis Required
56
+
57
+ Review this diff. The issue asked for an analytics dashboard showing tool execution metrics.
@@ -0,0 +1,10 @@
1
+ id: 009-unthreaded-dependency
2
+ description: Review should flag a data consumer that reads from a source nothing writes to
3
+ type: step
4
+ step: review
5
+ tags:
6
+ - wiring
7
+ - data-flow
8
+ - quality-failure
9
+ source: distribution
10
+ timeout: 60
@@ -0,0 +1,12 @@
1
+ - type: llm_judge
2
+ prompt: |
3
+ The diff estimates token counts from output string length (Math.ceil(output.length / 4)) and
4
+ doubles it for input tokens. These are fabricated metrics displayed as
5
+ real data in the CostDashboard. The review MUST flag: (1) token counts
6
+ are estimated, not real API responses, (2) the cost calculation uses
7
+ stale hardcoded rates, and (3) the dashboard presents estimated data
8
+ as if it were actual usage metrics.
9
+ rubric: |
10
+ Score 1.0 if the review identifies that token counts are estimated (not from API responses) and flags this as misleading when displayed to users.
11
+ Score 0.5 if the review mentions estimation concerns but doesn't clearly flag it as fabricated metrics.
12
+ Score 0.0 if the review accepts the estimation approach without flagging data accuracy concerns.
@@ -0,0 +1,52 @@
1
+ ## Issue: Add cost tracking to pipeline runs
2
+
3
+ ### Diff
4
+
5
+ ```diff
6
+ +++ b/src/services/cost-tracker.ts
7
+ @@ -0,0 +1,22 @@
8
+ +export class CostTracker {
9
+ + async trackRun(runId: string, model: string, output: string): Promise<CostRecord> {
10
+ + // Estimate tokens from output length
11
+ + const estimatedTokens = Math.ceil(output.length / 4);
12
+ + const estimatedInputTokens = estimatedTokens * 2;
13
+ +
14
+ + const cost = this.calculateCost(model, estimatedInputTokens, estimatedTokens);
15
+ +
16
+ + return {
17
+ + runId,
18
+ + model,
19
+ + inputTokens: estimatedInputTokens,
20
+ + outputTokens: estimatedTokens,
21
+ + costUsd: cost,
22
+ + };
23
+ + }
24
+ +
25
+ + private calculateCost(model: string, input: number, output: number): number {
26
+ + const rates: Record<string, number> = { 'gpt-4': 0.03, 'claude': 0.015 };
27
+ + return (input + output) * (rates[model] ?? 0.01) / 1000;
28
+ + }
29
+ +}
30
+
31
+ +++ b/src/components/CostDashboard.tsx
32
+ @@ -0,0 +1,15 @@
33
+ +export function CostDashboard({ runs }: { runs: CostRecord[] }) {
34
+ + const totalCost = runs.reduce((sum, r) => sum + r.costUsd, 0);
35
+ + return (
36
+ + <div>
37
+ + <h2>Cost Summary</h2>
38
+ + <p>Total: ${totalCost.toFixed(2)}</p>
39
+ + {runs.map(r => (
40
+ + <div key={r.runId}>
41
+ + <span>{r.model}: {r.inputTokens} in / {r.outputTokens} out</span>
42
+ + <span>${r.costUsd.toFixed(4)}</span>
43
+ + </div>
44
+ + ))}
45
+ + </div>
46
+ + );
47
+ +}
48
+ ```
49
+
50
+ ### Analysis Required
51
+
52
+ Review this diff. The issue asked for cost tracking of pipeline runs.
@@ -0,0 +1,10 @@
1
+ id: 010-fabricated-metrics
2
+ description: Review should flag hardcoded or estimated values displayed as real metrics
3
+ type: step
4
+ step: review
5
+ tags:
6
+ - wiring
7
+ - metrics
8
+ - quality-failure
9
+ source: distribution
10
+ timeout: 60