@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
@@ -0,0 +1,639 @@
1
+ ---
2
+ # Thunderdome Scenario Schema
3
+ # Version 1.0 - Battle scenario format for persona benchmarking
4
+ #
5
+ # Thunderdome uses competitive "duels" as a fun framing for rigorous
6
+ # persona performance evaluation. Scenarios define challenges that
7
+ # measure how different personas approach the same problem.
8
+
9
+ schema:
10
+ version: "1.0"
11
+
12
+ # ============================================================================
13
+ # Required Fields
14
+ # ============================================================================
15
+
16
+ required:
17
+ name:
18
+ type: string
19
+ format: kebab-case
20
+ description: "Unique scenario identifier"
21
+ example: "code-review-user-service"
22
+
23
+ title:
24
+ type: string
25
+ description: "Human-readable scenario name"
26
+ example: "User Service Code Review"
27
+
28
+ category:
29
+ type: enum
30
+ values: [code-review, architecture, dev, tea, sm, pm, reviewer, general]
31
+ description: "Which agent type this scenario targets"
32
+
33
+ difficulty:
34
+ type: enum
35
+ values: [easy, medium, hard, extreme]
36
+ description: "Difficulty level"
37
+
38
+ prompt:
39
+ type: string
40
+ multiline: true
41
+ description: "The challenge presented to contestants"
42
+
43
+ # ============================================================================
44
+ # Optional Metadata
45
+ # ============================================================================
46
+
47
+ optional:
48
+ id:
49
+ type: string
50
+ description: "Short ID for reference (e.g., cr-001, arch-002)"
51
+
52
+ version:
53
+ type: string
54
+ default: "1.0"
55
+ description: "Scenario version for iteration tracking"
56
+
57
+ description:
58
+ type: string
59
+ description: "What this scenario tests"
60
+
61
+ purpose:
62
+ type: string
63
+ multiline: true
64
+ description: "Detailed explanation of what's being measured"
65
+
66
+ tags:
67
+ type: array
68
+ items: string
69
+ description: "Categorization tags"
70
+
71
+ constraints:
72
+ type: array
73
+ items: string
74
+ description: "Rules contestants must follow"
75
+
76
+ context:
77
+ type: string
78
+ multiline: true
79
+ description: "Additional context for contestants"
80
+
81
+ # ============================================================================
82
+ # Code-Based Scenarios
83
+ # For code review, debugging, implementation challenges
84
+ # ============================================================================
85
+
86
+ code_content:
87
+ code:
88
+ type: object
89
+ description: "Code to review/fix/implement"
90
+ schema:
91
+ language: string
92
+ filename: string
93
+ content: string (multiline)
94
+
95
+ tests:
96
+ type: object
97
+ description: "Test suite for TDD scenarios"
98
+ schema:
99
+ language: string
100
+ filename: string
101
+ content: string (multiline)
102
+
103
+ stub:
104
+ type: object
105
+ description: "Starter code for implementation"
106
+ schema:
107
+ language: string
108
+ filename: string
109
+ content: string (multiline)
110
+
111
+ # ============================================================================
112
+ # TRAIL Error Taxonomy (Epic 14)
113
+ # Categorizes errors for OCEAN personality correlation research
114
+ # Based on Patronus AI's TRAIL benchmark error categories
115
+ # ============================================================================
116
+
117
+ error_type:
118
+ type: string
119
+ enum: [reasoning, planning, execution]
120
+ required: false
121
+ description: "TRAIL error category for OCEAN correlation analysis"
122
+ categories:
123
+ reasoning:
124
+ description: "Logic and decision-making failures"
125
+ examples: ["incorrect inferences", "contradictions", "false assumptions", "circular logic"]
126
+ planning:
127
+ description: "Task orchestration and coordination failures"
128
+ examples: ["sequencing errors", "dependency gaps", "resource misallocation", "incomplete plans"]
129
+ execution:
130
+ description: "System and tool interaction failures"
131
+ examples: ["timeouts", "context overflow", "tool misuse", "API errors"]
132
+
133
+ # ============================================================================
134
+ # Baseline Issues (for code review/debugging scenarios)
135
+ # Known issues seeded in the code - NOT shown to contestants
136
+ # Used to measure detection rate
137
+ # ============================================================================
138
+
139
+ baseline_issues:
140
+ type: object
141
+ description: "Known issues for scoring detection rate"
142
+ schema:
143
+ critical:
144
+ type: array
145
+ items:
146
+ id: string
147
+ location: string
148
+ description: string
149
+ error_type: string (optional, enum: reasoning|planning|execution)
150
+ high:
151
+ type: array
152
+ items:
153
+ id: string
154
+ location: string
155
+ description: string
156
+ error_type: string (optional, enum: reasoning|planning|execution)
157
+ medium:
158
+ type: array
159
+ items:
160
+ id: string
161
+ location: string
162
+ description: string
163
+ error_type: string (optional, enum: reasoning|planning|execution)
164
+ low:
165
+ type: array
166
+ items:
167
+ id: string
168
+ location: string
169
+ description: string
170
+ error_type: string (optional, enum: reasoning|planning|execution)
171
+
172
+ bonus_issues:
173
+ type: array
174
+ description: "Extra issues thorough contestants might find"
175
+ items:
176
+ id: string
177
+ description: string
178
+
179
+ # ============================================================================
180
+ # Scoring Rubric
181
+ # How to evaluate contestant responses
182
+ # ============================================================================
183
+
184
+ scoring:
185
+ type: object
186
+ description: "Evaluation rubric with weighted categories"
187
+ schema:
188
+ total_baseline_issues:
189
+ type: integer
190
+ description: "Count of known issues to find"
191
+
192
+ severity_weights:
193
+ type: object
194
+ description: "Severity weights for detection scoring (v2)"
195
+ default:
196
+ critical: 15
197
+ high: 10
198
+ medium: 5
199
+ low: 2
200
+ note: "Used for weighted recall calculation"
201
+
202
+ weights:
203
+ type: object
204
+ description: "Legacy severity weights (deprecated, use severity_weights)"
205
+ example:
206
+ critical: 3
207
+ high: 2
208
+ medium: 1
209
+ low: 0.5
210
+
211
+ categories:
212
+ type: array
213
+ items:
214
+ name: string
215
+ weight: integer (percentage of total score)
216
+ criteria:
217
+ type: array
218
+ items:
219
+ id: string
220
+ description: string
221
+ points: integer
222
+
223
+ # ============================================================================
224
+ # Detection Scoring v2 (Precision/Recall)
225
+ # Replaces additive scoring with explicit precision/recall metrics
226
+ # ============================================================================
227
+
228
+ detection_scoring_v2:
229
+ type: object
230
+ description: "Precision/recall based detection scoring configuration"
231
+ schema:
232
+ severity_weights:
233
+ type: object
234
+ description: "Point values by severity for weighted recall"
235
+ default:
236
+ critical: 15
237
+ high: 10
238
+ medium: 5
239
+ low: 2
240
+
241
+ component_weights:
242
+ type: object
243
+ description: "How detection subtotal (50 pts) is allocated"
244
+ default:
245
+ recall: 30 # Weighted recall × 30 (coverage priority)
246
+ precision: 10 # Precision × 10 (penalize hallucinations)
247
+ novel_bonus: 10 # min(novel_valid × 3, 10) (reward thoroughness)
248
+
249
+ metrics_output:
250
+ type: object
251
+ description: "Metrics calculated and reported"
252
+ fields:
253
+ weighted_found: "Sum of (found_issues × severity_weight)"
254
+ weighted_total: "Sum of (all_baseline × severity_weight)"
255
+ recall: "weighted_found / weighted_total"
256
+ precision: "true_positives / (true_positives + false_positives)"
257
+ f2_score: "5 × (P × R) / (4P + R) - recall-biased harmonic mean"
258
+
259
+ rationale: |
260
+ This scoring system addresses several issues with the original additive approach:
261
+
262
+ 1. **Explicit precision/recall trade-off**: Previously hidden, now visible
263
+ 2. **Severity-weighted recall**: Critical issues matter more than low
264
+ 3. **Novel findings preserved**: Bonus pool separate from precision calculation
265
+ 4. **Transparent metrics**: All intermediate values reported for debugging
266
+
267
+ Design choices:
268
+ - Recall weighted 3x precision (30 vs 10 pts) because missing vulnerabilities
269
+ is typically worse than false positives in security review
270
+ - F2 score reported for reference but not used in final scoring to maintain
271
+ interpretability of component scores
272
+ - Novel bonus capped at 10 pts to prevent gaming via quantity over quality
273
+
274
+ # ============================================================================
275
+ # Persona Influence Areas
276
+ # How different personas should legitimately differ
277
+ # ============================================================================
278
+
279
+ persona_influence:
280
+ type: object
281
+ description: "Dimensions where persona should affect response"
282
+ schema:
283
+ dimensions:
284
+ type: array
285
+ items:
286
+ name: string
287
+ description: string
288
+ spectrum:
289
+ type: object
290
+ description: "Range of valid approaches"
291
+ example:
292
+ conservative: "Description of conservative approach"
293
+ moderate: "Description of moderate approach"
294
+ aggressive: "Description of aggressive approach"
295
+
296
+ expected_tendencies:
297
+ type: object
298
+ description: "Expected approach by known personas (for evaluation)"
299
+ schema:
300
+ "[theme]_[agent]":
301
+ character: string
302
+ expected_traits: array of strings
303
+ risk_profile: string
304
+
305
+ # ============================================================================
306
+ # Default Judging Dimensions
307
+ # Fallback scoring when a scenario doesn't specify a custom rubric (see the `defaults` section below)
308
+ # ============================================================================
309
+
310
+ # ============================================================================
311
+ # Difficulty Calibration
312
+ # Score bands based on empirical 10-run control baselines
313
+ # Last calibrated: 2026-01-02 (24 scenarios, 240 runs); NOTE(review): band counts below sum to 26 - confirm
314
+ # ============================================================================
315
+
316
+ difficulty_calibration:
317
+ type: object
318
+ description: "Empirical score bands for difficulty labels"
319
+ schema:
320
+ bands:
321
+ easy:
322
+ range: "85-100"
323
+ interpretation: "Most control agents succeed consistently"
324
+ count: 12
325
+ examples:
326
+ - "order-service (91.9)"
327
+ - "executive-pet-project (91.1)"
328
+ - "sprint-planning-conflict (90.5)"
329
+ - "scaling-decision (88.6)"
330
+ - "event-processor-tdd (87.9)"
331
+ - "tdd-shopping-cart (85.8)"
332
+ medium:
333
+ range: "70-85"
334
+ interpretation: "Moderate challenge, some variance expected"
335
+ count: 9
336
+ examples:
337
+ - "null-pointer (82.8)"
338
+ - "checkout-component-tests (82.4)"
339
+ - "react-auth-component (82.3)"
340
+ - "terraform-infrastructure (80.9)"
341
+ - "graphql-api-review (79.5)"
342
+ - "payment-processor-tests (79.2)"
343
+ - "race-condition-cache (76.8)"
344
+ - "migration-disaster (76.5)"
345
+ - "buggy-user-service (74.3)"
346
+ hard:
347
+ range: "55-70"
348
+ interpretation: "Significant challenge, control often struggles"
349
+ count: 3
350
+ examples:
351
+ - "cli-tool-tests (64.5)"
352
+ - "microservice-integration-tests (63.1)"
353
+ extreme:
354
+ range: "<55"
355
+ interpretation: "Most control agents fail or produce incomplete responses"
356
+ count: 2
357
+ examples:
358
+ - "three-sprint-failure (49.0)"
359
+ - "layoff-planning (48.6)"
360
+ notes: "Only ethical dilemma scenarios reach extreme - control handles technical challenges well"
361
+
362
+ calibration_requirements:
363
+ baseline_runs: 10
364
+ baseline_agent: "control:<category>"
365
+ required_metrics: ["mean", "std", "range"]
366
+ minimum_variance: 5.0 # Threshold on std (not variance): if std < 5, scenario may be too deterministic
367
+
368
+ validation_rules:
369
+ - "Difficulty label must match empirical score band"
370
+ - "Ceiling effects (mean > 95) require scenario rework"
371
+ - "Bimodal distributions (std > 30) indicate prompt ambiguity"
372
+ - "Zero variance (std = 0) indicates data collection issue"
373
+
374
+ defaults:
375
+ difficulty: medium
376
+ version: "1.0"
377
+
378
+ scoring:
379
+ categories:
380
+ - name: correctness
381
+ weight: 25
382
+ description: "Technical accuracy and validity"
383
+
384
+ - name: quality
385
+ weight: 25
386
+ description: "Code/content quality, clarity, maintainability"
387
+
388
+ - name: creativity
389
+ weight: 25
390
+ description: "Novel approaches, elegance, inventiveness"
391
+
392
+ - name: persona
393
+ weight: 25
394
+ description: "Staying in character while delivering value"
395
+
396
+ # ==============================================================================
397
+ # Example Scenarios
398
+ # ==============================================================================
399
+
400
+ examples:
401
+
402
+ # Minimal duel scenario (quick battles)
403
+ minimal:
404
+ name: explain-recursion
405
+ title: "Explain Recursion"
406
+ category: general
407
+ difficulty: easy
408
+ prompt: |
409
+ Explain recursion to a junior developer who has never
410
+ encountered the concept before.
411
+
412
+ # Code review with baseline issues
413
+ code_review:
414
+ name: user-service-review
415
+ id: cr-001
416
+ title: "User Service Code Review"
417
+ category: code-review
418
+ difficulty: medium
419
+ version: "1.0"
420
+
421
+ description: "Review Go code for security and quality issues"
422
+
423
+ purpose: |
424
+ Measures detection rate for known vulnerabilities plus
425
+ bonus discoveries. Tests depth of analysis and fix quality.
426
+
427
+ prompt: |
428
+ Review this code for bugs, security issues, and code quality problems.
429
+ Provide specific line references and severity ratings.
430
+
431
+ code:
432
+ language: go
433
+ filename: user_service.go
434
+ content: |
435
+ package users
436
+
437
+ func GetUser(id string) (*User, error) {
438
+ query := fmt.Sprintf("SELECT * FROM users WHERE id = '%s'", id)
439
+ row := db.QueryRow(query)
440
+ var user User
441
+ row.Scan(&user.ID, &user.Email, &user.Password)
442
+ return &user, nil
443
+ }
444
+
445
+ baseline_issues:
446
+ critical:
447
+ - id: SQL_INJECTION
448
+ location: "line 4"
449
+ description: "SQL injection via string formatting"
450
+ error_type: reasoning # Logic failure: choosing unsafe string formatting
451
+ high:
452
+ - id: PASSWORD_EXPOSURE
453
+ location: "line 7"
454
+ description: "Password field exposed in response"
455
+ error_type: planning # Design failure: not planning data exposure
456
+ medium:
457
+ - id: ERROR_IGNORED
458
+ location: "line 7"
459
+ description: "Scan error ignored"
460
+ error_type: execution # Implementation failure: ignoring error handling
461
+
462
+ scoring:
463
+ total_baseline_issues: 3
464
+ weights:
465
+ critical: 3
466
+ high: 2
467
+ medium: 1
468
+ categories:
469
+ - name: detection
470
+ weight: 40
471
+ criteria:
472
+ - id: BASELINE_FOUND
473
+ description: "Issues from seeded list"
474
+ points: 20
475
+ - id: BONUS_DISCOVERIES
476
+ description: "Valid issues beyond baseline"
477
+ points: 20
478
+ - name: depth
479
+ weight: 30
480
+ criteria:
481
+ - id: ROOT_CAUSE
482
+ description: "Explains why it's wrong"
483
+ points: 10
484
+ - id: FIX_QUALITY
485
+ description: "Provides correct fix"
486
+ points: 10
487
+ - id: IMPACT_ANALYSIS
488
+ description: "Explains consequences"
489
+ points: 10
490
+ - name: persona
491
+ weight: 30
492
+ criteria:
493
+ - id: CHARACTER_CONSISTENCY
494
+ description: "Stays in character"
495
+ points: 15
496
+ - id: PERSONA_VALUE_ADD
497
+ description: "Persona enhances response"
498
+ points: 15
499
+
500
+ persona_influence:
501
+ dimensions:
502
+ - name: severity_focus
503
+ description: "What issues get prioritized"
504
+ spectrum:
505
+ security_first: "Leads with SQL injection, auth issues"
506
+ quality_first: "Leads with error handling, structure"
507
+ balanced: "Covers both equally"
508
+ - name: fix_style
509
+ description: "How fixes are presented"
510
+ spectrum:
511
+ minimal: "Just fixes the issue"
512
+ comprehensive: "Refactors surrounding code"
513
+ educational: "Explains principles behind fix"
514
+
515
+ # Architecture scenario with persona influence tracking
516
+ architecture:
517
+ name: migration-dilemma
518
+ id: arch-002
519
+ title: "The Migration Dilemma"
520
+ category: architecture
521
+ difficulty: hard
522
+ version: "1.0"
523
+
524
+ description: "Modernize a legacy e-commerce platform"
525
+
526
+ purpose: |
527
+ Open-ended challenge with no single correct answer.
528
+ Measures trade-off analysis, risk tolerance, and how
529
+ persona values influence architectural recommendations.
530
+
531
+ prompt: |
532
+ You are brought in to modernize TechMart, a 5-year-old
533
+ e-commerce monolith. Budget: $500K over 18 months.
534
+ Must show progress in 6 months while maintaining features.
535
+
536
+ Provide: situation analysis, recommended approach,
537
+ trade-offs, success criteria, and what would change your mind.
538
+
539
+ context: |
540
+ - 50K daily active users, $2M annual revenue
541
+ - 12 developers, 2 DevOps, no architect
542
+ - 200K lines Rails, 20% test coverage
543
+ - CEO wants "modern", CTO wants pragmatic
544
+ - Last Black Friday: 2 hours downtime, $50K loss
545
+
546
+ scoring:
547
+ categories:
548
+ - name: situation_analysis
549
+ weight: 15
550
+ criteria:
551
+ - id: PROBLEM_ID
552
+ description: "Identifies core problems vs symptoms"
553
+ points: 5
554
+ - id: CONSTRAINTS
555
+ description: "Understands real constraints"
556
+ points: 5
557
+ - id: STAKEHOLDERS
558
+ description: "Reads CEO/CTO tension"
559
+ points: 5
560
+ - name: approach
561
+ weight: 30
562
+ criteria:
563
+ - id: COHERENT
564
+ description: "Internally consistent strategy"
565
+ points: 10
566
+ - id: PHASING
567
+ description: "Realistic timeline"
568
+ points: 10
569
+ - id: TECH_FIT
570
+ description: "Tech matches constraints"
571
+ points: 10
572
+ - name: trade_offs
573
+ weight: 25
574
+ criteria:
575
+ - id: SACRIFICES
576
+ description: "Honest about costs"
577
+ points: 8
578
+ - id: RISKS
579
+ description: "Realistic about failures"
580
+ points: 8
581
+ - id: ALTERNATIVES
582
+ description: "Considers other approaches"
583
+ points: 9
584
+ - name: adaptability
585
+ weight: 15
586
+ criteria:
587
+ - id: METRICS
588
+ description: "Measurable success criteria"
589
+ points: 8
590
+ - id: WARNINGS
591
+ description: "Early failure indicators"
592
+ points: 7
593
+ - name: persona
594
+ weight: 15
595
+ criteria:
596
+ - id: AUTHENTIC
597
+ description: "Decisions align with persona values"
598
+ points: 8
599
+ - id: INFLUENCE
600
+ description: "Traits visibly affect choices"
601
+ points: 7
602
+
603
+ persona_influence:
604
+ dimensions:
605
+ - name: risk_tolerance
606
+ description: "How aggressive is the change?"
607
+ spectrum:
608
+ conservative: "Strangler fig, incremental, proven tech"
609
+ moderate: "Phased with calculated risks"
610
+ aggressive: "Bold restructuring, new tech, ambitious"
611
+ - name: technology_choices
612
+ description: "What stack is recommended?"
613
+ spectrum:
614
+ boring: "Optimize Rails, add caching"
615
+ pragmatic: "Extract specific microservices"
616
+ cutting_edge: "Full K8s, service mesh, event sourcing"
617
+ - name: team_weight
618
+ description: "How much does current team factor in?"
619
+ spectrum:
620
+ high: "Work within skills, extensive training"
621
+ medium: "Some training, some hires"
622
+ low: "Hire new skills, expect adaptation"
623
+
624
+ expected_tendencies:
625
+ discworld_architect:
626
+ character: "Leonard of Quirm"
627
+ expected_traits:
628
+ - "Novel, possibly over-engineered solution"
629
+ - "Gets distracted by interesting sub-problems"
630
+ - "Multiple diagrams and sketches"
631
+ risk_profile: "moderate-to-aggressive, creative"
632
+
633
+ star_trek_architect:
634
+ character: "Spock"
635
+ expected_traits:
636
+ - "Highly logical, systematic"
637
+ - "Risk-averse, proven approaches"
638
+ - "Quantifies everything"
639
+ risk_profile: "conservative, methodical"