@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
@@ -0,0 +1,550 @@
1
+ ---
2
+ # Scenario: Payment Processor Test Suite
3
+ # Category: tea (Test Engineer/Architect)
4
+ # Ported from: Pennyfarthing benchmarks/test-cases/tea/tea-001-payment-processor.yaml
5
+ # Purpose: Measure thoroughness in test coverage for complex system
6
+
7
+ id: tea-001
8
+ name: payment-processor-tests
9
+ title: "Payment Processor Test Suite Challenge"
10
+ category: tea
11
+ difficulty: medium # Empirical: control mean 79.17
12
+ version: "1.0"
13
+
14
+ description: |
15
+ A payment processor module that needs comprehensive test coverage.
16
+ Tests the TEA agent's ability to identify test scenarios, edge cases,
17
+ error conditions, concurrency issues, and write meaningful assertions.
18
+
19
+ purpose: |
20
+ This scenario tests whether persona traits affect test strategy.
21
+ A "methodical" TEA might systematically cover every branch.
22
+ An "experienced" TEA might prioritize security tests.
23
+ A "cautious" TEA might focus on error handling.
24
+ All are valid but measurably different in coverage patterns.
25
+
26
+ prompt: |
27
+ You are a Test Engineer/Architect tasked with writing a comprehensive
28
+ test suite for this payment processor module.
29
+
30
+ Consider:
31
+ - Happy path scenarios
32
+ - Edge cases and boundary conditions
33
+ - Error handling and failure modes
34
+ - Concurrency concerns (idempotency, race conditions)
35
+ - Integration points (gateway, store, rate limiter)
36
+ - Security-related test cases
37
+
38
+ For each test:
39
+ 1. Name it descriptively (TestProcessPayment_CardDeclined, etc.)
40
+ 2. Explain what scenario it covers
41
+ 3. Include meaningful assertions
42
+ 4. Consider setup/teardown needs (mocks for interfaces)
43
+
44
+ Your goal is THOROUGH COVERAGE. Missing important test scenarios
45
+ is the primary failure mode we're measuring.
46
+
47
+ There are 40 known test scenarios (19 baseline + 21 bonus). How many can you find?
48
+
49
+ code:
50
+ language: go
51
+ filename: payment_processor.go
52
+ content: |
53
+ package payments
54
+
55
+ import (
56
+ "context"
57
+ "errors"
58
+ "fmt"
59
+ "sync"
60
+ "time"
61
+ )
62
+
63
+ var (
64
+ ErrInsufficientFunds = errors.New("insufficient funds")
65
+ ErrInvalidAmount = errors.New("invalid amount")
66
+ ErrCardExpired = errors.New("card expired")
67
+ ErrCardDeclined = errors.New("card declined")
68
+ ErrDuplicatePayment = errors.New("duplicate payment")
69
+ ErrPaymentNotFound = errors.New("payment not found")
70
+ ErrRefundExceedsPayment = errors.New("refund exceeds original payment")
71
+ )
72
+
73
+ type PaymentStatus string
74
+
75
+ const (
76
+ StatusPending PaymentStatus = "pending"
77
+ StatusCompleted PaymentStatus = "completed"
78
+ StatusFailed PaymentStatus = "failed"
79
+ StatusRefunded PaymentStatus = "refunded"
80
+ StatusPartialRefund PaymentStatus = "partial_refund"
81
+ )
82
+
83
+ type Payment struct {
84
+ ID string
85
+ Amount int64 // in cents
86
+ Currency string
87
+ CardNumber string // last 4 digits only
88
+ CardExpMonth int
89
+ CardExpYear int
90
+ Status PaymentStatus
91
+ RefundedAmount int64
92
+ CreatedAt time.Time
93
+ CompletedAt *time.Time
94
+ Metadata map[string]string
95
+ IdempotencyKey string
96
+ }
97
+
98
+ type PaymentProcessor struct {
99
+ gateway PaymentGateway
100
+ store PaymentStore
101
+ rateLimiter RateLimiter
102
+ mutex sync.RWMutex
103
+ processedKeys map[string]string // idempotency key -> payment ID
104
+ }
105
+
106
+ type PaymentGateway interface {
107
+ Charge(ctx context.Context, amount int64, currency string, cardToken string) (string, error)
108
+ Refund(ctx context.Context, chargeID string, amount int64) error
109
+ ValidateCard(cardToken string) (bool, error)
110
+ }
111
+
112
+ type PaymentStore interface {
113
+ Save(payment *Payment) error
114
+ Get(id string) (*Payment, error)
115
+ Update(payment *Payment) error
116
+ GetByIdempotencyKey(key string) (*Payment, error)
117
+ }
118
+
119
+ type RateLimiter interface {
120
+ Allow(key string) bool
121
+ }
122
+
123
+ func NewPaymentProcessor(gateway PaymentGateway, store PaymentStore, limiter RateLimiter) *PaymentProcessor {
124
+ return &PaymentProcessor{
125
+ gateway: gateway,
126
+ store: store,
127
+ rateLimiter: limiter,
128
+ processedKeys: make(map[string]string),
129
+ }
130
+ }
131
+
132
+ // ProcessPayment handles a new payment request
133
+ func (p *PaymentProcessor) ProcessPayment(ctx context.Context, req PaymentRequest) (*Payment, error) {
134
+ // Validate amount
135
+ if req.Amount <= 0 {
136
+ return nil, ErrInvalidAmount
137
+ }
138
+ if req.Amount > 10000000 { // $100,000 max
139
+ return nil, ErrInvalidAmount
140
+ }
141
+
142
+ // Check rate limit
143
+ if !p.rateLimiter.Allow(req.CustomerID) {
144
+ return nil, errors.New("rate limit exceeded")
145
+ }
146
+
147
+ // Check idempotency
148
+ if req.IdempotencyKey != "" {
149
+ p.mutex.RLock()
150
+ if existingID, ok := p.processedKeys[req.IdempotencyKey]; ok {
151
+ p.mutex.RUnlock()
152
+ return p.store.Get(existingID)
153
+ }
154
+ p.mutex.RUnlock()
155
+ }
156
+
157
+ // Validate card expiration
158
+ now := time.Now()
159
+ if req.CardExpYear < now.Year() ||
160
+ (req.CardExpYear == now.Year() && req.CardExpMonth < int(now.Month())) {
161
+ return nil, ErrCardExpired
162
+ }
163
+
164
+ // Create payment record
165
+ payment := &Payment{
166
+ ID: generateID(),
167
+ Amount: req.Amount,
168
+ Currency: req.Currency,
169
+ CardNumber: req.CardNumber[len(req.CardNumber)-4:],
170
+ CardExpMonth: req.CardExpMonth,
171
+ CardExpYear: req.CardExpYear,
172
+ Status: StatusPending,
173
+ CreatedAt: time.Now(),
174
+ Metadata: req.Metadata,
175
+ IdempotencyKey: req.IdempotencyKey,
176
+ }
177
+
178
+ // Save pending payment
179
+ if err := p.store.Save(payment); err != nil {
180
+ return nil, fmt.Errorf("failed to save payment: %w", err)
181
+ }
182
+
183
+ // Process with gateway
184
+ chargeID, err := p.gateway.Charge(ctx, req.Amount, req.Currency, req.CardToken)
185
+ if err != nil {
186
+ payment.Status = StatusFailed
187
+ p.store.Update(payment)
188
+ if errors.Is(err, ErrCardDeclined) {
189
+ return nil, ErrCardDeclined
190
+ }
191
+ return nil, fmt.Errorf("payment failed: %w", err)
192
+ }
193
+
194
+ // Update payment as completed
195
+ now2 := time.Now()
196
+ payment.CompletedAt = &now2
197
+ payment.Status = StatusCompleted
198
+ payment.Metadata["charge_id"] = chargeID
199
+
200
+ if err := p.store.Update(payment); err != nil {
201
+ // Payment succeeded but update failed - critical state
202
+ return payment, fmt.Errorf("payment completed but failed to update: %w", err)
203
+ }
204
+
205
+ // Record idempotency key
206
+ if req.IdempotencyKey != "" {
207
+ p.mutex.Lock()
208
+ p.processedKeys[req.IdempotencyKey] = payment.ID
209
+ p.mutex.Unlock()
210
+ }
211
+
212
+ return payment, nil
213
+ }
214
+
215
+ // RefundPayment processes a refund
216
+ func (p *PaymentProcessor) RefundPayment(ctx context.Context, paymentID string, amount int64) (*Payment, error) {
217
+ payment, err := p.store.Get(paymentID)
218
+ if err != nil {
219
+ return nil, ErrPaymentNotFound
220
+ }
221
+
222
+ if payment.Status != StatusCompleted && payment.Status != StatusPartialRefund {
223
+ return nil, errors.New("payment cannot be refunded")
224
+ }
225
+
226
+ remainingAmount := payment.Amount - payment.RefundedAmount
227
+ if amount > remainingAmount {
228
+ return nil, ErrRefundExceedsPayment
229
+ }
230
+
231
+ chargeID := payment.Metadata["charge_id"]
232
+ if err := p.gateway.Refund(ctx, chargeID, amount); err != nil {
233
+ return nil, fmt.Errorf("refund failed: %w", err)
234
+ }
235
+
236
+ payment.RefundedAmount += amount
237
+ if payment.RefundedAmount == payment.Amount {
238
+ payment.Status = StatusRefunded
239
+ } else {
240
+ payment.Status = StatusPartialRefund
241
+ }
242
+
243
+ if err := p.store.Update(payment); err != nil {
244
+ return payment, fmt.Errorf("refund completed but failed to update: %w", err)
245
+ }
246
+
247
+ return payment, nil
248
+ }
249
+
250
+ // GetPayment retrieves a payment by ID
251
+ func (p *PaymentProcessor) GetPayment(ctx context.Context, id string) (*Payment, error) {
252
+ return p.store.Get(id)
253
+ }
254
+
255
+ type PaymentRequest struct {
256
+ Amount int64
257
+ Currency string
258
+ CardNumber string
259
+ CardToken string
260
+ CardExpMonth int
261
+ CardExpYear int
262
+ CustomerID string
263
+ IdempotencyKey string
264
+ Metadata map[string]string
265
+ }
266
+
267
+ func generateID() string {
268
+ return fmt.Sprintf("pay_%d", time.Now().UnixNano())
269
+ }
270
+
271
+ # =============================================================================
272
+ # BASELINE TEST CASES (minimum expected to write)
273
+ # These are NOT shown to contestants - used to measure coverage
274
+ # =============================================================================
275
+
276
+ baseline_issues:
277
+ happy_path:
278
+ - id: TEST_SUCCESSFUL_PAYMENT
279
+ description: "Basic successful payment flow"
280
+
281
+ - id: TEST_SUCCESSFUL_REFUND
282
+ description: "Basic successful full refund"
283
+
284
+ - id: TEST_PARTIAL_REFUND
285
+ description: "Partial refund updates status correctly"
286
+
287
+ - id: TEST_MULTIPLE_PARTIAL_REFUNDS
288
+ description: "Multiple partial refunds until full refund"
289
+
290
+ - id: TEST_GET_PAYMENT
291
+ description: "Retrieve payment by ID"
292
+
293
+ validation:
294
+ - id: TEST_ZERO_AMOUNT
295
+ description: "Reject zero amount payment"
296
+
297
+ - id: TEST_NEGATIVE_AMOUNT
298
+ description: "Reject negative amount payment"
299
+
300
+ - id: TEST_EXCEEDS_MAX_AMOUNT
301
+ description: "Reject amount over $100,000"
302
+
303
+ - id: TEST_EXPIRED_CARD
304
+ description: "Reject expired card"
305
+
306
+ - id: TEST_EXPIRED_CARD_SAME_MONTH
307
+ description: "Handle card expiring this month correctly"
308
+
309
+ error_handling:
310
+ - id: TEST_CARD_DECLINED
311
+ description: "Handle card declined from gateway"
312
+
313
+ - id: TEST_GATEWAY_ERROR
314
+ description: "Handle gateway errors gracefully"
315
+
316
+ - id: TEST_PAYMENT_NOT_FOUND_REFUND
317
+ description: "Refund non-existent payment"
318
+
319
+ - id: TEST_REFUND_EXCEEDS_AMOUNT
320
+ description: "Refund more than original payment"
321
+
322
+ - id: TEST_REFUND_WRONG_STATUS
323
+ description: "Cannot refund pending/failed payment"
324
+
325
+ idempotency:
326
+ - id: TEST_IDEMPOTENCY_SAME_KEY
327
+ description: "Same idempotency key returns same payment"
328
+
329
+ - id: TEST_IDEMPOTENCY_DIFFERENT_KEYS
330
+ description: "Different keys create different payments"
331
+
332
+ rate_limiting:
333
+ - id: TEST_RATE_LIMIT_EXCEEDED
334
+ description: "Rate limit blocks excessive requests"
335
+
336
+ - id: TEST_RATE_LIMIT_ALLOWED
337
+ description: "Normal request rate succeeds"
338
+
339
+ # =============================================================================
340
+ # BONUS TEST CASES (thorough testers might include)
341
+ # =============================================================================
342
+
343
+ bonus_issues:
344
+ concurrency:
345
+ - id: TEST_CONCURRENT_SAME_IDEMPOTENCY
346
+ description: "Concurrent requests with same idempotency key"
347
+
348
+ - id: TEST_CONCURRENT_REFUNDS
349
+ description: "Concurrent refund requests on same payment"
350
+
351
+ - id: TEST_RACE_CONDITION_IDEMPOTENCY_MAP
352
+ description: "Thread safety of processedKeys map"
353
+
354
+ edge_cases:
355
+ - id: TEST_CARD_NUMBER_EXACTLY_4_CHARS
356
+ description: "Card number with only 4 digits"
357
+
358
+ - id: TEST_CARD_NUMBER_LESS_THAN_4
359
+ description: "Card number shorter than 4 digits causes panic"
360
+
361
+ - id: TEST_EMPTY_CARD_NUMBER
362
+ description: "Empty card number handling"
363
+
364
+ - id: TEST_NIL_METADATA
365
+ description: "Nil metadata map handling"
366
+
367
+ - id: TEST_EXACTLY_MAX_AMOUNT
368
+ description: "Payment at exactly $100,000 limit"
369
+
370
+ - id: TEST_ONE_CENT_PAYMENT
371
+ description: "Minimum valid payment (1 cent)"
372
+
373
+ - id: TEST_CURRENCY_VALIDATION
374
+ description: "Invalid currency codes"
375
+
376
+ failure_modes:
377
+ - id: TEST_STORE_SAVE_FAILS
378
+ description: "Initial save to store fails"
379
+
380
+ - id: TEST_STORE_UPDATE_AFTER_CHARGE_FAILS
381
+ description: "Update fails after successful charge"
382
+
383
+ - id: TEST_STORE_UPDATE_AFTER_REFUND_FAILS
384
+ description: "Update fails after successful refund"
385
+
386
+ - id: TEST_CONTEXT_CANCELLED
387
+ description: "Context cancellation during processing"
388
+
389
+ - id: TEST_CONTEXT_TIMEOUT
390
+ description: "Context timeout during gateway call"
391
+
392
+ security:
393
+ - id: TEST_FULL_CARD_NOT_STORED
394
+ description: "Verify only last 4 digits stored"
395
+
396
+ - id: TEST_CARD_TOKEN_USED_NOT_NUMBER
397
+ description: "Card token passed to gateway, not number"
398
+
399
+ state_transitions:
400
+ - id: TEST_PENDING_TO_COMPLETED
401
+ description: "Status transitions from pending to completed"
402
+
403
+ - id: TEST_PENDING_TO_FAILED
404
+ description: "Status transitions from pending to failed"
405
+
406
+ - id: TEST_COMPLETED_TO_PARTIAL
407
+ description: "Status transitions to partial_refund"
408
+
409
+ - id: TEST_PARTIAL_TO_REFUNDED
410
+ description: "Status transitions from partial to full refund"
411
+
412
+ # =============================================================================
413
+ # SCORING
414
+ # =============================================================================
415
+
416
+ scoring:
417
+ total_baseline_tests: 19
418
+ total_bonus_tests: 21
419
+
420
+ categories:
421
+ - name: coverage
422
+ weight: 50
423
+ description: "How many test scenarios are covered"
424
+ criteria:
425
+ - id: HAPPY_PATH_COVERED
426
+ description: "All 5 happy path tests"
427
+ points: 15
428
+ - id: VALIDATION_COVERED
429
+ description: "All 5 validation tests"
430
+ points: 15
431
+ - id: ERROR_HANDLING_COVERED
432
+ description: "All error handling tests"
433
+ points: 10
434
+ - id: IDEMPOTENCY_RATE_LIMIT
435
+ description: "Idempotency and rate limit tests"
436
+ points: 10
437
+
438
+ - name: test_quality
439
+ weight: 25
440
+ description: "Quality of the tests themselves"
441
+ criteria:
442
+ - id: PROPER_MOCKING
443
+ description: "Correct use of interface mocks"
444
+ points: 8
445
+ - id: CLEAR_ASSERTIONS
446
+ description: "Meaningful, specific assertions"
447
+ points: 8
448
+ - id: ISOLATION
449
+ description: "Tests are properly isolated"
450
+ points: 4
451
+ - id: DOCUMENTATION
452
+ description: "Tests have clear descriptions"
453
+ points: 5
454
+
455
+ - name: advanced_coverage
456
+ weight: 10
457
+ description: "Bonus test scenarios"
458
+ criteria:
459
+ - id: CONCURRENCY_TESTS
460
+ description: "Tests for race conditions"
461
+ points: 5
462
+ - id: EDGE_CASE_TESTS
463
+ description: "Unusual edge cases"
464
+ points: 3
465
+ - id: FAILURE_MODE_TESTS
466
+ description: "Complex failure scenarios"
467
+ points: 2
468
+
469
+ - name: persona
470
+ weight: 15
471
+ description: "Persona consistency and value"
472
+ criteria:
473
+ - id: IN_CHARACTER
474
+ description: "Stays in character throughout"
475
+ points: 8
476
+ - id: PERSONA_ENHANCES
477
+ description: "Persona adds value to test strategy"
478
+ points: 7
479
+
480
+ # =============================================================================
481
+ # ENHANCED METRICS
482
+ # =============================================================================
483
+
484
+ enhanced_metrics:
485
+ coverage_ratio:
486
+ formula: "tests_found / 19"
487
+ interpretation: "100% = found all baseline scenarios"
488
+
489
+ bonus_discovery_rate:
490
+ formula: "bonus_tests_found / 21"
491
+ interpretation: "Shows exceptional thoroughness"
492
+
493
+ category_balance:
494
+ formula: "min(category_coverage) / max(category_coverage)"
495
+ interpretation: "1.0 = balanced, <1.0 = gaps"
496
+
497
+ mock_sophistication:
498
+ formula: "advanced_mocking_patterns / 5"
499
+ interpretation: "Use of table-driven tests, setup helpers, etc."
500
+
501
+ # =============================================================================
502
+ # PERSONA INFLUENCE
503
+ # =============================================================================
504
+
505
+ persona_influence:
506
+ dimensions:
507
+ - name: test_strategy
508
+ description: "How tests are prioritized"
509
+ spectrum:
510
+ security_first: "Starts with security and error cases"
511
+ happy_path_first: "Starts with basic functionality"
512
+ edge_cases_first: "Focuses on unusual scenarios"
513
+
514
+ - name: mock_approach
515
+ description: "How interfaces are mocked"
516
+ spectrum:
517
+ minimal: "Basic mocks, just enough to run"
518
+ comprehensive: "Detailed mock behavior"
519
+ table_driven: "Table-driven tests with mock matrices"
520
+
521
+ - name: documentation_style
522
+ description: "How tests are documented"
523
+ spectrum:
524
+ minimal: "Test names only"
525
+ moderate: "Brief comments"
526
+ thorough: "Full scenario documentation"
527
+
528
+ expected_tendencies:
529
+ discworld_tea:
530
+ character: "Igor"
531
+ expected_traits:
532
+ - "Methodical, systematic coverage"
533
+ - "Practical focus on what breaks"
534
+ - "References to previous systems"
535
+ coverage_prediction: "high - systematic approach"
536
+
537
+ star_trek_tea:
538
+ character: "Data"
539
+ expected_traits:
540
+ - "Exhaustive, logical enumeration"
541
+ - "Perfect organization"
542
+ - "May over-test trivial cases"
543
+ coverage_prediction: "very high - exhaustive"
544
+
545
+ control_tea:
546
+ character: "None (baseline)"
547
+ expected_traits:
548
+ - "Standard LLM test generation"
549
+ - "No persona influence"
550
+ coverage_prediction: "baseline reference"