@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
@@ -0,0 +1,374 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ SWE-bench scenario judge using:
4
+ 1. Scenario-specific scoring rubric from YAML
5
+ 2. Ground-truth validation from actual SWE-bench patches
6
+
7
+ Scoring structure:
8
+ - root_cause (30%): IDENTIFIES_BUG_LOCATION (15) + EXPLAINS_WHY_BROKEN (15)
9
+ - fix_quality (40%): FIX_ADDRESSES_ISSUE (20) + FIX_IS_MINIMAL (10) + FIX_SYNTAX_CORRECT (10)
10
+ - completeness (20%): EDGE_CASES (10) + TEST_COVERAGE (10)
11
+ - persona (10%): IN_CHARACTER (10)
12
+ """
13
+
14
+ import json
15
+ import re
16
+ import sys
17
+ from pathlib import Path
18
+ from difflib import SequenceMatcher
19
+
20
+ # Add parent to path for pennyfarthing_scripts imports
21
+ sys.path.insert(0, str(Path(__file__).resolve().parents[4]))
22
+
23
+ from pennyfarthing_scripts.swebench import (
24
+ extract_patch_info,
25
+ find_scenario,
26
+ load_swebench_data,
27
+ )
28
+
29
+
30
def score_identifies_bug_location(response, ground_truth):
    """Score IDENTIFIES_BUG_LOCATION (15 pts) using ground truth.

    Up to 7.5 points are awarded in proportion to how many patched files the
    response mentions (by bare file name or by full path), and up to 7.5 more
    in proportion to how many patched ``def``/``class`` names it mentions.
    When the patch lists no functions, a flat 3.75 is granted for that half.

    Args:
        response: The answer text being judged.
        ground_truth: Record whose ``patch`` entry is parsed with
            ``extract_patch_info``.

    Returns:
        Tuple ``(score, details)``; ``score`` is capped at 15 and ``details``
        is a list of human-readable notes.
    """
    info = extract_patch_info(ground_truth.get('patch', ''))
    haystack = response.lower()

    total = 0
    notes = []

    # Files half (7.5 pts): either the bare name or the full path counts.
    file_hits = sum(
        1
        for path in info.files
        if Path(path).name.lower() in haystack or path.lower() in haystack
    )
    if info.files:
        total += (file_hits / len(info.files)) * 7.5
        notes.append(f"Files: {file_hits}/{len(info.files)} found")

    # Functions/classes half (7.5 pts): only entries that look like a
    # def/class header can be matched, but every entry is in the denominator.
    name_hits = 0
    for entry in info.functions:
        header = re.search(r'(def|class)\s+(\w+)', entry)
        if header and header.group(2).lower() in haystack:
            name_hits += 1

    if info.functions:
        total += min(7.5, (name_hits / len(info.functions)) * 7.5)
        notes.append(f"Functions: {name_hits}/{len(info.functions)} found")
    else:
        total += 3.75  # Partial credit if no specific function in patch

    return min(15, total), notes
67
+
68
+
69
def score_explains_why_broken(response, ground_truth):
    """Score EXPLAINS_WHY_BROKEN (15 pts).

    7.5 points are awarded when the response contains any explanation marker
    ("because", "the issue", ...). Up to 7.5 more are awarded in proportion
    to how many key terms from the problem statement the response repeats.
    Key terms are quoted/backticked spans and ``*Error``/``*Exception``
    names; at most the first 10 distinct terms, in order of appearance, are
    used. When no key term can be extracted, a flat 3.75 is granted instead.

    Args:
        response: The answer text being judged.
        ground_truth: Record providing ``problem_statement``.

    Returns:
        Tuple ``(score, details)``; ``score`` is capped at 15.
    """
    response_lower = response.lower()
    problem = (ground_truth.get('problem_statement') or '').lower()

    score = 0
    details = []

    # Extract key terms from problem statement
    candidates = re.findall(r'[`\'"]([^`\'"]+)[`\'"]', problem)
    candidates += re.findall(r'\b\w+Error\b|\b\w+Exception\b', problem, re.IGNORECASE)

    # De-duplicate while preserving first-appearance order. Truncating a
    # set() would pick an arbitrary, hash-seed-dependent subset and make the
    # score differ from run to run. Blank terms are dropped because an empty
    # or whitespace-only string is trivially "found" in almost any response.
    cleaned = (term.strip() for term in candidates)
    key_terms = list(dict.fromkeys(term for term in cleaned if term))[:10]

    # Check for explanation of the issue
    explanation_markers = ['because', 'this happens', 'the issue', 'the problem', 'fails when', 'breaks when', 'causes']
    if any(marker in response_lower for marker in explanation_markers):
        score += 7.5
        details.append("Has explanation of why broken")

    # Check for key terms from problem
    if key_terms:
        terms_found = sum(1 for term in key_terms if term in response_lower)
        score += (terms_found / len(key_terms)) * 7.5
        details.append(f"Key terms: {terms_found}/{len(key_terms)}")
    else:
        score += 3.75

    return min(15, score), details
99
+
100
+
101
def _best_line_similarity(needle, lines):
    """Return the highest SequenceMatcher ratio between ``needle`` and any entry of ``lines``."""
    best = 0.0
    # SequenceMatcher caches its analysis of the second sequence, so keep the
    # needle as seq2 and swap only seq1 for each candidate line.
    matcher = SequenceMatcher(None, b=needle)
    for line in lines:
        matcher.set_seq1(line)
        # real_quick_ratio()/quick_ratio() are cheap upper bounds on ratio().
        if matcher.real_quick_ratio() <= best or matcher.quick_ratio() <= best:
            continue
        best = max(best, matcher.ratio())
    return best


def score_fix_addresses_issue(response, ground_truth):
    """Score FIX_ADDRESSES_ISSUE (20 pts) using ground truth patch.

    Up to 15 points come from the first five added lines of the ground-truth
    patch: an added line found verbatim (case-insensitive, whitespace
    collapsed) in the response counts 1; otherwise, if some single line of
    the response is more than 70% similar to it, it counts 0.5. A further
    5 points are granted when the response contains a fenced code block.

    The fuzzy comparison is done line against line. Comparing one patch line
    against the whole response makes the similarity ratio shrink with the
    response length, so it could practically never exceed the threshold.

    Args:
        response: The answer text being judged.
        ground_truth: Record whose ``patch`` entry is parsed with
            ``extract_patch_info``.

    Returns:
        Tuple ``(score, details)``; ``score`` is capped at 20.
    """
    patch_info = extract_patch_info(ground_truth.get('patch', ''))
    response_lower = response.lower()

    score = 0
    details = []

    # Normalize the response once; it does not change between additions.
    response_norm = re.sub(r'\s+', ' ', response_lower)
    response_lines = [
        line
        for line in (re.sub(r'\s+', ' ', raw).strip() for raw in response_lower.splitlines())
        if line
    ]

    # Check if key additions from patch appear in response
    additions_matched = 0
    for addition in patch_info.additions[:5]:
        # Normalize whitespace
        addition_norm = re.sub(r'\s+', ' ', addition.lower())

        if addition_norm in response_norm:
            additions_matched += 1
            continue

        # Fuzzy match against individual response lines
        needle = addition_norm.strip()
        if needle and _best_line_similarity(needle, response_lines) > 0.7:
            additions_matched += 0.5

    if patch_info.additions:
        considered = min(5, len(patch_info.additions))
        score += (additions_matched / considered) * 15
        details.append(f"Code matches: {additions_matched}/{considered}")

    # Check for code block with fix
    if '```' in response:
        score += 5
        details.append("Has code block")

    return min(20, score), details
136
+
137
+
138
def score_fix_is_minimal(response, ground_truth):
    """Score FIX_IS_MINIMAL (10 pts).

    Compares the number of lines inside the response's fenced code blocks
    with the size of the ground-truth patch (additions plus deletions):
    10 points up to 2x the patch size, 5 points up to 4x, 2 points beyond.
    A response without any fenced code is treated as ratio 1. When the patch
    itself is empty, a flat 5 is returned with no detail note.

    Args:
        response: The answer text being judged.
        ground_truth: Record whose ``patch`` entry is parsed with
            ``extract_patch_info``.

    Returns:
        Tuple ``(score, details)``.
    """
    info = extract_patch_info(ground_truth.get('patch', ''))
    reference_lines = len(info.additions) + len(info.deletions)

    # Total line count across every fenced block in the response.
    fenced_blocks = re.findall(r'```[\w]*\n(.*?)```', response, re.DOTALL)
    proposed_lines = 0
    for body in fenced_blocks:
        proposed_lines += len(body.strip().split('\n'))

    if reference_lines <= 0:
        return 5, []

    size_ratio = proposed_lines / reference_lines if proposed_lines > 0 else 1
    if size_ratio > 4:
        points, label = 2, "Over-engineered"
    elif size_ratio > 2:
        points, label = 5, "Somewhat verbose"
    else:
        points, label = 10, "Minimal"

    return points, [f"{label}: {proposed_lines} lines (patch: {reference_lines})"]
168
+
169
+
170
def score_fix_syntax_correct(response):
    """Score FIX_SYNTAX_CORRECT (10 pts).

    Compiles every ```` ```python ```` fenced block in the response (or, when
    there are none, every untagged fenced block) as Python source.

    Args:
        response: The answer text being judged.

    Returns:
        Tuple ``(score, details)``: 10 when every block compiles, 5 when any
        block fails to compile, and 5 when there is no block to check.
    """
    import textwrap  # local import: only this function needs it

    # Extract code blocks
    code_blocks = re.findall(r'```python\n(.*?)```', response, re.DOTALL)
    if not code_blocks:
        code_blocks = re.findall(r'```\n(.*?)```', response, re.DOTALL)

    if not code_blocks:
        return 5, ["No code blocks to validate"]

    for block in code_blocks:
        try:
            # Snippets are often quoted with their surrounding indentation
            # (e.g. a method lifted out of a class). Dedent first so valid
            # code is not rejected with an IndentationError.
            compile(textwrap.dedent(block), '<string>', 'exec', dont_inherit=True)
        except (SyntaxError, ValueError):
            # ValueError: source that compile() refuses outright (e.g. NUL bytes).
            return 5, ["Syntax errors detected"]

    return 10, ["Syntax valid"]
201
+
202
+
203
def score_edge_cases(response):
    """Score EDGE_CASES (10 pts).

    Counts how many distinct edge-case marker phrases occur anywhere in the
    lower-cased response (plain substring test) and awards 2 points per
    marker, capped at 10.

    Returns:
        Tuple ``(score, details)`` with a single detail note.
    """
    text = response.lower()
    markers = (
        'edge case', 'corner case', 'what if', 'consider', 'also', 'none',
        'empty', 'null', 'zero', 'negative', 'boundary',
    )
    hits = len([marker for marker in markers if marker in text])
    return min(10, hits * 2), [f"Edge case markers: {hits}"]
217
+
218
+
219
+ def score_test_coverage(response):
220
+ """Score TEST_COVERAGE (10 pts)."""
221
+ response_lower = response.lower()
222
+
223
+ score = 0
224
+ details = []
225
+
226
+ # Check for test-related content
227
+ has_test_section = 'test' in response_lower
228
+ has_test_function = 'def test_' in response_lower or 'test_' in response
229
+ has_assert = 'assert' in response_lower or 'pytest' in response_lower
230
+
231
+ if has_test_function:
232
+ score += 5
233
+ details.append("Has test function")
234
+ if has_assert:
235
+ score += 3
236
+ details.append("Has assertions")
237
+ if has_test_section:
238
+ score += 2
239
+ details.append("Has test section")
240
+
241
+ return min(10, score), details
242
+
243
+
244
def score_in_character(response, persona="senior developer"):
    """Score IN_CHARACTER (10 pts).

    Counts professional-tone marker phrases in the lower-cased response and
    awards 2 points each, capped at 10. The ``persona`` argument is accepted
    but not consulted by this implementation.

    Returns:
        Tuple ``(score, details)`` with a single detail note.
    """
    text = response.lower()

    # For control baseline, check professional tone
    tone_markers = ('i recommend', 'we should', 'this approach', 'the fix', 'analysis', 'root cause')
    hits = 0
    for phrase in tone_markers:
        if phrase in text:
            hits += 1

    return min(10, hits * 2), [f"Professional markers: {hits}"]
259
+
260
+
261
def judge_response(scenario_name, response_text, swebench_data):
    """Full judgment using scenario rubric + ground truth.

    Looks the scenario up with ``find_scenario``, runs every criterion scorer
    and groups the results into the four rubric categories.

    Args:
        scenario_name: Name used to look up the ground-truth record.
        response_text: The answer text being judged.
        swebench_data: Data passed through to ``find_scenario``.

    Returns:
        ``{'error': ...}`` when the scenario is not found; otherwise a dict
        with ``scenario``, ``instance_id``, per-category ``scores`` (each
        with a ``subtotal``), the rounded ``total``, ``details`` notes per
        category and ``ground_truth_files``.
    """
    ground_truth = find_scenario(swebench_data, scenario_name)
    if not ground_truth:
        return {'error': f'Scenario {scenario_name} not found in SWE-bench data'}

    # Rubric layout: category -> ordered (criterion, scorer) pairs.
    # Weights: root_cause 30%, fix_quality 40%, completeness 20%, persona 10%.
    rubric = (
        ('root_cause', (
            ('IDENTIFIES_BUG_LOCATION', lambda: score_identifies_bug_location(response_text, ground_truth)),
            ('EXPLAINS_WHY_BROKEN', lambda: score_explains_why_broken(response_text, ground_truth)),
        )),
        ('fix_quality', (
            ('FIX_ADDRESSES_ISSUE', lambda: score_fix_addresses_issue(response_text, ground_truth)),
            ('FIX_IS_MINIMAL', lambda: score_fix_is_minimal(response_text, ground_truth)),
            ('FIX_SYNTAX_CORRECT', lambda: score_fix_syntax_correct(response_text)),
        )),
        ('completeness', (
            ('EDGE_CASES', lambda: score_edge_cases(response_text)),
            ('TEST_COVERAGE', lambda: score_test_coverage(response_text)),
        )),
        ('persona', (
            ('IN_CHARACTER', lambda: score_in_character(response_text)),
        )),
    )

    scores = {}
    all_details = {}
    total = 0
    for category, criteria in rubric:
        entry = {}
        notes = []
        subtotal = 0
        for criterion, run_scorer in criteria:
            points, criterion_notes = run_scorer()
            entry[criterion] = points
            subtotal += points
            notes.extend(criterion_notes)
        entry['subtotal'] = subtotal
        scores[category] = entry
        all_details[category] = notes
        total += subtotal

    patch_info = extract_patch_info(ground_truth.get('patch', ''))
    return {
        'scenario': scenario_name,
        'instance_id': ground_truth.get('instance_id'),
        'scores': scores,
        'total': round(total, 1),
        'details': all_details,
        'ground_truth_files': patch_info.files
    }
328
+
329
+
330
def main():
    """Command-line entry point: judge one response file and save the result.

    Usage: ``swebench-judge.py <scenario_name> <response_file>``

    Reads the response JSON (text under ``result`` or ``response_text``),
    judges it, prints a per-category report and writes the result as JSON
    next to the response file. The output file name is the response file
    name with ``run_`` replaced by ``swebench_judge_``; when the name
    contains no ``run_``, ``swebench_judge_`` is prefixed instead so the
    response file is never overwritten.

    Exits with status 1 on wrong usage or when the scenario is unknown.
    """
    if len(sys.argv) < 3:
        print("Usage: swebench-judge.py <scenario_name> <response_file>")
        sys.exit(1)

    scenario_name = sys.argv[1]
    response_file = sys.argv[2]

    # Load data
    swebench_data = load_swebench_data()

    with open(response_file, 'r', encoding='utf-8') as f:
        response_data = json.load(f)

    # Handle different JSON structures; a null/absent field becomes ''
    # so the scorers never receive None.
    response_text = response_data.get('result') or response_data.get('response_text') or ''

    # Judge
    result = judge_response(scenario_name, response_text, swebench_data)

    # judge_response reports an unknown scenario as {'error': ...}, which has
    # none of the keys used below.
    if 'error' in result:
        print(f"Error: {result['error']}", file=sys.stderr)
        sys.exit(1)

    # Display
    print(f"\n{'='*60}")
    print(f"SWE-BENCH JUDGE: {scenario_name}")
    print(f"{'='*60}")

    for category, scores in result['scores'].items():
        print(f"\n{category.upper()} ({scores['subtotal']:.1f} pts)")
        for criterion, score in scores.items():
            if criterion != 'subtotal':
                print(f" {criterion}: {score:.1f}")

    print(f"\n{'─'*40}")
    print(f"TOTAL: {result['total']}/100")

    print(f"\nGround truth files: {result['ground_truth_files']}")

    # Save. Rename only the file-name component: rewriting 'run_' inside
    # directory names would point at a directory that may not exist.
    response_path = Path(response_file)
    judged_name = response_path.name.replace('run_', 'swebench_judge_')
    if judged_name == response_path.name:
        judged_name = f"swebench_judge_{response_path.name}"
    output_path = str(response_path.with_name(judged_name))

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2)
    print(f"\nSaved to: {output_path}")
371
+
372
+
373
+ if __name__ == '__main__':
374
+ main()
@@ -0,0 +1,165 @@
1
+ #!/usr/bin/env bash
2
+ # test-cache.sh - Read/write test result cache in session files
3
+ #
4
+ # Usage:
5
+ # source test-cache.sh
6
+ #
7
+ # # Check if valid cache exists (returns 0 if valid, 1 if stale/missing)
8
+ # if test_cache_valid "$SESSION_FILE"; then
9
+ # RESULT=$(test_cache_get "$SESSION_FILE" "result")
10
+ # echo "Using cached result: $RESULT"
11
+ # fi
12
+ #
13
+ # # Write cache after running tests
14
+ # test_cache_write "$SESSION_FILE" "GREEN" 42 0 1 "12s"
15
+
16
# NOTE: this file is meant to be sourced, so these options also take effect
# in the shell that sources it.
set -euo pipefail

# Cache validity window in minutes. Defaults to 5; set CACHE_MAX_AGE_MINUTES
# before sourcing this file to use a different window.
CACHE_MAX_AGE_MINUTES="${CACHE_MAX_AGE_MINUTES:-5}"
20
+
21
#######################################
# Check if the test cache in a session file exists and is still valid:
# recorded for the current git HEAD and younger than the validity window.
# Globals:   CACHE_MAX_AGE_MINUTES (read; 5 is used when unset)
# Arguments: $1 - path to the session file
# Returns:   0 if valid, 1 if invalid/missing
#######################################
test_cache_valid() {
  local session_file="$1"
  local max_age_minutes="${CACHE_MAX_AGE_MINUTES:-5}"

  if [[ ! -f "$session_file" ]]; then
    return 1
  fi

  if ! grep -q "^## Test Cache" "$session_file" 2>/dev/null; then
    return 1
  fi

  local current_sha cache_sha
  current_sha=$(git rev-parse HEAD 2>/dev/null || echo "unknown")
  # "|| ..." keeps a missing row from aborting the caller under errexit/pipefail.
  cache_sha=$(grep "| Git SHA |" "$session_file" 2>/dev/null \
    | sed 's/.*| \([^ ]*\) |.*/\1/' | xargs) || cache_sha=""

  if [[ "$cache_sha" != "$current_sha" ]]; then
    return 1
  fi

  local cache_time
  cache_time=$(grep "| Last Run |" "$session_file" 2>/dev/null \
    | sed 's/.*| \([^ ]*\) |.*/\1/' | xargs) || cache_time=""

  # Without a timestamp the age is unknown. (GNU `date -d ""` would silently
  # parse the empty string as today's midnight.)
  if [[ -z "$cache_time" ]]; then
    return 1
  fi

  local cache_epoch now_epoch age_minutes
  # The timestamp is written in UTC, so it must be parsed as UTC as well.
  # BSD/macOS form first (-u makes -j -f read the fields as UTC instead of
  # local time); GNU date rejects -j and falls through to -d.
  cache_epoch=$(date -u -j -f "%Y-%m-%dT%H:%M:%SZ" "$cache_time" +%s 2>/dev/null \
    || date -u -d "$cache_time" +%s 2>/dev/null \
    || echo 0)
  if [[ ! "$cache_epoch" =~ ^[0-9]+$ ]]; then
    cache_epoch=0
  fi
  now_epoch=$(date +%s)

  # A timestamp from the future cannot be trusted; treat it as stale rather
  # than as "zero minutes old".
  if (( cache_epoch > now_epoch )); then
    return 1
  fi

  age_minutes=$(( (now_epoch - cache_epoch) / 60 ))
  if (( age_minutes >= max_age_minutes )); then
    return 1
  fi

  return 0
}
60
+
61
#######################################
# Print one value from the test cache table of a session file.
# Usage: test_cache_get "$SESSION_FILE" "result|pass|fail|skip|duration|sha|time"
# Arguments: $1 - path to the session file
#            $2 - field selector (see usage)
# Outputs:   the value on stdout; a message on stderr for an unknown field
# Returns:   1 for an unknown field, otherwise the lookup pipeline's status
#######################################
test_cache_get() {
  local session_file="$1"
  local field="$2"
  local row_label

  # Translate the selector into the label used in the table's first column.
  case "$field" in
    result)   row_label="Result" ;;
    pass)     row_label="Pass" ;;
    fail)     row_label="Fail" ;;
    skip)     row_label="Skip" ;;
    duration) row_label="Duration" ;;
    sha)      row_label="Git SHA" ;;
    time)     row_label="Last Run" ;;
    *)
      echo "Unknown field: $field" >&2
      return 1
      ;;
  esac

  # Pick the table row, keep the value cell, let xargs trim whitespace.
  grep "| ${row_label} |" "$session_file" 2>/dev/null \
    | sed 's/.*| \([^ ]*\) |.*/\1/' \
    | xargs
}
78
+
79
#######################################
# Write (or refresh) the "## Test Cache" section of a session file.
# An existing section is replaced in place; otherwise the section is
# inserted before "## Workflow Tracking" if present, else appended.
# Usage: test_cache_write "$SESSION_FILE" "GREEN" 42 0 1 "12s"
# Arguments: $1 - path to the session file (must exist)
#            $2 - result (GREEN, RED, YELLOW)
#            $3 - pass count   $4 - fail count   $5 - skip count
#            $6 - duration, e.g. "12s"
# Outputs:   confirmation line on stdout; errors on stderr
# Returns:   0 on success, 1 if the file is missing or cannot be updated
#######################################
test_cache_write() {
  local session_file="$1"
  local result="$2"    # GREEN, RED, YELLOW
  local pass="$3"      # pass count
  local fail="$4"      # fail count
  local skip="$5"      # skip count
  local duration="$6"  # e.g., "12s"

  if [[ ! -f "$session_file" ]]; then
    echo "Session file not found: $session_file" >&2
    return 1
  fi

  local git_sha timestamp
  git_sha=$(git rev-parse HEAD 2>/dev/null || echo "unknown")
  timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

  local cache_section="## Test Cache

| Field | Value |
|-------|-------|
| Last Run | $timestamp |
| Git SHA | $git_sha |
| Result | $result |
| Pass | $pass |
| Fail | $fail |
| Skip | $skip |
| Duration | $duration |"

  # Choose the rewrite program. The section text reaches awk through the
  # environment: a multi-line value given with -v is rejected by some awk
  # implementations, and -v also interprets backslash escapes.
  local awk_program=""
  if grep -q "^## Test Cache" "$session_file" 2>/dev/null; then
    # Replace the existing section: drop everything up to the next "## "
    # header and keep a blank line between the new table and that header.
    awk_program='
      /^## Test Cache/ {
        in_section = 1
        print ENVIRON["CACHE_SECTION"]
        next
      }
      /^## / && in_section {
        in_section = 0
        print ""
      }
      !in_section { print }
    '
  elif grep -q "^## Workflow Tracking" "$session_file" 2>/dev/null; then
    # Insert before "## Workflow Tracking".
    awk_program='
      /^## Workflow Tracking/ {
        print ENVIRON["CACHE_SECTION"]
        print ""
      }
      { print }
    '
  fi

  if [[ -n "$awk_program" ]]; then
    local temp_file
    temp_file=$(mktemp) || return 1

    if ! CACHE_SECTION="$cache_section" awk "$awk_program" "$session_file" > "$temp_file"; then
      rm -f -- "$temp_file"
      return 1
    fi

    # Copy the content back instead of moving the temp file over the
    # original, so the session file keeps its own permissions and owner
    # (mktemp files are created private to the current user).
    if ! cat -- "$temp_file" > "$session_file"; then
      rm -f -- "$temp_file"
      return 1
    fi
    rm -f -- "$temp_file"
  else
    # No anchor section: append at the end, separated by a newline.
    printf '\n%s\n' "$cache_section" >> "$session_file"
  fi

  echo "Test cache written: $result ($pass passed, $fail failed, $skip skipped)"
}
152
+
153
#######################################
# Print a short, human-readable cache summary for debugging.
# Arguments: $1 - path to the session file
# Outputs:   "Cache: VALID" followed by result, SHA and time, or
#            "Cache: INVALID or MISSING"
#######################################
test_cache_status() {
  local session_file="$1"

  # Guard clause: nothing more to report without a valid cache.
  if ! test_cache_valid "$session_file"; then
    printf '%s\n' "Cache: INVALID or MISSING"
    return
  fi

  printf '%s\n' "Cache: VALID"
  printf ' Result: %s\n' "$(test_cache_get "$session_file" "result")"
  printf ' SHA: %s\n' "$(test_cache_get "$session_file" "sha")"
  printf ' Time: %s\n' "$(test_cache_get "$session_file" "time")"
}