@pennyfarthing/core 7.6.0 → 7.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/README.md +109 -201
  2. package/package.json +1 -1
  3. package/packages/core/dist/cli/commands/doctor.d.ts.map +1 -1
  4. package/packages/core/dist/cli/commands/doctor.js +91 -0
  5. package/packages/core/dist/cli/commands/doctor.js.map +1 -1
  6. package/packages/core/dist/cli/commands/init.js +31 -0
  7. package/packages/core/dist/cli/commands/init.js.map +1 -1
  8. package/packages/core/dist/cli/commands/update.js +31 -0
  9. package/packages/core/dist/cli/commands/update.js.map +1 -1
  10. package/pennyfarthing-dist/agents/architect.md +48 -53
  11. package/pennyfarthing-dist/agents/dev.md +74 -164
  12. package/pennyfarthing-dist/agents/devops.md +44 -39
  13. package/pennyfarthing-dist/agents/handoff.md +46 -23
  14. package/pennyfarthing-dist/agents/orchestrator.md +84 -255
  15. package/pennyfarthing-dist/agents/pm.md +40 -50
  16. package/pennyfarthing-dist/agents/reviewer-preflight.md +58 -26
  17. package/pennyfarthing-dist/agents/reviewer.md +107 -298
  18. package/pennyfarthing-dist/agents/sm-file-summary.md +51 -30
  19. package/pennyfarthing-dist/agents/sm-finish.md +59 -38
  20. package/pennyfarthing-dist/agents/sm-handoff.md +40 -33
  21. package/pennyfarthing-dist/agents/sm-setup.md +89 -47
  22. package/pennyfarthing-dist/agents/sm.md +171 -558
  23. package/pennyfarthing-dist/agents/tea.md +77 -146
  24. package/pennyfarthing-dist/agents/tech-writer.md +43 -24
  25. package/pennyfarthing-dist/agents/testing-runner.md +73 -30
  26. package/pennyfarthing-dist/agents/ux-designer.md +39 -25
  27. package/pennyfarthing-dist/agents/workflow-status-check.md +34 -16
  28. package/pennyfarthing-dist/commands/benchmark.md +19 -1
  29. package/pennyfarthing-dist/commands/continue-session.md +1 -1
  30. package/pennyfarthing-dist/commands/solo.md +5 -0
  31. package/pennyfarthing-dist/commands/theme-maker.md +5 -5
  32. package/pennyfarthing-dist/commands/work.md +1 -1
  33. package/pennyfarthing-dist/guides/XML-TAGS.md +179 -0
  34. package/pennyfarthing-dist/guides/agent-behavior.md +37 -2
  35. package/pennyfarthing-dist/guides/agent-tag-taxonomy.md +432 -0
  36. package/pennyfarthing-dist/guides/patterns/approval-gates-pattern.md +27 -7
  37. package/pennyfarthing-dist/guides/scale-levels.md +114 -0
  38. package/pennyfarthing-dist/personas/themes/gilligans-island.yaml +2 -2
  39. package/pennyfarthing-dist/personas/themes/star-trek-tos.yaml +1 -1
  40. package/pennyfarthing-dist/scripts/core/agent-session.sh +13 -7
  41. package/pennyfarthing-dist/scripts/core/check-context.sh +25 -8
  42. package/pennyfarthing-dist/scripts/core/prime.sh +57 -32
  43. package/pennyfarthing-dist/scripts/git/create-feature-branches.sh +45 -4
  44. package/pennyfarthing-dist/scripts/git/git-status-all.sh +32 -7
  45. package/pennyfarthing-dist/scripts/hooks/bell-mode-hook.sh +30 -11
  46. package/pennyfarthing-dist/scripts/hooks/pre-commit.sh +80 -23
  47. package/pennyfarthing-dist/scripts/hooks/question-reflector-check.mjs +393 -0
  48. package/pennyfarthing-dist/scripts/hooks/question-reflector-check.sh +20 -0
  49. package/pennyfarthing-dist/scripts/hooks/question_reflector_check.py +402 -0
  50. package/pennyfarthing-dist/scripts/hooks/session-stop.sh +7 -0
  51. package/pennyfarthing-dist/scripts/hooks/tests/question-reflector.test.mjs +545 -0
  52. package/pennyfarthing-dist/scripts/hooks/welcome-hook.sh +94 -0
  53. package/pennyfarthing-dist/scripts/jira/jira-claim-story.sh +10 -152
  54. package/pennyfarthing-dist/scripts/jira/jira-sync-story.sh +14 -4
  55. package/pennyfarthing-dist/scripts/jira/jira-sync.sh +12 -4
  56. package/pennyfarthing-dist/scripts/jira/sync-epic-jira.sh +11 -99
  57. package/pennyfarthing-dist/scripts/lib/common.sh +55 -0
  58. package/pennyfarthing-dist/scripts/maintenance/sidecar-health.sh +97 -0
  59. package/pennyfarthing-dist/scripts/misc/deploy.sh +13 -1
  60. package/pennyfarthing-dist/scripts/misc/statusline.sh +27 -22
  61. package/pennyfarthing-dist/scripts/story/create-story.sh +14 -154
  62. package/pennyfarthing-dist/scripts/story/size-story.sh +12 -192
  63. package/pennyfarthing-dist/scripts/story/story-template.sh +12 -156
  64. package/pennyfarthing-dist/scripts/test/ground-truth-judge.py +24 -93
  65. package/pennyfarthing-dist/scripts/test/swebench-judge.py +33 -59
  66. package/pennyfarthing-dist/scripts/validation/validate-agent-schema.sh +575 -0
  67. package/pennyfarthing-dist/scripts/workflow/check.py +502 -0
  68. package/pennyfarthing-dist/skills/skill-registry.yaml +52 -16
  69. package/pennyfarthing-dist/skills/sprint/skill.md +1 -1
  70. package/pennyfarthing-dist/templates/settings.local.json.template +11 -0
@@ -16,71 +16,21 @@ import sys
16
16
  from pathlib import Path
17
17
  from difflib import SequenceMatcher
18
18
 
19
- def load_swebench_data(cache_path="/tmp/swebench_all.json"):
20
- """Load SWE-bench data from cache."""
21
- with open(cache_path, 'r') as f:
22
- return json.load(f)
23
-
24
- def find_scenario(data, scenario_name):
25
- """Find scenario in SWE-bench data by name."""
26
- # Normalize name (flask-5014 -> pallets__flask-5014)
27
- for item in data:
28
- instance_id = item.get('instance_id', '')
29
- # Try various matching strategies
30
- if scenario_name in instance_id.replace('__', '-'):
31
- return item
32
- if scenario_name.replace('-', '__') in instance_id:
33
- return item
34
- return None
35
-
36
- def extract_patch_elements(patch_text):
37
- """Extract key elements from a patch."""
38
- elements = {
39
- 'files': [],
40
- 'functions': [],
41
- 'additions': [],
42
- 'deletions': [],
43
- 'key_patterns': []
44
- }
19
+ # Add parent to path for pennyfarthing_scripts imports
20
+ sys.path.insert(0, str(Path(__file__).resolve().parents[3]))
21
+
22
+ from pennyfarthing_scripts.swebench import (
23
+ extract_patch_info,
24
+ extract_problem_keywords,
25
+ find_scenario,
26
+ get_meaningful_patterns,
27
+ load_swebench_data,
28
+ )
45
29
 
46
- current_file = None
47
- for line in patch_text.split('\n'):
48
- # File changes
49
- if line.startswith('diff --git'):
50
- match = re.search(r'b/(.+)$', line)
51
- if match:
52
- current_file = match.group(1)
53
- elements['files'].append(current_file)
54
-
55
- # Function/class context
56
- if line.startswith('@@'):
57
- match = re.search(r'@@.*@@\s*(.+)$', line)
58
- if match:
59
- elements['functions'].append(match.group(1).strip())
60
-
61
- # Additions
62
- if line.startswith('+') and not line.startswith('+++'):
63
- clean_line = line[1:].strip()
64
- if clean_line and not clean_line.startswith('#'):
65
- elements['additions'].append(clean_line)
66
- # Extract key patterns (function calls, variable names, etc.)
67
- patterns = re.findall(r'\b\w+\b', clean_line)
68
- elements['key_patterns'].extend(patterns)
69
-
70
- # Deletions
71
- if line.startswith('-') and not line.startswith('---'):
72
- clean_line = line[1:].strip()
73
- if clean_line and not clean_line.startswith('#'):
74
- elements['deletions'].append(clean_line)
75
-
76
- # Deduplicate
77
- elements['key_patterns'] = list(set(elements['key_patterns']))
78
-
79
- return elements
80
30
 
81
31
  def score_response(response_text, ground_truth):
82
32
  """Score a response against ground truth patch."""
83
- gt_elements = extract_patch_elements(ground_truth['patch'])
33
+ patch_info = extract_patch_info(ground_truth['patch'])
84
34
 
85
35
  scores = {
86
36
  'file_identification': 0,
@@ -94,16 +44,16 @@ def score_response(response_text, ground_truth):
94
44
 
95
45
  # 1. FILE IDENTIFICATION (20 points)
96
46
  files_found = 0
97
- for f in gt_elements['files']:
47
+ for f in patch_info.files:
98
48
  # Check various forms of the filename
99
49
  filename = Path(f).name
100
50
  if filename.lower() in response_lower or f.lower() in response_lower:
101
51
  files_found += 1
102
52
 
103
- if gt_elements['files']:
104
- file_score = (files_found / len(gt_elements['files'])) * 20
53
+ if patch_info.files:
54
+ file_score = (files_found / len(patch_info.files)) * 20
105
55
  scores['file_identification'] = min(20, file_score)
106
- scores['details']['files_expected'] = gt_elements['files']
56
+ scores['details']['files_expected'] = patch_info.files
107
57
  scores['details']['files_found'] = files_found
108
58
  else:
109
59
  scores['file_identification'] = 20 # No specific file in patch
@@ -111,7 +61,7 @@ def score_response(response_text, ground_truth):
111
61
  # 2. LOCATION IDENTIFICATION (20 points)
112
62
  # Look for function/class names mentioned in the patch
113
63
  locations_found = 0
114
- for func in gt_elements['functions']:
64
+ for func in patch_info.functions:
115
65
  # Extract the function/class name
116
66
  func_match = re.search(r'(def|class)\s+(\w+)', func)
117
67
  if func_match:
@@ -121,20 +71,17 @@ def score_response(response_text, ground_truth):
121
71
  elif func.strip() and func.strip().split()[0] in response_lower:
122
72
  locations_found += 1
123
73
 
124
- if gt_elements['functions']:
125
- loc_score = (locations_found / len(gt_elements['functions'])) * 20
74
+ if patch_info.functions:
75
+ loc_score = (locations_found / len(patch_info.functions)) * 20
126
76
  scores['location_identification'] = min(20, loc_score)
127
- scores['details']['locations_expected'] = gt_elements['functions'][:3]
77
+ scores['details']['locations_expected'] = patch_info.functions[:3]
128
78
  scores['details']['locations_found'] = locations_found
129
79
  else:
130
80
  scores['location_identification'] = 10 # Partial credit
131
81
 
132
82
  # 3. FIX LOGIC MATCH (40 points)
133
83
  # Check if key code patterns from the fix appear in the response
134
- key_patterns = gt_elements['key_patterns']
135
- # Filter to meaningful patterns (not common words)
136
- common_words = {'if', 'else', 'return', 'self', 'def', 'class', 'for', 'in', 'not', 'and', 'or', 'is', 'none', 'true', 'false'}
137
- meaningful_patterns = [p for p in key_patterns if p.lower() not in common_words and len(p) > 2]
84
+ meaningful_patterns = get_meaningful_patterns(patch_info.key_patterns)
138
85
 
139
86
  patterns_found = 0
140
87
  for pattern in meaningful_patterns:
@@ -150,7 +97,7 @@ def score_response(response_text, ground_truth):
150
97
 
151
98
  # Check for actual code additions
152
99
  additions_matched = 0
153
- for addition in gt_elements['additions'][:5]: # Check first 5 additions
100
+ for addition in patch_info.additions[:5]: # Check first 5 additions
154
101
  # Normalize and check
155
102
  addition_normalized = re.sub(r'\s+', ' ', addition.lower())
156
103
  response_normalized = re.sub(r'\s+', ' ', response_lower)
@@ -160,8 +107,8 @@ def score_response(response_text, ground_truth):
160
107
  if similarity > 0.6 or addition_normalized in response_normalized:
161
108
  additions_matched += 1
162
109
 
163
- if gt_elements['additions']:
164
- addition_score = (additions_matched / min(5, len(gt_elements['additions']))) * 20
110
+ if patch_info.additions:
111
+ addition_score = (additions_matched / min(5, len(patch_info.additions))) * 20
165
112
  scores['details']['additions_matched'] = additions_matched
166
113
  else:
167
114
  addition_score = 10
@@ -205,23 +152,6 @@ def score_response(response_text, ground_truth):
205
152
 
206
153
  return scores
207
154
 
208
- def extract_problem_keywords(problem_statement):
209
- """Extract key technical terms from problem statement."""
210
- if not problem_statement:
211
- return []
212
-
213
- # Find quoted strings, function names, error messages
214
- keywords = []
215
-
216
- # Find quoted terms
217
- quoted = re.findall(r'[`\'"]([^`\'"]+)[`\'"]', problem_statement)
218
- keywords.extend(quoted)
219
-
220
- # Find CamelCase or snake_case identifiers
221
- identifiers = re.findall(r'\b[A-Z][a-z]+[A-Z]\w*\b|\b\w+_\w+\b', problem_statement)
222
- keywords.extend(identifiers)
223
-
224
- return list(set(keywords))[:10]
225
155
 
226
156
  def main():
227
157
  if len(sys.argv) < 3:
@@ -285,5 +215,6 @@ def main():
285
215
 
286
216
  return scores
287
217
 
218
+
288
219
  if __name__ == '__main__':
289
220
  main()
@@ -17,52 +17,15 @@ import sys
17
17
  from pathlib import Path
18
18
  from difflib import SequenceMatcher
19
19
 
20
- def load_swebench_data(cache_path="/tmp/swebench_all.json"):
21
- """Load SWE-bench ground truth data."""
22
- with open(cache_path, 'r') as f:
23
- return json.load(f)
24
-
25
- def find_ground_truth(data, scenario_name):
26
- """Find scenario in SWE-bench data."""
27
- for item in data:
28
- instance_id = item.get('instance_id', '')
29
- if scenario_name in instance_id.replace('__', '-'):
30
- return item
31
- return None
32
-
33
- def extract_patch_info(patch_text):
34
- """Extract structured info from patch."""
35
- info = {
36
- 'files': [],
37
- 'functions': [],
38
- 'additions': [],
39
- 'deletions': [],
40
- 'key_code': []
41
- }
42
-
43
- for line in patch_text.split('\n'):
44
- if line.startswith('diff --git'):
45
- match = re.search(r'b/(.+)$', line)
46
- if match:
47
- info['files'].append(match.group(1))
48
-
49
- if line.startswith('@@'):
50
- match = re.search(r'@@.*@@\s*(.+)$', line)
51
- if match:
52
- info['functions'].append(match.group(1).strip())
53
-
54
- if line.startswith('+') and not line.startswith('+++'):
55
- clean = line[1:].strip()
56
- if clean and not clean.startswith('#'):
57
- info['additions'].append(clean)
58
- info['key_code'].append(clean)
20
+ # Add parent to path for pennyfarthing_scripts imports
21
+ sys.path.insert(0, str(Path(__file__).resolve().parents[3]))
59
22
 
60
- if line.startswith('-') and not line.startswith('---'):
61
- clean = line[1:].strip()
62
- if clean and not clean.startswith('#'):
63
- info['deletions'].append(clean)
23
+ from pennyfarthing_scripts.swebench import (
24
+ extract_patch_info,
25
+ find_scenario,
26
+ load_swebench_data,
27
+ )
64
28
 
65
- return info
66
29
 
67
30
  def score_identifies_bug_location(response, ground_truth):
68
31
  """Score IDENTIFIES_BUG_LOCATION (15 pts) using ground truth."""
@@ -74,34 +37,35 @@ def score_identifies_bug_location(response, ground_truth):
74
37
 
75
38
  # Check files (7.5 pts)
76
39
  files_found = 0
77
- for f in patch_info['files']:
40
+ for f in patch_info.files:
78
41
  filename = Path(f).name.lower()
79
42
  if filename in response_lower or f.lower() in response_lower:
80
43
  files_found += 1
81
44
 
82
- if patch_info['files']:
83
- file_score = (files_found / len(patch_info['files'])) * 7.5
45
+ if patch_info.files:
46
+ file_score = (files_found / len(patch_info.files)) * 7.5
84
47
  score += file_score
85
- details.append(f"Files: {files_found}/{len(patch_info['files'])} found")
48
+ details.append(f"Files: {files_found}/{len(patch_info.files)} found")
86
49
 
87
50
  # Check functions/classes (7.5 pts)
88
51
  funcs_found = 0
89
- for func in patch_info['functions']:
52
+ for func in patch_info.functions:
90
53
  func_match = re.search(r'(def|class)\s+(\w+)', func)
91
54
  if func_match:
92
55
  func_name = func_match.group(2).lower()
93
56
  if func_name in response_lower:
94
57
  funcs_found += 1
95
58
 
96
- if patch_info['functions']:
97
- func_score = min(7.5, (funcs_found / len(patch_info['functions'])) * 7.5)
59
+ if patch_info.functions:
60
+ func_score = min(7.5, (funcs_found / len(patch_info.functions)) * 7.5)
98
61
  score += func_score
99
- details.append(f"Functions: {funcs_found}/{len(patch_info['functions'])} found")
62
+ details.append(f"Functions: {funcs_found}/{len(patch_info.functions)} found")
100
63
  else:
101
64
  score += 3.75 # Partial credit if no specific function in patch
102
65
 
103
66
  return min(15, score), details
104
67
 
68
+
105
69
  def score_explains_why_broken(response, ground_truth):
106
70
  """Score EXPLAINS_WHY_BROKEN (15 pts)."""
107
71
  response_lower = response.lower()
@@ -133,6 +97,7 @@ def score_explains_why_broken(response, ground_truth):
133
97
 
134
98
  return min(15, score), details
135
99
 
100
+
136
101
  def score_fix_addresses_issue(response, ground_truth):
137
102
  """Score FIX_ADDRESSES_ISSUE (20 pts) using ground truth patch."""
138
103
  patch_info = extract_patch_info(ground_truth.get('patch', ''))
@@ -143,7 +108,7 @@ def score_fix_addresses_issue(response, ground_truth):
143
108
 
144
109
  # Check if key additions from patch appear in response
145
110
  additions_matched = 0
146
- for addition in patch_info['additions'][:5]:
111
+ for addition in patch_info.additions[:5]:
147
112
  # Normalize whitespace
148
113
  addition_norm = re.sub(r'\s+', ' ', addition.lower())
149
114
  response_norm = re.sub(r'\s+', ' ', response_lower)
@@ -157,10 +122,10 @@ def score_fix_addresses_issue(response, ground_truth):
157
122
  if sim > 0.7:
158
123
  additions_matched += 0.5
159
124
 
160
- if patch_info['additions']:
161
- addition_score = (additions_matched / min(5, len(patch_info['additions']))) * 15
125
+ if patch_info.additions:
126
+ addition_score = (additions_matched / min(5, len(patch_info.additions))) * 15
162
127
  score += addition_score
163
- details.append(f"Code matches: {additions_matched}/{min(5, len(patch_info['additions']))}")
128
+ details.append(f"Code matches: {additions_matched}/{min(5, len(patch_info.additions))}")
164
129
 
165
130
  # Check for code block with fix
166
131
  if '```' in response:
@@ -169,6 +134,7 @@ def score_fix_addresses_issue(response, ground_truth):
169
134
 
170
135
  return min(20, score), details
171
136
 
137
+
172
138
  def score_fix_is_minimal(response, ground_truth):
173
139
  """Score FIX_IS_MINIMAL (10 pts)."""
174
140
  patch_info = extract_patch_info(ground_truth.get('patch', ''))
@@ -177,7 +143,7 @@ def score_fix_is_minimal(response, ground_truth):
177
143
  details = []
178
144
 
179
145
  # Count lines in patch vs lines in response code blocks
180
- patch_lines = len(patch_info['additions']) + len(patch_info['deletions'])
146
+ patch_lines = len(patch_info.additions) + len(patch_info.deletions)
181
147
 
182
148
  # Extract code blocks from response
183
149
  code_blocks = re.findall(r'```[\w]*\n(.*?)```', response, re.DOTALL)
@@ -200,6 +166,7 @@ def score_fix_is_minimal(response, ground_truth):
200
166
 
201
167
  return min(10, score), details
202
168
 
169
+
203
170
  def score_fix_syntax_correct(response):
204
171
  """Score FIX_SYNTAX_CORRECT (10 pts)."""
205
172
  score = 0
@@ -232,6 +199,7 @@ def score_fix_syntax_correct(response):
232
199
 
233
200
  return min(10, score), details
234
201
 
202
+
235
203
  def score_edge_cases(response):
236
204
  """Score EDGE_CASES (10 pts)."""
237
205
  response_lower = response.lower()
@@ -247,6 +215,7 @@ def score_edge_cases(response):
247
215
 
248
216
  return score, details
249
217
 
218
+
250
219
  def score_test_coverage(response):
251
220
  """Score TEST_COVERAGE (10 pts)."""
252
221
  response_lower = response.lower()
@@ -271,6 +240,7 @@ def score_test_coverage(response):
271
240
 
272
241
  return min(10, score), details
273
242
 
243
+
274
244
  def score_in_character(response, persona="senior developer"):
275
245
  """Score IN_CHARACTER (10 pts)."""
276
246
  response_lower = response.lower()
@@ -287,9 +257,10 @@ def score_in_character(response, persona="senior developer"):
287
257
 
288
258
  return score, details
289
259
 
260
+
290
261
  def judge_response(scenario_name, response_text, swebench_data):
291
262
  """Full judgment using scenario rubric + ground truth."""
292
- ground_truth = find_ground_truth(swebench_data, scenario_name)
263
+ ground_truth = find_scenario(swebench_data, scenario_name)
293
264
 
294
265
  if not ground_truth:
295
266
  return {'error': f'Scenario {scenario_name} not found in SWE-bench data'}
@@ -345,15 +316,17 @@ def judge_response(scenario_name, response_text, swebench_data):
345
316
  scores['persona']['subtotal']
346
317
  )
347
318
 
319
+ patch_info = extract_patch_info(ground_truth.get('patch', ''))
348
320
  return {
349
321
  'scenario': scenario_name,
350
322
  'instance_id': ground_truth.get('instance_id'),
351
323
  'scores': scores,
352
324
  'total': round(total, 1),
353
325
  'details': all_details,
354
- 'ground_truth_files': extract_patch_info(ground_truth.get('patch', ''))['files']
326
+ 'ground_truth_files': patch_info.files
355
327
  }
356
328
 
329
+
357
330
  def main():
358
331
  if len(sys.argv) < 3:
359
332
  print("Usage: swebench-judge.py <scenario_name> <response_file>")
@@ -396,5 +369,6 @@ def main():
396
369
  json.dump(result, f, indent=2)
397
370
  print(f"\nSaved to: {output_path}")
398
371
 
372
+
399
373
  if __name__ == '__main__':
400
374
  main()