claude-dev-env 1.57.1 → 1.58.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/install.mjs +217 -27
- package/bin/install.test.mjs +344 -1
- package/hooks/blocking/intent_only_ending_blocker.py +155 -0
- package/hooks/blocking/session_handoff_blocker.py +190 -0
- package/hooks/blocking/test_intent_only_ending_blocker.py +175 -0
- package/hooks/blocking/test_session_handoff_blocker.py +312 -0
- package/hooks/hooks.json +10 -0
- package/hooks/hooks_constants/messages.py +4 -0
- package/hooks/hooks_constants/session_handoff_blocker_constants.py +10 -0
- package/hooks/workflow/auto_formatter.py +26 -1
- package/hooks/workflow/test_auto_formatter.py +134 -0
- package/package.json +1 -1
- package/rules/conservative-action.md +1 -0
- package/rules/long-horizon-autonomy.md +43 -0
- package/skills/autoconverge/SKILL.md +56 -6
- package/skills/autoconverge/reference/closing-report.md +44 -0
- package/skills/autoconverge/workflow/autoconverge_report_constants/__init__.py +0 -0
- package/skills/autoconverge/workflow/autoconverge_report_constants/render_report_constants.py +105 -0
- package/skills/autoconverge/workflow/converge.contract.test.mjs +30 -1
- package/skills/autoconverge/workflow/converge.mjs +12 -14
- package/skills/autoconverge/workflow/fixtures/wf_run/subagents/workflows/wf_881252e6-700/agent-a11d903476b803493.jsonl +2 -0
- package/skills/autoconverge/workflow/fixtures/wf_run/subagents/workflows/wf_881252e6-700/agent-a26213978adeef6fb.jsonl +2 -0
- package/skills/autoconverge/workflow/fixtures/wf_run/subagents/workflows/wf_881252e6-700/agent-a3def0d15ed9d9110.jsonl +2 -0
- package/skills/autoconverge/workflow/fixtures/wf_run/subagents/workflows/wf_881252e6-700/agent-a41f41b1b708ee3b7.jsonl +2 -0
- package/skills/autoconverge/workflow/fixtures/wf_run/subagents/workflows/wf_881252e6-700/agent-a758b880abecc3ff7.jsonl +2 -0
- package/skills/autoconverge/workflow/fixtures/wf_run/subagents/workflows/wf_881252e6-700/agent-a8897b89656b1bd16.jsonl +2 -0
- package/skills/autoconverge/workflow/fixtures/wf_run/subagents/workflows/wf_881252e6-700/agent-abd463d744a1437bc.jsonl +2 -0
- package/skills/autoconverge/workflow/fixtures/wf_run/subagents/workflows/wf_881252e6-700/agent-ad19d027ae8ee1816.jsonl +2 -0
- package/skills/autoconverge/workflow/fixtures/wf_run/workflows/wf_881252e6-700.json +259 -0
- package/skills/autoconverge/workflow/render_report.py +903 -0
- package/skills/autoconverge/workflow/test_render_report.py +484 -0
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
"""Tests for session_handoff_blocker hook response shape."""
|
|
2
|
+
|
|
3
|
+
import importlib.util
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import subprocess
|
|
7
|
+
import sys
|
|
8
|
+
|
|
9
|
+
HOOK_SCRIPT_PATH = os.path.join(os.path.dirname(__file__), "session_handoff_blocker.py")
|
|
10
|
+
_HOOKS_DIR = os.path.dirname(HOOK_SCRIPT_PATH)
|
|
11
|
+
_HOOKS_ROOT = os.path.join(_HOOKS_DIR, "..")
|
|
12
|
+
_HOOK_CONFIG_DIR = os.path.join(_HOOKS_ROOT, "hooks_constants")
|
|
13
|
+
if _HOOKS_DIR not in sys.path:
|
|
14
|
+
sys.path.insert(0, _HOOKS_DIR)
|
|
15
|
+
if _HOOKS_ROOT not in sys.path:
|
|
16
|
+
sys.path.insert(0, _HOOKS_ROOT)
|
|
17
|
+
import session_handoff_blocker
|
|
18
|
+
from hooks_constants.messages import USER_FACING_CONTEXT_REASSURANCE_NOTICE
|
|
19
|
+
|
|
20
|
+
# --- Messages the tests in this module expect the hook to block -------------
NEW_SESSION_PROPOSAL_MESSAGE = (
    "I recommend we continue this in a fresh session to keep things manageable."
)
RUNNING_LOW_ON_CONTEXT_MESSAGE = (
    "We are running low on context, so let me summarize where things stand."
)
SHORT_ON_TOKENS_MESSAGE = "I'm getting short on tokens, so I'll wrap up here."
CONSERVE_CONTEXT_MESSAGE = "To conserve context, let me stop and hand off the remaining work."
CONTEXT_WINDOW_HANDOFF_MESSAGE = (
    "The context window is filling up, so I'll wrap up and we can continue later."
)
HANDOFF_NEW_SESSION_MESSAGE = (
    "Let's wrap up and continue this in a fresh session to pick this up later."
)
LOW_ON_CONTEXT_WITH_HANDOFF_CUE_MESSAGE = (
    "I'm low on context, so let me wrap up and hand off."
)
CONTEXT_WINDOW_SUMMARIZE_MESSAGE = (
    "The context window is getting full, so let me summarize where we are."
)

# --- Messages the tests in this module expect to pass through silently ------
BENIGN_TOPICAL_MESSAGE = "The function accepts a context manager and a token string."
CLEAN_MESSAGE = "The parser handles every fixture and returns a deduplicated list."
TECHNICAL_TERMINAL_SESSION_MESSAGE = (
    "Consider starting a new session in your terminal to pick up the env vars."
)
LOAD_TEST_SESSION_MESSAGE = "We can spin up a fresh session for the load test."
DATABASE_SESSION_MESSAGE = "Open a new database session before running the query."
LOW_ON_CONTEXT_WITHOUT_CUE_MESSAGE = (
    "I am low on context for this edge case in the parser."
)
SAVE_TOKENS_REPORT_MESSAGE = "To save tokens, I inlined the constant."
NEW_SESSION_TOKEN_MESSAGE = (
    "To continue this task I need the new session token from the API."
)
RESUME_TRAFFIC_FRESH_SESSION_POOL_MESSAGE = (
    "After deploy, resume traffic in a fresh session pool."
)
THIRD_PERSON_RUNS_LOW_DESIGN_MESSAGE = (
    "The agent should stop when it runs low on context, per the old design."
)
DOCUMENT_RUNNING_LOW_MESSAGE = (
    "Document that agents running low on tokens should not stop."
)
USER_ADVICE_NEW_SESSION_MESSAGE = (
    "To save tokens, you could start a new session for the unrelated task."
)
BENIGN_SUMMARIZE_REPORT_MESSAGE = "I should summarize the findings for the report."
BENIGN_STOP_PLUS_UNRELATED_RUNNING_LOW_MESSAGE = (
    "Let me pause the animation timer. The job fails when it is running low on tokens."
)
BENIGN_STOP_PLUS_USER_DIRECTED_NEW_SESSION_MESSAGE = (
    "Let me stop the timer. The user can continue this in a fresh session."
)
EMPTY_MESSAGE = ""
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def run_hook_with_message(assistant_message: str) -> subprocess.CompletedProcess:
    """Run the hook script with a payload carrying only ``last_assistant_message``.

    Delegates to ``run_hook_with_payload`` so the subprocess invocation lives in
    one place and both helpers always launch the hook the same way.

    Args:
        assistant_message: Text placed under the ``last_assistant_message`` key.

    Returns:
        The completed hook process, with stdout/stderr captured as text.
    """
    return run_hook_with_payload({"last_assistant_message": assistant_message})
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def run_hook_with_payload(
    hook_input_payload: dict, *, timeout_seconds: float = 60.0
) -> subprocess.CompletedProcess:
    """Run the hook script as a subprocess, feeding it a JSON payload on stdin.

    Args:
        hook_input_payload: Mapping serialized with ``json.dumps`` and written to
            the hook's stdin.
        timeout_seconds: Upper bound on the hook's run time. Without a bound, a
            hook that never exits would stall the whole test run.

    Returns:
        The completed process with stdout/stderr captured as text. A non-zero exit
        status does not raise (``check=False``); callers assert on ``returncode``.

    Raises:
        subprocess.TimeoutExpired: If the hook runs longer than ``timeout_seconds``.
    """
    return subprocess.run(
        [sys.executable, HOOK_SCRIPT_PATH],
        input=json.dumps(hook_input_payload),
        capture_output=True,
        text=True,
        timeout=timeout_seconds,
        check=False,
    )
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def test_user_facing_notice_matches_config_messages_module():
    """The notice loaded from ``messages.py`` by path equals both imported copies.

    Loads ``hooks_constants/messages.py`` directly from disk and compares its
    ``USER_FACING_CONTEXT_REASSURANCE_NOTICE`` with the value imported at the top of
    this module and with the attribute exposed by ``session_handoff_blocker``.
    """
    config_messages_path = os.path.join(_HOOK_CONFIG_DIR, "messages.py")
    specification = importlib.util.spec_from_file_location("messages", config_messages_path)
    # spec_from_file_location can return None, and a spec can lack a loader; fail
    # with a clear assertion instead of an AttributeError further down.
    assert specification is not None and specification.loader is not None
    module = importlib.util.module_from_spec(specification)
    specification.loader.exec_module(module)

    assert module.USER_FACING_CONTEXT_REASSURANCE_NOTICE == USER_FACING_CONTEXT_REASSURANCE_NOTICE
    assert (
        session_handoff_blocker.USER_FACING_CONTEXT_REASSURANCE_NOTICE
        == module.USER_FACING_CONTEXT_REASSURANCE_NOTICE
    )
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def test_new_session_proposal_emits_block_with_short_user_notice():
    """A fresh-session proposal yields exit 0 and a fully populated block response."""
    hook_result = run_hook_with_message(NEW_SESSION_PROPOSAL_MESSAGE)

    assert hook_result.returncode == 0
    response = json.loads(hook_result.stdout)

    assert response["decision"] == "block"
    assert response["systemMessage"] == USER_FACING_CONTEXT_REASSURANCE_NOTICE
    assert response["suppressOutput"] is True
    # The reason text must carry both the reassurance and the rule name.
    block_reason = response["reason"]
    assert "ample context remaining" in block_reason
    assert "long-horizon-autonomy" in block_reason
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def test_running_low_on_context_emits_block():
    """A "running low on context" summary yields exit 0 and a block with the user notice."""
    hook_result = run_hook_with_message(RUNNING_LOW_ON_CONTEXT_MESSAGE)

    assert hook_result.returncode == 0
    response = json.loads(hook_result.stdout)

    assert response["decision"] == "block"
    assert response["systemMessage"] == USER_FACING_CONTEXT_REASSURANCE_NOTICE
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def test_short_on_tokens_emits_block():
    """A "short on tokens" wrap-up yields exit 0 and a block decision."""
    hook_result = run_hook_with_message(SHORT_ON_TOKENS_MESSAGE)

    assert hook_result.returncode == 0
    response = json.loads(hook_result.stdout)

    assert response["decision"] == "block"
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def test_conserve_context_emits_block():
    """A "to conserve context" hand-off yields exit 0 and a block decision."""
    hook_result = run_hook_with_message(CONSERVE_CONTEXT_MESSAGE)

    assert hook_result.returncode == 0
    response = json.loads(hook_result.stdout)

    assert response["decision"] == "block"
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def test_context_window_co_occurring_handoff_cue_emits_block():
    """A context-window remark paired with a wrap-up cue yields exit 0 and a block decision."""
    hook_result = run_hook_with_message(CONTEXT_WINDOW_HANDOFF_MESSAGE)

    assert hook_result.returncode == 0
    response = json.loads(hook_result.stdout)

    assert response["decision"] == "block"
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def test_benign_topical_mention_passes_through_with_no_output():
    """A sentence using "context" and "token" as technical terms gives exit 0 and empty stdout."""
    hook_result = run_hook_with_message(BENIGN_TOPICAL_MESSAGE)

    assert hook_result.returncode == 0
    assert hook_result.stdout == ""
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def test_technical_terminal_session_passes_through_with_no_output():
    """Advice to start a new terminal session gives exit 0 and empty stdout."""
    hook_result = run_hook_with_message(TECHNICAL_TERMINAL_SESSION_MESSAGE)

    assert hook_result.returncode == 0
    assert hook_result.stdout == ""
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def test_load_test_session_passes_through_with_no_output():
    """A "fresh session for the load test" remark gives exit 0 and empty stdout."""
    hook_result = run_hook_with_message(LOAD_TEST_SESSION_MESSAGE)

    assert hook_result.returncode == 0
    assert hook_result.stdout == ""
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def test_database_session_passes_through_with_no_output():
    """An instruction to open a new database session gives exit 0 and empty stdout."""
    hook_result = run_hook_with_message(DATABASE_SESSION_MESSAGE)

    assert hook_result.returncode == 0
    assert hook_result.stdout == ""
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def test_new_session_with_handoff_framing_emits_block():
    """A wrap-up that defers to a fresh session yields exit 0 and a block with the user notice."""
    hook_result = run_hook_with_message(HANDOFF_NEW_SESSION_MESSAGE)

    assert hook_result.returncode == 0
    response = json.loads(hook_result.stdout)

    assert response["decision"] == "block"
    assert response["systemMessage"] == USER_FACING_CONTEXT_REASSURANCE_NOTICE
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def test_low_on_context_without_handoff_cue_passes_through_with_no_output():
    """"Low on context" wording with no hand-off cue gives exit 0 and empty stdout."""
    hook_result = run_hook_with_message(LOW_ON_CONTEXT_WITHOUT_CUE_MESSAGE)

    assert hook_result.returncode == 0
    assert hook_result.stdout == ""
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def test_save_tokens_work_report_passes_through_with_no_output():
    """A work report that mentions saving tokens gives exit 0 and empty stdout."""
    hook_result = run_hook_with_message(SAVE_TOKENS_REPORT_MESSAGE)

    assert hook_result.returncode == 0
    assert hook_result.stdout == ""
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def test_low_on_context_with_handoff_cue_emits_block():
    """"Low on context" plus a wrap-up/hand-off cue yields exit 0 and a block with the user notice."""
    hook_result = run_hook_with_message(LOW_ON_CONTEXT_WITH_HANDOFF_CUE_MESSAGE)

    assert hook_result.returncode == 0
    response = json.loads(hook_result.stdout)

    assert response["decision"] == "block"
    assert response["systemMessage"] == USER_FACING_CONTEXT_REASSURANCE_NOTICE
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def test_new_session_token_passes_through_with_no_output():
    """A request for a "new session token" from an API gives exit 0 and empty stdout."""
    hook_result = run_hook_with_message(NEW_SESSION_TOKEN_MESSAGE)

    assert hook_result.returncode == 0
    assert hook_result.stdout == ""
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def test_resume_traffic_fresh_session_pool_passes_through_with_no_output():
    """A deployment note about a "fresh session pool" gives exit 0 and empty stdout."""
    hook_result = run_hook_with_message(RESUME_TRAFFIC_FRESH_SESSION_POOL_MESSAGE)

    assert hook_result.returncode == 0
    assert hook_result.stdout == ""
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def test_third_person_runs_low_description_passes_through_with_no_output():
    """A third-person description of an agent running low on context gives exit 0 and empty stdout."""
    hook_result = run_hook_with_message(THIRD_PERSON_RUNS_LOW_DESIGN_MESSAGE)

    assert hook_result.returncode == 0
    assert hook_result.stdout == ""
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def test_document_running_low_instruction_passes_through_with_no_output():
    """An instruction to document agents running low on tokens gives exit 0 and empty stdout."""
    hook_result = run_hook_with_message(DOCUMENT_RUNNING_LOW_MESSAGE)

    assert hook_result.returncode == 0
    assert hook_result.stdout == ""
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def test_user_directed_new_session_advice_passes_through_with_no_output():
    """Advice that the user start a new session for an unrelated task gives exit 0 and empty stdout."""
    hook_result = run_hook_with_message(USER_ADVICE_NEW_SESSION_MESSAGE)

    assert hook_result.returncode == 0
    assert hook_result.stdout == ""
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def test_context_window_summarize_handoff_emits_block():
    """A full-context-window remark followed by "let me summarize" yields exit 0 and a block."""
    hook_result = run_hook_with_message(CONTEXT_WINDOW_SUMMARIZE_MESSAGE)

    assert hook_result.returncode == 0
    response = json.loads(hook_result.stdout)

    assert response["decision"] == "block"
    assert response["systemMessage"] == USER_FACING_CONTEXT_REASSURANCE_NOTICE
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def test_benign_summarize_report_passes_through_with_no_output():
    """A plan to summarize findings for a report gives exit 0 and empty stdout."""
    hook_result = run_hook_with_message(BENIGN_SUMMARIZE_REPORT_MESSAGE)

    assert hook_result.returncode == 0
    assert hook_result.stdout == ""
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def test_benign_stop_with_unrelated_running_low_sentence_passes_through_with_no_output():
    """Pausing a timer, then a separate "running low on tokens" sentence: exit 0, empty stdout."""
    message = BENIGN_STOP_PLUS_UNRELATED_RUNNING_LOW_MESSAGE
    hook_result = run_hook_with_message(message)

    assert hook_result.returncode == 0
    assert hook_result.stdout == ""
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def test_benign_stop_with_user_directed_new_session_sentence_passes_through_with_no_output():
    """Stopping a timer, then a sentence about the user's fresh session: exit 0, empty stdout."""
    message = BENIGN_STOP_PLUS_USER_DIRECTED_NEW_SESSION_MESSAGE
    hook_result = run_hook_with_message(message)

    assert hook_result.returncode == 0
    assert hook_result.stdout == ""
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def test_clean_message_passes_through_with_no_output():
    """A message with no context, token or session wording gives exit 0 and empty stdout."""
    hook_result = run_hook_with_message(CLEAN_MESSAGE)

    assert hook_result.returncode == 0
    assert hook_result.stdout == ""
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def test_empty_message_passes_through_with_no_output():
    """An empty assistant message gives exit 0 and empty stdout."""
    hook_result = run_hook_with_message(EMPTY_MESSAGE)

    assert hook_result.returncode == 0
    assert hook_result.stdout == ""
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def test_stop_hook_active_short_circuits_with_no_output():
    """With ``stop_hook_active`` set, a message that otherwise blocks gives exit 0 and empty stdout."""
    payload = {
        "last_assistant_message": NEW_SESSION_PROPOSAL_MESSAGE,
        "stop_hook_active": True,
    }
    hook_result = run_hook_with_payload(payload)

    assert hook_result.returncode == 0
    assert hook_result.stdout == ""
|
package/hooks/hooks.json
CHANGED
|
@@ -214,6 +214,16 @@
|
|
|
214
214
|
"command": "python3 ${CLAUDE_PLUGIN_ROOT}/hooks/blocking/question_to_user_enforcer.py",
|
|
215
215
|
"timeout": 10
|
|
216
216
|
},
|
|
217
|
+
{
|
|
218
|
+
"type": "command",
|
|
219
|
+
"command": "python3 ${CLAUDE_PLUGIN_ROOT}/hooks/blocking/intent_only_ending_blocker.py",
|
|
220
|
+
"timeout": 10
|
|
221
|
+
},
|
|
222
|
+
{
|
|
223
|
+
"type": "command",
|
|
224
|
+
"command": "python3 ${CLAUDE_PLUGIN_ROOT}/hooks/blocking/session_handoff_blocker.py",
|
|
225
|
+
"timeout": 10
|
|
226
|
+
},
|
|
217
227
|
{
|
|
218
228
|
"type": "command",
|
|
219
229
|
"command": "python3 ${CLAUDE_PLUGIN_ROOT}/hooks/diagnostic/hook_log_stop_wrapper.py",
|
|
@@ -5,3 +5,7 @@ USER_FACING_TDD_NOTICE = "TDD gate held - writing the failing test first..."
|
|
|
5
5
|
USER_FACING_ASKUSERQUESTION_NOTICE = (
|
|
6
6
|
"Agent asked the user in prose - rerouting through AskUserQuestion..."
|
|
7
7
|
)
|
|
8
|
+
USER_FACING_INTENT_ENDING_NOTICE = "Agent ended on a promise - doing the work now..."
|
|
9
|
+
USER_FACING_CONTEXT_REASSURANCE_NOTICE = (
|
|
10
|
+
"Agent moved to wrap up early - continuing the work..."
|
|
11
|
+
)
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""Shared compiled patterns for the session_handoff_blocker hook."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
# Case-insensitive, word-bounded match for first-person and inclusive subject
# phrases. Apostrophes are optional and may be straight (') or curly (’).
FIRST_PERSON_SUBJECT_PATTERN = re.compile(
    r"\b(?:"
    # "I ..." forms
    r"i['’]?m|i['’]?ll|i\s+will|i\s+am"
    r"|i\s+need\s+to|i\s+should|i\s+recommend|i\s+suggest"
    # "let ..." forms
    r"|let\s+me|let['’]?s"
    # "we ..." forms
    r"|we\s+(?:should|can|could)|we['’]?ll|we\s+are|we\s+will"
    r")\b",
    flags=re.IGNORECASE,
)
|
|
@@ -46,7 +46,7 @@ JS_EXTENSIONS = {".js", ".ts", ".tsx", ".jsx", ".mjs", ".cjs"}
|
|
|
46
46
|
JSON_EXTENSIONS = {".json"}
|
|
47
47
|
PLUGIN_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
48
48
|
HOOKS_DIR = os.path.join(PLUGIN_ROOT, "hooks") + os.sep
|
|
49
|
-
PYTHON_FORMAT_TIMEOUT_SECONDS =
|
|
49
|
+
PYTHON_FORMAT_TIMEOUT_SECONDS = 12
|
|
50
50
|
JS_FORMAT_TIMEOUT_SECONDS = 30
|
|
51
51
|
PRETTIER_CONFIG_NAMES = {
|
|
52
52
|
".prettierrc",
|
|
@@ -76,6 +76,20 @@ def has_prettier_config(file_path: str) -> bool:
|
|
|
76
76
|
return False
|
|
77
77
|
|
|
78
78
|
|
|
79
|
+
def budgeted_python_format_seconds() -> int:
    """Return the wall-clock budget, in seconds, for the two-subprocess happy path.

    The Python branch of ``main`` runs a fix phase and then a format phase, and
    every subprocess in either phase is capped at ``PYTHON_FORMAT_TIMEOUT_SECONDS``.
    In the common case each phase spends exactly one subprocess, so the budget is
    one timeout per phase.

    This is a budget for that assumed path, not a guaranteed upper bound. In the
    fix phase a missing executable (``FileNotFoundError``) moves on to the next
    candidate command, while a timeout ends the phase outright.
    NOTE(review): if the format phase retries its next candidate after a timeout,
    a degraded run can spend more than this budget -- confirm against that loop.

    Returns:
        The sum of the fix-phase and format-phase timeouts.
    """
    fix_phase_seconds = PYTHON_FORMAT_TIMEOUT_SECONDS
    format_phase_seconds = PYTHON_FORMAT_TIMEOUT_SECONDS
    return fix_phase_seconds + format_phase_seconds
|
|
91
|
+
|
|
92
|
+
|
|
79
93
|
def is_untracked_in_git(file_path: str) -> bool:
|
|
80
94
|
"""Check if file is untracked (brand new) by git."""
|
|
81
95
|
containing_directory = str(Path(file_path).parent)
|
|
@@ -115,6 +129,17 @@ def main() -> None:
|
|
|
115
129
|
suffix = Path(file_path).suffix.lower()
|
|
116
130
|
|
|
117
131
|
if suffix in PYTHON_EXTENSIONS:
|
|
132
|
+
for each_fix_command in [
|
|
133
|
+
["ruff", "check", "--fix", file_path],
|
|
134
|
+
[sys.executable, "-m", "ruff", "check", "--fix", file_path],
|
|
135
|
+
]:
|
|
136
|
+
try:
|
|
137
|
+
subprocess.run(each_fix_command, capture_output=True, text=True, timeout=PYTHON_FORMAT_TIMEOUT_SECONDS, check=False)
|
|
138
|
+
break
|
|
139
|
+
except FileNotFoundError:
|
|
140
|
+
continue
|
|
141
|
+
except subprocess.TimeoutExpired:
|
|
142
|
+
break
|
|
118
143
|
for each_formatter_command in [
|
|
119
144
|
["ruff", "format", file_path],
|
|
120
145
|
[sys.executable, "-m", "ruff", "format", file_path],
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""Tests for the auto_formatter hook.
|
|
2
|
+
|
|
3
|
+
Exercises the real hook against real ruff inside a real git repository. A
|
|
4
|
+
brand-new (untracked) Python file carrying an unused import is fixed in
|
|
5
|
+
place, while the same file arriving through the Edit tool is left untouched
|
|
6
|
+
so the fix stays scoped to newly created files.
|
|
7
|
+
|
|
8
|
+
The sandbox is rooted under the user's home directory via ``tempfile.mkdtemp``
|
|
9
|
+
rather than the OS temp directory, matching the sibling workflow-hook tests.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import functools
|
|
13
|
+
import importlib.util
|
|
14
|
+
import json
|
|
15
|
+
import os
|
|
16
|
+
import shutil
|
|
17
|
+
import stat
|
|
18
|
+
import subprocess
|
|
19
|
+
import sys
|
|
20
|
+
import tempfile
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import Generator
|
|
23
|
+
|
|
24
|
+
import pytest
|
|
25
|
+
|
|
26
|
+
HOOK_SCRIPT_PATH = os.path.join(os.path.dirname(__file__), "auto_formatter.py")
|
|
27
|
+
HOOKS_JSON_PATH = os.path.join(
|
|
28
|
+
os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "hooks", "hooks.json"
|
|
29
|
+
)
|
|
30
|
+
AUTO_FORMATTER_COMMAND_FRAGMENT = "workflow/auto_formatter.py"
|
|
31
|
+
UNUSED_IMPORT_SOURCE = "import os\n\n\nVALUE = 1\n"
|
|
32
|
+
HOOK_RUN_TIMEOUT_SECONDS = 60
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _strip_read_only_and_retry(removal_function, target_path, *_exc_info):
|
|
36
|
+
try:
|
|
37
|
+
os.chmod(target_path, stat.S_IWRITE)
|
|
38
|
+
removal_function(target_path)
|
|
39
|
+
except OSError:
|
|
40
|
+
pass
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _force_rmtree(target_path: str) -> None:
    """Recursively delete ``target_path`` on a best-effort basis.

    Per-entry failures go to ``_strip_read_only_and_retry``; an ``OSError`` that
    still escapes ``shutil.rmtree`` is ignored.
    """
    # shutil.rmtree takes its error hook as ``onexc`` from Python 3.12 onward and
    # as ``onerror`` on earlier versions.
    handler_keyword = "onexc" if sys.version_info >= (3, 12) else "onerror"
    try:
        shutil.rmtree(target_path, **{handler_keyword: _strip_read_only_and_retry})
    except OSError:
        pass
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@functools.lru_cache(maxsize=1)
|
|
56
|
+
def _get_sandbox_parent_directory() -> str:
|
|
57
|
+
return tempfile.mkdtemp(prefix="pytest_auto_formatter_", dir=str(Path.home()))
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@pytest.fixture(scope="session", autouse=True)
def _cleanup_sandbox_parent_directory() -> Generator[None, None, None]:
    """Session-wide teardown that removes the sandbox parent directory if one was created."""
    yield
    # The cache is populated only if some test asked for the sandbox; checking it
    # first avoids creating the directory just to delete it.
    sandbox_was_created = _get_sandbox_parent_directory.cache_info().currsize > 0
    if not sandbox_was_created:
        return
    _force_rmtree(_get_sandbox_parent_directory())
    _get_sandbox_parent_directory.cache_clear()
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@pytest.fixture
def git_repository() -> Generator[Path, None, None]:
    """Yield a freshly ``git init``-ed directory inside the sandbox, then delete it."""
    sandbox_root = _get_sandbox_parent_directory()
    repository_path = Path(tempfile.mkdtemp(dir=sandbox_root))
    subprocess.run(
        ["git", "init"],
        cwd=repository_path,
        capture_output=True,
        check=True,
    )
    yield repository_path
    _force_rmtree(str(repository_path))
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _run_hook(tool_name: str, file_path: Path) -> subprocess.CompletedProcess[str]:
    """Run the auto_formatter hook script with a tool-call payload on stdin.

    Args:
        tool_name: Value sent as ``tool_name`` (the tests use "Write" and "Edit").
        file_path: Path sent, as a string, under ``tool_input.file_path``.

    Returns:
        The completed process with captured text output. A non-zero exit does not
        raise; exceeding ``HOOK_RUN_TIMEOUT_SECONDS`` does.
    """
    payload = {"tool_name": tool_name, "tool_input": {"file_path": str(file_path)}}
    return subprocess.run(
        [sys.executable, HOOK_SCRIPT_PATH],
        input=json.dumps(payload),
        capture_output=True,
        text=True,
        timeout=HOOK_RUN_TIMEOUT_SECONDS,
        check=False,
    )
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class TestRuffFixOnNewFiles:
    """Checks which tool calls cause the hook to strip an unused import."""

    def should_remove_unused_import_from_new_untracked_python_file(
        self, git_repository: Path
    ) -> None:
        """A file reported via the Write tool loses its unused ``import os``."""
        target_file = git_repository / "brand_new.py"
        target_file.write_text(UNUSED_IMPORT_SOURCE, encoding="utf-8")

        hook_result = _run_hook("Write", target_file)

        assert hook_result.returncode == 0
        contents_after_hook = target_file.read_text(encoding="utf-8")
        assert "import os" not in contents_after_hook

    def should_leave_file_arriving_through_edit_untouched(self, git_repository: Path) -> None:
        """The same file reported via the Edit tool keeps its ``import os``."""
        target_file = git_repository / "edited.py"
        target_file.write_text(UNUSED_IMPORT_SOURCE, encoding="utf-8")

        hook_result = _run_hook("Edit", target_file)

        assert hook_result.returncode == 0
        contents_after_hook = target_file.read_text(encoding="utf-8")
        assert "import os" in contents_after_hook
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _load_auto_formatter_module() -> object:
    """Import ``auto_formatter.py`` from its file path and return the executed module."""
    spec = importlib.util.spec_from_file_location("auto_formatter", HOOK_SCRIPT_PATH)
    # Both the spec and its loader are needed to execute the module below.
    assert spec is not None
    assert spec.loader is not None
    loaded_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(loaded_module)
    return loaded_module
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _registered_auto_formatter_timeout() -> int:
    """Return the ``timeout`` registered for the auto_formatter hook in ``hooks.json``.

    Scans every event, matcher and hook entry, and returns the timeout of the first
    entry whose ``command`` contains ``AUTO_FORMATTER_COMMAND_FRAGMENT``.

    Raises:
        AssertionError: If no hook entry references the auto_formatter script.
    """
    with open(HOOKS_JSON_PATH, encoding="utf-8") as hooks_file:
        hooks_configuration = json.load(hooks_file)
    for each_event in hooks_configuration["hooks"].values():
        for each_matcher in each_event:
            for each_hook in each_matcher["hooks"]:
                # An entry with no "command" key must be skipped, not crash the
                # scan with KeyError before the auto_formatter entry is reached.
                if AUTO_FORMATTER_COMMAND_FRAGMENT in each_hook.get("command", ""):
                    return int(each_hook["timeout"])
    raise AssertionError("auto_formatter hook is not registered in hooks.json")
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
class TestPythonFormatTimeoutBudget:
    """Guards the relation between the hook's own budget and its registered timeout."""

    def should_keep_both_sequential_python_subprocesses_under_the_harness_budget(self) -> None:
        """The budgeted Python format time is strictly below the timeout in ``hooks.json``."""
        registered_timeout = _registered_auto_formatter_timeout()
        budgeted_total = _load_auto_formatter_module().budgeted_python_format_seconds()

        assert budgeted_total < registered_timeout
|
package/package.json
CHANGED
|
@@ -13,6 +13,7 @@ Proceed with edits, file modifications, or implementations only when the user ex
|
|
|
13
13
|
- If the user asks a question, answer the question. Do not also fix the thing they asked about.
|
|
14
14
|
- If the user describes a problem, investigate and recommend. Do not jump to implementation.
|
|
15
15
|
- If the user says "do it", "go ahead", "make the change", or similarly explicit language, proceed with action.
|
|
16
|
+
- Once the user has explicitly asked and you have what you need, act — do not re-open settled facts or decisions, and do not re-survey options you will not pursue. This rule governs the ambiguous case; the clear case belongs to `long-horizon-autonomy`.
|
|
16
17
|
- When in doubt, ask: "Would you like me to make this change, or just show you the approach?"
|
|
17
18
|
|
|
18
19
|
## Why
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# Long-Horizon Autonomy
|
|
2
|
+
|
|
3
|
+
Source: [Anthropic - Prompting Claude Fable 5](https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/prompting-claude-fable-5)
|
|
4
|
+
|
|
5
|
+
**When this applies:** Long, multi-step, or unwatched runs — autonomous pipelines, background jobs, convergence loops, and any task that spans many tool calls or a long stretch where the user is away. The behaviors below carry a run to completion rather than letting it stall, drift, or stop early.
|
|
6
|
+
|
|
7
|
+
## Act on what you have
|
|
8
|
+
|
|
9
|
+
When you have enough to act, act. Do not re-derive facts already settled in the conversation, re-open a decision the user already made, or use user-facing text to narrate options you will not pursue. When you weigh a choice, give a recommendation, not a full survey. This shapes user-facing messages, not your private reasoning.
|
|
10
|
+
|
|
11
|
+
This is the autonomous-run partner to `conservative-action`: that rule covers the ambiguous case (research and recommend first); this one covers the clear case (once the evidence is in hand, act).
|
|
12
|
+
|
|
13
|
+
## Do not end a turn on a promise
|
|
14
|
+
|
|
15
|
+
Pause for the user only when the work truly needs them: a destructive or irreversible action, a real scope change, or input only they can give. When you hit one, ask through `AskUserQuestion` and end the turn. Do not end on a promise about work you have not done.
|
|
16
|
+
|
|
17
|
+
Before you end any turn, read your last paragraph. If it is a plan, an analysis, a list of next steps, or a statement of intent ("I'll run the tests", "next I'll wire it up"), do that work with tool calls before you stop. End the turn only when the task is done or you are blocked on input only the user can give.
|
|
18
|
+
|
|
19
|
+
In an autonomous pipeline the user cannot answer mid-task. For reversible actions that follow from the original request, act without asking; save any follow-up offers for after the task is done.
|
|
20
|
+
|
|
21
|
+
## Delegate and keep working
|
|
22
|
+
|
|
23
|
+
Hand independent subtasks to subagents and keep working while they run; let them run in the background rather than block until each one returns. Reuse a long-lived subagent across related subtasks so its context carries forward and saves repeated reads. Step in when a subagent drifts off track or is missing context.
|
|
24
|
+
|
|
25
|
+
## Verify your work at intervals
|
|
26
|
+
|
|
27
|
+
On a long build, set a checkpoint cadence and hold to it. At each interval, check the work so far against the task's stated goals with a fresh-context verifier subagent. A separate verifier in a clean context catches what self-review misses.
|
|
28
|
+
|
|
29
|
+
## Ground every progress claim
|
|
30
|
+
|
|
31
|
+
Before you report progress, check each claim against a tool result from this session. State only what the evidence backs; name anything unverified as unverified. If tests fail, say so with the output; if a step was skipped, say that.
|
|
32
|
+
|
|
33
|
+
## Re-ground the final message
|
|
34
|
+
|
|
35
|
+
Terse shorthand between tool calls is fine — that is you thinking. The final message is for a reader who saw none of it. After a long or unwatched run, write it as a fresh briefing: the outcome in one sentence, then the one or two things you need from the reader, each explained as if new. Drop the working vocabulary, arrow chains, and stacked-hyphen compounds; give each file, commit, or flag its own plain clause. When short and clear pull apart, choose clear.
|
|
36
|
+
|
|
37
|
+
## Keep going on context
|
|
38
|
+
|
|
39
|
+
A remaining-context or token count is not a reason to stop. Do not pause, summarize, or float a fresh session on account of context limits; keep working. When the user must see content word-for-word (a partial deliverable, a direct answer to a mid-run question), surface it through the channel the harness gives for that, not by ending the turn.
|
|
40
|
+
|
|
41
|
+
## Why
|
|
42
|
+
|
|
43
|
+
A capable model under-delivers on long runs for predictable reasons: it overplans when it could act, stops on a promise, blocks on subagents, skips its own verification, fabricates progress, buries the result in working shorthand, or quits early over a context count. Each section above removes one of those failure modes so the run finishes.
|
|
@@ -66,8 +66,7 @@ own. The workflow runs in the background and notifies this session on
|
|
|
66
66
|
completion. Watch live progress with `/workflows`.
|
|
67
67
|
|
|
68
68
|
The workflow returns
|
|
69
|
-
`{ converged, rounds, finalSha, blocker, standardsNote }`.
|
|
70
|
-
workflow spawns runs on Fable 5 (`model: 'fable'`).
|
|
69
|
+
`{ converged, rounds, finalSha, blocker, standardsNote }`.
|
|
71
70
|
|
|
72
71
|
## Budget-aware round boundaries
|
|
73
72
|
|
|
@@ -82,15 +81,64 @@ round records nothing resumable and replays dirty.
|
|
|
82
81
|
|
|
83
82
|
## Teardown (on workflow completion)
|
|
84
83
|
|
|
85
|
-
1. **When `converged` is true
|
|
84
|
+
1. **When `converged` is true — build and publish the closing report.**
|
|
85
|
+
Skip this entire step (report, gist, comment, Chrome open) when the workflow
|
|
86
|
+
returned a non-null `blocker`. Per-round live-dashboard refresh is out of scope
|
|
87
|
+
here; this step builds the one-shot closing report and the seam (marker comment +
|
|
88
|
+
gist URL) that a future live dashboard can reuse.
|
|
89
|
+
|
|
90
|
+
a. **Resolve the journal path.** Glob
|
|
91
|
+
`~/.claude/projects/**/workflows/wf_<runId>.json` (where `runId` is the run id
|
|
92
|
+
the `Workflow` result returned) and take the match.
|
|
93
|
+
|
|
94
|
+
b. **Build the report.**
|
|
95
|
+
```
|
|
96
|
+
python "<skill>/workflow/render_report.py" \
|
|
97
|
+
--journal "<journal>" \
|
|
98
|
+
--out "$CLAUDE_JOB_DIR/tmp/autoconverge-report-<prNumber>.html" \
|
|
99
|
+
--pr <owner>/<repo>#<n> \
|
|
100
|
+
--final-sha <finalSha> \
|
|
101
|
+
--rounds <rounds> \
|
|
102
|
+
--repo <worktree>
|
|
103
|
+
```
|
|
104
|
+
Capture the output path from stdout.
|
|
105
|
+
|
|
106
|
+
c. **Publish as a secret gist** by reusing `doc-gist` (do not reimplement gist
|
|
107
|
+
creation):
|
|
108
|
+
```
|
|
109
|
+
python "$HOME/.claude/skills/doc-gist/scripts/gist_upload.py" \
|
|
110
|
+
--input "<html path>" \
|
|
111
|
+
--no-open \
|
|
112
|
+
--description "autoconverge report PR #<n>"
|
|
113
|
+
```
|
|
114
|
+
Capture the htmlpreview URL from stdout. The gist is secret by default; pass
|
|
115
|
+
no public flag.
|
|
116
|
+
|
|
117
|
+
d. **Post one idempotent PR comment.** List the PR's issue comments; if one
|
|
118
|
+
carries the marker `<!-- autoconverge-report -->`, edit it in place, otherwise
|
|
119
|
+
create a new one. The body begins with `<!-- autoconverge-report -->`, then
|
|
120
|
+
the htmlpreview link, headline counts (findings by severity, rounds, tests
|
|
121
|
+
added), and the full finding list as `file:line — P# — title` grouped by
|
|
122
|
+
severity. Honor the gh-body-file rule: write a BOM-free temp file and pass
|
|
123
|
+
`--body-file` to `gh issue comment` (add `--edit-last` when updating in place), or use the
|
|
124
|
+
GitHub MCP `add_issue_comment` tool (body as a structured parameter, no
|
|
125
|
+
`--body` flag).
|
|
126
|
+
|
|
127
|
+
e. **Open the report in Chrome.**
|
|
128
|
+
```
|
|
129
|
+
Start-Process chrome -ArgumentList '--new-window', '<report path>'
|
|
130
|
+
```
|
|
131
|
+
Tolerate a missing Chrome without aborting the rest of teardown.
|
|
132
|
+
|
|
133
|
+
2. **When `converged` is true:** rewrite the PR description and clean the
|
|
86
134
|
working tree — see
|
|
87
135
|
[`bugteam/reference/teardown-publish-permissions.md` § Step 4 and § Step 4.5](../bugteam/reference/teardown-publish-permissions.md).
|
|
88
136
|
The workflow already marked the PR ready.
|
|
89
137
|
|
|
90
|
-
|
|
138
|
+
3. **Always revoke project permissions** (including on a blocker exit):
|
|
91
139
|
`python "$HOME/.claude/skills/bugteam/scripts/revoke_project_claude_permissions.py"`
|
|
92
140
|
|
|
93
|
-
|
|
141
|
+
4. **Print the final report:**
|
|
94
142
|
|
|
95
143
|
```
|
|
96
144
|
/autoconverge exit: <converged | blocked>
|
|
@@ -128,4 +176,6 @@ run ends short of ready. Hard-won failure lessons live in
|
|
|
128
176
|
|
|
129
177
|
- `SKILL.md` — this hub.
|
|
130
178
|
- `workflow/converge.mjs` — the convergence workflow script.
|
|
131
|
-
- `
|
|
179
|
+
- `workflow/render_report.py` — builds the closing convergence insights HTML report.
|
|
180
|
+
- `workflow/autoconverge_report_constants/` — named constants for the report builder.
|
|
181
|
+
- `reference/` — convergence definition, stop conditions, gotchas, closing report.
|