claude-dev-env 1.30.1 → 1.31.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/agents/clean-coder.md +275 -111
  2. package/agents/code-quality-agent.md +196 -209
  3. package/bin/install.mjs +81 -0
  4. package/bin/install.test.mjs +158 -0
  5. package/bin/install_mypy_ini.mjs +51 -0
  6. package/bin/install_mypy_ini.test.mjs +121 -0
  7. package/commands/hook-log-extract.md +70 -0
  8. package/commands/hook-log-init.md +76 -0
  9. package/hooks/blocking/code_rules_enforcer.py +5 -3
  10. package/hooks/blocking/destructive_command_blocker.py +187 -0
  11. package/hooks/blocking/question_to_user_enforcer.py +140 -0
  12. package/hooks/blocking/test_code_rules_enforcer_file_global_constants.py +39 -0
  13. package/hooks/blocking/test_destructive_command_blocker.py +397 -0
  14. package/hooks/blocking/test_question_to_user_enforcer.py +163 -0
  15. package/hooks/config/hook_log_extractor_constants.py +221 -0
  16. package/hooks/config/messages.py +3 -0
  17. package/hooks/config/test_hook_log_extractor_constants.py +96 -0
  18. package/hooks/config/test_messages.py +5 -0
  19. package/hooks/diagnostic/hook_log_extractor.py +907 -0
  20. package/hooks/diagnostic/hook_log_init.py +202 -0
  21. package/hooks/diagnostic/hook_log_stop_wrapper.py +84 -0
  22. package/hooks/diagnostic/migrations/2026-04-25-drop-themes-hook-events.sql +3 -0
  23. package/hooks/diagnostic/migrations/README.md +77 -0
  24. package/hooks/diagnostic/queries/block_details_for_hook.sql +26 -0
  25. package/hooks/diagnostic/queries/blocks_by_category.sql +10 -0
  26. package/hooks/diagnostic/queries/blocks_by_tool.sql +9 -0
  27. package/hooks/diagnostic/queries/blocks_last_7_days.sql +11 -0
  28. package/hooks/diagnostic/queries/top_blockers_last_24_hours.sql +12 -0
  29. package/hooks/diagnostic/queries/top_blockers_overall.sql +12 -0
  30. package/hooks/diagnostic/requirements-hook-logs-dev.txt +2 -0
  31. package/hooks/diagnostic/requirements-hook-logs.txt +1 -0
  32. package/hooks/diagnostic/schema.sql +51 -0
  33. package/hooks/diagnostic/test_hook_log_extractor.py +1531 -0
  34. package/hooks/diagnostic/test_hook_log_init.py +227 -0
  35. package/hooks/diagnostic/test_hook_log_stop_wrapper.py +98 -0
  36. package/hooks/hooks.json +10 -0
  37. package/package.json +1 -1
  38. package/rules/ask-user-question-required.md +44 -0
  39. package/scripts/config/test_spec_implementer_prompt.py +0 -4
  40. package/scripts/test_groq_bugteam_spec.py +0 -8
@@ -0,0 +1,1531 @@
1
+ """Failing-first tests for hook_log_extractor.
2
+
3
+ Covers category derivation (15 known + uncategorized fallback), outcome
4
+ mapping (5 attachment types), excerpt truncation, offset advance,
5
+ idempotence via ON CONFLICT, offline graceful fallback, and batched
6
+ INSERT shape. psycopg is mocked at the connect boundary.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import contextlib
12
+ import errno
13
+ import json
14
+ import sys
15
+ import threading
16
+ import time
17
+ from pathlib import Path
18
+ from typing import Any
19
+ from unittest.mock import MagicMock, patch
20
+
21
+ import pytest
22
+
23
+ _HOOKS_ROOT = Path(__file__).resolve().parent.parent
24
+ if str(_HOOKS_ROOT) not in sys.path:
25
+ sys.path.insert(0, str(_HOOKS_ROOT))
26
+
27
+ from diagnostic import hook_log_extractor
28
+ from config.hook_log_extractor_constants import (
29
+ COMMAND_EXCERPT_MAX_CHARACTERS,
30
+ EXIT_CODE_UNKNOWN_QUERY,
31
+ HOOK_CATEGORY_UNCATEGORIZED,
32
+ KNOWN_HOOK_CATEGORIES,
33
+ NEON_DATABASE_URL_ENVIRONMENT_VARIABLE,
34
+ OUTCOME_ADDED_CONTEXT,
35
+ OUTCOME_BLOCKED,
36
+ OUTCOME_NON_BLOCKING_ERROR,
37
+ OUTCOME_SUCCESS,
38
+ OUTCOME_SYSTEM_MESSAGE,
39
+ STDERR_EXCERPT_MAX_CHARACTERS,
40
+ STDOUT_EXCERPT_MAX_CHARACTERS,
41
+ )
42
+
43
+
44
+ def _make_success_line(
45
+ session_id: str = "session-alpha",
46
+ hook_name: str = "PreToolUse:Bash",
47
+ hook_event: str = "PreToolUse",
48
+ tool_use_id: str = "toolu_001",
49
+ command: str = "python C:/Users/jon/.claude/hooks/blocking/destructive_command_blocker.py",
50
+ stdout: str = "ok\n",
51
+ stderr: str = "",
52
+ exit_code: int = 0,
53
+ duration_ms: int = 42,
54
+ timestamp: str = "2026-04-24T13:32:07.978Z",
55
+ cwd: str = "Y:\\Projects\\repo",
56
+ git_branch: str = "main",
57
+ ) -> str:
58
+ record = {
59
+ "type": "attachment",
60
+ "attachment": {
61
+ "type": "hook_success",
62
+ "hookName": hook_name,
63
+ "hookEvent": hook_event,
64
+ "toolUseID": tool_use_id,
65
+ "command": command,
66
+ "stdout": stdout,
67
+ "stderr": stderr,
68
+ "exitCode": exit_code,
69
+ "durationMs": duration_ms,
70
+ },
71
+ "timestamp": timestamp,
72
+ "sessionId": session_id,
73
+ "cwd": cwd,
74
+ "gitBranch": git_branch,
75
+ }
76
+ return json.dumps(record)
77
+
78
+
79
+ def _make_blocking_line(
80
+ session_id: str = "session-alpha",
81
+ hook_name: str = "PreToolUse:Bash",
82
+ hook_event: str = "PreToolUse",
83
+ tool_use_id: str = "toolu_002",
84
+ blocking_message: str = "blocked for reason",
85
+ command: str = "python C:/Users/jon/.claude/hooks/blocking/content_search_to_zoekt_redirector.py",
86
+ timestamp: str = "2026-04-24T13:32:54.293Z",
87
+ cwd: str = "Y:\\Projects\\repo",
88
+ git_branch: str = "main",
89
+ ) -> str:
90
+ record = {
91
+ "type": "attachment",
92
+ "attachment": {
93
+ "type": "hook_blocking_error",
94
+ "hookName": hook_name,
95
+ "hookEvent": hook_event,
96
+ "toolUseID": tool_use_id,
97
+ "blockingError": {
98
+ "blockingError": blocking_message,
99
+ "command": command,
100
+ },
101
+ },
102
+ "timestamp": timestamp,
103
+ "sessionId": session_id,
104
+ "cwd": cwd,
105
+ "gitBranch": git_branch,
106
+ }
107
+ return json.dumps(record)
108
+
109
+
110
+ def _make_system_message_line(
111
+ session_id: str = "session-alpha",
112
+ hook_name: str = "PreToolUse:Bash",
113
+ hook_event: str = "PreToolUse",
114
+ tool_use_id: str = "toolu_003",
115
+ content: str = "[destructive-gate] blocked",
116
+ timestamp: str = "2026-04-24T13:32:54.293Z",
117
+ cwd: str = "Y:\\Projects\\repo",
118
+ git_branch: str = "main",
119
+ ) -> str:
120
+ record = {
121
+ "type": "attachment",
122
+ "attachment": {
123
+ "type": "hook_system_message",
124
+ "hookName": hook_name,
125
+ "hookEvent": hook_event,
126
+ "toolUseID": tool_use_id,
127
+ "content": content,
128
+ },
129
+ "timestamp": timestamp,
130
+ "sessionId": session_id,
131
+ "cwd": cwd,
132
+ "gitBranch": git_branch,
133
+ }
134
+ return json.dumps(record)
135
+
136
+
137
+ def _make_additional_context_line(
138
+ session_id: str = "session-alpha",
139
+ hook_name: str = "PreToolUse:Bash",
140
+ hook_event: str = "PreToolUse",
141
+ tool_use_id: str = "toolu_004",
142
+ content: list[str] | None = None,
143
+ timestamp: str = "2026-04-24T13:32:54.293Z",
144
+ cwd: str = "Y:\\Projects\\repo",
145
+ git_branch: str = "main",
146
+ ) -> str:
147
+ record = {
148
+ "type": "attachment",
149
+ "attachment": {
150
+ "type": "hook_additional_context",
151
+ "hookName": hook_name,
152
+ "hookEvent": hook_event,
153
+ "toolUseID": tool_use_id,
154
+ "content": content or ["extra context"],
155
+ },
156
+ "timestamp": timestamp,
157
+ "sessionId": session_id,
158
+ "cwd": cwd,
159
+ "gitBranch": git_branch,
160
+ }
161
+ return json.dumps(record)
162
+
163
+
164
@pytest.mark.parametrize(
    "expected_category",
    sorted(KNOWN_HOOK_CATEGORIES),
)
def test_derive_category_accepts_each_known_category(expected_category: str) -> None:
    """Each known category, used as the script's parent directory, is returned."""
    hook_command = f"python C:/Users/jon/.claude/hooks/{expected_category}/some_hook.py"
    derived_category = hook_log_extractor.derive_category(hook_command)
    assert derived_category == expected_category
171
+
172
+
173
def test_derive_category_returns_uncategorized_for_unknown_parent() -> None:
    """An unrecognized parent directory maps to the uncategorized constant."""
    derived_category = hook_log_extractor.derive_category(
        "python C:/Users/jon/.claude/hooks/unheard_of_bucket/some_hook.py"
    )
    assert derived_category == HOOK_CATEGORY_UNCATEGORIZED
178
+
179
+
180
def test_derive_category_returns_uncategorized_for_empty_path() -> None:
    """``None`` and the empty string both map to the uncategorized constant."""
    for missing_path in (None, ""):
        assert (
            hook_log_extractor.derive_category(missing_path)
            == HOOK_CATEGORY_UNCATEGORIZED
        )
183
+
184
+
185
def test_derive_category_handles_windows_backslash_paths() -> None:
    """A backslash-separated Windows command still yields the parent directory."""
    derived_category = hook_log_extractor.derive_category(
        "python C:\\Users\\jon\\.claude\\hooks\\blocking\\destructive_command_blocker.py"
    )
    assert derived_category == "blocking"
188
+
189
+
190
def test_derive_category_strips_python_launcher_prefix() -> None:
    """A leading ``python3`` launcher token does not affect the derived category."""
    derived_category = hook_log_extractor.derive_category(
        "python3 /home/jon/.claude/hooks/session/code_rules_reminder.py"
    )
    assert derived_category == "session"
193
+
194
+
195
def test_derive_outcome_maps_hook_success() -> None:
    """``hook_success`` maps to ``OUTCOME_SUCCESS``."""
    derived_outcome = hook_log_extractor.derive_outcome("hook_success")
    assert derived_outcome == OUTCOME_SUCCESS
197
+
198
+
199
def test_derive_outcome_maps_hook_blocking_error() -> None:
    """``hook_blocking_error`` maps to ``OUTCOME_BLOCKED``."""
    derived_outcome = hook_log_extractor.derive_outcome("hook_blocking_error")
    assert derived_outcome == OUTCOME_BLOCKED
201
+
202
+
203
def test_derive_outcome_maps_hook_system_message() -> None:
    """``hook_system_message`` maps to ``OUTCOME_SYSTEM_MESSAGE``."""
    derived_outcome = hook_log_extractor.derive_outcome("hook_system_message")
    assert derived_outcome == OUTCOME_SYSTEM_MESSAGE
208
+
209
+
210
def test_derive_outcome_maps_hook_additional_context() -> None:
    """``hook_additional_context`` maps to ``OUTCOME_ADDED_CONTEXT``."""
    derived_outcome = hook_log_extractor.derive_outcome("hook_additional_context")
    assert derived_outcome == OUTCOME_ADDED_CONTEXT
215
+
216
+
217
def test_derive_outcome_maps_hook_non_blocking_error() -> None:
    """``hook_non_blocking_error`` maps to ``OUTCOME_NON_BLOCKING_ERROR``."""
    derived_outcome = hook_log_extractor.derive_outcome("hook_non_blocking_error")
    assert derived_outcome == OUTCOME_NON_BLOCKING_ERROR
222
+
223
+
224
def test_iter_attachment_records_skips_unknown_hook_attachment_type(
    tmp_path: Path,
) -> None:
    """An attachment whose type is not a recognized hook type is not yielded.

    The file holds one ``hook_success`` line followed by one line of an
    unrecognized ``hook_*`` type; only the first must come back.
    """
    transcript_path = tmp_path / "session-with-unknown-hook-type.jsonl"
    unrecognized_line = json.dumps(
        {
            "type": "attachment",
            "attachment": {
                "type": "hook_future_unknown_variant",
                "hookName": "PreToolUse:Bash",
                "hookEvent": "PreToolUse",
            },
            "timestamp": "2026-04-24T13:32:54.293Z",
            "sessionId": "session-alpha",
            "cwd": "Y:/Projects/repo",
            "gitBranch": "main",
        }
    )
    transcript_path.write_text(
        "\n".join([_make_success_line(), unrecognized_line]) + "\n",
        encoding="utf-8",
    )

    yielded_entries = [
        *hook_log_extractor.iter_attachment_records_from_file(
            str(transcript_path),
            start_offset=0,
        )
    ]

    assert len(yielded_entries) == 1
    only_record, _line_number, _offset = yielded_entries[0]
    assert only_record["attachment"]["type"] == "hook_success"
253
+
254
+
255
def test_derive_outcome_raises_on_unknown_type() -> None:
    """An attachment type with no outcome mapping raises ``KeyError``."""
    unmapped_attachment_type = "hook_something_else"
    with pytest.raises(KeyError):
        hook_log_extractor.derive_outcome(unmapped_attachment_type)
258
+
259
+
260
def test_extract_script_path_from_success_record() -> None:
    """The script path is the success attachment's command minus the launcher."""
    attachment = json.loads(
        _make_success_line(
            command="python C:/Users/jon/.claude/hooks/blocking/foo.py",
        )
    )["attachment"]
    extracted_path = hook_log_extractor.extract_script_path(attachment)
    assert extracted_path == "C:/Users/jon/.claude/hooks/blocking/foo.py"
269
+
270
+
271
def test_extract_script_path_from_blocking_record() -> None:
    """The script path is found in a blocking attachment's nested command."""
    attachment = json.loads(
        _make_blocking_line(
            command="python3 /home/jon/.claude/hooks/blocking/bar.py",
        )
    )["attachment"]
    extracted_path = hook_log_extractor.extract_script_path(attachment)
    assert extracted_path == "/home/jon/.claude/hooks/blocking/bar.py"
280
+
281
+
282
def test_extract_script_path_returns_none_for_system_message() -> None:
    """A system-message attachment has no command, so no script path is returned."""
    attachment = json.loads(_make_system_message_line())["attachment"]
    extracted_path = hook_log_extractor.extract_script_path(attachment)
    assert extracted_path is None
286
+
287
+
288
def test_excerpt_truncation_respects_command_limit() -> None:
    """A command 50 characters over the limit is cut to exactly the limit."""
    oversized_command = "x" * (COMMAND_EXCERPT_MAX_CHARACTERS + 50)
    excerpt = hook_log_extractor.truncate_command_excerpt(oversized_command)
    assert len(excerpt) == COMMAND_EXCERPT_MAX_CHARACTERS
292
+
293
+
294
def test_excerpt_truncation_preserves_short_command() -> None:
    """The short command ``python foo.py`` is returned unchanged."""
    brief_command = "python foo.py"
    excerpt = hook_log_extractor.truncate_command_excerpt(brief_command)
    assert excerpt == brief_command
297
+
298
+
299
def test_excerpt_truncation_handles_none_command() -> None:
    """``None`` passes through command truncation as ``None``."""
    excerpt = hook_log_extractor.truncate_command_excerpt(None)
    assert excerpt is None
301
+
302
+
303
def test_excerpt_truncation_respects_stdout_limit() -> None:
    """Stdout 100 characters over the limit is cut to exactly the limit."""
    oversized_stdout = "y" * (STDOUT_EXCERPT_MAX_CHARACTERS + 100)
    excerpt = hook_log_extractor.truncate_stdout_excerpt(oversized_stdout)
    assert len(excerpt) == STDOUT_EXCERPT_MAX_CHARACTERS
307
+
308
+
309
def test_excerpt_truncation_respects_stderr_limit() -> None:
    """Stderr 100 characters over the limit is cut to exactly the limit."""
    oversized_stderr = "z" * (STDERR_EXCERPT_MAX_CHARACTERS + 100)
    excerpt = hook_log_extractor.truncate_stderr_excerpt(oversized_stderr)
    assert len(excerpt) == STDERR_EXCERPT_MAX_CHARACTERS
313
+
314
+
315
def test_build_row_from_success_attachment() -> None:
    """A default success record yields a row with the expected column values."""
    row = hook_log_extractor.build_row_from_attachment(
        parsed_record=json.loads(_make_success_line()),
        source_jsonl_path="C:/fake/path.jsonl",
        source_line_number=1,
    )

    expected_value_by_column = {
        "session_id": "session-alpha",
        "hook_event": "PreToolUse",
        "hook_name": "PreToolUse:Bash",
        "tool_name": "Bash",
        "tool_use_id": "toolu_001",
        "outcome": OUTCOME_SUCCESS,
        "exit_code": 0,
        "duration_ms": 42,
        "hook_category": "blocking",
        "source_jsonl_path": "C:/fake/path.jsonl",
        "source_line_number": 1,
    }
    for column_name, expected_value in expected_value_by_column.items():
        assert row[column_name] == expected_value
334
+
335
+
336
def test_build_row_from_blocking_attachment_has_no_exit_code_or_duration() -> None:
    """A blocking record yields a blocked row with null exit code and duration.

    The blocking message must also appear in the row's ``stderr_excerpt``.
    """
    row = hook_log_extractor.build_row_from_attachment(
        parsed_record=json.loads(_make_blocking_line()),
        source_jsonl_path="C:/fake/path.jsonl",
        source_line_number=2,
    )

    assert row["outcome"] == OUTCOME_BLOCKED
    assert row["exit_code"] is None
    assert row["duration_ms"] is None
    stderr_excerpt = row["stderr_excerpt"]
    assert stderr_excerpt is not None
    assert "blocked for reason" in stderr_excerpt
    assert row["hook_category"] == "blocking"
352
+
353
+
354
def test_build_row_from_system_message_uses_content_as_stdout_excerpt() -> None:
    """A system-message record's ``content`` becomes the row's ``stdout_excerpt``."""
    message_text = "[gate] blocked Bash(grep)"
    row = hook_log_extractor.build_row_from_attachment(
        parsed_record=json.loads(_make_system_message_line(content=message_text)),
        source_jsonl_path="C:/fake/path.jsonl",
        source_line_number=3,
    )

    assert row["outcome"] == OUTCOME_SYSTEM_MESSAGE
    assert row["stdout_excerpt"] == message_text
364
+
365
+
366
def test_build_row_from_additional_context_joins_list_content() -> None:
    """Every entry of a list-valued ``content`` appears in ``stdout_excerpt``."""
    context_notes = ["first note", "second note"]
    row = hook_log_extractor.build_row_from_attachment(
        parsed_record=json.loads(_make_additional_context_line(content=context_notes)),
        source_jsonl_path="C:/fake/path.jsonl",
        source_line_number=4,
    )

    assert row["outcome"] == OUTCOME_ADDED_CONTEXT
    stdout_excerpt = row["stdout_excerpt"]
    assert stdout_excerpt is not None
    for each_note in context_notes:
        assert each_note in stdout_excerpt
378
+
379
+
380
def test_iter_attachment_records_skips_non_attachment_rows(tmp_path: Path) -> None:
    """Only ``attachment`` rows are yielded, and line numbers count skipped rows.

    The success record sits on the second line of the file, after a
    ``user`` row, and must be reported with line number 2.
    """
    transcript_path = tmp_path / "session.jsonl"
    transcript_lines = (
        json.dumps({"type": "user", "content": "hi"}),
        _make_success_line(),
        json.dumps({"type": "assistant", "content": "hello"}),
        _make_blocking_line(),
    )
    transcript_path.write_text("\n".join(transcript_lines) + "\n", encoding="utf-8")

    yielded_entries = [
        *hook_log_extractor.iter_attachment_records_from_file(
            str(transcript_path), start_offset=0
        )
    ]

    assert len(yielded_entries) == 2
    leading_record, leading_line_number, _leading_offset = yielded_entries[0]
    assert leading_record["attachment"]["type"] == "hook_success"
    assert leading_line_number == 2
400
+
401
+
402
def test_iter_attachment_records_resumes_from_offset(tmp_path: Path) -> None:
    """Starting at the byte length of line one yields only the second record."""
    transcript_path = tmp_path / "session.jsonl"
    leading_entry = _make_success_line(tool_use_id="toolu_a") + "\n"
    trailing_entry = _make_success_line(tool_use_id="toolu_b") + "\n"
    transcript_path.write_text(leading_entry + trailing_entry, encoding="utf-8")
    # The offset is measured in encoded bytes, not characters.
    resume_offset = len(leading_entry.encode("utf-8"))

    yielded_entries = [
        *hook_log_extractor.iter_attachment_records_from_file(
            str(transcript_path),
            start_offset=resume_offset,
        )
    ]

    assert len(yielded_entries) == 1
    resumed_record = yielded_entries[0][0]
    assert resumed_record["attachment"]["toolUseID"] == "toolu_b"
418
+
419
+
420
def test_iter_attachment_records_ignores_malformed_json(tmp_path: Path) -> None:
    """A line that is not valid JSON is skipped without stopping iteration."""
    transcript_path = tmp_path / "session.jsonl"
    transcript_path.write_text(
        "\n".join(("{this is not json", _make_success_line())) + "\n",
        encoding="utf-8",
    )

    yielded_entries = [
        *hook_log_extractor.iter_attachment_records_from_file(
            str(transcript_path), start_offset=0
        )
    ]

    assert len(yielded_entries) == 1
435
+
436
+
437
def test_load_offsets_returns_empty_when_file_missing(tmp_path: Path) -> None:
    """Loading offsets from a path that does not exist returns an empty dict."""
    absent_state_path = str(tmp_path / "does_not_exist.json")
    loaded_offsets = hook_log_extractor.load_offsets(absent_state_path)
    assert loaded_offsets == {}
440
+
441
+
442
def test_save_and_load_offsets_round_trips(tmp_path: Path) -> None:
    """Offsets saved to a state file load back unchanged.

    The state file lives under a ``nested`` directory that the test never
    creates itself.
    """
    state_path = str(tmp_path / "nested" / "state.json")
    saved_offset_by_path = {
        "C:/foo.jsonl": {"byte_offset": 100, "line_number": 3},
        "C:/bar.jsonl": {"byte_offset": 250, "line_number": 8},
    }

    hook_log_extractor.save_offsets(state_path, saved_offset_by_path)

    assert hook_log_extractor.load_offsets(state_path) == saved_offset_by_path
451
+
452
+
453
def test_insert_rows_batches_uses_execute_values_or_executemany() -> None:
    """``insert_rows_batch`` must call the cursor's ``executemany`` or ``execute``.

    Three rows that differ only in ``source_line_number`` are passed in.
    """
    cursor_double = MagicMock()
    connection_double = MagicMock()
    connection_double.cursor.return_value.__enter__.return_value = cursor_double

    shared_columns = {
        "event_timestamp": "2026-04-24T13:32:07.978Z",
        "session_id": "s1",
        "cwd": "c",
        "git_branch": "b",
        "hook_event": "PreToolUse",
        "hook_name": "PreToolUse:Bash",
        "hook_category": "blocking",
        "script_path": "s",
        "tool_name": "Bash",
        "tool_use_id": "t",
        "outcome": OUTCOME_SUCCESS,
        "exit_code": 0,
        "duration_ms": 1,
        "command_excerpt": "cmd",
        "stdout_excerpt": "out",
        "stderr_excerpt": "",
        "source_jsonl_path": "/p.jsonl",
    }
    # source_line_number is appended last to keep the original key order.
    rows_to_insert = [
        {**shared_columns, "source_line_number": line_number}
        for line_number in range(1, 4)
    ]

    hook_log_extractor.insert_rows_batch(connection_double, rows_to_insert)

    assert cursor_double.executemany.called or cursor_double.execute.called
485
+
486
+
487
def test_run_full_extraction_advances_offset(tmp_path: Path) -> None:
    """A successful extraction stores a non-zero offset for the processed file."""
    transcript_path = tmp_path / "session.jsonl"
    transcript_path.write_text(_make_success_line() + "\n", encoding="utf-8")
    state_path = str(tmp_path / "offsets.json")

    connection_double = MagicMock()
    connection_double.cursor.return_value.__enter__.return_value = MagicMock()

    with patch.object(
        hook_log_extractor, "connect_to_neon", return_value=connection_double
    ):
        exit_code = hook_log_extractor.run_full_extraction(
            transcripts_root=str(tmp_path),
            state_file_path=state_path,
            full_rebuild=False,
        )

    assert exit_code == 0
    offsets_after_run = hook_log_extractor.load_offsets(state_path)
    transcript_key = str(transcript_path)
    assert transcript_key in offsets_after_run
    recorded_entry = offsets_after_run[transcript_key]
    assert recorded_entry["byte_offset"] > 0
    assert recorded_entry["line_number"] >= 1
510
+
511
+
512
def test_run_full_extraction_idempotent_when_offset_at_end(tmp_path: Path) -> None:
    """Re-running with the saved offset already at end-of-file inserts nothing.

    ``test_insert_rows_batches_uses_execute_values_or_executemany`` accepts
    either ``executemany`` or ``execute`` as the insert path, so checking
    ``executemany`` alone could pass vacuously. This test therefore also
    requires that no ``execute`` call carried an INSERT statement.
    """
    jsonl_file = tmp_path / "session.jsonl"
    success_line = _make_success_line() + "\n"
    jsonl_file.write_text(success_line, encoding="utf-8")

    # Seed the state file so the stored offset equals the file's byte length.
    state_file = tmp_path / "offsets.json"
    hook_log_extractor.save_offsets(
        str(state_file),
        {
            str(jsonl_file): {
                "byte_offset": len(success_line.encode("utf-8")),
                "line_number": 1,
            },
        },
    )

    fake_cursor = MagicMock()
    fake_connection = MagicMock()
    fake_connection.cursor.return_value.__enter__.return_value = fake_cursor

    with patch.object(
        hook_log_extractor, "connect_to_neon", return_value=fake_connection
    ):
        exit_code = hook_log_extractor.run_full_extraction(
            transcripts_root=str(tmp_path),
            state_file_path=str(state_file),
            full_rebuild=False,
        )

    assert exit_code == 0
    assert not fake_cursor.executemany.called
    # Statements are stringified so non-str statement objects are still inspected;
    # calls made without positional arguments are ignored.
    all_executed_statements = [
        str(each_call.args[0])
        for each_call in fake_cursor.execute.call_args_list
        if each_call.args
    ]
    assert not any(
        "INSERT" in each_statement.upper()
        for each_statement in all_executed_statements
    )
543
+
544
+
545
def test_run_full_rebuild_clears_offsets_and_truncates(tmp_path: Path) -> None:
    """A full rebuild issues a TRUNCATE and re-records the file's offset.

    The state file is seeded with an offset (99999) far beyond the
    one-line transcript before the rebuild runs.
    """
    transcript_path = tmp_path / "session.jsonl"
    transcript_path.write_text(_make_success_line() + "\n", encoding="utf-8")
    transcript_key = str(transcript_path)

    state_path = str(tmp_path / "offsets.json")
    hook_log_extractor.save_offsets(
        state_path,
        {transcript_key: {"byte_offset": 99999, "line_number": 100}},
    )

    cursor_double = MagicMock()
    connection_double = MagicMock()
    connection_double.cursor.return_value.__enter__.return_value = cursor_double

    with patch.object(
        hook_log_extractor, "connect_to_neon", return_value=connection_double
    ):
        exit_code = hook_log_extractor.run_full_extraction(
            transcripts_root=str(tmp_path),
            state_file_path=state_path,
            full_rebuild=True,
        )

    assert exit_code == 0
    executed_statements = [
        recorded_call.args[0] for recorded_call in cursor_double.execute.call_args_list
    ]
    truncate_was_issued = any(
        "TRUNCATE" in statement.upper() for statement in executed_statements
    )
    assert truncate_was_issued
    entry_after_rebuild = hook_log_extractor.load_offsets(state_path).get(
        transcript_key, {}
    )
    assert entry_after_rebuild.get("byte_offset", 0) > 0
    assert entry_after_rebuild.get("line_number", 0) >= 1
580
+
581
+
582
def test_offline_fallback_writes_one_log_line_when_connect_fails(
    tmp_path: Path,
) -> None:
    """A connect failure exits 0 and writes exactly one warning-log line."""
    transcript_path = tmp_path / "session.jsonl"
    transcript_path.write_text(_make_success_line() + "\n", encoding="utf-8")
    state_path = str(tmp_path / "offsets.json")
    warning_log_path = tmp_path / "hook-extractor.log"

    class _FakeOperationalError(Exception):
        pass

    def _fail_to_connect(*_args: Any, **_kwargs: Any) -> None:
        raise _FakeOperationalError("boom")

    connect_patch = patch.object(
        hook_log_extractor, "connect_to_neon", side_effect=_fail_to_connect
    )
    # Force the fake exception to be classified as an operational error.
    classifier_patch = patch.object(
        hook_log_extractor, "is_operational_error", return_value=True
    )
    warning_log_patch = patch.object(
        hook_log_extractor, "OFFLINE_WARNING_LOG", str(warning_log_path)
    )
    with connect_patch, classifier_patch, warning_log_patch:
        exit_code = hook_log_extractor.run_full_extraction(
            transcripts_root=str(tmp_path),
            state_file_path=state_path,
            full_rebuild=False,
        )

    assert exit_code == 0
    logged_lines = warning_log_path.read_text(encoding="utf-8").strip().splitlines()
    assert len(logged_lines) == 1
611
+
612
+
613
def test_tool_name_extracted_from_hook_name_prefix() -> None:
    """The tool name is the text after the colon; colon-less names give ``None``."""
    expected_tool_by_hook_name = {
        "PreToolUse:Bash": "Bash",
        "PreToolUse:Write|Edit": "Write|Edit",
    }
    for hook_name, expected_tool in expected_tool_by_hook_name.items():
        assert hook_log_extractor.extract_tool_name(hook_name) == expected_tool
    for hook_name_without_tool in ("SessionStart", "UserPromptSubmit"):
        assert hook_log_extractor.extract_tool_name(hook_name_without_tool) is None
618
+
619
+
620
def test_run_summary_prints_no_new_blocks_when_cursor_empty(
    capsys: pytest.CaptureFixture[str],
) -> None:
    """With no rows from the cursor, the summary prints the no-new-blocks message."""
    cursor_double = MagicMock()
    cursor_double.fetchall.return_value = []
    connection_double = MagicMock()
    connection_double.cursor.return_value.__enter__.return_value = cursor_double

    with patch.object(
        hook_log_extractor, "connect_to_neon", return_value=connection_double
    ):
        exit_code = hook_log_extractor.run_summary()

    printed_stdout = capsys.readouterr().out
    assert exit_code == 0
    assert "No new blocks since last run." in printed_stdout
636
+
637
+
638
def test_run_summary_prints_table_when_rows_returned(
    capsys: pytest.CaptureFixture[str],
) -> None:
    """With one row returned, the summary output contains its first three fields."""
    summary_row = (
        "content_search_to_zoekt_redirector.py",
        "blocking",
        7,
        "Bash(grep foo)",
    )
    cursor_double = MagicMock()
    cursor_double.fetchall.return_value = [summary_row]
    connection_double = MagicMock()
    connection_double.cursor.return_value.__enter__.return_value = cursor_double

    with patch.object(
        hook_log_extractor, "connect_to_neon", return_value=connection_double
    ):
        exit_code = hook_log_extractor.run_summary()

    printed_stdout = capsys.readouterr().out
    assert exit_code == 0
    for expected_fragment in ("content_search_to_zoekt_redirector.py", "blocking", "7"):
        assert expected_fragment in printed_stdout
658
+
659
+
660
def test_run_full_extraction_returns_zero_when_database_url_missing(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """C1: Stop-hook path must exit 0 when NEON URL env var is unset.

    The warning log must exist afterwards and name
    ``MissingNeonDatabaseUrlError``.
    """
    monkeypatch.delenv(NEON_DATABASE_URL_ENVIRONMENT_VARIABLE, raising=False)

    (tmp_path / "session.jsonl").write_text(
        _make_success_line() + "\n", encoding="utf-8"
    )
    state_path = str(tmp_path / "offsets.json")
    warning_log_path = tmp_path / "hook-extractor.log"

    with patch.object(
        hook_log_extractor, "OFFLINE_WARNING_LOG", str(warning_log_path)
    ):
        exit_code = hook_log_extractor.run_full_extraction(
            transcripts_root=str(tmp_path),
            state_file_path=state_path,
            full_rebuild=False,
        )

    assert exit_code == 0
    assert warning_log_path.exists()
    logged_text = warning_log_path.read_text(encoding="utf-8")
    assert "MissingNeonDatabaseUrlError" in logged_text
683
+
684
+
685
def test_run_full_extraction_returns_zero_when_psycopg_not_installed(
    tmp_path: Path,
) -> None:
    """C10: Stop-hook path must exit 0 when psycopg module is absent.

    Absence is simulated by patching the module's ``psycopg`` attribute to
    ``None``; the warning log must name ``MissingPsycopgDependencyError``.
    """
    (tmp_path / "session.jsonl").write_text(
        _make_success_line() + "\n", encoding="utf-8"
    )
    state_path = str(tmp_path / "offsets.json")
    warning_log_path = tmp_path / "hook-extractor.log"

    psycopg_patch = patch.object(hook_log_extractor, "psycopg", None)
    warning_log_patch = patch.object(
        hook_log_extractor, "OFFLINE_WARNING_LOG", str(warning_log_path)
    )
    with psycopg_patch, warning_log_patch:
        exit_code = hook_log_extractor.run_full_extraction(
            transcripts_root=str(tmp_path),
            state_file_path=state_path,
            full_rebuild=False,
        )

    assert exit_code == 0
    assert warning_log_path.exists()
    logged_text = warning_log_path.read_text(encoding="utf-8")
    assert "MissingPsycopgDependencyError" in logged_text
708
+
709
+
710
def test_offline_warning_line_does_not_leak_exception_message(
    tmp_path: Path,
) -> None:
    """C12: Offline warning log must record only timestamp + class name.

    The connect failure's message embeds a credential-bearing URL; neither
    ``secret`` nor ``postgres://`` may appear in the warning log.
    """
    warning_log_path = tmp_path / "hook-extractor.log"

    class _FakeOperationalError(Exception):
        pass

    def _fail_with_credentialed_url(*_args: Any, **_kwargs: Any) -> None:
        raise _FakeOperationalError(
            "connection failed to postgres://user:secret@host/db",
        )

    (tmp_path / "session.jsonl").write_text(
        _make_success_line() + "\n", encoding="utf-8"
    )
    state_path = str(tmp_path / "offsets.json")

    connect_patch = patch.object(
        hook_log_extractor,
        "connect_to_neon",
        side_effect=_fail_with_credentialed_url,
    )
    classifier_patch = patch.object(
        hook_log_extractor, "is_operational_error", return_value=True
    )
    warning_log_patch = patch.object(
        hook_log_extractor, "OFFLINE_WARNING_LOG", str(warning_log_path)
    )
    with connect_patch, classifier_patch, warning_log_patch:
        hook_log_extractor.run_full_extraction(
            transcripts_root=str(tmp_path),
            state_file_path=state_path,
            full_rebuild=False,
        )

    logged_text = warning_log_path.read_text(encoding="utf-8")
    for forbidden_fragment in ("secret", "postgres://"):
        assert forbidden_fragment not in logged_text
746
+
747
+
748
def test_offline_fallback_still_exits_zero_when_warning_log_write_raises(
    tmp_path: Path,
) -> None:
    """A failing warning-log write must not change the offline exit code.

    The Stop hook contract is that a connect failure logs a warning and
    exits with the documented offline status, so session shutdown never
    stalls. An unwritable warning log (read-only filesystem, missing
    parent, EACCES) must neither propagate nor flip that exit code.

    ``io.open`` is patched so that only the OFFLINE_WARNING_LOG path raises
    ``OSError``; every other path is delegated to the real ``io.open``.
    This is meant to exercise the ``try/except OSError`` guard inside
    ``_append_offline_warning_line`` itself, rather than replacing that
    function.
    """
    (tmp_path / "session.jsonl").write_text(
        _make_success_line() + "\n", encoding="utf-8"
    )
    state_path = str(tmp_path / "offsets.json")
    blocked_log_path = str(tmp_path / "hook-extractor.log")

    class _FakeOperationalError(Exception):
        pass

    def _fail_to_connect(*_args: Any, **_kwargs: Any) -> None:
        raise _FakeOperationalError("connect failed")

    # Capture the real opener before patching so other paths still work.
    genuine_io_open = hook_log_extractor.io.open

    def _open_unless_warning_log(
        path_argument: Any, *args: Any, **kwargs: Any
    ) -> Any:
        if str(path_argument) != blocked_log_path:
            return genuine_io_open(path_argument, *args, **kwargs)
        raise OSError(errno.EACCES, "permission denied")

    connect_patch = patch.object(
        hook_log_extractor,
        "connect_to_neon",
        side_effect=_fail_to_connect,
    )
    classifier_patch = patch.object(
        hook_log_extractor, "is_operational_error", return_value=True
    )
    warning_log_patch = patch.object(
        hook_log_extractor, "OFFLINE_WARNING_LOG", blocked_log_path
    )
    io_open_patch = patch.object(
        hook_log_extractor.io,
        "open",
        side_effect=_open_unless_warning_log,
    )
    with connect_patch, classifier_patch, warning_log_patch, io_open_patch:
        exit_code = hook_log_extractor.run_full_extraction(
            transcripts_root=str(tmp_path),
            state_file_path=state_path,
            full_rebuild=False,
        )

    assert exit_code == 0
806
+
807
+
808
def test_main_accepts_incremental_flag_as_noop(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """C8: ``--incremental`` must be recognized and route to default extraction.

    ``run_full_extraction`` is replaced by a recorder so the test can check
    that ``main`` invoked it with ``full_rebuild=False``.
    """
    recorded_call: dict[str, object] = {}

    def _record_run_full_extraction(
        transcripts_root: str,
        state_file_path: str,
        full_rebuild: bool,
    ) -> int:
        recorded_call.update(
            transcripts_root=transcripts_root,
            state_file_path=state_file_path,
            full_rebuild=full_rebuild,
        )
        return 0

    monkeypatch.setattr(sys, "argv", ["hook_log_extractor.py", "--incremental"])
    monkeypatch.setattr(
        hook_log_extractor, "run_full_extraction", _record_run_full_extraction
    )

    exit_code = hook_log_extractor.main()

    assert exit_code == 0
    assert recorded_call["full_rebuild"] is False
834
+
835
+
836
def test_run_query_returns_nonzero_for_unknown_query(
    capsys: pytest.CaptureFixture[str],
) -> None:
    """A name matching no query returns the unknown-query code and says so on stderr."""
    returned_exit_code = hook_log_extractor.run_query("definitely_not_a_query_name")
    stderr_text = capsys.readouterr().err

    assert returned_exit_code == EXIT_CODE_UNKNOWN_QUERY
    assert "Unknown query" in stderr_text
844
+
845
+
846
def test_run_query_returns_nonzero_for_invalid_query_name(
    capsys: pytest.CaptureFixture[str],
) -> None:
    """A path-traversal style name is rejected with ``Invalid query name`` on stderr."""
    returned_exit_code = hook_log_extractor.run_query("../../../etc/passwd")
    stderr_text = capsys.readouterr().err

    assert returned_exit_code == EXIT_CODE_UNKNOWN_QUERY
    assert "Invalid query name" in stderr_text
854
+
855
+
856
def test_run_query_rejects_uppercase_and_hyphen_names(
    capsys: pytest.CaptureFixture[str],
) -> None:
    """Upper-case and hyphenated names each produce one ``Invalid query name`` message."""
    uppercase_result = hook_log_extractor.run_query("UPPER_CASE")
    hyphenated_result = hook_log_extractor.run_query("has-hyphen")

    # Both calls write to stderr before it is read, so the count covers both.
    stderr_text = capsys.readouterr().err

    assert uppercase_result == EXIT_CODE_UNKNOWN_QUERY
    assert hyphenated_result == EXIT_CODE_UNKNOWN_QUERY
    assert stderr_text.count("Invalid query name") == 2
866
+
867
+
868
def test_save_offsets_cleans_up_temp_file_when_replace_fails(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """When ``os.replace`` raises, the error propagates and no ``tmp*`` file remains."""
    target_state_path = tmp_path / "state.json"
    offsets_payload = {"C:/foo.jsonl": {"byte_offset": 100, "line_number": 2}}

    def _exploding_replace(*_args: Any, **_kwargs: Any) -> None:
        raise OSError("replace failed")

    monkeypatch.setattr(hook_log_extractor.os, "replace", _exploding_replace)

    with pytest.raises(OSError):
        hook_log_extractor.save_offsets(str(target_state_path), offsets_payload)

    assert list(tmp_path.glob("tmp*")) == []
887
+
888
+
889
def test_save_offsets_cleans_up_temp_file_when_json_dump_fails(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """When ``json.dump`` raises, the error propagates and no ``tmp*`` file remains."""
    target_state_path = tmp_path / "state.json"
    offsets_payload = {"C:/foo.jsonl": {"byte_offset": 100, "line_number": 2}}

    def _exploding_dump(*_args: Any, **_kwargs: Any) -> None:
        raise ValueError("dump failed")

    monkeypatch.setattr(hook_log_extractor.json, "dump", _exploding_dump)

    with pytest.raises(ValueError):
        hook_log_extractor.save_offsets(str(target_state_path), offsets_payload)

    assert list(tmp_path.glob("tmp*")) == []
908
+
909
+
910
def test_load_offsets_propagates_os_error_other_than_missing_file(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A ``PermissionError`` raised by ``io.open`` must surface from ``load_offsets``."""
    existing_state_path = tmp_path / "state.json"
    existing_state_path.write_text("{}", encoding="utf-8")

    def _deny_every_open(*_args: Any, **_kwargs: Any) -> None:
        raise PermissionError("denied")

    # Install the failing opener only after the state file has been written.
    monkeypatch.setattr(hook_log_extractor.io, "open", _deny_every_open)

    with pytest.raises(PermissionError):
        hook_log_extractor.load_offsets(str(existing_state_path))
924
+
925
+
926
def test_load_offsets_returns_empty_for_malformed_json(tmp_path: Path) -> None:
    """A state file holding unparseable JSON loads as an empty mapping."""
    corrupt_state_path = tmp_path / "state.json"
    corrupt_state_path.write_text("not valid json {", encoding="utf-8")

    loaded_offsets = hook_log_extractor.load_offsets(str(corrupt_state_path))

    assert loaded_offsets == {}
931
+
932
+
933
def test_iter_attachment_records_accepts_start_line_number(tmp_path: Path) -> None:
    """``start_line_number`` shifts the line number reported for yielded records.

    Both passes resume at the byte offset just past the first line; only the
    starting line number differs (0 versus 10), and the reported line numbers
    must differ by exactly 10.
    """
    transcript_path = tmp_path / "session.jsonl"
    line_a = _make_success_line(tool_use_id="toolu_a")
    line_b = _make_success_line(tool_use_id="toolu_b")
    transcript_path.write_text(line_a + "\n" + line_b + "\n", encoding="utf-8")
    resume_offset = len((line_a + "\n").encode("utf-8"))

    def _records_seeded_at(line_number_seed: int) -> list[Any]:
        return list(
            hook_log_extractor.iter_attachment_records_from_file(
                str(transcript_path),
                start_offset=resume_offset,
                start_line_number=line_number_seed,
            ),
        )

    records_seeded_at_zero = _records_seeded_at(0)
    records_seeded_at_ten = _records_seeded_at(10)

    assert len(records_seeded_at_ten) == 1
    _, line_number_from_zero, _ = records_seeded_at_zero[0]
    _, line_number_from_ten, _ = records_seeded_at_ten[0]
    assert line_number_from_ten == line_number_from_zero + 10
959
+
960
+
961
def test_load_offsets_migrates_bare_int_legacy_entries_to_empty(
    tmp_path: Path,
) -> None:
    """Legacy ``path -> int`` state loads as empty and logs ``legacy_offsets_format``."""
    legacy_state_path = tmp_path / "state.json"
    legacy_state_path.write_text(
        json.dumps({"C:/legacy.jsonl": 1234}), encoding="utf-8"
    )
    redirected_warning_log = tmp_path / "hook-extractor.log"

    with patch.object(
        hook_log_extractor, "OFFLINE_WARNING_LOG", str(redirected_warning_log)
    ):
        result = hook_log_extractor.load_offsets(str(legacy_state_path))

    assert result == {}
    assert redirected_warning_log.exists()
    logged_text = redirected_warning_log.read_text(encoding="utf-8")
    assert "legacy_offsets_format" in logged_text
975
+
976
+
977
def test_load_offsets_ignores_legacy_warning_write_failure(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Failing to open the warning log must not stop legacy state loading as empty."""
    legacy_state_path = tmp_path / "state.json"
    legacy_state_path.write_text(
        json.dumps({"C:/legacy.jsonl": 1234}), encoding="utf-8"
    )
    unwritable_warning_log = tmp_path / "hook-extractor.log"

    genuine_io_open = hook_log_extractor.io.open

    def _open_unless_warning_log(
        opened_file_path: str,
        *args: Any,
        **kwargs: Any,
    ) -> Any:
        # Every other path is delegated to the real opener untouched.
        if opened_file_path != str(unwritable_warning_log):
            return genuine_io_open(opened_file_path, *args, **kwargs)
        raise OSError("read-only filesystem")

    monkeypatch.setattr(hook_log_extractor.io, "open", _open_unless_warning_log)

    with patch.object(
        hook_log_extractor, "OFFLINE_WARNING_LOG", str(unwritable_warning_log)
    ):
        result = hook_log_extractor.load_offsets(str(legacy_state_path))

    assert result == {}
1005
+
1006
+
1007
def test_save_and_load_offsets_round_trips_new_shape(tmp_path: Path) -> None:
    """Offsets saved under a not-yet-existing ``nested`` directory load back unchanged."""
    nested_state_path = str(tmp_path / "nested" / "state.json")
    offsets_to_persist = {
        "C:/foo.jsonl": {"byte_offset": 100, "line_number": 2},
        "C:/bar.jsonl": {"byte_offset": 250, "line_number": 5},
    }

    hook_log_extractor.save_offsets(nested_state_path, offsets_to_persist)

    assert hook_log_extractor.load_offsets(nested_state_path) == offsets_to_persist
1016
+
1017
+
1018
def test_run_full_extraction_skips_transcripts_deleted_mid_run(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Extraction still exits 0 when ``os.path.exists`` reports the transcript gone."""
    transcript_path = tmp_path / "session.jsonl"
    transcript_path.write_text(_make_success_line() + "\n", encoding="utf-8")
    offsets_path = tmp_path / "offsets.json"

    genuine_exists = hook_log_extractor.os.path.exists

    def _exists_except_transcript(each_path: str) -> bool:
        # Only the transcript is hidden; all other paths use the real check.
        if each_path != str(transcript_path):
            return genuine_exists(each_path)
        return False

    stub_connection = MagicMock()
    stub_connection.cursor.return_value.__enter__.return_value = MagicMock()

    neon_patch = patch.object(
        hook_log_extractor, "connect_to_neon", return_value=stub_connection
    )
    exists_patch = patch.object(
        hook_log_extractor.os.path,
        "exists",
        side_effect=_exists_except_transcript,
    )
    with neon_patch, exists_patch:
        returned_exit_code = hook_log_extractor.run_full_extraction(
            transcripts_root=str(tmp_path),
            state_file_path=str(offsets_path),
            full_rebuild=False,
        )

    assert returned_exit_code == 0
1053
+
1054
+
1055
def test_iter_attachment_records_exposes_final_line_number_after_trailing_non_attachment(
    tmp_path: Path,
) -> None:
    """``final_line_number`` counts the non-attachment lines after the last yield."""
    transcript_path = tmp_path / "session.jsonl"
    transcript_lines = [
        _make_success_line(tool_use_id="toolu_a"),
        json.dumps({"type": "user", "content": "noise"}),
        json.dumps({"type": "assistant", "content": "more noise"}),
    ]
    transcript_path.write_text("\n".join(transcript_lines) + "\n", encoding="utf-8")

    record_iterator = hook_log_extractor.iter_attachment_records_from_file(
        str(transcript_path),
        start_offset=0,
    )
    # Drain the iterator fully before reading its end-of-run counter.
    yielded_records = list(record_iterator)

    assert len(yielded_records) == 1
    assert record_iterator.final_line_number == 3
1075
+
1076
+
1077
def test_run_full_extraction_persists_lines_consumed_with_trailing_noise(
    tmp_path: Path,
) -> None:
    """The saved ``line_number`` includes a trailing non-attachment line (2 of 2)."""
    transcript_path = tmp_path / "session.jsonl"
    transcript_lines = [
        _make_success_line(tool_use_id="toolu_a"),
        json.dumps({"type": "user", "content": "trailing noise"}),
    ]
    transcript_path.write_text("\n".join(transcript_lines) + "\n", encoding="utf-8")
    offsets_path = tmp_path / "offsets.json"

    stub_cursor = MagicMock()
    stub_connection = MagicMock()
    stub_connection.cursor.return_value.__enter__.return_value = stub_cursor

    with patch.object(
        hook_log_extractor, "connect_to_neon", return_value=stub_connection
    ):
        hook_log_extractor.run_full_extraction(
            transcripts_root=str(tmp_path),
            state_file_path=str(offsets_path),
            full_rebuild=False,
        )

    persisted = hook_log_extractor.load_offsets(str(offsets_path))
    transcript_entry = persisted[str(transcript_path)]
    assert transcript_entry["line_number"] == 2
1105
+
1106
+
1107
def test_iter_attachment_records_final_line_number_when_no_yields(tmp_path: Path) -> None:
    """``final_line_number`` reports lines consumed even when nothing is yielded."""
    transcript_path = tmp_path / "session.jsonl"
    noise_lines = [
        json.dumps({"type": "user", "content": "a"}),
        json.dumps({"type": "assistant", "content": "b"}),
    ]
    transcript_path.write_text("\n".join(noise_lines) + "\n", encoding="utf-8")

    record_iterator = hook_log_extractor.iter_attachment_records_from_file(
        str(transcript_path),
        start_offset=0,
    )
    yielded_records = list(record_iterator)

    assert yielded_records == []
    assert record_iterator.final_line_number == 2
1124
+
1125
+
1126
def test_iter_attachment_records_exposes_final_byte_offset_after_drain(
    tmp_path: Path,
) -> None:
    """After draining, ``final_byte_offset`` equals the file's byte length."""
    transcript_path = tmp_path / "session.jsonl"
    noise_lines = [
        json.dumps({"type": "user", "content": "a"}),
        json.dumps({"type": "assistant", "content": "b"}),
    ]
    encoded_transcript = ("\n".join(noise_lines) + "\n").encode("utf-8")
    transcript_path.write_bytes(encoded_transcript)

    record_iterator = hook_log_extractor.iter_attachment_records_from_file(
        str(transcript_path),
        start_offset=0,
    )
    # Consume everything; the yielded records themselves are not inspected.
    list(record_iterator)

    assert record_iterator.final_byte_offset == len(encoded_transcript)
1145
+
1146
+
1147
def test_run_full_extraction_persists_offset_with_only_non_hook_attachments(
    tmp_path: Path,
) -> None:
    """A transcript yielding no hook records still gets its byte offset saved at EOF."""
    transcript_path = tmp_path / "session.jsonl"
    noise_lines = [
        json.dumps({"type": "user", "content": "noise"}),
        json.dumps({"type": "assistant", "content": "more noise"}),
    ]
    transcript_path.write_text("\n".join(noise_lines) + "\n", encoding="utf-8")
    offsets_path = tmp_path / "offsets.json"

    stub_cursor = MagicMock()
    stub_connection = MagicMock()
    stub_connection.cursor.return_value.__enter__.return_value = stub_cursor

    with patch.object(
        hook_log_extractor, "connect_to_neon", return_value=stub_connection
    ):
        hook_log_extractor.run_full_extraction(
            transcripts_root=str(tmp_path),
            state_file_path=str(offsets_path),
            full_rebuild=False,
        )

    persisted = hook_log_extractor.load_offsets(str(offsets_path))
    transcript_key = str(transcript_path)
    assert transcript_key in persisted
    assert persisted[transcript_key]["byte_offset"] == transcript_path.stat().st_size
1177
+
1178
+
1179
def test_run_full_extraction_persists_final_offset_not_file_size(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """The saved ``byte_offset`` must equal the iterator's ``final_byte_offset``.

    The iterator factory is wrapped so the iterator used by the run can be
    inspected afterwards. For a transcript read to completion, the iterator's
    final offset must match the known byte length written to disk, and the
    persisted offset must match that iterator-reported value. Together these
    show the save path takes its number from the iterator rather than from
    ``os.path.getsize`` (which the production code no longer calls).
    """
    transcript_path = tmp_path / "session.jsonl"
    encoded_transcript = (_make_success_line() + "\n").encode("utf-8")
    transcript_path.write_bytes(encoded_transcript)
    expected_byte_length = len(encoded_transcript)
    offsets_path = tmp_path / "offsets.json"

    seen_iterators: list[hook_log_extractor.AttachmentRecordIterator] = []
    genuine_factory = hook_log_extractor.iter_attachment_records_from_file

    def _factory_that_remembers(
        jsonl_file_path: str,
        start_offset: int,
        start_line_number: int = 0,
    ) -> hook_log_extractor.AttachmentRecordIterator:
        fresh_iterator = genuine_factory(
            jsonl_file_path,
            start_offset=start_offset,
            start_line_number=start_line_number,
        )
        seen_iterators.append(fresh_iterator)
        return fresh_iterator

    monkeypatch.setattr(
        hook_log_extractor,
        "iter_attachment_records_from_file",
        _factory_that_remembers,
    )

    stub_cursor = MagicMock()
    stub_connection = MagicMock()
    stub_connection.cursor.return_value.__enter__.return_value = stub_cursor

    with patch.object(
        hook_log_extractor, "connect_to_neon", return_value=stub_connection
    ):
        hook_log_extractor.run_full_extraction(
            transcripts_root=str(tmp_path),
            state_file_path=str(offsets_path),
            full_rebuild=False,
        )

    persisted = hook_log_extractor.load_offsets(str(offsets_path))
    assert seen_iterators, "iterator was never produced"
    offset_reported_by_iterator = seen_iterators[0].final_byte_offset
    assert offset_reported_by_iterator == expected_byte_length
    persisted_byte_offset = persisted[str(transcript_path)]["byte_offset"]
    assert persisted_byte_offset == offset_reported_by_iterator
1243
+
1244
+
1245
def test_save_offsets_is_serialized_across_threads(tmp_path: Path) -> None:
    """Five threads doing locked load-modify-save cycles must all keep their entry."""
    state_path_string = str(tmp_path / "offsets.json")
    hook_log_extractor.save_offsets(state_path_string, {})
    writer_paths = [f"C:/file_{each_index}.jsonl" for each_index in range(5)]

    def _writer_for_path(writer_path: str) -> None:
        # The whole read-modify-write cycle runs inside the offsets lock.
        with hook_log_extractor._acquire_offsets_lock(state_path_string):
            snapshot = hook_log_extractor.load_offsets(state_path_string)
            snapshot[writer_path] = {"byte_offset": 100, "line_number": 1}
            hook_log_extractor.save_offsets(state_path_string, snapshot)

    workers = [
        threading.Thread(target=_writer_for_path, args=(each_writer_path,))
        for each_writer_path in writer_paths
    ]
    for each_worker in workers:
        each_worker.start()
    for each_worker in workers:
        each_worker.join()

    persisted = hook_log_extractor.load_offsets(state_path_string)
    assert len(persisted) == 5
    for each_writer_path in writer_paths:
        assert each_writer_path in persisted
1269
+
1270
+
1271
def test_run_full_extraction_holds_lock_across_load_and_save(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """The extraction cycle must call the offsets lock helper at least once."""
    transcript_path = tmp_path / "session.jsonl"
    transcript_path.write_text(_make_success_line() + "\n", encoding="utf-8")
    offsets_path = tmp_path / "offsets.json"

    acquisitions_seen = 0
    genuine_lock_helper = hook_log_extractor._acquire_offsets_lock

    def _lock_helper_that_counts(state_file_path: str) -> Any:
        nonlocal acquisitions_seen
        acquisitions_seen += 1
        return genuine_lock_helper(state_file_path)

    monkeypatch.setattr(
        hook_log_extractor, "_acquire_offsets_lock", _lock_helper_that_counts
    )

    stub_cursor = MagicMock()
    stub_connection = MagicMock()
    stub_connection.cursor.return_value.__enter__.return_value = stub_cursor

    with patch.object(
        hook_log_extractor, "connect_to_neon", return_value=stub_connection
    ):
        hook_log_extractor.run_full_extraction(
            transcripts_root=str(tmp_path),
            state_file_path=str(offsets_path),
            full_rebuild=False,
        )

    assert acquisitions_seen >= 1
1306
+
1307
+
1308
def test_lock_file_handle_blocking_reraises_permanent_oserror_quickly(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A permanent ``OSError`` from ``msvcrt.locking`` is re-raised without retrying.

    ``EACCES`` is the documented contention errno for ``LK_NBLCK`` per the
    Microsoft ``_locking`` spec, so ``EBADF`` (invalid file descriptor) stands
    in here for a genuinely permanent failure. It must reach the caller with
    its errno intact and in under one second.
    """
    if hook_log_extractor.msvcrt is None:
        pytest.skip("msvcrt retry loop only exists on Windows runtimes")

    def _always_fail_with_ebadf(
        file_descriptor: int,
        mode_flag: int,
        byte_count: int,
    ) -> None:
        raise OSError(errno.EBADF, "invalid file descriptor")

    lock_path = tmp_path / "offsets.json.lock"
    # The ``with`` block closes the handle however the test ends.
    with lock_path.open("a+", encoding="utf-8") as lock_file_handle:
        monkeypatch.setattr(
            hook_log_extractor.msvcrt, "locking", _always_fail_with_ebadf
        )

        clock_before = time.monotonic()
        with pytest.raises(OSError) as excinfo:
            hook_log_extractor._lock_file_handle_blocking(lock_file_handle)
        seconds_taken = time.monotonic() - clock_before

        assert excinfo.value.errno == errno.EBADF
        assert seconds_taken < 1.0
1344
+
1345
+
1346
def test_lock_file_handle_blocking_caps_retries_on_contention_errno(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Contention (``EACCES``) is retried a bounded number of times, then re-raised.

    With ``LK_NBLCK``, contention surfaces as ``EACCES`` per the Microsoft
    ``_locking`` spec. The fake ``locking`` always fails that way, so the
    number of attempts must equal ``LOCK_MAXIMUM_RETRY_COUNT`` exactly.
    ``time.sleep`` is stubbed out so the retries do not slow the test.
    """
    if hook_log_extractor.msvcrt is None:
        pytest.skip("msvcrt retry loop only exists on Windows runtimes")

    attempts_made = 0

    def _always_fail_with_eacces(
        file_descriptor: int,
        mode_flag: int,
        byte_count: int,
    ) -> None:
        nonlocal attempts_made
        attempts_made += 1
        raise OSError(errno.EACCES, "retries exhausted")

    lock_path = tmp_path / "offsets.json.lock"
    # The ``with`` block closes the handle however the test ends.
    with lock_path.open("a+", encoding="utf-8") as lock_file_handle:
        monkeypatch.setattr(
            hook_log_extractor.msvcrt, "locking", _always_fail_with_eacces
        )
        monkeypatch.setattr(hook_log_extractor.time, "sleep", lambda _seconds: None)

        with pytest.raises(OSError) as excinfo:
            hook_log_extractor._lock_file_handle_blocking(lock_file_handle)

        assert excinfo.value.errno == errno.EACCES
        assert attempts_made == hook_log_extractor.LOCK_MAXIMUM_RETRY_COUNT
1386
+
1387
+
1388
def test_lock_file_handle_blocking_uses_nonblocking_mode_flag(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """The Windows branch must pass ``LK_NBLCK``, not ``LK_LOCK``, to ``msvcrt.locking``.

    Per the Microsoft ``_locking`` spec, ``LK_LOCK`` blocks internally for
    about 10 seconds per attempt; compounded with the retry loop that gives a
    worst-case wait of ~303s under sustained contention. ``LK_NBLCK`` raises
    ``OSError(EACCES)`` immediately, so the Python-level ``time.sleep`` is the
    only pacing mechanism and the retry budget stays near its intended ~3s.
    """
    if hook_log_extractor.msvcrt is None:
        pytest.skip("msvcrt mode-flag check only applies on Windows runtimes")

    mode_flags_seen: list[int] = []

    def _remember_mode_flag(
        file_descriptor: int,
        mode_flag: int,
        byte_count: int,
    ) -> None:
        mode_flags_seen.append(mode_flag)

    lock_path = tmp_path / "offsets.json.lock"
    # The ``with`` block closes the handle however the test ends.
    with lock_path.open("a+", encoding="utf-8") as lock_file_handle:
        monkeypatch.setattr(
            hook_log_extractor.msvcrt, "locking", _remember_mode_flag
        )

        hook_log_extractor._lock_file_handle_blocking(lock_file_handle)

        # Exactly one call is expected because the fake never raises.
        assert mode_flags_seen == [hook_log_extractor.msvcrt.LK_NBLCK]
1424
+
1425
+
1426
def test_run_full_extraction_does_not_hold_lock_across_db_io(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """``executemany`` must run while the offsets lock is NOT held.

    The lock helper is wrapped to flag when a lock scope is active, and the
    fake cursor's ``executemany`` records whether that flag was set when it
    was called.
    """
    transcript_path = tmp_path / "session.jsonl"
    transcript_path.write_text(_make_success_line() + "\n", encoding="utf-8")
    offsets_path = tmp_path / "offsets.json"

    lock_state = {"held": False, "held_during_insert": False}
    genuine_lock_helper = hook_log_extractor._acquire_offsets_lock

    @contextlib.contextmanager
    def _lock_helper_that_tracks(passed_state_file_path: str) -> Any:
        # Set before the real lock is acquired; cleared when the scope exits.
        lock_state["held"] = True
        try:
            with genuine_lock_helper(passed_state_file_path):
                yield
        finally:
            lock_state["held"] = False

    def _note_lock_state_on_insert(*_args: Any, **_kwargs: Any) -> None:
        if lock_state["held"]:
            lock_state["held_during_insert"] = True

    monkeypatch.setattr(
        hook_log_extractor, "_acquire_offsets_lock", _lock_helper_that_tracks
    )

    stub_cursor = MagicMock()
    stub_cursor.executemany.side_effect = _note_lock_state_on_insert
    stub_connection = MagicMock()
    stub_connection.cursor.return_value.__enter__.return_value = stub_cursor

    with patch.object(
        hook_log_extractor, "connect_to_neon", return_value=stub_connection
    ):
        hook_log_extractor.run_full_extraction(
            transcripts_root=str(tmp_path),
            state_file_path=str(offsets_path),
            full_rebuild=False,
        )

    assert stub_cursor.executemany.called, (
        "Test setup failed: DB insert never ran"
    )
    assert not lock_state["held_during_insert"], (
        "Offsets lock must not be held during DB insert calls"
    )
1477
+
1478
+
1479
+ def test_run_full_extraction_preserves_external_offset_updates(
1480
+ tmp_path: Path,
1481
+ ) -> None:
1482
+ """Narrow-scope save must merge with concurrent writers, not clobber."""
1483
+ jsonl_file = tmp_path / "session.jsonl"
1484
+ jsonl_file.write_text(_make_success_line() + "\n", encoding="utf-8")
1485
+ state_file = tmp_path / "offsets.json"
1486
+
1487
+ fake_cursor = MagicMock()
1488
+ fake_connection = MagicMock()
1489
+ fake_connection.cursor.return_value.__enter__.return_value = fake_cursor
1490
+
1491
+ other_path_entry = {
1492
+ "C:/other_session.jsonl": {"byte_offset": 777, "line_number": 9},
1493
+ }
1494
+
1495
+ original_save_offsets = hook_log_extractor.save_offsets
1496
+
1497
+ def _save_then_inject_external_writer(
1498
+ passed_state_file_path: str,
1499
+ passed_offsets: dict[str, dict[str, int]],
1500
+ ) -> None:
1501
+ original_save_offsets(passed_state_file_path, passed_offsets)
1502
+ if other_path_entry["C:/other_session.jsonl"][
1503
+ "byte_offset"
1504
+ ] == 777 and "C:/other_session.jsonl" not in passed_offsets:
1505
+ loaded_from_disk = hook_log_extractor.load_offsets(passed_state_file_path)
1506
+ loaded_from_disk["C:/other_session.jsonl"] = other_path_entry[
1507
+ "C:/other_session.jsonl"
1508
+ ]
1509
+ original_save_offsets(passed_state_file_path, loaded_from_disk)
1510
+
1511
+ with (
1512
+ patch.object(
1513
+ hook_log_extractor, "connect_to_neon", return_value=fake_connection
1514
+ ),
1515
+ patch.object(
1516
+ hook_log_extractor, "save_offsets", _save_then_inject_external_writer
1517
+ ),
1518
+ ):
1519
+ hook_log_extractor.run_full_extraction(
1520
+ transcripts_root=str(tmp_path),
1521
+ state_file_path=str(state_file),
1522
+ full_rebuild=False,
1523
+ )
1524
+
1525
+ final_offsets = hook_log_extractor.load_offsets(str(state_file))
1526
+ assert "C:/other_session.jsonl" in final_offsets
1527
+ assert final_offsets["C:/other_session.jsonl"] == {
1528
+ "byte_offset": 777,
1529
+ "line_number": 9,
1530
+ }
1531
+ assert str(jsonl_file) in final_offsets