claude-jacked 0.2.3__py3-none-any.whl → 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. claude_jacked-0.2.9.dist-info/METADATA +523 -0
  2. claude_jacked-0.2.9.dist-info/RECORD +33 -0
  3. jacked/cli.py +752 -47
  4. jacked/client.py +196 -29
  5. jacked/data/agents/code-simplicity-reviewer.md +87 -0
  6. jacked/data/agents/defensive-error-handler.md +93 -0
  7. jacked/data/agents/double-check-reviewer.md +214 -0
  8. jacked/data/agents/git-pr-workflow-manager.md +149 -0
  9. jacked/data/agents/issue-pr-coordinator.md +131 -0
  10. jacked/data/agents/pr-workflow-checker.md +199 -0
  11. jacked/data/agents/readme-maintainer.md +123 -0
  12. jacked/data/agents/test-coverage-engineer.md +155 -0
  13. jacked/data/agents/test-coverage-improver.md +139 -0
  14. jacked/data/agents/wiki-documentation-architect.md +580 -0
  15. jacked/data/commands/audit-rules.md +103 -0
  16. jacked/data/commands/dc.md +155 -0
  17. jacked/data/commands/learn.md +89 -0
  18. jacked/data/commands/pr.md +4 -0
  19. jacked/data/commands/redo.md +85 -0
  20. jacked/data/commands/techdebt.md +115 -0
  21. jacked/data/prompts/security_gatekeeper.txt +58 -0
  22. jacked/data/rules/jacked_behaviors.md +11 -0
  23. jacked/data/skills/jacked/SKILL.md +162 -0
  24. jacked/index_write_tracker.py +227 -0
  25. jacked/indexer.py +255 -129
  26. jacked/retriever.py +389 -137
  27. jacked/searcher.py +65 -13
  28. jacked/transcript.py +339 -0
  29. claude_jacked-0.2.3.dist-info/METADATA +0 -483
  30. claude_jacked-0.2.3.dist-info/RECORD +0 -13
  31. {claude_jacked-0.2.3.dist-info → claude_jacked-0.2.9.dist-info}/WHEEL +0 -0
  32. {claude_jacked-0.2.3.dist-info → claude_jacked-0.2.9.dist-info}/entry_points.txt +0 -0
  33. {claude_jacked-0.2.3.dist-info → claude_jacked-0.2.9.dist-info}/licenses/LICENSE +0 -0
jacked/indexer.py CHANGED
@@ -2,6 +2,13 @@
 Session indexing for Jacked.

 Handles parsing Claude sessions and upserting to Qdrant with server-side embedding.
+
+Content types indexed:
+- plan: Full implementation strategy from ~/.claude/plans/{slug}.md
+- subagent_summary: Rich summaries from subagent outputs
+- summary_label: Tiny chapter titles from compaction events
+- user_message: First few user messages for intent matching
+- chunk: Full transcript chunks for full retrieval mode
 """

 import logging
@@ -20,11 +27,11 @@ from jacked.config import (
 )
 from jacked.client import QdrantSessionClient, INFERENCE_MODEL
 from jacked.transcript import (
-    parse_jsonl_file,
+    parse_jsonl_file_enriched,
     chunk_text,
-    chunk_intent_text,
-    ParsedTranscript,
+    EnrichedTranscript,
 )
+from jacked.index_write_tracker import IndexWriteTracker


 logger = logging.getLogger(__name__)
@@ -34,9 +41,12 @@ class SessionIndexer:
     """
     Indexes Claude sessions to Qdrant using server-side embedding.

-    Creates two types of points for each session:
-    - Intent points: User messages for semantic search
-    - Chunk points: Full transcript chunks for retrieval
+    Creates multiple content types for each session:
+    - plan: Full implementation strategy (gold - highest priority)
+    - subagent_summary: Rich summaries from agent outputs (gold)
+    - summary_label: Tiny chapter titles from compaction
+    - user_message: First few user messages for intent matching
+    - chunk: Full transcript chunks for full retrieval mode

     Qdrant Cloud Inference handles all embedding server-side.

@@ -60,6 +70,10 @@ class SessionIndexer:
         """
         self.config = config
         self.client = client or QdrantSessionClient(config)
+        # Config hash for detecting chunk_size/overlap changes
+        self._config_hash = content_hash(f"{config.chunk_size}:{config.chunk_overlap}")
+        # Write tracker for incremental indexing (NOT for retrieval!)
+        self._tracker = IndexWriteTracker(self._config_hash)

     def index_session(
         self,
@@ -68,20 +82,24 @@ class SessionIndexer:
         force: bool = False,
     ) -> dict:
         """
-        Index a single session to Qdrant.
+        Index a single session to Qdrant with incremental updates.
+
+        Uses local SQLite tracker to avoid re-pushing unchanged content.
+        Only indexes NEW or CHANGED points - much more efficient than
+        the old delete-all-and-replace approach.

         Args:
             session_path: Path to the .jsonl session file
             repo_path: Full path to the repository
-            force: If True, re-index even if unchanged
+            force: If True, clear tracker and re-seed from Qdrant

         Returns:
             Dict with indexing results:
             - session_id: The session ID
-            - indexed: Whether the session was indexed
-            - skipped: Whether it was skipped (unchanged)
-            - intent_chunks: Number of intent chunks created
-            - transcript_chunks: Number of transcript chunks created
+            - indexed: Whether new content was indexed
+            - skipped: Whether it was skipped (no new content)
+            - new_points: Number of new/changed points indexed
+            - plans, subagent_summaries, etc.: Counts by content type
             - error: Error message if failed

         Examples:
@@ -92,8 +110,12 @@ class SessionIndexer:
             "session_id": session_path.stem,
             "indexed": False,
             "skipped": False,
-            "intent_chunks": 0,
-            "transcript_chunks": 0,
+            "new_points": 0,
+            "plans": 0,
+            "subagent_summaries": 0,
+            "summary_labels": 0,
+            "user_messages": 0,
+            "chunks": 0,
             "error": None,
         }

@@ -101,46 +123,79 @@ class SessionIndexer:
             # Ensure collection exists
             self.client.ensure_collection()

-            # Parse the transcript
-            transcript = parse_jsonl_file(session_path)
-            result["session_id"] = transcript.session_id
-
-            # Check if we should skip (unchanged)
-            if not force:
-                current_hash = content_hash(transcript.full_text)
-                existing = self._get_existing_hash(transcript.session_id)
-                if existing == current_hash:
-                    logger.debug(f"Session {transcript.session_id} unchanged, skipping")
-                    result["skipped"] = True
-                    return result
-
-            # Build points
-            points = self._build_points(transcript, repo_path)
-
-            if not points:
-                logger.warning(f"No points to index for session {transcript.session_id}")
-                result["error"] = "No content to index"
+            # Parse the transcript with enriched data
+            transcript = parse_jsonl_file_enriched(session_path)
+            session_id = transcript.session_id
+            result["session_id"] = session_id
+
+            # Check session metadata from tracker
+            meta = self._tracker.get_session_meta(session_id)
+
+            # Config changed? Clear and re-seed from Qdrant
+            if meta and meta["config_hash"] != self._config_hash:
+                logger.info(f"Config changed for session {session_id}, re-seeding from Qdrant")
+                self._tracker.clear_session(session_id)
+                meta = None
+
+            # Previous crash mid-indexing? Force re-index
+            if meta and meta["status"] == "indexing":
+                logger.info(f"Session {session_id} was interrupted mid-index, forcing re-seed")
+                force = True
+
+            # Cache miss or force? Seed from Qdrant (source of truth, THIS USER ONLY)
+            if meta is None or force:
+                self._tracker.clear_session(session_id)
+                self._tracker.seed_from_qdrant(session_id, self.client, self.config.user_name)
+
+            # Get what's already indexed
+            indexed = self._tracker.get_session_state(session_id)
+
+            # Mark as indexing BEFORE doing work (crash safety)
+            self._tracker.mark_indexing(session_id)
+
+            # Build only NEW/CHANGED points
+            points_to_index, points_metadata = self._build_incremental_points(
+                transcript, repo_path, indexed
+            )
+
+            if not points_to_index:
+                self._tracker.mark_complete(session_id)
+                result["skipped"] = True
+                logger.debug(f"Session {session_id}: no new content to index")
                 return result

-            # Delete existing points for this session (if any)
-            self.client.delete_by_session(transcript.session_id)
+            # Upsert to Qdrant (no delete needed - deterministic IDs handle overwrites)
+            self.client.upsert_points(points_to_index)

-            # Upsert new points
-            self.client.upsert_points(points)
+            # Record what we indexed in tracker
+            for content_type, idx, hash_val, point_id in points_metadata:
+                self._tracker.record_indexed(session_id, content_type, idx, hash_val, str(point_id))

-            # Count results
+            self._tracker.mark_complete(session_id)
+
+            # Count results by content_type
             result["indexed"] = True
-            for p in points:
-                payload = p.payload or {}
-                if payload.get("type") == "intent":
-                    result["intent_chunks"] += 1
-                elif payload.get("type") == "chunk":
-                    result["transcript_chunks"] += 1
+            result["new_points"] = len(points_to_index)
+            for content_type, _, _, _ in points_metadata:
+                if content_type == "plan":
+                    result["plans"] += 1
+                elif content_type == "subagent_summary":
+                    result["subagent_summaries"] += 1
+                elif content_type == "summary_label":
+                    result["summary_labels"] += 1
+                elif content_type == "user_message":
+                    result["user_messages"] += 1
+                elif content_type == "chunk":
+                    result["chunks"] += 1

             logger.info(
-                f"Indexed session {transcript.session_id}: "
-                f"{result['intent_chunks']} intent chunks, "
-                f"{result['transcript_chunks']} transcript chunks"
+                f"Indexed session {session_id}: "
+                f"{result['new_points']} new points ("
+                f"{result['plans']} plan, "
+                f"{result['subagent_summaries']} summaries, "
+                f"{result['summary_labels']} labels, "
+                f"{result['user_messages']} msgs, "
+                f"{result['chunks']} chunks)"
             )

             return result
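The rewritten index_session above replaces the 0.2.3 skip-or-delete-everything flow with per-point hash comparison: each (content_type, index) slot is rebuilt only when its content hash no longer matches what the local tracker recorded. Below is a minimal sketch of that comparison, using a plain dict where the package uses its SQLite-backed IndexWriteTracker; the hashing helper is a stand-in, since content_hash's actual algorithm is not shown in this diff.

import hashlib


def content_hash(text: str) -> str:
    # Stand-in for the package's content_hash() helper (exact algorithm not shown in the diff).
    return hashlib.sha256(text.encode("utf-8")).hexdigest()


def select_changed(items, tracker_state, content_type):
    # Yield (index, text, hash) only for slots whose stored hash differs from the fresh one.
    for i, text in enumerate(items):
        h = content_hash(text)
        if tracker_state.get((content_type, i)) != h:
            yield i, text, h


# Example: chunk 0 is unchanged, chunk 1 was edited, so only chunk 1 would be re-upserted.
tracker_state = {
    ("chunk", 0): content_hash("first chunk"),
    ("chunk", 1): content_hash("old second chunk"),
}
for i, text, h in select_changed(["first chunk", "new second chunk"], tracker_state, "chunk"):
    print(f"re-index chunk {i} ({h[:8]})")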
@@ -150,41 +205,40 @@ class SessionIndexer:
             result["error"] = str(e)
             return result

-    def _get_existing_hash(self, session_id: str) -> Optional[str]:
-        """
-        Get the content hash of an existing indexed session.
+    def _make_point_id(self, session_id: str, content_type: str, index: int) -> str:
+        """Generate deterministic point ID.

         Args:
-            session_id: Session ID to check
+            session_id: The session UUID
+            content_type: One of plan, subagent_summary, summary_label, user_message, chunk
+            index: Index within that content type

         Returns:
-            Content hash string or None if not found
+            UUID5 string for the point
         """
-        # Look for the first intent point using deterministic UUID
-        point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{session_id}_intent_0"))
-        point = self.client.get_point_by_id(point_id)
-        if point and point.payload:
-            return point.payload.get("content_hash")
-        return None
-
-    def _build_points(
+        return str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{session_id}:{content_type}:{index}"))
+
+    def _build_incremental_points(
         self,
-        transcript: ParsedTranscript,
+        transcript: EnrichedTranscript,
         repo_path: str,
-    ) -> list[models.PointStruct]:
+        indexed: dict,
+    ) -> tuple[list[models.PointStruct], list[tuple]]:
         """
-        Build Qdrant points for a transcript.
-
-        Uses models.Document for server-side embedding via Qdrant Cloud Inference.
+        Build only NEW or CHANGED points by comparing against what's already indexed.

         Args:
-            transcript: Parsed transcript
+            transcript: EnrichedTranscript with all extracted data
             repo_path: Full path to the repository
+            indexed: Dict mapping (content_type, index) -> content_hash from tracker

         Returns:
-            List of PointStruct objects
+            Tuple of (points_to_index, points_metadata) where points_metadata is
+            a list of (content_type, index, content_hash, point_id) tuples
         """
-        points = []
+        points_to_index = []
+        points_metadata = []  # (content_type, index, hash, point_id)
+
         repo_id = get_repo_id(repo_path)
         repo_name = get_repo_name(repo_path)
         full_hash = content_hash(transcript.full_text)
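The new _make_point_id shown above is what lets the upsert path drop the old delete_by_session call: UUID5 is a pure function of its inputs, so re-indexing the same slot always targets the same Qdrant point ID and simply overwrites it. A standard-library-only illustration (the session ID here is made up):

import uuid


def make_point_id(session_id: str, content_type: str, index: int) -> str:
    # Same (session, content_type, index) always yields the same UUID, so upserts overwrite in place.
    return str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{session_id}:{content_type}:{index}"))


a = make_point_id("9c1b2e44-demo", "plan", 0)
b = make_point_id("9c1b2e44-demo", "plan", 0)
assert a == b  # deterministic across runs and machines

# Note: 0.2.3 keyed IDs as f"{session_id}_intent_{i}" / f"{session_id}_chunk_{i}",
# so points written by the old and new schemes for the same session use different IDs.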
@@ -194,79 +248,151 @@ class SessionIndexer:
             else datetime.now().isoformat()
         )

-        # Build intent points (user messages for semantic search)
-        intent_chunks = chunk_intent_text(
-            transcript.intent_text,
-            max_tokens=self.config.intent_max_tokens,
-        )
+        # Base payload for all points
+        base_payload = {
+            "repo_id": repo_id,
+            "repo_name": repo_name,
+            "repo_path": repo_path,
+            "session_id": transcript.session_id,
+            "user_name": self.config.user_name,
+            "machine": self.config.machine_name,
+            "timestamp": timestamp_str,
+            "content_hash": full_hash,
+            "slug": transcript.slug,
+        }
+
+        # 1. Plan - check hash
+        if transcript.plan:
+            plan_hash = content_hash(transcript.plan.content)
+            if indexed.get(("plan", 0)) != plan_hash:
+                point_id = self._make_point_id(transcript.session_id, "plan", 0)
+                points_to_index.append(
+                    models.PointStruct(
+                        id=point_id,
+                        vector=models.Document(
+                            text=transcript.plan.content[:8000],
+                            model=INFERENCE_MODEL,
+                        ),
+                        payload={
+                            **base_payload,
+                            "type": "plan",
+                            "content_type": "plan",
+                            "content": transcript.plan.content,
+                            "plan_path": str(transcript.plan.path),
+                            "chunk_index": 0,
+                        },
+                    )
+                )
+                points_metadata.append(("plan", 0, plan_hash, point_id))
+
+        # 2. User messages - compare by content hash
+        max_user_messages = 5
+        for i, msg in enumerate(transcript.user_messages[:max_user_messages]):
+            if not msg.content or len(msg.content) < 20:
+                continue
+            msg_hash = content_hash(msg.content)
+            if indexed.get(("user_message", i)) != msg_hash:
+                point_id = self._make_point_id(transcript.session_id, "user_message", i)
+                points_to_index.append(
+                    models.PointStruct(
+                        id=point_id,
+                        vector=models.Document(
+                            text=msg.content[:2000],
+                            model=INFERENCE_MODEL,
+                        ),
+                        payload={
+                            **base_payload,
+                            "type": "user_message",
+                            "content_type": "user_message",
+                            "content": msg.content,
+                            "chunk_index": i,
+                        },
+                    )
+                )
+                points_metadata.append(("user_message", i, msg_hash, point_id))
+
+        # 3. Agent summaries - compare by hash
+        for i, agent_summary in enumerate(transcript.agent_summaries):
+            summary_hash = content_hash(agent_summary.summary_text)
+            if indexed.get(("subagent_summary", i)) != summary_hash:
+                point_id = self._make_point_id(transcript.session_id, "subagent_summary", i)
+                points_to_index.append(
+                    models.PointStruct(
+                        id=point_id,
+                        vector=models.Document(
+                            text=agent_summary.summary_text[:8000],
+                            model=INFERENCE_MODEL,
+                        ),
+                        payload={
+                            **base_payload,
+                            "type": "subagent_summary",
+                            "content_type": "subagent_summary",
+                            "content": agent_summary.summary_text,
+                            "agent_id": agent_summary.agent_id,
+                            "agent_type": agent_summary.agent_type,
+                            "chunk_index": i,
+                        },
+                    )
+                )
+                points_metadata.append(("subagent_summary", i, summary_hash, point_id))
+
+        # 4. Summary labels - compare by hash
+        for i, label in enumerate(transcript.summary_labels):
+            label_hash = content_hash(label.label)
+            if indexed.get(("summary_label", i)) != label_hash:
+                point_id = self._make_point_id(transcript.session_id, "summary_label", i)
+                points_to_index.append(
+                    models.PointStruct(
+                        id=point_id,
+                        vector=models.Document(
+                            text=label.label,
+                            model=INFERENCE_MODEL,
+                        ),
+                        payload={
+                            **base_payload,
+                            "type": "summary_label",
+                            "content_type": "summary_label",
+                            "content": label.label,
+                            "leaf_uuid": label.leaf_uuid,
+                            "chunk_index": i,
+                        },
+                    )
+                )
+                points_metadata.append(("summary_label", i, label_hash, point_id))

-        # Get total transcript chunks for metadata
+        # 5. Chunks - compare by hash (handles boundary drift)
         transcript_chunks = chunk_text(
             transcript.full_text,
             chunk_size=self.config.chunk_size,
             overlap=self.config.chunk_overlap,
         )

-        # Create intent points with Document for server-side embedding
-        for i, chunk in enumerate(intent_chunks):
-            if not chunk.strip():
-                continue
-
-            # Generate deterministic UUID from session_id + type + index
-            point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{transcript.session_id}_intent_{i}"))
-            points.append(
-                models.PointStruct(
-                    id=point_id,
-                    vector=models.Document(
-                        text=chunk,
-                        model=INFERENCE_MODEL,
-                    ),
-                    payload={
-                        "type": "intent",
-                        "repo_id": repo_id,
-                        "repo_name": repo_name,
-                        "repo_path": repo_path,
-                        "session_id": transcript.session_id,
-                        "user_name": self.config.user_name,
-                        "machine": self.config.machine_name,
-                        "timestamp": timestamp_str,
-                        "content_hash": full_hash,
-                        "intent_text": chunk,
-                        "chunk_index": i,
-                        "total_chunks": len(intent_chunks),
-                        "transcript_chunk_count": len(transcript_chunks),
-                    },
-                )
-            )
-
-        # Create transcript chunk points for retrieval
         for i, chunk in enumerate(transcript_chunks):
             if not chunk.strip():
                 continue
-
-            # Generate deterministic UUID from session_id + type + index
-            point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{transcript.session_id}_chunk_{i}"))
-            points.append(
-                models.PointStruct(
-                    id=point_id,
-                    vector=models.Document(
-                        text=chunk,
-                        model=INFERENCE_MODEL,
-                    ),
-                    payload={
-                        "type": "chunk",
-                        "repo_id": repo_id,
-                        "repo_name": repo_name,
-                        "session_id": transcript.session_id,
-                        "user_name": self.config.user_name,
-                        "chunk_index": i,
-                        "total_chunks": len(transcript_chunks),
-                        "content": chunk,
-                    },
+            chunk_hash = content_hash(chunk)
+            if indexed.get(("chunk", i)) != chunk_hash:
+                point_id = self._make_point_id(transcript.session_id, "chunk", i)
+                points_to_index.append(
+                    models.PointStruct(
+                        id=point_id,
+                        vector=models.Document(
+                            text=chunk[:4000],
+                            model=INFERENCE_MODEL,
+                        ),
+                        payload={
+                            **base_payload,
+                            "type": "chunk",
+                            "content_type": "chunk",
+                            "content": chunk,
+                            "chunk_index": i,
+                            "total_chunks": len(transcript_chunks),
+                        },
+                    )
                 )
-            )
+                points_metadata.append(("chunk", i, chunk_hash, point_id))

-        return points
+        return points_to_index, points_metadata

     def index_all_sessions(
         self,
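Throughout the new _build_incremental_points, vectors are supplied as models.Document objects rather than raw float lists, which is how qdrant-client asks Qdrant Cloud Inference to embed the text server-side. A hedged, self-contained sketch of one such upsert with the stock qdrant-client API; the endpoint, collection name, and embedding model below are placeholders, not values taken from this package (its model choice lives in INFERENCE_MODEL inside jacked/client.py, which is not part of this hunk):

import uuid

from qdrant_client import QdrantClient, models

# Placeholder connection details.
client = QdrantClient(url="https://example.cloud.qdrant.io", api_key="...")

point = models.PointStruct(
    id=str(uuid.uuid5(uuid.NAMESPACE_DNS, "demo-session:chunk:0")),
    # Document vectors are embedded by the server; the client never loads an embedding model.
    vector=models.Document(
        text="chunk text to embed",
        model="sentence-transformers/all-MiniLM-L6-v2",  # placeholder model name
    ),
    payload={"type": "chunk", "content_type": "chunk", "session_id": "demo-session", "chunk_index": 0},
)

client.upsert(collection_name="claude_sessions", points=[point])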