claude-jacked 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
jacked/indexer.py CHANGED
@@ -31,6 +31,7 @@ from jacked.transcript import (
     chunk_text,
     EnrichedTranscript,
 )
+from jacked.index_write_tracker import IndexWriteTracker


 logger = logging.getLogger(__name__)
@@ -69,6 +70,10 @@ class SessionIndexer:
         """
         self.config = config
         self.client = client or QdrantSessionClient(config)
+        # Config hash for detecting chunk_size/overlap changes
+        self._config_hash = content_hash(f"{config.chunk_size}:{config.chunk_overlap}")
+        # Write tracker for incremental indexing (NOT for retrieval!)
+        self._tracker = IndexWriteTracker(self._config_hash)

     def index_session(
         self,
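Note on the config hash added above: hashing the chunking parameters means any change to chunk_size or chunk_overlap yields a different hash, which the indexing path further down uses to invalidate the tracker's cached state for a session. A minimal illustration, assuming content_hash is the same helper already imported by this module and that the parameter values are made up:

    # Illustration only; content_hash is assumed to be the helper imported in
    # jacked/indexer.py, and these chunk settings are example values.
    old_hash = content_hash("1000:200")   # chunk_size=1000, chunk_overlap=200
    new_hash = content_hash("800:200")    # chunk_size lowered in the config
    assert old_hash != new_hash           # mismatch triggers clear_session() + re-seed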
@@ -77,23 +82,24 @@ class SessionIndexer:
         force: bool = False,
     ) -> dict:
         """
-        Index a single session to Qdrant with all content types.
+        Index a single session to Qdrant with incremental updates.
+
+        Uses local SQLite tracker to avoid re-pushing unchanged content.
+        Only indexes NEW or CHANGED points - much more efficient than
+        the old delete-all-and-replace approach.

         Args:
             session_path: Path to the .jsonl session file
             repo_path: Full path to the repository
-            force: If True, re-index even if unchanged
+            force: If True, clear tracker and re-seed from Qdrant

         Returns:
             Dict with indexing results:
             - session_id: The session ID
-            - indexed: Whether the session was indexed
-            - skipped: Whether it was skipped (unchanged)
-            - plans: Number of plan points (0 or 1)
-            - subagent_summaries: Number of subagent summary points
-            - summary_labels: Number of summary label points
-            - user_messages: Number of user message points
-            - chunks: Number of transcript chunk points
+            - indexed: Whether new content was indexed
+            - skipped: Whether it was skipped (no new content)
+            - new_points: Number of new/changed points indexed
+            - plans, subagent_summaries, etc.: Counts by content type
             - error: Error message if failed

         Examples:
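As a rough illustration of the reshaped return value, a caller might read it like this (a sketch only; the `indexer` variable and the paths are hypothetical, and the keys are the ones documented above):

    from pathlib import Path

    # Hypothetical call site; `indexer` is assumed to be an already-configured
    # SessionIndexer, and the paths are made-up examples.
    result = indexer.index_session(
        session_path=Path("/home/user/.claude/projects/demo/abc123.jsonl"),
        repo_path="/home/user/projects/demo",
    )
    if result.get("error"):
        print(f"indexing failed: {result['error']}")
    elif result["skipped"]:
        print("nothing new to push")
    else:
        print(
            f"pushed {result['new_points']} new/changed points "
            f"({result['plans']} plans, {result['chunks']} chunks)"
        )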
@@ -104,6 +110,7 @@ class SessionIndexer:
             "session_id": session_path.stem,
             "indexed": False,
             "skipped": False,
+            "new_points": 0,
             "plans": 0,
             "subagent_summaries": 0,
             "summary_labels": 0,
@@ -118,36 +125,58 @@ class SessionIndexer:

             # Parse the transcript with enriched data
             transcript = parse_jsonl_file_enriched(session_path)
-            result["session_id"] = transcript.session_id
-
-            # Check if we should skip (unchanged)
-            if not force:
-                current_hash = content_hash(transcript.full_text)
-                existing = self._get_existing_hash(transcript.session_id)
-                if existing == current_hash:
-                    logger.debug(f"Session {transcript.session_id} unchanged, skipping")
-                    result["skipped"] = True
-                    return result
-
-            # Build points for all content types
-            points = self._build_points(transcript, repo_path)
-
-            if not points:
-                logger.warning(f"No points to index for session {transcript.session_id}")
-                result["error"] = "No content to index"
+            session_id = transcript.session_id
+            result["session_id"] = session_id
+
+            # Check session metadata from tracker
+            meta = self._tracker.get_session_meta(session_id)
+
+            # Config changed? Clear and re-seed from Qdrant
+            if meta and meta["config_hash"] != self._config_hash:
+                logger.info(f"Config changed for session {session_id}, re-seeding from Qdrant")
+                self._tracker.clear_session(session_id)
+                meta = None
+
+            # Previous crash mid-indexing? Force re-index
+            if meta and meta["status"] == "indexing":
+                logger.info(f"Session {session_id} was interrupted mid-index, forcing re-seed")
+                force = True
+
+            # Cache miss or force? Seed from Qdrant (source of truth, THIS USER ONLY)
+            if meta is None or force:
+                self._tracker.clear_session(session_id)
+                self._tracker.seed_from_qdrant(session_id, self.client, self.config.user_name)
+
+            # Get what's already indexed
+            indexed = self._tracker.get_session_state(session_id)
+
+            # Mark as indexing BEFORE doing work (crash safety)
+            self._tracker.mark_indexing(session_id)
+
+            # Build only NEW/CHANGED points
+            points_to_index, points_metadata = self._build_incremental_points(
+                transcript, repo_path, indexed
+            )
+
+            if not points_to_index:
+                self._tracker.mark_complete(session_id)
+                result["skipped"] = True
+                logger.debug(f"Session {session_id}: no new content to index")
                 return result

-            # Delete existing points for this session (if any)
-            self.client.delete_by_session(transcript.session_id)
+            # Upsert to Qdrant (no delete needed - deterministic IDs handle overwrites)
+            self.client.upsert_points(points_to_index)

-            # Upsert new points
-            self.client.upsert_points(points)
+            # Record what we indexed in tracker
+            for content_type, idx, hash_val, point_id in points_metadata:
+                self._tracker.record_indexed(session_id, content_type, idx, hash_val, str(point_id))
+
+            self._tracker.mark_complete(session_id)

             # Count results by content_type
             result["indexed"] = True
-            for p in points:
-                payload = p.payload or {}
-                content_type = payload.get("content_type", payload.get("type"))
+            result["new_points"] = len(points_to_index)
+            for content_type, _, _, _ in points_metadata:
                 if content_type == "plan":
                     result["plans"] += 1
                 elif content_type == "subagent_summary":
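The new flow above leans on IndexWriteTracker, which is not part of this diff. A minimal sketch of the state it appears to keep, inferred only from the calls shown above (the schema, column names, and in-memory SQLite location are assumptions; clear_session, seed_from_qdrant, mark_indexing, and mark_complete would correspondingly rebuild the table or flip session_meta.status):

    import sqlite3

    # Sketch of the tracker interface used by index_session(); the real
    # IndexWriteTracker lives in jacked/index_write_tracker.py and is not shown
    # in this diff, so everything concrete here is an assumption.
    class WriteTrackerSketch:
        def __init__(self, config_hash: str, db_path: str = ":memory:"):
            self.config_hash = config_hash
            self.db = sqlite3.connect(db_path)
            self.db.executescript(
                "CREATE TABLE IF NOT EXISTS session_meta ("
                " session_id TEXT PRIMARY KEY, config_hash TEXT, status TEXT);"
                "CREATE TABLE IF NOT EXISTS indexed_points ("
                " session_id TEXT, content_type TEXT, idx INTEGER,"
                " content_hash TEXT, point_id TEXT,"
                " PRIMARY KEY (session_id, content_type, idx));"
            )

        def get_session_meta(self, session_id: str):
            row = self.db.execute(
                "SELECT config_hash, status FROM session_meta WHERE session_id = ?",
                (session_id,),
            ).fetchone()
            return {"config_hash": row[0], "status": row[1]} if row else None

        def get_session_state(self, session_id: str) -> dict:
            # The (content_type, index) -> content_hash mapping that
            # _build_incremental_points() compares against.
            rows = self.db.execute(
                "SELECT content_type, idx, content_hash FROM indexed_points"
                " WHERE session_id = ?",
                (session_id,),
            )
            return {(ct, i): h for ct, i, h in rows}

        def record_indexed(self, session_id, content_type, idx, content_hash, point_id):
            self.db.execute(
                "INSERT OR REPLACE INTO indexed_points VALUES (?, ?, ?, ?, ?)",
                (session_id, content_type, idx, content_hash, point_id),
            )
            self.db.commit()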
@@ -160,12 +189,13 @@ class SessionIndexer:
                     result["chunks"] += 1

             logger.info(
-                f"Indexed session {transcript.session_id}: "
+                f"Indexed session {session_id}: "
+                f"{result['new_points']} new points ("
                 f"{result['plans']} plan, "
-                f"{result['subagent_summaries']} agent summaries, "
+                f"{result['subagent_summaries']} summaries, "
                 f"{result['summary_labels']} labels, "
-                f"{result['user_messages']} user msgs, "
-                f"{result['chunks']} chunks"
+                f"{result['user_messages']} msgs, "
+                f"{result['chunks']} chunks)"
             )

             return result
@@ -175,23 +205,6 @@ class SessionIndexer:
             result["error"] = str(e)
             return result

-    def _get_existing_hash(self, session_id: str) -> Optional[str]:
-        """
-        Get the content hash of an existing indexed session.
-
-        Args:
-            session_id: Session ID to check
-
-        Returns:
-            Content hash string or None if not found
-        """
-        # Look for the first user_message point using deterministic UUID
-        point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{session_id}:user_message:0"))
-        point = self.client.get_point_by_id(point_id)
-        if point and point.payload:
-            return point.payload.get("content_hash")
-        return None
-
     def _make_point_id(self, session_id: str, content_type: str, index: int) -> str:
         """Generate deterministic point ID.

@@ -205,29 +218,27 @@ class SessionIndexer:
         """
         return str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{session_id}:{content_type}:{index}"))

-    def _build_points(
+    def _build_incremental_points(
         self,
         transcript: EnrichedTranscript,
         repo_path: str,
-    ) -> list[models.PointStruct]:
+        indexed: dict,
+    ) -> tuple[list[models.PointStruct], list[tuple]]:
         """
-        Build Qdrant points for all content types in a transcript.
-
-        Creates points for:
-        - plan: Full implementation strategy (if exists)
-        - subagent_summary: Rich summaries from agent outputs
-        - summary_label: Tiny chapter titles from compaction
-        - user_message: First few user messages for intent matching
-        - chunk: Full transcript chunks for full retrieval
+        Build only NEW or CHANGED points by comparing against what's already indexed.

         Args:
             transcript: EnrichedTranscript with all extracted data
             repo_path: Full path to the repository
+            indexed: Dict mapping (content_type, index) -> content_hash from tracker

         Returns:
-            List of PointStruct objects
+            Tuple of (points_to_index, points_metadata) where points_metadata is
+            a list of (content_type, index, content_hash, point_id) tuples
         """
-        points = []
+        points_to_index = []
+        points_metadata = []  # (content_type, index, hash, point_id)
+
         repo_id = get_repo_id(repo_path)
         repo_name = get_repo_name(repo_path)
         full_hash = content_hash(transcript.full_text)
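The "deterministic IDs handle overwrites" comment earlier rests on uuid.uuid5 being a pure function of its inputs: re-indexing the same (session_id, content_type, index) slot always yields the same point ID, so the upsert replaces the stale point instead of creating a duplicate. A standalone check of that property (the session ID is just an example value):

    import uuid

    def make_point_id(session_id: str, content_type: str, index: int) -> str:
        # Same derivation as SessionIndexer._make_point_id in the diff above.
        return str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{session_id}:{content_type}:{index}"))

    # Equal inputs give equal UUIDs, so a Qdrant upsert with this ID overwrites
    # the previous version of the point rather than duplicating it.
    assert make_point_id("abc123", "chunk", 4) == make_point_id("abc123", "chunk", 4)
    assert make_point_id("abc123", "chunk", 4) != make_point_id("abc123", "chunk", 5)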
@@ -250,93 +261,106 @@ class SessionIndexer:
             "slug": transcript.slug,
         }

-        # 1. Plan file (gold - highest priority)
+        # 1. Plan - check hash
         if transcript.plan:
-            point_id = self._make_point_id(transcript.session_id, "plan", 0)
-            points.append(
-                models.PointStruct(
-                    id=point_id,
-                    vector=models.Document(
-                        text=transcript.plan.content[:8000],  # Limit for embedding
-                        model=INFERENCE_MODEL,
-                    ),
-                    payload={
-                        **base_payload,
-                        "type": "plan",  # Keep for backwards compat
-                        "content_type": "plan",
-                        "content": transcript.plan.content,
-                        "plan_path": str(transcript.plan.path),
-                    },
-                )
-            )
-
-        # 2. Subagent summaries (gold)
-        for i, agent_summary in enumerate(transcript.agent_summaries):
-            point_id = self._make_point_id(transcript.session_id, "subagent_summary", i)
-            points.append(
-                models.PointStruct(
-                    id=point_id,
-                    vector=models.Document(
-                        text=agent_summary.summary_text[:8000],  # Limit for embedding
-                        model=INFERENCE_MODEL,
-                    ),
-                    payload={
-                        **base_payload,
-                        "type": "subagent_summary",
-                        "content_type": "subagent_summary",
-                        "content": agent_summary.summary_text,
-                        "agent_id": agent_summary.agent_id,
-                        "agent_type": agent_summary.agent_type,
-                        "chunk_index": i,
-                    },
+            plan_hash = content_hash(transcript.plan.content)
+            if indexed.get(("plan", 0)) != plan_hash:
+                point_id = self._make_point_id(transcript.session_id, "plan", 0)
+                points_to_index.append(
+                    models.PointStruct(
+                        id=point_id,
+                        vector=models.Document(
+                            text=transcript.plan.content[:8000],
+                            model=INFERENCE_MODEL,
+                        ),
+                        payload={
+                            **base_payload,
+                            "type": "plan",
+                            "content_type": "plan",
+                            "content": transcript.plan.content,
+                            "plan_path": str(transcript.plan.path),
+                            "chunk_index": 0,
+                        },
+                    )
                 )
-            )
-
-        # 3. Summary labels (chapter titles from compaction)
-        for i, label in enumerate(transcript.summary_labels):
-            point_id = self._make_point_id(transcript.session_id, "summary_label", i)
-            points.append(
-                models.PointStruct(
-                    id=point_id,
-                    vector=models.Document(
-                        text=label.label,
-                        model=INFERENCE_MODEL,
-                    ),
-                    payload={
-                        **base_payload,
-                        "type": "summary_label",
-                        "content_type": "summary_label",
-                        "content": label.label,
-                        "leaf_uuid": label.leaf_uuid,
-                        "chunk_index": i,
-                    },
-                )
-            )
+                points_metadata.append(("plan", 0, plan_hash, point_id))

-        # 4. User messages (first 5 for intent matching)
+        # 2. User messages - compare by content hash
         max_user_messages = 5
         for i, msg in enumerate(transcript.user_messages[:max_user_messages]):
             if not msg.content or len(msg.content) < 20:
                 continue
-            point_id = self._make_point_id(transcript.session_id, "user_message", i)
-            points.append(
-                models.PointStruct(
-                    id=point_id,
-                    vector=models.Document(
-                        text=msg.content[:2000],  # Limit for embedding
-                        model=INFERENCE_MODEL,
-                    ),
-                    payload={
-                        **base_payload,
-                        "type": "user_message",
-                        "content_type": "user_message",
-                        "content": msg.content,
-                        "chunk_index": i,
-                    },
+            msg_hash = content_hash(msg.content)
+            if indexed.get(("user_message", i)) != msg_hash:
+                point_id = self._make_point_id(transcript.session_id, "user_message", i)
+                points_to_index.append(
+                    models.PointStruct(
+                        id=point_id,
+                        vector=models.Document(
+                            text=msg.content[:2000],
+                            model=INFERENCE_MODEL,
+                        ),
+                        payload={
+                            **base_payload,
+                            "type": "user_message",
+                            "content_type": "user_message",
+                            "content": msg.content,
+                            "chunk_index": i,
+                        },
+                    )
                 )
-            )
+                points_metadata.append(("user_message", i, msg_hash, point_id))

-        # 5. Transcript chunks (for full retrieval mode)
+        # 3. Agent summaries - compare by hash
+        for i, agent_summary in enumerate(transcript.agent_summaries):
+            summary_hash = content_hash(agent_summary.summary_text)
+            if indexed.get(("subagent_summary", i)) != summary_hash:
+                point_id = self._make_point_id(transcript.session_id, "subagent_summary", i)
+                points_to_index.append(
+                    models.PointStruct(
+                        id=point_id,
+                        vector=models.Document(
+                            text=agent_summary.summary_text[:8000],
+                            model=INFERENCE_MODEL,
+                        ),
+                        payload={
+                            **base_payload,
+                            "type": "subagent_summary",
+                            "content_type": "subagent_summary",
+                            "content": agent_summary.summary_text,
+                            "agent_id": agent_summary.agent_id,
+                            "agent_type": agent_summary.agent_type,
+                            "chunk_index": i,
+                        },
+                    )
+                )
+                points_metadata.append(("subagent_summary", i, summary_hash, point_id))
+
+        # 4. Summary labels - compare by hash
+        for i, label in enumerate(transcript.summary_labels):
+            label_hash = content_hash(label.label)
+            if indexed.get(("summary_label", i)) != label_hash:
+                point_id = self._make_point_id(transcript.session_id, "summary_label", i)
+                points_to_index.append(
+                    models.PointStruct(
+                        id=point_id,
+                        vector=models.Document(
+                            text=label.label,
+                            model=INFERENCE_MODEL,
+                        ),
+                        payload={
+                            **base_payload,
+                            "type": "summary_label",
+                            "content_type": "summary_label",
+                            "content": label.label,
+                            "leaf_uuid": label.leaf_uuid,
+                            "chunk_index": i,
+                        },
+                    )
+                )
+                points_metadata.append(("summary_label", i, label_hash, point_id))
+
+        # 5. Chunks - compare by hash (handles boundary drift)
         transcript_chunks = chunk_text(
             transcript.full_text,
             chunk_size=self.config.chunk_size,
@@ -346,27 +370,29 @@ class SessionIndexer:
         for i, chunk in enumerate(transcript_chunks):
             if not chunk.strip():
                 continue
-
-            point_id = self._make_point_id(transcript.session_id, "chunk", i)
-            points.append(
-                models.PointStruct(
-                    id=point_id,
-                    vector=models.Document(
-                        text=chunk[:4000],  # Limit for embedding
-                        model=INFERENCE_MODEL,
-                    ),
-                    payload={
-                        **base_payload,
-                        "type": "chunk",
-                        "content_type": "chunk",
-                        "content": chunk,
-                        "chunk_index": i,
-                        "total_chunks": len(transcript_chunks),
-                    },
+            chunk_hash = content_hash(chunk)
+            if indexed.get(("chunk", i)) != chunk_hash:
+                point_id = self._make_point_id(transcript.session_id, "chunk", i)
+                points_to_index.append(
+                    models.PointStruct(
+                        id=point_id,
+                        vector=models.Document(
+                            text=chunk[:4000],
+                            model=INFERENCE_MODEL,
+                        ),
+                        payload={
+                            **base_payload,
+                            "type": "chunk",
+                            "content_type": "chunk",
+                            "content": chunk,
+                            "chunk_index": i,
+                            "total_chunks": len(transcript_chunks),
+                        },
+                    )
                 )
-            )
+                points_metadata.append(("chunk", i, chunk_hash, point_id))

-        return points
+        return points_to_index, points_metadata

     def index_all_sessions(
         self,
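Taken together, the tracker state and the per-slot hash checks mean only drifted slots get re-embedded and re-pushed. A compact sketch of that gating logic, with a stand-in hash function (the real content_hash comes from jacked.transcript and its algorithm is not shown in this diff):

    import hashlib

    def fake_content_hash(text: str) -> str:
        # Stand-in only; the real content_hash() is not part of this diff.
        return hashlib.sha256(text.encode("utf-8")).hexdigest()

    # State previously recorded by the tracker: (content_type, index) -> hash.
    indexed = {
        ("chunk", 0): fake_content_hash("alpha"),
        ("chunk", 1): fake_content_hash("beta"),
    }
    new_chunks = ["alpha", "beta changed"]  # chunk 1 drifted, chunk 0 did not

    to_push = [
        (i, chunk)
        for i, chunk in enumerate(new_chunks)
        if indexed.get(("chunk", i)) != fake_content_hash(chunk)
    ]
    assert [i for i, _ in to_push] == [1]  # only the changed slot is re-indexed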
jacked/searcher.py CHANGED
@@ -333,9 +333,13 @@ class SessionSearcher:
                 session_id=session.get("session_id", ""),
                 repo_name=session.get("repo_name", "unknown"),
                 repo_path=session.get("repo_path", ""),
+                user_name=session.get("user_name", "unknown"),
                 machine=session.get("machine", "unknown"),
                 timestamp=timestamp,
                 score=100,  # No relevance score for list
+                semantic_score=0.0,  # Not applicable for list
+                is_own=session.get("user_name") == self.config.user_name,
+                is_current_repo=True,  # We're filtering by repo
                 intent_preview="",  # Not available in list
                 chunk_count=session.get("chunk_count", 0),
             )