claude-jacked 0.2.3__py3-none-any.whl → 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. claude_jacked-0.2.9.dist-info/METADATA +523 -0
  2. claude_jacked-0.2.9.dist-info/RECORD +33 -0
  3. jacked/cli.py +752 -47
  4. jacked/client.py +196 -29
  5. jacked/data/agents/code-simplicity-reviewer.md +87 -0
  6. jacked/data/agents/defensive-error-handler.md +93 -0
  7. jacked/data/agents/double-check-reviewer.md +214 -0
  8. jacked/data/agents/git-pr-workflow-manager.md +149 -0
  9. jacked/data/agents/issue-pr-coordinator.md +131 -0
  10. jacked/data/agents/pr-workflow-checker.md +199 -0
  11. jacked/data/agents/readme-maintainer.md +123 -0
  12. jacked/data/agents/test-coverage-engineer.md +155 -0
  13. jacked/data/agents/test-coverage-improver.md +139 -0
  14. jacked/data/agents/wiki-documentation-architect.md +580 -0
  15. jacked/data/commands/audit-rules.md +103 -0
  16. jacked/data/commands/dc.md +155 -0
  17. jacked/data/commands/learn.md +89 -0
  18. jacked/data/commands/pr.md +4 -0
  19. jacked/data/commands/redo.md +85 -0
  20. jacked/data/commands/techdebt.md +115 -0
  21. jacked/data/prompts/security_gatekeeper.txt +58 -0
  22. jacked/data/rules/jacked_behaviors.md +11 -0
  23. jacked/data/skills/jacked/SKILL.md +162 -0
  24. jacked/index_write_tracker.py +227 -0
  25. jacked/indexer.py +255 -129
  26. jacked/retriever.py +389 -137
  27. jacked/searcher.py +65 -13
  28. jacked/transcript.py +339 -0
  29. claude_jacked-0.2.3.dist-info/METADATA +0 -483
  30. claude_jacked-0.2.3.dist-info/RECORD +0 -13
  31. {claude_jacked-0.2.3.dist-info → claude_jacked-0.2.9.dist-info}/WHEEL +0 -0
  32. {claude_jacked-0.2.3.dist-info → claude_jacked-0.2.9.dist-info}/entry_points.txt +0 -0
  33. {claude_jacked-0.2.3.dist-info → claude_jacked-0.2.9.dist-info}/licenses/LICENSE +0 -0
jacked/indexer.py CHANGED
@@ -2,6 +2,13 @@
 Session indexing for Jacked.

 Handles parsing Claude sessions and upserting to Qdrant with server-side embedding.
+
+Content types indexed:
+- plan: Full implementation strategy from ~/.claude/plans/{slug}.md
+- subagent_summary: Rich summaries from subagent outputs
+- summary_label: Tiny chapter titles from compaction events
+- user_message: First few user messages for intent matching
+- chunk: Full transcript chunks for full retrieval mode
 """

 import logging
@@ -20,11 +27,11 @@ from jacked.config import (
 )
 from jacked.client import QdrantSessionClient, INFERENCE_MODEL
 from jacked.transcript import (
-    parse_jsonl_file,
+    parse_jsonl_file_enriched,
     chunk_text,
-    chunk_intent_text,
-    ParsedTranscript,
+    EnrichedTranscript,
 )
+from jacked.index_write_tracker import IndexWriteTracker


 logger = logging.getLogger(__name__)
@@ -34,9 +41,12 @@ class SessionIndexer:
     """
     Indexes Claude sessions to Qdrant using server-side embedding.

-    Creates two types of points for each session:
-    - Intent points: User messages for semantic search
-    - Chunk points: Full transcript chunks for retrieval
+    Creates multiple content types for each session:
+    - plan: Full implementation strategy (gold - highest priority)
+    - subagent_summary: Rich summaries from agent outputs (gold)
+    - summary_label: Tiny chapter titles from compaction
+    - user_message: First few user messages for intent matching
+    - chunk: Full transcript chunks for full retrieval mode

     Qdrant Cloud Inference handles all embedding server-side.

@@ -60,6 +70,10 @@ class SessionIndexer:
         """
         self.config = config
         self.client = client or QdrantSessionClient(config)
+        # Config hash for detecting chunk_size/overlap changes
+        self._config_hash = content_hash(f"{config.chunk_size}:{config.chunk_overlap}")
+        # Write tracker for incremental indexing (NOT for retrieval!)
+        self._tracker = IndexWriteTracker(self._config_hash)

     def index_session(
         self,
@@ -68,20 +82,24 @@ class SessionIndexer:
         force: bool = False,
     ) -> dict:
         """
-        Index a single session to Qdrant.
+        Index a single session to Qdrant with incremental updates.
+
+        Uses local SQLite tracker to avoid re-pushing unchanged content.
+        Only indexes NEW or CHANGED points - much more efficient than
+        the old delete-all-and-replace approach.

         Args:
             session_path: Path to the .jsonl session file
             repo_path: Full path to the repository
-            force: If True, re-index even if unchanged
+            force: If True, clear tracker and re-seed from Qdrant

         Returns:
             Dict with indexing results:
             - session_id: The session ID
-            - indexed: Whether the session was indexed
-            - skipped: Whether it was skipped (unchanged)
-            - intent_chunks: Number of intent chunks created
-            - transcript_chunks: Number of transcript chunks created
+            - indexed: Whether new content was indexed
+            - skipped: Whether it was skipped (no new content)
+            - new_points: Number of new/changed points indexed
+            - plans, subagent_summaries, etc.: Counts by content type
             - error: Error message if failed

         Examples:
@@ -92,8 +110,12 @@ class SessionIndexer:
             "session_id": session_path.stem,
             "indexed": False,
             "skipped": False,
-            "intent_chunks": 0,
-            "transcript_chunks": 0,
+            "new_points": 0,
+            "plans": 0,
+            "subagent_summaries": 0,
+            "summary_labels": 0,
+            "user_messages": 0,
+            "chunks": 0,
             "error": None,
         }

@@ -101,46 +123,79 @@ class SessionIndexer:
             # Ensure collection exists
             self.client.ensure_collection()

-            # Parse the transcript
-            transcript = parse_jsonl_file(session_path)
-            result["session_id"] = transcript.session_id
-
-            # Check if we should skip (unchanged)
-            if not force:
-                current_hash = content_hash(transcript.full_text)
-                existing = self._get_existing_hash(transcript.session_id)
-                if existing == current_hash:
-                    logger.debug(f"Session {transcript.session_id} unchanged, skipping")
-                    result["skipped"] = True
-                    return result
-
-            # Build points
-            points = self._build_points(transcript, repo_path)
-
-            if not points:
-                logger.warning(f"No points to index for session {transcript.session_id}")
-                result["error"] = "No content to index"
+            # Parse the transcript with enriched data
+            transcript = parse_jsonl_file_enriched(session_path)
+            session_id = transcript.session_id
+            result["session_id"] = session_id
+
+            # Check session metadata from tracker
+            meta = self._tracker.get_session_meta(session_id)
+
+            # Config changed? Clear and re-seed from Qdrant
+            if meta and meta["config_hash"] != self._config_hash:
+                logger.info(f"Config changed for session {session_id}, re-seeding from Qdrant")
+                self._tracker.clear_session(session_id)
+                meta = None
+
+            # Previous crash mid-indexing? Force re-index
+            if meta and meta["status"] == "indexing":
+                logger.info(f"Session {session_id} was interrupted mid-index, forcing re-seed")
+                force = True
+
+            # Cache miss or force? Seed from Qdrant (source of truth, THIS USER ONLY)
+            if meta is None or force:
+                self._tracker.clear_session(session_id)
+                self._tracker.seed_from_qdrant(session_id, self.client, self.config.user_name)
+
+            # Get what's already indexed
+            indexed = self._tracker.get_session_state(session_id)
+
+            # Mark as indexing BEFORE doing work (crash safety)
+            self._tracker.mark_indexing(session_id)
+
+            # Build only NEW/CHANGED points
+            points_to_index, points_metadata = self._build_incremental_points(
+                transcript, repo_path, indexed
+            )
+
+            if not points_to_index:
+                self._tracker.mark_complete(session_id)
+                result["skipped"] = True
+                logger.debug(f"Session {session_id}: no new content to index")
                 return result

-            # Delete existing points for this session (if any)
-            self.client.delete_by_session(transcript.session_id)
+            # Upsert to Qdrant (no delete needed - deterministic IDs handle overwrites)
+            self.client.upsert_points(points_to_index)

-            # Upsert new points
-            self.client.upsert_points(points)
+            # Record what we indexed in tracker
+            for content_type, idx, hash_val, point_id in points_metadata:
+                self._tracker.record_indexed(session_id, content_type, idx, hash_val, str(point_id))

-            # Count results
+            self._tracker.mark_complete(session_id)
+
+            # Count results by content_type
             result["indexed"] = True
-            for p in points:
-                payload = p.payload or {}
-                if payload.get("type") == "intent":
-                    result["intent_chunks"] += 1
-                elif payload.get("type") == "chunk":
-                    result["transcript_chunks"] += 1
+            result["new_points"] = len(points_to_index)
+            for content_type, _, _, _ in points_metadata:
+                if content_type == "plan":
+                    result["plans"] += 1
+                elif content_type == "subagent_summary":
+                    result["subagent_summaries"] += 1
+                elif content_type == "summary_label":
+                    result["summary_labels"] += 1
+                elif content_type == "user_message":
+                    result["user_messages"] += 1
+                elif content_type == "chunk":
+                    result["chunks"] += 1

             logger.info(
-                f"Indexed session {transcript.session_id}: "
-                f"{result['intent_chunks']} intent chunks, "
-                f"{result['transcript_chunks']} transcript chunks"
+                f"Indexed session {session_id}: "
+                f"{result['new_points']} new points ("
+                f"{result['plans']} plan, "
+                f"{result['subagent_summaries']} summaries, "
+                f"{result['summary_labels']} labels, "
+                f"{result['user_messages']} msgs, "
+                f"{result['chunks']} chunks)"
             )

             return result
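The rewritten index_session above replaces the 0.2.3 skip-or-delete-everything flow with per-point hash comparison: each (content_type, index) slot is rebuilt only when its content hash no longer matches what the local tracker recorded. Below is a minimal sketch of that comparison, using a plain dict where the package uses its SQLite-backed IndexWriteTracker; the hashing helper is a stand-in, since content_hash's actual algorithm is not shown in this diff.

import hashlib


def content_hash(text: str) -> str:
    # Stand-in for the package's content_hash() helper (exact algorithm not shown in the diff).
    return hashlib.sha256(text.encode("utf-8")).hexdigest()


def select_changed(items, tracker_state, content_type):
    # Yield (index, text, hash) only for slots whose stored hash differs from the fresh one.
    for i, text in enumerate(items):
        h = content_hash(text)
        if tracker_state.get((content_type, i)) != h:
            yield i, text, h


# Example: chunk 0 is unchanged, chunk 1 was edited, so only chunk 1 would be re-upserted.
tracker_state = {
    ("chunk", 0): content_hash("first chunk"),
    ("chunk", 1): content_hash("old second chunk"),
}
for i, text, h in select_changed(["first chunk", "new second chunk"], tracker_state, "chunk"):
    print(f"re-index chunk {i} ({h[:8]})")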
@@ -150,41 +205,40 @@ class SessionIndexer:
             result["error"] = str(e)
             return result

-    def _get_existing_hash(self, session_id: str) -> Optional[str]:
-        """
-        Get the content hash of an existing indexed session.
+    def _make_point_id(self, session_id: str, content_type: str, index: int) -> str:
+        """Generate deterministic point ID.

         Args:
-            session_id: Session ID to check
+            session_id: The session UUID
+            content_type: One of plan, subagent_summary, summary_label, user_message, chunk
+            index: Index within that content type

         Returns:
-            Content hash string or None if not found
+            UUID5 string for the point
         """
-        # Look for the first intent point using deterministic UUID
-        point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{session_id}_intent_0"))
-        point = self.client.get_point_by_id(point_id)
-        if point and point.payload:
-            return point.payload.get("content_hash")
-        return None
-
-    def _build_points(
+        return str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{session_id}:{content_type}:{index}"))
+
+    def _build_incremental_points(
         self,
-        transcript: ParsedTranscript,
+        transcript: EnrichedTranscript,
         repo_path: str,
-    ) -> list[models.PointStruct]:
+        indexed: dict,
+    ) -> tuple[list[models.PointStruct], list[tuple]]:
         """
-        Build Qdrant points for a transcript.
-
-        Uses models.Document for server-side embedding via Qdrant Cloud Inference.
+        Build only NEW or CHANGED points by comparing against what's already indexed.

         Args:
-            transcript: Parsed transcript
+            transcript: EnrichedTranscript with all extracted data
             repo_path: Full path to the repository
+            indexed: Dict mapping (content_type, index) -> content_hash from tracker

         Returns:
-            List of PointStruct objects
+            Tuple of (points_to_index, points_metadata) where points_metadata is
+            a list of (content_type, index, content_hash, point_id) tuples
         """
-        points = []
+        points_to_index = []
+        points_metadata = []  # (content_type, index, hash, point_id)
+
         repo_id = get_repo_id(repo_path)
         repo_name = get_repo_name(repo_path)
         full_hash = content_hash(transcript.full_text)
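The new _make_point_id shown above is what lets the upsert path drop the old delete_by_session call: UUID5 is a pure function of its inputs, so re-indexing the same slot always targets the same Qdrant point ID and simply overwrites it. A standard-library-only illustration (the session ID here is made up):

import uuid


def make_point_id(session_id: str, content_type: str, index: int) -> str:
    # Same (session, content_type, index) always yields the same UUID, so upserts overwrite in place.
    return str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{session_id}:{content_type}:{index}"))


a = make_point_id("9c1b2e44-demo", "plan", 0)
b = make_point_id("9c1b2e44-demo", "plan", 0)
assert a == b  # deterministic across runs and machines

# Note: 0.2.3 keyed IDs as f"{session_id}_intent_{i}" / f"{session_id}_chunk_{i}",
# so points written by the old and new schemes for the same session use different IDs.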
@@ -194,79 +248,151 @@ class SessionIndexer:
             else datetime.now().isoformat()
         )

-        # Build intent points (user messages for semantic search)
-        intent_chunks = chunk_intent_text(
-            transcript.intent_text,
-            max_tokens=self.config.intent_max_tokens,
-        )
+        # Base payload for all points
+        base_payload = {
+            "repo_id": repo_id,
+            "repo_name": repo_name,
+            "repo_path": repo_path,
+            "session_id": transcript.session_id,
+            "user_name": self.config.user_name,
+            "machine": self.config.machine_name,
+            "timestamp": timestamp_str,
+            "content_hash": full_hash,
+            "slug": transcript.slug,
+        }
+
+        # 1. Plan - check hash
+        if transcript.plan:
+            plan_hash = content_hash(transcript.plan.content)
+            if indexed.get(("plan", 0)) != plan_hash:
+                point_id = self._make_point_id(transcript.session_id, "plan", 0)
+                points_to_index.append(
+                    models.PointStruct(
+                        id=point_id,
+                        vector=models.Document(
+                            text=transcript.plan.content[:8000],
+                            model=INFERENCE_MODEL,
+                        ),
+                        payload={
+                            **base_payload,
+                            "type": "plan",
+                            "content_type": "plan",
+                            "content": transcript.plan.content,
+                            "plan_path": str(transcript.plan.path),
+                            "chunk_index": 0,
+                        },
+                    )
+                )
+                points_metadata.append(("plan", 0, plan_hash, point_id))
+
+        # 2. User messages - compare by content hash
+        max_user_messages = 5
+        for i, msg in enumerate(transcript.user_messages[:max_user_messages]):
+            if not msg.content or len(msg.content) < 20:
+                continue
+            msg_hash = content_hash(msg.content)
+            if indexed.get(("user_message", i)) != msg_hash:
+                point_id = self._make_point_id(transcript.session_id, "user_message", i)
+                points_to_index.append(
+                    models.PointStruct(
+                        id=point_id,
+                        vector=models.Document(
+                            text=msg.content[:2000],
+                            model=INFERENCE_MODEL,
+                        ),
+                        payload={
+                            **base_payload,
+                            "type": "user_message",
+                            "content_type": "user_message",
+                            "content": msg.content,
+                            "chunk_index": i,
+                        },
+                    )
+                )
+                points_metadata.append(("user_message", i, msg_hash, point_id))
+
+        # 3. Agent summaries - compare by hash
+        for i, agent_summary in enumerate(transcript.agent_summaries):
+            summary_hash = content_hash(agent_summary.summary_text)
+            if indexed.get(("subagent_summary", i)) != summary_hash:
+                point_id = self._make_point_id(transcript.session_id, "subagent_summary", i)
+                points_to_index.append(
+                    models.PointStruct(
+                        id=point_id,
+                        vector=models.Document(
+                            text=agent_summary.summary_text[:8000],
+                            model=INFERENCE_MODEL,
+                        ),
+                        payload={
+                            **base_payload,
+                            "type": "subagent_summary",
+                            "content_type": "subagent_summary",
+                            "content": agent_summary.summary_text,
+                            "agent_id": agent_summary.agent_id,
+                            "agent_type": agent_summary.agent_type,
+                            "chunk_index": i,
+                        },
+                    )
+                )
+                points_metadata.append(("subagent_summary", i, summary_hash, point_id))
+
+        # 4. Summary labels - compare by hash
+        for i, label in enumerate(transcript.summary_labels):
+            label_hash = content_hash(label.label)
+            if indexed.get(("summary_label", i)) != label_hash:
+                point_id = self._make_point_id(transcript.session_id, "summary_label", i)
+                points_to_index.append(
+                    models.PointStruct(
+                        id=point_id,
+                        vector=models.Document(
+                            text=label.label,
+                            model=INFERENCE_MODEL,
+                        ),
+                        payload={
+                            **base_payload,
+                            "type": "summary_label",
+                            "content_type": "summary_label",
+                            "content": label.label,
+                            "leaf_uuid": label.leaf_uuid,
+                            "chunk_index": i,
+                        },
+                    )
+                )
+                points_metadata.append(("summary_label", i, label_hash, point_id))

-        # Get total transcript chunks for metadata
+        # 5. Chunks - compare by hash (handles boundary drift)
         transcript_chunks = chunk_text(
             transcript.full_text,
             chunk_size=self.config.chunk_size,
             overlap=self.config.chunk_overlap,
         )

-        # Create intent points with Document for server-side embedding
-        for i, chunk in enumerate(intent_chunks):
-            if not chunk.strip():
-                continue
-
-            # Generate deterministic UUID from session_id + type + index
-            point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{transcript.session_id}_intent_{i}"))
-            points.append(
-                models.PointStruct(
-                    id=point_id,
-                    vector=models.Document(
-                        text=chunk,
-                        model=INFERENCE_MODEL,
-                    ),
-                    payload={
-                        "type": "intent",
-                        "repo_id": repo_id,
-                        "repo_name": repo_name,
-                        "repo_path": repo_path,
-                        "session_id": transcript.session_id,
-                        "user_name": self.config.user_name,
-                        "machine": self.config.machine_name,
-                        "timestamp": timestamp_str,
-                        "content_hash": full_hash,
-                        "intent_text": chunk,
-                        "chunk_index": i,
-                        "total_chunks": len(intent_chunks),
-                        "transcript_chunk_count": len(transcript_chunks),
-                    },
-                )
-            )
-
-        # Create transcript chunk points for retrieval
         for i, chunk in enumerate(transcript_chunks):
             if not chunk.strip():
                 continue
-
-            # Generate deterministic UUID from session_id + type + index
-            point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{transcript.session_id}_chunk_{i}"))
-            points.append(
-                models.PointStruct(
-                    id=point_id,
-                    vector=models.Document(
-                        text=chunk,
-                        model=INFERENCE_MODEL,
-                    ),
-                    payload={
-                        "type": "chunk",
-                        "repo_id": repo_id,
-                        "repo_name": repo_name,
-                        "session_id": transcript.session_id,
-                        "user_name": self.config.user_name,
-                        "chunk_index": i,
-                        "total_chunks": len(transcript_chunks),
-                        "content": chunk,
-                    },
+            chunk_hash = content_hash(chunk)
+            if indexed.get(("chunk", i)) != chunk_hash:
+                point_id = self._make_point_id(transcript.session_id, "chunk", i)
+                points_to_index.append(
+                    models.PointStruct(
+                        id=point_id,
+                        vector=models.Document(
+                            text=chunk[:4000],
+                            model=INFERENCE_MODEL,
+                        ),
+                        payload={
+                            **base_payload,
+                            "type": "chunk",
+                            "content_type": "chunk",
+                            "content": chunk,
+                            "chunk_index": i,
+                            "total_chunks": len(transcript_chunks),
+                        },
+                    )
                 )
-            )
+                points_metadata.append(("chunk", i, chunk_hash, point_id))

-        return points
+        return points_to_index, points_metadata

     def index_all_sessions(
         self,
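Throughout the new _build_incremental_points, vectors are supplied as models.Document objects rather than raw float lists, which is how qdrant-client asks Qdrant Cloud Inference to embed the text server-side. A hedged, self-contained sketch of one such upsert with the stock qdrant-client API; the endpoint, collection name, and embedding model below are placeholders, not values taken from this package (its model choice lives in INFERENCE_MODEL inside jacked/client.py, which is not part of this hunk):

import uuid

from qdrant_client import QdrantClient, models

# Placeholder connection details.
client = QdrantClient(url="https://example.cloud.qdrant.io", api_key="...")

point = models.PointStruct(
    id=str(uuid.uuid5(uuid.NAMESPACE_DNS, "demo-session:chunk:0")),
    # Document vectors are embedded by the server; the client never loads an embedding model.
    vector=models.Document(
        text="chunk text to embed",
        model="sentence-transformers/all-MiniLM-L6-v2",  # placeholder model name
    ),
    payload={"type": "chunk", "content_type": "chunk", "session_id": "demo-session", "chunk_index": 0},
)

client.upsert(collection_name="claude_sessions", points=[point])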