claude-self-reflect 3.0.1 → 3.0.2
This diff shows the changes between publicly released versions of the package as they appear in the supported public registries. It is provided for informational purposes only.
|
@@ -12,13 +12,22 @@ You are a resilient and comprehensive testing specialist for Claude Self-Reflect
|
|
|
12
12
|
- Streaming importer maintains <50MB memory while processing every 60s
|
|
13
13
|
- MCP tools enable reflection and memory storage
|
|
14
14
|
- System must handle sensitive API keys securely
|
|
15
|
+
- Modular importer architecture in `scripts/importer/` package
|
|
16
|
+
- Voyage API key read from `.env` file automatically
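
The &lt;50MB streaming-importer requirement above is something a test run can check directly. A minimal sketch of such a check, assuming `psutil` is installed and that the importer's PID is known (both are assumptions, not part of this package):

```python
# Sketch: verify a running importer process stays under the 50MB RSS budget.
# Assumes psutil is available and the importer PID is passed on the command line.
import sys
import psutil

LIMIT_MB = 50

def check_rss(pid: int) -> bool:
    rss_mb = psutil.Process(pid).memory_info().rss / (1024 * 1024)
    print(f"importer RSS: {rss_mb:.1f} MB (limit {LIMIT_MB} MB)")
    return rss_mb < LIMIT_MB

if __name__ == "__main__":
    ok = check_rss(int(sys.argv[1]))
    sys.exit(0 if ok else 1)
```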
|
|
17
|
+
|
|
18
|
+
## CRITICAL Testing Protocol
|
|
19
|
+
1. **Test Local Mode First** - Ensure all functionality works with FastEmbed
|
|
20
|
+
2. **Test Cloud Mode** - Switch to Voyage AI and validate
|
|
21
|
+
3. **RESTORE TO LOCAL** - Machine MUST be left in 100% local state after testing
|
|
22
|
+
4. **Certify Both Modes** - Only proceed to release if both modes pass
|
|
23
|
+
5. **NO Model Changes** - Use sentence-transformers/all-MiniLM-L6-v2 (384 dims) for local
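
Protocol items 1 and 5 can be certified together with a quick local-embedding check. A minimal sketch using the same `fastembed` model the importer configures later in this diff; the only assumption is that `fastembed` is installed in the active environment:

```python
# Sketch: confirm the local FastEmbed model is all-MiniLM-L6-v2 and yields 384-dim vectors.
from fastembed import TextEmbedding

model = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector = next(iter(model.embed(["local embedding smoke test"])))

assert len(vector) == 384, f"expected 384 dims, got {len(vector)}"
print(f"✅ local model OK: {len(vector)} dimensions")
```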
|
|
15
24
|
|
|
16
25
|
## Comprehensive Test Suite
|
|
17
26
|
|
|
18
27
|
### Available Test Categories
|
|
19
|
-
The project
|
|
28
|
+
The project includes a well-organized test suite:
|
|
20
29
|
|
|
21
|
-
1. **MCP Tool Integration** (`
|
|
30
|
+
1. **MCP Tool Integration** (`tests/integration/test_mcp_tools.py`)
|
|
22
31
|
- All MCP tools with various parameters
|
|
23
32
|
- Edge cases and error handling
|
|
24
33
|
- Cross-project search validation
|
|
@@ -67,21 +76,20 @@ The project now includes a comprehensive test suite in `/tests/` directory:
|
|
|
67
76
|
```bash
|
|
68
77
|
# Run ALL tests
|
|
69
78
|
cd ~/projects/claude-self-reflect
|
|
70
|
-
python tests/
|
|
79
|
+
python -m pytest tests/
|
|
71
80
|
|
|
72
|
-
# Run specific categories
|
|
73
|
-
python
|
|
81
|
+
# Run specific test categories
|
|
82
|
+
python -m pytest tests/integration/
|
|
83
|
+
python -m pytest tests/unit/
|
|
84
|
+
python -m pytest tests/performance/
|
|
74
85
|
|
|
75
86
|
# Run with verbose output
|
|
76
|
-
python tests/
|
|
77
|
-
|
|
78
|
-
# List available test categories
|
|
79
|
-
python tests/run_all_tests.py --list
|
|
87
|
+
python -m pytest tests/ -v
|
|
80
88
|
|
|
81
89
|
# Run individual test files
|
|
82
|
-
python tests/
|
|
83
|
-
python tests/
|
|
84
|
-
python tests/
|
|
90
|
+
python tests/integration/test_mcp_tools.py
|
|
91
|
+
python tests/integration/test_collection_naming.py
|
|
92
|
+
python tests/integration/test_system_integration.py
|
|
85
93
|
```
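
For orientation, here is a minimal pytest-style sketch of the kind of check the integration tests above perform. It only assumes a Qdrant instance on the default `http://localhost:6333` used throughout this diff; the test name and body are illustrative, not copied from the package's suite:

```python
# Sketch: illustrative integration check, not a copy of the package's tests.
import requests

QDRANT_URL = "http://localhost:6333"

def test_collections_use_embedding_suffix():
    resp = requests.get(f"{QDRANT_URL}/collections", timeout=5)
    resp.raise_for_status()
    names = [c["name"] for c in resp.json()["result"]["collections"]]
    conv = [n for n in names if n.startswith("conv_")]
    assert conv, "no conversation collections found - run an import first"
    assert all(n.endswith(("_local", "_voyage")) for n in conv), conv
```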
|
|
86
94
|
|
|
87
95
|
### Test Results Location
|
|
@@ -367,77 +375,113 @@ if [ -n "$VOYAGE_KEY" ]; then
|
|
|
367
375
|
fi
|
|
368
376
|
```
|
|
369
377
|
|
|
370
|
-
###
|
|
378
|
+
### CRITICAL: Verify Actual Imports (Not Just API Connection!)
|
|
371
379
|
```bash
|
|
372
|
-
echo "===
|
|
373
|
-
|
|
374
|
-
# Step 1:
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
echo "Current embedding mode: ${PREFER_LOCAL_EMBEDDINGS:-true}"
|
|
379
|
-
|
|
380
|
-
# Step 2: Check prerequisites
|
|
381
|
-
if [ -z "$VOYAGE_KEY" ]; then
|
|
382
|
-
echo "⚠️ WARNING: VOYAGE_KEY not set"
|
|
383
|
-
echo "To test cloud mode, set: export VOYAGE_KEY='your-key'"
|
|
384
|
-
echo "Skipping cloud test..."
|
|
385
|
-
exit 0
|
|
380
|
+
echo "=== REAL Cloud Embedding Import Test ==="
|
|
381
|
+
|
|
382
|
+
# Step 1: Verify prerequisites
|
|
383
|
+
if [ ! -f .env ] || [ -z "$(grep VOYAGE_KEY .env)" ]; then
|
|
384
|
+
echo "❌ FAIL: No VOYAGE_KEY in .env file"
|
|
385
|
+
exit 1
|
|
386
386
|
fi
|
|
387
387
|
|
|
388
|
-
#
|
|
389
|
-
|
|
388
|
+
# Extract API key
|
|
389
|
+
export VOYAGE_KEY=$(grep VOYAGE_KEY .env | cut -d= -f2)
|
|
390
|
+
echo "✅ Found Voyage key: ${VOYAGE_KEY:0:10}..."
|
|
391
|
+
|
|
392
|
+
# Step 2: Count existing collections before test
|
|
393
|
+
BEFORE_LOCAL=$(curl -s http://localhost:6333/collections | jq -r '.result.collections[].name' | grep "_local" | wc -l)
|
|
394
|
+
BEFORE_VOYAGE=$(curl -s http://localhost:6333/collections | jq -r '.result.collections[].name' | grep "_voyage" | wc -l)
|
|
395
|
+
echo "Before: $BEFORE_LOCAL local, $BEFORE_VOYAGE voyage collections"
|
|
396
|
+
|
|
397
|
+
# Step 3: Create NEW test conversation for import
|
|
398
|
+
TEST_PROJECT="test-voyage-$(date +%s)"
|
|
399
|
+
TEST_DIR=~/.claude/projects/$TEST_PROJECT
|
|
400
|
+
mkdir -p $TEST_DIR
|
|
401
|
+
TEST_FILE=$TEST_DIR/voyage-test.jsonl
|
|
402
|
+
|
|
403
|
+
cat > $TEST_FILE << 'EOF'
|
|
404
|
+
{"type":"conversation","uuid":"voyage-test-001","name":"Voyage Import Test","messages":[{"role":"human","content":"Testing actual Voyage AI import"},{"role":"assistant","content":[{"type":"text","text":"This should create a real Voyage collection with 1024-dim vectors"}]}],"conversation_id":"voyage-test-001","created_at":"2025-09-08T00:00:00Z"}
|
|
405
|
+
EOF
|
|
406
|
+
|
|
407
|
+
echo "✅ Created test file: $TEST_FILE"
|
|
408
|
+
|
|
409
|
+
# Step 4: Switch to Voyage mode and import
|
|
410
|
+
echo "Switching to Voyage mode..."
|
|
390
411
|
export PREFER_LOCAL_EMBEDDINGS=false
|
|
391
|
-
|
|
392
|
-
docker compose --profile watch up -d streaming-importer
|
|
412
|
+
export USE_VOYAGE=true
|
|
393
413
|
|
|
394
|
-
#
|
|
395
|
-
|
|
396
|
-
|
|
414
|
+
# Run import directly with modular importer
|
|
415
|
+
cd ~/projects/claude-self-reflect
|
|
416
|
+
source venv/bin/activate
|
|
417
|
+
python -c "
|
|
418
|
+
import os
|
|
419
|
+
os.environ['VOYAGE_KEY'] = '$VOYAGE_KEY'
|
|
420
|
+
os.environ['PREFER_LOCAL_EMBEDDINGS'] = 'false'
|
|
421
|
+
os.environ['USE_VOYAGE'] = 'true'
|
|
422
|
+
|
|
423
|
+
from scripts.importer.main import ImporterContainer
|
|
424
|
+
container = ImporterContainer()
|
|
425
|
+
processor = container.processor()
|
|
426
|
+
|
|
427
|
+
# Process test file
|
|
428
|
+
import json
|
|
429
|
+
with open('$TEST_FILE') as f:
|
|
430
|
+
data = json.load(f)
|
|
431
|
+
|
|
432
|
+
result = processor.process_conversation(
|
|
433
|
+
conversation_data=data,
|
|
434
|
+
file_path='$TEST_FILE',
|
|
435
|
+
project_path='$TEST_PROJECT'
|
|
436
|
+
)
|
|
437
|
+
print(f'Import result: {result}')
|
|
438
|
+
"
|
|
397
439
|
|
|
398
|
-
# Step 5:
|
|
399
|
-
echo "
|
|
400
|
-
sleep
|
|
440
|
+
# Step 5: Verify actual Voyage collection created
|
|
441
|
+
echo "Verifying Voyage collection..."
|
|
442
|
+
sleep 5
|
|
443
|
+
AFTER_VOYAGE=$(curl -s http://localhost:6333/collections | jq -r '.result.collections[].name' | grep "_voyage" | wc -l)
|
|
401
444
|
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
445
|
+
if [ "$AFTER_VOYAGE" -gt "$BEFORE_VOYAGE" ]; then
|
|
446
|
+
echo "✅ SUCCESS: New Voyage collection created!"
|
|
447
|
+
|
|
448
|
+
# Get the new collection name
|
|
449
|
+
NEW_COL=$(curl -s http://localhost:6333/collections | jq -r '.result.collections[].name' | grep "_voyage" | tail -1)
|
|
450
|
+
|
|
451
|
+
# Verify dimensions
|
|
452
|
+
DIMS=$(curl -s http://localhost:6333/collections/$NEW_COL | jq '.result.config.params.vectors.size')
|
|
453
|
+
POINTS=$(curl -s http://localhost:6333/collections/$NEW_COL | jq '.result.points_count')
|
|
407
454
|
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
455
|
+
echo "Collection: $NEW_COL"
|
|
456
|
+
echo "Dimensions: $DIMS (expected 1024)"
|
|
457
|
+
echo "Points: $POINTS"
|
|
458
|
+
|
|
459
|
+
if [ "$DIMS" = "1024" ] && [ "$POINTS" -gt "0" ]; then
|
|
460
|
+
echo "✅ PASS: Voyage import actually worked!"
|
|
413
461
|
else
|
|
414
|
-
echo "❌ FAIL: Wrong dimensions
|
|
462
|
+
echo "❌ FAIL: Wrong dimensions or no points"
|
|
415
463
|
fi
|
|
416
464
|
else
|
|
417
|
-
echo "❌ FAIL: No
|
|
465
|
+
echo "❌ FAIL: No new Voyage collection created - import didn't work!"
|
|
418
466
|
fi
|
|
419
467
|
|
|
420
|
-
# Step
|
|
421
|
-
echo "
|
|
422
|
-
# Note: MCP must also use PREFER_LOCAL_EMBEDDINGS=false
|
|
423
|
-
|
|
424
|
-
# Step 8: Restore local mode
|
|
425
|
-
echo "5. Restoring local FastEmbed mode..."
|
|
468
|
+
# Step 6: Restore to local mode
|
|
469
|
+
echo "Restoring local mode..."
|
|
426
470
|
export PREFER_LOCAL_EMBEDDINGS=true
|
|
427
|
-
|
|
428
|
-
docker compose --profile watch up -d streaming-importer
|
|
471
|
+
export USE_VOYAGE=false
|
|
429
472
|
|
|
430
|
-
# Step
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
echo "✅ Restored: Found $LOCAL_COLS local collections"
|
|
434
|
-
|
|
435
|
-
# Step 10: Cleanup
|
|
436
|
-
rm -f $TEST_FILE
|
|
437
|
-
cp config/imported-files.json.local-backup config/imported-files.json
|
|
438
|
-
echo "✅ Cloud embedding test complete and restored to local mode"
|
|
473
|
+
# Step 7: Cleanup
|
|
474
|
+
rm -rf $TEST_DIR
|
|
475
|
+
echo "✅ Test complete and cleaned up"
|
|
439
476
|
```
|
|
440
477
|
|
|
478
|
+
### Verification Checklist for Real Imports
|
|
479
|
+
1. **Check Collection Suffix**: `_voyage` for cloud, `_local` for FastEmbed
|
|
480
|
+
2. **Verify Dimensions**: 1024 for Voyage, 384 for FastEmbed
|
|
481
|
+
3. **Count Points**: Must have >0 points for successful import
|
|
482
|
+
4. **Check Logs**: Look for actual embedding API calls
|
|
483
|
+
5. **Verify State File**: Check imported-files.json for record
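
Checklist items 1-3 can also be automated instead of eyeballing `curl | jq` output. A minimal sketch against the same Qdrant REST endpoints used above (`/collections` and `/collections/{name}`); the expected dimensions mirror the 1024/384 values in this protocol:

```python
# Sketch: automate checklist items 1-3 (suffix, dimensions, point count) via Qdrant's REST API.
import requests

QDRANT_URL = "http://localhost:6333"
EXPECTED_DIMS = {"_voyage": 1024, "_local": 384}

def verify_collections() -> bool:
    collections = requests.get(f"{QDRANT_URL}/collections", timeout=5).json()["result"]["collections"]
    ok = True
    for name in (c["name"] for c in collections):
        suffix = next((s for s in EXPECTED_DIMS if name.endswith(s)), None)
        if suffix is None:
            continue  # not a conversation collection
        info = requests.get(f"{QDRANT_URL}/collections/{name}", timeout=5).json()["result"]
        dims = info["config"]["params"]["vectors"]["size"]
        points = info["points_count"]
        good = dims == EXPECTED_DIMS[suffix] and points > 0
        print(f"{'✅' if good else '❌'} {name}: {dims} dims, {points} points")
        ok = ok and good
    return ok

if __name__ == "__main__":
    raise SystemExit(0 if verify_collections() else 1)
```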
|
|
484
|
+
|
|
441
485
|
## Success Criteria
|
|
442
486
|
|
|
443
487
|
### System Functionality
|
package/README.md
CHANGED
|
@@ -108,7 +108,7 @@ See your conversation indexing progress directly in your statusline:
|
|
|
108
108
|
### Active Indexing (50% with backlog)
|
|
109
109
|

|
|
110
110
|
|
|
111
|
-
Works with [Claude Code Statusline](https://github.com/sirmalloc/ccstatusline) - shows progress bars, percentages, and indexing lag in real-time!
|
|
111
|
+
Works with [Claude Code Statusline](https://github.com/sirmalloc/ccstatusline) - shows progress bars, percentages, and indexing lag in real-time! The statusline also displays MCP connection status (✓ Connected) and collection counts (28/29 indexed).
|
|
112
112
|
|
|
113
113
|
## Key Features
|
|
114
114
|
|
|
@@ -2,11 +2,13 @@
|
|
|
2
2
|
|
|
3
3
|
// This is the new Docker-based setup wizard
|
|
4
4
|
// It runs everything in Docker to avoid Python environment issues
|
|
5
|
-
import { fileURLToPath } from 'url';
|
|
5
|
+
import { fileURLToPath, pathToFileURL } from 'url';
|
|
6
6
|
import { dirname, join } from 'path';
|
|
7
7
|
|
|
8
8
|
const __filename = fileURLToPath(import.meta.url);
|
|
9
9
|
const __dirname = dirname(__filename);
|
|
10
10
|
|
|
11
11
|
// Simply forward to the Docker-based wizard
|
|
12
|
-
|
|
12
|
+
// Fix for Windows: Use pathToFileURL for dynamic imports (Issue #51)
|
|
13
|
+
const wizardPath = join(__dirname, 'setup-wizard-docker.js');
|
|
14
|
+
import(pathToFileURL(wizardPath).href);
|
package/mcp-server/src/server.py
CHANGED
|
@@ -143,6 +143,90 @@ mcp = FastMCP(
|
|
|
143
143
|
# Create Qdrant client
|
|
144
144
|
qdrant_client = AsyncQdrantClient(url=QDRANT_URL)
|
|
145
145
|
|
|
146
|
+
# Add MCP Resources for system status
|
|
147
|
+
@mcp.resource("status://import-stats")
|
|
148
|
+
async def get_import_stats():
|
|
149
|
+
"""Current import statistics and progress."""
|
|
150
|
+
await update_indexing_status()
|
|
151
|
+
|
|
152
|
+
return json.dumps({
|
|
153
|
+
"indexed_conversations": indexing_status["indexed_conversations"],
|
|
154
|
+
"total_conversations": indexing_status["total_conversations"],
|
|
155
|
+
"percentage": indexing_status["percentage"],
|
|
156
|
+
"backlog_count": indexing_status["backlog_count"],
|
|
157
|
+
"last_check": datetime.fromtimestamp(indexing_status["last_check"]).isoformat() if indexing_status["last_check"] else None
|
|
158
|
+
}, indent=2)
|
|
159
|
+
|
|
160
|
+
@mcp.resource("status://collection-list")
|
|
161
|
+
async def get_collection_list():
|
|
162
|
+
"""List of all Qdrant collections with metadata."""
|
|
163
|
+
try:
|
|
164
|
+
collections = await qdrant_client.get_collections()
|
|
165
|
+
collection_data = []
|
|
166
|
+
|
|
167
|
+
for collection in collections.collections:
|
|
168
|
+
# Get collection info
|
|
169
|
+
info = await qdrant_client.get_collection(collection_name=collection.name)
|
|
170
|
+
collection_data.append({
|
|
171
|
+
"name": collection.name,
|
|
172
|
+
"points_count": info.points_count,
|
|
173
|
+
"indexed_vectors_count": info.indexed_vectors_count,
|
|
174
|
+
"status": info.status,
|
|
175
|
+
"config": {
|
|
176
|
+
"vector_size": info.config.params.vectors.size if hasattr(info.config.params.vectors, 'size') else 384,
|
|
177
|
+
"distance": str(info.config.params.vectors.distance) if hasattr(info.config.params.vectors, 'distance') else "Cosine"
|
|
178
|
+
}
|
|
179
|
+
})
|
|
180
|
+
|
|
181
|
+
return json.dumps({
|
|
182
|
+
"total_collections": len(collection_data),
|
|
183
|
+
"collections": collection_data
|
|
184
|
+
}, indent=2)
|
|
185
|
+
except Exception as e:
|
|
186
|
+
return json.dumps({"error": str(e)}, indent=2)
|
|
187
|
+
|
|
188
|
+
@mcp.resource("status://system-health")
|
|
189
|
+
async def get_system_health():
|
|
190
|
+
"""System health and configuration information."""
|
|
191
|
+
try:
|
|
192
|
+
# Check Qdrant connectivity
|
|
193
|
+
qdrant_info = await qdrant_client.get_collections()
|
|
194
|
+
qdrant_healthy = True
|
|
195
|
+
qdrant_version = "Connected"
|
|
196
|
+
except:
|
|
197
|
+
qdrant_healthy = False
|
|
198
|
+
qdrant_version = "Disconnected"
|
|
199
|
+
|
|
200
|
+
# Check embedding configuration
|
|
201
|
+
embedding_info = {}
|
|
202
|
+
if embedding_manager:
|
|
203
|
+
embedding_info = {
|
|
204
|
+
"model_type": embedding_manager.model_type,
|
|
205
|
+
"model_name": embedding_manager.model_name,
|
|
206
|
+
"dimension": embedding_manager.dimension
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
return json.dumps({
|
|
210
|
+
"timestamp": datetime.now(timezone.utc).isoformat(),
|
|
211
|
+
"qdrant": {
|
|
212
|
+
"healthy": qdrant_healthy,
|
|
213
|
+
"url": QDRANT_URL,
|
|
214
|
+
"version": qdrant_version
|
|
215
|
+
},
|
|
216
|
+
"embeddings": embedding_info,
|
|
217
|
+
"configuration": {
|
|
218
|
+
"memory_decay_enabled": ENABLE_MEMORY_DECAY,
|
|
219
|
+
"decay_weight": DECAY_WEIGHT,
|
|
220
|
+
"decay_scale_days": DECAY_SCALE_DAYS,
|
|
221
|
+
"prefer_local_embeddings": PREFER_LOCAL_EMBEDDINGS
|
|
222
|
+
},
|
|
223
|
+
"indexing_status": {
|
|
224
|
+
"indexed": indexing_status["indexed_conversations"],
|
|
225
|
+
"total": indexing_status["total_conversations"],
|
|
226
|
+
"percentage": indexing_status["percentage"]
|
|
227
|
+
}
|
|
228
|
+
}, indent=2)
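
The three resources above (`status://import-stats`, `status://collection-list`, `status://system-health`) are read-only views that a client can poll. A rough sketch of reading one of them with the official `mcp` Python SDK; the command and arguments for launching the server are placeholders, and the client API shown is an assumption based on the SDK's documented stdio pattern, not something this package ships:

```python
# Sketch: read the import-stats resource over stdio. Server command/args are placeholders.
import asyncio
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client

async def main() -> None:
    server = StdioServerParameters(command="python", args=["mcp-server/src/server.py"])  # placeholder
    async with stdio_client(server) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            result = await session.read_resource("status://import-stats")
            print(result.contents[0].text)  # JSON produced by get_import_stats()

asyncio.run(main())
```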
|
|
229
|
+
|
|
146
230
|
# Track indexing status (updated periodically)
|
|
147
231
|
indexing_status = {
|
|
148
232
|
"last_check": 0,
|
package/package.json
CHANGED
|
@@ -9,18 +9,27 @@ import os
|
|
|
9
9
|
import sys
|
|
10
10
|
import hashlib
|
|
11
11
|
import gc
|
|
12
|
+
import ast
|
|
13
|
+
import re
|
|
12
14
|
from pathlib import Path
|
|
13
15
|
from datetime import datetime
|
|
14
|
-
from typing import List, Dict, Any, Optional
|
|
16
|
+
from typing import List, Dict, Any, Optional, Set
|
|
15
17
|
import logging
|
|
16
18
|
|
|
17
|
-
# Add the
|
|
18
|
-
|
|
19
|
-
sys.path.insert(0, str(
|
|
19
|
+
# Add the scripts directory to the Python path for utils import
|
|
20
|
+
scripts_dir = Path(__file__).parent
|
|
21
|
+
sys.path.insert(0, str(scripts_dir))
|
|
20
22
|
|
|
21
23
|
from qdrant_client import QdrantClient
|
|
22
24
|
from qdrant_client.models import PointStruct, Distance, VectorParams
|
|
23
25
|
|
|
26
|
+
# Import the correct normalize_project_name from utils
|
|
27
|
+
try:
|
|
28
|
+
from utils import normalize_project_name
|
|
29
|
+
except ImportError as e:
|
|
30
|
+
logging.error(f"Failed to import normalize_project_name from utils: {e}")
|
|
31
|
+
sys.exit(1)
|
|
32
|
+
|
|
24
33
|
# Set up logging
|
|
25
34
|
logging.basicConfig(
|
|
26
35
|
level=logging.INFO,
|
|
@@ -31,6 +40,12 @@ logger = logging.getLogger(__name__)
|
|
|
31
40
|
# Environment variables
|
|
32
41
|
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
|
|
33
42
|
|
|
43
|
+
# Constants for metadata limits
|
|
44
|
+
MAX_CONCEPTS = 10
|
|
45
|
+
MAX_AST_ELEMENTS = 30
|
|
46
|
+
MAX_CODE_BLOCKS = 5
|
|
47
|
+
MAX_ELEMENTS_PER_BLOCK = 10
|
|
48
|
+
|
|
34
49
|
# Robust cross-platform state file resolution
|
|
35
50
|
def get_default_state_file():
|
|
36
51
|
"""Determine the default state file location with cross-platform support."""
|
|
@@ -74,9 +89,11 @@ embedding_dimension = None
|
|
|
74
89
|
if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
|
|
75
90
|
logger.info("Using local embeddings (fastembed)")
|
|
76
91
|
from fastembed import TextEmbedding
|
|
92
|
+
# Using the same model as official Qdrant MCP server
|
|
77
93
|
embedding_provider = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
|
78
94
|
embedding_dimension = 384
|
|
79
95
|
collection_suffix = "local"
|
|
96
|
+
logger.info("Using fastembed model: sentence-transformers/all-MiniLM-L6-v2")
|
|
80
97
|
else:
|
|
81
98
|
logger.info("Using Voyage AI embeddings")
|
|
82
99
|
import voyageai
|
|
@@ -84,15 +101,9 @@ else:
|
|
|
84
101
|
embedding_dimension = 1024
|
|
85
102
|
collection_suffix = "voyage"
|
|
86
103
|
|
|
87
|
-
def normalize_project_name(project_name: str) -> str:
|
|
88
|
-
"""Normalize project name for consistency."""
|
|
89
|
-
# For compatibility with delta-metadata-update, just use the project name as-is
|
|
90
|
-
# This ensures collection names match between import and delta update scripts
|
|
91
|
-
return project_name
|
|
92
|
-
|
|
93
104
|
def get_collection_name(project_path: Path) -> str:
|
|
94
105
|
"""Generate collection name from project path."""
|
|
95
|
-
normalized = normalize_project_name(project_path
|
|
106
|
+
normalized = normalize_project_name(str(project_path))
|
|
96
107
|
name_hash = hashlib.md5(normalized.encode()).hexdigest()[:8]
|
|
97
108
|
return f"conv_{name_hash}_{collection_suffix}"
|
|
98
109
|
|
|
@@ -118,18 +129,23 @@ def generate_embeddings(texts: List[str]) -> List[List[float]]:
|
|
|
118
129
|
def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
|
|
119
130
|
conversation_id: str, created_at: str,
|
|
120
131
|
metadata: Dict[str, Any], collection_name: str,
|
|
121
|
-
project_path: Path) -> int:
|
|
132
|
+
project_path: Path, total_messages: int) -> int:
|
|
122
133
|
"""Process and immediately upload a single chunk."""
|
|
123
134
|
if not messages:
|
|
124
135
|
return 0
|
|
125
136
|
|
|
126
|
-
# Extract text content
|
|
137
|
+
# Extract text content and message indices
|
|
127
138
|
texts = []
|
|
139
|
+
message_indices = []
|
|
128
140
|
for msg in messages:
|
|
129
141
|
role = msg.get("role", "unknown")
|
|
130
142
|
content = msg.get("content", "")
|
|
131
143
|
if content:
|
|
132
144
|
texts.append(f"{role.upper()}: {content}")
|
|
145
|
+
# Fix: Check for None instead of truthiness to include 0 values
|
|
146
|
+
idx = msg.get("message_index")
|
|
147
|
+
if idx is not None:
|
|
148
|
+
message_indices.append(idx)
|
|
133
149
|
|
|
134
150
|
if not texts:
|
|
135
151
|
return 0
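
The `is not None` check introduced a few lines above matters because message index 0 is falsy in Python: a plain truthiness test would silently drop the first message's index. A two-line illustration of the difference:

```python
# Why `is not None` instead of truthiness: index 0 must be kept.
indices = [0, 1, None, 2]
print([i for i in indices if i])              # [1, 2]      - drops the legitimate 0
print([i for i in indices if i is not None])  # [0, 1, 2]   - keeps it
```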
|
|
@@ -140,6 +156,29 @@ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
|
|
|
140
156
|
# Generate embedding
|
|
141
157
|
embeddings = generate_embeddings([chunk_text])
|
|
142
158
|
|
|
159
|
+
# Sanity check embeddings
|
|
160
|
+
if not embeddings or not embeddings[0]:
|
|
161
|
+
logger.error(f"Empty embedding generated for chunk {chunk_index}")
|
|
162
|
+
return 0
|
|
163
|
+
|
|
164
|
+
embedding = embeddings[0]
|
|
165
|
+
|
|
166
|
+
# Check for degenerate embeddings (all values identical)
|
|
167
|
+
if len(set(embedding)) == 1:
|
|
168
|
+
logger.error(f"Degenerate embedding detected (all values identical): {embedding[0]}")
|
|
169
|
+
return 0
|
|
170
|
+
|
|
171
|
+
# Check variance is above threshold
|
|
172
|
+
import statistics
|
|
173
|
+
variance = statistics.variance(embedding)
|
|
174
|
+
if variance < 1e-6:
|
|
175
|
+
logger.warning(f"Low variance embedding detected: {variance}")
|
|
176
|
+
|
|
177
|
+
# Validate dimension
|
|
178
|
+
if len(embedding) != embedding_dimension:
|
|
179
|
+
logger.error(f"Embedding dimension mismatch: expected {embedding_dimension}, got {len(embedding)}")
|
|
180
|
+
return 0
|
|
181
|
+
|
|
143
182
|
# Create point ID
|
|
144
183
|
point_id = hashlib.md5(
|
|
145
184
|
f"{conversation_id}_{chunk_index}".encode()
|
|
@@ -151,9 +190,12 @@ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
|
|
|
151
190
|
"conversation_id": conversation_id,
|
|
152
191
|
"chunk_index": chunk_index,
|
|
153
192
|
"timestamp": created_at,
|
|
154
|
-
"project": normalize_project_name(project_path
|
|
193
|
+
"project": normalize_project_name(str(project_path)),
|
|
155
194
|
"start_role": messages[0].get("role", "unknown") if messages else "unknown",
|
|
156
|
-
"message_count": len(messages)
|
|
195
|
+
"message_count": len(messages),
|
|
196
|
+
"total_messages": total_messages,
|
|
197
|
+
"message_index": message_indices[0] if message_indices else 0,
|
|
198
|
+
"message_indices": message_indices # Store all indices in this chunk
|
|
157
199
|
}
|
|
158
200
|
|
|
159
201
|
# Add metadata
|
|
@@ -180,16 +222,84 @@ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
|
|
|
180
222
|
logger.error(f"Error processing chunk {chunk_index}: {e}")
|
|
181
223
|
return 0
|
|
182
224
|
|
|
183
|
-
def
|
|
184
|
-
"""Extract
|
|
225
|
+
def extract_ast_elements(code_text: str) -> Set[str]:
|
|
226
|
+
"""Extract function and class names from code using AST parsing."""
|
|
227
|
+
elements = set()
|
|
228
|
+
|
|
229
|
+
# Try to parse as Python code
|
|
230
|
+
try:
|
|
231
|
+
tree = ast.parse(code_text)
|
|
232
|
+
for node in ast.walk(tree):
|
|
233
|
+
if isinstance(node, ast.FunctionDef):
|
|
234
|
+
elements.add(f"func:{node.name}")
|
|
235
|
+
elif isinstance(node, ast.AsyncFunctionDef):
|
|
236
|
+
elements.add(f"func:{node.name}")
|
|
237
|
+
elif isinstance(node, ast.ClassDef):
|
|
238
|
+
elements.add(f"class:{node.name}")
|
|
239
|
+
except SyntaxError:
|
|
240
|
+
# Python regex fallback for partial fragments
|
|
241
|
+
for m in re.finditer(r'^\s*def\s+([A-Za-z_]\w*)\s*\(', code_text, re.MULTILINE):
|
|
242
|
+
elements.add(f"func:{m.group(1)}")
|
|
243
|
+
for m in re.finditer(r'^\s*async\s+def\s+([A-Za-z_]\w*)\s*\(', code_text, re.MULTILINE):
|
|
244
|
+
elements.add(f"func:{m.group(1)}")
|
|
245
|
+
for m in re.finditer(r'^\s*class\s+([A-Za-z_]\w*)\s*[:\(]', code_text, re.MULTILINE):
|
|
246
|
+
elements.add(f"class:{m.group(1)}")
|
|
247
|
+
except Exception as e:
|
|
248
|
+
logger.debug(f"Unexpected error parsing AST: {e}")
|
|
249
|
+
|
|
250
|
+
# Try regex patterns for other languages
|
|
251
|
+
# JavaScript/TypeScript functions
|
|
252
|
+
js_func_pattern = r'(?:function|const|let|var)\s+(\w+)\s*(?:=\s*)?(?:\([^)]*\)|\s*=>)'
|
|
253
|
+
for match in re.finditer(js_func_pattern, code_text):
|
|
254
|
+
elements.add(f"func:{match.group(1)}")
|
|
255
|
+
|
|
256
|
+
# Class definitions (multiple languages)
|
|
257
|
+
class_pattern = r'(?:class|interface|struct)\s+(\w+)'
|
|
258
|
+
for match in re.finditer(class_pattern, code_text):
|
|
259
|
+
elements.add(f"class:{match.group(1)}")
|
|
260
|
+
|
|
261
|
+
return elements
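
To make the fallback behaviour above concrete, here is a small, self-contained exercise of the same idea: valid Python goes through `ast`, while a truncated fragment falls through to the regexes. It mirrors the patterns above rather than importing from the package:

```python
# Sketch: the same extract-names idea on two inputs, parsable Python and a truncated fragment.
import ast
import re

GOOD = "class Importer:\n    def run(self):\n        pass\n"
TRUNCATED = "async def stream_import_file(jsonl_file, collection_name"  # no body -> SyntaxError

def names(code: str) -> set[str]:
    found = set()
    try:
        for node in ast.walk(ast.parse(code)):
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                found.add(f"func:{node.name}")
            elif isinstance(node, ast.ClassDef):
                found.add(f"class:{node.name}")
    except SyntaxError:
        for m in re.finditer(r'^\s*(?:async\s+)?def\s+([A-Za-z_]\w*)\s*\(', code, re.MULTILINE):
            found.add(f"func:{m.group(1)}")
        for m in re.finditer(r'^\s*class\s+([A-Za-z_]\w*)\s*[:\(]', code, re.MULTILINE):
            found.add(f"class:{m.group(1)}")
    return found

print(names(GOOD))       # {'class:Importer', 'func:run'}
print(names(TRUNCATED))  # {'func:stream_import_file'}
```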
|
|
262
|
+
|
|
263
|
+
def extract_concepts(text: str) -> List[str]:
|
|
264
|
+
"""Extract development concepts from text."""
|
|
265
|
+
concepts = []
|
|
266
|
+
concept_patterns = {
|
|
267
|
+
'docker': r'\b(?:docker|container|compose|dockerfile)\b',
|
|
268
|
+
'testing': r'\b(?:test|testing|unittest|pytest|jest)\b',
|
|
269
|
+
'database': r'\b(?:database|sql|postgres|mysql|mongodb|qdrant)\b',
|
|
270
|
+
'api': r'\b(?:api|rest|graphql|endpoint)\b',
|
|
271
|
+
'security': r'\b(?:security|auth|authentication|encryption)\b',
|
|
272
|
+
'performance': r'\b(?:performance|optimization|cache|speed)\b',
|
|
273
|
+
'debugging': r'\b(?:debug|debugging|error|bug|trace)\b',
|
|
274
|
+
'deployment': r'\b(?:deploy|deployment|ci\/cd|production)\b',
|
|
275
|
+
'git': r'\b(?:git|commit|branch|merge|pull request)\b',
|
|
276
|
+
'mcp': r'\b(?:mcp|claude-self-reflect|claude code)\b',
|
|
277
|
+
'embeddings': r'\b(?:embedding|vector|semantic|similarity)\b',
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
text_lower = text.lower()
|
|
281
|
+
for concept, pattern in concept_patterns.items():
|
|
282
|
+
if re.search(pattern, text_lower, re.IGNORECASE):
|
|
283
|
+
if concept not in concepts:
|
|
284
|
+
concepts.append(concept)
|
|
285
|
+
|
|
286
|
+
return concepts[:MAX_CONCEPTS]
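
A quick way to sanity-check the concept patterns above is to run them against a sample sentence and confirm the expected tags come back. A minimal sketch with two of the regexes copied verbatim; the full table lives in the function above:

```python
# Sketch: exercise two of the concept regexes against a sample sentence.
import re

patterns = {
    "docker": r"\b(?:docker|container|compose|dockerfile)\b",
    "embeddings": r"\b(?:embedding|vector|semantic|similarity)\b",
}

text = "Rebuilt the Docker compose stack and re-ran the embedding import."
hits = [name for name, pat in patterns.items() if re.search(pat, text.lower(), re.IGNORECASE)]
print(hits)  # ['docker', 'embeddings']
```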
|
|
287
|
+
|
|
288
|
+
def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str, int]:
|
|
289
|
+
"""Extract metadata in a single pass, return metadata, first timestamp, and message count."""
|
|
185
290
|
metadata = {
|
|
186
291
|
"files_analyzed": [],
|
|
187
292
|
"files_edited": [],
|
|
188
293
|
"tools_used": [],
|
|
189
|
-
"concepts": []
|
|
294
|
+
"concepts": [],
|
|
295
|
+
"ast_elements": [],
|
|
296
|
+
"has_code_blocks": False,
|
|
297
|
+
"total_messages": 0
|
|
190
298
|
}
|
|
191
299
|
|
|
192
300
|
first_timestamp = None
|
|
301
|
+
message_count = 0
|
|
302
|
+
all_text = []
|
|
193
303
|
|
|
194
304
|
try:
|
|
195
305
|
with open(file_path, 'r', encoding='utf-8') as f:
|
|
@@ -204,53 +314,107 @@ def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str]:
|
|
|
204
314
|
if first_timestamp is None and 'timestamp' in data:
|
|
205
315
|
first_timestamp = data.get('timestamp')
|
|
206
316
|
|
|
207
|
-
#
|
|
317
|
+
# Count messages
|
|
208
318
|
if 'message' in data and data['message']:
|
|
209
319
|
msg = data['message']
|
|
320
|
+
if msg.get('role') in ['user', 'assistant']:
|
|
321
|
+
message_count += 1
|
|
322
|
+
|
|
210
323
|
if msg.get('content'):
|
|
211
324
|
content = msg['content']
|
|
325
|
+
text_content = ""
|
|
326
|
+
|
|
212
327
|
if isinstance(content, list):
|
|
213
328
|
for item in content:
|
|
214
|
-
if isinstance(item, dict)
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
329
|
+
if isinstance(item, dict):
|
|
330
|
+
if item.get('type') == 'text':
|
|
331
|
+
text_content += item.get('text', '')
|
|
332
|
+
# Check for code blocks
|
|
333
|
+
if '```' in item.get('text', ''):
|
|
334
|
+
metadata['has_code_blocks'] = True
|
|
335
|
+
# Extract code for AST analysis with bounds checking
|
|
336
|
+
if len(metadata['ast_elements']) < 30:
|
|
337
|
+
# Fix: More permissive regex to handle various fence formats
|
|
338
|
+
code_blocks = re.findall(r'```[^\n]*\n?(.*?)```', item.get('text', ''), re.DOTALL)
|
|
339
|
+
for code_block in code_blocks[:5]: # Limit to 5 blocks
|
|
340
|
+
if len(metadata['ast_elements']) >= 30:
|
|
341
|
+
break
|
|
342
|
+
ast_elems = extract_ast_elements(code_block)
|
|
343
|
+
for elem in list(ast_elems)[:10]: # Limit elements per block
|
|
344
|
+
if elem not in metadata['ast_elements'] and len(metadata['ast_elements']) < 30:
|
|
345
|
+
metadata['ast_elements'].append(elem)
|
|
218
346
|
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
if
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
347
|
+
elif item.get('type') == 'tool_use':
|
|
348
|
+
tool_name = item.get('name', '')
|
|
349
|
+
if tool_name and tool_name not in metadata['tools_used']:
|
|
350
|
+
metadata['tools_used'].append(tool_name)
|
|
351
|
+
|
|
352
|
+
# Extract file references
|
|
353
|
+
if 'input' in item:
|
|
354
|
+
input_data = item['input']
|
|
355
|
+
if isinstance(input_data, dict):
|
|
356
|
+
# Determine if it's an edit tool
|
|
357
|
+
is_edit = tool_name in ['Edit', 'Write', 'MultiEdit', 'NotebookEdit']
|
|
358
|
+
|
|
359
|
+
if 'file_path' in input_data:
|
|
360
|
+
file_ref = input_data['file_path']
|
|
361
|
+
if is_edit:
|
|
362
|
+
if file_ref not in metadata['files_edited']:
|
|
363
|
+
metadata['files_edited'].append(file_ref)
|
|
364
|
+
else:
|
|
365
|
+
if file_ref not in metadata['files_analyzed']:
|
|
366
|
+
metadata['files_analyzed'].append(file_ref)
|
|
367
|
+
|
|
368
|
+
if 'path' in input_data:
|
|
369
|
+
file_ref = input_data['path']
|
|
370
|
+
if file_ref not in metadata['files_analyzed']:
|
|
371
|
+
metadata['files_analyzed'].append(file_ref)
|
|
372
|
+
elif isinstance(item, str):
|
|
373
|
+
text_content += item
|
|
374
|
+
elif isinstance(content, str):
|
|
375
|
+
text_content = content
|
|
376
|
+
|
|
377
|
+
# Collect text for concept extraction
|
|
378
|
+
if text_content:
|
|
379
|
+
all_text.append(text_content[:1000]) # Limit text per message
|
|
231
380
|
|
|
232
381
|
except json.JSONDecodeError:
|
|
233
382
|
continue
|
|
234
383
|
except Exception:
|
|
235
384
|
continue
|
|
236
|
-
|
|
385
|
+
|
|
237
386
|
except Exception as e:
|
|
238
387
|
logger.warning(f"Error extracting metadata: {e}")
|
|
239
388
|
|
|
240
|
-
|
|
389
|
+
# Extract concepts from collected text
|
|
390
|
+
if all_text:
|
|
391
|
+
combined_text = ' '.join(all_text[:50]) # Limit to first 50 messages
|
|
392
|
+
metadata['concepts'] = extract_concepts(combined_text)
|
|
393
|
+
|
|
394
|
+
# Set total messages
|
|
395
|
+
metadata['total_messages'] = message_count
|
|
396
|
+
|
|
397
|
+
# Limit arrays
|
|
398
|
+
metadata['files_analyzed'] = metadata['files_analyzed'][:20]
|
|
399
|
+
metadata['files_edited'] = metadata['files_edited'][:20]
|
|
400
|
+
metadata['tools_used'] = metadata['tools_used'][:15]
|
|
401
|
+
metadata['ast_elements'] = metadata['ast_elements'][:30]
|
|
402
|
+
|
|
403
|
+
return metadata, first_timestamp or datetime.now().isoformat(), message_count
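
The function now returns a three-tuple, so callers (see `stream_import_file` below) unpack the metadata, the first timestamp, and the message count in one go. A hedged sketch of the JSONL shape it expects and how the result is consumed; the module name in the import is hypothetical (this diff does not show the file's real name), and the sample content is made up for illustration:

```python
# Sketch: what a caller of extract_metadata_single_pass() sees. Sample JSONL is illustrative.
import json
import tempfile

from streaming_importer import extract_metadata_single_pass  # hypothetical module name

sample = {
    "timestamp": "2025-09-08T00:00:00Z",
    "message": {"role": "user", "content": "Debugging the docker compose import"},
}
with tempfile.NamedTemporaryFile("w", suffix=".jsonl", delete=False) as f:
    f.write(json.dumps(sample) + "\n")
    path = f.name  # cleanup of the temp file omitted for brevity

metadata, created_at, total_messages = extract_metadata_single_pass(path)
print(total_messages)        # 1
print(metadata["concepts"])  # e.g. ['docker', 'debugging']
print(created_at)            # '2025-09-08T00:00:00Z'
```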
|
|
241
404
|
|
|
242
405
|
def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Path) -> int:
|
|
243
406
|
"""Stream import a single JSONL file without loading it into memory."""
|
|
244
407
|
logger.info(f"Streaming import of {jsonl_file.name}")
|
|
245
408
|
|
|
246
409
|
# Extract metadata in first pass (lightweight)
|
|
247
|
-
metadata, created_at = extract_metadata_single_pass(str(jsonl_file))
|
|
410
|
+
metadata, created_at, total_messages = extract_metadata_single_pass(str(jsonl_file))
|
|
248
411
|
|
|
249
412
|
# Stream messages and process in chunks
|
|
250
413
|
chunk_buffer = []
|
|
251
414
|
chunk_index = 0
|
|
252
415
|
total_chunks = 0
|
|
253
416
|
conversation_id = jsonl_file.stem
|
|
417
|
+
current_message_index = 0
|
|
254
418
|
|
|
255
419
|
try:
|
|
256
420
|
with open(jsonl_file, 'r', encoding='utf-8') as f:
|
|
@@ -282,16 +446,24 @@ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Pat
|
|
|
282
446
|
content = '\n'.join(text_parts)
|
|
283
447
|
|
|
284
448
|
if content:
|
|
449
|
+
# Track message index for user/assistant messages
|
|
450
|
+
if msg['role'] in ['user', 'assistant']:
|
|
451
|
+
current_message_index += 1
|
|
452
|
+
message_idx = current_message_index
|
|
453
|
+
else:
|
|
454
|
+
message_idx = 0
|
|
455
|
+
|
|
285
456
|
chunk_buffer.append({
|
|
286
457
|
'role': msg['role'],
|
|
287
|
-
'content': content
|
|
458
|
+
'content': content,
|
|
459
|
+
'message_index': message_idx
|
|
288
460
|
})
|
|
289
461
|
|
|
290
462
|
# Process chunk when buffer reaches MAX_CHUNK_SIZE
|
|
291
463
|
if len(chunk_buffer) >= MAX_CHUNK_SIZE:
|
|
292
464
|
chunks = process_and_upload_chunk(
|
|
293
465
|
chunk_buffer, chunk_index, conversation_id,
|
|
294
|
-
created_at, metadata, collection_name, project_path
|
|
466
|
+
created_at, metadata, collection_name, project_path, total_messages
|
|
295
467
|
)
|
|
296
468
|
total_chunks += chunks
|
|
297
469
|
chunk_buffer = []
|
|
@@ -313,7 +485,7 @@ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Pat
|
|
|
313
485
|
if chunk_buffer:
|
|
314
486
|
chunks = process_and_upload_chunk(
|
|
315
487
|
chunk_buffer, chunk_index, conversation_id,
|
|
316
|
-
created_at, metadata, collection_name, project_path
|
|
488
|
+
created_at, metadata, collection_name, project_path, total_messages
|
|
317
489
|
)
|
|
318
490
|
total_chunks += chunks
|
|
319
491
|
|
|
@@ -335,10 +507,19 @@ def load_state() -> dict:
|
|
|
335
507
|
return {"imported_files": {}}
|
|
336
508
|
|
|
337
509
|
def save_state(state: dict):
|
|
338
|
-
"""Save import state."""
|
|
339
|
-
|
|
340
|
-
|
|
510
|
+
"""Save import state with atomic write."""
|
|
511
|
+
# Fix: Handle case where STATE_FILE has no directory component
|
|
512
|
+
state_dir = os.path.dirname(STATE_FILE)
|
|
513
|
+
if state_dir:
|
|
514
|
+
os.makedirs(state_dir, exist_ok=True)
|
|
515
|
+
|
|
516
|
+
# Use atomic write to prevent corruption during crashes
|
|
517
|
+
temp_file = f"{STATE_FILE}.tmp"
|
|
518
|
+
with open(temp_file, 'w') as f:
|
|
341
519
|
json.dump(state, f, indent=2)
|
|
520
|
+
|
|
521
|
+
# Atomic rename (on POSIX systems)
|
|
522
|
+
os.replace(temp_file, STATE_FILE)
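
The write-to-temp-then-`os.replace` pattern above is what makes the state file crash-safe: the rename swaps the file in a single step, so readers see either the old state or the new one, never a half-written file. A generic sketch of the same pattern as a reusable helper (an illustration, not a helper this package ships); an `fsync` before the rename is a common extra precaution:

```python
# Sketch: generic atomic JSON write, same temp-file + os.replace pattern as save_state().
import json
import os

def atomic_write_json(path: str, data: dict) -> None:
    tmp = f"{path}.tmp"
    with open(tmp, "w") as f:
        json.dump(data, f, indent=2)
        f.flush()
        os.fsync(f.fileno())  # push bytes to disk before the rename
    os.replace(tmp, path)     # atomic on POSIX; also replaces an existing file on Windows

atomic_write_json("imported-files.json", {"imported_files": {}})
```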
|
|
342
523
|
|
|
343
524
|
def should_import_file(file_path: Path, state: dict) -> bool:
|
|
344
525
|
"""Check if file should be imported."""
|