claude-self-reflect 3.0.1 → 3.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,13 +12,22 @@ You are a resilient and comprehensive testing specialist for Claude Self-Reflect
12
12
  - Streaming importer maintains <50MB memory usage while processing imports every 60s
13
13
  - MCP tools enable reflection and memory storage
14
14
  - System must handle sensitive API keys securely
15
+ - Modular importer architecture in `scripts/importer/` package
16
+ - Voyage API key read from `.env` file automatically
17
+
18
+ ## CRITICAL Testing Protocol
19
+ 1. **Test Local Mode First** - Ensure all functionality works with FastEmbed
20
+ 2. **Test Cloud Mode** - Switch to Voyage AI and validate
21
+ 3. **RESTORE TO LOCAL** - The machine MUST be left in a 100% local state after testing
22
+ 4. **Certify Both Modes** - Only proceed to release if both modes pass
23
+ 5. **NO Model Changes** - Use sentence-transformers/all-MiniLM-L6-v2 (384 dims) for local
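
A minimal pre-flight sketch for steps 1 and 5, assuming `fastembed` is installed in the project environment (the importer loads the same model):

```python
# Pre-flight check: confirm the local FastEmbed model loads and emits
# 384-dimensional vectors before certifying local mode. Assumes fastembed
# is installed; the model name matches the importer's configuration.
from fastembed import TextEmbedding

model = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector = next(iter(model.embed(["local mode smoke test"])))
assert len(vector) == 384, f"unexpected dimension: {len(vector)}"
print(f"Local embedding model OK ({len(vector)} dims)")
```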
15
24
 
16
25
  ## Comprehensive Test Suite
17
26
 
18
27
  ### Available Test Categories
19
- The project now includes a comprehensive test suite in `/tests/` directory:
28
+ The project includes a well-organized test suite under `tests/`:
20
29
 
21
- 1. **MCP Tool Integration** (`test_mcp_tools_comprehensive.py`)
30
+ 1. **MCP Tool Integration** (`tests/integration/test_mcp_tools.py`)
22
31
  - All MCP tools with various parameters
23
32
  - Edge cases and error handling
24
33
  - Cross-project search validation
@@ -67,21 +76,20 @@ The project now includes a comprehensive test suite in `/tests/` directory:
67
76
  ```bash
68
77
  # Run ALL tests
69
78
  cd ~/projects/claude-self-reflect
70
- python tests/run_all_tests.py
79
+ python -m pytest tests/
71
80
 
72
- # Run specific categories
73
- python tests/run_all_tests.py -c mcp_tools memory_decay multi_project
81
+ # Run specific test categories
82
+ python -m pytest tests/integration/
83
+ python -m pytest tests/unit/
84
+ python -m pytest tests/performance/
74
85
 
75
86
  # Run with verbose output
76
- python tests/run_all_tests.py -v
77
-
78
- # List available test categories
79
- python tests/run_all_tests.py --list
87
+ python -m pytest tests/ -v
80
88
 
81
89
  # Run individual test files
82
- python tests/test_mcp_tools_comprehensive.py
83
- python tests/test_memory_decay.py
84
- python tests/test_multi_project.py
90
+ python tests/integration/test_mcp_tools.py
91
+ python tests/integration/test_collection_naming.py
92
+ python tests/integration/test_system_integration.py
85
93
  ```
86
94
 
87
95
  ### Test Results Location
@@ -367,77 +375,113 @@ if [ -n "$VOYAGE_KEY" ]; then
367
375
  fi
368
376
  ```
369
377
 
370
- ### Complete Cloud Embedding Test with Backup/Restore
378
+ ### CRITICAL: Verify Actual Imports (Not Just API Connection!)
371
379
  ```bash
372
- echo "=== Testing Cloud Embeddings (Voyage AI) with Full Backup ==="
373
-
374
- # Step 1: Backup current state
375
- echo "1. Backing up current local environment..."
376
- docker exec claude-reflection-qdrant qdrant-backup create /qdrant/backup/local-backup-$(date +%s) 2>/dev/null || echo "Backup command not available"
377
- cp config/imported-files.json config/imported-files.json.local-backup
378
- echo "Current embedding mode: ${PREFER_LOCAL_EMBEDDINGS:-true}"
379
-
380
- # Step 2: Check prerequisites
381
- if [ -z "$VOYAGE_KEY" ]; then
382
- echo "⚠️ WARNING: VOYAGE_KEY not set"
383
- echo "To test cloud mode, set: export VOYAGE_KEY='your-key'"
384
- echo "Skipping cloud test..."
385
- exit 0
380
+ echo "=== REAL Cloud Embedding Import Test ==="
381
+
382
+ # Step 1: Verify prerequisites
383
+ if [ ! -f .env ] || [ -z "$(grep VOYAGE_KEY .env)" ]; then
384
+ echo "❌ FAIL: No VOYAGE_KEY in .env file"
385
+ exit 1
386
386
  fi
387
387
 
388
- # Step 3: Switch to cloud mode
389
- echo "2. Switching to Voyage AI cloud embeddings..."
388
+ # Extract API key
389
+ export VOYAGE_KEY=$(grep VOYAGE_KEY .env | cut -d= -f2)
390
+ echo "✅ Found Voyage key: ${VOYAGE_KEY:0:10}..."
391
+
392
+ # Step 2: Count existing collections before test
393
+ BEFORE_LOCAL=$(curl -s http://localhost:6333/collections | jq -r '.result.collections[].name' | grep "_local" | wc -l)
394
+ BEFORE_VOYAGE=$(curl -s http://localhost:6333/collections | jq -r '.result.collections[].name' | grep "_voyage" | wc -l)
395
+ echo "Before: $BEFORE_LOCAL local, $BEFORE_VOYAGE voyage collections"
396
+
397
+ # Step 3: Create NEW test conversation for import
398
+ TEST_PROJECT="test-voyage-$(date +%s)"
399
+ TEST_DIR=~/.claude/projects/$TEST_PROJECT
400
+ mkdir -p $TEST_DIR
401
+ TEST_FILE=$TEST_DIR/voyage-test.jsonl
402
+
403
+ cat > $TEST_FILE << 'EOF'
404
+ {"type":"conversation","uuid":"voyage-test-001","name":"Voyage Import Test","messages":[{"role":"human","content":"Testing actual Voyage AI import"},{"role":"assistant","content":[{"type":"text","text":"This should create a real Voyage collection with 1024-dim vectors"}]}],"conversation_id":"voyage-test-001","created_at":"2025-09-08T00:00:00Z"}
405
+ EOF
406
+
407
+ echo "✅ Created test file: $TEST_FILE"
408
+
409
+ # Step 4: Switch to Voyage mode and import
410
+ echo "Switching to Voyage mode..."
390
411
  export PREFER_LOCAL_EMBEDDINGS=false
391
- docker compose --profile watch stop streaming-importer
392
- docker compose --profile watch up -d streaming-importer
412
+ export USE_VOYAGE=true
393
413
 
394
- # Step 4: Create test conversation
395
- TEST_FILE=~/.claude/projects/claude-self-reflect/cloud-test-$(date +%s).jsonl
396
- echo '{"type":"conversation","uuid":"cloud-test-'$(date +%s)'","name":"Cloud Embedding Test","messages":[{"role":"human","content":"Testing Voyage AI cloud embeddings for v2.5.0"},{"role":"assistant","content":[{"type":"text","text":"This tests 1024-dimensional vectors with Voyage AI"}]}]}' > $TEST_FILE
414
+ # Run import directly with modular importer
415
+ cd ~/projects/claude-self-reflect
416
+ source venv/bin/activate
417
+ python -c "
418
+ import os
419
+ os.environ['VOYAGE_KEY'] = '$VOYAGE_KEY'
420
+ os.environ['PREFER_LOCAL_EMBEDDINGS'] = 'false'
421
+ os.environ['USE_VOYAGE'] = 'true'
422
+
423
+ from scripts.importer.main import ImporterContainer
424
+ container = ImporterContainer()
425
+ processor = container.processor()
426
+
427
+ # Process test file
428
+ import json
429
+ with open('$TEST_FILE') as f:
430
+ data = json.load(f)
431
+
432
+ result = processor.process_conversation(
433
+ conversation_data=data,
434
+ file_path='$TEST_FILE',
435
+ project_path='$TEST_PROJECT'
436
+ )
437
+ print(f'Import result: {result}')
438
+ "
397
439
 
398
- # Step 5: Wait for import and verify
399
- echo "3. Waiting for cloud import cycle (70s)..."
400
- sleep 70
440
+ # Step 5: Verify actual Voyage collection created
441
+ echo "Verifying Voyage collection..."
442
+ sleep 5
443
+ AFTER_VOYAGE=$(curl -s http://localhost:6333/collections | jq -r '.result.collections[].name' | grep "_voyage" | wc -l)
401
444
 
402
- # Step 6: Verify cloud collection created
403
- CLOUD_COLS=$(curl -s http://localhost:6333/collections | jq -r '.result.collections[].name' | grep "_voyage")
404
- if [ -n "$CLOUD_COLS" ]; then
405
- echo "✅ PASS: Cloud collections created:"
406
- echo "$CLOUD_COLS"
445
+ if [ "$AFTER_VOYAGE" -gt "$BEFORE_VOYAGE" ]; then
446
+ echo "✅ SUCCESS: New Voyage collection created!"
447
+
448
+ # Get the new collection name
449
+ NEW_COL=$(curl -s http://localhost:6333/collections | jq -r '.result.collections[].name' | grep "_voyage" | tail -1)
450
+
451
+ # Verify dimensions
452
+ DIMS=$(curl -s http://localhost:6333/collections/$NEW_COL | jq '.result.config.params.vectors.size')
453
+ POINTS=$(curl -s http://localhost:6333/collections/$NEW_COL | jq '.result.points_count')
407
454
 
408
- # Check vector dimensions
409
- FIRST_COL=$(echo "$CLOUD_COLS" | head -1)
410
- DIMS=$(curl -s http://localhost:6333/collections/$FIRST_COL | jq '.result.config.params.vectors.size')
411
- if [ "$DIMS" = "1024" ]; then
412
- echo " PASS: Correct dimensions (1024) for Voyage AI"
455
+ echo "Collection: $NEW_COL"
456
+ echo "Dimensions: $DIMS (expected 1024)"
457
+ echo "Points: $POINTS"
458
+
459
+ if [ "$DIMS" = "1024" ] && [ "$POINTS" -gt "0" ]; then
460
+ echo "✅ PASS: Voyage import actually worked!"
413
461
  else
414
- echo "❌ FAIL: Wrong dimensions: $DIMS (expected 1024)"
462
+ echo "❌ FAIL: Wrong dimensions or no points"
415
463
  fi
416
464
  else
417
- echo "❌ FAIL: No cloud collections found"
465
+ echo "❌ FAIL: No new Voyage collection created - import didn't work!"
418
466
  fi
419
467
 
420
- # Step 7: Test MCP with cloud embeddings
421
- echo "4. Testing MCP search with cloud embeddings..."
422
- # Note: MCP must also use PREFER_LOCAL_EMBEDDINGS=false
423
-
424
- # Step 8: Restore local mode
425
- echo "5. Restoring local FastEmbed mode..."
468
+ # Step 6: Restore to local mode
469
+ echo "Restoring local mode..."
426
470
  export PREFER_LOCAL_EMBEDDINGS=true
427
- docker compose --profile watch stop streaming-importer
428
- docker compose --profile watch up -d streaming-importer
471
+ export USE_VOYAGE=false
429
472
 
430
- # Step 9: Verify restoration
431
- sleep 10
432
- LOCAL_COLS=$(curl -s http://localhost:6333/collections | jq -r '.result.collections[].name' | grep "_local" | wc -l)
433
- echo "✅ Restored: Found $LOCAL_COLS local collections"
434
-
435
- # Step 10: Cleanup
436
- rm -f $TEST_FILE
437
- cp config/imported-files.json.local-backup config/imported-files.json
438
- echo "✅ Cloud embedding test complete and restored to local mode"
473
+ # Step 7: Cleanup
474
+ rm -rf $TEST_DIR
475
+ echo "✅ Test complete and cleaned up"
439
476
  ```
440
477
 
478
+ ### Verification Checklist for Real Imports
479
+ 1. **Check Collection Suffix**: `_voyage` for cloud, `_local` for FastEmbed
480
+ 2. **Verify Dimensions**: 1024 for Voyage, 384 for FastEmbed
481
+ 3. **Count Points**: Must have >0 points for successful import
482
+ 4. **Check Logs**: Look for actual embedding API calls
483
+ 5. **Verify State File**: Check imported-files.json for record
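
A minimal sketch automating checks 1-3 against the local Qdrant instance; it reuses the `qdrant_client` accessors already used by the MCP server and assumes the default single (unnamed) vector configuration:

```python
# Verify collection suffix, vector dimensions, and point counts (checks 1-3).
# Assumes Qdrant is reachable on localhost:6333 with unnamed vector configs.
from qdrant_client import QdrantClient

EXPECTED_DIMS = {"_voyage": 1024, "_local": 384}
client = QdrantClient(url="http://localhost:6333")

for collection in client.get_collections().collections:
    suffix = next((s for s in EXPECTED_DIMS if collection.name.endswith(s)), None)
    if suffix is None:
        continue  # not a conversation collection
    info = client.get_collection(collection_name=collection.name)
    dims = info.config.params.vectors.size
    points = info.points_count
    ok = dims == EXPECTED_DIMS[suffix] and points > 0
    print(f"{'OK  ' if ok else 'FAIL'} {collection.name}: dims={dims} points={points}")
```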
484
+
441
485
  ## Success Criteria
442
486
 
443
487
  ### System Functionality
package/README.md CHANGED
@@ -108,7 +108,7 @@ See your conversation indexing progress directly in your statusline:
108
108
  ### Active Indexing (50% with backlog)
109
109
  ![Statusline showing 50% indexed with 7h backlog](docs/images/statusbar-2.png)
110
110
 
111
- Works with [Claude Code Statusline](https://github.com/sirmalloc/ccstatusline) - shows progress bars, percentages, and indexing lag in real-time!
111
+ Works with [Claude Code Statusline](https://github.com/sirmalloc/ccstatusline) - shows progress bars, percentages, and indexing lag in real-time! The statusline also displays MCP connection status (✓ Connected) and collection counts (28/29 indexed).
112
112
 
113
113
  ## Key Features
114
114
 
@@ -2,11 +2,13 @@
2
2
 
3
3
  // This is the new Docker-based setup wizard
4
4
  // It runs everything in Docker to avoid Python environment issues
5
- import { fileURLToPath } from 'url';
5
+ import { fileURLToPath, pathToFileURL } from 'url';
6
6
  import { dirname, join } from 'path';
7
7
 
8
8
  const __filename = fileURLToPath(import.meta.url);
9
9
  const __dirname = dirname(__filename);
10
10
 
11
11
  // Simply forward to the Docker-based wizard
12
- import(join(__dirname, 'setup-wizard-docker.js'));
12
+ // Fix for Windows: Use pathToFileURL for dynamic imports (Issue #51)
13
+ const wizardPath = join(__dirname, 'setup-wizard-docker.js');
14
+ import(pathToFileURL(wizardPath).href);
@@ -143,6 +143,90 @@ mcp = FastMCP(
143
143
  # Create Qdrant client
144
144
  qdrant_client = AsyncQdrantClient(url=QDRANT_URL)
145
145
 
146
+ # Add MCP Resources for system status
147
+ @mcp.resource("status://import-stats")
148
+ async def get_import_stats():
149
+ """Current import statistics and progress."""
150
+ await update_indexing_status()
151
+
152
+ return json.dumps({
153
+ "indexed_conversations": indexing_status["indexed_conversations"],
154
+ "total_conversations": indexing_status["total_conversations"],
155
+ "percentage": indexing_status["percentage"],
156
+ "backlog_count": indexing_status["backlog_count"],
157
+ "last_check": datetime.fromtimestamp(indexing_status["last_check"]).isoformat() if indexing_status["last_check"] else None
158
+ }, indent=2)
159
+
160
+ @mcp.resource("status://collection-list")
161
+ async def get_collection_list():
162
+ """List of all Qdrant collections with metadata."""
163
+ try:
164
+ collections = await qdrant_client.get_collections()
165
+ collection_data = []
166
+
167
+ for collection in collections.collections:
168
+ # Get collection info
169
+ info = await qdrant_client.get_collection(collection_name=collection.name)
170
+ collection_data.append({
171
+ "name": collection.name,
172
+ "points_count": info.points_count,
173
+ "indexed_vectors_count": info.indexed_vectors_count,
174
+ "status": info.status,
175
+ "config": {
176
+ "vector_size": info.config.params.vectors.size if hasattr(info.config.params.vectors, 'size') else 384,
177
+ "distance": str(info.config.params.vectors.distance) if hasattr(info.config.params.vectors, 'distance') else "Cosine"
178
+ }
179
+ })
180
+
181
+ return json.dumps({
182
+ "total_collections": len(collection_data),
183
+ "collections": collection_data
184
+ }, indent=2)
185
+ except Exception as e:
186
+ return json.dumps({"error": str(e)}, indent=2)
187
+
188
+ @mcp.resource("status://system-health")
189
+ async def get_system_health():
190
+ """System health and configuration information."""
191
+ try:
192
+ # Check Qdrant connectivity
193
+ qdrant_info = await qdrant_client.get_collections()
194
+ qdrant_healthy = True
195
+ qdrant_version = "Connected"
196
+ except:
197
+ qdrant_healthy = False
198
+ qdrant_version = "Disconnected"
199
+
200
+ # Check embedding configuration
201
+ embedding_info = {}
202
+ if embedding_manager:
203
+ embedding_info = {
204
+ "model_type": embedding_manager.model_type,
205
+ "model_name": embedding_manager.model_name,
206
+ "dimension": embedding_manager.dimension
207
+ }
208
+
209
+ return json.dumps({
210
+ "timestamp": datetime.now(timezone.utc).isoformat(),
211
+ "qdrant": {
212
+ "healthy": qdrant_healthy,
213
+ "url": QDRANT_URL,
214
+ "version": qdrant_version
215
+ },
216
+ "embeddings": embedding_info,
217
+ "configuration": {
218
+ "memory_decay_enabled": ENABLE_MEMORY_DECAY,
219
+ "decay_weight": DECAY_WEIGHT,
220
+ "decay_scale_days": DECAY_SCALE_DAYS,
221
+ "prefer_local_embeddings": PREFER_LOCAL_EMBEDDINGS
222
+ },
223
+ "indexing_status": {
224
+ "indexed": indexing_status["indexed_conversations"],
225
+ "total": indexing_status["total_conversations"],
226
+ "percentage": indexing_status["percentage"]
227
+ }
228
+ }, indent=2)
229
+
146
230
  # Track indexing status (updated periodically)
147
231
  indexing_status = {
148
232
  "last_check": 0,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "claude-self-reflect",
3
- "version": "3.0.1",
3
+ "version": "3.0.2",
4
4
  "description": "Give Claude perfect memory of all your conversations - Installation wizard for Python MCP server",
5
5
  "keywords": [
6
6
  "claude",
@@ -9,18 +9,27 @@ import os
9
9
  import sys
10
10
  import hashlib
11
11
  import gc
12
+ import ast
13
+ import re
12
14
  from pathlib import Path
13
15
  from datetime import datetime
14
- from typing import List, Dict, Any, Optional
16
+ from typing import List, Dict, Any, Optional, Set
15
17
  import logging
16
18
 
17
- # Add the project root to the Python path
18
- project_root = Path(__file__).parent.parent
19
- sys.path.insert(0, str(project_root))
19
+ # Add the scripts directory to the Python path for utils import
20
+ scripts_dir = Path(__file__).parent
21
+ sys.path.insert(0, str(scripts_dir))
20
22
 
21
23
  from qdrant_client import QdrantClient
22
24
  from qdrant_client.models import PointStruct, Distance, VectorParams
23
25
 
26
+ # Import the correct normalize_project_name from utils
27
+ try:
28
+ from utils import normalize_project_name
29
+ except ImportError as e:
30
+ logging.error(f"Failed to import normalize_project_name from utils: {e}")
31
+ sys.exit(1)
32
+
24
33
  # Set up logging
25
34
  logging.basicConfig(
26
35
  level=logging.INFO,
@@ -31,6 +40,12 @@ logger = logging.getLogger(__name__)
31
40
  # Environment variables
32
41
  QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
33
42
 
43
+ # Constants for metadata limits
44
+ MAX_CONCEPTS = 10
45
+ MAX_AST_ELEMENTS = 30
46
+ MAX_CODE_BLOCKS = 5
47
+ MAX_ELEMENTS_PER_BLOCK = 10
48
+
34
49
  # Robust cross-platform state file resolution
35
50
  def get_default_state_file():
36
51
  """Determine the default state file location with cross-platform support."""
@@ -74,9 +89,11 @@ embedding_dimension = None
74
89
  if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
75
90
  logger.info("Using local embeddings (fastembed)")
76
91
  from fastembed import TextEmbedding
92
+ # Using the same model as the official Qdrant MCP server
77
93
  embedding_provider = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
78
94
  embedding_dimension = 384
79
95
  collection_suffix = "local"
96
+ logger.info("Using fastembed model: sentence-transformers/all-MiniLM-L6-v2")
80
97
  else:
81
98
  logger.info("Using Voyage AI embeddings")
82
99
  import voyageai
@@ -84,15 +101,9 @@ else:
84
101
  embedding_dimension = 1024
85
102
  collection_suffix = "voyage"
86
103
 
87
- def normalize_project_name(project_name: str) -> str:
88
- """Normalize project name for consistency."""
89
- # For compatibility with delta-metadata-update, just use the project name as-is
90
- # This ensures collection names match between import and delta update scripts
91
- return project_name
92
-
93
104
  def get_collection_name(project_path: Path) -> str:
94
105
  """Generate collection name from project path."""
95
- normalized = normalize_project_name(project_path.name)
106
+ normalized = normalize_project_name(str(project_path))
96
107
  name_hash = hashlib.md5(normalized.encode()).hexdigest()[:8]
97
108
  return f"conv_{name_hash}_{collection_suffix}"
98
109
 
@@ -118,18 +129,23 @@ def generate_embeddings(texts: List[str]) -> List[List[float]]:
118
129
  def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
119
130
  conversation_id: str, created_at: str,
120
131
  metadata: Dict[str, Any], collection_name: str,
121
- project_path: Path) -> int:
132
+ project_path: Path, total_messages: int) -> int:
122
133
  """Process and immediately upload a single chunk."""
123
134
  if not messages:
124
135
  return 0
125
136
 
126
- # Extract text content
137
+ # Extract text content and message indices
127
138
  texts = []
139
+ message_indices = []
128
140
  for msg in messages:
129
141
  role = msg.get("role", "unknown")
130
142
  content = msg.get("content", "")
131
143
  if content:
132
144
  texts.append(f"{role.upper()}: {content}")
145
+ # Fix: Check for None instead of truthiness to include 0 values
146
+ idx = msg.get("message_index")
147
+ if idx is not None:
148
+ message_indices.append(idx)
133
149
 
134
150
  if not texts:
135
151
  return 0
@@ -140,6 +156,29 @@ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
140
156
  # Generate embedding
141
157
  embeddings = generate_embeddings([chunk_text])
142
158
 
159
+ # Sanity check embeddings
160
+ if not embeddings or not embeddings[0]:
161
+ logger.error(f"Empty embedding generated for chunk {chunk_index}")
162
+ return 0
163
+
164
+ embedding = embeddings[0]
165
+
166
+ # Check for degenerate embeddings (all values identical)
167
+ if len(set(embedding)) == 1:
168
+ logger.error(f"Degenerate embedding detected (all values identical): {embedding[0]}")
169
+ return 0
170
+
171
+ # Check variance is above threshold
172
+ import statistics
173
+ variance = statistics.variance(embedding)
174
+ if variance < 1e-6:
175
+ logger.warning(f"Low variance embedding detected: {variance}")
176
+
177
+ # Validate dimension
178
+ if len(embedding) != embedding_dimension:
179
+ logger.error(f"Embedding dimension mismatch: expected {embedding_dimension}, got {len(embedding)}")
180
+ return 0
181
+
143
182
  # Create point ID
144
183
  point_id = hashlib.md5(
145
184
  f"{conversation_id}_{chunk_index}".encode()
@@ -151,9 +190,12 @@ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
151
190
  "conversation_id": conversation_id,
152
191
  "chunk_index": chunk_index,
153
192
  "timestamp": created_at,
154
- "project": normalize_project_name(project_path.name),
193
+ "project": normalize_project_name(str(project_path)),
155
194
  "start_role": messages[0].get("role", "unknown") if messages else "unknown",
156
- "message_count": len(messages)
195
+ "message_count": len(messages),
196
+ "total_messages": total_messages,
197
+ "message_index": message_indices[0] if message_indices else 0,
198
+ "message_indices": message_indices # Store all indices in this chunk
157
199
  }
158
200
 
159
201
  # Add metadata
@@ -180,16 +222,84 @@ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
180
222
  logger.error(f"Error processing chunk {chunk_index}: {e}")
181
223
  return 0
182
224
 
183
- def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str]:
184
- """Extract metadata in a single pass, return metadata and first timestamp."""
225
+ def extract_ast_elements(code_text: str) -> Set[str]:
226
+ """Extract function and class names from code using AST parsing."""
227
+ elements = set()
228
+
229
+ # Try to parse as Python code
230
+ try:
231
+ tree = ast.parse(code_text)
232
+ for node in ast.walk(tree):
233
+ if isinstance(node, ast.FunctionDef):
234
+ elements.add(f"func:{node.name}")
235
+ elif isinstance(node, ast.AsyncFunctionDef):
236
+ elements.add(f"func:{node.name}")
237
+ elif isinstance(node, ast.ClassDef):
238
+ elements.add(f"class:{node.name}")
239
+ except SyntaxError:
240
+ # Python regex fallback for partial fragments
241
+ for m in re.finditer(r'^\s*def\s+([A-Za-z_]\w*)\s*\(', code_text, re.MULTILINE):
242
+ elements.add(f"func:{m.group(1)}")
243
+ for m in re.finditer(r'^\s*async\s+def\s+([A-Za-z_]\w*)\s*\(', code_text, re.MULTILINE):
244
+ elements.add(f"func:{m.group(1)}")
245
+ for m in re.finditer(r'^\s*class\s+([A-Za-z_]\w*)\s*[:\(]', code_text, re.MULTILINE):
246
+ elements.add(f"class:{m.group(1)}")
247
+ except Exception as e:
248
+ logger.debug(f"Unexpected error parsing AST: {e}")
249
+
250
+ # Try regex patterns for other languages
251
+ # JavaScript/TypeScript functions
252
+ js_func_pattern = r'(?:function|const|let|var)\s+(\w+)\s*(?:=\s*)?(?:\([^)]*\)|\s*=>)'
253
+ for match in re.finditer(js_func_pattern, code_text):
254
+ elements.add(f"func:{match.group(1)}")
255
+
256
+ # Class definitions (multiple languages)
257
+ class_pattern = r'(?:class|interface|struct)\s+(\w+)'
258
+ for match in re.finditer(class_pattern, code_text):
259
+ elements.add(f"class:{match.group(1)}")
260
+
261
+ return elements
262
+
263
+ def extract_concepts(text: str) -> List[str]:
264
+ """Extract development concepts from text."""
265
+ concepts = []
266
+ concept_patterns = {
267
+ 'docker': r'\b(?:docker|container|compose|dockerfile)\b',
268
+ 'testing': r'\b(?:test|testing|unittest|pytest|jest)\b',
269
+ 'database': r'\b(?:database|sql|postgres|mysql|mongodb|qdrant)\b',
270
+ 'api': r'\b(?:api|rest|graphql|endpoint)\b',
271
+ 'security': r'\b(?:security|auth|authentication|encryption)\b',
272
+ 'performance': r'\b(?:performance|optimization|cache|speed)\b',
273
+ 'debugging': r'\b(?:debug|debugging|error|bug|trace)\b',
274
+ 'deployment': r'\b(?:deploy|deployment|ci\/cd|production)\b',
275
+ 'git': r'\b(?:git|commit|branch|merge|pull request)\b',
276
+ 'mcp': r'\b(?:mcp|claude-self-reflect|claude code)\b',
277
+ 'embeddings': r'\b(?:embedding|vector|semantic|similarity)\b',
278
+ }
279
+
280
+ text_lower = text.lower()
281
+ for concept, pattern in concept_patterns.items():
282
+ if re.search(pattern, text_lower, re.IGNORECASE):
283
+ if concept not in concepts:
284
+ concepts.append(concept)
285
+
286
+ return concepts[:MAX_CONCEPTS]
287
+
288
+ def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str, int]:
289
+ """Extract metadata in a single pass, return metadata, first timestamp, and message count."""
185
290
  metadata = {
186
291
  "files_analyzed": [],
187
292
  "files_edited": [],
188
293
  "tools_used": [],
189
- "concepts": []
294
+ "concepts": [],
295
+ "ast_elements": [],
296
+ "has_code_blocks": False,
297
+ "total_messages": 0
190
298
  }
191
299
 
192
300
  first_timestamp = None
301
+ message_count = 0
302
+ all_text = []
193
303
 
194
304
  try:
195
305
  with open(file_path, 'r', encoding='utf-8') as f:
@@ -204,53 +314,107 @@ def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str]:
204
314
  if first_timestamp is None and 'timestamp' in data:
205
315
  first_timestamp = data.get('timestamp')
206
316
 
207
- # Extract tool usage from messages
317
+ # Count messages
208
318
  if 'message' in data and data['message']:
209
319
  msg = data['message']
320
+ if msg.get('role') in ['user', 'assistant']:
321
+ message_count += 1
322
+
210
323
  if msg.get('content'):
211
324
  content = msg['content']
325
+ text_content = ""
326
+
212
327
  if isinstance(content, list):
213
328
  for item in content:
214
- if isinstance(item, dict) and item.get('type') == 'tool_use':
215
- tool_name = item.get('name', '')
216
- if tool_name and tool_name not in metadata['tools_used']:
217
- metadata['tools_used'].append(tool_name)
329
+ if isinstance(item, dict):
330
+ if item.get('type') == 'text':
331
+ text_content += item.get('text', '')
332
+ # Check for code blocks
333
+ if '```' in item.get('text', ''):
334
+ metadata['has_code_blocks'] = True
335
+ # Extract code for AST analysis with bounds checking
336
+ if len(metadata['ast_elements']) < 30:
337
+ # Fix: More permissive regex to handle various fence formats
338
+ code_blocks = re.findall(r'```[^\n]*\n?(.*?)```', item.get('text', ''), re.DOTALL)
339
+ for code_block in code_blocks[:5]: # Limit to 5 blocks
340
+ if len(metadata['ast_elements']) >= 30:
341
+ break
342
+ ast_elems = extract_ast_elements(code_block)
343
+ for elem in list(ast_elems)[:10]: # Limit elements per block
344
+ if elem not in metadata['ast_elements'] and len(metadata['ast_elements']) < 30:
345
+ metadata['ast_elements'].append(elem)
218
346
 
219
- # Extract file references
220
- if 'input' in item:
221
- input_data = item['input']
222
- if isinstance(input_data, dict):
223
- if 'file_path' in input_data:
224
- file_ref = input_data['file_path']
225
- if file_ref not in metadata['files_analyzed']:
226
- metadata['files_analyzed'].append(file_ref)
227
- if 'path' in input_data:
228
- file_ref = input_data['path']
229
- if file_ref not in metadata['files_analyzed']:
230
- metadata['files_analyzed'].append(file_ref)
347
+ elif item.get('type') == 'tool_use':
348
+ tool_name = item.get('name', '')
349
+ if tool_name and tool_name not in metadata['tools_used']:
350
+ metadata['tools_used'].append(tool_name)
351
+
352
+ # Extract file references
353
+ if 'input' in item:
354
+ input_data = item['input']
355
+ if isinstance(input_data, dict):
356
+ # Determine if it's an edit tool
357
+ is_edit = tool_name in ['Edit', 'Write', 'MultiEdit', 'NotebookEdit']
358
+
359
+ if 'file_path' in input_data:
360
+ file_ref = input_data['file_path']
361
+ if is_edit:
362
+ if file_ref not in metadata['files_edited']:
363
+ metadata['files_edited'].append(file_ref)
364
+ else:
365
+ if file_ref not in metadata['files_analyzed']:
366
+ metadata['files_analyzed'].append(file_ref)
367
+
368
+ if 'path' in input_data:
369
+ file_ref = input_data['path']
370
+ if file_ref not in metadata['files_analyzed']:
371
+ metadata['files_analyzed'].append(file_ref)
372
+ elif isinstance(item, str):
373
+ text_content += item
374
+ elif isinstance(content, str):
375
+ text_content = content
376
+
377
+ # Collect text for concept extraction
378
+ if text_content:
379
+ all_text.append(text_content[:1000]) # Limit text per message
231
380
 
232
381
  except json.JSONDecodeError:
233
382
  continue
234
383
  except Exception:
235
384
  continue
236
-
385
+
237
386
  except Exception as e:
238
387
  logger.warning(f"Error extracting metadata: {e}")
239
388
 
240
- return metadata, first_timestamp or datetime.now().isoformat()
389
+ # Extract concepts from collected text
390
+ if all_text:
391
+ combined_text = ' '.join(all_text[:50]) # Limit to first 50 messages
392
+ metadata['concepts'] = extract_concepts(combined_text)
393
+
394
+ # Set total messages
395
+ metadata['total_messages'] = message_count
396
+
397
+ # Limit arrays
398
+ metadata['files_analyzed'] = metadata['files_analyzed'][:20]
399
+ metadata['files_edited'] = metadata['files_edited'][:20]
400
+ metadata['tools_used'] = metadata['tools_used'][:15]
401
+ metadata['ast_elements'] = metadata['ast_elements'][:30]
402
+
403
+ return metadata, first_timestamp or datetime.now().isoformat(), message_count
241
404
 
242
405
  def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Path) -> int:
243
406
  """Stream import a single JSONL file without loading it into memory."""
244
407
  logger.info(f"Streaming import of {jsonl_file.name}")
245
408
 
246
409
  # Extract metadata in first pass (lightweight)
247
- metadata, created_at = extract_metadata_single_pass(str(jsonl_file))
410
+ metadata, created_at, total_messages = extract_metadata_single_pass(str(jsonl_file))
248
411
 
249
412
  # Stream messages and process in chunks
250
413
  chunk_buffer = []
251
414
  chunk_index = 0
252
415
  total_chunks = 0
253
416
  conversation_id = jsonl_file.stem
417
+ current_message_index = 0
254
418
 
255
419
  try:
256
420
  with open(jsonl_file, 'r', encoding='utf-8') as f:
@@ -282,16 +446,24 @@ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Pat
282
446
  content = '\n'.join(text_parts)
283
447
 
284
448
  if content:
449
+ # Track message index for user/assistant messages
450
+ if msg['role'] in ['user', 'assistant']:
451
+ current_message_index += 1
452
+ message_idx = current_message_index
453
+ else:
454
+ message_idx = 0
455
+
285
456
  chunk_buffer.append({
286
457
  'role': msg['role'],
287
- 'content': content
458
+ 'content': content,
459
+ 'message_index': message_idx
288
460
  })
289
461
 
290
462
  # Process chunk when buffer reaches MAX_CHUNK_SIZE
291
463
  if len(chunk_buffer) >= MAX_CHUNK_SIZE:
292
464
  chunks = process_and_upload_chunk(
293
465
  chunk_buffer, chunk_index, conversation_id,
294
- created_at, metadata, collection_name, project_path
466
+ created_at, metadata, collection_name, project_path, total_messages
295
467
  )
296
468
  total_chunks += chunks
297
469
  chunk_buffer = []
@@ -313,7 +485,7 @@ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Pat
313
485
  if chunk_buffer:
314
486
  chunks = process_and_upload_chunk(
315
487
  chunk_buffer, chunk_index, conversation_id,
316
- created_at, metadata, collection_name, project_path
488
+ created_at, metadata, collection_name, project_path, total_messages
317
489
  )
318
490
  total_chunks += chunks
319
491
 
@@ -335,10 +507,19 @@ def load_state() -> dict:
335
507
  return {"imported_files": {}}
336
508
 
337
509
  def save_state(state: dict):
338
- """Save import state."""
339
- os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
340
- with open(STATE_FILE, 'w') as f:
510
+ """Save import state with atomic write."""
511
+ # Fix: Handle case where STATE_FILE has no directory component
512
+ state_dir = os.path.dirname(STATE_FILE)
513
+ if state_dir:
514
+ os.makedirs(state_dir, exist_ok=True)
515
+
516
+ # Use atomic write to prevent corruption during crashes
517
+ temp_file = f"{STATE_FILE}.tmp"
518
+ with open(temp_file, 'w') as f:
341
519
  json.dump(state, f, indent=2)
520
+
521
+ # Atomic rename (on POSIX systems)
522
+ os.replace(temp_file, STATE_FILE)
342
523
 
343
524
  def should_import_file(file_path: Path, state: dict) -> bool:
344
525
  """Check if file should be imported."""