claude-self-reflect 3.0.1 → 3.0.2
This diff shows the changes between publicly released versions of the package as they appear in the supported public registries. It is provided for informational purposes only.
|
@@ -12,13 +12,22 @@ You are a resilient and comprehensive testing specialist for Claude Self-Reflect
|
|
|
12
12
|
- Streaming importer maintains <50MB memory while processing every 60s
|
|
13
13
|
- MCP tools enable reflection and memory storage
|
|
14
14
|
- System must handle sensitive API keys securely
|
|
15
|
+
- Modular importer architecture in `scripts/importer/` package
|
|
16
|
+
- Voyage API key read from `.env` file automatically
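
The &lt;50MB streaming-importer requirement above is something a test run can check directly. A minimal sketch of such a check, assuming `psutil` is installed and that the importer's PID is known (both are assumptions, not part of this package):

```python
# Sketch: verify a running importer process stays under the 50MB RSS budget.
# Assumes psutil is available and the importer PID is passed on the command line.
import sys
import psutil

LIMIT_MB = 50

def check_rss(pid: int) -> bool:
    rss_mb = psutil.Process(pid).memory_info().rss / (1024 * 1024)
    print(f"importer RSS: {rss_mb:.1f} MB (limit {LIMIT_MB} MB)")
    return rss_mb < LIMIT_MB

if __name__ == "__main__":
    ok = check_rss(int(sys.argv[1]))
    sys.exit(0 if ok else 1)
```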
|
|
17
|
+
|
|
18
|
+
## CRITICAL Testing Protocol
|
|
19
|
+
1. **Test Local Mode First** - Ensure all functionality works with FastEmbed
|
|
20
|
+
2. **Test Cloud Mode** - Switch to Voyage AI and validate
|
|
21
|
+
3. **RESTORE TO LOCAL** - Machine MUST be left in 100% local state after testing
|
|
22
|
+
4. **Certify Both Modes** - Only proceed to release if both modes pass
|
|
23
|
+
5. **NO Model Changes** - Use sentence-transformers/all-MiniLM-L6-v2 (384 dims) for local
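
Protocol items 1 and 5 can be certified together with a quick local-embedding check. A minimal sketch using the same `fastembed` model the importer configures later in this diff; the only assumption is that `fastembed` is installed in the active environment:

```python
# Sketch: confirm the local FastEmbed model is all-MiniLM-L6-v2 and yields 384-dim vectors.
from fastembed import TextEmbedding

model = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector = next(iter(model.embed(["local embedding smoke test"])))

assert len(vector) == 384, f"expected 384 dims, got {len(vector)}"
print(f"✅ local model OK: {len(vector)} dimensions")
```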
|
|
15
24
|
|
|
16
25
|
## Comprehensive Test Suite
|
|
17
26
|
|
|
18
27
|
### Available Test Categories
|
|
19
|
-
The project
|
|
28
|
+
The project includes a well-organized test suite:
|
|
20
29
|
|
|
21
|
-
1. **MCP Tool Integration** (`
|
|
30
|
+
1. **MCP Tool Integration** (`tests/integration/test_mcp_tools.py`)
|
|
22
31
|
- All MCP tools with various parameters
|
|
23
32
|
- Edge cases and error handling
|
|
24
33
|
- Cross-project search validation
|
|
@@ -67,21 +76,20 @@ The project now includes a comprehensive test suite in `/tests/` directory:
|
|
|
67
76
|
```bash
|
|
68
77
|
# Run ALL tests
|
|
69
78
|
cd ~/projects/claude-self-reflect
|
|
70
|
-
python tests/
|
|
79
|
+
python -m pytest tests/
|
|
71
80
|
|
|
72
|
-
# Run specific categories
|
|
73
|
-
python
|
|
81
|
+
# Run specific test categories
|
|
82
|
+
python -m pytest tests/integration/
|
|
83
|
+
python -m pytest tests/unit/
|
|
84
|
+
python -m pytest tests/performance/
|
|
74
85
|
|
|
75
86
|
# Run with verbose output
|
|
76
|
-
python tests/
|
|
77
|
-
|
|
78
|
-
# List available test categories
|
|
79
|
-
python tests/run_all_tests.py --list
|
|
87
|
+
python -m pytest tests/ -v
|
|
80
88
|
|
|
81
89
|
# Run individual test files
|
|
82
|
-
python tests/
|
|
83
|
-
python tests/
|
|
84
|
-
python tests/
|
|
90
|
+
python tests/integration/test_mcp_tools.py
|
|
91
|
+
python tests/integration/test_collection_naming.py
|
|
92
|
+
python tests/integration/test_system_integration.py
|
|
85
93
|
```
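
For orientation, here is a minimal pytest-style sketch of the kind of check the integration tests above perform. It only assumes a Qdrant instance on the default `http://localhost:6333` used throughout this diff; the test name and body are illustrative, not copied from the package's suite:

```python
# Sketch: illustrative integration check, not a copy of the package's tests.
import requests

QDRANT_URL = "http://localhost:6333"

def test_collections_use_embedding_suffix():
    resp = requests.get(f"{QDRANT_URL}/collections", timeout=5)
    resp.raise_for_status()
    names = [c["name"] for c in resp.json()["result"]["collections"]]
    conv = [n for n in names if n.startswith("conv_")]
    assert conv, "no conversation collections found - run an import first"
    assert all(n.endswith(("_local", "_voyage")) for n in conv), conv
```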
|
|
86
94
|
|
|
87
95
|
### Test Results Location
|
|
@@ -367,77 +375,113 @@ if [ -n "$VOYAGE_KEY" ]; then
|
|
|
367
375
|
fi
|
|
368
376
|
```
|
|
369
377
|
|
|
370
|
-
###
|
|
378
|
+
### CRITICAL: Verify Actual Imports (Not Just API Connection!)
|
|
371
379
|
```bash
|
|
372
|
-
echo "===
|
|
373
|
-
|
|
374
|
-
# Step 1:
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
echo "Current embedding mode: ${PREFER_LOCAL_EMBEDDINGS:-true}"
|
|
379
|
-
|
|
380
|
-
# Step 2: Check prerequisites
|
|
381
|
-
if [ -z "$VOYAGE_KEY" ]; then
|
|
382
|
-
echo "⚠️ WARNING: VOYAGE_KEY not set"
|
|
383
|
-
echo "To test cloud mode, set: export VOYAGE_KEY='your-key'"
|
|
384
|
-
echo "Skipping cloud test..."
|
|
385
|
-
exit 0
|
|
380
|
+
echo "=== REAL Cloud Embedding Import Test ==="
|
|
381
|
+
|
|
382
|
+
# Step 1: Verify prerequisites
|
|
383
|
+
if [ ! -f .env ] || [ -z "$(grep VOYAGE_KEY .env)" ]; then
|
|
384
|
+
echo "❌ FAIL: No VOYAGE_KEY in .env file"
|
|
385
|
+
exit 1
|
|
386
386
|
fi
|
|
387
387
|
|
|
388
|
-
#
|
|
389
|
-
|
|
388
|
+
# Extract API key
|
|
389
|
+
export VOYAGE_KEY=$(grep VOYAGE_KEY .env | cut -d= -f2)
|
|
390
|
+
echo "✅ Found Voyage key: ${VOYAGE_KEY:0:10}..."
|
|
391
|
+
|
|
392
|
+
# Step 2: Count existing collections before test
|
|
393
|
+
BEFORE_LOCAL=$(curl -s http://localhost:6333/collections | jq -r '.result.collections[].name' | grep "_local" | wc -l)
|
|
394
|
+
BEFORE_VOYAGE=$(curl -s http://localhost:6333/collections | jq -r '.result.collections[].name' | grep "_voyage" | wc -l)
|
|
395
|
+
echo "Before: $BEFORE_LOCAL local, $BEFORE_VOYAGE voyage collections"
|
|
396
|
+
|
|
397
|
+
# Step 3: Create NEW test conversation for import
|
|
398
|
+
TEST_PROJECT="test-voyage-$(date +%s)"
|
|
399
|
+
TEST_DIR=~/.claude/projects/$TEST_PROJECT
|
|
400
|
+
mkdir -p $TEST_DIR
|
|
401
|
+
TEST_FILE=$TEST_DIR/voyage-test.jsonl
|
|
402
|
+
|
|
403
|
+
cat > $TEST_FILE << 'EOF'
|
|
404
|
+
{"type":"conversation","uuid":"voyage-test-001","name":"Voyage Import Test","messages":[{"role":"human","content":"Testing actual Voyage AI import"},{"role":"assistant","content":[{"type":"text","text":"This should create a real Voyage collection with 1024-dim vectors"}]}],"conversation_id":"voyage-test-001","created_at":"2025-09-08T00:00:00Z"}
|
|
405
|
+
EOF
|
|
406
|
+
|
|
407
|
+
echo "✅ Created test file: $TEST_FILE"
|
|
408
|
+
|
|
409
|
+
# Step 4: Switch to Voyage mode and import
|
|
410
|
+
echo "Switching to Voyage mode..."
|
|
390
411
|
export PREFER_LOCAL_EMBEDDINGS=false
|
|
391
|
-
|
|
392
|
-
docker compose --profile watch up -d streaming-importer
|
|
412
|
+
export USE_VOYAGE=true
|
|
393
413
|
|
|
394
|
-
#
|
|
395
|
-
|
|
396
|
-
|
|
414
|
+
# Run import directly with modular importer
|
|
415
|
+
cd ~/projects/claude-self-reflect
|
|
416
|
+
source venv/bin/activate
|
|
417
|
+
python -c "
|
|
418
|
+
import os
|
|
419
|
+
os.environ['VOYAGE_KEY'] = '$VOYAGE_KEY'
|
|
420
|
+
os.environ['PREFER_LOCAL_EMBEDDINGS'] = 'false'
|
|
421
|
+
os.environ['USE_VOYAGE'] = 'true'
|
|
422
|
+
|
|
423
|
+
from scripts.importer.main import ImporterContainer
|
|
424
|
+
container = ImporterContainer()
|
|
425
|
+
processor = container.processor()
|
|
426
|
+
|
|
427
|
+
# Process test file
|
|
428
|
+
import json
|
|
429
|
+
with open('$TEST_FILE') as f:
|
|
430
|
+
data = json.load(f)
|
|
431
|
+
|
|
432
|
+
result = processor.process_conversation(
|
|
433
|
+
conversation_data=data,
|
|
434
|
+
file_path='$TEST_FILE',
|
|
435
|
+
project_path='$TEST_PROJECT'
|
|
436
|
+
)
|
|
437
|
+
print(f'Import result: {result}')
|
|
438
|
+
"
|
|
397
439
|
|
|
398
|
-
# Step 5:
|
|
399
|
-
echo "
|
|
400
|
-
sleep
|
|
440
|
+
# Step 5: Verify actual Voyage collection created
|
|
441
|
+
echo "Verifying Voyage collection..."
|
|
442
|
+
sleep 5
|
|
443
|
+
AFTER_VOYAGE=$(curl -s http://localhost:6333/collections | jq -r '.result.collections[].name' | grep "_voyage" | wc -l)
|
|
401
444
|
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
445
|
+
if [ "$AFTER_VOYAGE" -gt "$BEFORE_VOYAGE" ]; then
|
|
446
|
+
echo "✅ SUCCESS: New Voyage collection created!"
|
|
447
|
+
|
|
448
|
+
# Get the new collection name
|
|
449
|
+
NEW_COL=$(curl -s http://localhost:6333/collections | jq -r '.result.collections[].name' | grep "_voyage" | tail -1)
|
|
450
|
+
|
|
451
|
+
# Verify dimensions
|
|
452
|
+
DIMS=$(curl -s http://localhost:6333/collections/$NEW_COL | jq '.result.config.params.vectors.size')
|
|
453
|
+
POINTS=$(curl -s http://localhost:6333/collections/$NEW_COL | jq '.result.points_count')
|
|
407
454
|
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
455
|
+
echo "Collection: $NEW_COL"
|
|
456
|
+
echo "Dimensions: $DIMS (expected 1024)"
|
|
457
|
+
echo "Points: $POINTS"
|
|
458
|
+
|
|
459
|
+
if [ "$DIMS" = "1024" ] && [ "$POINTS" -gt "0" ]; then
|
|
460
|
+
echo "✅ PASS: Voyage import actually worked!"
|
|
413
461
|
else
|
|
414
|
-
echo "❌ FAIL: Wrong dimensions
|
|
462
|
+
echo "❌ FAIL: Wrong dimensions or no points"
|
|
415
463
|
fi
|
|
416
464
|
else
|
|
417
|
-
echo "❌ FAIL: No
|
|
465
|
+
echo "❌ FAIL: No new Voyage collection created - import didn't work!"
|
|
418
466
|
fi
|
|
419
467
|
|
|
420
|
-
# Step
|
|
421
|
-
echo "
|
|
422
|
-
# Note: MCP must also use PREFER_LOCAL_EMBEDDINGS=false
|
|
423
|
-
|
|
424
|
-
# Step 8: Restore local mode
|
|
425
|
-
echo "5. Restoring local FastEmbed mode..."
|
|
468
|
+
# Step 6: Restore to local mode
|
|
469
|
+
echo "Restoring local mode..."
|
|
426
470
|
export PREFER_LOCAL_EMBEDDINGS=true
|
|
427
|
-
|
|
428
|
-
docker compose --profile watch up -d streaming-importer
|
|
471
|
+
export USE_VOYAGE=false
|
|
429
472
|
|
|
430
|
-
# Step
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
echo "✅ Restored: Found $LOCAL_COLS local collections"
|
|
434
|
-
|
|
435
|
-
# Step 10: Cleanup
|
|
436
|
-
rm -f $TEST_FILE
|
|
437
|
-
cp config/imported-files.json.local-backup config/imported-files.json
|
|
438
|
-
echo "✅ Cloud embedding test complete and restored to local mode"
|
|
473
|
+
# Step 7: Cleanup
|
|
474
|
+
rm -rf $TEST_DIR
|
|
475
|
+
echo "✅ Test complete and cleaned up"
|
|
439
476
|
```
|
|
440
477
|
|
|
478
|
+
### Verification Checklist for Real Imports
|
|
479
|
+
1. **Check Collection Suffix**: `_voyage` for cloud, `_local` for FastEmbed
|
|
480
|
+
2. **Verify Dimensions**: 1024 for Voyage, 384 for FastEmbed
|
|
481
|
+
3. **Count Points**: Must have >0 points for successful import
|
|
482
|
+
4. **Check Logs**: Look for actual embedding API calls
|
|
483
|
+
5. **Verify State File**: Check imported-files.json for record
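
Checklist items 1-3 can also be automated instead of eyeballing `curl | jq` output. A minimal sketch against the same Qdrant REST endpoints used above (`/collections` and `/collections/{name}`); the expected dimensions mirror the 1024/384 values in this protocol:

```python
# Sketch: automate checklist items 1-3 (suffix, dimensions, point count) via Qdrant's REST API.
import requests

QDRANT_URL = "http://localhost:6333"
EXPECTED_DIMS = {"_voyage": 1024, "_local": 384}

def verify_collections() -> bool:
    collections = requests.get(f"{QDRANT_URL}/collections", timeout=5).json()["result"]["collections"]
    ok = True
    for name in (c["name"] for c in collections):
        suffix = next((s for s in EXPECTED_DIMS if name.endswith(s)), None)
        if suffix is None:
            continue  # not a conversation collection
        info = requests.get(f"{QDRANT_URL}/collections/{name}", timeout=5).json()["result"]
        dims = info["config"]["params"]["vectors"]["size"]
        points = info["points_count"]
        good = dims == EXPECTED_DIMS[suffix] and points > 0
        print(f"{'✅' if good else '❌'} {name}: {dims} dims, {points} points")
        ok = ok and good
    return ok

if __name__ == "__main__":
    raise SystemExit(0 if verify_collections() else 1)
```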
|
|
484
|
+
|
|
441
485
|
## Success Criteria
|
|
442
486
|
|
|
443
487
|
### System Functionality
|
package/README.md
CHANGED
|
@@ -108,7 +108,7 @@ See your conversation indexing progress directly in your statusline:
|
|
|
108
108
|
### Active Indexing (50% with backlog)
|
|
109
109
|

|
|
110
110
|
|
|
111
|
-
Works with [Claude Code Statusline](https://github.com/sirmalloc/ccstatusline) - shows progress bars, percentages, and indexing lag in real-time!
|
|
111
|
+
Works with [Claude Code Statusline](https://github.com/sirmalloc/ccstatusline) - shows progress bars, percentages, and indexing lag in real-time! The statusline also displays MCP connection status (✓ Connected) and collection counts (28/29 indexed).
|
|
112
112
|
|
|
113
113
|
## Key Features
|
|
114
114
|
|
|
@@ -2,11 +2,13 @@
|
|
|
2
2
|
|
|
3
3
|
// This is the new Docker-based setup wizard
|
|
4
4
|
// It runs everything in Docker to avoid Python environment issues
|
|
5
|
-
import { fileURLToPath } from 'url';
|
|
5
|
+
import { fileURLToPath, pathToFileURL } from 'url';
|
|
6
6
|
import { dirname, join } from 'path';
|
|
7
7
|
|
|
8
8
|
const __filename = fileURLToPath(import.meta.url);
|
|
9
9
|
const __dirname = dirname(__filename);
|
|
10
10
|
|
|
11
11
|
// Simply forward to the Docker-based wizard
|
|
12
|
-
|
|
12
|
+
// Fix for Windows: Use pathToFileURL for dynamic imports (Issue #51)
|
|
13
|
+
const wizardPath = join(__dirname, 'setup-wizard-docker.js');
|
|
14
|
+
import(pathToFileURL(wizardPath).href);
|
package/mcp-server/src/server.py
CHANGED
|
@@ -143,6 +143,90 @@ mcp = FastMCP(
|
|
|
143
143
|
# Create Qdrant client
|
|
144
144
|
qdrant_client = AsyncQdrantClient(url=QDRANT_URL)
|
|
145
145
|
|
|
146
|
+
# Add MCP Resources for system status
|
|
147
|
+
@mcp.resource("status://import-stats")
|
|
148
|
+
async def get_import_stats():
|
|
149
|
+
"""Current import statistics and progress."""
|
|
150
|
+
await update_indexing_status()
|
|
151
|
+
|
|
152
|
+
return json.dumps({
|
|
153
|
+
"indexed_conversations": indexing_status["indexed_conversations"],
|
|
154
|
+
"total_conversations": indexing_status["total_conversations"],
|
|
155
|
+
"percentage": indexing_status["percentage"],
|
|
156
|
+
"backlog_count": indexing_status["backlog_count"],
|
|
157
|
+
"last_check": datetime.fromtimestamp(indexing_status["last_check"]).isoformat() if indexing_status["last_check"] else None
|
|
158
|
+
}, indent=2)
|
|
159
|
+
|
|
160
|
+
@mcp.resource("status://collection-list")
|
|
161
|
+
async def get_collection_list():
|
|
162
|
+
"""List of all Qdrant collections with metadata."""
|
|
163
|
+
try:
|
|
164
|
+
collections = await qdrant_client.get_collections()
|
|
165
|
+
collection_data = []
|
|
166
|
+
|
|
167
|
+
for collection in collections.collections:
|
|
168
|
+
# Get collection info
|
|
169
|
+
info = await qdrant_client.get_collection(collection_name=collection.name)
|
|
170
|
+
collection_data.append({
|
|
171
|
+
"name": collection.name,
|
|
172
|
+
"points_count": info.points_count,
|
|
173
|
+
"indexed_vectors_count": info.indexed_vectors_count,
|
|
174
|
+
"status": info.status,
|
|
175
|
+
"config": {
|
|
176
|
+
"vector_size": info.config.params.vectors.size if hasattr(info.config.params.vectors, 'size') else 384,
|
|
177
|
+
"distance": str(info.config.params.vectors.distance) if hasattr(info.config.params.vectors, 'distance') else "Cosine"
|
|
178
|
+
}
|
|
179
|
+
})
|
|
180
|
+
|
|
181
|
+
return json.dumps({
|
|
182
|
+
"total_collections": len(collection_data),
|
|
183
|
+
"collections": collection_data
|
|
184
|
+
}, indent=2)
|
|
185
|
+
except Exception as e:
|
|
186
|
+
return json.dumps({"error": str(e)}, indent=2)
|
|
187
|
+
|
|
188
|
+
@mcp.resource("status://system-health")
|
|
189
|
+
async def get_system_health():
|
|
190
|
+
"""System health and configuration information."""
|
|
191
|
+
try:
|
|
192
|
+
# Check Qdrant connectivity
|
|
193
|
+
qdrant_info = await qdrant_client.get_collections()
|
|
194
|
+
qdrant_healthy = True
|
|
195
|
+
qdrant_version = "Connected"
|
|
196
|
+
except:
|
|
197
|
+
qdrant_healthy = False
|
|
198
|
+
qdrant_version = "Disconnected"
|
|
199
|
+
|
|
200
|
+
# Check embedding configuration
|
|
201
|
+
embedding_info = {}
|
|
202
|
+
if embedding_manager:
|
|
203
|
+
embedding_info = {
|
|
204
|
+
"model_type": embedding_manager.model_type,
|
|
205
|
+
"model_name": embedding_manager.model_name,
|
|
206
|
+
"dimension": embedding_manager.dimension
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
return json.dumps({
|
|
210
|
+
"timestamp": datetime.now(timezone.utc).isoformat(),
|
|
211
|
+
"qdrant": {
|
|
212
|
+
"healthy": qdrant_healthy,
|
|
213
|
+
"url": QDRANT_URL,
|
|
214
|
+
"version": qdrant_version
|
|
215
|
+
},
|
|
216
|
+
"embeddings": embedding_info,
|
|
217
|
+
"configuration": {
|
|
218
|
+
"memory_decay_enabled": ENABLE_MEMORY_DECAY,
|
|
219
|
+
"decay_weight": DECAY_WEIGHT,
|
|
220
|
+
"decay_scale_days": DECAY_SCALE_DAYS,
|
|
221
|
+
"prefer_local_embeddings": PREFER_LOCAL_EMBEDDINGS
|
|
222
|
+
},
|
|
223
|
+
"indexing_status": {
|
|
224
|
+
"indexed": indexing_status["indexed_conversations"],
|
|
225
|
+
"total": indexing_status["total_conversations"],
|
|
226
|
+
"percentage": indexing_status["percentage"]
|
|
227
|
+
}
|
|
228
|
+
}, indent=2)
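
The three resources above (`status://import-stats`, `status://collection-list`, `status://system-health`) are read-only views that a client can poll. A rough sketch of reading one of them with the official `mcp` Python SDK; the command and arguments for launching the server are placeholders, and the client API shown is an assumption based on the SDK's documented stdio pattern, not something this package ships:

```python
# Sketch: read the import-stats resource over stdio. Server command/args are placeholders.
import asyncio
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client

async def main() -> None:
    server = StdioServerParameters(command="python", args=["mcp-server/src/server.py"])  # placeholder
    async with stdio_client(server) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            result = await session.read_resource("status://import-stats")
            print(result.contents[0].text)  # JSON produced by get_import_stats()

asyncio.run(main())
```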
|
|
229
|
+
|
|
146
230
|
# Track indexing status (updated periodically)
|
|
147
231
|
indexing_status = {
|
|
148
232
|
"last_check": 0,
|
package/package.json
CHANGED
|
@@ -9,18 +9,27 @@ import os
|
|
|
9
9
|
import sys
|
|
10
10
|
import hashlib
|
|
11
11
|
import gc
|
|
12
|
+
import ast
|
|
13
|
+
import re
|
|
12
14
|
from pathlib import Path
|
|
13
15
|
from datetime import datetime
|
|
14
|
-
from typing import List, Dict, Any, Optional
|
|
16
|
+
from typing import List, Dict, Any, Optional, Set
|
|
15
17
|
import logging
|
|
16
18
|
|
|
17
|
-
# Add the
|
|
18
|
-
|
|
19
|
-
sys.path.insert(0, str(
|
|
19
|
+
# Add the scripts directory to the Python path for utils import
|
|
20
|
+
scripts_dir = Path(__file__).parent
|
|
21
|
+
sys.path.insert(0, str(scripts_dir))
|
|
20
22
|
|
|
21
23
|
from qdrant_client import QdrantClient
|
|
22
24
|
from qdrant_client.models import PointStruct, Distance, VectorParams
|
|
23
25
|
|
|
26
|
+
# Import the correct normalize_project_name from utils
|
|
27
|
+
try:
|
|
28
|
+
from utils import normalize_project_name
|
|
29
|
+
except ImportError as e:
|
|
30
|
+
logging.error(f"Failed to import normalize_project_name from utils: {e}")
|
|
31
|
+
sys.exit(1)
|
|
32
|
+
|
|
24
33
|
# Set up logging
|
|
25
34
|
logging.basicConfig(
|
|
26
35
|
level=logging.INFO,
|
|
@@ -31,6 +40,12 @@ logger = logging.getLogger(__name__)
|
|
|
31
40
|
# Environment variables
|
|
32
41
|
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
|
|
33
42
|
|
|
43
|
+
# Constants for metadata limits
|
|
44
|
+
MAX_CONCEPTS = 10
|
|
45
|
+
MAX_AST_ELEMENTS = 30
|
|
46
|
+
MAX_CODE_BLOCKS = 5
|
|
47
|
+
MAX_ELEMENTS_PER_BLOCK = 10
|
|
48
|
+
|
|
34
49
|
# Robust cross-platform state file resolution
|
|
35
50
|
def get_default_state_file():
|
|
36
51
|
"""Determine the default state file location with cross-platform support."""
|
|
@@ -74,9 +89,11 @@ embedding_dimension = None
|
|
|
74
89
|
if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
|
|
75
90
|
logger.info("Using local embeddings (fastembed)")
|
|
76
91
|
from fastembed import TextEmbedding
|
|
92
|
+
# Using the same model as official Qdrant MCP server
|
|
77
93
|
embedding_provider = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
|
78
94
|
embedding_dimension = 384
|
|
79
95
|
collection_suffix = "local"
|
|
96
|
+
logger.info("Using fastembed model: sentence-transformers/all-MiniLM-L6-v2")
|
|
80
97
|
else:
|
|
81
98
|
logger.info("Using Voyage AI embeddings")
|
|
82
99
|
import voyageai
|
|
@@ -84,15 +101,9 @@ else:
|
|
|
84
101
|
embedding_dimension = 1024
|
|
85
102
|
collection_suffix = "voyage"
|
|
86
103
|
|
|
87
|
-
def normalize_project_name(project_name: str) -> str:
|
|
88
|
-
"""Normalize project name for consistency."""
|
|
89
|
-
# For compatibility with delta-metadata-update, just use the project name as-is
|
|
90
|
-
# This ensures collection names match between import and delta update scripts
|
|
91
|
-
return project_name
|
|
92
|
-
|
|
93
104
|
def get_collection_name(project_path: Path) -> str:
|
|
94
105
|
"""Generate collection name from project path."""
|
|
95
|
-
normalized = normalize_project_name(project_path
|
|
106
|
+
normalized = normalize_project_name(str(project_path))
|
|
96
107
|
name_hash = hashlib.md5(normalized.encode()).hexdigest()[:8]
|
|
97
108
|
return f"conv_{name_hash}_{collection_suffix}"
|
|
98
109
|
|
|
@@ -118,18 +129,23 @@ def generate_embeddings(texts: List[str]) -> List[List[float]]:
|
|
|
118
129
|
def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
|
|
119
130
|
conversation_id: str, created_at: str,
|
|
120
131
|
metadata: Dict[str, Any], collection_name: str,
|
|
121
|
-
project_path: Path) -> int:
|
|
132
|
+
project_path: Path, total_messages: int) -> int:
|
|
122
133
|
"""Process and immediately upload a single chunk."""
|
|
123
134
|
if not messages:
|
|
124
135
|
return 0
|
|
125
136
|
|
|
126
|
-
# Extract text content
|
|
137
|
+
# Extract text content and message indices
|
|
127
138
|
texts = []
|
|
139
|
+
message_indices = []
|
|
128
140
|
for msg in messages:
|
|
129
141
|
role = msg.get("role", "unknown")
|
|
130
142
|
content = msg.get("content", "")
|
|
131
143
|
if content:
|
|
132
144
|
texts.append(f"{role.upper()}: {content}")
|
|
145
|
+
# Fix: Check for None instead of truthiness to include 0 values
|
|
146
|
+
idx = msg.get("message_index")
|
|
147
|
+
if idx is not None:
|
|
148
|
+
message_indices.append(idx)
|
|
133
149
|
|
|
134
150
|
if not texts:
|
|
135
151
|
return 0
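
The `is not None` check introduced a few lines above matters because message index 0 is falsy in Python: a plain truthiness test would silently drop the first message's index. A two-line illustration of the difference:

```python
# Why `is not None` instead of truthiness: index 0 must be kept.
indices = [0, 1, None, 2]
print([i for i in indices if i])              # [1, 2]      - drops the legitimate 0
print([i for i in indices if i is not None])  # [0, 1, 2]   - keeps it
```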
|
|
@@ -140,6 +156,29 @@ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
|
|
|
140
156
|
# Generate embedding
|
|
141
157
|
embeddings = generate_embeddings([chunk_text])
|
|
142
158
|
|
|
159
|
+
# Sanity check embeddings
|
|
160
|
+
if not embeddings or not embeddings[0]:
|
|
161
|
+
logger.error(f"Empty embedding generated for chunk {chunk_index}")
|
|
162
|
+
return 0
|
|
163
|
+
|
|
164
|
+
embedding = embeddings[0]
|
|
165
|
+
|
|
166
|
+
# Check for degenerate embeddings (all values identical)
|
|
167
|
+
if len(set(embedding)) == 1:
|
|
168
|
+
logger.error(f"Degenerate embedding detected (all values identical): {embedding[0]}")
|
|
169
|
+
return 0
|
|
170
|
+
|
|
171
|
+
# Check variance is above threshold
|
|
172
|
+
import statistics
|
|
173
|
+
variance = statistics.variance(embedding)
|
|
174
|
+
if variance < 1e-6:
|
|
175
|
+
logger.warning(f"Low variance embedding detected: {variance}")
|
|
176
|
+
|
|
177
|
+
# Validate dimension
|
|
178
|
+
if len(embedding) != embedding_dimension:
|
|
179
|
+
logger.error(f"Embedding dimension mismatch: expected {embedding_dimension}, got {len(embedding)}")
|
|
180
|
+
return 0
|
|
181
|
+
|
|
143
182
|
# Create point ID
|
|
144
183
|
point_id = hashlib.md5(
|
|
145
184
|
f"{conversation_id}_{chunk_index}".encode()
|
|
@@ -151,9 +190,12 @@ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
|
|
|
151
190
|
"conversation_id": conversation_id,
|
|
152
191
|
"chunk_index": chunk_index,
|
|
153
192
|
"timestamp": created_at,
|
|
154
|
-
"project": normalize_project_name(project_path
|
|
193
|
+
"project": normalize_project_name(str(project_path)),
|
|
155
194
|
"start_role": messages[0].get("role", "unknown") if messages else "unknown",
|
|
156
|
-
"message_count": len(messages)
|
|
195
|
+
"message_count": len(messages),
|
|
196
|
+
"total_messages": total_messages,
|
|
197
|
+
"message_index": message_indices[0] if message_indices else 0,
|
|
198
|
+
"message_indices": message_indices # Store all indices in this chunk
|
|
157
199
|
}
|
|
158
200
|
|
|
159
201
|
# Add metadata
|
|
@@ -180,16 +222,84 @@ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
|
|
|
180
222
|
logger.error(f"Error processing chunk {chunk_index}: {e}")
|
|
181
223
|
return 0
|
|
182
224
|
|
|
183
|
-
def
|
|
184
|
-
"""Extract
|
|
225
|
+
def extract_ast_elements(code_text: str) -> Set[str]:
|
|
226
|
+
"""Extract function and class names from code using AST parsing."""
|
|
227
|
+
elements = set()
|
|
228
|
+
|
|
229
|
+
# Try to parse as Python code
|
|
230
|
+
try:
|
|
231
|
+
tree = ast.parse(code_text)
|
|
232
|
+
for node in ast.walk(tree):
|
|
233
|
+
if isinstance(node, ast.FunctionDef):
|
|
234
|
+
elements.add(f"func:{node.name}")
|
|
235
|
+
elif isinstance(node, ast.AsyncFunctionDef):
|
|
236
|
+
elements.add(f"func:{node.name}")
|
|
237
|
+
elif isinstance(node, ast.ClassDef):
|
|
238
|
+
elements.add(f"class:{node.name}")
|
|
239
|
+
except SyntaxError:
|
|
240
|
+
# Python regex fallback for partial fragments
|
|
241
|
+
for m in re.finditer(r'^\s*def\s+([A-Za-z_]\w*)\s*\(', code_text, re.MULTILINE):
|
|
242
|
+
elements.add(f"func:{m.group(1)}")
|
|
243
|
+
for m in re.finditer(r'^\s*async\s+def\s+([A-Za-z_]\w*)\s*\(', code_text, re.MULTILINE):
|
|
244
|
+
elements.add(f"func:{m.group(1)}")
|
|
245
|
+
for m in re.finditer(r'^\s*class\s+([A-Za-z_]\w*)\s*[:\(]', code_text, re.MULTILINE):
|
|
246
|
+
elements.add(f"class:{m.group(1)}")
|
|
247
|
+
except Exception as e:
|
|
248
|
+
logger.debug(f"Unexpected error parsing AST: {e}")
|
|
249
|
+
|
|
250
|
+
# Try regex patterns for other languages
|
|
251
|
+
# JavaScript/TypeScript functions
|
|
252
|
+
js_func_pattern = r'(?:function|const|let|var)\s+(\w+)\s*(?:=\s*)?(?:\([^)]*\)|\s*=>)'
|
|
253
|
+
for match in re.finditer(js_func_pattern, code_text):
|
|
254
|
+
elements.add(f"func:{match.group(1)}")
|
|
255
|
+
|
|
256
|
+
# Class definitions (multiple languages)
|
|
257
|
+
class_pattern = r'(?:class|interface|struct)\s+(\w+)'
|
|
258
|
+
for match in re.finditer(class_pattern, code_text):
|
|
259
|
+
elements.add(f"class:{match.group(1)}")
|
|
260
|
+
|
|
261
|
+
return elements
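
To make the fallback behaviour above concrete, here is a small, self-contained exercise of the same idea: valid Python goes through `ast`, while a truncated fragment falls through to the regexes. It mirrors the patterns above rather than importing from the package:

```python
# Sketch: the same extract-names idea on two inputs, parsable Python and a truncated fragment.
import ast
import re

GOOD = "class Importer:\n    def run(self):\n        pass\n"
TRUNCATED = "async def stream_import_file(jsonl_file, collection_name"  # no body -> SyntaxError

def names(code: str) -> set[str]:
    found = set()
    try:
        for node in ast.walk(ast.parse(code)):
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                found.add(f"func:{node.name}")
            elif isinstance(node, ast.ClassDef):
                found.add(f"class:{node.name}")
    except SyntaxError:
        for m in re.finditer(r'^\s*(?:async\s+)?def\s+([A-Za-z_]\w*)\s*\(', code, re.MULTILINE):
            found.add(f"func:{m.group(1)}")
        for m in re.finditer(r'^\s*class\s+([A-Za-z_]\w*)\s*[:\(]', code, re.MULTILINE):
            found.add(f"class:{m.group(1)}")
    return found

print(names(GOOD))       # {'class:Importer', 'func:run'}
print(names(TRUNCATED))  # {'func:stream_import_file'}
```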
|
|
262
|
+
|
|
263
|
+
def extract_concepts(text: str) -> List[str]:
|
|
264
|
+
"""Extract development concepts from text."""
|
|
265
|
+
concepts = []
|
|
266
|
+
concept_patterns = {
|
|
267
|
+
'docker': r'\b(?:docker|container|compose|dockerfile)\b',
|
|
268
|
+
'testing': r'\b(?:test|testing|unittest|pytest|jest)\b',
|
|
269
|
+
'database': r'\b(?:database|sql|postgres|mysql|mongodb|qdrant)\b',
|
|
270
|
+
'api': r'\b(?:api|rest|graphql|endpoint)\b',
|
|
271
|
+
'security': r'\b(?:security|auth|authentication|encryption)\b',
|
|
272
|
+
'performance': r'\b(?:performance|optimization|cache|speed)\b',
|
|
273
|
+
'debugging': r'\b(?:debug|debugging|error|bug|trace)\b',
|
|
274
|
+
'deployment': r'\b(?:deploy|deployment|ci\/cd|production)\b',
|
|
275
|
+
'git': r'\b(?:git|commit|branch|merge|pull request)\b',
|
|
276
|
+
'mcp': r'\b(?:mcp|claude-self-reflect|claude code)\b',
|
|
277
|
+
'embeddings': r'\b(?:embedding|vector|semantic|similarity)\b',
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
text_lower = text.lower()
|
|
281
|
+
for concept, pattern in concept_patterns.items():
|
|
282
|
+
if re.search(pattern, text_lower, re.IGNORECASE):
|
|
283
|
+
if concept not in concepts:
|
|
284
|
+
concepts.append(concept)
|
|
285
|
+
|
|
286
|
+
return concepts[:MAX_CONCEPTS]
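
A quick way to sanity-check the concept patterns above is to run them against a sample sentence and confirm the expected tags come back. A minimal sketch with two of the regexes copied verbatim; the full table lives in the function above:

```python
# Sketch: exercise two of the concept regexes against a sample sentence.
import re

patterns = {
    "docker": r"\b(?:docker|container|compose|dockerfile)\b",
    "embeddings": r"\b(?:embedding|vector|semantic|similarity)\b",
}

text = "Rebuilt the Docker compose stack and re-ran the embedding import."
hits = [name for name, pat in patterns.items() if re.search(pat, text.lower(), re.IGNORECASE)]
print(hits)  # ['docker', 'embeddings']
```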
|
|
287
|
+
|
|
288
|
+
def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str, int]:
|
|
289
|
+
"""Extract metadata in a single pass, return metadata, first timestamp, and message count."""
|
|
185
290
|
metadata = {
|
|
186
291
|
"files_analyzed": [],
|
|
187
292
|
"files_edited": [],
|
|
188
293
|
"tools_used": [],
|
|
189
|
-
"concepts": []
|
|
294
|
+
"concepts": [],
|
|
295
|
+
"ast_elements": [],
|
|
296
|
+
"has_code_blocks": False,
|
|
297
|
+
"total_messages": 0
|
|
190
298
|
}
|
|
191
299
|
|
|
192
300
|
first_timestamp = None
|
|
301
|
+
message_count = 0
|
|
302
|
+
all_text = []
|
|
193
303
|
|
|
194
304
|
try:
|
|
195
305
|
with open(file_path, 'r', encoding='utf-8') as f:
|
|
@@ -204,53 +314,107 @@ def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str]:
|
|
|
204
314
|
if first_timestamp is None and 'timestamp' in data:
|
|
205
315
|
first_timestamp = data.get('timestamp')
|
|
206
316
|
|
|
207
|
-
#
|
|
317
|
+
# Count messages
|
|
208
318
|
if 'message' in data and data['message']:
|
|
209
319
|
msg = data['message']
|
|
320
|
+
if msg.get('role') in ['user', 'assistant']:
|
|
321
|
+
message_count += 1
|
|
322
|
+
|
|
210
323
|
if msg.get('content'):
|
|
211
324
|
content = msg['content']
|
|
325
|
+
text_content = ""
|
|
326
|
+
|
|
212
327
|
if isinstance(content, list):
|
|
213
328
|
for item in content:
|
|
214
|
-
if isinstance(item, dict)
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
329
|
+
if isinstance(item, dict):
|
|
330
|
+
if item.get('type') == 'text':
|
|
331
|
+
text_content += item.get('text', '')
|
|
332
|
+
# Check for code blocks
|
|
333
|
+
if '```' in item.get('text', ''):
|
|
334
|
+
metadata['has_code_blocks'] = True
|
|
335
|
+
# Extract code for AST analysis with bounds checking
|
|
336
|
+
if len(metadata['ast_elements']) < 30:
|
|
337
|
+
# Fix: More permissive regex to handle various fence formats
|
|
338
|
+
code_blocks = re.findall(r'```[^\n]*\n?(.*?)```', item.get('text', ''), re.DOTALL)
|
|
339
|
+
for code_block in code_blocks[:5]: # Limit to 5 blocks
|
|
340
|
+
if len(metadata['ast_elements']) >= 30:
|
|
341
|
+
break
|
|
342
|
+
ast_elems = extract_ast_elements(code_block)
|
|
343
|
+
for elem in list(ast_elems)[:10]: # Limit elements per block
|
|
344
|
+
if elem not in metadata['ast_elements'] and len(metadata['ast_elements']) < 30:
|
|
345
|
+
metadata['ast_elements'].append(elem)
|
|
218
346
|
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
if
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
347
|
+
elif item.get('type') == 'tool_use':
|
|
348
|
+
tool_name = item.get('name', '')
|
|
349
|
+
if tool_name and tool_name not in metadata['tools_used']:
|
|
350
|
+
metadata['tools_used'].append(tool_name)
|
|
351
|
+
|
|
352
|
+
# Extract file references
|
|
353
|
+
if 'input' in item:
|
|
354
|
+
input_data = item['input']
|
|
355
|
+
if isinstance(input_data, dict):
|
|
356
|
+
# Determine if it's an edit tool
|
|
357
|
+
is_edit = tool_name in ['Edit', 'Write', 'MultiEdit', 'NotebookEdit']
|
|
358
|
+
|
|
359
|
+
if 'file_path' in input_data:
|
|
360
|
+
file_ref = input_data['file_path']
|
|
361
|
+
if is_edit:
|
|
362
|
+
if file_ref not in metadata['files_edited']:
|
|
363
|
+
metadata['files_edited'].append(file_ref)
|
|
364
|
+
else:
|
|
365
|
+
if file_ref not in metadata['files_analyzed']:
|
|
366
|
+
metadata['files_analyzed'].append(file_ref)
|
|
367
|
+
|
|
368
|
+
if 'path' in input_data:
|
|
369
|
+
file_ref = input_data['path']
|
|
370
|
+
if file_ref not in metadata['files_analyzed']:
|
|
371
|
+
metadata['files_analyzed'].append(file_ref)
|
|
372
|
+
elif isinstance(item, str):
|
|
373
|
+
text_content += item
|
|
374
|
+
elif isinstance(content, str):
|
|
375
|
+
text_content = content
|
|
376
|
+
|
|
377
|
+
# Collect text for concept extraction
|
|
378
|
+
if text_content:
|
|
379
|
+
all_text.append(text_content[:1000]) # Limit text per message
|
|
231
380
|
|
|
232
381
|
except json.JSONDecodeError:
|
|
233
382
|
continue
|
|
234
383
|
except Exception:
|
|
235
384
|
continue
|
|
236
|
-
|
|
385
|
+
|
|
237
386
|
except Exception as e:
|
|
238
387
|
logger.warning(f"Error extracting metadata: {e}")
|
|
239
388
|
|
|
240
|
-
|
|
389
|
+
# Extract concepts from collected text
|
|
390
|
+
if all_text:
|
|
391
|
+
combined_text = ' '.join(all_text[:50]) # Limit to first 50 messages
|
|
392
|
+
metadata['concepts'] = extract_concepts(combined_text)
|
|
393
|
+
|
|
394
|
+
# Set total messages
|
|
395
|
+
metadata['total_messages'] = message_count
|
|
396
|
+
|
|
397
|
+
# Limit arrays
|
|
398
|
+
metadata['files_analyzed'] = metadata['files_analyzed'][:20]
|
|
399
|
+
metadata['files_edited'] = metadata['files_edited'][:20]
|
|
400
|
+
metadata['tools_used'] = metadata['tools_used'][:15]
|
|
401
|
+
metadata['ast_elements'] = metadata['ast_elements'][:30]
|
|
402
|
+
|
|
403
|
+
return metadata, first_timestamp or datetime.now().isoformat(), message_count
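
The function now returns a three-tuple, so callers (see `stream_import_file` below) unpack the metadata, the first timestamp, and the message count in one go. A hedged sketch of the JSONL shape it expects and how the result is consumed; the module name in the import is hypothetical (this diff does not show the file's real name), and the sample content is made up for illustration:

```python
# Sketch: what a caller of extract_metadata_single_pass() sees. Sample JSONL is illustrative.
import json
import tempfile

from streaming_importer import extract_metadata_single_pass  # hypothetical module name

sample = {
    "timestamp": "2025-09-08T00:00:00Z",
    "message": {"role": "user", "content": "Debugging the docker compose import"},
}
with tempfile.NamedTemporaryFile("w", suffix=".jsonl", delete=False) as f:
    f.write(json.dumps(sample) + "\n")
    path = f.name  # cleanup of the temp file omitted for brevity

metadata, created_at, total_messages = extract_metadata_single_pass(path)
print(total_messages)        # 1
print(metadata["concepts"])  # e.g. ['docker', 'debugging']
print(created_at)            # '2025-09-08T00:00:00Z'
```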
|
|
241
404
|
|
|
242
405
|
def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Path) -> int:
|
|
243
406
|
"""Stream import a single JSONL file without loading it into memory."""
|
|
244
407
|
logger.info(f"Streaming import of {jsonl_file.name}")
|
|
245
408
|
|
|
246
409
|
# Extract metadata in first pass (lightweight)
|
|
247
|
-
metadata, created_at = extract_metadata_single_pass(str(jsonl_file))
|
|
410
|
+
metadata, created_at, total_messages = extract_metadata_single_pass(str(jsonl_file))
|
|
248
411
|
|
|
249
412
|
# Stream messages and process in chunks
|
|
250
413
|
chunk_buffer = []
|
|
251
414
|
chunk_index = 0
|
|
252
415
|
total_chunks = 0
|
|
253
416
|
conversation_id = jsonl_file.stem
|
|
417
|
+
current_message_index = 0
|
|
254
418
|
|
|
255
419
|
try:
|
|
256
420
|
with open(jsonl_file, 'r', encoding='utf-8') as f:
|
|
@@ -282,16 +446,24 @@ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Pat
|
|
|
282
446
|
content = '\n'.join(text_parts)
|
|
283
447
|
|
|
284
448
|
if content:
|
|
449
|
+
# Track message index for user/assistant messages
|
|
450
|
+
if msg['role'] in ['user', 'assistant']:
|
|
451
|
+
current_message_index += 1
|
|
452
|
+
message_idx = current_message_index
|
|
453
|
+
else:
|
|
454
|
+
message_idx = 0
|
|
455
|
+
|
|
285
456
|
chunk_buffer.append({
|
|
286
457
|
'role': msg['role'],
|
|
287
|
-
'content': content
|
|
458
|
+
'content': content,
|
|
459
|
+
'message_index': message_idx
|
|
288
460
|
})
|
|
289
461
|
|
|
290
462
|
# Process chunk when buffer reaches MAX_CHUNK_SIZE
|
|
291
463
|
if len(chunk_buffer) >= MAX_CHUNK_SIZE:
|
|
292
464
|
chunks = process_and_upload_chunk(
|
|
293
465
|
chunk_buffer, chunk_index, conversation_id,
|
|
294
|
-
created_at, metadata, collection_name, project_path
|
|
466
|
+
created_at, metadata, collection_name, project_path, total_messages
|
|
295
467
|
)
|
|
296
468
|
total_chunks += chunks
|
|
297
469
|
chunk_buffer = []
|
|
@@ -313,7 +485,7 @@ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Pat
|
|
|
313
485
|
if chunk_buffer:
|
|
314
486
|
chunks = process_and_upload_chunk(
|
|
315
487
|
chunk_buffer, chunk_index, conversation_id,
|
|
316
|
-
created_at, metadata, collection_name, project_path
|
|
488
|
+
created_at, metadata, collection_name, project_path, total_messages
|
|
317
489
|
)
|
|
318
490
|
total_chunks += chunks
|
|
319
491
|
|
|
@@ -335,10 +507,19 @@ def load_state() -> dict:
|
|
|
335
507
|
return {"imported_files": {}}
|
|
336
508
|
|
|
337
509
|
def save_state(state: dict):
|
|
338
|
-
"""Save import state."""
|
|
339
|
-
|
|
340
|
-
|
|
510
|
+
"""Save import state with atomic write."""
|
|
511
|
+
# Fix: Handle case where STATE_FILE has no directory component
|
|
512
|
+
state_dir = os.path.dirname(STATE_FILE)
|
|
513
|
+
if state_dir:
|
|
514
|
+
os.makedirs(state_dir, exist_ok=True)
|
|
515
|
+
|
|
516
|
+
# Use atomic write to prevent corruption during crashes
|
|
517
|
+
temp_file = f"{STATE_FILE}.tmp"
|
|
518
|
+
with open(temp_file, 'w') as f:
|
|
341
519
|
json.dump(state, f, indent=2)
|
|
520
|
+
|
|
521
|
+
# Atomic rename (on POSIX systems)
|
|
522
|
+
os.replace(temp_file, STATE_FILE)
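
The write-to-temp-then-`os.replace` pattern above is what makes the state file crash-safe: the rename swaps the file in a single step, so readers see either the old state or the new one, never a half-written file. A generic sketch of the same pattern as a reusable helper (an illustration, not a helper this package ships); an `fsync` before the rename is a common extra precaution:

```python
# Sketch: generic atomic JSON write, same temp-file + os.replace pattern as save_state().
import json
import os

def atomic_write_json(path: str, data: dict) -> None:
    tmp = f"{path}.tmp"
    with open(tmp, "w") as f:
        json.dump(data, f, indent=2)
        f.flush()
        os.fsync(f.fileno())  # push bytes to disk before the rename
    os.replace(tmp, path)     # atomic on POSIX; also replaces an existing file on Windows

atomic_write_json("imported-files.json", {"imported_files": {}})
```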
|
|
342
523
|
|
|
343
524
|
def should_import_file(file_path: Path, state: dict) -> bool:
|
|
344
525
|
"""Check if file should be imported."""
|