claude-self-reflect 2.5.18 → 2.5.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/claude-self-reflect-test.md +76 -0
- package/.claude/agents/mcp-integration.md +45 -0
- package/.claude/agents/qdrant-specialist.md +41 -0
- package/README.md +25 -0
- package/installer/setup-wizard-docker.js +65 -1
- package/mcp-server/pyproject.toml +1 -1
- package/mcp-server/src/server.py +91 -32
- package/mcp-server/src/status.py +2 -2
- package/package.json +3 -1
- package/scripts/delta-metadata-update-safe.py +442 -0
- package/scripts/force-metadata-recovery.py +305 -0
package/.claude/agents/claude-self-reflect-test.md
CHANGED

@@ -13,6 +13,82 @@ You are a resilient and comprehensive testing specialist for Claude Self-Reflect
 - MCP tools enable reflection and memory storage
 - System must handle sensitive API keys securely
 
+## Comprehensive Test Suite
+
+### Available Test Categories
+The project now includes a comprehensive test suite in `/tests/` directory:
+
+1. **MCP Tool Integration** (`test_mcp_tools_comprehensive.py`)
+   - All MCP tools with various parameters
+   - Edge cases and error handling
+   - Cross-project search validation
+
+2. **Memory Decay** (`test_memory_decay.py`)
+   - Decay calculations and half-life variations
+   - Score adjustments and ranking changes
+   - Performance impact measurements
+
+3. **Multi-Project Support** (`test_multi_project.py`)
+   - Project isolation and collection naming
+   - Cross-project search functionality
+   - Metadata storage and retrieval
+
+4. **Embedding Models** (`test_embedding_models.py`)
+   - FastEmbed vs Voyage AI switching
+   - Dimension compatibility (384 vs 1024)
+   - Model performance comparisons
+
+5. **Delta Metadata** (`test_delta_metadata.py`)
+   - Tool usage extraction
+   - File reference tracking
+   - Incremental updates without re-embedding
+
+6. **Performance & Load** (`test_performance_load.py`)
+   - Large conversation imports (>1000 chunks)
+   - Concurrent operations
+   - Memory and CPU monitoring
+
+7. **Data Integrity** (`test_data_integrity.py`)
+   - Duplicate detection
+   - Unicode handling
+   - Chunk ordering preservation
+
+8. **Recovery Scenarios** (`test_recovery_scenarios.py`)
+   - Partial import recovery
+   - Container restart resilience
+   - State file corruption handling
+
+9. **Security** (`test_security.py`)
+   - API key validation
+   - Input sanitization
+   - Path traversal prevention
+
+### Running the Test Suite
+```bash
+# Run ALL tests
+cd ~/projects/claude-self-reflect
+python tests/run_all_tests.py
+
+# Run specific categories
+python tests/run_all_tests.py -c mcp_tools memory_decay multi_project
+
+# Run with verbose output
+python tests/run_all_tests.py -v
+
+# List available test categories
+python tests/run_all_tests.py --list
+
+# Run individual test files
+python tests/test_mcp_tools_comprehensive.py
+python tests/test_memory_decay.py
+python tests/test_multi_project.py
+```
+
+### Test Results Location
+- JSON results: `tests/test_results.json`
+- Contains timestamps, durations, pass/fail counts
+- Useful for tracking test history
+
 ## Key Responsibilities
 
 1. **System State Detection**
package/.claude/agents/mcp-integration.md
CHANGED

@@ -16,6 +16,51 @@ You are an MCP server development specialist for the memento-stack project. You
 - Supports both local (FastEmbed) and cloud (Voyage AI) embeddings
 - MCP determines project from working directory context
 
+## Available Test Suites
+
+### MCP-Specific Tests
+1. **Comprehensive MCP Tool Tests** (`tests/test_mcp_tools_comprehensive.py`)
+   - Tests all MCP tools: reflect_on_past, store_reflection, quick_search, search_summary
+   - Edge case handling and error scenarios
+   - Parameter validation (limit, min_score, use_decay, response_format)
+   - Cross-project search with project="all"
+   - Run with: `python tests/test_mcp_tools_comprehensive.py`
+
+2. **MCP Search Tests** (`scripts/test-mcp-search.py`)
+   - Basic MCP search functionality
+   - Integration with Qdrant backend
+   - Response parsing and formatting
+   - Run with: `python scripts/test-mcp-search.py`
+
+3. **MCP Robustness Tests** (`scripts/test-mcp-robustness.py`)
+   - Error recovery mechanisms
+   - Timeout handling
+   - Connection resilience
+   - Run with: `python scripts/test-mcp-robustness.py`
+
+### Running MCP Tests
+```bash
+# Run all MCP tests
+cd ~/projects/claude-self-reflect
+python tests/run_all_tests.py -c mcp_tools mcp_search
+
+# Test MCP server directly
+cd mcp-server && python test_server.py
+
+# Verify MCP registration in Claude Code
+claude mcp list | grep claude-self-reflect
+
+# Test MCP tools from Python
+python -c "from mcp_server.src.server import reflect_on_past; import asyncio; asyncio.run(reflect_on_past({'query': 'test', 'limit': 5}))"
+```
+
+### MCP Tool Parameters Reference
+- **reflect_on_past**: query, limit, brief, min_score, project, use_decay, response_format, include_raw
+- **store_reflection**: content, tags
+- **quick_search**: query, min_score, project
+- **search_by_file**: file_path, limit, project
+- **search_by_concept**: concept, include_files, limit, project
+
 ## Key Responsibilities
 
 1. **MCP Server Development**
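For context, the `python -c` one-liner documented above can be expanded into a small standalone script. This is only a sketch: it mirrors the dict-style argument used in that one-liner, and the exact call convention of `reflect_on_past` (single dict vs. keyword arguments) outside the MCP runtime is an assumption, not confirmed by this diff.

```python
# Hypothetical sketch: exercising reflect_on_past outside Claude Code.
# Mirrors the one-liner in the hunk above; the dict-style argument and the
# chosen parameter values are assumptions for illustration only.
import asyncio

from mcp_server.src.server import reflect_on_past


async def main() -> None:
    # Parameter names taken from the "MCP Tool Parameters Reference" list above.
    result = await reflect_on_past({
        "query": "docker memory limits",
        "limit": 5,
        "min_score": 0.7,
        "project": "all",
    })
    print(result)


if __name__ == "__main__":
    asyncio.run(main())
```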
package/.claude/agents/qdrant-specialist.md
CHANGED

@@ -49,6 +49,47 @@ You are a Qdrant vector database specialist for the memento-stack project. Your
 - **Monitor baseline + headroom** - Measure actual usage before setting limits
 - **Use cgroup-aware CPU monitoring** - Docker shows all CPUs but has limits
 
+## Available Test Suites
+
+### Qdrant-Specific Tests
+1. **Multi-Project Support Tests** (`tests/test_multi_project.py`)
+   - Collection isolation verification
+   - Cross-project search functionality
+   - Collection naming consistency
+   - Project metadata storage
+   - Run with: `python tests/test_multi_project.py`
+
+2. **Data Integrity Tests** (`tests/test_data_integrity.py`)
+   - Duplicate detection
+   - Chunk ordering preservation
+   - Unicode and special character handling
+   - Collection consistency checks
+
+3. **Performance Tests** (`tests/test_performance_load.py`)
+   - Large conversation imports (>1000 chunks)
+   - Concurrent search requests
+   - Memory usage patterns
+   - Collection size limits
+
+### How to Run Tests
+```bash
+# Run all Qdrant-related tests
+cd ~/projects/claude-self-reflect
+python tests/run_all_tests.py -c multi_project data_integrity performance
+
+# Check collection health
+docker exec claude-reflection-qdrant curl -s http://localhost:6333/collections | jq
+
+# Verify specific collection
+python -c "from qdrant_client import QdrantClient; c=QdrantClient('localhost', 6333); print(c.get_collection('conv_HASH_local'))"
+```
+
+### Common Issues & Solutions
+- **Dimension mismatch**: Check embedding model (384 for local, 1024 for voyage)
+- **Empty search results**: Verify collection exists and has points
+- **Slow searches**: Check collection size and optimize with filters
+- **Collection not found**: Verify project name normalization and MD5 hash
+
 ### Quality Gates
 - **Follow the workflow**: implementation → review → test → docs → release
 - **Use pre-releases for major changes** - Better to test than break production
package/README.md
CHANGED

@@ -176,8 +176,33 @@ Recent conversations matter more. Old ones fade. Like your brain, but reliable.
 - [GitHub Issues](https://github.com/ramakay/claude-self-reflect/issues)
 - [Discussions](https://github.com/ramakay/claude-self-reflect/discussions)
 
+## Upgrading to v2.5.19
+
+### 🆕 New Feature: Metadata Enrichment
+v2.5.19 adds searchable metadata to your conversations - concepts, files, and tools!
+
+#### For Existing Users
+```bash
+# Update to latest version
+npm update -g claude-self-reflect
+
+# Run setup - it will detect your existing installation
+claude-self-reflect setup
+# Choose "yes" when asked about metadata enrichment
+
+# Or manually enrich metadata anytime:
+docker compose run --rm importer python /app/scripts/delta-metadata-update-safe.py
+```
+
+#### What You Get
+- `search_by_concept("docker")` - Find conversations by topic
+- `search_by_file("server.py")` - Find conversations that touched specific files
+- Better search accuracy with metadata-based filtering
+
 ## What's New
 
+- **v2.5.19** - Metadata Enrichment! Search by concepts, files, and tools. [Full release notes](docs/releases/v2.5.19-RELEASE-NOTES.md)
+- **v2.5.18** - Security dependency updates
 - **v2.5.17** - Critical CPU fix and memory limit adjustment. [Full release notes](docs/releases/v2.5.17-release-notes.md)
 - **v2.5.16** - (Pre-release only) Initial streaming importer with CPU throttling
 - **v2.5.15** - Critical bug fixes and collection creation improvements
package/installer/setup-wizard-docker.js
CHANGED

@@ -406,6 +406,54 @@ async function importConversations() {
   }
 }
 
+async function enrichMetadata() {
+  console.log('\n🔍 Metadata Enrichment (NEW in v2.5.19!)...');
+  console.log('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
+  console.log('This feature enhances your conversations with searchable metadata:');
+  console.log(' • Concepts: High-level topics (docker, security, testing, etc.)');
+  console.log(' • Files: Track which files were analyzed or edited');
+  console.log(' • Tools: Record which Claude tools were used');
+  console.log('\nEnables powerful searches like:');
+  console.log(' • search_by_concept("docker")');
+  console.log(' • search_by_file("server.py")');
+
+  const enrichChoice = await question('\nEnrich past conversations with metadata? (recommended) (y/n): ');
+
+  if (enrichChoice.toLowerCase() === 'y') {
+    console.log('\n⏳ Starting metadata enrichment (safe mode)...');
+    console.log(' • Processing last 30 days of conversations');
+    console.log(' • Using conservative rate limiting');
+    console.log(' • This may take 5-10 minutes\n');
+
+    try {
+      // Run the safe delta update script
+      safeExec('docker', [
+        'compose', 'run', '--rm',
+        '-e', 'DAYS_TO_UPDATE=30',
+        '-e', 'BATCH_SIZE=2',
+        '-e', 'RATE_LIMIT_DELAY=0.5',
+        '-e', 'MAX_CONCURRENT_UPDATES=2',
+        'importer',
+        'python', '/app/scripts/delta-metadata-update-safe.py'
+      ], {
+        cwd: projectRoot,
+        stdio: 'inherit'
+      });
+
+      console.log('\n✅ Metadata enrichment completed successfully!');
+      console.log(' Your conversations now have searchable concepts and file tracking.');
+    } catch (error) {
+      console.log('\n⚠️ Metadata enrichment had some issues but continuing setup');
+      console.log(' You can retry later with:');
+      console.log(' docker compose run --rm importer python /app/scripts/delta-metadata-update-safe.py');
+    }
+  } else {
+    console.log('\n📝 Skipping metadata enrichment.');
+    console.log(' You can run it later with:');
+    console.log(' docker compose run --rm importer python /app/scripts/delta-metadata-update-safe.py');
+  }
+}
+
 async function showFinalInstructions() {
   console.log('\n✅ Setup complete!');
 
@@ -419,6 +467,7 @@ async function showFinalInstructions() {
   console.log(' • Check status: docker compose ps');
   console.log(' • View logs: docker compose logs -f');
   console.log(' • Import conversations: docker compose run --rm importer');
+  console.log(' • Enrich metadata: docker compose run --rm importer python /app/scripts/delta-metadata-update-safe.py');
   console.log(' • Start watcher: docker compose --profile watch up -d');
   console.log(' • Stop all: docker compose down');
 
@@ -450,13 +499,25 @@ async function checkExistingInstallation() {
     console.log(' • 🔍 Mode: ' + (localMode ? 'Local embeddings (privacy mode)' : 'Cloud embeddings (Voyage AI)'));
     console.log(' • ⚡ Memory decay: Enabled (90-day half-life)');
 
+    // Offer metadata enrichment for v2.5.19
+    console.log('\n🆕 NEW in v2.5.19: Metadata Enrichment!');
+    console.log(' Enhance your conversations with searchable concepts and file tracking.');
+
+    const upgradeChoice = await question('\nWould you like to enrich your conversations with metadata? (y/n): ');
+
+    if (upgradeChoice.toLowerCase() === 'y') {
+      await enrichMetadata();
+      console.log('\n✅ Upgrade complete! Your conversations now have enhanced search capabilities.');
+    }
+
     console.log('\n📋 Quick Commands:');
     console.log(' • View status: docker compose ps');
     console.log(' • View logs: docker compose logs -f');
+    console.log(' • Enrich metadata: docker compose run --rm importer python /app/scripts/delta-metadata-update-safe.py');
     console.log(' • Restart: docker compose restart');
     console.log(' • Stop: docker compose down');
 
-    console.log('\n💡 To re-run setup, first stop services with: docker compose down');
+    console.log('\n💡 To re-run full setup, first stop services with: docker compose down');
     return true;
   }
 }

@@ -504,6 +565,9 @@ async function main() {
   // Import conversations
   await importConversations();
 
+  // Enrich metadata (new in v2.5.19)
+  await enrichMetadata();
+
   // Show final instructions
   await showFinalInstructions();
 
package/mcp-server/src/server.py
CHANGED

@@ -522,7 +522,7 @@ async def reflect_on_past(
 
 # Handle project matching - check if the target project name appears at the end of the stored project path
 if target_project != 'all' and not project_collections:
-    # The stored project name is like "-Users-
+    # The stored project name is like "-Users-username-projects-ShopifyMCPMockShop"
     # We want to match just "ShopifyMCPMockShop"
     if not point_project.endswith(f"-{target_project}") and point_project != target_project:
         continue  # Skip results from other projects

@@ -602,7 +602,7 @@ async def reflect_on_past(
 
 # Handle project matching - check if the target project name appears at the end of the stored project path
 if target_project != 'all' and not project_collections:
-    # The stored project name is like "-Users-
+    # The stored project name is like "-Users-username-projects-ShopifyMCPMockShop"
     # We want to match just "ShopifyMCPMockShop"
     if not point_project.endswith(f"-{target_project}") and point_project != target_project:
         continue  # Skip results from other projects

@@ -639,7 +639,7 @@ async def reflect_on_past(
 
 # Handle project matching - check if the target project name appears at the end of the stored project path
 if target_project != 'all' and not project_collections:
-    # The stored project name is like "-Users-
+    # The stored project name is like "-Users-username-projects-ShopifyMCPMockShop"
     # We want to match just "ShopifyMCPMockShop"
     if not point_project.endswith(f"-{target_project}") and point_project != target_project:
         continue  # Skip results from other projects

@@ -1169,7 +1169,8 @@ async def search_by_concept(
 
     if project and project != 'all':
         # Filter collections for specific project
-
+        normalized_project = normalize_project_name(project)
+        project_hash = hashlib.md5(normalized_project.encode()).hexdigest()[:8]
         collection_prefix = f"conv_{project_hash}_"
         collections = [c for c in await get_all_collections() if c.startswith(collection_prefix)]
     elif project == 'all':

@@ -1178,49 +1179,101 @@ async def search_by_concept(
     if not collections:
         return "<search_by_concept>\n<error>No collections found to search</error>\n</search_by_concept>"
 
-    #
-
+    # First, check metadata health
+    metadata_found = False
+    total_points_checked = 0
 
-    for collection_name in collections:
+    for collection_name in collections[:3]:  # Sample first 3 collections
         try:
-
-            results = await qdrant_client.search(
+            sample_points, _ = await qdrant_client.scroll(
                 collection_name=collection_name,
-
-                query_filter=models.Filter(
-                    should=[
-                        models.FieldCondition(
-                            key="concepts",
-                            match=models.MatchAny(any=[concept.lower()])
-                        )
-                    ]
-                ),
-                limit=limit * 2, # Get more results for better filtering
+                limit=10,
                 with_payload=True
             )
-
-            for point in
-            payload
-
-
-
-
-
-                'collection': collection_name
-            })
-
-        except Exception as e:
+            total_points_checked += len(sample_points)
+            for point in sample_points:
+                if 'concepts' in point.payload and point.payload['concepts']:
+                    metadata_found = True
+                    break
+            if metadata_found:
+                break
+        except:
             continue
 
+    # Search all collections
+    all_results = []
+
+    # If metadata exists, try metadata-based search first
+    if metadata_found:
+        for collection_name in collections:
+            try:
+                # Hybrid search: semantic + concept filter
+                results = await qdrant_client.search(
+                    collection_name=collection_name,
+                    query_vector=embedding,
+                    query_filter=models.Filter(
+                        should=[
+                            models.FieldCondition(
+                                key="concepts",
+                                match=models.MatchAny(any=[concept.lower()])
+                            )
+                        ]
+                    ),
+                    limit=limit * 2, # Get more results for better filtering
+                    with_payload=True
+                )
+
+                for point in results:
+                    payload = point.payload
+                    # Boost score if concept is in the concepts list
+                    score_boost = 0.2 if concept.lower() in payload.get('concepts', []) else 0.0
+                    all_results.append({
+                        'score': float(point.score) + score_boost,
+                        'payload': payload,
+                        'collection': collection_name,
+                        'search_type': 'metadata'
+                    })
+
+            except Exception as e:
+                continue
+
+    # If no results from metadata search OR no metadata exists, fall back to semantic search
+    if not all_results:
+        await ctx.debug(f"Falling back to semantic search for concept: {concept}")
+
+        for collection_name in collections:
+            try:
+                # Pure semantic search without filters
+                results = await qdrant_client.search(
+                    collection_name=collection_name,
+                    query_vector=embedding,
+                    limit=limit,
+                    score_threshold=0.5, # Lower threshold for broader results
+                    with_payload=True
+                )
+
+                for point in results:
+                    all_results.append({
+                        'score': float(point.score),
+                        'payload': point.payload,
+                        'collection': collection_name,
+                        'search_type': 'semantic'
+                    })
+
+            except Exception as e:
+                continue
+
     # Sort by score and limit
     all_results.sort(key=lambda x: x['score'], reverse=True)
     all_results = all_results[:limit]
 
     # Format results
     if not all_results:
+        metadata_status = "with metadata" if metadata_found else "NO METADATA FOUND"
         return f"""<search_by_concept>
 <concept>{concept}</concept>
-<
+<metadata_health>{metadata_status} (checked {total_points_checked} points)</metadata_health>
+<message>No conversations found about this concept. {'Try running: python scripts/delta-metadata-update.py' if not metadata_found else 'Try different search terms.'}</message>
 </search_by_concept>"""
 
     results_text = []

@@ -1255,8 +1308,14 @@ async def search_by_concept(
 <preview>{text_preview}</preview>
 </result>""")
 
+    # Determine if this was a fallback search
+    used_fallback = any(r.get('search_type') == 'semantic' for r in all_results)
+    metadata_status = "with metadata" if metadata_found else "NO METADATA FOUND"
+
     return f"""<search_by_concept>
 <concept>{concept}</concept>
+<metadata_health>{metadata_status} (checked {total_points_checked} points)</metadata_health>
+<search_type>{'fallback_semantic' if used_fallback else 'metadata_based'}</search_type>
 <count>{len(all_results)}</count>
 <results>
 {''.join(results_text)}
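The metadata health check added to `search_by_concept` above can also be run as a standalone diagnostic. The sketch below uses the synchronous `qdrant_client` API rather than the server's async client, and the collection name is a placeholder (see the qdrant-specialist notes earlier in this diff for how real names are derived).

```python
# Sketch: check whether a collection's points carry the new `concepts` metadata,
# mirroring the sampling logic added to search_by_concept in this release.
from qdrant_client import QdrantClient

client = QdrantClient(url="http://localhost:6333")
collection = "conv_HASH_local"  # placeholder collection name

# Scroll a small sample of points with payloads, as the server does (limit=10).
points, _ = client.scroll(
    collection_name=collection,
    limit=10,
    with_payload=True,
)

with_metadata = sum(1 for p in points if p.payload.get("concepts"))
print(f"{with_metadata}/{len(points)} sampled points carry concept metadata")
if with_metadata == 0:
    print("No metadata found - consider running delta-metadata-update-safe.py")
```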
package/mcp-server/src/status.py
CHANGED

@@ -13,8 +13,8 @@ def extract_project_name_from_path(file_path: str) -> str:
     """Extract project name from JSONL file path.
 
     Handles paths like:
-    - ~/.claude/projects/-Users-
-    - /logs/-Users-
+    - ~/.claude/projects/-Users-username-projects-claude-self-reflect/file.jsonl
+    - /logs/-Users-username-projects-n8n-builder/file.jsonl
     """
     # Get the directory name containing the JSONL file
     path_obj = Path(file_path)
package/package.json
CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "claude-self-reflect",
-  "version": "2.5.
+  "version": "2.5.19",
   "description": "Give Claude perfect memory of all your conversations - Installation wizard for Python MCP server",
   "keywords": [
     "claude",

@@ -34,6 +34,8 @@
     "mcp-server/run-mcp.sh",
     "mcp-server/run-mcp-docker.sh",
     "scripts/import-*.py",
+    "scripts/delta-metadata-update-safe.py",
+    "scripts/force-metadata-recovery.py",
     ".claude/agents/*.md",
     "config/qdrant-config.yaml",
     "docker-compose.yaml",
package/scripts/delta-metadata-update-safe.py
ADDED
@@ -0,0 +1,442 @@
#!/usr/bin/env python3
"""
Safe delta metadata update script for Claude Self-Reflect.
Updates existing Qdrant points with tool usage metadata without overwhelming the system.
Includes rate limiting, batch processing, and proper error recovery.
"""

import os
import sys
import json
import hashlib
import re
import time
import asyncio
from datetime import datetime, timedelta
from typing import List, Dict, Any, Set, Tuple, Optional
import logging
from pathlib import Path
from collections import defaultdict

from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue

# Configuration
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
LOGS_DIR = os.getenv("LOGS_DIR", os.path.expanduser("~/.claude/projects"))
STATE_FILE = os.getenv("STATE_FILE", "./config/delta-update-state.json")
PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "true").lower() == "true"
DRY_RUN = os.getenv("DRY_RUN", "false").lower() == "true"
DAYS_TO_UPDATE = int(os.getenv("DAYS_TO_UPDATE", "7"))
BATCH_SIZE = int(os.getenv("BATCH_SIZE", "10"))  # Process N conversations at a time
RATE_LIMIT_DELAY = float(os.getenv("RATE_LIMIT_DELAY", "0.1"))  # Delay between updates
MAX_CONCURRENT_UPDATES = int(os.getenv("MAX_CONCURRENT_UPDATES", "5"))  # Max parallel updates

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Initialize Qdrant client
client = QdrantClient(url=QDRANT_URL, timeout=30)  # Increased timeout

def get_collection_suffix():
    """Get the collection suffix based on embedding type (for new collections only)."""
    return "_local" if PREFER_LOCAL_EMBEDDINGS else "_voyage"

def get_existing_collection_suffix(project_hash: str, max_retries: int = 3) -> str:
    """Detect which collection type actually exists for this project.

    This function checks for existing collections and returns the actual suffix found.
    Only falls back to preference when creating new collections.
    Includes retry logic for resilience against temporary Qdrant unavailability.

    Args:
        project_hash: The MD5 hash of the normalized project name
        max_retries: Maximum number of retry attempts for collection detection

    Returns:
        "_voyage" if voyage collection exists, "_local" if local exists,
        or preference-based suffix if neither exists yet
    """
    for attempt in range(max_retries):
        try:
            collections = client.get_collections().collections
            collection_names = [c.name for c in collections]

            # Check for both possible collection names
            voyage_name = f"conv_{project_hash}_voyage"
            local_name = f"conv_{project_hash}_local"

            # Return the actual collection type that exists
            if voyage_name in collection_names:
                logger.debug(f"Found existing Voyage collection: {voyage_name}")
                return "_voyage"
            elif local_name in collection_names:
                logger.debug(f"Found existing Local collection: {local_name}")
                return "_local"
            else:
                # No existing collection - use preference for new ones
                suffix = get_collection_suffix()
                logger.debug(f"No existing collection for hash {project_hash}, using preference: {suffix}")
                return suffix
        except Exception as e:
            if attempt < max_retries - 1:
                wait_time = 0.5 * (attempt + 1)  # Exponential backoff
                logger.debug(f"Error checking collections (attempt {attempt + 1}/{max_retries}): {e}, retrying in {wait_time}s")
                time.sleep(wait_time)
                continue
            logger.warning(f"Error checking collections after {max_retries} attempts: {e}, falling back to preference")
            return get_collection_suffix()

def normalize_project_name(project_name: str) -> str:
    """Normalize project name by removing path-like prefixes."""
    if project_name.startswith("-"):
        parts = project_name.split("-")
        for i, part in enumerate(parts):
            if part == "projects" and i < len(parts) - 1:
                return "-".join(parts[i+1:])
    return project_name

def normalize_path(path: str) -> str:
    """Normalize file paths for consistency across platforms."""
    if not path:
        return ""
    path = path.replace("/Users/", "~/").replace("\\Users\\", "~\\")
    path = path.replace("\\", "/")
    path = re.sub(r'/+', '/', path)
    return path

def extract_concepts(text: str, tool_usage: Dict[str, Any]) -> Set[str]:
    """Extract high-level concepts from conversation and tool usage."""
    concepts = set()

    concept_patterns = {
        'security': r'(security|vulnerability|CVE|injection|sanitize|escape|auth|token|JWT)',
        'performance': r'(performance|optimization|speed|memory|efficient|benchmark|latency)',
        'testing': r'(test|pytest|unittest|coverage|TDD|spec|assert)',
        'docker': r'(docker|container|compose|dockerfile|kubernetes|k8s)',
        'api': r'(API|REST|GraphQL|endpoint|webhook|http|request)',
        'database': r'(database|SQL|query|migration|schema|postgres|mysql|mongodb|qdrant)',
        'authentication': r'(auth|login|token|JWT|session|oauth|permission)',
        'debugging': r'(debug|error|exception|traceback|log|stack|trace)',
        'refactoring': r'(refactor|cleanup|improve|restructure|optimize|technical debt)',
        'deployment': r'(deploy|CI/CD|release|production|staging|rollout)',
        'git': r'(git|commit|branch|merge|pull request|PR|rebase)',
        'architecture': r'(architecture|design|pattern|structure|component|module)',
        'mcp': r'(MCP|claude-self-reflect|tool|agent|claude code)',
        'embeddings': r'(embedding|vector|semantic|similarity|fastembed|voyage)',
        'search': r'(search|query|find|filter|match|relevance)'
    }

    combined_text = text.lower()
    for concept, pattern in concept_patterns.items():
        if re.search(pattern, combined_text, re.IGNORECASE):
            concepts.add(concept)

    # Check tool usage patterns
    if tool_usage.get('grep_searches'):
        concepts.add('search')
    if tool_usage.get('files_edited') or tool_usage.get('files_created'):
        concepts.add('development')
    if any('test' in str(f).lower() for f in tool_usage.get('files_read', [])):
        concepts.add('testing')
    if any('docker' in str(cmd).lower() for cmd in tool_usage.get('bash_commands', [])):
        concepts.add('docker')

    return concepts

def extract_tool_usage_from_jsonl(jsonl_path: str) -> Dict[str, Any]:
    """Extract all tool usage from a conversation."""
    tool_usage = {
        "files_read": [],
        "files_edited": [],
        "files_created": [],
        "grep_searches": [],
        "bash_commands": [],
        "tools_summary": {}
    }

    try:
        with open(jsonl_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue
                try:
                    data = json.loads(line)
                    if 'message' in data and data['message']:
                        msg = data['message']
                        if msg.get('role') == 'assistant' and msg.get('content'):
                            content = msg['content']
                            if isinstance(content, list):
                                for item in content:
                                    if isinstance(item, dict) and item.get('type') == 'tool_use':
                                        tool_name = item.get('name', '')
                                        inputs = item.get('input', {})

                                        # Track tool usage
                                        tool_usage['tools_summary'][tool_name] = tool_usage['tools_summary'].get(tool_name, 0) + 1

                                        # Extract file paths
                                        if tool_name == 'Read':
                                            file_path = inputs.get('file_path')
                                            if file_path:
                                                tool_usage['files_read'].append(normalize_path(file_path))
                                        elif tool_name in ['Edit', 'Write', 'MultiEdit']:
                                            file_path = inputs.get('file_path')
                                            if file_path:
                                                tool_usage['files_edited'].append(normalize_path(file_path))
                                        elif tool_name == 'Grep':
                                            pattern = inputs.get('pattern')
                                            if pattern:
                                                tool_usage['grep_searches'].append({'pattern': pattern[:100]})
                                        elif tool_name == 'Bash':
                                            command = inputs.get('command', '')[:200]
                                            if command:
                                                tool_usage['bash_commands'].append(command)
                except Exception as e:
                    continue
    except Exception as e:
        logger.error(f"Error reading JSONL file {jsonl_path}: {e}")

    # Deduplicate
    tool_usage['files_read'] = list(set(tool_usage['files_read']))[:20]
    tool_usage['files_edited'] = list(set(tool_usage['files_edited']))[:10]

    return tool_usage

def load_state() -> Dict[str, Any]:
    """Load the current state from file."""
    state_path = Path(STATE_FILE)
    if state_path.exists():
        try:
            with open(state_path, 'r') as f:
                return json.load(f)
        except Exception as e:
            logger.warning(f"Could not load state: {e}")
    return {
        "last_update": None,
        "updated_conversations": {},
        "failed_conversations": {}
    }

def save_state(state: Dict[str, Any]):
    """Save the current state to file."""
    state_path = Path(STATE_FILE)
    state_path.parent.mkdir(parents=True, exist_ok=True)

    state["last_update"] = datetime.now().isoformat()

    try:
        with open(state_path, 'w') as f:
            json.dump(state, f, indent=2)
    except Exception as e:
        logger.error(f"Could not save state: {e}")

async def update_point_metadata_batch(updates: List[Tuple[str, int, Dict, str]]) -> int:
    """Update multiple points in a batch with rate limiting."""
    success_count = 0

    for conversation_id, chunk_index, metadata, collection_name in updates:
        try:
            # Calculate point ID
            point_id_str = hashlib.md5(
                f"{conversation_id}_{chunk_index}".encode()
            ).hexdigest()[:16]
            point_id = int(point_id_str, 16) % (2**63)

            if not DRY_RUN:
                # Update with rate limiting
                client.set_payload(
                    collection_name=collection_name,
                    payload=metadata,
                    points=[point_id],
                    wait=False
                )
                success_count += 1

                # Rate limit to avoid overwhelming Qdrant
                await asyncio.sleep(RATE_LIMIT_DELAY)
            else:
                logger.info(f"[DRY RUN] Would update point {point_id}")
                success_count += 1

        except Exception as e:
            logger.debug(f"Failed to update point {conversation_id}_{chunk_index}: {e}")

    return success_count

async def process_conversation_async(jsonl_file: Path, state: Dict[str, Any]) -> bool:
    """Process a single conversation file asynchronously."""
    try:
        conversation_id = jsonl_file.stem
        project_name = jsonl_file.parent.name

        # Check if already updated
        if conversation_id in state.get("updated_conversations", {}):
            last_updated = state["updated_conversations"][conversation_id].get("updated_at")
            file_mtime = jsonl_file.stat().st_mtime
            if last_updated and last_updated >= file_mtime:
                logger.debug(f"Skipping {conversation_id} - already updated")
                return True

        # Check if previously failed too many times
        failed_info = state.get("failed_conversations", {}).get(conversation_id, {})
        if failed_info.get("retry_count", 0) > 3:
            logger.debug(f"Skipping {conversation_id} - too many failures")
            return False

        logger.info(f"Processing: {conversation_id}")

        # Extract metadata
        tool_usage = extract_tool_usage_from_jsonl(str(jsonl_file))

        # Read conversation text (limited)
        conversation_text = ""
        with open(jsonl_file, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i > 100:  # Limit lines to avoid memory issues
                    break
                if line.strip():
                    try:
                        data = json.loads(line)
                        if 'message' in data and data['message']:
                            msg = data['message']
                            if msg.get('content'):
                                if isinstance(msg['content'], str):
                                    conversation_text += msg['content'][:500] + "\n"
                    except Exception as e:
                        logger.debug(f"Parse error in {jsonl_file}: {e}")
                        continue

        # Extract concepts
        concepts = extract_concepts(conversation_text[:10000], tool_usage)

        # Prepare metadata
        metadata_update = {
            "files_analyzed": tool_usage.get('files_read', [])[:20],
            "files_edited": tool_usage.get('files_edited', [])[:10],
            "tools_used": list(tool_usage.get('tools_summary', {}).keys())[:20],
            "concepts": list(concepts)[:15],
            "has_file_metadata": True,
            "metadata_updated_at": datetime.now().isoformat()
        }

        # Determine collection
        project_hash = hashlib.md5(normalize_project_name(project_name).encode()).hexdigest()[:8]
        # Use smart detection to find the actual collection type
        collection_suffix = get_existing_collection_suffix(project_hash)
        collection_name = f"conv_{project_hash}{collection_suffix}"

        # Check if collection exists
        try:
            collections = client.get_collections().collections
            if collection_name not in [c.name for c in collections]:
                logger.warning(f"Collection {collection_name} not found")
                return False
        except Exception as e:
            logger.error(f"Error checking collection: {e}")
            # Record failure
            state.setdefault("failed_conversations", {})[conversation_id] = {
                "error": str(e),
                "retry_count": failed_info.get("retry_count", 0) + 1,
                "last_attempt": time.time()
            }
            return False

        # Prepare batch updates
        updates = []
        for chunk_index in range(20):  # Most conversations have < 20 chunks
            updates.append((conversation_id, chunk_index, metadata_update, collection_name))

        # Process in batch with rate limiting
        success_count = await update_point_metadata_batch(updates)

        if success_count > 0:
            logger.info(f"Updated {success_count} chunks for {conversation_id}")
            state["updated_conversations"][conversation_id] = {
                "updated_at": time.time(),
                "chunks_updated": success_count,
                "project": project_name
            }
            return True
        else:
            logger.warning(f"No chunks updated for {conversation_id}")
            return False

    except Exception as e:
        logger.error(f"Failed to process {jsonl_file}: {e}")
        return False

async def main_async():
    """Main async function with proper batching and rate limiting."""
    logger.info("=== Starting Safe Delta Metadata Update ===")
    logger.info(f"Configuration:")
    logger.info(f"  Qdrant URL: {QDRANT_URL}")
    logger.info(f"  Days to update: {DAYS_TO_UPDATE}")
    logger.info(f"  Batch size: {BATCH_SIZE}")
    logger.info(f"  Rate limit delay: {RATE_LIMIT_DELAY}s")
    logger.info(f"  Max concurrent: {MAX_CONCURRENT_UPDATES}")

    # Load state
    state = load_state()

    # Get recent files
    recent_files = []
    cutoff_time = datetime.now() - timedelta(days=DAYS_TO_UPDATE)
    logs_path = Path(LOGS_DIR)

    if logs_path.exists():
        for jsonl_file in logs_path.glob("**/*.jsonl"):
            try:
                mtime = datetime.fromtimestamp(jsonl_file.stat().st_mtime)
                if mtime >= cutoff_time:
                    recent_files.append(jsonl_file)
            except:
                continue

    logger.info(f"Found {len(recent_files)} conversations from the past {DAYS_TO_UPDATE} days")

    # Process in batches
    success_count = 0
    failed_count = 0

    for i in range(0, len(recent_files), BATCH_SIZE):
        batch = recent_files[i:i + BATCH_SIZE]
        logger.info(f"Processing batch {i//BATCH_SIZE + 1}/{(len(recent_files) + BATCH_SIZE - 1)//BATCH_SIZE}")

        # Create tasks for concurrent processing
        tasks = []
        for jsonl_file in batch:
            task = asyncio.create_task(process_conversation_async(jsonl_file, state))
            tasks.append(task)

        # Wait for batch to complete
        results = await asyncio.gather(*tasks)

        # Count results
        batch_success = sum(1 for r in results if r)
        batch_failed = len(results) - batch_success
        success_count += batch_success
        failed_count += batch_failed

        # Save state after each batch
        save_state(state)

        # Add delay between batches to avoid overwhelming the system
        if i + BATCH_SIZE < len(recent_files):
            await asyncio.sleep(1.0)

    # Final save
    save_state(state)

    logger.info("=== Delta Update Complete ===")
    logger.info(f"Successfully updated: {success_count} conversations")
    logger.info(f"Failed: {failed_count} conversations")
    logger.info(f"Total conversations in state: {len(state['updated_conversations'])}")

def main():
    """Entry point."""
    asyncio.run(main_async())

if __name__ == "__main__":
    main()
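The script above addresses points by a deterministic ID derived from the conversation ID and chunk index. The snippet below only isolates that derivation from `update_point_metadata_batch`; it can help when checking whether a computed ID actually exists in a collection. If the importer wrote points under different IDs, `set_payload` on these computed IDs may update nothing, which is the mismatch scenario that the next script, `force-metadata-recovery.py`, works around by matching on the `conversation_id` payload instead. The example conversation ID is hypothetical.

```python
# Sketch: the deterministic point ID scheme used by update_point_metadata_batch above.
import hashlib


def delta_point_id(conversation_id: str, chunk_index: int) -> int:
    # md5 of "<conversation_id>_<chunk_index>", first 16 hex chars, folded into int64 range
    point_id_str = hashlib.md5(f"{conversation_id}_{chunk_index}".encode()).hexdigest()[:16]
    return int(point_id_str, 16) % (2**63)


# Hypothetical conversation ID for illustration
print(delta_point_id("abc123-conversation", 0))
```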
package/scripts/force-metadata-recovery.py
ADDED
@@ -0,0 +1,305 @@
#!/usr/bin/env python3
"""
Force metadata recovery script for Claude Self-Reflect.
Fixes conversations that were marked as updated but don't actually have metadata.
This addresses the point ID mismatch bug in delta-metadata-update.py.
"""

import os
import sys
import json
import hashlib
import re
import asyncio
from datetime import datetime
from typing import List, Dict, Any, Set, Optional
import logging
from pathlib import Path

from qdrant_client import QdrantClient

# Configuration
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
LOGS_DIR = os.getenv("LOGS_DIR", os.path.expanduser("~/.claude/projects"))
DRY_RUN = os.getenv("DRY_RUN", "false").lower() == "true"
BATCH_SIZE = int(os.getenv("BATCH_SIZE", "100"))

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Initialize Qdrant client
client = QdrantClient(url=QDRANT_URL, timeout=30)

def normalize_path(path: str) -> str:
    """Normalize file paths for consistency."""
    if not path:
        return ""
    path = path.replace("/Users/", "~/").replace("\\Users\\", "~\\")
    path = path.replace("\\", "/")
    path = re.sub(r'/+', '/', path)
    return path

def extract_concepts(text: str) -> Set[str]:
    """Extract high-level concepts from text."""
    concepts = set()

    concept_patterns = {
        'security': r'(security|vulnerability|CVE|injection|auth)',
        'docker': r'(docker|container|compose|kubernetes)',
        'testing': r'(test|pytest|unittest|coverage)',
        'api': r'(API|REST|GraphQL|endpoint)',
        'database': r'(database|SQL|query|migration|qdrant)',
        'debugging': r'(debug|error|exception|traceback)',
        'git': r'(git|commit|branch|merge|pull request)',
        'mcp': r'(MCP|claude-self-reflect|tool|agent)',
        'embeddings': r'(embedding|vector|semantic|similarity)',
    }

    text_lower = text.lower()
    for concept, pattern in concept_patterns.items():
        if re.search(pattern, text_lower, re.IGNORECASE):
            concepts.add(concept)

    return concepts

def extract_metadata_from_jsonl(jsonl_path: str) -> Dict[str, Any]:
    """Extract metadata from a JSONL conversation file."""
    metadata = {
        "files_analyzed": [],
        "files_edited": [],
        "tools_used": set(),
        "concepts": set(),
        "text_sample": ""
    }

    try:
        with open(jsonl_path, 'r', encoding='utf-8') as f:
            line_count = 0
            for line in f:
                line_count += 1
                if line_count > 200:  # Limit processing
                    break

                if not line.strip():
                    continue

                try:
                    data = json.loads(line)
                    if 'message' in data and data['message']:
                        msg = data['message']

                        # Extract text for concept analysis
                        if msg.get('content'):
                            if isinstance(msg['content'], str):
                                metadata['text_sample'] += msg['content'][:500] + "\n"

                        # Extract tool usage
                        if msg.get('role') == 'assistant' and msg.get('content'):
                            content = msg['content']
                            if isinstance(content, list):
                                for item in content:
                                    if isinstance(item, dict) and item.get('type') == 'tool_use':
                                        tool_name = item.get('name', '')
                                        metadata['tools_used'].add(tool_name)

                                        inputs = item.get('input', {})

                                        if tool_name == 'Read' and 'file_path' in inputs:
                                            metadata['files_analyzed'].append(
                                                normalize_path(inputs['file_path'])
                                            )
                                        elif tool_name in ['Edit', 'Write'] and 'file_path' in inputs:
                                            metadata['files_edited'].append(
                                                normalize_path(inputs['file_path'])
                                            )

                except json.JSONDecodeError:
                    continue

    except Exception as e:
        logger.error(f"Error reading {jsonl_path}: {e}")

    # Extract concepts from collected text
    if metadata['text_sample']:
        metadata['concepts'] = extract_concepts(metadata['text_sample'][:5000])

    # Convert sets to lists and limit
    metadata['tools_used'] = list(metadata['tools_used'])[:20]
    metadata['concepts'] = list(metadata['concepts'])[:15]
    metadata['files_analyzed'] = list(set(metadata['files_analyzed']))[:20]
    metadata['files_edited'] = list(set(metadata['files_edited']))[:10]

    del metadata['text_sample']  # Don't store in Qdrant

    return metadata

async def find_conversations_without_metadata(collection_name: str) -> List[str]:
    """Find all unique conversation IDs that don't have metadata."""
    conversations_without_metadata = set()

    offset = None
    total_checked = 0

    while True:
        points, next_offset = client.scroll(
            collection_name=collection_name,
            limit=BATCH_SIZE,
            offset=offset,
            with_payload=True,
            with_vectors=False
        )

        if not points:
            break

        for point in points:
            # Check if metadata is missing
            if not point.payload.get('concepts') or not point.payload.get('has_file_metadata'):
                conv_id = point.payload.get('conversation_id')
                if conv_id:
                    conversations_without_metadata.add(conv_id)

        total_checked += len(points)
        offset = next_offset

        if offset is None:
            break

    logger.info(f" Checked {total_checked} points, found {len(conversations_without_metadata)} conversations without metadata")
    return list(conversations_without_metadata)

async def update_conversation_points(collection_name: str, conversation_id: str, metadata: Dict[str, Any]) -> int:
    """Update all points for a conversation with metadata."""
    updated_count = 0

    # Get all points in the collection
    offset = None
    while True:
        points, next_offset = client.scroll(
            collection_name=collection_name,
            limit=BATCH_SIZE,
            offset=offset,
            with_payload=True,
            with_vectors=False
        )

        if not points:
            break

        # Find and update points for this conversation
        for point in points:
            if point.payload.get('conversation_id') == conversation_id:
                if not DRY_RUN:
                    # Merge metadata with existing payload
                    updated_payload = {**point.payload, **metadata}
                    updated_payload['has_file_metadata'] = True
                    updated_payload['metadata_updated_at'] = datetime.now().isoformat()

                    client.set_payload(
                        collection_name=collection_name,
                        payload=updated_payload,
                        points=[point.id],
                        wait=False
                    )

                updated_count += 1

        offset = next_offset
        if offset is None:
            break

    return updated_count

async def process_collection(collection_name: str):
    """Process a single collection to add missing metadata."""
    logger.info(f"\nProcessing collection: {collection_name}")

    # Find conversations without metadata
    conversations_without_metadata = await find_conversations_without_metadata(collection_name)

    if not conversations_without_metadata:
        logger.info(f" ✓ All conversations have metadata")
        return 0

    logger.info(f" Found {len(conversations_without_metadata)} conversations needing metadata")

    # Process each conversation
    success_count = 0
    failed_count = 0

    for conv_id in conversations_without_metadata[:10]:  # Limit for testing
        # Find the JSONL file
        jsonl_pattern = f"**/{conv_id}.jsonl"
        jsonl_files = list(Path(LOGS_DIR).glob(jsonl_pattern))

        if not jsonl_files:
            logger.warning(f" Cannot find JSONL for {conv_id}")
            failed_count += 1
            continue

        jsonl_file = jsonl_files[0]
        logger.info(f" Processing {conv_id}")

        # Extract metadata
        metadata = extract_metadata_from_jsonl(str(jsonl_file))

        if not metadata['concepts'] and not metadata['files_analyzed']:
            logger.warning(f" No metadata extracted from {conv_id}")
            failed_count += 1
            continue

        # Update points
        updated_points = await update_conversation_points(collection_name, conv_id, metadata)

        if updated_points > 0:
            logger.info(f" ✓ Updated {updated_points} points with {len(metadata['concepts'])} concepts")
            success_count += 1
        else:
            logger.warning(f" No points updated for {conv_id}")
            failed_count += 1

    logger.info(f" Collection complete: {success_count} fixed, {failed_count} failed")
    return success_count

async def main():
    """Main recovery process."""
    logger.info("=== Force Metadata Recovery ===")
    logger.info(f"Qdrant URL: {QDRANT_URL}")
    logger.info(f"Dry run: {DRY_RUN}")

    # Get all collections
    collections = client.get_collections().collections

    # Focus on collections with potential issues
    priority_collections = []
    other_collections = []

    for collection in collections:
        name = collection.name
        if name.startswith('conv_'):
            other_collections.append(name)

    logger.info(f"Found {len(priority_collections)} priority collections")
    logger.info(f"Found {len(other_collections)} other collections")

    # Process priority collections first
    total_fixed = 0

    for collection_name in priority_collections:
        fixed = await process_collection(collection_name)
        total_fixed += fixed

    # Process a sample of other collections
    for collection_name in other_collections[:5]:
        fixed = await process_collection(collection_name)
        total_fixed += fixed

    logger.info(f"\n=== Recovery Complete ===")
    logger.info(f"Total conversations fixed: {total_fixed}")

    if DRY_RUN:
        logger.info("This was a DRY RUN - no actual updates were made")

if __name__ == "__main__":
    asyncio.run(main())