claude-self-reflect 4.0.0 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/csr-validator.md +151 -0
- package/.claude/agents/open-source-maintainer.md +46 -7
- package/mcp-server/src/parallel_search.py +6 -1
- package/mcp-server/src/search_tools.py +8 -2
- package/mcp-server/src/status_unified.py +286 -0
- package/package.json +1 -1
- package/scripts/import-conversations-unified.py +96 -99
- package/scripts/streaming-watcher.py +113 -158
package/.claude/agents/csr-validator.md
ADDED
@@ -0,0 +1,151 @@
+---
+name: csr-validator
+description: Validates Claude Self-Reflect system functionality. Use for testing MCP tools, embedding modes, import pipeline, and search. MUST BE USED before releases and after major changes.
+tools: mcp__claude-self-reflect__switch_embedding_mode, mcp__claude-self-reflect__get_embedding_mode, mcp__claude-self-reflect__store_reflection, mcp__claude-self-reflect__csr_reflect_on_past, mcp__claude-self-reflect__csr_quick_check, mcp__claude-self-reflect__csr_search_insights, mcp__claude-self-reflect__get_recent_work, mcp__claude-self-reflect__search_by_recency, mcp__claude-self-reflect__get_timeline, mcp__claude-self-reflect__search_by_file, mcp__claude-self-reflect__search_by_concept, mcp__claude-self-reflect__get_full_conversation, mcp__claude-self-reflect__get_next_results, mcp__claude-self-reflect__csr_get_more, mcp__claude-self-reflect__reload_code, mcp__claude-self-reflect__reload_status, mcp__claude-self-reflect__clear_module_cache, Bash, Read
+model: inherit
+---
+
+You are a focused CSR system validator. Test ONLY through MCP protocol - NEVER import Python modules directly.
+
+## Test Sequence (MANDATORY ORDER)
+
+### 1. Mode Testing
+```
+1. Get current mode (get_embedding_mode)
+2. Switch to CLOUD mode (switch_embedding_mode)
+3. Verify 1024 dimensions
+4. Store test reflection with tag "cloud-test-{timestamp}"
+5. Search for it immediately
+6. Switch to LOCAL mode
+7. Verify 384 dimensions
+8. Store test reflection with tag "local-test-{timestamp}"
+9. Search for it immediately
+```
+
+### 2. MCP Tools Validation (ALL 15+)
+Test each tool with minimal viable input:
+- `csr_reflect_on_past`: Query "test"
+- `csr_quick_check`: Query "system"
+- `store_reflection`: Content with unique timestamp
+- `get_recent_work`: Limit 2
+- `search_by_recency`: Query "import", time_range "today"
+- `get_timeline`: Range "last hour"
+- `search_by_file`: Path "*.py"
+- `search_by_concept`: Concept "testing"
+- `get_full_conversation`: Use any recent ID
+- `csr_search_insights`: Query "performance"
+- `csr_get_more`: After any search
+- `get_next_results`: After any search
+- `reload_status`: Check reload state
+- `clear_module_cache`: If needed
+- `reload_code`: If status shows changes
+
+### 3. Security Scan (CRITICAL)
+```bash
+# Scan for hardcoded paths
+grep -r "/Users/[a-zA-Z]*/\|/home/[a-zA-Z]*/" scripts/ --include="*.py" | grep -v "^#" | head -20
+
+# Scan for API keys/secrets (VOYAGE_KEY, etc)
+grep -r "VOYAGE_KEY\|API_KEY\|SECRET\|PASSWORD" scripts/ --include="*.py" | grep -v "os.environ\|getenv" | head -10
+
+# Check for sensitive patterns in state files
+grep -E "(api_key|secret|password|token)" ~/.claude-self-reflect/config/*.json | head -10
+
+# Find transient test files
+find . -name "*test*.py" -o -name "*benchmark*.py" -o -name "*tmp*" -o -name "*.pyc" | grep -v ".git" | head -20
+```
+
+### 4. Performance Check
+```bash
+# Via Bash tool only
+time python -c "from datetime import datetime; print(datetime.now())"
+ps aux | grep python | head -5
+docker ps --format "table {{.Names}}\t{{.Status}}" | grep qdrant
+```
+
+### 5. State Verification
+```bash
+# Check unified state
+ls -la ~/.claude-self-reflect/config/unified-state.json
+wc -l ~/.claude-self-reflect/config/unified-state.json
+head -20 ~/.claude-self-reflect/config/unified-state.json
+```
+
+### 6. CodeRabbit CLI Analysis
+```bash
+# Run CodeRabbit for code quality check
+echo "=== Running CodeRabbit CLI ==="
+coderabbit --version
+script -q /dev/null coderabbit --prompt-only || echo "CodeRabbit CLI issues detected - terminal mode incompatibility"
+
+# Alternative: Check GitHub PR for CodeRabbit comments
+echo "=== Checking PR CodeRabbit feedback ==="
+gh pr list --state open --limit 1 --json number --jq '.[0].number' | xargs -I {} gh pr view {} --comments | grep -A 5 "coderabbitai" || echo "No open PRs with CodeRabbit feedback"
+```
+
+### 7. Cleanup Transient Files
+```bash
+# List transient files (DO NOT DELETE YET)
+echo "=== Transient files found ==="
+find . -type f \( -name "*test_*.py" -o -name "test_*.py" -o -name "*benchmark*.py" \) -not -path "./.git/*" -not -path "./tests/*"
+
+# Archive or mark for deletion
+echo "=== Suggest archiving to: tests/throwaway/ ==="
+```
+
+## Output Format
+
+```
+CSR VALIDATION REPORT
+====================
+SECURITY SCAN: [PASS/FAIL]
+- Hardcoded paths: [0 found/X found - LIST THEM]
+- API keys exposed: [0 found/X found - LIST THEM]
+- Sensitive data: [none/FOUND - LIST]
+- Transient files: [X files - LIST FOR CLEANUP]
+
+Mode Switching: [PASS/FAIL]
+- Local→Cloud: [✓/✗]
+- Cloud→Local: [✓/✗]
+- Dimensions: [384/1024 verified]
+
+MCP Tools (15/15):
+- csr_reflect_on_past: [✓/✗]
+- [... list all ...]
+
+Performance:
+- Search latency: [Xms]
+- Memory usage: [XMB]
+- Qdrant status: [healthy/unhealthy]
+
+CodeRabbit Analysis: [PASS/FAIL]
+- CLI execution: [✓/✗ - terminal mode issues]
+- PR feedback checked: [✓/✗]
+- Issues found: [none/list]
+
+Critical Issues: [none/list]
+
+CLEANUP NEEDED:
+- [ ] Remove: [list transient files]
+- [ ] Archive: [list test files]
+- [ ] Fix: [list hardcoded paths]
+
+VERDICT: [GREEN/YELLOW/RED]
+```
+
+## Rules
+1. NEVER import Python modules (no `from X import Y`)
+2. Use ONLY mcp__claude-self-reflect__ prefixed tools
+3. Use Bash for system checks ONLY (no Python scripts)
+4. Report EVERY failure, even minor
+5. Test BOTH modes completely
+6. Restore to LOCAL mode at end
+7. Complete in <2 minutes
+
+## Failure Handling
+- If any MCP tool fails: Report exact error, continue testing others
+- If mode switch fails: CRITICAL - stop and report
+- If search returns no results: Note but continue
+- If Bash fails: Try alternative command
+
+Focus: Validate MCP protocol layer functionality, not implementation details.
package/.claude/agents/open-source-maintainer.md
CHANGED
@@ -6,8 +6,42 @@ tools: Read, Write, Edit, Bash, Grep, Glob, LS, WebFetch
 
 You are an open-source project maintainer for the Claude Self Reflect project. Your expertise covers community management, release processes, and maintaining a healthy, welcoming project.
 
+## CRITICAL WORKFLOW - MUST FOLLOW THIS SEQUENCE
+
+### Complete Release Flow (CSR Tester → Open Source Maintainer → NPM)
+1. **Code Review Phase**
+   - Check CodeRabbit feedback on existing PRs
+   - Fix ALL identified issues locally
+   - Create feature branch for fixes
+
+2. **PR Creation Phase**
+   - Create PR with all fixes
+   - Monitor CodeRabbit automated review on the PR
+   - Address any new issues CodeRabbit identifies
+   - Ensure all CI/CD checks pass
+
+3. **PR Merge Phase**
+   - Request review/approval
+   - Merge PR to main branch
+   - Verify merge completed successfully
+
+4. **Release Creation Phase**
+   - Create GitHub release with comprehensive notes
+   - Tag appropriately following semver
+   - Monitor automated workflows
+
+5. **NPM Publication Phase**
+   - Watch CI/CD pipeline for npm publish
+   - Verify package published to npm registry
+   - Test installation: `npm install -g claude-self-reflect@latest`
+
+6. **Post-Release Phase**
+   - Close related issues with release references
+   - Update project documentation
+   - Announce release in discussions/social
+
 ## Core Workflow: Explore, Plan, Execute, Verify
-1. **Explore**: Read relevant files, check git history, review PRs
+1. **Explore**: Read relevant files, check git history, review PRs, check CodeRabbit feedback
 2. **Plan**: Think hard about the release strategy before executing
 3. **Execute**: Implement the release with proper checks
 4. **Verify**: Use independent verification (or ask user to verify)
@@ -81,13 +115,18 @@ git log -p --grep="feature name"
 gh pr list --state merged --limit 10
 ```
 
-### PR Review Process
+### PR Review Process with CodeRabbit
 1. Thank contributor for their time
-2. …
-[… old lines 87-90 truncated by the diff viewer …]
+2. Check CodeRabbit automated review comments
+   ```bash
+   gh pr view PR_NUMBER --comments | grep -B2 -A10 "coderabbitai"
+   ```
+3. Address any CodeRabbit-identified issues
+4. Run CI/CD checks
+5. Review code for quality and style
+6. Test changes locally
+7. Provide constructive feedback
+8. Merge with descriptive commit message
 
 ### Release Checklist
 
package/mcp-server/src/parallel_search.py
CHANGED
@@ -83,9 +83,14 @@ async def search_single_collection(
             with_payload=True
         )
 
+        # CRITICAL FIX: Handle None search results (cloud mode issue)
+        if search_results is None:
+            logger.warning(f"Search returned None for collection {collection_name}")
+            search_results = []
+
         # Debug: Log search results
         logger.debug(f"Search of {collection_name} returned {len(search_results)} results")
-
+
         if should_use_decay and not USE_NATIVE_DECAY:
            # Apply client-side decay
            await ctx.debug(f"Using CLIENT-SIDE decay for {collection_name}")
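The guard added above is a small defensive pattern: coerce a `None` result set to an empty list so the `len()` call and later iteration never raise `TypeError`. A minimal standalone sketch of the same idea (the `safe_results` helper is illustrative, not from the package):

```python
# Sketch of the None-guard pattern used in the hunk above. Helper name is
# hypothetical; the point is that downstream code can always iterate.
import logging

logger = logging.getLogger(__name__)

def safe_results(search_results, collection_name: str) -> list:
    """Coerce a None result set to an empty list, logging the anomaly."""
    if search_results is None:
        logger.warning(f"Search returned None for collection {collection_name}")
        return []
    return search_results

# Usage: results = safe_results(await client.search(...), "conv_example_local")
```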
package/mcp-server/src/search_tools.py
CHANGED
@@ -102,9 +102,15 @@ class SearchTools:
             collection_name=collection_name,
             query_vector=query_embedding,
             limit=limit,
-            score_threshold=min_score
+            score_threshold=min_score,
+            with_payload=True  # Explicitly request payloads from Qdrant
         )
-
+
+        # CRITICAL FIX: Handle None search results (cloud mode issue)
+        if search_results is None:
+            logger.warning(f"Search returned None for collection {collection_name}")
+            search_results = []
+
         # Convert results to dict format
         results = []
         for result in search_results:
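For context on the `with_payload=True` change: qdrant-client only returns stored payloads when they are requested, otherwise each hit's `payload` can come back empty. A hedged sketch of the call shape (collection name and vector are placeholders, not the package's values):

```python
# Sketch: a Qdrant search that requests payloads explicitly. Placeholder
# collection and a zero vector sized for all-MiniLM-L6-v2 (384 dims).
from qdrant_client import QdrantClient

client = QdrantClient(url="http://localhost:6333")
hits = client.search(
    collection_name="conv_example_local",  # placeholder collection
    query_vector=[0.0] * 384,
    limit=5,
    score_threshold=0.3,
    with_payload=True,  # without this, hit.payload may be missing
)
for hit in hits or []:  # `or []` mirrors the None guard above
    payload = hit.payload or {}
    print(hit.score, payload.get("text", "")[:80])
```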
package/mcp-server/src/status_unified.py
ADDED
@@ -0,0 +1,286 @@
+"""Ultra-fast status checker using unified state management.
+
+This module reads from the unified state file for indexing status.
+Designed for <20ms execution time to support status bars and shell scripts.
+"""
+
+import json
+import time
+import sys
+from pathlib import Path
+from collections import defaultdict
+
+# Add scripts directory to path for unified state manager
+scripts_dir = Path(__file__).parent.parent.parent / "scripts"
+if scripts_dir.exists():
+    sys.path.insert(0, str(scripts_dir))
+
+try:
+    from unified_state_manager import UnifiedStateManager
+except ImportError:
+    # Fallback to reading JSON directly if manager not available
+    UnifiedStateManager = None
+
+# Try to import shared utilities
+try:
+    from shared_utils import (
+        extract_project_name_from_path,
+        get_claude_projects_dir,
+        get_csr_config_dir
+    )
+except ImportError:
+    # Fallback implementations
+    def extract_project_name_from_path(file_path: str) -> str:
+        """Extract project name from JSONL file path."""
+        path_obj = Path(file_path)
+        dir_name = path_obj.parent.name
+
+        if dir_name.startswith('-') and 'projects' in dir_name:
+            parts = dir_name.split('-')
+            try:
+                projects_idx = parts.index('projects')
+                if projects_idx + 1 < len(parts):
+                    project_parts = parts[projects_idx + 1:]
+                    return '-'.join(project_parts)
+            except ValueError:
+                pass
+
+        return dir_name.lstrip('-')
+
+    def get_claude_projects_dir() -> Path:
+        """Get Claude projects directory."""
+        import os
+        if 'CLAUDE_PROJECTS_DIR' in os.environ:
+            return Path(os.environ['CLAUDE_PROJECTS_DIR'])
+        return Path.home() / ".claude" / "projects"
+
+    def get_csr_config_dir() -> Path:
+        """Get CSR config directory."""
+        import os
+        if 'CSR_CONFIG_DIR' in os.environ:
+            return Path(os.environ['CSR_CONFIG_DIR'])
+        return Path.home() / '.claude-self-reflect' / 'config'
+
+
+def get_watcher_status() -> dict:
+    """Get streaming watcher status from unified state."""
+    try:
+        if UnifiedStateManager:
+            manager = UnifiedStateManager()
+            state = manager.read_state()
+
+            # Get watcher status from importers section
+            watcher_info = state.get("importers", {}).get("streaming", {})
+            last_run = watcher_info.get("last_run")
+
+            if last_run:
+                from datetime import datetime, timezone
+                last_run_dt = datetime.fromisoformat(last_run)
+                now = datetime.now(timezone.utc)
+                age_seconds = (now - last_run_dt).total_seconds()
+                is_active = age_seconds < 120  # Active if updated in last 2 minutes
+            else:
+                is_active = False
+                age_seconds = float('inf')
+
+            return {
+                "running": is_active,
+                "files_processed": watcher_info.get("files_processed", 0),
+                "last_update_seconds": int(age_seconds) if age_seconds != float('inf') else None,
+                "status": "🟢 active" if is_active else "🔴 inactive"
+            }
+        else:
+            # Fallback to old method if UnifiedStateManager not available
+            watcher_state_file = get_csr_config_dir() / "csr-watcher.json"
+
+            if not watcher_state_file.exists():
+                return {"running": False, "status": "not configured"}
+
+            with open(watcher_state_file) as f:
+                state = json.load(f)
+
+            file_age = time.time() - watcher_state_file.stat().st_mtime
+            is_active = file_age < 120
+
+            return {
+                "running": is_active,
+                "files_processed": len(state.get("imported_files", {})),
+                "last_update_seconds": int(file_age),
+                "status": "🟢 active" if is_active else "🔴 inactive"
+            }
+    except Exception as e:
+        return {"running": False, "status": f"error: {str(e)[:50]}"}
+
+
+def get_status() -> dict:
+    """Get indexing status from unified state with per-project breakdown.
+
+    Returns:
+        dict: JSON structure with overall and per-project indexing status
+    """
+    start_time = time.time()
+
+    try:
+        if UnifiedStateManager:
+            # Use unified state manager for fast access
+            manager = UnifiedStateManager()
+            status = manager.get_status()
+
+            # Get per-project breakdown
+            project_stats = defaultdict(lambda: {"indexed": 0, "total": 0})
+
+            # Count total JSONL files per project
+            projects_dir = get_claude_projects_dir()
+            if projects_dir.exists():
+                for jsonl_file in projects_dir.glob("**/*.jsonl"):
+                    project_name = extract_project_name_from_path(str(jsonl_file))
+                    project_stats[project_name]["total"] += 1
+
+            # Count indexed files per project from unified state
+            state = manager.read_state()
+            for file_path, metadata in state.get("files", {}).items():
+                if metadata.get("status") == "completed":
+                    project_name = extract_project_name_from_path(file_path)
+                    if project_name in project_stats:
+                        project_stats[project_name]["indexed"] += 1
+
+            # Format response
+            result = {
+                "overall": {
+                    "percentage": status["percentage"],
+                    "indexed_files": status["indexed_files"],
+                    "total_files": status["total_files"],
+                    "total_chunks": status["total_chunks"],
+                },
+                "watcher": get_watcher_status(),
+                "projects": dict(project_stats),
+                "execution_time_ms": round((time.time() - start_time) * 1000, 2)
+            }
+
+            return result
+
+        else:
+            # Fallback to old multi-file method
+            return get_status_legacy()
+
+    except Exception as e:
+        return {
+            "error": str(e),
+            "execution_time_ms": round((time.time() - start_time) * 1000, 2)
+        }
+
+
+def get_status_legacy() -> dict:
+    """Legacy status method reading from multiple files (fallback)."""
+    projects_dir = get_claude_projects_dir()
+    project_stats = defaultdict(lambda: {"indexed": 0, "total": 0})
+
+    # Count total JSONL files per project
+    if projects_dir.exists():
+        for jsonl_file in projects_dir.glob("**/*.jsonl"):
+            file_str = str(jsonl_file)
+            project_name = extract_project_name_from_path(file_str)
+            project_stats[project_name]["total"] += 1
+
+    # Read imported-files.json to count indexed files
+    config_dir = get_csr_config_dir()
+    imported_files_path = config_dir / "imported-files.json"
+
+    if imported_files_path.exists():
+        try:
+            with open(imported_files_path, 'r') as f:
+                data = json.load(f)
+            imported_files = data.get("imported_files", {})
+
+            for file_path in imported_files.keys():
+                # Normalize path
+                if file_path.startswith("/logs/"):
+                    projects_path = str(get_claude_projects_dir())
+                    normalized_path = file_path.replace("/logs/", projects_path + "/", 1)
+                else:
+                    normalized_path = file_path
+
+                # Check if file exists and count it
+                if Path(normalized_path).exists():
+                    project_name = extract_project_name_from_path(normalized_path)
+                    if project_name in project_stats:
+                        project_stats[project_name]["indexed"] += 1
+        except Exception:
+            pass
+
+    # Calculate overall stats
+    total_files = sum(p["total"] for p in project_stats.values())
+    indexed_files = sum(p["indexed"] for p in project_stats.values())
+    percentage = (indexed_files / max(total_files, 1)) * 100
+
+    return {
+        "overall": {
+            "percentage": percentage,
+            "indexed_files": indexed_files,
+            "total_files": total_files
+        },
+        "watcher": get_watcher_status(),
+        "projects": dict(project_stats)
+    }
+
+
+def main():
+    """CLI interface for status checking."""
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Check Claude Self-Reflect indexing status")
+    parser.add_argument("--format", choices=["json", "text"], default="json",
+                        help="Output format (default: json)")
+    parser.add_argument("--watch", action="store_true",
+                        help="Watch mode - update every 2 seconds")
+
+    args = parser.parse_args()
+
+    if args.watch:
+        try:
+            while True:
+                status = get_status()
+                if args.format == "json":
+                    print(json.dumps(status, indent=2))
+                else:
+                    overall = status.get("overall", {})
+                    print(f"Indexing: {overall.get('percentage', 0):.1f}% "
+                          f"({overall.get('indexed_files', 0)}/{overall.get('total_files', 0)})")
+
+                    watcher = status.get("watcher", {})
+                    print(f"Watcher: {watcher.get('status', '🔴 inactive')}")
+
+                if status.get("execution_time_ms"):
+                    print(f"Time: {status['execution_time_ms']}ms")
+
+                print("\n" + "-" * 40)
+                time.sleep(2)
+
+        except KeyboardInterrupt:
+            print("\nStopped")
+    else:
+        status = get_status()
+        if args.format == "json":
+            print(json.dumps(status, indent=2))
+        else:
+            overall = status.get("overall", {})
+            print(f"Indexing: {overall.get('percentage', 0):.1f}% "
+                  f"({overall.get('indexed_files', 0)}/{overall.get('total_files', 0)} files)")
+
+            watcher = status.get("watcher", {})
+            print(f"Watcher: {watcher.get('status', '🔴 inactive')}")
+
+            # Show per-project if available
+            projects = status.get("projects", {})
+            if projects:
+                print("\nProjects:")
+                for proj, stats in projects.items():
+                    pct = (stats["indexed"] / max(stats["total"], 1)) * 100
+                    print(f"  {proj}: {pct:.1f}% ({stats['indexed']}/{stats['total']})")
+
+            if status.get("execution_time_ms"):
+                print(f"\nExecution time: {status['execution_time_ms']}ms")
+
+
+if __name__ == "__main__":
+    main()
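A quick way to exercise the new module programmatically; a sketch, assuming it is run from the package's mcp-server/src directory so the relative scripts/ path resolves and `status_unified` is importable:

```python
# Sketch: call the new status checker and check the <20ms budget its
# docstring documents. Degrades gracefully if state files are absent
# (get_status returns an "error" key instead of raising).
import json
from status_unified import get_status

status = get_status()
print(json.dumps(status, indent=2))

overall = status.get("overall", {})
print(f"{overall.get('indexed_files', 0)}/{overall.get('total_files', 0)} files indexed")

elapsed = status.get("execution_time_ms")
if elapsed is not None and elapsed > 20:
    print(f"warning: {elapsed}ms exceeds the documented 20ms target")
```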
package/package.json
CHANGED
(1 line changed; the diff body is not shown by the viewer)

package/scripts/import-conversations-unified.py
CHANGED
@@ -15,7 +15,7 @@ import fcntl
 import time
 import argparse
 from pathlib import Path
-from datetime import datetime
+from datetime import datetime, timezone
 from typing import List, Dict, Any, Optional, Set
 import logging
 
@@ -34,6 +34,9 @@ except ImportError:
     scripts_dir = Path(__file__).parent
     sys.path.insert(0, str(scripts_dir))
 
+# Import UnifiedStateManager
+from unified_state_manager import UnifiedStateManager
+
 from qdrant_client import QdrantClient
 from qdrant_client.models import PointStruct, Distance, VectorParams
 
@@ -72,32 +75,15 @@ MAX_FILES_EDITED = 20
 MAX_TOOLS_USED = 15
 MAX_CONCEPT_MESSAGES = 50
 
-# …
-
-    """Determine the default state file location with cross-platform support."""
-    from pathlib import Path
-
-    # Check if we're in Docker (more reliable than just checking /config)
-    docker_indicators = [
-        Path("/.dockerenv").exists(),  # Docker creates this file
-        os.path.exists("/config") and os.access("/config", os.W_OK)  # Mounted config dir with write access
-    ]
-
-    if any(docker_indicators):
-        return "/config/imported-files.json"
-
-    # Use pathlib for cross-platform home directory path
-    home_state = Path.home() / ".claude-self-reflect" / "config" / "imported-files.json"
-    return str(home_state)
-
-# Get state file path with env override support
+# Initialize UnifiedStateManager
+# Support legacy STATE_FILE environment variable
 env_state = os.getenv("STATE_FILE")
 if env_state:
-    # Normalize any user-provided path to absolute
     from pathlib import Path
-    …
+    state_file_path = Path(env_state).expanduser().resolve()
+    state_manager = UnifiedStateManager(state_file_path)
 else:
-    …
+    state_manager = UnifiedStateManager()  # Uses default location
 PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "true").lower() == "true"
 VOYAGE_API_KEY = os.getenv("VOYAGE_KEY")
 MAX_CHUNK_SIZE = int(os.getenv("MAX_CHUNK_SIZE", "50"))  # Messages per chunk
@@ -686,18 +672,13 @@ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Pat
 
     except Exception as e:
         logger.error(f"Failed to import {jsonl_file}: {e}")
+        # Mark file as failed in state manager
+        try:
+            state_manager.mark_file_failed(str(jsonl_file), str(e))
+        except Exception as state_error:
+            logger.warning(f"Could not mark file as failed in state: {state_error}")
         return 0
 
-def _locked_open(path, mode):
-    """Open file with exclusive lock for concurrent safety."""
-    f = open(path, mode)
-    try:
-        fcntl.flock(f.fileno(), fcntl.LOCK_EX)
-    except Exception:
-        f.close()
-        raise
-    return f
-
 def _with_retries(fn, attempts=3, base_sleep=0.5):
     """Execute function with retries and exponential backoff."""
     for i in range(attempts):
@@ -709,66 +690,78 @@ def _with_retries(fn, attempts=3, base_sleep=0.5):
             time.sleep(base_sleep * (2 ** i))
             logger.debug(f"Retrying after error: {e}")
 
-def …
-"""…
-[… old lines 714-771: removed function body truncated by the diff viewer …]
+def should_import_file(file_path: Path) -> bool:
+    """Check if file should be imported using UnifiedStateManager."""
+    try:
+        # Get imported files from state manager
+        imported_files = state_manager.get_imported_files()
+
+        # Normalize the file path for comparison
+        normalized_path = state_manager.normalize_path(str(file_path))
+
+        if normalized_path in imported_files:
+            file_info = imported_files[normalized_path]
+
+            # Skip if file failed and we haven't reached retry limit
+            if file_info.get("status") == "failed" and file_info.get("retry_count", 0) >= 3:
+                logger.info(f"Skipping failed file (max retries reached): {file_path.name}")
+                return False
+
+            # Get file modification time for comparison
+            last_modified = file_path.stat().st_mtime
+            stored_modified = file_info.get("last_modified")
+
+            # Check if file has been modified (convert stored timestamp to float if needed)
+            if stored_modified:
+                try:
+                    # Parse ISO timestamp to float for comparison
+                    stored_time = datetime.fromisoformat(stored_modified.replace("Z", "+00:00")).timestamp()
+                    if abs(last_modified - stored_time) > 1:  # Allow 1 second tolerance
+                        logger.info(f"File modified, will re-import: {file_path.name}")
+                        return True
+                except (ValueError, TypeError):
+                    # If we can't parse the stored time, re-import to be safe
+                    logger.warning(f"Could not parse stored modification time, will re-import: {file_path.name}")
+                    return True
+
+            # Check for suspiciously low chunk counts (likely failed imports)
+            chunks = file_info.get("chunks", 0)
+            file_size_kb = file_path.stat().st_size / 1024
+
+            # Heuristic: Files > 10KB should have more than 2 chunks
+            if file_size_kb > 10 and chunks <= 2 and file_info.get("status") != "failed":
+                logger.warning(f"File has suspiciously low chunks ({chunks}) for size {file_size_kb:.1f}KB, will re-import: {file_path.name}")
+                return True
+
+            # Skip if successfully imported
+            if file_info.get("status") == "completed":
+                logger.info(f"Skipping successfully imported file: {file_path.name}")
+                return False
+
+        return True
+
+    except Exception as e:
+        logger.warning(f"Error checking import status for {file_path}: {e}")
+        return True  # Default to importing if we can't check status
+
+def update_file_state(file_path: Path, chunks: int, collection_name: str):
+    """Update state for imported file using UnifiedStateManager."""
+    try:
+        # Determine embedding mode from collection suffix
+        embedding_mode = "local" if collection_suffix == "local" else "cloud"
+
+        # Add file to state manager
+        state_manager.add_imported_file(
+            file_path=str(file_path),
+            chunks=chunks,
+            importer="streaming",
+            collection=collection_name,
+            embedding_mode=embedding_mode,
+            status="completed"
+        )
+        logger.debug(f"Updated state for {file_path.name}: {chunks} chunks")
+    except Exception as e:
+        logger.error(f"Failed to update state for {file_path}: {e}")
 
 def main():
     """Main import function."""
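`unified_state_manager.py` itself is not part of this diff, so the interface below is inferred from the call sites visible here (and in streaming-watcher.py further down). This is a sketch of the methods the importers rely on, not the shipped class:

```python
# Inferred interface of UnifiedStateManager, reconstructed from call sites in
# this diff: get_imported_files, normalize_path, add_imported_file,
# mark_file_failed, get_status, read_state, and the state_file attribute.
# Record field names ("status", "chunks", "last_modified", "imported_at",
# "retry_count") come from how the importers read them; everything else,
# including this Protocol itself, is an assumption.
from pathlib import Path
from typing import Any, Dict, Protocol

class StateManagerLike(Protocol):
    state_file: Path

    def read_state(self) -> Dict[str, Any]: ...      # full unified-state.json contents
    def get_status(self) -> Dict[str, Any]: ...      # keys seen: percentage, indexed_files, total_files, total_chunks
    def get_imported_files(self) -> Dict[str, Dict[str, Any]]: ...  # normalized path -> record
    def normalize_path(self, file_path: str) -> str: ...
    def add_imported_file(self, file_path: str, chunks: int, importer: str,
                          collection: str, embedding_mode: str, status: str) -> None: ...
    def mark_file_failed(self, file_path: str, reason: str) -> None: ...
```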
@@ -798,9 +791,9 @@ def main():
         collection_suffix = "voyage"
         logger.info("Switched to Voyage AI embeddings (dimension: 1024)")
 
-    # …
-    …
-    logger.info(f"Loaded state with {…
+    # Get status from state manager
+    status = state_manager.get_status()
+    logger.info(f"Loaded state with {status['indexed_files']} previously imported files")
 
     # Find all projects
     # Use LOGS_DIR env var, or fall back to Claude projects directory, then /logs for Docker
@@ -848,7 +841,7 @@
                 logger.info(f"Reached limit of {args.limit} files, stopping import")
                 break
 
-            if should_import_file(jsonl_file…
+            if should_import_file(jsonl_file):
                 chunks = stream_import_file(jsonl_file, collection_name, project_dir)
                 files_processed += 1
                 if chunks > 0:
@@ -868,8 +861,7 @@
 
                     if actual_count > 0:
                         logger.info(f"Verified {actual_count} points in Qdrant for {conversation_id}")
-                        update_file_state(jsonl_file, …
-                        save_state(state)
+                        update_file_state(jsonl_file, chunks, collection_name)
                         total_imported += 1
                     else:
                         logger.error(f"No points found in Qdrant for {conversation_id} despite {chunks} chunks processed - not marking as imported")
@@ -883,6 +875,11 @@
                     # Critical fix: Don't mark files with 0 chunks as imported
                     # This allows retry on next run
                     logger.warning(f"File produced 0 chunks, not marking as imported: {jsonl_file.name}")
+                    # Mark as failed so we don't keep retrying indefinitely
+                    try:
+                        state_manager.mark_file_failed(str(jsonl_file), "File produced 0 chunks during import")
+                    except Exception as state_error:
+                        logger.warning(f"Could not mark file as failed in state: {state_error}")
 
     logger.info(f"Import complete: processed {total_imported} files")
 
package/scripts/streaming-watcher.py
CHANGED
@@ -35,10 +35,11 @@ from qdrant_client.http.exceptions import UnexpectedResponse
 from fastembed import TextEmbedding
 import psutil
 
-# Import normalize_project_name
+# Import normalize_project_name and UnifiedStateManager
 import sys
 sys.path.insert(0, str(Path(__file__).parent))
 from utils import normalize_project_name
+from unified_state_manager import UnifiedStateManager
 
 # Configure logging
 logging.basicConfig(
@@ -52,26 +53,14 @@ logger = logging.getLogger(__name__)
 class Config:
     """Production configuration with proper defaults."""
     qdrant_url: str = field(default_factory=lambda: os.getenv("QDRANT_URL", "http://localhost:6333"))
+    qdrant_api_key: Optional[str] = field(default_factory=lambda: os.getenv("QDRANT_API_KEY"))
+    require_tls_for_remote: bool = field(default_factory=lambda: os.getenv("QDRANT_REQUIRE_TLS_FOR_REMOTE", "true").lower() == "true")
     voyage_api_key: Optional[str] = field(default_factory=lambda: os.getenv("VOYAGE_API_KEY"))
     prefer_local_embeddings: bool = field(default_factory=lambda: os.getenv("PREFER_LOCAL_EMBEDDINGS", "true").lower() == "true")
     embedding_model: str = field(default_factory=lambda: os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2"))
 
     logs_dir: Path = field(default_factory=lambda: Path(os.getenv("LOGS_DIR", "~/.claude/projects")).expanduser())
 
-    # Production state file with proper naming
-    state_file: Path = field(default_factory=lambda: (
-        # Docker/cloud mode: use /config volume
-        Path("/config/csr-watcher.json") if os.path.exists("/.dockerenv")
-        # Local mode with cloud flag: separate state file
-        else Path("~/.claude-self-reflect/config/csr-watcher-cloud.json").expanduser()
-        if os.getenv("PREFER_LOCAL_EMBEDDINGS", "true").lower() == "false" and os.getenv("VOYAGE_API_KEY")
-        # Default local mode
-        else Path("~/.claude-self-reflect/config/csr-watcher.json").expanduser()
-        if os.getenv("STATE_FILE") is None
-        # User override
-        else Path(os.getenv("STATE_FILE")).expanduser()
-    ))
-
     collection_prefix: str = "conv"
     vector_size: int = 384  # FastEmbed all-MiniLM-L6-v2
 
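The hunk above adds a `require_tls_for_remote` flag, but its enforcement is not shown anywhere in this diff. For illustration only, a plausible guard would look something like the sketch below; the check itself is an assumption, only the config field and env var names come from the diff:

```python
# Hypothetical enforcement of require_tls_for_remote (not shown in the diff).
from urllib.parse import urlparse

def check_tls(qdrant_url: str, require_tls_for_remote: bool) -> None:
    parsed = urlparse(qdrant_url)
    is_local = parsed.hostname in ("localhost", "127.0.0.1", "::1")
    if require_tls_for_remote and not is_local and parsed.scheme != "https":
        raise ValueError(
            f"Refusing plaintext connection to remote Qdrant at {qdrant_url}; "
            "set QDRANT_REQUIRE_TLS_FOR_REMOTE=false to override"
        )

check_tls("http://localhost:6333", True)  # OK: local host, TLS not required
# check_tls("http://qdrant.example.com:6333", True)  # would raise ValueError
```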
@@ -496,7 +485,7 @@ class QdrantService:
         # Initialize with API key if provided
         self.client = AsyncQdrantClient(
             url=config.qdrant_url,
-            api_key=config.qdrant_api_key
+            api_key=config.qdrant_api_key
         )
         self.embedding_provider = embedding_provider
         self._collection_cache: Dict[str, float] = {}
@@ -797,7 +786,7 @@ class StreamingWatcher:
 
     def __init__(self, config: Config):
         self.config = config
-        self.…
+        self.state_manager = UnifiedStateManager()
         self.embedding_provider = self._create_embedding_provider()
         self.qdrant_service = QdrantService(config, self.embedding_provider)
         self.chunker = TokenAwareChunker()
@@ -805,23 +794,23 @@
         self.memory_monitor = MemoryMonitor(config.memory_limit_mb, config.memory_warning_mb)
         self.queue_manager = QueueManager(config.max_queue_size, config.max_backlog_hours)
         self.progress = IndexingProgress(config.logs_dir)
-
+
         self.stats = {
             "files_processed": 0,
             "chunks_processed": 0,
             "failures": 0,
             "start_time": time.time()
         }
-
+
         # Track file wait times for starvation prevention
         self.file_first_seen: Dict[str, float] = {}
         self.current_project: Optional[str] = self._detect_current_project()
         self.last_mode: Optional[str] = None  # Track mode changes for logging
-
+
         self.shutdown_event = asyncio.Event()
-
-        logger.info(…
-        logger.info(f"State file: {self.…
+
+        logger.info("Streaming Watcher v3.0.0 with HOT/WARM/COLD prioritization")
+        logger.info(f"State file: {self.state_manager.state_file}")
         logger.info(f"Memory limits: {config.memory_warning_mb}MB warning, {config.memory_limit_mb}MB limit")
         logger.info(f"HOT window: {config.hot_window_minutes} min, WARM window: {config.warm_window_hours} hrs")
 
@@ -901,75 +890,19 @@ class StreamingWatcher:
         )
 
     async def load_state(self) -> None:
-        """Load persisted state…
-        if self.config.state_file.exists():
-            try:
-                with open(self.config.state_file, 'r') as f:
-                    self.state = json.load(f)
-
-                # Migrate old state format if needed
-                if "imported_files" in self.state:
-                    imported_count = len(self.state["imported_files"])
-                    logger.info(f"Loaded state with {imported_count} files")
-
-                    # Ensure all entries have full paths as keys
-                    migrated = {}
-                    for key, value in self.state["imported_files"].items():
-                        # Ensure key is a full path
-                        if not key.startswith('/'):
-                            # Try to reconstruct full path
-                            possible_path = self.config.logs_dir / key
-                            if possible_path.exists():
-                                migrated[str(possible_path)] = value
-                            else:
-                                migrated[key] = value  # Keep as is
-                        else:
-                            migrated[key] = value
-
-                    if len(migrated) != len(self.state["imported_files"]):
-                        logger.info(f"Migrated state format: {len(self.state['imported_files'])} -> {len(migrated)} entries")
-                        self.state["imported_files"] = migrated
-
-            except Exception as e:
-                logger.error(f"Error loading state: {e}")
-                self.state = {}
-
-        if "imported_files" not in self.state:
-            self.state["imported_files"] = {}
-        if "high_water_mark" not in self.state:
-            self.state["high_water_mark"] = 0
-
-        # Update progress tracker
-        self.progress.update(len(self.state["imported_files"]))
-
-    async def save_state(self) -> None:
-        """Save state atomically."""
+        """Load persisted state using UnifiedStateManager."""
         try:
-            self.…
-            [… old lines 949-954 truncated by the diff viewer …]
-            os.fsync(f.fileno())
-
-            if platform.system() == 'Windows':
-                if self.config.state_file.exists():
-                    self.config.state_file.unlink()
-                temp_file.rename(self.config.state_file)
-            else:
-                os.replace(temp_file, self.config.state_file)
-
-            # Directory fsync for stronger guarantees
-            try:
-                dir_fd = os.open(str(self.config.state_file.parent), os.O_DIRECTORY)
-                os.fsync(dir_fd)
-                os.close(dir_fd)
-            except:
-                pass
-
+            status = self.state_manager.get_status()
+            imported_count = status["indexed_files"]
+            logger.info(f"Loaded state with {imported_count} files")
+
+            # Update progress tracker
+            self.progress.update(imported_count)
         except Exception as e:
-            logger.error(f"Error…
+            logger.error(f"Error loading state: {e}")
+            # Initialize progress with 0
+            self.progress.update(0)
+
 
     def get_collection_name(self, project_path: str) -> str:
         """Get collection name for project."""
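The removed `save_state` body is largely collapsed by the viewer. From the surviving fragments (`os.fsync`, the Windows rename branch, `os.replace`, the directory fsync) it implemented the standard atomic-write pattern; a reconstruction, with illustrative names rather than the deleted code verbatim:

```python
# Sketch of the atomic-write pattern the removed save_state used.
import json
import os
import platform
from pathlib import Path

def save_json_atomically(state: dict, state_file: Path) -> None:
    temp_file = state_file.with_suffix(".tmp")
    with open(temp_file, "w") as f:
        json.dump(state, f)
        f.flush()
        os.fsync(f.fileno())  # data hits disk before the rename

    if platform.system() == "Windows":
        if state_file.exists():   # the old code unlinked and renamed manually
            state_file.unlink()   # (os.replace would also work on Windows)
        temp_file.rename(state_file)
    else:
        os.replace(temp_file, state_file)  # atomic on POSIX

    try:  # directory fsync for stronger guarantees
        dir_fd = os.open(str(state_file.parent), os.O_DIRECTORY)
        os.fsync(dir_fd)
        os.close(dir_fd)
    except OSError:
        pass
```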
@@ -1092,15 +1025,15 @@ class StreamingWatcher:
                 continue
 
             if not all_messages:
-                logger.warning(f"No messages in {file_path}, marking as …
-                # Mark …
-                [… old lines 1097-1103 truncated by the diff viewer …]
+                logger.warning(f"No messages in {file_path}, marking as failed")
+                # Mark as failed to enable retry and correct progress
+                try:
+                    self.state_manager.mark_file_failed(
+                        str(file_path),
+                        "No messages found in conversation (0 chunks)"
+                    )
+                except Exception as e:
+                    logger.exception("Failed to update state for %s", file_path)
                 self.stats["files_processed"] += 1
                 return True
 
@@ -1181,15 +1114,15 @@ class StreamingWatcher:
 
             combined_text = "\n\n".join(text_parts)
             if not combined_text.strip():
-                logger.warning(f"No textual content in {file_path}, marking as …
-                # Mark …
-                [… old lines 1186-1192 truncated by the diff viewer …]
+                logger.warning(f"No textual content in {file_path}, marking as failed")
+                # Mark as failed to enable retry and correct progress
+                try:
+                    self.state_manager.mark_file_failed(
+                        str(file_path),
+                        "No textual content in conversation (0 chunks)"
+                    )
+                except Exception as e:
+                    logger.exception("Failed to update state for %s", file_path)
                 self.stats["files_processed"] += 1
                 return True
 
@@ -1280,23 +1213,34 @@ class StreamingWatcher:
             if should_cleanup:
                 await self.memory_monitor.cleanup()
 
-            # Update state
-            [… old lines 1284-1290 truncated by the diff viewer …]
+            # Update state using UnifiedStateManager
+            try:
+                self.state_manager.add_imported_file(
+                    file_path=str(file_path),
+                    chunks=chunks_processed,
+                    importer="streaming",
+                    collection=collection_name,
+                    embedding_mode="local" if self.config.prefer_local_embeddings else "cloud",
+                    status="completed"
+                )
+            except Exception as e:
+                logger.error(f"Failed to update state for {file_path}: {e}")
+                return False
+
             self.stats["files_processed"] += 1
             self.stats["chunks_processed"] += chunks_processed
-
+
             logger.info(f"Completed: {file_path.name} ({chunks_processed} chunks)")
             return True
 
         except Exception as e:
             logger.error(f"Error processing {file_path}: {e}")
             self.stats["failures"] += 1
+            # Mark file as failed using UnifiedStateManager
+            try:
+                self.state_manager.mark_file_failed(str(file_path), str(e))
+            except Exception as mark_error:
+                logger.error(f"Failed to mark file as failed: {mark_error}")
             return False
 
     async def find_new_files(self) -> List[Tuple[Path, FreshnessLevel, int]]:
@@ -1304,47 +1248,51 @@
         if not self.config.logs_dir.exists():
             logger.warning(f"Logs dir not found: {self.config.logs_dir}")
             return []
-
+
         categorized_files = []
-        high_water_mark = self.state.get("high_water_mark", 0)
-        new_high_water = high_water_mark
         now = time.time()
-
+
+        # Get imported files from UnifiedStateManager
+        try:
+            imported_files = self.state_manager.get_imported_files()
+        except Exception as e:
+            logger.error(f"Error getting imported files: {e}")
+            imported_files = {}
+
         try:
             for project_dir in self.config.logs_dir.iterdir():
                 if not project_dir.is_dir():
                     continue
-
+
                 try:
                     for jsonl_file in project_dir.glob("*.jsonl"):
                         file_mtime = jsonl_file.stat().st_mtime
-
-                        [… old lines 1322-1335 truncated by the diff viewer …]
+
+                        # Check if already processed (using normalized path)
+                        try:
+                            normalized_path = self.state_manager.normalize_path(str(jsonl_file))
+                            if normalized_path in imported_files:
+                                stored = imported_files[normalized_path]
+                                # Check if file was modified after import
+                                import_time_str = stored.get("imported_at")
+                                if import_time_str:
+                                    import_time = datetime.fromisoformat(import_time_str.replace("Z", "+00:00")).timestamp()
+                                    if file_mtime <= import_time:
+                                        continue
+                        except Exception as e:
+                            logger.debug(f"Error checking import status for {jsonl_file}: {e}")
+                            # If we can't check, assume not imported
+
                         # Categorize file freshness (handles first_seen tracking internally)
                         freshness_level, priority_score = self.categorize_freshness(jsonl_file)
-
+
                         categorized_files.append((jsonl_file, freshness_level, priority_score))
                 except Exception as e:
                     logger.error(f"Error scanning project dir {project_dir}: {e}")
-
+
         except Exception as e:
             logger.error(f"Error scanning logs dir: {e}")
 
-        self.state["high_water_mark"] = new_high_water
-
         # Sort by priority score (lower = higher priority)
         categorized_files.sort(key=lambda x: x[2])
 
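`categorize_freshness` itself is outside this diff; based on the HOT/WARM windows logged at startup (`hot_window_minutes`, `warm_window_hours`) and the "lower score = higher priority" sort above, a prioritizer of this kind typically reduces to something like the sketch below (the scoring is entirely an assumption, only the config field names are attested):

```python
# Hypothetical HOT/WARM/COLD categorization; not the package's implementation.
import time
from pathlib import Path

def categorize_freshness(path: Path, hot_window_minutes: int = 5,
                         warm_window_hours: int = 24) -> tuple[str, int]:
    age = time.time() - path.stat().st_mtime
    if age < hot_window_minutes * 60:
        return "HOT", 0    # lowest score sorts first, so HOT files import first
    if age < warm_window_hours * 3600:
        return "WARM", 1
    return "COLD", 2
```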
@@ -1370,7 +1318,7 @@ class StreamingWatcher:
         logger.info("=" * 60)
         logger.info("Claude Self-Reflect Streaming Watcher v3.0.0")
         logger.info("=" * 60)
-        logger.info(…
+        logger.info("State manager: UnifiedStateManager")
         logger.info(f"Memory: {self.config.memory_warning_mb}MB warning, {self.config.memory_limit_mb}MB limit")
         logger.info(f"CPU limit: {self.cpu_monitor.max_total_cpu:.1f}%")
         logger.info(f"Queue size: {self.config.max_queue_size}")
@@ -1380,9 +1328,10 @@
 
         # Initial progress scan
         total_files = self.progress.scan_total_files()
-        …
+        status = self.state_manager.get_status()
+        indexed_files = status["indexed_files"]
         self.progress.update(indexed_files)
-
+
         initial_progress = self.progress.get_progress()
         logger.info(f"Initial progress: {indexed_files}/{total_files} files ({initial_progress['percent']:.1f}%)")
 
@@ -1433,23 +1382,30 @@
                     except FileNotFoundError:
                         logger.warning(f"File disappeared: {file_path}")
                         continue
-
-                    imported…
-                    [… old lines 1439-1445 truncated by the diff viewer …]
+
+                    # Check if already imported using UnifiedStateManager
+                    try:
+                        normalized_path = self.state_manager.normalize_path(file_key)
+                        imported_files = self.state_manager.get_imported_files()
+                        if normalized_path in imported_files:
+                            stored = imported_files[normalized_path]
+                            import_time_str = stored.get("imported_at")
+                            if import_time_str:
+                                import_time = datetime.fromisoformat(import_time_str.replace("Z", "+00:00")).timestamp()
+                                if file_mtime <= import_time:
+                                    logger.debug(f"Skipping already imported: {file_path.name}")
+                                    continue
+                    except Exception as e:
+                        logger.debug(f"Error checking import status: {e}")
+
                     success = await self.process_file(file_path)
-
+
                     if success:
                         # Clean up first_seen tracking to prevent memory leak
                         self.file_first_seen.pop(file_key, None)
-
-                        self.…
+                        # Update progress (state is managed by UnifiedStateManager)
+                        status = self.state_manager.get_status()
+                        self.progress.update(status["indexed_files"])
 
                 # Log comprehensive metrics
                 if batch or cycle_count % 6 == 0:  # Every minute if idle
@@ -1519,7 +1475,6 @@
             raise
         finally:
             logger.info("Shutting down...")
-            await self.save_state()
             await self.embedding_provider.close()
             await self.qdrant_service.close()
 