claude-self-reflect 3.2.4 → 3.3.1
This diff shows the changes between publicly available package versions as released to a supported registry. It is provided for informational purposes only.
- package/.claude/agents/claude-self-reflect-test.md +992 -510
- package/.claude/agents/reflection-specialist.md +59 -3
- package/README.md +14 -5
- package/installer/cli.js +16 -0
- package/installer/postinstall.js +14 -0
- package/installer/statusline-setup.js +289 -0
- package/mcp-server/run-mcp.sh +73 -5
- package/mcp-server/src/app_context.py +64 -0
- package/mcp-server/src/config.py +57 -0
- package/mcp-server/src/connection_pool.py +286 -0
- package/mcp-server/src/decay_manager.py +106 -0
- package/mcp-server/src/embedding_manager.py +64 -40
- package/mcp-server/src/embeddings_old.py +141 -0
- package/mcp-server/src/models.py +64 -0
- package/mcp-server/src/parallel_search.py +305 -0
- package/mcp-server/src/project_resolver.py +5 -0
- package/mcp-server/src/reflection_tools.py +211 -0
- package/mcp-server/src/rich_formatting.py +196 -0
- package/mcp-server/src/search_tools.py +874 -0
- package/mcp-server/src/server.py +127 -1720
- package/mcp-server/src/temporal_design.py +132 -0
- package/mcp-server/src/temporal_tools.py +604 -0
- package/mcp-server/src/temporal_utils.py +384 -0
- package/mcp-server/src/utils.py +150 -67
- package/package.json +15 -1
- package/scripts/add-timestamp-indexes.py +134 -0
- package/scripts/ast_grep_final_analyzer.py +325 -0
- package/scripts/ast_grep_unified_registry.py +556 -0
- package/scripts/check-collections.py +29 -0
- package/scripts/csr-status +366 -0
- package/scripts/debug-august-parsing.py +76 -0
- package/scripts/debug-import-single.py +91 -0
- package/scripts/debug-project-resolver.py +82 -0
- package/scripts/debug-temporal-tools.py +135 -0
- package/scripts/delta-metadata-update.py +547 -0
- package/scripts/import-conversations-unified.py +157 -25
- package/scripts/precompact-hook.sh +33 -0
- package/scripts/session_quality_tracker.py +481 -0
- package/scripts/streaming-watcher.py +1578 -0
- package/scripts/update_patterns.py +334 -0
- package/scripts/utils.py +39 -0
package/scripts/debug-temporal-tools.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+"""
+Debug script for testing temporal tools in Claude Self Reflect.
+This script directly tests the temporal tools that should be available via MCP.
+"""
+
+import os
+import sys
+import asyncio
+import json
+import traceback
+from pathlib import Path
+
+# Add the mcp-server source to Python path
+sys.path.append(str(Path(__file__).parent.parent / "mcp-server" / "src"))
+
+os.environ["QDRANT_URL"] = "http://localhost:6333"
+
+async def test_temporal_tools():
+    """Test all temporal tools."""
+    print("=== TEMPORAL TOOLS DEBUG SCRIPT ===")
+
+    try:
+        # Import required modules
+        from server import (
+            get_recent_work, search_by_recency, get_timeline,
+            get_all_collections, QDRANT_URL
+        )
+        from fastmcp import Context
+
+        print(f"✅ Successfully imported temporal tools")
+        print(f"✅ Qdrant URL: {QDRANT_URL}")
+
+        # Check if Qdrant is available
+        collections = await get_all_collections()
+        print(f"✅ Found {len(collections)} collections: {collections[:5]}...")
+
+        # Create a mock context for testing
+        class MockContext(Context):
+            def __init__(self):
+                pass
+            async def debug(self, message):
+                print(f"DEBUG: {message}")
+            async def error(self, message):
+                print(f"ERROR: {message}")
+
+        ctx = MockContext()
+
+        # Test 1: get_recent_work with default parameters
+        print("\n--- Test 1: get_recent_work (default) ---")
+        try:
+            result = await get_recent_work(ctx)
+            print(f"✅ get_recent_work succeeded")
+            print(f"Result length: {len(result) if result else 0} characters")
+            if result and len(result) < 500:
+                print(f"Result: {result}")
+        except Exception as e:
+            print(f"❌ get_recent_work failed: {e}")
+            traceback.print_exc()
+
+        # Test 2: get_recent_work with project='all'
+        print("\n--- Test 2: get_recent_work (project=all) ---")
+        try:
+            result = await get_recent_work(ctx, project="all", limit=5)
+            print(f"✅ get_recent_work (project=all) succeeded")
+            print(f"Result length: {len(result) if result else 0} characters")
+        except Exception as e:
+            print(f"❌ get_recent_work (project=all) failed: {e}")
+            traceback.print_exc()
+
+        # Test 3: get_recent_work with different group_by options
+        for group_by in ["conversation", "day", "session"]:
+            print(f"\n--- Test 3.{group_by}: get_recent_work (group_by={group_by}) ---")
+            try:
+                result = await get_recent_work(ctx, limit=3, group_by=group_by)
+                print(f"✅ get_recent_work (group_by={group_by}) succeeded")
+                print(f"Result length: {len(result) if result else 0} characters")
+            except Exception as e:
+                print(f"❌ get_recent_work (group_by={group_by}) failed: {e}")
+                traceback.print_exc()
+
+        # Test 4: search_by_recency with time_range
+        print("\n--- Test 4: search_by_recency (time_range) ---")
+        try:
+            result = await search_by_recency(
+                ctx,
+                query="testing debugging",
+                time_range="last week",
+                limit=5
+            )
+            print(f"✅ search_by_recency (time_range) succeeded")
+            print(f"Result length: {len(result) if result else 0} characters")
+        except Exception as e:
+            print(f"❌ search_by_recency (time_range) failed: {e}")
+            traceback.print_exc()
+
+        # Test 5: search_by_recency with since/until
+        print("\n--- Test 5: search_by_recency (since/until) ---")
+        try:
+            result = await search_by_recency(
+                ctx,
+                query="python script",
+                since="yesterday",
+                limit=3
+            )
+            print(f"✅ search_by_recency (since/until) succeeded")
+            print(f"Result length: {len(result) if result else 0} characters")
+        except Exception as e:
+            print(f"❌ search_by_recency (since/until) failed: {e}")
+            traceback.print_exc()
+
+        # Test 6: get_timeline with different granularities
+        for granularity in ["day", "week"]:
+            print(f"\n--- Test 6.{granularity}: get_timeline (granularity={granularity}) ---")
+            try:
+                result = await get_timeline(
+                    ctx,
+                    time_range="last week",
+                    granularity=granularity,
+                    include_stats=True
+                )
+                print(f"✅ get_timeline (granularity={granularity}) succeeded")
+                print(f"Result length: {len(result) if result else 0} characters")
+            except Exception as e:
+                print(f"❌ get_timeline (granularity={granularity}) failed: {e}")
+                traceback.print_exc()
+
+        print("\n=== TEMPORAL TOOLS TEST COMPLETE ===")
+
+    except Exception as e:
+        print(f"❌ Critical error during setup: {e}")
+        traceback.print_exc()
+
+if __name__ == "__main__":
+    asyncio.run(test_temporal_tools())
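The debug script above drives the MCP tool functions directly, without a running MCP client, by subclassing fastmcp's Context and stubbing out its logging callbacks. A minimal sketch of that same pattern follows; my_tool is a hypothetical stand-in for one of the imported tools and is not part of the package.

import asyncio
from fastmcp import Context


class StubContext(Context):
    """Bypass Context.__init__ so no live MCP session is required (same trick as MockContext above)."""
    def __init__(self):
        pass

    async def debug(self, message):
        print(f"DEBUG: {message}")

    async def error(self, message):
        print(f"ERROR: {message}")


async def my_tool(ctx: Context, query: str) -> str:
    """Hypothetical stand-in for a tool such as search_by_recency."""
    await ctx.debug(f"searching for {query!r}")
    return f"results for {query}"


if __name__ == "__main__":
    # Call the tool coroutine directly with the stub context.
    print(asyncio.run(my_tool(StubContext(), "testing debugging")))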
package/scripts/delta-metadata-update.py
@@ -0,0 +1,547 @@
+#!/usr/bin/env python3
+"""
+Delta metadata update script for Claude Self-Reflect.
+Updates existing Qdrant points with tool usage metadata without re-importing vectors.
+This allows us to enhance past conversations with file tracking and concept extraction.
+"""
+
+import os
+import sys
+import json
+import hashlib
+import re
+import time
+from datetime import datetime, timedelta
+from typing import List, Dict, Any, Set, Tuple, Optional
+import logging
+from pathlib import Path
+
+from qdrant_client import QdrantClient
+from qdrant_client.models import Filter, FieldCondition, MatchValue
+
+# Configuration
+QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
+LOGS_DIR = os.getenv("LOGS_DIR", os.path.expanduser("~/.claude/projects"))
+STATE_FILE = os.getenv("STATE_FILE", "./config/delta-update-state.json")
+PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "true").lower() == "true"
+DRY_RUN = os.getenv("DRY_RUN", "false").lower() == "true"
+DAYS_TO_UPDATE = int(os.getenv("DAYS_TO_UPDATE", "7"))
+
+# Set up logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+# Initialize Qdrant client
+client = QdrantClient(url=QDRANT_URL)
+
+def get_collection_suffix():
+    """Get the collection suffix based on embedding type."""
+    return "_local" if PREFER_LOCAL_EMBEDDINGS else "_voyage"
+
+def normalize_project_name(project_name: str) -> str:
+    """Normalize project name by removing path-like prefixes."""
+    # Remove path-like prefixes (e.g., "-Users-username-projects-")
+    if project_name.startswith("-"):
+        # Split by '-' and reconstruct
+        parts = project_name.split("-")
+        # Find where the actual project name starts (usually after 'projects')
+        for i, part in enumerate(parts):
+            if part == "projects" and i < len(parts) - 1:
+                return "-".join(parts[i+1:])
+    return project_name
+
+def normalize_path(path: str) -> str:
+    """Normalize file paths for consistency across platforms."""
+    if not path:
+        return ""
+
+    # Remove common prefixes
+    path = path.replace("/Users/", "~/")
+    path = path.replace("\\Users\\", "~\\")
+
+    # Convert to forward slashes
+    path = path.replace("\\", "/")
+
+    # Remove duplicate slashes
+    path = re.sub(r'/+', '/', path)
+
+    return path
+
+def extract_concepts(text: str, tool_usage: Dict[str, Any]) -> Set[str]:
+    """Extract high-level concepts from conversation and tool usage."""
+    concepts = set()
+
+    # Common development concepts with patterns
+    concept_patterns = {
+        'security': r'(security|vulnerability|CVE|injection|sanitize|escape|auth|token|JWT)',
+        'performance': r'(performance|optimization|speed|memory|efficient|benchmark|latency)',
+        'testing': r'(test|pytest|unittest|coverage|TDD|spec|assert)',
+        'docker': r'(docker|container|compose|dockerfile|kubernetes|k8s)',
+        'api': r'(API|REST|GraphQL|endpoint|webhook|http|request)',
+        'database': r'(database|SQL|query|migration|schema|postgres|mysql|mongodb|qdrant)',
+        'authentication': r'(auth|login|token|JWT|session|oauth|permission)',
+        'debugging': r'(debug|error|exception|traceback|log|stack|trace)',
+        'refactoring': r'(refactor|cleanup|improve|restructure|optimize|technical debt)',
+        'deployment': r'(deploy|CI/CD|release|production|staging|rollout)',
+        'git': r'(git|commit|branch|merge|pull request|PR|rebase)',
+        'architecture': r'(architecture|design|pattern|structure|component|module)',
+        'mcp': r'(MCP|claude-self-reflect|tool|agent|claude code)',
+        'embeddings': r'(embedding|vector|semantic|similarity|fastembed|voyage)',
+        'search': r'(search|query|find|filter|match|relevance)'
+    }
+
+    # Check text content
+    combined_text = text.lower()
+    for concept, pattern in concept_patterns.items():
+        if re.search(pattern, combined_text, re.IGNORECASE):
+            concepts.add(concept)
+
+    # Check tool usage patterns
+    if tool_usage.get('grep_searches'):
+        concepts.add('search')
+    if tool_usage.get('files_edited') or tool_usage.get('files_created'):
+        concepts.add('development')
+    if any('test' in str(f).lower() for f in tool_usage.get('files_read', [])):
+        concepts.add('testing')
+    if any('docker' in str(cmd).lower() for cmd in tool_usage.get('bash_commands', [])):
+        concepts.add('docker')
+
+    return concepts
+
+def extract_tool_usage_from_jsonl(jsonl_path: str) -> Dict[str, Any]:
+    """Extract all tool usage from a conversation."""
+    tool_usage = {
+        "files_read": [],
+        "files_edited": [],
+        "files_created": [],
+        "grep_searches": [],
+        "bash_commands": [],
+        "glob_patterns": [],
+        "task_calls": [],
+        "web_searches": [],
+        "tools_summary": {}
+    }
+
+    try:
+        with open(jsonl_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                if not line.strip():
+                    continue
+
+                try:
+                    data = json.loads(line)
+
+                    # Look for tool usage in message content
+                    if 'message' in data and data['message']:
+                        msg = data['message']
+                        if msg.get('role') == 'assistant' and msg.get('content'):
+                            content = msg['content']
+
+                            # Handle content as list of objects
+                            if isinstance(content, list):
+                                for item in content:
+                                    if isinstance(item, dict) and item.get('type') == 'tool_use':
+                                        extract_tool_data(item, tool_usage)
+                            # Handle content as string (legacy format)
+                            elif isinstance(content, str):
+                                # Try to extract tool usage from text patterns
+                                extract_tools_from_text(content, tool_usage)
+
+                except json.JSONDecodeError:
+                    continue
+                except Exception as e:
+                    logger.debug(f"Error processing line: {e}")
+
+    except Exception as e:
+        logger.error(f"Error reading JSONL file {jsonl_path}: {e}")
+
+    # Calculate tools summary
+    all_tools = []
+    if tool_usage['files_read']:
+        all_tools.extend(['Read'] * len(tool_usage['files_read']))
+    if tool_usage['files_edited']:
+        all_tools.extend(['Edit'] * len(tool_usage['files_edited']))
+    if tool_usage['files_created']:
+        all_tools.extend(['Write'] * len(tool_usage['files_created']))
+    if tool_usage['grep_searches']:
+        all_tools.extend(['Grep'] * len(tool_usage['grep_searches']))
+    if tool_usage['bash_commands']:
+        all_tools.extend(['Bash'] * len(tool_usage['bash_commands']))
+    if tool_usage['glob_patterns']:
+        all_tools.extend(['Glob'] * len(tool_usage['glob_patterns']))
+    if tool_usage['task_calls']:
+        all_tools.extend(['Task'] * len(tool_usage['task_calls']))
+    if tool_usage['web_searches']:
+        all_tools.extend(['WebSearch'] * len(tool_usage['web_searches']))
+
+    # Count tool usage
+    for tool in all_tools:
+        tool_usage['tools_summary'][tool] = tool_usage['tools_summary'].get(tool, 0) + 1
+
+    return tool_usage
+
+def extract_tool_data(tool_use: Dict[str, Any], usage_dict: Dict[str, Any]):
+    """Extract tool usage data from a tool_use object."""
+    tool_name = tool_use.get('name', '')
+    inputs = tool_use.get('input', {})
+
+    # Handle Read tool
+    if tool_name == 'Read':
+        file_path = inputs.get('file_path')
+        if file_path:
+            usage_dict['files_read'].append({
+                'path': normalize_path(file_path),
+                'operation': 'read'
+            })
+
+    # Handle Edit and MultiEdit tools
+    elif tool_name in ['Edit', 'MultiEdit']:
+        path = inputs.get('file_path')
+        if path:
+            usage_dict['files_edited'].append({
+                'path': normalize_path(path),
+                'operation': tool_name.lower()
+            })
+
+    # Handle Write tool
+    elif tool_name == 'Write':
+        path = inputs.get('file_path')
+        if path:
+            usage_dict['files_created'].append({
+                'path': normalize_path(path),
+                'operation': 'write'
+            })
+
+    # Handle Grep tool
+    elif tool_name == 'Grep':
+        pattern = inputs.get('pattern')
+        path = inputs.get('path', '.')
+        if pattern:
+            usage_dict['grep_searches'].append({
+                'pattern': pattern,
+                'path': normalize_path(path)
+            })
+
+    # Handle Bash tool
+    elif tool_name == 'Bash':
+        command = inputs.get('command')
+        if command:
+            usage_dict['bash_commands'].append({
+                'command': command[:200]  # Limit command length
+            })
+
+    # Handle Glob tool
+    elif tool_name == 'Glob':
+        pattern = inputs.get('pattern')
+        if pattern:
+            usage_dict['glob_patterns'].append({
+                'pattern': pattern
+            })
+
+    # Handle Task tool
+    elif tool_name == 'Task':
+        agent = inputs.get('subagent_type', 'unknown')
+        usage_dict['task_calls'].append({
+            'agent': agent
+        })
+
+    # Handle WebSearch tool
+    elif tool_name == 'WebSearch':
+        query = inputs.get('query')
+        if query:
+            usage_dict['web_searches'].append({
+                'query': query[:100]
+            })
+
+def extract_tools_from_text(content: str, usage_dict: Dict[str, Any]):
+    """Extract tool usage from text content (fallback for legacy format)."""
+    # Look for file paths that might have been read/edited
+    file_pattern = r'(?:Reading|Editing|Writing|Checking)\s+(?:file\s+)?([/~][\w\-./]+\.\w+)'
+    for match in re.finditer(file_pattern, content):
+        file_path = match.group(1)
+        if 'Edit' in match.group(0):
+            usage_dict['files_edited'].append({
+                'path': normalize_path(file_path),
+                'operation': 'edit'
+            })
+        else:
+            usage_dict['files_read'].append({
+                'path': normalize_path(file_path),
+                'operation': 'read'
+            })
+
+def load_state():
+    """Load the delta update state."""
+    state_path = Path(STATE_FILE)
+    if state_path.exists():
+        try:
+            with open(state_path, 'r') as f:
+                return json.load(f)
+        except Exception as e:
+            logger.warning(f"Failed to load state: {e}")
+
+    return {
+        "last_update": None,
+        "updated_conversations": {}
+    }
+
+def save_state(state: Dict[str, Any]):
+    """Save the delta update state."""
+    state_path = Path(STATE_FILE)
+    state_path.parent.mkdir(parents=True, exist_ok=True)
+
+    try:
+        with open(state_path, 'w') as f:
+            json.dump(state, f, indent=2)
+    except Exception as e:
+        logger.error(f"Failed to save state: {e}")
+
+def get_recent_conversations(days: int = 7) -> List[Path]:
+    """Get conversation files from the past N days."""
+    recent_files = []
+    cutoff_time = datetime.now() - timedelta(days=days)
+
+    logs_path = Path(LOGS_DIR)
+    if not logs_path.exists():
+        logger.error(f"Logs directory not found: {LOGS_DIR}")
+        return recent_files
+
+    # Find all JSONL files
+    for jsonl_file in logs_path.glob("**/*.jsonl"):
+        try:
+            # Check file modification time
+            mtime = datetime.fromtimestamp(jsonl_file.stat().st_mtime)
+            if mtime >= cutoff_time:
+                recent_files.append(jsonl_file)
+        except Exception as e:
+            logger.debug(f"Error checking file {jsonl_file}: {e}")
+
+    logger.info(f"Found {len(recent_files)} conversations from the past {days} days")
+    return recent_files
+
+def update_point_metadata(conversation_id: str, chunk_index: int, metadata: Dict[str, Any],
+                          collection_name: str) -> bool:
+    """Update metadata for a specific point in Qdrant."""
+    try:
+        # Calculate point ID (same as original import)
+        point_id_str = hashlib.md5(
+            f"{conversation_id}_{chunk_index}".encode()
+        ).hexdigest()[:16]
+        point_id = int(point_id_str, 16) % (2**63)
+
+        if DRY_RUN:
+            logger.info(f"[DRY RUN] Would update point {point_id} with metadata")
+            return True
+
+        # First, try to get the existing point to preserve other fields
+        try:
+            existing_points = client.retrieve(
+                collection_name=collection_name,
+                ids=[point_id],
+                with_payload=True,
+                with_vectors=False
+            )
+
+            if existing_points:
+                # Merge with existing payload
+                existing_payload = existing_points[0].payload
+                existing_payload.update(metadata)
+                metadata = existing_payload
+        except Exception as e:
+            logger.debug(f"Could not retrieve existing point {point_id}: {e}")
+
+        # Use set_payload to update just the metadata without touching the vector
+        client.set_payload(
+            collection_name=collection_name,
+            payload=metadata,
+            points=[point_id],
+            wait=False  # Don't wait for each point
+        )
+
+        return True
+
+    except Exception as e:
+        import traceback
+        logger.error(f"Failed to update point {conversation_id}_{chunk_index}: {e}")
+        logger.debug(traceback.format_exc())
+        return False
+
+def process_conversation(jsonl_file: Path, state: Dict[str, Any]) -> bool:
+    """Process a single conversation file and update its metadata."""
+    try:
+        conversation_id = jsonl_file.stem
+        project_name = jsonl_file.parent.name
+
+        # Check if already updated
+        if conversation_id in state.get("updated_conversations", {}):
+            last_updated = state["updated_conversations"][conversation_id].get("updated_at")
+            file_mtime = jsonl_file.stat().st_mtime
+            if last_updated and last_updated >= file_mtime:
+                logger.debug(f"Skipping {conversation_id} - already updated")
+                return True
+
+        logger.info(f"Processing: {conversation_id}")
+
+        # Extract tool usage metadata
+        tool_usage = extract_tool_usage_from_jsonl(str(jsonl_file))
+
+        # Read the full conversation to get text for concept extraction
+        conversation_text = ""
+        with open(jsonl_file, 'r', encoding='utf-8') as f:
+            for line in f:
+                if line.strip():
+                    try:
+                        data = json.loads(line)
+                        if 'message' in data and data['message']:
+                            msg = data['message']
+                            if msg.get('content'):
+                                if isinstance(msg['content'], str):
+                                    conversation_text += msg['content'] + "\n"
+                                elif isinstance(msg['content'], list):
+                                    for item in msg['content']:
+                                        if isinstance(item, dict) and item.get('text'):
+                                            conversation_text += item['text'] + "\n"
+                    except:
+                        continue
+
+        # Extract concepts
+        concepts = extract_concepts(conversation_text[:10000], tool_usage)  # Limit text for concept extraction
+
+        # Prepare metadata update
+        files_analyzed = list(set([
+            item['path'] if isinstance(item, dict) else item
+            for item in tool_usage.get('files_read', [])
+            if (isinstance(item, dict) and item.get('path')) or isinstance(item, str)
+        ]))[:20]  # Limit to 20 files
+
+        files_edited = list(set([
+            item['path'] if isinstance(item, dict) else item
+            for item in tool_usage.get('files_edited', [])
+            if (isinstance(item, dict) and item.get('path')) or isinstance(item, str)
+        ]))[:10]  # Limit to 10 files
+
+        metadata_update = {
+            "files_analyzed": files_analyzed,
+            "files_edited": files_edited,
+            "tools_used": list(tool_usage.get('tools_summary', {}).keys())[:20],
+            "tool_summary": dict(list(tool_usage.get('tools_summary', {}).items())[:10]),
+            "concepts": list(concepts)[:15],
+            "search_patterns": [s.get('pattern', '') for s in tool_usage.get('grep_searches', [])][:10],
+            "analysis_only": len(files_edited) == 0 and len(tool_usage.get('files_created', [])) == 0,
+            "has_file_metadata": True,  # Flag to indicate this has been enhanced
+            "metadata_updated_at": datetime.now().isoformat()
+        }
+
+        # Determine collection name
+        project_hash = hashlib.md5(normalize_project_name(project_name).encode()).hexdigest()[:8]
+        collection_name = f"conv_{project_hash}{get_collection_suffix()}"
+
+        # Check if collection exists
+        try:
+            collections = client.get_collections().collections
+            if collection_name not in [c.name for c in collections]:
+                logger.warning(f"Collection {collection_name} not found for project {project_name}")
+                return False
+        except Exception as e:
+            logger.error(f"Error checking collection: {e}")
+            return False
+
+        # Get the number of chunks for this conversation
+        # We need to know how many chunks were created during original import
+        # For now, we'll try to update up to 50 chunks (most conversations have fewer)
+        max_chunks = 50
+        updated_count = 0
+        failed_count = 0
+
+        for chunk_index in range(max_chunks):
+            success = update_point_metadata(
+                conversation_id,
+                chunk_index,
+                metadata_update,
+                collection_name
+            )
+
+            if success:
+                updated_count += 1
+            else:
+                failed_count += 1
+                # If we get too many failures in a row, the conversation probably has fewer chunks
+                if failed_count > 5:
+                    break
+
+        if updated_count > 0:
+            logger.info(f"Updated {updated_count} chunks for {conversation_id}")
+
+            # Update state
+            state["updated_conversations"][conversation_id] = {
+                "updated_at": time.time(),
+                "chunks_updated": updated_count,
+                "project": project_name
+            }
+
+            return True
+        else:
+            logger.warning(f"No chunks updated for {conversation_id}")
+            return False
+
+    except Exception as e:
+        logger.error(f"Failed to process {jsonl_file}: {e}")
+        return False
+
+def main():
+    """Main delta update function."""
+    logger.info("=== Starting Delta Metadata Update ===")
+    logger.info(f"Configuration:")
+    logger.info(f"  Qdrant URL: {QDRANT_URL}")
+    logger.info(f"  Logs directory: {LOGS_DIR}")
+    logger.info(f"  Days to update: {DAYS_TO_UPDATE}")
+    logger.info(f"  Embedding type: {'local' if PREFER_LOCAL_EMBEDDINGS else 'voyage'}")
+    logger.info(f"  Dry run: {DRY_RUN}")
+
+    # Load state
+    state = load_state()
+
+    # Get recent conversations
+    recent_files = get_recent_conversations(DAYS_TO_UPDATE)
+
+    if not recent_files:
+        logger.info("No recent conversations found to update")
+        return
+
+    # Limit for testing
+    if os.getenv("LIMIT"):
+        limit = int(os.getenv("LIMIT"))
+        recent_files = recent_files[:limit]
+        logger.info(f"Limited to {limit} files for testing")
+
+    # Process each conversation
+    success_count = 0
+    failure_count = 0
+
+    for i, jsonl_file in enumerate(recent_files, 1):
+        logger.info(f"Processing {i}/{len(recent_files)}: {jsonl_file.name}")
+
+        if process_conversation(jsonl_file, state):
+            success_count += 1
+        else:
+            failure_count += 1
+
+        # Save state periodically
+        if i % 10 == 0:
+            save_state(state)
+
+    # Final state save
+    state["last_update"] = datetime.now().isoformat()
+    save_state(state)
+
+    # Summary
+    logger.info("=== Delta Update Complete ===")
+    logger.info(f"Successfully updated: {success_count} conversations")
+    logger.info(f"Failed: {failure_count} conversations")
+    logger.info(f"Total conversations in state: {len(state['updated_conversations'])}")
+
+    if DRY_RUN:
+        logger.info("This was a DRY RUN - no actual updates were made")
+
+if __name__ == "__main__":
+    main()
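The delta updater never re-embeds or re-imports anything: it recomputes the deterministic point ID used at original import time (MD5 of "{conversation_id}_{chunk_index}", truncated to 16 hex characters, then reduced modulo 2**63) and patches only the payload via set_payload. A minimal sketch of just that ID derivation, with a placeholder conversation ID for illustration; a cautious first run of the script itself can set DRY_RUN=true and a small LIMIT in the environment, since both are read at startup.

import hashlib

def derive_point_id(conversation_id: str, chunk_index: int) -> int:
    """Reproduce the deterministic point ID scheme used by update_point_metadata above."""
    digest = hashlib.md5(f"{conversation_id}_{chunk_index}".encode()).hexdigest()[:16]
    return int(digest, 16) % (2**63)

# Placeholder conversation ID, not a real one from any store.
for chunk_index in range(3):
    print(chunk_index, derive_point_id("00000000-example-conversation", chunk_index))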