claude-self-reflect 3.2.3 → 3.3.0
This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- package/.claude/agents/claude-self-reflect-test.md +595 -528
- package/.claude/agents/documentation-writer.md +1 -1
- package/.claude/agents/qdrant-specialist.md +2 -2
- package/.claude/agents/reflection-specialist.md +61 -5
- package/.claude/agents/search-optimizer.md +9 -7
- package/README.md +16 -9
- package/mcp-server/pyproject.toml +1 -1
- package/mcp-server/run-mcp.sh +49 -5
- package/mcp-server/src/app_context.py +64 -0
- package/mcp-server/src/config.py +57 -0
- package/mcp-server/src/connection_pool.py +286 -0
- package/mcp-server/src/decay_manager.py +106 -0
- package/mcp-server/src/embedding_manager.py +64 -40
- package/mcp-server/src/embeddings_old.py +141 -0
- package/mcp-server/src/models.py +64 -0
- package/mcp-server/src/parallel_search.py +371 -0
- package/mcp-server/src/project_resolver.py +33 -46
- package/mcp-server/src/reflection_tools.py +206 -0
- package/mcp-server/src/rich_formatting.py +196 -0
- package/mcp-server/src/search_tools.py +826 -0
- package/mcp-server/src/server.py +140 -1715
- package/mcp-server/src/temporal_design.py +132 -0
- package/mcp-server/src/temporal_tools.py +597 -0
- package/mcp-server/src/temporal_utils.py +384 -0
- package/mcp-server/src/utils.py +150 -67
- package/package.json +11 -1
- package/scripts/add-timestamp-indexes.py +134 -0
- package/scripts/check-collections.py +29 -0
- package/scripts/debug-august-parsing.py +76 -0
- package/scripts/debug-import-single.py +91 -0
- package/scripts/debug-project-resolver.py +82 -0
- package/scripts/debug-temporal-tools.py +135 -0
- package/scripts/delta-metadata-update.py +547 -0
- package/scripts/import-conversations-unified.py +65 -6
- package/scripts/importer/utils/project_normalizer.py +22 -9
- package/scripts/precompact-hook.sh +33 -0
- package/scripts/streaming-watcher.py +1443 -0
- package/scripts/utils.py +39 -0
- package/shared/__init__.py +5 -0
- package/shared/normalization.py +54 -0
package/scripts/delta-metadata-update.py

@@ -0,0 +1,547 @@
+#!/usr/bin/env python3
+"""
+Delta metadata update script for Claude Self-Reflect.
+Updates existing Qdrant points with tool usage metadata without re-importing vectors.
+This allows us to enhance past conversations with file tracking and concept extraction.
+"""
+
+import os
+import sys
+import json
+import hashlib
+import re
+import time
+from datetime import datetime, timedelta
+from typing import List, Dict, Any, Set, Tuple, Optional
+import logging
+from pathlib import Path
+
+from qdrant_client import QdrantClient
+from qdrant_client.models import Filter, FieldCondition, MatchValue
+
+# Configuration
+QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
+LOGS_DIR = os.getenv("LOGS_DIR", os.path.expanduser("~/.claude/projects"))
+STATE_FILE = os.getenv("STATE_FILE", "./config/delta-update-state.json")
+PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "true").lower() == "true"
+DRY_RUN = os.getenv("DRY_RUN", "false").lower() == "true"
+DAYS_TO_UPDATE = int(os.getenv("DAYS_TO_UPDATE", "7"))
+
+# Set up logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+# Initialize Qdrant client
+client = QdrantClient(url=QDRANT_URL)
+
+def get_collection_suffix():
+    """Get the collection suffix based on embedding type."""
+    return "_local" if PREFER_LOCAL_EMBEDDINGS else "_voyage"
+
+def normalize_project_name(project_name: str) -> str:
+    """Normalize project name by removing path-like prefixes."""
+    # Remove path-like prefixes (e.g., "-Users-username-projects-")
+    if project_name.startswith("-"):
+        # Split by '-' and reconstruct
+        parts = project_name.split("-")
+        # Find where the actual project name starts (usually after 'projects')
+        for i, part in enumerate(parts):
+            if part == "projects" and i < len(parts) - 1:
+                return "-".join(parts[i+1:])
+    return project_name
+
+def normalize_path(path: str) -> str:
+    """Normalize file paths for consistency across platforms."""
+    if not path:
+        return ""
+
+    # Remove common prefixes
+    path = path.replace("/Users/", "~/")
+    path = path.replace("\\Users\\", "~\\")
+
+    # Convert to forward slashes
+    path = path.replace("\\", "/")
+
+    # Remove duplicate slashes
+    path = re.sub(r'/+', '/', path)
+
+    return path
+
+def extract_concepts(text: str, tool_usage: Dict[str, Any]) -> Set[str]:
+    """Extract high-level concepts from conversation and tool usage."""
+    concepts = set()
+
+    # Common development concepts with patterns
+    concept_patterns = {
+        'security': r'(security|vulnerability|CVE|injection|sanitize|escape|auth|token|JWT)',
+        'performance': r'(performance|optimization|speed|memory|efficient|benchmark|latency)',
+        'testing': r'(test|pytest|unittest|coverage|TDD|spec|assert)',
+        'docker': r'(docker|container|compose|dockerfile|kubernetes|k8s)',
+        'api': r'(API|REST|GraphQL|endpoint|webhook|http|request)',
+        'database': r'(database|SQL|query|migration|schema|postgres|mysql|mongodb|qdrant)',
+        'authentication': r'(auth|login|token|JWT|session|oauth|permission)',
+        'debugging': r'(debug|error|exception|traceback|log|stack|trace)',
+        'refactoring': r'(refactor|cleanup|improve|restructure|optimize|technical debt)',
+        'deployment': r'(deploy|CI/CD|release|production|staging|rollout)',
+        'git': r'(git|commit|branch|merge|pull request|PR|rebase)',
+        'architecture': r'(architecture|design|pattern|structure|component|module)',
+        'mcp': r'(MCP|claude-self-reflect|tool|agent|claude code)',
+        'embeddings': r'(embedding|vector|semantic|similarity|fastembed|voyage)',
+        'search': r'(search|query|find|filter|match|relevance)'
+    }
+
+    # Check text content
+    combined_text = text.lower()
+    for concept, pattern in concept_patterns.items():
+        if re.search(pattern, combined_text, re.IGNORECASE):
+            concepts.add(concept)
+
+    # Check tool usage patterns
+    if tool_usage.get('grep_searches'):
+        concepts.add('search')
+    if tool_usage.get('files_edited') or tool_usage.get('files_created'):
+        concepts.add('development')
+    if any('test' in str(f).lower() for f in tool_usage.get('files_read', [])):
+        concepts.add('testing')
+    if any('docker' in str(cmd).lower() for cmd in tool_usage.get('bash_commands', [])):
+        concepts.add('docker')
+
+    return concepts
+
+def extract_tool_usage_from_jsonl(jsonl_path: str) -> Dict[str, Any]:
+    """Extract all tool usage from a conversation."""
+    tool_usage = {
+        "files_read": [],
+        "files_edited": [],
+        "files_created": [],
+        "grep_searches": [],
+        "bash_commands": [],
+        "glob_patterns": [],
+        "task_calls": [],
+        "web_searches": [],
+        "tools_summary": {}
+    }
+
+    try:
+        with open(jsonl_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                if not line.strip():
+                    continue
+
+                try:
+                    data = json.loads(line)
+
+                    # Look for tool usage in message content
+                    if 'message' in data and data['message']:
+                        msg = data['message']
+                        if msg.get('role') == 'assistant' and msg.get('content'):
+                            content = msg['content']
+
+                            # Handle content as list of objects
+                            if isinstance(content, list):
+                                for item in content:
+                                    if isinstance(item, dict) and item.get('type') == 'tool_use':
+                                        extract_tool_data(item, tool_usage)
+                            # Handle content as string (legacy format)
+                            elif isinstance(content, str):
+                                # Try to extract tool usage from text patterns
+                                extract_tools_from_text(content, tool_usage)
+
+                except json.JSONDecodeError:
+                    continue
+                except Exception as e:
+                    logger.debug(f"Error processing line: {e}")
+
+    except Exception as e:
+        logger.error(f"Error reading JSONL file {jsonl_path}: {e}")
+
+    # Calculate tools summary
+    all_tools = []
+    if tool_usage['files_read']:
+        all_tools.extend(['Read'] * len(tool_usage['files_read']))
+    if tool_usage['files_edited']:
+        all_tools.extend(['Edit'] * len(tool_usage['files_edited']))
+    if tool_usage['files_created']:
+        all_tools.extend(['Write'] * len(tool_usage['files_created']))
+    if tool_usage['grep_searches']:
+        all_tools.extend(['Grep'] * len(tool_usage['grep_searches']))
+    if tool_usage['bash_commands']:
+        all_tools.extend(['Bash'] * len(tool_usage['bash_commands']))
+    if tool_usage['glob_patterns']:
+        all_tools.extend(['Glob'] * len(tool_usage['glob_patterns']))
+    if tool_usage['task_calls']:
+        all_tools.extend(['Task'] * len(tool_usage['task_calls']))
+    if tool_usage['web_searches']:
+        all_tools.extend(['WebSearch'] * len(tool_usage['web_searches']))
+
+    # Count tool usage
+    for tool in all_tools:
+        tool_usage['tools_summary'][tool] = tool_usage['tools_summary'].get(tool, 0) + 1
+
+    return tool_usage
+
+def extract_tool_data(tool_use: Dict[str, Any], usage_dict: Dict[str, Any]):
+    """Extract tool usage data from a tool_use object."""
+    tool_name = tool_use.get('name', '')
+    inputs = tool_use.get('input', {})
+
+    # Handle Read tool
+    if tool_name == 'Read':
+        file_path = inputs.get('file_path')
+        if file_path:
+            usage_dict['files_read'].append({
+                'path': normalize_path(file_path),
+                'operation': 'read'
+            })
+
+    # Handle Edit and MultiEdit tools
+    elif tool_name in ['Edit', 'MultiEdit']:
+        path = inputs.get('file_path')
+        if path:
+            usage_dict['files_edited'].append({
+                'path': normalize_path(path),
+                'operation': tool_name.lower()
+            })
+
+    # Handle Write tool
+    elif tool_name == 'Write':
+        path = inputs.get('file_path')
+        if path:
+            usage_dict['files_created'].append({
+                'path': normalize_path(path),
+                'operation': 'write'
+            })
+
+    # Handle Grep tool
+    elif tool_name == 'Grep':
+        pattern = inputs.get('pattern')
+        path = inputs.get('path', '.')
+        if pattern:
+            usage_dict['grep_searches'].append({
+                'pattern': pattern,
+                'path': normalize_path(path)
+            })
+
+    # Handle Bash tool
+    elif tool_name == 'Bash':
+        command = inputs.get('command')
+        if command:
+            usage_dict['bash_commands'].append({
+                'command': command[:200]  # Limit command length
+            })
+
+    # Handle Glob tool
+    elif tool_name == 'Glob':
+        pattern = inputs.get('pattern')
+        if pattern:
+            usage_dict['glob_patterns'].append({
+                'pattern': pattern
+            })
+
+    # Handle Task tool
+    elif tool_name == 'Task':
+        agent = inputs.get('subagent_type', 'unknown')
+        usage_dict['task_calls'].append({
+            'agent': agent
+        })
+
+    # Handle WebSearch tool
+    elif tool_name == 'WebSearch':
+        query = inputs.get('query')
+        if query:
+            usage_dict['web_searches'].append({
+                'query': query[:100]
+            })
+
+def extract_tools_from_text(content: str, usage_dict: Dict[str, Any]):
+    """Extract tool usage from text content (fallback for legacy format)."""
+    # Look for file paths that might have been read/edited
+    file_pattern = r'(?:Reading|Editing|Writing|Checking)\s+(?:file\s+)?([/~][\w\-./]+\.\w+)'
+    for match in re.finditer(file_pattern, content):
+        file_path = match.group(1)
+        if 'Edit' in match.group(0):
+            usage_dict['files_edited'].append({
+                'path': normalize_path(file_path),
+                'operation': 'edit'
+            })
+        else:
+            usage_dict['files_read'].append({
+                'path': normalize_path(file_path),
+                'operation': 'read'
+            })
+
+def load_state():
+    """Load the delta update state."""
+    state_path = Path(STATE_FILE)
+    if state_path.exists():
+        try:
+            with open(state_path, 'r') as f:
+                return json.load(f)
+        except Exception as e:
+            logger.warning(f"Failed to load state: {e}")
+
+    return {
+        "last_update": None,
+        "updated_conversations": {}
+    }
+
+def save_state(state: Dict[str, Any]):
+    """Save the delta update state."""
+    state_path = Path(STATE_FILE)
+    state_path.parent.mkdir(parents=True, exist_ok=True)
+
+    try:
+        with open(state_path, 'w') as f:
+            json.dump(state, f, indent=2)
+    except Exception as e:
+        logger.error(f"Failed to save state: {e}")
+
+def get_recent_conversations(days: int = 7) -> List[Path]:
+    """Get conversation files from the past N days."""
+    recent_files = []
+    cutoff_time = datetime.now() - timedelta(days=days)
+
+    logs_path = Path(LOGS_DIR)
+    if not logs_path.exists():
+        logger.error(f"Logs directory not found: {LOGS_DIR}")
+        return recent_files
+
+    # Find all JSONL files
+    for jsonl_file in logs_path.glob("**/*.jsonl"):
+        try:
+            # Check file modification time
+            mtime = datetime.fromtimestamp(jsonl_file.stat().st_mtime)
+            if mtime >= cutoff_time:
+                recent_files.append(jsonl_file)
+        except Exception as e:
+            logger.debug(f"Error checking file {jsonl_file}: {e}")
+
+    logger.info(f"Found {len(recent_files)} conversations from the past {days} days")
+    return recent_files
+
+def update_point_metadata(conversation_id: str, chunk_index: int, metadata: Dict[str, Any],
+                          collection_name: str) -> bool:
+    """Update metadata for a specific point in Qdrant."""
+    try:
+        # Calculate point ID (same as original import)
+        point_id_str = hashlib.md5(
+            f"{conversation_id}_{chunk_index}".encode()
+        ).hexdigest()[:16]
+        point_id = int(point_id_str, 16) % (2**63)
+
+        if DRY_RUN:
+            logger.info(f"[DRY RUN] Would update point {point_id} with metadata")
+            return True
+
+        # First, try to get the existing point to preserve other fields
+        try:
+            existing_points = client.retrieve(
+                collection_name=collection_name,
+                ids=[point_id],
+                with_payload=True,
+                with_vectors=False
+            )
+
+            if existing_points:
+                # Merge with existing payload
+                existing_payload = existing_points[0].payload
+                existing_payload.update(metadata)
+                metadata = existing_payload
+        except Exception as e:
+            logger.debug(f"Could not retrieve existing point {point_id}: {e}")
+
+        # Use set_payload to update just the metadata without touching the vector
+        client.set_payload(
+            collection_name=collection_name,
+            payload=metadata,
+            points=[point_id],
+            wait=False  # Don't wait for each point
+        )
+
+        return True
+
+    except Exception as e:
+        import traceback
+        logger.error(f"Failed to update point {conversation_id}_{chunk_index}: {e}")
+        logger.debug(traceback.format_exc())
+        return False
+
+def process_conversation(jsonl_file: Path, state: Dict[str, Any]) -> bool:
+    """Process a single conversation file and update its metadata."""
+    try:
+        conversation_id = jsonl_file.stem
+        project_name = jsonl_file.parent.name
+
+        # Check if already updated
+        if conversation_id in state.get("updated_conversations", {}):
+            last_updated = state["updated_conversations"][conversation_id].get("updated_at")
+            file_mtime = jsonl_file.stat().st_mtime
+            if last_updated and last_updated >= file_mtime:
+                logger.debug(f"Skipping {conversation_id} - already updated")
+                return True
+
+        logger.info(f"Processing: {conversation_id}")
+
+        # Extract tool usage metadata
+        tool_usage = extract_tool_usage_from_jsonl(str(jsonl_file))
+
+        # Read the full conversation to get text for concept extraction
+        conversation_text = ""
+        with open(jsonl_file, 'r', encoding='utf-8') as f:
+            for line in f:
+                if line.strip():
+                    try:
+                        data = json.loads(line)
+                        if 'message' in data and data['message']:
+                            msg = data['message']
+                            if msg.get('content'):
+                                if isinstance(msg['content'], str):
+                                    conversation_text += msg['content'] + "\n"
+                                elif isinstance(msg['content'], list):
+                                    for item in msg['content']:
+                                        if isinstance(item, dict) and item.get('text'):
+                                            conversation_text += item['text'] + "\n"
+                    except:
+                        continue
+
+        # Extract concepts
+        concepts = extract_concepts(conversation_text[:10000], tool_usage)  # Limit text for concept extraction
+
+        # Prepare metadata update
+        files_analyzed = list(set([
+            item['path'] if isinstance(item, dict) else item
+            for item in tool_usage.get('files_read', [])
+            if (isinstance(item, dict) and item.get('path')) or isinstance(item, str)
+        ]))[:20]  # Limit to 20 files
+
+        files_edited = list(set([
+            item['path'] if isinstance(item, dict) else item
+            for item in tool_usage.get('files_edited', [])
+            if (isinstance(item, dict) and item.get('path')) or isinstance(item, str)
+        ]))[:10]  # Limit to 10 files
+
+        metadata_update = {
+            "files_analyzed": files_analyzed,
+            "files_edited": files_edited,
+            "tools_used": list(tool_usage.get('tools_summary', {}).keys())[:20],
+            "tool_summary": dict(list(tool_usage.get('tools_summary', {}).items())[:10]),
+            "concepts": list(concepts)[:15],
+            "search_patterns": [s.get('pattern', '') for s in tool_usage.get('grep_searches', [])][:10],
+            "analysis_only": len(files_edited) == 0 and len(tool_usage.get('files_created', [])) == 0,
+            "has_file_metadata": True,  # Flag to indicate this has been enhanced
+            "metadata_updated_at": datetime.now().isoformat()
+        }
+
+        # Determine collection name
+        project_hash = hashlib.md5(normalize_project_name(project_name).encode()).hexdigest()[:8]
+        collection_name = f"conv_{project_hash}{get_collection_suffix()}"
+
+        # Check if collection exists
+        try:
+            collections = client.get_collections().collections
+            if collection_name not in [c.name for c in collections]:
+                logger.warning(f"Collection {collection_name} not found for project {project_name}")
+                return False
+        except Exception as e:
+            logger.error(f"Error checking collection: {e}")
+            return False
+
+        # Get the number of chunks for this conversation
+        # We need to know how many chunks were created during original import
+        # For now, we'll try to update up to 50 chunks (most conversations have fewer)
+        max_chunks = 50
+        updated_count = 0
+        failed_count = 0
+
+        for chunk_index in range(max_chunks):
+            success = update_point_metadata(
+                conversation_id,
+                chunk_index,
+                metadata_update,
+                collection_name
+            )
+
+            if success:
+                updated_count += 1
+            else:
+                failed_count += 1
+                # If we get too many failures in a row, the conversation probably has fewer chunks
+                if failed_count > 5:
+                    break
+
+        if updated_count > 0:
+            logger.info(f"Updated {updated_count} chunks for {conversation_id}")
+
+            # Update state
+            state["updated_conversations"][conversation_id] = {
+                "updated_at": time.time(),
+                "chunks_updated": updated_count,
+                "project": project_name
+            }
+
+            return True
+        else:
+            logger.warning(f"No chunks updated for {conversation_id}")
+            return False
+
+    except Exception as e:
+        logger.error(f"Failed to process {jsonl_file}: {e}")
+        return False
+
+def main():
+    """Main delta update function."""
+    logger.info("=== Starting Delta Metadata Update ===")
+    logger.info(f"Configuration:")
+    logger.info(f"  Qdrant URL: {QDRANT_URL}")
+    logger.info(f"  Logs directory: {LOGS_DIR}")
+    logger.info(f"  Days to update: {DAYS_TO_UPDATE}")
+    logger.info(f"  Embedding type: {'local' if PREFER_LOCAL_EMBEDDINGS else 'voyage'}")
+    logger.info(f"  Dry run: {DRY_RUN}")
+
+    # Load state
+    state = load_state()
+
+    # Get recent conversations
+    recent_files = get_recent_conversations(DAYS_TO_UPDATE)
+
+    if not recent_files:
+        logger.info("No recent conversations found to update")
+        return
+
+    # Limit for testing
+    if os.getenv("LIMIT"):
+        limit = int(os.getenv("LIMIT"))
+        recent_files = recent_files[:limit]
+        logger.info(f"Limited to {limit} files for testing")
+
+    # Process each conversation
+    success_count = 0
+    failure_count = 0
+
+    for i, jsonl_file in enumerate(recent_files, 1):
+        logger.info(f"Processing {i}/{len(recent_files)}: {jsonl_file.name}")
+
+        if process_conversation(jsonl_file, state):
+            success_count += 1
+        else:
+            failure_count += 1
+
+        # Save state periodically
+        if i % 10 == 0:
+            save_state(state)
+
+    # Final state save
+    state["last_update"] = datetime.now().isoformat()
+    save_state(state)
+
+    # Summary
+    logger.info("=== Delta Update Complete ===")
+    logger.info(f"Successfully updated: {success_count} conversations")
+    logger.info(f"Failed: {failure_count} conversations")
+    logger.info(f"Total conversations in state: {len(state['updated_conversations'])}")
+
+    if DRY_RUN:
+        logger.info("This was a DRY RUN - no actual updates were made")
+
+if __name__ == "__main__":
+    main()
package/scripts/import-conversations-unified.py

@@ -13,11 +13,23 @@ import ast
 import re
 import fcntl
 import time
+import argparse
 from pathlib import Path
 from datetime import datetime
 from typing import List, Dict, Any, Optional, Set
 import logging

+# Load .env file if it exists
+try:
+    from dotenv import load_dotenv
+    # Load from project root
+    env_path = Path(__file__).parent.parent / '.env'
+    if env_path.exists():
+        load_dotenv(env_path)
+        print(f"Loaded .env from {env_path}")
+except ImportError:
+    pass  # dotenv not available, use system environment
+
 # Add the scripts directory to the Python path for utils import
 scripts_dir = Path(__file__).parent
 sys.path.insert(0, str(scripts_dir))
@@ -25,12 +37,20 @@ sys.path.insert(0, str(scripts_dir))
 from qdrant_client import QdrantClient
 from qdrant_client.models import PointStruct, Distance, VectorParams

-# Import
+# Import normalize_project_name from shared module
+# Add parent directory to path to import shared module
+sys.path.insert(0, str(Path(__file__).parent.parent))
 try:
-    from
+    from shared.normalization import normalize_project_name
 except ImportError as e:
-    logging.error(f"Failed to import normalize_project_name from
-
+    logging.error(f"Failed to import normalize_project_name from shared module: {e}")
+    # Fall back to local utils if shared module not found
+    try:
+        from utils import normalize_project_name
+        logging.warning("Using legacy utils.normalize_project_name - consider updating")
+    except ImportError:
+        logging.error("Could not import normalize_project_name from any source")
+        sys.exit(1)

 # Set up logging
 logging.basicConfig(
@@ -125,7 +145,8 @@ def ensure_collection(collection_name: str):

 def generate_embeddings(texts: List[str]) -> List[List[float]]:
     """Generate embeddings for texts."""
-
+    # Use the global embedding_provider which gets updated by command-line args
+    if PREFER_LOCAL_EMBEDDINGS:
         embeddings = list(embedding_provider.passage_embed(texts))
         return [emb.tolist() if hasattr(emb, 'tolist') else emb for emb in embeddings]
     else:
@@ -665,6 +686,32 @@ def update_file_state(file_path: Path, state: dict, chunks: int):

 def main():
     """Main import function."""
+    # Parse command-line arguments
+    parser = argparse.ArgumentParser(description='Import conversations with unified embeddings support')
+    parser.add_argument('--prefer-voyage', action='store_true',
+                        help='Use Voyage AI embeddings instead of local FastEmbed')
+    parser.add_argument('--limit', type=int,
+                        help='Limit number of files to import')
+    parser.add_argument('--max-files-per-cycle', type=int,
+                        help='Maximum files to process per cycle')
+    args = parser.parse_args()
+
+    # Override environment variable if --prefer-voyage is specified
+    global PREFER_LOCAL_EMBEDDINGS, embedding_provider, embedding_dimension, collection_suffix
+    if args.prefer_voyage:
+        if not VOYAGE_API_KEY:
+            logger.error("--prefer-voyage specified but VOYAGE_KEY environment variable not set")
+            sys.exit(1)
+        logger.info("Command-line flag --prefer-voyage detected, switching to Voyage AI embeddings")
+        PREFER_LOCAL_EMBEDDINGS = False
+
+        # Re-initialize embedding provider with Voyage
+        import voyageai
+        embedding_provider = voyageai.Client(api_key=VOYAGE_API_KEY)
+        embedding_dimension = 1024
+        collection_suffix = "voyage"
+        logger.info("Switched to Voyage AI embeddings (dimension: 1024)")
+
     # Load state
     state = load_state()
     logger.info(f"Loaded state with {len(state.get('imported_files', {}))} previously imported files")
@@ -687,6 +734,7 @@ def main():
     logger.info(f"Found {len(project_dirs)} projects to import")

     total_imported = 0
+    files_processed = 0

     for project_dir in project_dirs:
         # Get collection name
@@ -699,13 +747,24 @@ def main():
         # Find JSONL files
         jsonl_files = sorted(project_dir.glob("*.jsonl"))

+        # Apply limit from command line if specified
+        if args.limit and files_processed >= args.limit:
+            logger.info(f"Reached limit of {args.limit} files, stopping import")
+            break
+
         # Limit files per cycle if specified
-        max_files = int(os.getenv("MAX_FILES_PER_CYCLE", "1000"))
+        max_files = args.max_files_per_cycle or int(os.getenv("MAX_FILES_PER_CYCLE", "1000"))
         jsonl_files = jsonl_files[:max_files]

         for jsonl_file in jsonl_files:
+            # Check limit again per file
+            if args.limit and files_processed >= args.limit:
+                logger.info(f"Reached limit of {args.limit} files, stopping import")
+                break
+
             if should_import_file(jsonl_file, state):
                 chunks = stream_import_file(jsonl_file, collection_name, project_dir)
+                files_processed += 1
                 if chunks > 0:
                     # Verify data is actually in Qdrant before marking as imported
                     from qdrant_client.models import Filter, FieldCondition, MatchValue