claude-self-reflect 3.2.4 → 3.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/.claude/agents/claude-self-reflect-test.md +992 -510
  2. package/.claude/agents/reflection-specialist.md +59 -3
  3. package/README.md +14 -5
  4. package/installer/cli.js +16 -0
  5. package/installer/postinstall.js +14 -0
  6. package/installer/statusline-setup.js +289 -0
  7. package/mcp-server/run-mcp.sh +73 -5
  8. package/mcp-server/src/app_context.py +64 -0
  9. package/mcp-server/src/config.py +57 -0
  10. package/mcp-server/src/connection_pool.py +286 -0
  11. package/mcp-server/src/decay_manager.py +106 -0
  12. package/mcp-server/src/embedding_manager.py +64 -40
  13. package/mcp-server/src/embeddings_old.py +141 -0
  14. package/mcp-server/src/models.py +64 -0
  15. package/mcp-server/src/parallel_search.py +305 -0
  16. package/mcp-server/src/project_resolver.py +5 -0
  17. package/mcp-server/src/reflection_tools.py +211 -0
  18. package/mcp-server/src/rich_formatting.py +196 -0
  19. package/mcp-server/src/search_tools.py +874 -0
  20. package/mcp-server/src/server.py +127 -1720
  21. package/mcp-server/src/temporal_design.py +132 -0
  22. package/mcp-server/src/temporal_tools.py +604 -0
  23. package/mcp-server/src/temporal_utils.py +384 -0
  24. package/mcp-server/src/utils.py +150 -67
  25. package/package.json +15 -1
  26. package/scripts/add-timestamp-indexes.py +134 -0
  27. package/scripts/ast_grep_final_analyzer.py +325 -0
  28. package/scripts/ast_grep_unified_registry.py +556 -0
  29. package/scripts/check-collections.py +29 -0
  30. package/scripts/csr-status +366 -0
  31. package/scripts/debug-august-parsing.py +76 -0
  32. package/scripts/debug-import-single.py +91 -0
  33. package/scripts/debug-project-resolver.py +82 -0
  34. package/scripts/debug-temporal-tools.py +135 -0
  35. package/scripts/delta-metadata-update.py +547 -0
  36. package/scripts/import-conversations-unified.py +157 -25
  37. package/scripts/precompact-hook.sh +33 -0
  38. package/scripts/session_quality_tracker.py +481 -0
  39. package/scripts/streaming-watcher.py +1578 -0
  40. package/scripts/update_patterns.py +334 -0
  41. package/scripts/utils.py +39 -0
package/scripts/debug-temporal-tools.py
@@ -0,0 +1,135 @@
+ #!/usr/bin/env python3
+ """
+ Debug script for testing temporal tools in Claude Self Reflect.
+ This script directly tests the temporal tools that should be available via MCP.
+ """
+
+ import os
+ import sys
+ import asyncio
+ import json
+ import traceback
+ from pathlib import Path
+
+ # Add the mcp-server source to Python path
+ sys.path.append(str(Path(__file__).parent.parent / "mcp-server" / "src"))
+
+ os.environ["QDRANT_URL"] = "http://localhost:6333"
+
+ async def test_temporal_tools():
+     """Test all temporal tools."""
+     print("=== TEMPORAL TOOLS DEBUG SCRIPT ===")
+
+     try:
+         # Import required modules
+         from server import (
+             get_recent_work, search_by_recency, get_timeline,
+             get_all_collections, QDRANT_URL
+         )
+         from fastmcp import Context
+
+         print(f"✅ Successfully imported temporal tools")
+         print(f"✅ Qdrant URL: {QDRANT_URL}")
+
+         # Check if Qdrant is available
+         collections = await get_all_collections()
+         print(f"✅ Found {len(collections)} collections: {collections[:5]}...")
+
+         # Create a mock context for testing
+         class MockContext(Context):
+             def __init__(self):
+                 pass
+             async def debug(self, message):
+                 print(f"DEBUG: {message}")
+             async def error(self, message):
+                 print(f"ERROR: {message}")
+
+         ctx = MockContext()
+
+         # Test 1: get_recent_work with default parameters
+         print("\n--- Test 1: get_recent_work (default) ---")
+         try:
+             result = await get_recent_work(ctx)
+             print(f"✅ get_recent_work succeeded")
+             print(f"Result length: {len(result) if result else 0} characters")
+             if result and len(result) < 500:
+                 print(f"Result: {result}")
+         except Exception as e:
+             print(f"❌ get_recent_work failed: {e}")
+             traceback.print_exc()
+
+         # Test 2: get_recent_work with project='all'
+         print("\n--- Test 2: get_recent_work (project=all) ---")
+         try:
+             result = await get_recent_work(ctx, project="all", limit=5)
+             print(f"✅ get_recent_work (project=all) succeeded")
+             print(f"Result length: {len(result) if result else 0} characters")
+         except Exception as e:
+             print(f"❌ get_recent_work (project=all) failed: {e}")
+             traceback.print_exc()
+
+         # Test 3: get_recent_work with different group_by options
+         for group_by in ["conversation", "day", "session"]:
+             print(f"\n--- Test 3.{group_by}: get_recent_work (group_by={group_by}) ---")
+             try:
+                 result = await get_recent_work(ctx, limit=3, group_by=group_by)
+                 print(f"✅ get_recent_work (group_by={group_by}) succeeded")
+                 print(f"Result length: {len(result) if result else 0} characters")
+             except Exception as e:
+                 print(f"❌ get_recent_work (group_by={group_by}) failed: {e}")
+                 traceback.print_exc()
+
+         # Test 4: search_by_recency with time_range
+         print("\n--- Test 4: search_by_recency (time_range) ---")
+         try:
+             result = await search_by_recency(
+                 ctx,
+                 query="testing debugging",
+                 time_range="last week",
+                 limit=5
+             )
+             print(f"✅ search_by_recency (time_range) succeeded")
+             print(f"Result length: {len(result) if result else 0} characters")
+         except Exception as e:
+             print(f"❌ search_by_recency (time_range) failed: {e}")
+             traceback.print_exc()
+
+         # Test 5: search_by_recency with since/until
+         print("\n--- Test 5: search_by_recency (since/until) ---")
+         try:
+             result = await search_by_recency(
+                 ctx,
+                 query="python script",
+                 since="yesterday",
+                 limit=3
+             )
+             print(f"✅ search_by_recency (since/until) succeeded")
+             print(f"Result length: {len(result) if result else 0} characters")
+         except Exception as e:
+             print(f"❌ search_by_recency (since/until) failed: {e}")
+             traceback.print_exc()
+
+         # Test 6: get_timeline with different granularities
+         for granularity in ["day", "week"]:
+             print(f"\n--- Test 6.{granularity}: get_timeline (granularity={granularity}) ---")
+             try:
+                 result = await get_timeline(
+                     ctx,
+                     time_range="last week",
+                     granularity=granularity,
+                     include_stats=True
+                 )
+                 print(f"✅ get_timeline (granularity={granularity}) succeeded")
+                 print(f"Result length: {len(result) if result else 0} characters")
+             except Exception as e:
+                 print(f"❌ get_timeline (granularity={granularity}) failed: {e}")
+                 traceback.print_exc()
+
+         print("\n=== TEMPORAL TOOLS TEST COMPLETE ===")
+
+     except Exception as e:
+         print(f"❌ Critical error during setup: {e}")
+         traceback.print_exc()
+
+ if __name__ == "__main__":
+     asyncio.run(test_temporal_tools())
package/scripts/delta-metadata-update.py
@@ -0,0 +1,547 @@
+ #!/usr/bin/env python3
+ """
+ Delta metadata update script for Claude Self-Reflect.
+ Updates existing Qdrant points with tool usage metadata without re-importing vectors.
+ This allows us to enhance past conversations with file tracking and concept extraction.
+ """
+
+ import os
+ import sys
+ import json
+ import hashlib
+ import re
+ import time
+ from datetime import datetime, timedelta
+ from typing import List, Dict, Any, Set, Tuple, Optional
+ import logging
+ from pathlib import Path
+
+ from qdrant_client import QdrantClient
+ from qdrant_client.models import Filter, FieldCondition, MatchValue
+
+ # Configuration
+ QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
+ LOGS_DIR = os.getenv("LOGS_DIR", os.path.expanduser("~/.claude/projects"))
+ STATE_FILE = os.getenv("STATE_FILE", "./config/delta-update-state.json")
+ PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "true").lower() == "true"
+ DRY_RUN = os.getenv("DRY_RUN", "false").lower() == "true"
+ DAYS_TO_UPDATE = int(os.getenv("DAYS_TO_UPDATE", "7"))
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ # Initialize Qdrant client
+ client = QdrantClient(url=QDRANT_URL)
+
+ def get_collection_suffix():
+     """Get the collection suffix based on embedding type."""
+     return "_local" if PREFER_LOCAL_EMBEDDINGS else "_voyage"
+
+ def normalize_project_name(project_name: str) -> str:
+     """Normalize project name by removing path-like prefixes."""
+     # Remove path-like prefixes (e.g., "-Users-username-projects-")
+     if project_name.startswith("-"):
+         # Split by '-' and reconstruct
+         parts = project_name.split("-")
+         # Find where the actual project name starts (usually after 'projects')
+         for i, part in enumerate(parts):
+             if part == "projects" and i < len(parts) - 1:
+                 return "-".join(parts[i+1:])
+     return project_name
+
+ def normalize_path(path: str) -> str:
+     """Normalize file paths for consistency across platforms."""
+     if not path:
+         return ""
+
+     # Remove common prefixes
+     path = path.replace("/Users/", "~/")
+     path = path.replace("\\Users\\", "~\\")
+
+     # Convert to forward slashes
+     path = path.replace("\\", "/")
+
+     # Remove duplicate slashes
+     path = re.sub(r'/+', '/', path)
+
+     return path
+
+ def extract_concepts(text: str, tool_usage: Dict[str, Any]) -> Set[str]:
+     """Extract high-level concepts from conversation and tool usage."""
+     concepts = set()
+
+     # Common development concepts with patterns
+     concept_patterns = {
+         'security': r'(security|vulnerability|CVE|injection|sanitize|escape|auth|token|JWT)',
+         'performance': r'(performance|optimization|speed|memory|efficient|benchmark|latency)',
+         'testing': r'(test|pytest|unittest|coverage|TDD|spec|assert)',
+         'docker': r'(docker|container|compose|dockerfile|kubernetes|k8s)',
+         'api': r'(API|REST|GraphQL|endpoint|webhook|http|request)',
+         'database': r'(database|SQL|query|migration|schema|postgres|mysql|mongodb|qdrant)',
+         'authentication': r'(auth|login|token|JWT|session|oauth|permission)',
+         'debugging': r'(debug|error|exception|traceback|log|stack|trace)',
+         'refactoring': r'(refactor|cleanup|improve|restructure|optimize|technical debt)',
+         'deployment': r'(deploy|CI/CD|release|production|staging|rollout)',
+         'git': r'(git|commit|branch|merge|pull request|PR|rebase)',
+         'architecture': r'(architecture|design|pattern|structure|component|module)',
+         'mcp': r'(MCP|claude-self-reflect|tool|agent|claude code)',
+         'embeddings': r'(embedding|vector|semantic|similarity|fastembed|voyage)',
+         'search': r'(search|query|find|filter|match|relevance)'
+     }
+
+     # Check text content
+     combined_text = text.lower()
+     for concept, pattern in concept_patterns.items():
+         if re.search(pattern, combined_text, re.IGNORECASE):
+             concepts.add(concept)
+
+     # Check tool usage patterns
+     if tool_usage.get('grep_searches'):
+         concepts.add('search')
+     if tool_usage.get('files_edited') or tool_usage.get('files_created'):
+         concepts.add('development')
+     if any('test' in str(f).lower() for f in tool_usage.get('files_read', [])):
+         concepts.add('testing')
+     if any('docker' in str(cmd).lower() for cmd in tool_usage.get('bash_commands', [])):
+         concepts.add('docker')
+
+     return concepts
+
+ def extract_tool_usage_from_jsonl(jsonl_path: str) -> Dict[str, Any]:
+     """Extract all tool usage from a conversation."""
+     tool_usage = {
+         "files_read": [],
+         "files_edited": [],
+         "files_created": [],
+         "grep_searches": [],
+         "bash_commands": [],
+         "glob_patterns": [],
+         "task_calls": [],
+         "web_searches": [],
+         "tools_summary": {}
+     }
+
+     try:
+         with open(jsonl_path, 'r', encoding='utf-8') as f:
+             for line in f:
+                 if not line.strip():
+                     continue
+
+                 try:
+                     data = json.loads(line)
+
+                     # Look for tool usage in message content
+                     if 'message' in data and data['message']:
+                         msg = data['message']
+                         if msg.get('role') == 'assistant' and msg.get('content'):
+                             content = msg['content']
+
+                             # Handle content as list of objects
+                             if isinstance(content, list):
+                                 for item in content:
+                                     if isinstance(item, dict) and item.get('type') == 'tool_use':
+                                         extract_tool_data(item, tool_usage)
+                             # Handle content as string (legacy format)
+                             elif isinstance(content, str):
+                                 # Try to extract tool usage from text patterns
+                                 extract_tools_from_text(content, tool_usage)
+
+                 except json.JSONDecodeError:
+                     continue
+                 except Exception as e:
+                     logger.debug(f"Error processing line: {e}")
+
+     except Exception as e:
+         logger.error(f"Error reading JSONL file {jsonl_path}: {e}")
+
+     # Calculate tools summary
+     all_tools = []
+     if tool_usage['files_read']:
+         all_tools.extend(['Read'] * len(tool_usage['files_read']))
+     if tool_usage['files_edited']:
+         all_tools.extend(['Edit'] * len(tool_usage['files_edited']))
+     if tool_usage['files_created']:
+         all_tools.extend(['Write'] * len(tool_usage['files_created']))
+     if tool_usage['grep_searches']:
+         all_tools.extend(['Grep'] * len(tool_usage['grep_searches']))
+     if tool_usage['bash_commands']:
+         all_tools.extend(['Bash'] * len(tool_usage['bash_commands']))
+     if tool_usage['glob_patterns']:
+         all_tools.extend(['Glob'] * len(tool_usage['glob_patterns']))
+     if tool_usage['task_calls']:
+         all_tools.extend(['Task'] * len(tool_usage['task_calls']))
+     if tool_usage['web_searches']:
+         all_tools.extend(['WebSearch'] * len(tool_usage['web_searches']))
+
+     # Count tool usage
+     for tool in all_tools:
+         tool_usage['tools_summary'][tool] = tool_usage['tools_summary'].get(tool, 0) + 1
+
+     return tool_usage
+
+ def extract_tool_data(tool_use: Dict[str, Any], usage_dict: Dict[str, Any]):
+     """Extract tool usage data from a tool_use object."""
+     tool_name = tool_use.get('name', '')
+     inputs = tool_use.get('input', {})
+
+     # Handle Read tool
+     if tool_name == 'Read':
+         file_path = inputs.get('file_path')
+         if file_path:
+             usage_dict['files_read'].append({
+                 'path': normalize_path(file_path),
+                 'operation': 'read'
+             })
+
+     # Handle Edit and MultiEdit tools
+     elif tool_name in ['Edit', 'MultiEdit']:
+         path = inputs.get('file_path')
+         if path:
+             usage_dict['files_edited'].append({
+                 'path': normalize_path(path),
+                 'operation': tool_name.lower()
+             })
+
+     # Handle Write tool
+     elif tool_name == 'Write':
+         path = inputs.get('file_path')
+         if path:
+             usage_dict['files_created'].append({
+                 'path': normalize_path(path),
+                 'operation': 'write'
+             })
+
+     # Handle Grep tool
+     elif tool_name == 'Grep':
+         pattern = inputs.get('pattern')
+         path = inputs.get('path', '.')
+         if pattern:
+             usage_dict['grep_searches'].append({
+                 'pattern': pattern,
+                 'path': normalize_path(path)
+             })
+
+     # Handle Bash tool
+     elif tool_name == 'Bash':
+         command = inputs.get('command')
+         if command:
+             usage_dict['bash_commands'].append({
+                 'command': command[:200]  # Limit command length
+             })
+
+     # Handle Glob tool
+     elif tool_name == 'Glob':
+         pattern = inputs.get('pattern')
+         if pattern:
+             usage_dict['glob_patterns'].append({
+                 'pattern': pattern
+             })
+
+     # Handle Task tool
+     elif tool_name == 'Task':
+         agent = inputs.get('subagent_type', 'unknown')
+         usage_dict['task_calls'].append({
+             'agent': agent
+         })
+
+     # Handle WebSearch tool
+     elif tool_name == 'WebSearch':
+         query = inputs.get('query')
+         if query:
+             usage_dict['web_searches'].append({
+                 'query': query[:100]
+             })
+
+ def extract_tools_from_text(content: str, usage_dict: Dict[str, Any]):
+     """Extract tool usage from text content (fallback for legacy format)."""
+     # Look for file paths that might have been read/edited
+     file_pattern = r'(?:Reading|Editing|Writing|Checking)\s+(?:file\s+)?([/~][\w\-./]+\.\w+)'
+     for match in re.finditer(file_pattern, content):
+         file_path = match.group(1)
+         if 'Edit' in match.group(0):
+             usage_dict['files_edited'].append({
+                 'path': normalize_path(file_path),
+                 'operation': 'edit'
+             })
+         else:
+             usage_dict['files_read'].append({
+                 'path': normalize_path(file_path),
+                 'operation': 'read'
+             })
+
+ def load_state():
+     """Load the delta update state."""
+     state_path = Path(STATE_FILE)
+     if state_path.exists():
+         try:
+             with open(state_path, 'r') as f:
+                 return json.load(f)
+         except Exception as e:
+             logger.warning(f"Failed to load state: {e}")
+
+     return {
+         "last_update": None,
+         "updated_conversations": {}
+     }
+
+ def save_state(state: Dict[str, Any]):
+     """Save the delta update state."""
+     state_path = Path(STATE_FILE)
+     state_path.parent.mkdir(parents=True, exist_ok=True)
+
+     try:
+         with open(state_path, 'w') as f:
+             json.dump(state, f, indent=2)
+     except Exception as e:
+         logger.error(f"Failed to save state: {e}")
+
+ def get_recent_conversations(days: int = 7) -> List[Path]:
+     """Get conversation files from the past N days."""
+     recent_files = []
+     cutoff_time = datetime.now() - timedelta(days=days)
+
+     logs_path = Path(LOGS_DIR)
+     if not logs_path.exists():
+         logger.error(f"Logs directory not found: {LOGS_DIR}")
+         return recent_files
+
+     # Find all JSONL files
+     for jsonl_file in logs_path.glob("**/*.jsonl"):
+         try:
+             # Check file modification time
+             mtime = datetime.fromtimestamp(jsonl_file.stat().st_mtime)
+             if mtime >= cutoff_time:
+                 recent_files.append(jsonl_file)
+         except Exception as e:
+             logger.debug(f"Error checking file {jsonl_file}: {e}")
+
+     logger.info(f"Found {len(recent_files)} conversations from the past {days} days")
+     return recent_files
+
+ def update_point_metadata(conversation_id: str, chunk_index: int, metadata: Dict[str, Any],
+                           collection_name: str) -> bool:
+     """Update metadata for a specific point in Qdrant."""
+     try:
+         # Calculate point ID (same as original import)
+         point_id_str = hashlib.md5(
+             f"{conversation_id}_{chunk_index}".encode()
+         ).hexdigest()[:16]
+         point_id = int(point_id_str, 16) % (2**63)
+
+         if DRY_RUN:
+             logger.info(f"[DRY RUN] Would update point {point_id} with metadata")
+             return True
+
+         # First, try to get the existing point to preserve other fields
+         try:
+             existing_points = client.retrieve(
+                 collection_name=collection_name,
+                 ids=[point_id],
+                 with_payload=True,
+                 with_vectors=False
+             )
+
+             if existing_points:
+                 # Merge with existing payload
+                 existing_payload = existing_points[0].payload
+                 existing_payload.update(metadata)
+                 metadata = existing_payload
+         except Exception as e:
+             logger.debug(f"Could not retrieve existing point {point_id}: {e}")
+
+         # Use set_payload to update just the metadata without touching the vector
+         client.set_payload(
+             collection_name=collection_name,
+             payload=metadata,
+             points=[point_id],
+             wait=False  # Don't wait for each point
+         )
+
+         return True
+
+     except Exception as e:
+         import traceback
+         logger.error(f"Failed to update point {conversation_id}_{chunk_index}: {e}")
+         logger.debug(traceback.format_exc())
+         return False
+
+ def process_conversation(jsonl_file: Path, state: Dict[str, Any]) -> bool:
+     """Process a single conversation file and update its metadata."""
+     try:
+         conversation_id = jsonl_file.stem
+         project_name = jsonl_file.parent.name
+
+         # Check if already updated
+         if conversation_id in state.get("updated_conversations", {}):
+             last_updated = state["updated_conversations"][conversation_id].get("updated_at")
+             file_mtime = jsonl_file.stat().st_mtime
+             if last_updated and last_updated >= file_mtime:
+                 logger.debug(f"Skipping {conversation_id} - already updated")
+                 return True
+
+         logger.info(f"Processing: {conversation_id}")
+
+         # Extract tool usage metadata
+         tool_usage = extract_tool_usage_from_jsonl(str(jsonl_file))
+
+         # Read the full conversation to get text for concept extraction
+         conversation_text = ""
+         with open(jsonl_file, 'r', encoding='utf-8') as f:
+             for line in f:
+                 if line.strip():
+                     try:
+                         data = json.loads(line)
+                         if 'message' in data and data['message']:
+                             msg = data['message']
+                             if msg.get('content'):
+                                 if isinstance(msg['content'], str):
+                                     conversation_text += msg['content'] + "\n"
+                                 elif isinstance(msg['content'], list):
+                                     for item in msg['content']:
+                                         if isinstance(item, dict) and item.get('text'):
+                                             conversation_text += item['text'] + "\n"
+                     except:
+                         continue
+
+         # Extract concepts
+         concepts = extract_concepts(conversation_text[:10000], tool_usage)  # Limit text for concept extraction
+
+         # Prepare metadata update
+         files_analyzed = list(set([
+             item['path'] if isinstance(item, dict) else item
+             for item in tool_usage.get('files_read', [])
+             if (isinstance(item, dict) and item.get('path')) or isinstance(item, str)
+         ]))[:20]  # Limit to 20 files
+
+         files_edited = list(set([
+             item['path'] if isinstance(item, dict) else item
+             for item in tool_usage.get('files_edited', [])
+             if (isinstance(item, dict) and item.get('path')) or isinstance(item, str)
+         ]))[:10]  # Limit to 10 files
+
+         metadata_update = {
+             "files_analyzed": files_analyzed,
+             "files_edited": files_edited,
+             "tools_used": list(tool_usage.get('tools_summary', {}).keys())[:20],
+             "tool_summary": dict(list(tool_usage.get('tools_summary', {}).items())[:10]),
+             "concepts": list(concepts)[:15],
+             "search_patterns": [s.get('pattern', '') for s in tool_usage.get('grep_searches', [])][:10],
+             "analysis_only": len(files_edited) == 0 and len(tool_usage.get('files_created', [])) == 0,
+             "has_file_metadata": True,  # Flag to indicate this has been enhanced
+             "metadata_updated_at": datetime.now().isoformat()
+         }
+
+         # Determine collection name
+         project_hash = hashlib.md5(normalize_project_name(project_name).encode()).hexdigest()[:8]
+         collection_name = f"conv_{project_hash}{get_collection_suffix()}"
+
+         # Check if collection exists
+         try:
+             collections = client.get_collections().collections
+             if collection_name not in [c.name for c in collections]:
+                 logger.warning(f"Collection {collection_name} not found for project {project_name}")
+                 return False
+         except Exception as e:
+             logger.error(f"Error checking collection: {e}")
+             return False
+
+         # Get the number of chunks for this conversation
+         # We need to know how many chunks were created during original import
+         # For now, we'll try to update up to 50 chunks (most conversations have fewer)
+         max_chunks = 50
+         updated_count = 0
+         failed_count = 0
+
+         for chunk_index in range(max_chunks):
+             success = update_point_metadata(
+                 conversation_id,
+                 chunk_index,
+                 metadata_update,
+                 collection_name
+             )
+
+             if success:
+                 updated_count += 1
+             else:
+                 failed_count += 1
+                 # If we get too many failures in a row, the conversation probably has fewer chunks
+                 if failed_count > 5:
+                     break
+
+         if updated_count > 0:
+             logger.info(f"Updated {updated_count} chunks for {conversation_id}")
+
+             # Update state
+             state["updated_conversations"][conversation_id] = {
+                 "updated_at": time.time(),
+                 "chunks_updated": updated_count,
+                 "project": project_name
+             }
+
+             return True
+         else:
+             logger.warning(f"No chunks updated for {conversation_id}")
+             return False
+
+     except Exception as e:
+         logger.error(f"Failed to process {jsonl_file}: {e}")
+         return False
+
+ def main():
+     """Main delta update function."""
+     logger.info("=== Starting Delta Metadata Update ===")
+     logger.info(f"Configuration:")
+     logger.info(f"  Qdrant URL: {QDRANT_URL}")
+     logger.info(f"  Logs directory: {LOGS_DIR}")
+     logger.info(f"  Days to update: {DAYS_TO_UPDATE}")
+     logger.info(f"  Embedding type: {'local' if PREFER_LOCAL_EMBEDDINGS else 'voyage'}")
+     logger.info(f"  Dry run: {DRY_RUN}")
+
+     # Load state
+     state = load_state()
+
+     # Get recent conversations
+     recent_files = get_recent_conversations(DAYS_TO_UPDATE)
+
+     if not recent_files:
+         logger.info("No recent conversations found to update")
+         return
+
+     # Limit for testing
+     if os.getenv("LIMIT"):
+         limit = int(os.getenv("LIMIT"))
+         recent_files = recent_files[:limit]
+         logger.info(f"Limited to {limit} files for testing")
+
+     # Process each conversation
+     success_count = 0
+     failure_count = 0
+
+     for i, jsonl_file in enumerate(recent_files, 1):
+         logger.info(f"Processing {i}/{len(recent_files)}: {jsonl_file.name}")
+
+         if process_conversation(jsonl_file, state):
+             success_count += 1
+         else:
+             failure_count += 1
+
+         # Save state periodically
+         if i % 10 == 0:
+             save_state(state)
+
+     # Final state save
+     state["last_update"] = datetime.now().isoformat()
+     save_state(state)
+
+     # Summary
+     logger.info("=== Delta Update Complete ===")
+     logger.info(f"Successfully updated: {success_count} conversations")
+     logger.info(f"Failed: {failure_count} conversations")
+     logger.info(f"Total conversations in state: {len(state['updated_conversations'])}")
+
+     if DRY_RUN:
+         logger.info("This was a DRY RUN - no actual updates were made")
+
+ if __name__ == "__main__":
+     main()
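
Note: update_point_metadata above addresses chunks by the deterministic point IDs the original import produced (per its own comment): md5 of "<conversation_id>_<chunk_index>", first 16 hex characters, reduced modulo 2**63. That is what lets the delta script patch payloads with set_payload without re-importing vectors. A minimal standalone sketch of that derivation (the conversation ID below is a hypothetical example, not one from the package):

    import hashlib

    def chunk_point_id(conversation_id: str, chunk_index: int) -> int:
        # Mirrors the ID scheme used in update_point_metadata()
        digest = hashlib.md5(f"{conversation_id}_{chunk_index}".encode()).hexdigest()[:16]
        return int(digest, 16) % (2**63)

    print(chunk_point_id("example-conversation", 0))  # hypothetical conversation ID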