claude-self-reflect 2.5.18 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,442 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Safe delta metadata update script for Claude Self-Reflect.
4
+ Updates existing Qdrant points with tool usage metadata without overwhelming the system.
5
+ Includes rate limiting, batch processing, and proper error recovery.
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import json
11
+ import hashlib
12
+ import re
13
+ import time
14
+ import asyncio
15
+ from datetime import datetime, timedelta
16
+ from typing import List, Dict, Any, Set, Tuple, Optional
17
+ import logging
18
+ from pathlib import Path
19
+ from collections import defaultdict
20
+
21
+ from qdrant_client import QdrantClient
22
+ from qdrant_client.models import Filter, FieldCondition, MatchValue
23
+
24
# Configuration -- every setting below can be overridden through the environment.
QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")
LOGS_DIR = os.environ.get("LOGS_DIR", os.path.expanduser("~/.claude/projects"))
STATE_FILE = os.environ.get("STATE_FILE", "./config/delta-update-state.json")

# Boolean flags: only the (case-insensitive) string "true" switches them on.
PREFER_LOCAL_EMBEDDINGS = os.environ.get("PREFER_LOCAL_EMBEDDINGS", "true").lower() == "true"
DRY_RUN = os.environ.get("DRY_RUN", "false").lower() == "true"

# Numeric tuning knobs.
DAYS_TO_UPDATE = int(os.environ.get("DAYS_TO_UPDATE", "7"))
BATCH_SIZE = int(os.environ.get("BATCH_SIZE", "10"))  # conversations handled per batch
RATE_LIMIT_DELAY = float(os.environ.get("RATE_LIMIT_DELAY", "0.1"))  # seconds slept after each update
MAX_CONCURRENT_UPDATES = int(os.environ.get("MAX_CONCURRENT_UPDATES", "5"))  # intended cap on parallel updates

# Logging: timestamped INFO-level output for the whole script.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# Shared Qdrant client used by every function in this module (30 second timeout).
client = QdrantClient(url=QDRANT_URL, timeout=30)
41
+
42
def get_collection_suffix():
    """Return the collection-name suffix implied by the embedding preference.

    Yields "_local" when PREFER_LOCAL_EMBEDDINGS is set and "_voyage"
    otherwise. Intended for collections that do not exist yet.
    """
    if PREFER_LOCAL_EMBEDDINGS:
        return "_local"
    return "_voyage"
45
+
46
def get_existing_collection_suffix(project_hash: str, max_retries: int = 3) -> str:
    """Detect which collection type actually exists for this project.

    Lists the collections known to Qdrant and reports which variant of
    ``conv_<project_hash>`` is present. The Voyage collection wins when both
    exist. When neither exists, or when Qdrant cannot be reached after all
    attempts, the preference-based suffix from ``get_collection_suffix()``
    is returned instead.

    Args:
        project_hash: Hash fragment used in the collection name.
        max_retries: Maximum number of attempts at listing collections.
            Values below 1 are treated as 1 so that the function always
            returns a string.

    Returns:
        "_voyage" if the voyage collection exists, "_local" if the local one
        exists, otherwise the preference-based suffix.
    """
    voyage_name = f"conv_{project_hash}_voyage"
    local_name = f"conv_{project_hash}_local"

    # Guarantee at least one attempt; with zero iterations the function would
    # otherwise fall off the end and return None.
    attempts = max(1, max_retries)

    for attempt in range(attempts):
        try:
            collection_names = {c.name for c in client.get_collections().collections}
        except Exception as e:
            if attempt < attempts - 1:
                # Linear backoff: 0.5s, 1.0s, 1.5s, ...
                wait_time = 0.5 * (attempt + 1)
                logger.debug(f"Error checking collections (attempt {attempt + 1}/{attempts}): {e}, retrying in {wait_time}s")
                time.sleep(wait_time)
                continue
            logger.warning(f"Error checking collections after {attempts} attempts: {e}, falling back to preference")
            return get_collection_suffix()

        # Return the actual collection type that exists.
        if voyage_name in collection_names:
            logger.debug(f"Found existing Voyage collection: {voyage_name}")
            return "_voyage"
        if local_name in collection_names:
            logger.debug(f"Found existing Local collection: {local_name}")
            return "_local"

        # No existing collection - use preference for new ones.
        suffix = get_collection_suffix()
        logger.debug(f"No existing collection for hash {project_hash}, using preference: {suffix}")
        return suffix

    # Unreachable (every iteration returns or continues to a later one), kept
    # so the declared return type holds even if the loop is edited later.
    return get_collection_suffix()
90
+
91
def normalize_project_name(project_name: str) -> str:
    """Strip a leading path-like prefix from a dash-encoded project name.

    Names that start with "-" are split on "-"; everything after the first
    "projects" segment is returned, re-joined with "-". The name is returned
    unchanged when it does not start with "-", contains no "projects"
    segment, or has "projects" as its final segment.
    """
    if not project_name.startswith("-"):
        return project_name

    segments = project_name.split("-")
    try:
        marker = segments.index("projects")
    except ValueError:
        return project_name

    remainder = segments[marker + 1:]
    if not remainder:
        # "projects" was the last segment: nothing to keep, so leave as is.
        return project_name
    return "-".join(remainder)
99
+
100
def normalize_path(path: str) -> str:
    """Rewrite a file path into a platform-neutral form.

    Every "/Users/" (or "\\Users\\") occurrence is replaced by "~/" (or
    "~\\"), backslashes become forward slashes, and runs of slashes are
    collapsed to one. A falsy path yields the empty string.
    """
    if not path:
        return ""

    home_collapsed = path.replace("/Users/", "~/")
    home_collapsed = home_collapsed.replace("\\Users\\", "~\\")
    forward_slashed = home_collapsed.replace("\\", "/")
    return re.sub(r'/+', '/', forward_slashed)
108
+
109
def extract_concepts(text: str, tool_usage: Dict[str, Any]) -> Set[str]:
    """Derive high-level concept tags from conversation text and tool usage.

    A concept is tagged when its keyword pattern matches anywhere in the
    text (case-insensitive substring match). Further tags come from the
    tool-usage record: grep searches, edited or created files, files read
    whose path mentions "test", and bash commands that mention "docker".
    """
    concept_patterns = {
        'security': r'(security|vulnerability|CVE|injection|sanitize|escape|auth|token|JWT)',
        'performance': r'(performance|optimization|speed|memory|efficient|benchmark|latency)',
        'testing': r'(test|pytest|unittest|coverage|TDD|spec|assert)',
        'docker': r'(docker|container|compose|dockerfile|kubernetes|k8s)',
        'api': r'(API|REST|GraphQL|endpoint|webhook|http|request)',
        'database': r'(database|SQL|query|migration|schema|postgres|mysql|mongodb|qdrant)',
        'authentication': r'(auth|login|token|JWT|session|oauth|permission)',
        'debugging': r'(debug|error|exception|traceback|log|stack|trace)',
        'refactoring': r'(refactor|cleanup|improve|restructure|optimize|technical debt)',
        'deployment': r'(deploy|CI/CD|release|production|staging|rollout)',
        'git': r'(git|commit|branch|merge|pull request|PR|rebase)',
        'architecture': r'(architecture|design|pattern|structure|component|module)',
        'mcp': r'(MCP|claude-self-reflect|tool|agent|claude code)',
        'embeddings': r'(embedding|vector|semantic|similarity|fastembed|voyage)',
        'search': r'(search|query|find|filter|match|relevance)'
    }

    # Text-derived tags.
    lowered = text.lower()
    found = {
        name
        for name, pattern in concept_patterns.items()
        if re.search(pattern, lowered, re.IGNORECASE)
    }

    # Tool-derived tags.
    touched_files = tool_usage.get('files_edited') or tool_usage.get('files_created')
    read_mentions_test = any(
        'test' in str(entry).lower() for entry in tool_usage.get('files_read', [])
    )
    ran_docker = any(
        'docker' in str(command).lower() for command in tool_usage.get('bash_commands', [])
    )

    if tool_usage.get('grep_searches'):
        found.add('search')
    if touched_files:
        found.add('development')
    if read_mentions_test:
        found.add('testing')
    if ran_docker:
        found.add('docker')

    return found
147
+
148
def _record_tool_use(tool_usage: Dict[str, Any], item: Dict[str, Any]) -> None:
    """Fold one ``tool_use`` content item into the running tool-usage record."""
    tool_name = item.get('name', '')
    # A null "input" is treated as "no arguments" instead of raising.
    inputs = item.get('input') or {}

    # Track how often each tool was invoked.
    summary = tool_usage['tools_summary']
    summary[tool_name] = summary.get(tool_name, 0) + 1

    if tool_name == 'Read':
        file_path = inputs.get('file_path')
        if file_path:
            tool_usage['files_read'].append(normalize_path(file_path))
    elif tool_name in ('Edit', 'Write', 'MultiEdit'):
        file_path = inputs.get('file_path')
        if file_path:
            tool_usage['files_edited'].append(normalize_path(file_path))
    elif tool_name == 'Grep':
        pattern = inputs.get('pattern')
        if isinstance(pattern, str) and pattern:
            tool_usage['grep_searches'].append({'pattern': pattern[:100]})
    elif tool_name == 'Bash':
        command = inputs.get('command')
        if isinstance(command, str) and command:
            tool_usage['bash_commands'].append(command[:200])


def extract_tool_usage_from_jsonl(jsonl_path: str) -> Dict[str, Any]:
    """Extract all tool usage from a conversation log.

    Reads the JSONL file line by line and inspects every ``tool_use`` item in
    assistant messages whose content is a list.

    Args:
        jsonl_path: Path of the JSONL conversation file.

    Returns:
        A dict with the keys ``files_read`` (at most 20 unique paths),
        ``files_edited`` (at most 10 unique paths, from Edit/Write/MultiEdit),
        ``files_created`` (never populated by this function), ``grep_searches``
        (patterns cut to 100 characters), ``bash_commands`` (commands cut to
        200 characters) and ``tools_summary`` (invocation count per tool).
        Unparseable lines are skipped; if the file cannot be read, the error
        is logged and whatever was collected so far is returned.
    """
    tool_usage = {
        "files_read": [],
        "files_edited": [],
        "files_created": [],
        "grep_searches": [],
        "bash_commands": [],
        "tools_summary": {}
    }

    try:
        with open(jsonl_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue
                try:
                    data = json.loads(line)
                    if not ('message' in data and data['message']):
                        continue
                    msg = data['message']
                    if msg.get('role') != 'assistant':
                        continue
                    content = msg.get('content')
                    if not isinstance(content, list):
                        continue
                    for item in content:
                        if isinstance(item, dict) and item.get('type') == 'tool_use':
                            _record_tool_use(tool_usage, item)
                except Exception as e:
                    # Best effort: a malformed line must not abort the scan.
                    logger.debug(f"Parse error in {jsonl_path}: {e}")
                    continue
    except Exception as e:
        logger.error(f"Error reading JSONL file {jsonl_path}: {e}")

    # De-duplicate while keeping first-seen order. A plain set() would make
    # both the order and the subset that survives truncation depend on string
    # hash randomisation, i.e. differ from run to run.
    tool_usage['files_read'] = list(dict.fromkeys(tool_usage['files_read']))[:20]
    tool_usage['files_edited'] = list(dict.fromkeys(tool_usage['files_edited']))[:10]

    return tool_usage
206
+
207
def load_state() -> Dict[str, Any]:
    """Load the persisted update state from STATE_FILE.

    Returns:
        A dict that always contains ``last_update``, ``updated_conversations``
        (dict) and ``failed_conversations`` (dict). Values found in the state
        file are layered over these defaults. A missing, unreadable or
        malformed file yields the defaults alone.
    """
    state: Dict[str, Any] = {
        "last_update": None,
        "updated_conversations": {},
        "failed_conversations": {}
    }

    state_path = Path(STATE_FILE)
    if not state_path.exists():
        return state

    try:
        with open(state_path, 'r', encoding='utf-8') as f:
            loaded = json.load(f)
    except Exception as e:
        logger.warning(f"Could not load state: {e}")
        return state

    if not isinstance(loaded, dict):
        logger.warning(f"Could not load state: expected a JSON object, got {type(loaded).__name__}")
        return state

    state.update(loaded)
    # Callers index straight into these two sections, so make sure they are
    # dicts even when the file held null or some other type for them.
    for section in ("updated_conversations", "failed_conversations"):
        if not isinstance(state.get(section), dict):
            state[section] = {}
    return state
221
+
222
def save_state(state: Dict[str, Any]):
    """Persist the state dict to STATE_FILE.

    Stamps ``state["last_update"]`` with the current local time, then writes
    the JSON to a sibling temporary file and swaps it into place, so an
    interrupted run cannot leave a truncated state file behind. Failures are
    logged, never raised.

    Args:
        state: The state dict to write; mutated to carry the new timestamp.
    """
    state_path = Path(STATE_FILE)
    tmp_path = state_path.with_name(state_path.name + ".tmp")

    state["last_update"] = datetime.now().isoformat()

    try:
        state_path.parent.mkdir(parents=True, exist_ok=True)
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(state, f, indent=2)
        # os.replace overwrites the destination in a single rename step.
        os.replace(tmp_path, state_path)
    except Exception as e:
        logger.error(f"Could not save state: {e}")
        # Do not leave a half-written temporary file lying around.
        try:
            tmp_path.unlink()
        except OSError:
            pass
234
+
235
async def update_point_metadata_batch(updates: List[Tuple[str, int, Dict, str]]) -> int:
    """Apply payload updates one point at a time, pausing between requests.

    Each entry is ``(conversation_id, chunk_index, metadata, collection_name)``.
    The point id is derived from the MD5 of "<conversation_id>_<chunk_index>".
    In DRY_RUN mode nothing is sent and the intended update is only logged.

    Returns:
        The number of entries handled without raising. Requests are sent with
        ``wait=False``, so this counts submitted updates.
    """
    handled = 0

    for conversation_id, chunk_index, metadata, collection_name in updates:
        try:
            # First 16 hex digits of the digest, folded into the signed 64-bit range.
            digest = hashlib.md5(f"{conversation_id}_{chunk_index}".encode()).hexdigest()
            point_id = int(digest[:16], 16) % (2**63)

            if DRY_RUN:
                logger.info(f"[DRY RUN] Would update point {point_id}")
                handled += 1
                continue

            client.set_payload(
                collection_name=collection_name,
                payload=metadata,
                points=[point_id],
                wait=False
            )
            handled += 1

            # Pause so Qdrant is not flooded with requests.
            await asyncio.sleep(RATE_LIMIT_DELAY)
        except Exception as e:
            logger.debug(f"Failed to update point {conversation_id}_{chunk_index}: {e}")

    return handled
267
+
268
def _state_section(state: Dict[str, Any], key: str) -> Dict[str, Any]:
    """Return ``state[key]`` as a dict, creating or repairing it when needed."""
    section = state.get(key)
    if not isinstance(section, dict):
        section = {}
        state[key] = section
    return section


def _read_conversation_text(jsonl_file: Path) -> str:
    """Collect a bounded text sample from a conversation log.

    Only the first 101 lines are inspected, and only messages whose content
    is a plain string contribute (at most 500 characters each, one per line).
    """
    snippets = []
    with open(jsonl_file, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i > 100:  # Limit lines to avoid memory issues
                break
            if not line.strip():
                continue
            try:
                data = json.loads(line)
                if 'message' in data and data['message']:
                    content = data['message'].get('content')
                    if content and isinstance(content, str):
                        snippets.append(content[:500] + "\n")
            except Exception as e:
                logger.debug(f"Parse error in {jsonl_file}: {e}")
                continue
    return "".join(snippets)


async def process_conversation_async(jsonl_file: Path, state: Dict[str, Any], max_chunks: int = 20) -> bool:
    """Process a single conversation file asynchronously.

    Extracts tool usage and concept tags from the log, then pushes them as
    payload metadata onto the conversation's points in its project
    collection.

    Args:
        jsonl_file: Conversation log; the stem is the conversation id and the
            parent directory name is the project name.
        state: Shared state dict. ``updated_conversations`` is written on
            success; ``failed_conversations`` is written when the collection
            check raises and cleared for this conversation on success.
        max_chunks: Number of chunk indexes (0..max_chunks-1) to address per
            conversation. Defaults to 20.

    Returns:
        True when the conversation was updated or was already up to date,
        False when it was skipped or failed.
    """
    try:
        conversation_id = jsonl_file.stem
        project_name = jsonl_file.parent.name

        # Tolerate state files that lack (or hold a non-dict for) a section.
        updated_conversations = _state_section(state, "updated_conversations")
        failed_conversations = _state_section(state, "failed_conversations")

        # Check if already updated
        previous = updated_conversations.get(conversation_id)
        if previous is not None:
            last_updated = previous.get("updated_at")
            file_mtime = jsonl_file.stat().st_mtime
            if last_updated and last_updated >= file_mtime:
                logger.debug(f"Skipping {conversation_id} - already updated")
                return True

        # Check if previously failed too many times
        failed_info = failed_conversations.get(conversation_id, {})
        if failed_info.get("retry_count", 0) > 3:
            logger.debug(f"Skipping {conversation_id} - too many failures")
            return False

        logger.info(f"Processing: {conversation_id}")

        # Extract metadata
        tool_usage = extract_tool_usage_from_jsonl(str(jsonl_file))
        conversation_text = _read_conversation_text(jsonl_file)
        concepts = extract_concepts(conversation_text[:10000], tool_usage)

        # Prepare metadata
        metadata_update = {
            "files_analyzed": tool_usage.get('files_read', [])[:20],
            "files_edited": tool_usage.get('files_edited', [])[:10],
            "tools_used": list(tool_usage.get('tools_summary', {}).keys())[:20],
            "concepts": list(concepts)[:15],
            "has_file_metadata": True,
            "metadata_updated_at": datetime.now().isoformat()
        }

        # Determine collection; the suffix reflects the collection that
        # actually exists, falling back to the configured preference.
        project_hash = hashlib.md5(normalize_project_name(project_name).encode()).hexdigest()[:8]
        collection_suffix = get_existing_collection_suffix(project_hash)
        collection_name = f"conv_{project_hash}{collection_suffix}"

        # Check if collection exists
        try:
            collections = client.get_collections().collections
            if collection_name not in {c.name for c in collections}:
                logger.warning(f"Collection {collection_name} not found")
                return False
        except Exception as e:
            logger.error(f"Error checking collection: {e}")
            # Record failure so repeated failures are eventually skipped.
            failed_conversations[conversation_id] = {
                "error": str(e),
                "retry_count": failed_info.get("retry_count", 0) + 1,
                "last_attempt": time.time()
            }
            return False

        # One update per candidate chunk index. The real chunk count is not
        # known here, so a fixed range is addressed.
        updates = [
            (conversation_id, chunk_index, metadata_update, collection_name)
            for chunk_index in range(max_chunks)
        ]

        # Process in batch with rate limiting
        success_count = await update_point_metadata_batch(updates)

        if success_count > 0:
            logger.info(f"Updated {success_count} chunks for {conversation_id}")
            # NOTE(review): updates are sent with wait=False, so this count is
            # presumably "requests submitted", not "points confirmed" - verify.
            updated_conversations[conversation_id] = {
                "updated_at": time.time(),
                "chunks_updated": success_count,
                "project": project_name
            }
            # A success supersedes any earlier failure record.
            failed_conversations.pop(conversation_id, None)
            return True

        logger.warning(f"No chunks updated for {conversation_id}")
        return False

    except Exception as e:
        logger.error(f"Failed to process {jsonl_file}: {e}")
        return False
369
+
370
async def main_async():
    """Main async function with proper batching and rate limiting.

    Finds conversation logs under LOGS_DIR modified within the last
    DAYS_TO_UPDATE days, processes them in batches of BATCH_SIZE with at most
    MAX_CONCURRENT_UPDATES conversations in flight at once, and saves the
    state file after every batch.
    """
    logger.info("=== Starting Safe Delta Metadata Update ===")
    logger.info("Configuration:")
    logger.info(f"  Qdrant URL: {QDRANT_URL}")
    logger.info(f"  Days to update: {DAYS_TO_UPDATE}")
    logger.info(f"  Batch size: {BATCH_SIZE}")
    logger.info(f"  Rate limit delay: {RATE_LIMIT_DELAY}s")
    logger.info(f"  Max concurrent: {MAX_CONCURRENT_UPDATES}")

    # Load state
    state = load_state()

    # Get recent files
    recent_files = []
    cutoff_time = datetime.now() - timedelta(days=DAYS_TO_UPDATE)
    logs_path = Path(LOGS_DIR)

    if logs_path.exists():
        for jsonl_file in logs_path.glob("**/*.jsonl"):
            try:
                mtime = datetime.fromtimestamp(jsonl_file.stat().st_mtime)
            except (OSError, OverflowError, ValueError):
                # File vanished, is unreadable, or has an unusable timestamp.
                continue
            if mtime >= cutoff_time:
                recent_files.append(jsonl_file)

    logger.info(f"Found {len(recent_files)} conversations from the past {DAYS_TO_UPDATE} days")

    # Clamp the knobs: a step of 0 would make range() raise, and a semaphore
    # needs a positive value.
    batch_size = max(1, BATCH_SIZE)
    semaphore = asyncio.Semaphore(max(1, MAX_CONCURRENT_UPDATES))

    async def _process_bounded(path: Path) -> bool:
        """Process one conversation while holding a concurrency slot."""
        async with semaphore:
            return await process_conversation_async(path, state)

    # Process in batches
    success_count = 0
    failed_count = 0
    total_batches = (len(recent_files) + batch_size - 1) // batch_size

    for start in range(0, len(recent_files), batch_size):
        batch = recent_files[start:start + batch_size]
        logger.info(f"Processing batch {start // batch_size + 1}/{total_batches}")

        # Run the batch concurrently and wait for all of it to finish.
        results = await asyncio.gather(*(_process_bounded(path) for path in batch))

        # Count results
        batch_success = sum(1 for r in results if r)
        success_count += batch_success
        failed_count += len(results) - batch_success

        # Save state after each batch
        save_state(state)

        # Add delay between batches to avoid overwhelming the system
        if start + batch_size < len(recent_files):
            await asyncio.sleep(1.0)

    # Final save
    save_state(state)

    logger.info("=== Delta Update Complete ===")
    logger.info(f"Successfully updated: {success_count} conversations")
    logger.info(f"Failed: {failed_count} conversations")
    logger.info(f"Total conversations in state: {len(state.get('updated_conversations') or {})}")
436
+
437
def main():
    """Script entry point: drive the async updater to completion."""
    updater = main_async()
    asyncio.run(updater)


if __name__ == "__main__":
    main()