claude-self-reflect 3.2.4 → 3.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/.claude/agents/claude-self-reflect-test.md +595 -528
  2. package/.claude/agents/reflection-specialist.md +59 -3
  3. package/README.md +14 -5
  4. package/mcp-server/run-mcp.sh +49 -5
  5. package/mcp-server/src/app_context.py +64 -0
  6. package/mcp-server/src/config.py +57 -0
  7. package/mcp-server/src/connection_pool.py +286 -0
  8. package/mcp-server/src/decay_manager.py +106 -0
  9. package/mcp-server/src/embedding_manager.py +64 -40
  10. package/mcp-server/src/embeddings_old.py +141 -0
  11. package/mcp-server/src/models.py +64 -0
  12. package/mcp-server/src/parallel_search.py +371 -0
  13. package/mcp-server/src/project_resolver.py +5 -0
  14. package/mcp-server/src/reflection_tools.py +206 -0
  15. package/mcp-server/src/rich_formatting.py +196 -0
  16. package/mcp-server/src/search_tools.py +826 -0
  17. package/mcp-server/src/server.py +127 -1720
  18. package/mcp-server/src/temporal_design.py +132 -0
  19. package/mcp-server/src/temporal_tools.py +597 -0
  20. package/mcp-server/src/temporal_utils.py +384 -0
  21. package/mcp-server/src/utils.py +150 -67
  22. package/package.json +10 -1
  23. package/scripts/add-timestamp-indexes.py +134 -0
  24. package/scripts/check-collections.py +29 -0
  25. package/scripts/debug-august-parsing.py +76 -0
  26. package/scripts/debug-import-single.py +91 -0
  27. package/scripts/debug-project-resolver.py +82 -0
  28. package/scripts/debug-temporal-tools.py +135 -0
  29. package/scripts/delta-metadata-update.py +547 -0
  30. package/scripts/import-conversations-unified.py +53 -2
  31. package/scripts/precompact-hook.sh +33 -0
  32. package/scripts/streaming-watcher.py +1443 -0
  33. package/scripts/utils.py +39 -0
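
Since the release ships through a public registry (a package.json is among the changed files), the same comparison can be reproduced locally with npm's built-in differ (a sketch, assuming npm 7+ and that the package is published to the npm registry):

    npm diff --diff=claude-self-reflect@3.2.4 --diff=claude-self-reflect@3.3.0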
package/scripts/delta-metadata-update.py (new file)
@@ -0,0 +1,547 @@
+ #!/usr/bin/env python3
+ """
+ Delta metadata update script for Claude Self-Reflect.
+ Updates existing Qdrant points with tool usage metadata without re-importing vectors.
+ This allows us to enhance past conversations with file tracking and concept extraction.
+ """
+
+ import os
+ import sys
+ import json
+ import hashlib
+ import re
+ import time
+ from datetime import datetime, timedelta
+ from typing import List, Dict, Any, Set, Tuple, Optional
+ import logging
+ from pathlib import Path
+
+ from qdrant_client import QdrantClient
+ from qdrant_client.models import Filter, FieldCondition, MatchValue
+
+ # Configuration
+ QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
+ LOGS_DIR = os.getenv("LOGS_DIR", os.path.expanduser("~/.claude/projects"))
+ STATE_FILE = os.getenv("STATE_FILE", "./config/delta-update-state.json")
+ PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "true").lower() == "true"
+ DRY_RUN = os.getenv("DRY_RUN", "false").lower() == "true"
+ DAYS_TO_UPDATE = int(os.getenv("DAYS_TO_UPDATE", "7"))
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ # Initialize Qdrant client
+ client = QdrantClient(url=QDRANT_URL)
+
+ def get_collection_suffix():
+     """Get the collection suffix based on embedding type."""
+     return "_local" if PREFER_LOCAL_EMBEDDINGS else "_voyage"
+
+ def normalize_project_name(project_name: str) -> str:
+     """Normalize project name by removing path-like prefixes."""
+     # Remove path-like prefixes (e.g., "-Users-username-projects-")
+     if project_name.startswith("-"):
+         # Split by '-' and reconstruct
+         parts = project_name.split("-")
+         # Find where the actual project name starts (usually after 'projects')
+         for i, part in enumerate(parts):
+             if part == "projects" and i < len(parts) - 1:
+                 return "-".join(parts[i+1:])
+     return project_name
+
+ def normalize_path(path: str) -> str:
+     """Normalize file paths for consistency across platforms."""
+     if not path:
+         return ""
+
+     # Remove common prefixes
+     path = path.replace("/Users/", "~/")
+     path = path.replace("\\Users\\", "~\\")
+
+     # Convert to forward slashes
+     path = path.replace("\\", "/")
+
+     # Remove duplicate slashes
+     path = re.sub(r'/+', '/', path)
+
+     return path
+
+ def extract_concepts(text: str, tool_usage: Dict[str, Any]) -> Set[str]:
+     """Extract high-level concepts from conversation and tool usage."""
+     concepts = set()
+
+     # Common development concepts with patterns
+     concept_patterns = {
+         'security': r'(security|vulnerability|CVE|injection|sanitize|escape|auth|token|JWT)',
+         'performance': r'(performance|optimization|speed|memory|efficient|benchmark|latency)',
+         'testing': r'(test|pytest|unittest|coverage|TDD|spec|assert)',
+         'docker': r'(docker|container|compose|dockerfile|kubernetes|k8s)',
+         'api': r'(API|REST|GraphQL|endpoint|webhook|http|request)',
+         'database': r'(database|SQL|query|migration|schema|postgres|mysql|mongodb|qdrant)',
+         'authentication': r'(auth|login|token|JWT|session|oauth|permission)',
+         'debugging': r'(debug|error|exception|traceback|log|stack|trace)',
+         'refactoring': r'(refactor|cleanup|improve|restructure|optimize|technical debt)',
+         'deployment': r'(deploy|CI/CD|release|production|staging|rollout)',
+         'git': r'(git|commit|branch|merge|pull request|PR|rebase)',
+         'architecture': r'(architecture|design|pattern|structure|component|module)',
+         'mcp': r'(MCP|claude-self-reflect|tool|agent|claude code)',
+         'embeddings': r'(embedding|vector|semantic|similarity|fastembed|voyage)',
+         'search': r'(search|query|find|filter|match|relevance)'
+     }
+
+     # Check text content
+     combined_text = text.lower()
+     for concept, pattern in concept_patterns.items():
+         if re.search(pattern, combined_text, re.IGNORECASE):
+             concepts.add(concept)
+
+     # Check tool usage patterns
+     if tool_usage.get('grep_searches'):
+         concepts.add('search')
+     if tool_usage.get('files_edited') or tool_usage.get('files_created'):
+         concepts.add('development')
+     if any('test' in str(f).lower() for f in tool_usage.get('files_read', [])):
+         concepts.add('testing')
+     if any('docker' in str(cmd).lower() for cmd in tool_usage.get('bash_commands', [])):
+         concepts.add('docker')
+
+     return concepts
+
+ def extract_tool_usage_from_jsonl(jsonl_path: str) -> Dict[str, Any]:
+     """Extract all tool usage from a conversation."""
+     tool_usage = {
+         "files_read": [],
+         "files_edited": [],
+         "files_created": [],
+         "grep_searches": [],
+         "bash_commands": [],
+         "glob_patterns": [],
+         "task_calls": [],
+         "web_searches": [],
+         "tools_summary": {}
+     }
+
+     try:
+         with open(jsonl_path, 'r', encoding='utf-8') as f:
+             for line in f:
+                 if not line.strip():
+                     continue
+
+                 try:
+                     data = json.loads(line)
+
+                     # Look for tool usage in message content
+                     if 'message' in data and data['message']:
+                         msg = data['message']
+                         if msg.get('role') == 'assistant' and msg.get('content'):
+                             content = msg['content']
+
+                             # Handle content as list of objects
+                             if isinstance(content, list):
+                                 for item in content:
+                                     if isinstance(item, dict) and item.get('type') == 'tool_use':
+                                         extract_tool_data(item, tool_usage)
+                             # Handle content as string (legacy format)
+                             elif isinstance(content, str):
+                                 # Try to extract tool usage from text patterns
+                                 extract_tools_from_text(content, tool_usage)
+
+                 except json.JSONDecodeError:
+                     continue
+                 except Exception as e:
+                     logger.debug(f"Error processing line: {e}")
+
+     except Exception as e:
+         logger.error(f"Error reading JSONL file {jsonl_path}: {e}")
+
+     # Calculate tools summary
+     all_tools = []
+     if tool_usage['files_read']:
+         all_tools.extend(['Read'] * len(tool_usage['files_read']))
+     if tool_usage['files_edited']:
+         all_tools.extend(['Edit'] * len(tool_usage['files_edited']))
+     if tool_usage['files_created']:
+         all_tools.extend(['Write'] * len(tool_usage['files_created']))
+     if tool_usage['grep_searches']:
+         all_tools.extend(['Grep'] * len(tool_usage['grep_searches']))
+     if tool_usage['bash_commands']:
+         all_tools.extend(['Bash'] * len(tool_usage['bash_commands']))
+     if tool_usage['glob_patterns']:
+         all_tools.extend(['Glob'] * len(tool_usage['glob_patterns']))
+     if tool_usage['task_calls']:
+         all_tools.extend(['Task'] * len(tool_usage['task_calls']))
+     if tool_usage['web_searches']:
+         all_tools.extend(['WebSearch'] * len(tool_usage['web_searches']))
+
+     # Count tool usage
+     for tool in all_tools:
+         tool_usage['tools_summary'][tool] = tool_usage['tools_summary'].get(tool, 0) + 1
+
+     return tool_usage
+
+ def extract_tool_data(tool_use: Dict[str, Any], usage_dict: Dict[str, Any]):
+     """Extract tool usage data from a tool_use object."""
+     tool_name = tool_use.get('name', '')
+     inputs = tool_use.get('input', {})
+
+     # Handle Read tool
+     if tool_name == 'Read':
+         file_path = inputs.get('file_path')
+         if file_path:
+             usage_dict['files_read'].append({
+                 'path': normalize_path(file_path),
+                 'operation': 'read'
+             })
+
+     # Handle Edit and MultiEdit tools
+     elif tool_name in ['Edit', 'MultiEdit']:
+         path = inputs.get('file_path')
+         if path:
+             usage_dict['files_edited'].append({
+                 'path': normalize_path(path),
+                 'operation': tool_name.lower()
+             })
+
+     # Handle Write tool
+     elif tool_name == 'Write':
+         path = inputs.get('file_path')
+         if path:
+             usage_dict['files_created'].append({
+                 'path': normalize_path(path),
+                 'operation': 'write'
+             })
+
+     # Handle Grep tool
+     elif tool_name == 'Grep':
+         pattern = inputs.get('pattern')
+         path = inputs.get('path', '.')
+         if pattern:
+             usage_dict['grep_searches'].append({
+                 'pattern': pattern,
+                 'path': normalize_path(path)
+             })
+
+     # Handle Bash tool
+     elif tool_name == 'Bash':
+         command = inputs.get('command')
+         if command:
+             usage_dict['bash_commands'].append({
+                 'command': command[:200]  # Limit command length
+             })
+
+     # Handle Glob tool
+     elif tool_name == 'Glob':
+         pattern = inputs.get('pattern')
+         if pattern:
+             usage_dict['glob_patterns'].append({
+                 'pattern': pattern
+             })
+
+     # Handle Task tool
+     elif tool_name == 'Task':
+         agent = inputs.get('subagent_type', 'unknown')
+         usage_dict['task_calls'].append({
+             'agent': agent
+         })
+
+     # Handle WebSearch tool
+     elif tool_name == 'WebSearch':
+         query = inputs.get('query')
+         if query:
+             usage_dict['web_searches'].append({
+                 'query': query[:100]
+             })
+
+ def extract_tools_from_text(content: str, usage_dict: Dict[str, Any]):
+     """Extract tool usage from text content (fallback for legacy format)."""
+     # Look for file paths that might have been read/edited
+     file_pattern = r'(?:Reading|Editing|Writing|Checking)\s+(?:file\s+)?([/~][\w\-./]+\.\w+)'
+     for match in re.finditer(file_pattern, content):
+         file_path = match.group(1)
+         if 'Edit' in match.group(0):
+             usage_dict['files_edited'].append({
+                 'path': normalize_path(file_path),
+                 'operation': 'edit'
+             })
+         else:
+             usage_dict['files_read'].append({
+                 'path': normalize_path(file_path),
+                 'operation': 'read'
+             })
+
+ def load_state():
+     """Load the delta update state."""
+     state_path = Path(STATE_FILE)
+     if state_path.exists():
+         try:
+             with open(state_path, 'r') as f:
+                 return json.load(f)
+         except Exception as e:
+             logger.warning(f"Failed to load state: {e}")
+
+     return {
+         "last_update": None,
+         "updated_conversations": {}
+     }
+
+ def save_state(state: Dict[str, Any]):
+     """Save the delta update state."""
+     state_path = Path(STATE_FILE)
+     state_path.parent.mkdir(parents=True, exist_ok=True)
+
+     try:
+         with open(state_path, 'w') as f:
+             json.dump(state, f, indent=2)
+     except Exception as e:
+         logger.error(f"Failed to save state: {e}")
+
+ def get_recent_conversations(days: int = 7) -> List[Path]:
+     """Get conversation files from the past N days."""
+     recent_files = []
+     cutoff_time = datetime.now() - timedelta(days=days)
+
+     logs_path = Path(LOGS_DIR)
+     if not logs_path.exists():
+         logger.error(f"Logs directory not found: {LOGS_DIR}")
+         return recent_files
+
+     # Find all JSONL files
+     for jsonl_file in logs_path.glob("**/*.jsonl"):
+         try:
+             # Check file modification time
+             mtime = datetime.fromtimestamp(jsonl_file.stat().st_mtime)
+             if mtime >= cutoff_time:
+                 recent_files.append(jsonl_file)
+         except Exception as e:
+             logger.debug(f"Error checking file {jsonl_file}: {e}")
+
+     logger.info(f"Found {len(recent_files)} conversations from the past {days} days")
+     return recent_files
+
+ def update_point_metadata(conversation_id: str, chunk_index: int, metadata: Dict[str, Any],
+                           collection_name: str) -> bool:
+     """Update metadata for a specific point in Qdrant."""
+     try:
+         # Calculate point ID (same as original import)
+         point_id_str = hashlib.md5(
+             f"{conversation_id}_{chunk_index}".encode()
+         ).hexdigest()[:16]
+         point_id = int(point_id_str, 16) % (2**63)
+
+         if DRY_RUN:
+             logger.info(f"[DRY RUN] Would update point {point_id} with metadata")
+             return True
+
+         # First, try to get the existing point to preserve other fields
+         try:
+             existing_points = client.retrieve(
+                 collection_name=collection_name,
+                 ids=[point_id],
+                 with_payload=True,
+                 with_vectors=False
+             )
+
+             if existing_points:
+                 # Merge with existing payload
+                 existing_payload = existing_points[0].payload
+                 existing_payload.update(metadata)
+                 metadata = existing_payload
+         except Exception as e:
+             logger.debug(f"Could not retrieve existing point {point_id}: {e}")
+
+         # Use set_payload to update just the metadata without touching the vector
+         client.set_payload(
+             collection_name=collection_name,
+             payload=metadata,
+             points=[point_id],
+             wait=False  # Don't wait for each point
+         )
+
+         return True
+
+     except Exception as e:
+         import traceback
+         logger.error(f"Failed to update point {conversation_id}_{chunk_index}: {e}")
+         logger.debug(traceback.format_exc())
+         return False
+
+ def process_conversation(jsonl_file: Path, state: Dict[str, Any]) -> bool:
+     """Process a single conversation file and update its metadata."""
+     try:
+         conversation_id = jsonl_file.stem
+         project_name = jsonl_file.parent.name
+
+         # Check if already updated
+         if conversation_id in state.get("updated_conversations", {}):
+             last_updated = state["updated_conversations"][conversation_id].get("updated_at")
+             file_mtime = jsonl_file.stat().st_mtime
+             if last_updated and last_updated >= file_mtime:
+                 logger.debug(f"Skipping {conversation_id} - already updated")
+                 return True
+
+         logger.info(f"Processing: {conversation_id}")
+
+         # Extract tool usage metadata
+         tool_usage = extract_tool_usage_from_jsonl(str(jsonl_file))
+
+         # Read the full conversation to get text for concept extraction
+         conversation_text = ""
+         with open(jsonl_file, 'r', encoding='utf-8') as f:
+             for line in f:
+                 if line.strip():
+                     try:
+                         data = json.loads(line)
+                         if 'message' in data and data['message']:
+                             msg = data['message']
+                             if msg.get('content'):
+                                 if isinstance(msg['content'], str):
+                                     conversation_text += msg['content'] + "\n"
+                                 elif isinstance(msg['content'], list):
+                                     for item in msg['content']:
+                                         if isinstance(item, dict) and item.get('text'):
+                                             conversation_text += item['text'] + "\n"
+                     except:
+                         continue
+
+         # Extract concepts
+         concepts = extract_concepts(conversation_text[:10000], tool_usage)  # Limit text for concept extraction
+
+         # Prepare metadata update
+         files_analyzed = list(set([
+             item['path'] if isinstance(item, dict) else item
+             for item in tool_usage.get('files_read', [])
+             if (isinstance(item, dict) and item.get('path')) or isinstance(item, str)
+         ]))[:20]  # Limit to 20 files
+
+         files_edited = list(set([
+             item['path'] if isinstance(item, dict) else item
+             for item in tool_usage.get('files_edited', [])
+             if (isinstance(item, dict) and item.get('path')) or isinstance(item, str)
+         ]))[:10]  # Limit to 10 files
+
+         metadata_update = {
+             "files_analyzed": files_analyzed,
+             "files_edited": files_edited,
+             "tools_used": list(tool_usage.get('tools_summary', {}).keys())[:20],
+             "tool_summary": dict(list(tool_usage.get('tools_summary', {}).items())[:10]),
+             "concepts": list(concepts)[:15],
+             "search_patterns": [s.get('pattern', '') for s in tool_usage.get('grep_searches', [])][:10],
+             "analysis_only": len(files_edited) == 0 and len(tool_usage.get('files_created', [])) == 0,
+             "has_file_metadata": True,  # Flag to indicate this has been enhanced
+             "metadata_updated_at": datetime.now().isoformat()
+         }
+
+         # Determine collection name
+         project_hash = hashlib.md5(normalize_project_name(project_name).encode()).hexdigest()[:8]
+         collection_name = f"conv_{project_hash}{get_collection_suffix()}"
+
+         # Check if collection exists
+         try:
+             collections = client.get_collections().collections
+             if collection_name not in [c.name for c in collections]:
+                 logger.warning(f"Collection {collection_name} not found for project {project_name}")
+                 return False
+         except Exception as e:
+             logger.error(f"Error checking collection: {e}")
+             return False
+
+         # Get the number of chunks for this conversation
+         # We need to know how many chunks were created during original import
+         # For now, we'll try to update up to 50 chunks (most conversations have fewer)
+         max_chunks = 50
+         updated_count = 0
+         failed_count = 0
+
+         for chunk_index in range(max_chunks):
+             success = update_point_metadata(
+                 conversation_id,
+                 chunk_index,
+                 metadata_update,
+                 collection_name
+             )
+
+             if success:
+                 updated_count += 1
+             else:
+                 failed_count += 1
+                 # If we get too many failures in a row, the conversation probably has fewer chunks
+                 if failed_count > 5:
+                     break
+
+         if updated_count > 0:
+             logger.info(f"Updated {updated_count} chunks for {conversation_id}")
+
+             # Update state
+             state["updated_conversations"][conversation_id] = {
+                 "updated_at": time.time(),
+                 "chunks_updated": updated_count,
+                 "project": project_name
+             }
+
+             return True
+         else:
+             logger.warning(f"No chunks updated for {conversation_id}")
+             return False
+
+     except Exception as e:
+         logger.error(f"Failed to process {jsonl_file}: {e}")
+         return False
+
+ def main():
+     """Main delta update function."""
+     logger.info("=== Starting Delta Metadata Update ===")
+     logger.info(f"Configuration:")
+     logger.info(f"  Qdrant URL: {QDRANT_URL}")
+     logger.info(f"  Logs directory: {LOGS_DIR}")
+     logger.info(f"  Days to update: {DAYS_TO_UPDATE}")
+     logger.info(f"  Embedding type: {'local' if PREFER_LOCAL_EMBEDDINGS else 'voyage'}")
+     logger.info(f"  Dry run: {DRY_RUN}")
+
+     # Load state
+     state = load_state()
+
+     # Get recent conversations
+     recent_files = get_recent_conversations(DAYS_TO_UPDATE)
+
+     if not recent_files:
+         logger.info("No recent conversations found to update")
+         return
+
+     # Limit for testing
+     if os.getenv("LIMIT"):
+         limit = int(os.getenv("LIMIT"))
+         recent_files = recent_files[:limit]
+         logger.info(f"Limited to {limit} files for testing")
+
+     # Process each conversation
+     success_count = 0
+     failure_count = 0
+
+     for i, jsonl_file in enumerate(recent_files, 1):
+         logger.info(f"Processing {i}/{len(recent_files)}: {jsonl_file.name}")
+
+         if process_conversation(jsonl_file, state):
+             success_count += 1
+         else:
+             failure_count += 1
+
+         # Save state periodically
+         if i % 10 == 0:
+             save_state(state)
+
+     # Final state save
+     state["last_update"] = datetime.now().isoformat()
+     save_state(state)
+
+     # Summary
+     logger.info("=== Delta Update Complete ===")
+     logger.info(f"Successfully updated: {success_count} conversations")
+     logger.info(f"Failed: {failure_count} conversations")
+     logger.info(f"Total conversations in state: {len(state['updated_conversations'])}")
+
+     if DRY_RUN:
+         logger.info("This was a DRY RUN - no actual updates were made")
+
+ if __name__ == "__main__":
+     main()
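
The new script is configured entirely through the environment variables read at the top of the file. A cautious first pass might look like the following (a sketch; DRY_RUN, DAYS_TO_UPDATE, QDRANT_URL and LIMIT are the knobs defined above, and the script path assumes a run from the repository root):

    # Preview what would change over the last 3 days, capped at 5 conversations
    DRY_RUN=true DAYS_TO_UPDATE=3 LIMIT=5 \
        python scripts/delta-metadata-update.py

    # Then apply for real against the default Qdrant instance
    QDRANT_URL=http://localhost:6333 DAYS_TO_UPDATE=7 \
        python scripts/delta-metadata-update.py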
package/scripts/import-conversations-unified.py
@@ -13,11 +13,23 @@ import ast
  import re
  import fcntl
  import time
+ import argparse
  from pathlib import Path
  from datetime import datetime
  from typing import List, Dict, Any, Optional, Set
  import logging

+ # Load .env file if it exists
+ try:
+     from dotenv import load_dotenv
+     # Load from project root
+     env_path = Path(__file__).parent.parent / '.env'
+     if env_path.exists():
+         load_dotenv(env_path)
+         print(f"Loaded .env from {env_path}")
+ except ImportError:
+     pass  # dotenv not available, use system environment
+
  # Add the scripts directory to the Python path for utils import
  scripts_dir = Path(__file__).parent
  sys.path.insert(0, str(scripts_dir))
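
The dotenv block resolves the .env file relative to the script's parent directory, i.e. the repository root. A minimal file this loader would pick up might look like the following (hypothetical values; the variable names appear elsewhere in this diff):

    # <repo-root>/.env
    PREFER_LOCAL_EMBEDDINGS=true
    QDRANT_URL=http://localhost:6333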
@@ -133,7 +145,8 @@ def ensure_collection(collection_name: str):

  def generate_embeddings(texts: List[str]) -> List[List[float]]:
      """Generate embeddings for texts."""
-     if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
+     # Use the global embedding_provider which gets updated by command-line args
+     if PREFER_LOCAL_EMBEDDINGS:
          embeddings = list(embedding_provider.passage_embed(texts))
          return [emb.tolist() if hasattr(emb, 'tolist') else emb for emb in embeddings]
      else:
@@ -673,6 +686,32 @@ def update_file_state(file_path: Path, state: dict, chunks: int):

  def main():
      """Main import function."""
+     # Parse command-line arguments
+     parser = argparse.ArgumentParser(description='Import conversations with unified embeddings support')
+     parser.add_argument('--prefer-voyage', action='store_true',
+                         help='Use Voyage AI embeddings instead of local FastEmbed')
+     parser.add_argument('--limit', type=int,
+                         help='Limit number of files to import')
+     parser.add_argument('--max-files-per-cycle', type=int,
+                         help='Maximum files to process per cycle')
+     args = parser.parse_args()
+
+     # Override environment variable if --prefer-voyage is specified
+     global PREFER_LOCAL_EMBEDDINGS, embedding_provider, embedding_dimension, collection_suffix
+     if args.prefer_voyage:
+         if not VOYAGE_API_KEY:
+             logger.error("--prefer-voyage specified but VOYAGE_KEY environment variable not set")
+             sys.exit(1)
+         logger.info("Command-line flag --prefer-voyage detected, switching to Voyage AI embeddings")
+         PREFER_LOCAL_EMBEDDINGS = False
+
+         # Re-initialize embedding provider with Voyage
+         import voyageai
+         embedding_provider = voyageai.Client(api_key=VOYAGE_API_KEY)
+         embedding_dimension = 1024
+         collection_suffix = "voyage"
+         logger.info("Switched to Voyage AI embeddings (dimension: 1024)")
+
      # Load state
      state = load_state()
      logger.info(f"Loaded state with {len(state.get('imported_files', {}))} previously imported files")
@@ -695,6 +734,7 @@ def main():
      logger.info(f"Found {len(project_dirs)} projects to import")

      total_imported = 0
+     files_processed = 0

      for project_dir in project_dirs:
          # Get collection name
@@ -707,13 +747,24 @@ def main():
          # Find JSONL files
          jsonl_files = sorted(project_dir.glob("*.jsonl"))

+         # Apply limit from command line if specified
+         if args.limit and files_processed >= args.limit:
+             logger.info(f"Reached limit of {args.limit} files, stopping import")
+             break
+
          # Limit files per cycle if specified
-         max_files = int(os.getenv("MAX_FILES_PER_CYCLE", "1000"))
+         max_files = args.max_files_per_cycle or int(os.getenv("MAX_FILES_PER_CYCLE", "1000"))
          jsonl_files = jsonl_files[:max_files]

          for jsonl_file in jsonl_files:
+             # Check limit again per file
+             if args.limit and files_processed >= args.limit:
+                 logger.info(f"Reached limit of {args.limit} files, stopping import")
+                 break
+
              if should_import_file(jsonl_file, state):
                  chunks = stream_import_file(jsonl_file, collection_name, project_dir)
+                 files_processed += 1
                  if chunks > 0:
                      # Verify data is actually in Qdrant before marking as imported
                      from qdrant_client.models import Filter, FieldCondition, MatchValue
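
Taken together, the argparse changes give the importer three new switches. Example invocations (a sketch; --prefer-voyage requires the VOYAGE_KEY environment variable, as the error path above enforces):

    # Import at most 10 files with the default local FastEmbed embeddings
    python scripts/import-conversations-unified.py --limit 10

    # Switch to Voyage AI embeddings (dimension 1024, collection suffix "voyage")
    VOYAGE_KEY=... python scripts/import-conversations-unified.py --prefer-voyage

    # Cap the batch size per project instead of via MAX_FILES_PER_CYCLE
    python scripts/import-conversations-unified.py --max-files-per-cycle 100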
package/scripts/precompact-hook.sh (new file)
@@ -0,0 +1,33 @@
+ #!/bin/bash
+ # PreCompact hook for Claude Self-Reflect
+ # Place this in ~/.claude/hooks/precompact or source it from there
+
+ # Configuration
+ CLAUDE_REFLECT_DIR="${CLAUDE_REFLECT_DIR:-$HOME/claude-self-reflect}"
+ VENV_PATH="${VENV_PATH:-$CLAUDE_REFLECT_DIR/.venv}"
+ IMPORT_TIMEOUT="${IMPORT_TIMEOUT:-30}"
+
+ # Check if Claude Self-Reflect is installed
+ if [ ! -d "$CLAUDE_REFLECT_DIR" ]; then
+     echo "Claude Self-Reflect not found at $CLAUDE_REFLECT_DIR" >&2
+     exit 0  # Exit gracefully
+ fi
+
+ # Check if virtual environment exists
+ if [ ! -d "$VENV_PATH" ]; then
+     echo "Virtual environment not found at $VENV_PATH" >&2
+     exit 0  # Exit gracefully
+ fi
+
+ # Run quick import with timeout
+ echo "Updating conversation memory..." >&2
+ timeout $IMPORT_TIMEOUT bash -c "
+     source '$VENV_PATH/bin/activate' 2>/dev/null
+     python '$CLAUDE_REFLECT_DIR/scripts/import-latest.py' 2>&1 | \
+         grep -E '(Quick import completed|Imported|Warning)' >&2
+ " || {
+     echo "Quick import timed out after ${IMPORT_TIMEOUT}s" >&2
+ }
+
+ # Always exit successfully to not block compacting
+ exit 0
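
The hook exits 0 on every path, so wiring it up is safe even on a machine where the project is absent. One way to install it, following the script's own header comment (a sketch; the exported values are the defaults the hook already falls back to):

    mkdir -p ~/.claude/hooks
    cp scripts/precompact-hook.sh ~/.claude/hooks/precompact
    chmod +x ~/.claude/hooks/precompact

    # Optional overrides read by the hook
    export CLAUDE_REFLECT_DIR="$HOME/claude-self-reflect"
    export IMPORT_TIMEOUT=60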