claude-self-reflect 2.3.6 → 2.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,867 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Import Claude conversation logs from JSONL files into Qdrant vector database using Voyage AI embeddings.
4
- Enhanced version with detailed progress tracking, time estimates, and dry-run mode.
5
- """
6
-
7
- import json
8
- import os
9
- import glob
10
- import time
11
- import hashlib
12
- from datetime import datetime, timedelta
13
- from typing import List, Dict, Any, Optional, Tuple
14
- import logging
15
- from qdrant_client import QdrantClient
16
- from qdrant_client.models import VectorParams, Distance, PointStruct
17
- import requests
18
- import backoff
19
- from tqdm import tqdm
20
- import humanize
21
- import sys
22
- import argparse
23
-
24
# Configuration
# Endpoints and filesystem locations; each one can be overridden through the environment.
QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")
LOGS_DIR = os.environ.get("LOGS_DIR", "/logs")
STATE_FILE = os.environ.get("STATE_FILE", "/config/imported-files.json")
# VOYAGE_KEY-2 wins when it is set to a non-empty value; otherwise VOYAGE_KEY is used.
VOYAGE_API_KEY = os.environ.get("VOYAGE_KEY-2") or os.environ.get("VOYAGE_KEY")
BATCH_SIZE = int(os.environ.get("BATCH_SIZE", "20"))  # Voyage supports batch embedding
CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "10"))  # Can use larger chunks with 32k token limit
RATE_LIMIT_DELAY = 1  # 1 second between requests for paid account (60 RPM)
EMBEDDING_MODEL = "voyage-3.5-lite"
EMBEDDING_DIMENSIONS = 1024  # Voyage default dimensions
VOYAGE_API_URL = "https://api.voyageai.com/v1/embeddings"

# Set up logging (less verbose for progress mode)
logging.basicConfig(level=logging.WARNING, format='%(message)s')
logger = logging.getLogger(__name__)
42
-
43
- class EnhancedVoyageImporter:
44
def __init__(self, dry_run=False, validate_only=False, preview=False):
    """Initialize the importer with Qdrant and Voyage AI.

    Args:
        dry_run: Simulate import without making changes
        validate_only: Only validate setup and files
        preview: Show sample chunks in dry-run mode

    Raises:
        ValueError: If no Voyage API key is configured and this is a real
            import (neither dry-run nor validate-only).
        Exception: Whatever the Qdrant client raises when it cannot be
            created or the connection test fails during a real import.
    """
    self.dry_run = dry_run
    self.validate_only = validate_only
    self.preview = preview

    # In dry-run / validate-only mode, setup problems are reported as
    # warnings instead of aborting.
    soft_mode = self.dry_run or self.validate_only

    # Define these up front so later code can rely on the attributes existing.
    self.qdrant_client = None
    self.voyage_headers = None

    if soft_mode:
        print(f"šŸ” Running in {'VALIDATE-ONLY' if self.validate_only else 'DRY-RUN'} mode...")
        print("=" * 60)

    # Validate API key
    self.voyage_available = bool(VOYAGE_API_KEY)
    if not self.voyage_available:
        if not soft_mode:
            raise ValueError("VOYAGE_KEY environment variable not set")
        print("āš ļø VOYAGE_KEY environment variable not set")

    print("šŸš€ Initializing Claude-Self-Reflect Importer...")
    print("=" * 60)

    # Create the Qdrant client in every mode: validate_setup() can only
    # check the connection if a client object exists. The connection
    # itself is tested here only for a real import.
    try:
        self.qdrant_client = QdrantClient(url=QDRANT_URL, timeout=60)
        if not soft_mode:
            # Test connection
            self.qdrant_client.get_collections()
    except Exception as e:
        if not soft_mode:
            raise
        print(f"āš ļø Qdrant connection test failed: {e}")
        self.qdrant_client = None

    if self.voyage_available:
        self.voyage_headers = {
            "Authorization": f"Bearer {VOYAGE_API_KEY}",
            "Content-Type": "application/json"
        }

    self.state = self._load_state()
    self.total_imported = 0
    self.total_errors = 0
    self.start_time = time.time()

    # Statistics for progress tracking
    self.stats = {
        'files_processed': 0,
        'total_files': 0,
        'chunks_created': 0,
        'embeddings_generated': 0,
        'messages_processed': 0,
        'bytes_processed': 0,
        'api_calls': 0,
        'estimated_cost': 0.0,
        'sample_chunks': []
    }
110
-
111
def _load_state(self) -> Dict[str, Any]:
    """Load the import state from STATE_FILE, or return a fresh empty state.

    Two on-disk layouts are understood:
      * old format: {"files": [...paths...], "lastUpdated": ...}
      * new format: {"projects": {name: [...paths...]}, "last_updated": ..., "total_imported": ...}

    Returns:
        A dict that always has the keys "projects", "last_updated" and
        "total_imported". Any failure to read or interpret the file is
        printed as a warning and yields the empty state.
    """
    empty_state = {
        "projects": {},
        "last_updated": None,
        "total_imported": 0
    }

    if not os.path.exists(STATE_FILE):
        return empty_state

    try:
        with open(STATE_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Anything other than a JSON object cannot be indexed by
        # "projects" later on, so treat it as unusable.
        if not isinstance(data, dict):
            raise ValueError("state file does not contain a JSON object")

        # Handle old format (files list) vs new format (projects dict)
        if 'files' in data and 'projects' not in data:
            # Convert old format to new format
            projects = {}
            for file_path in data.get('files', []):
                # Extract project name from file path; the third
                # '/'-separated component is taken as the project name.
                parts = file_path.split('/')
                if len(parts) >= 3:
                    projects.setdefault(parts[2], []).append(file_path)
            return {
                "projects": projects,
                "last_updated": data.get('lastUpdated'),
                "total_imported": len(data.get('files', []))
            }

        # New format: fill in any missing keys so callers can index
        # self.state["projects"] unconditionally.
        for key, default in empty_state.items():
            data.setdefault(key, default)
        return data
    except Exception as e:
        print(f"āš ļø Failed to load state: {e}")

    return empty_state
144
-
145
def _save_state(self):
    """Save current state to disk.

    Stamps the state with the current time and this run's import count,
    then writes it to STATE_FILE as indented JSON. Failures are logged and
    swallowed so that a state-file problem does not abort an import.
    """
    try:
        # dirname is "" when STATE_FILE is a bare file name; os.makedirs("")
        # raises, which previously prevented the state from being written.
        state_dir = os.path.dirname(STATE_FILE)
        if state_dir:
            os.makedirs(state_dir, exist_ok=True)

        self.state["last_updated"] = datetime.now().isoformat()
        self.state["total_imported"] = self.total_imported

        with open(STATE_FILE, 'w', encoding='utf-8') as f:
            json.dump(self.state, f, indent=2)
    except Exception as e:
        logger.error(f"Failed to save state: {e}")
156
-
157
- def _get_collection_name(self, project_name: str) -> str:
158
- """Generate collection name for project with Voyage suffix."""
159
- project_hash = hashlib.md5(project_name.encode()).hexdigest()[:8]
160
- return f"conv_{project_hash}_voyage"
161
-
162
- def _ensure_collection(self, collection_name: str):
163
- """Ensure collection exists with correct configuration for OpenAI embeddings."""
164
- if self.dry_run:
165
- # In dry-run mode, just log what would happen
166
- print(f"[DRY-RUN] Would ensure collection: {collection_name}")
167
- return
168
-
169
- collections = [col.name for col in self.qdrant_client.get_collections().collections]
170
-
171
- if collection_name not in collections:
172
- self.qdrant_client.create_collection(
173
- collection_name=collection_name,
174
- vectors_config=VectorParams(
175
- size=EMBEDDING_DIMENSIONS,
176
- distance=Distance.COSINE
177
- )
178
- )
179
- else:
180
- # Verify dimensions
181
- info = self.qdrant_client.get_collection(collection_name)
182
- if info.config.params.vectors.size != EMBEDDING_DIMENSIONS:
183
- raise ValueError(f"Dimension mismatch in collection {collection_name}")
184
-
185
def _count_total_work(self) -> Tuple[int, int, int]:
    """Count total files and estimate chunks to process.

    Scans every non-hidden sub-directory of LOGS_DIR for ``.jsonl`` files.

    Returns:
        (total_files, new_files, estimated_chunks) where ``new_files`` are
        files not yet recorded in ``self.state["projects"]`` and
        ``estimated_chunks`` is a rough size-based estimate for the new
        files only. Returns (0, 0, 0) when LOGS_DIR does not exist.
    """
    total_files = 0
    new_files = 0
    estimated_chunks = 0

    projects_dir = LOGS_DIR
    if not os.path.exists(projects_dir):
        return 0, 0, 0

    # Count all JSONL files
    for project_name in os.listdir(projects_dir):
        project_path = os.path.join(projects_dir, project_name)
        if project_name.startswith('.') or not os.path.isdir(project_path):
            continue

        # Paths already imported for this project (empty if the project is unknown)
        already_imported = set(self.state["projects"].get(project_name, ()))

        for file in os.listdir(project_path):
            if not file.endswith('.jsonl'):
                continue

            total_files += 1
            file_path = os.path.join(project_path, file)
            if file_path in already_imported:
                continue

            new_files += 1

            # Estimate chunks based on file size
            try:
                file_size = os.path.getsize(file_path)
            except OSError:
                # File vanished or is unreadable: fall back to a default estimate
                estimated_chunks += 5
            else:
                # Rough estimate: 1 chunk per 10KB
                estimated_chunks += max(1, file_size // 10240)

    return total_files, new_files, estimated_chunks
220
-
221
- def _estimate_cost(self, text_count: int) -> float:
222
- """Estimate API cost for embeddings.
223
-
224
- Voyage AI pricing (as of 2024):
225
- - voyage-3.5-lite: $0.02 per 1M tokens
226
- - Estimated 500 tokens per chunk average
227
- """
228
- estimated_tokens = text_count * 500 # Average tokens per chunk
229
- cost_per_million = 0.02 # $0.02 per 1M tokens for voyage-3.5-lite
230
- return (estimated_tokens / 1_000_000) * cost_per_million
231
-
232
@backoff.on_exception(
    backoff.expo,
    Exception,
    max_tries=5,
    on_backoff=lambda details: None  # Silent backoff
)
def _generate_embeddings(self, texts: List[str], progress_bar=None) -> List[List[float]]:
    """Generate embeddings using Voyage AI API with retry logic.

    Args:
        texts: Texts to embed in a single API request.
        progress_bar: Optional progress bar whose description is updated
            with the current step.

    Returns:
        One embedding per input text. In dry-run mode no request is made
        and zero vectors of EMBEDDING_DIMENSIONS length are returned.

    Raises:
        Exception: On a non-200 response or any request failure. The
            decorator retries with exponential backoff, up to 5 tries,
            before the exception reaches the caller.
    """
    if self.dry_run:
        # In dry-run mode, simulate embeddings
        if progress_bar:
            progress_bar.set_description("[DRY-RUN] Simulating embeddings...")

        # Update cost estimation
        self.stats['estimated_cost'] += self._estimate_cost(len(texts))
        self.stats['api_calls'] += 1
        self.stats['embeddings_generated'] += len(texts)

        # Return fake embeddings
        return [[0.0] * EMBEDDING_DIMENSIONS for _ in texts]

    try:
        if progress_bar:
            progress_bar.set_description("šŸ¤– Generating embeddings...")

        response = requests.post(
            VOYAGE_API_URL,
            headers=self.voyage_headers,
            json={
                "input": texts,
                "model": EMBEDDING_MODEL,
                "input_type": "document"
            },
            # Without a timeout a stalled connection blocks forever and
            # the retry decorator never gets a chance to run.
            timeout=60
        )

        self.stats['api_calls'] += 1

        if response.status_code != 200:
            raise Exception(f"Voyage API error: {response.status_code} - {response.text}")

        data = response.json()
        embeddings = [item["embedding"] for item in data["data"]]
        self.stats['embeddings_generated'] += len(embeddings)

        return embeddings
    except Exception as e:
        if progress_bar:
            progress_bar.set_description(f"āŒ Embedding error: {str(e)[:30]}...")
        raise
281
-
282
- def _process_jsonl_file(self, file_path: str, progress_bar=None) -> List[Dict[str, Any]]:
283
- """Extract messages from a JSONL file with progress tracking."""
284
- messages = []
285
- file_size = os.path.getsize(file_path)
286
- self.stats['bytes_processed'] += file_size
287
-
288
- if progress_bar:
289
- progress_bar.set_description(f"šŸ“„ Reading {os.path.basename(file_path)[:30]}...")
290
-
291
- try:
292
- with open(file_path, 'r', encoding='utf-8') as f:
293
- for line_num, line in enumerate(f, 1):
294
- line = line.strip()
295
- if not line:
296
- continue
297
-
298
- try:
299
- data = json.loads(line)
300
-
301
- # Extract message if present
302
- if 'message' in data and data['message']:
303
- msg = data['message']
304
- if msg.get('role') and msg.get('content'):
305
- content = msg['content']
306
- if isinstance(content, dict):
307
- content = content.get('text', json.dumps(content))
308
-
309
- messages.append({
310
- 'role': msg['role'],
311
- 'content': content,
312
- 'file_path': file_path,
313
- 'line_number': line_num,
314
- 'timestamp': data.get('timestamp', datetime.now().isoformat())
315
- })
316
- self.stats['messages_processed'] += 1
317
- except json.JSONDecodeError:
318
- pass # Skip invalid JSON
319
- except Exception as e:
320
- logger.debug(f"Error processing line {line_num}: {e}")
321
-
322
- except Exception as e:
323
- logger.error(f"Failed to read file {file_path}: {e}")
324
-
325
- return messages
326
-
327
def _create_conversation_chunks(self, messages: List[Dict[str, Any]], progress_bar=None) -> List[Dict[str, Any]]:
    """Group consecutive messages into chunks of up to CHUNK_SIZE messages.

    Each chunk carries an MD5-based id, the joined "ROLE: content" text and
    metadata taken from the first message of the chunk. Messages longer
    than 20000 characters keep their first 15000 and last 5000 characters;
    a chunk's text is capped at roughly 100000 characters.

    Args:
        messages: Message dicts as produced by _process_jsonl_file.
        progress_bar: Optional progress bar whose description is updated.

    Returns:
        The list of chunk dicts (keys: id, text, metadata).
    """
    if progress_bar:
        progress_bar.set_description("āœ‚ļø Creating conversation chunks...")

    max_chars = 100000  # Much larger limit with Voyage!
    chunks = []

    for chunk_index, start in enumerate(range(0, len(messages), CHUNK_SIZE)):
        group = messages[start:start + CHUNK_SIZE]
        first = group[0]

        # Build the conversation text - Voyage supports 32k tokens
        parts = []
        used_chars = 0
        for message in group:
            speaker = message['role'].upper()
            body = message['content']

            # Only truncate extremely long messages
            if len(body) > 20000:
                body = body[:15000] + "\n\n[... truncated ...]\n\n" + body[-5000:]

            entry = f"{speaker}: {body}"

            if used_chars + len(entry) <= max_chars:
                parts.append(entry)
                used_chars += len(entry) + 2  # account for the "\n\n" separator
                continue

            # Over the limit: keep a shortened version if there is
            # meaningful room left, then stop adding messages.
            room = max_chars - used_chars
            if room > 1000:
                parts.append(f"{speaker}: {body[:room-100]}...")
            break

        conversation_text = "\n\n".join(parts)

        # Metadata comes from the path of the chunk's first message
        source_path = first['file_path']
        file_name = os.path.basename(source_path)
        project_name = os.path.basename(os.path.dirname(source_path))
        conversation_id = file_name.replace('.jsonl', '')

        # Unique, deterministic id per (project, conversation, start offset)
        id_seed = f"{project_name}_{conversation_id}_{start}"
        chunk_id = hashlib.md5(id_seed.encode()).hexdigest()

        chunks.append({
            'id': chunk_id,
            'text': conversation_text,
            'metadata': {
                'project': project_name,
                'conversation_id': conversation_id,
                'chunk_index': chunk_index,
                'message_count': len(group),
                'start_role': first['role'],
                'timestamp': first['timestamp'],
                'file': file_name
            }
        })

        # Keep up to three sample chunks for preview mode
        if self.preview and len(self.stats['sample_chunks']) < 3:
            if len(conversation_text) > 500:
                preview_text = conversation_text[:500] + '...'
            else:
                preview_text = conversation_text
            self.stats['sample_chunks'].append({
                'project': project_name,
                'file': file_name,
                'preview': preview_text,
                'message_count': len(group)
            })

    self.stats['chunks_created'] += len(chunks)
    return chunks
401
-
402
def _import_chunks_to_qdrant(self, chunks: List[Dict[str, Any]], collection_name: str, file_progress: tqdm):
    """Embed conversation chunks in batches and upsert them into Qdrant.

    Chunks are processed BATCH_SIZE at a time. In dry-run mode only the
    statistics and cost estimate are updated. A failing batch is counted in
    ``self.total_errors`` and skipped; the remaining batches still run.

    Args:
        chunks: Chunk dicts as produced by _create_conversation_chunks.
        collection_name: Target Qdrant collection.
        file_progress: Per-file progress bar that receives the running
            speed and total as a postfix.
    """
    if not chunks:
        return

    chunk_total = len(chunks)
    batch_starts = range(0, chunk_total, BATCH_SIZE)

    if self.dry_run:
        # In dry-run mode, simulate the import
        print(f"\n[DRY-RUN] Would import {len(chunks)} chunks to collection: {collection_name}")

        for start in batch_starts:
            simulated = min(BATCH_SIZE, chunk_total - start)
            self.stats['api_calls'] += 1
            self.stats['embeddings_generated'] += simulated
            self.total_imported += simulated
            self.stats['estimated_cost'] += self._estimate_cost(simulated)

        return

    # Sub-progress bar for the chunks of this file
    upload_bar = tqdm(
        total=chunk_total,
        desc="šŸ“¦ Uploading chunks",
        unit="chunk",
        leave=False,
        position=2
    )

    for start in batch_starts:
        batch = chunks[start:start + BATCH_SIZE]
        texts = [item['text'] for item in batch]

        try:
            upload_bar.set_description("šŸ¤– Generating embeddings...")
            vectors = self._generate_embeddings(texts, upload_bar)

            # One point per chunk; stored text is limited to 2000 characters
            points = [
                PointStruct(
                    id=item['id'],
                    vector=vector,
                    payload={
                        'text': item['text'][:2000],
                        **item['metadata']
                    }
                )
                for item, vector in zip(batch, vectors)
            ]

            upload_bar.set_description("ā¬†ļø Uploading to Qdrant...")
            self.qdrant_client.upsert(
                collection_name=collection_name,
                points=points
            )

            self.total_imported += len(points)
            upload_bar.update(len(points))

            # Show the running speed on the per-file bar
            elapsed = time.time() - self.start_time
            if elapsed > 0:
                speed = self.total_imported / elapsed
            else:
                speed = 0
            file_progress.set_postfix({
                'chunks/s': f"{speed:.1f}",
                'total': self.total_imported
            })

            # Pause between batches (not after the last) to respect the rate limit
            if start + BATCH_SIZE < chunk_total:
                upload_bar.set_description(f"ā³ Rate limit delay ({RATE_LIMIT_DELAY}s)...")
                time.sleep(RATE_LIMIT_DELAY)

        except Exception as e:
            upload_bar.set_description(f"āŒ Error: {str(e)[:30]}...")
            self.total_errors += 1
            # Continue with next batch instead of failing completely

    upload_bar.close()
484
-
485
def import_project(self, project_path: str, project_progress: tqdm = None) -> int:
    """Import all JSONL files in a project directory.

    Files already recorded in ``self.state["projects"]`` are skipped. A
    file is recorded as imported only when every one of its upload batches
    succeeded, so a file with failed batches is retried on the next run.

    Args:
        project_path: Directory containing the project's ``.jsonl`` files.
        project_progress: Accepted for call compatibility; not used here.

    Returns:
        The number of chunks created for the files handled in this call.
    """
    # normpath drops a trailing separator, which would otherwise make
    # basename() return an empty project name.
    project_name = os.path.basename(os.path.normpath(project_path))
    collection_name = self._get_collection_name(project_name)

    # Ensure collection exists
    self._ensure_collection(collection_name)

    # Get list of JSONL files, skipping already imported ones
    already_imported = self.state["projects"].get(project_name, [])
    jsonl_files = []
    for file in os.listdir(project_path):
        if not file.endswith('.jsonl'):
            continue
        file_path = os.path.join(project_path, file)
        if file_path in already_imported:
            continue
        jsonl_files.append(file_path)

    if not jsonl_files:
        return 0

    # Create file progress bar
    file_progress = tqdm(
        total=len(jsonl_files),
        desc=f"šŸ“ {project_name}",
        unit="file",
        leave=False,
        position=1
    )

    project_total = 0
    try:
        for file_path in sorted(jsonl_files):
            file_name = os.path.basename(file_path)
            file_progress.set_description(f"šŸ“ {project_name}/{file_name[:20]}...")

            # Extract messages
            messages = self._process_jsonl_file(file_path, file_progress)
            if not messages:
                file_progress.update(1)
                continue

            # Create chunks
            chunks = self._create_conversation_chunks(messages, file_progress)

            # Import to Qdrant; batch failures are counted in
            # self.total_errors rather than raised, so compare the counter.
            errors_before = self.total_errors
            self._import_chunks_to_qdrant(chunks, collection_name, file_progress)
            upload_ok = self.total_errors == errors_before

            # Mark file as imported (only in non-dry-run mode, and only
            # when nothing failed, so partial uploads are retried later)
            if not self.dry_run:
                if upload_ok:
                    self.state["projects"].setdefault(project_name, []).append(file_path)

                    # Save state after each file
                    self._save_state()
                else:
                    logger.warning(
                        f"Some chunks of {file_path} failed to upload; "
                        "the file will be retried on the next run"
                    )

            project_total += len(chunks)
            self.stats['files_processed'] += 1

            file_progress.update(1)
    finally:
        # Always release the progress bar, even if a file raised
        file_progress.close()

    return project_total
551
-
552
def validate_setup(self):
    """Validate the entire setup before import.

    Runs five checks and prints one line per result: Voyage API key
    (a one-text test embedding), Qdrant connection, presence of
    conversation files under LOGS_DIR, JSONL format of one sample file,
    and free disk space on "/" (more than 1 GB).

    Returns:
        True when every check passed, False otherwise.
    """
    print("šŸ” Validating setup...")
    print("=" * 60)

    validations = {
        "API Key": False,
        "Qdrant Connection": False,
        "Claude Logs": False,
        "File Format": False,
        "Disk Space": False
    }

    # Check API key
    if self.voyage_available:
        try:
            # Test with a single embedding
            response = requests.post(
                VOYAGE_API_URL,
                headers=self.voyage_headers,
                json={
                    "input": ["test"],
                    "model": EMBEDDING_MODEL,
                    "input_type": "document"
                },
                # A validation run must not hang on a stalled connection
                timeout=30
            )
            if response.status_code == 200:
                validations["API Key"] = True
                print("āœ… Voyage API key is valid")
            else:
                print(f"āŒ Voyage API key test failed: {response.status_code}")
        except Exception as e:
            print(f"āŒ Voyage API connection failed: {e}")
    else:
        print("āš ļø No API key configured")

    # Check Qdrant
    if getattr(self, 'qdrant_client', None):
        try:
            collections = self.qdrant_client.get_collections()
            validations["Qdrant Connection"] = True
            print(f"āœ… Qdrant is accessible ({len(collections.collections)} collections)")
        except Exception as e:
            print(f"āŒ Qdrant connection failed: {e}")
    else:
        print("āš ļø Qdrant client not initialized")

    # Check Claude logs
    projects_dir = LOGS_DIR
    if os.path.exists(projects_dir):
        total_files, _, _ = self._count_total_work()
        if total_files > 0:
            validations["Claude Logs"] = True
            print(f"āœ… Found {total_files} conversation files")
        else:
            print("āš ļø No conversation files found")
    else:
        print(f"āŒ Claude logs directory not found: {projects_dir}")

    # Validate file format: stop at the first file that yields messages
    if validations["Claude Logs"]:
        sample_validated = False
        for project in os.listdir(projects_dir):
            project_path = os.path.join(projects_dir, project)
            if os.path.isdir(project_path):
                for file in os.listdir(project_path):
                    if file.endswith('.jsonl'):
                        file_path = os.path.join(project_path, file)
                        try:
                            messages = self._process_jsonl_file(file_path)
                            if messages:
                                validations["File Format"] = True
                                sample_validated = True
                                print(f"āœ… JSONL format validated ({len(messages)} messages in sample)")
                                break
                        except Exception as e:
                            # Give up on this project; the next one is still tried
                            print(f"āš ļø Sample file validation failed: {e}")
                            break
            if sample_validated:
                break

    # Check disk space
    try:
        import shutil
        stat = shutil.disk_usage("/")
        free_gb = stat.free / (1024 ** 3)
        if free_gb > 1:
            validations["Disk Space"] = True
            print(f"āœ… Sufficient disk space ({free_gb:.1f} GB free)")
        else:
            print(f"āš ļø Low disk space ({free_gb:.1f} GB free)")
    except Exception:
        print("āš ļø Could not check disk space")

    # Summary
    print("\n" + "=" * 60)
    all_valid = all(validations.values())
    if all_valid:
        print("āœ… All validations passed!")
    else:
        print("āš ļø Some validations failed or have warnings")
        print("\nFailed checks:")
        for check, passed in validations.items():
            if not passed:
                print(f" • {check}")

    return all_valid
659
-
660
def import_all(self):
    """Import every project under LOGS_DIR, reporting progress and a final summary.

    In validate-only mode this just runs validate_setup(). Otherwise it
    prints an up-front estimate, imports each non-hidden project directory
    in sorted order (a failing project is counted as an error and skipped),
    and finishes with statistics, optional sample previews, a collection
    summary and next-step hints.
    """
    if self.validate_only:
        # Only run validation
        self.validate_setup()
        return

    logs_root = LOGS_DIR
    if not os.path.exists(logs_root):
        print(f"āŒ Claude projects directory not found: {logs_root}")
        return

    # Count total work
    print("šŸ” Analyzing conversation history...")
    total_files, new_files, estimated_chunks = self._count_total_work()
    if new_files == 0:
        print("āœ… All conversations already imported!")
        return

    estimated_cost = self._estimate_cost(estimated_chunks)

    for summary_line in (
        "\nšŸ“Š Import Summary:",
        f" • Total files: {total_files}",
        f" • New files to import: {new_files}",
        f" • Estimated chunks: ~{estimated_chunks}",
        f" • Estimated cost: ${estimated_cost:.4f}",
        f" • Embedding model: {EMBEDDING_MODEL}",
        f" • Batch size: {BATCH_SIZE}",
    ):
        print(summary_line)

    if self.dry_run:
        print("\nšŸ” DRY-RUN MODE - No changes will be made")

    print("\nā³ Starting import...\n")

    # Non-hidden project directories
    projects = []
    for entry in os.listdir(logs_root):
        if entry.startswith('.'):
            continue
        if os.path.isdir(os.path.join(logs_root, entry)):
            projects.append(entry)

    # Main progress bar for projects
    overall = tqdm(
        total=len(projects),
        desc="šŸš€ Overall Progress",
        unit="project",
        position=0
    )

    # Import each project
    self.start_time = time.time()
    for project_name in sorted(projects):
        project_path = os.path.join(logs_root, project_name)

        try:
            self.import_project(project_path, overall)
            overall.update(1)

            # ETA extrapolated from the fraction of projects done so far
            elapsed = time.time() - self.start_time
            fraction_done = overall.n / len(projects)
            if fraction_done > 0:
                eta_str = humanize.naturaldelta((elapsed / fraction_done) - elapsed)
            else:
                eta_str = "calculating..."

            overall.set_postfix({
                'ETA': eta_str,
                'chunks': self.total_imported,
                'errors': self.total_errors
            })
        except Exception as e:
            overall.set_description(f"āŒ Error in {project_name}: {str(e)[:30]}...")
            self.total_errors += 1

    overall.close()

    # Final summary
    elapsed_time = time.time() - self.start_time
    banner = "=" * 60
    print("\n" + banner)
    print("āœ… Dry-Run Complete!" if self.dry_run else "āœ… Import Complete!")
    print(banner)

    print("\nšŸ“Š Final Statistics:")
    print(f" • Time elapsed: {humanize.naturaldelta(elapsed_time)}")

    if self.dry_run:
        print(f" • Projects to import: {len(projects)}")
    else:
        print(f" • Projects imported: {len(self.state['projects'])}/{len(projects)}")

    would_be = 'would be' if self.dry_run else ''
    print(f" • Files processed: {self.stats['files_processed']}")
    print(f" • Messages processed: {self.stats['messages_processed']:,}")
    print(f" • Chunks created: {self.stats['chunks_created']:,}")
    print(f" • Embeddings {would_be} generated: {self.stats['embeddings_generated']:,}")
    print(f" • Total chunks {would_be} imported: {self.total_imported:,}")
    print(f" • API calls {would_be} made: {self.stats['api_calls']:,}")
    print(f" • Data processed: {humanize.naturalsize(self.stats['bytes_processed'])}")

    if elapsed_time > 0:
        print(f" • Average speed: {self.total_imported/elapsed_time:.1f} chunks/second")

    if self.dry_run:
        print(f" • šŸ’° Estimated cost: ${self.stats['estimated_cost']:.4f}")

    if self.total_errors > 0:
        print(f" • āš ļø Errors encountered: {self.total_errors}")

    # Show sample chunks in preview mode
    samples = self.stats['sample_chunks']
    if self.preview and samples:
        print("\nšŸ“‹ Sample Chunks Preview:")
        for number, sample in enumerate(samples, 1):
            print(f"\n--- Sample {number} ---")
            print(f"Project: {sample['project']}")
            print(f"File: {sample['file']}")
            print(f"Messages: {sample['message_count']}")
            print(f"Preview:\n{sample['preview']}")

    # Show collection summary (non-dry-run only)
    if not self.dry_run and hasattr(self, 'qdrant_client') and self.qdrant_client:
        print("\nšŸ“¦ Collection Summary:")
        for col in self.qdrant_client.get_collections().collections:
            if not col.name.endswith("_voyage"):
                continue
            info = self.qdrant_client.get_collection(col.name)
            print(f" • {col.name}: {info.points_count:,} vectors")

    print("\nšŸ’” Next steps:")
    if self.dry_run:
        next_steps = (
            " 1. Review the statistics above",
            " 2. Run without --dry-run to perform actual import",
            " 3. Consider using --preview to see sample chunks",
        )
    else:
        next_steps = (
            " 1. Restart Claude Desktop to load the MCP server",
            " 2. Try searching: 'What did we discuss about X?'",
            " 3. Enable continuous import: docker compose --profile watch up -d",
        )
    for step in next_steps:
        print(step)
806
-
807
def main():
    """Main entry point.

    Parses the command line, builds the importer and either imports a single
    project directory or runs the full import. ``--validate-only`` always
    runs validation only, even when a project path is given. Ctrl-C is
    reported without a traceback; any other error is printed and re-raised.
    """
    parser = argparse.ArgumentParser(
        description="Import Claude conversation logs to Qdrant vector database",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Run in dry-run mode to see what would happen
python %(prog)s --dry-run

# Validate setup only
python %(prog)s --validate-only

# Dry-run with preview of sample chunks
python %(prog)s --dry-run --preview

# Import a specific project
python %(prog)s /path/to/project

# Import all projects (normal mode)
python %(prog)s
"""
    )

    parser.add_argument('project_path', nargs='?', help='Path to specific project to import')
    parser.add_argument('--dry-run', action='store_true',
                        help='Simulate import without making changes')
    parser.add_argument('--validate-only', action='store_true',
                        help='Only validate setup without importing')
    parser.add_argument('--preview', action='store_true',
                        help='Show sample chunks in dry-run mode')

    args = parser.parse_args()

    try:
        importer = EnhancedVoyageImporter(
            dry_run=args.dry_run,
            validate_only=args.validate_only,
            preview=args.preview
        )

        # A project path must not turn a validate-only run into a real
        # import, so validate-only always goes through import_all(),
        # which runs the validation and returns.
        if args.project_path and not args.validate_only:
            # Import specific project
            if os.path.exists(args.project_path):
                print(f"šŸ“ Importing single project: {os.path.basename(args.project_path)}")
                importer.import_project(args.project_path)
            else:
                print(f"āŒ Project path not found: {args.project_path}")
        else:
            # Import all projects (or validate only)
            importer.import_all()
    except KeyboardInterrupt:
        print("\n\nāš ļø Import interrupted by user")
        if not args.dry_run and not args.validate_only:
            print("Progress has been saved. Run again to continue where you left off.")
    except Exception as e:
        print(f"\nāŒ Fatal error: {e}")
        raise


if __name__ == "__main__":
    main()