claude-self-reflect 2.3.6 → 2.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/scripts/import-conversations-isolated.py +0 -311
- package/scripts/import-conversations-voyage-streaming.py +0 -368
- package/scripts/import-conversations-voyage.py +0 -430
- package/scripts/import-conversations.py +0 -240
- package/scripts/import-current-conversation.py +0 -39
- package/scripts/import-live-conversation.py +0 -154
- package/scripts/import-openai-enhanced.py +0 -867
- package/scripts/import-recent-only.py +0 -33
- package/scripts/import-single-project.py +0 -278
- package/scripts/import-watcher.py +0 -170
|
@@ -1,867 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
Import Claude conversation logs from JSONL files into Qdrant vector database using Voyage AI embeddings.
|
|
4
|
-
Enhanced version with detailed progress tracking, time estimates, and dry-run mode.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
import json
|
|
8
|
-
import os
|
|
9
|
-
import glob
|
|
10
|
-
import time
|
|
11
|
-
import hashlib
|
|
12
|
-
from datetime import datetime, timedelta
|
|
13
|
-
from typing import List, Dict, Any, Optional, Tuple
|
|
14
|
-
import logging
|
|
15
|
-
from qdrant_client import QdrantClient
|
|
16
|
-
from qdrant_client.models import VectorParams, Distance, PointStruct
|
|
17
|
-
import requests
|
|
18
|
-
import backoff
|
|
19
|
-
from tqdm import tqdm
|
|
20
|
-
import humanize
|
|
21
|
-
import sys
|
|
22
|
-
import argparse
|
|
23
|
-
|
|
24
|
-
# Configuration
|
|
25
|
-
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
|
|
26
|
-
LOGS_DIR = os.getenv("LOGS_DIR", "/logs")
|
|
27
|
-
STATE_FILE = os.getenv("STATE_FILE", "/config/imported-files.json")
|
|
28
|
-
VOYAGE_API_KEY = os.getenv("VOYAGE_KEY-2") or os.getenv("VOYAGE_KEY")
|
|
29
|
-
BATCH_SIZE = int(os.getenv("BATCH_SIZE", "20")) # Voyage supports batch embedding
|
|
30
|
-
CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "10")) # Can use larger chunks with 32k token limit
|
|
31
|
-
RATE_LIMIT_DELAY = 1 # 1 second between requests for paid account (60 RPM)
|
|
32
|
-
EMBEDDING_MODEL = "voyage-3.5-lite"
|
|
33
|
-
EMBEDDING_DIMENSIONS = 1024 # Voyage default dimensions
|
|
34
|
-
VOYAGE_API_URL = "https://api.voyageai.com/v1/embeddings"
|
|
35
|
-
|
|
36
|
-
# Set up logging (less verbose for progress mode)
|
|
37
|
-
logging.basicConfig(
|
|
38
|
-
level=logging.WARNING,
|
|
39
|
-
format='%(message)s'
|
|
40
|
-
)
|
|
41
|
-
logger = logging.getLogger(__name__)
|
|
42
|
-
|
|
43
|
-
class EnhancedVoyageImporter:
|
|
44
|
-
def __init__(self, dry_run=False, validate_only=False, preview=False):
    """Initialize the importer with Qdrant and Voyage AI.

    Args:
        dry_run: Simulate import without making changes.
        validate_only: Only validate setup and files.
        preview: Show sample chunks in dry-run mode.

    Raises:
        ValueError: If VOYAGE_KEY is unset in normal (non-dry-run) mode.
    """
    self.dry_run = dry_run
    self.validate_only = validate_only
    self.preview = preview

    if self.dry_run or self.validate_only:
        print(f"š Running in {'VALIDATE-ONLY' if self.validate_only else 'DRY-RUN'} mode...")
        print("=" * 60)

    # Validate API key: a missing key is fatal for a real import but only a
    # warning in simulation modes (voyage_available gates later API calls).
    if not VOYAGE_API_KEY:
        if self.dry_run or self.validate_only:
            print("ā ļø VOYAGE_KEY environment variable not set")
            self.voyage_available = False
        else:
            raise ValueError("VOYAGE_KEY environment variable not set")
    else:
        self.voyage_available = True

    print("š Initializing Claude-Self-Reflect Importer...")
    print("=" * 60)

    # Initialize clients (skip in validate-only mode).
    # NOTE: in validate-only mode self.qdrant_client is never assigned;
    # downstream code guards with hasattr() before using it.
    if not self.validate_only:
        try:
            self.qdrant_client = QdrantClient(url=QDRANT_URL, timeout=60)
            if not self.dry_run:
                # Test connection eagerly so a bad URL fails fast.
                self.qdrant_client.get_collections()
        except Exception as e:
            if self.dry_run:
                # Dry runs tolerate an unreachable Qdrant; record None.
                print(f"ā ļø Qdrant connection test failed: {e}")
                self.qdrant_client = None
            else:
                raise

    if self.voyage_available:
        # Static headers reused for every Voyage embeddings request.
        self.voyage_headers = {
            "Authorization": f"Bearer {VOYAGE_API_KEY}",
            "Content-Type": "application/json"
        }

    # Resume state: which files per project were already imported.
    self.state = self._load_state()
    self.total_imported = 0
    self.total_errors = 0
    self.start_time = time.time()

    # Statistics for progress tracking and the final summary report.
    self.stats = {
        'files_processed': 0,
        'total_files': 0,
        'chunks_created': 0,
        'embeddings_generated': 0,
        'messages_processed': 0,
        'bytes_processed': 0,
        'api_calls': 0,
        'estimated_cost': 0.0,
        'sample_chunks': []          # filled only when preview=True (max 3)
    }
111
|
-
def _load_state(self) -> Dict[str, Any]:
    """Load persisted import state from STATE_FILE, or return a fresh one.

    Handles a legacy on-disk layout ({"files": [...]}) by converting it to
    the current layout ({"projects": {name: [paths]}, ...}) on the fly.
    Any read/parse failure falls back to an empty state (re-import is safe
    because Qdrant upserts are idempotent by chunk id).
    """
    if os.path.exists(STATE_FILE):
        try:
            with open(STATE_FILE, 'r') as f:
                data = json.load(f)
            # Handle old format (files list) vs new format (projects dict)
            if 'files' in data and 'projects' not in data:
                # Convert old format to new format
                projects = {}
                for file_path in data.get('files', []):
                    # Extract project name from file path.
                    # Assumes paths look like /logs/<project>/<file>.jsonl,
                    # so index 2 is the project — TODO confirm against writer.
                    parts = file_path.split('/')
                    if len(parts) >= 3:
                        project_name = parts[2]
                        if project_name not in projects:
                            projects[project_name] = []
                        projects[project_name].append(file_path)
                return {
                    "projects": projects,
                    # Old format used camelCase 'lastUpdated'.
                    "last_updated": data.get('lastUpdated'),
                    "total_imported": len(data.get('files', []))
                }
            # New format: already in the expected shape.
            return data
        except Exception as e:
            print(f"ā ļø Failed to load state: {e}")

    # No state file (or unreadable): start from scratch.
    return {
        "projects": {},
        "last_updated": None,
        "total_imported": 0
    }
|
|
145
|
-
def _save_state(self):
    """Persist the current import state to STATE_FILE as pretty-printed JSON.

    Failures are logged rather than raised so a transient write error
    does not abort a long-running import.
    """
    try:
        state_dir = os.path.dirname(STATE_FILE)
        os.makedirs(state_dir, exist_ok=True)
        # Refresh the bookkeeping fields before writing.
        self.state["last_updated"] = datetime.now().isoformat()
        self.state["total_imported"] = self.total_imported

        with open(STATE_FILE, 'w') as state_fh:
            json.dump(self.state, state_fh, indent=2)
    except Exception as exc:
        logger.error(f"Failed to save state: {exc}")
|
|
157
|
-
def _get_collection_name(self, project_name: str) -> str:
|
|
158
|
-
"""Generate collection name for project with Voyage suffix."""
|
|
159
|
-
project_hash = hashlib.md5(project_name.encode()).hexdigest()[:8]
|
|
160
|
-
return f"conv_{project_hash}_voyage"
|
|
161
|
-
|
|
162
|
-
def _ensure_collection(self, collection_name: str):
    """Ensure the collection exists with the Voyage embedding dimensions.

    Creates the collection (cosine distance) when missing; otherwise
    verifies the stored vector size matches EMBEDDING_DIMENSIONS.

    Raises:
        ValueError: If an existing collection has mismatched dimensions.
    """
    if self.dry_run:
        # In dry-run mode, just log what would happen
        print(f"[DRY-RUN] Would ensure collection: {collection_name}")
        return

    collections = [col.name for col in self.qdrant_client.get_collections().collections]

    if collection_name not in collections:
        self.qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(
                size=EMBEDDING_DIMENSIONS,
                distance=Distance.COSINE
            )
        )
    else:
        # Verify dimensions — a mismatch would silently corrupt search,
        # so fail loudly instead of upserting incompatible vectors.
        info = self.qdrant_client.get_collection(collection_name)
        if info.config.params.vectors.size != EMBEDDING_DIMENSIONS:
            raise ValueError(f"Dimension mismatch in collection {collection_name}")
|
|
185
|
-
def _count_total_work(self) -> Tuple[int, int, int]:
    """Count conversation files and estimate how many chunks need importing.

    Scans LOGS_DIR one level deep (one subdirectory per project, hidden
    directories skipped) for ``*.jsonl`` files.

    Returns:
        (total_files, new_files, estimated_chunks) — new_files excludes
        files already recorded in self.state["projects"]; estimated_chunks
        is a rough size-based guess (~1 chunk per 10KB) for new files only.
    """
    total_files = 0
    new_files = 0
    estimated_chunks = 0

    projects_dir = LOGS_DIR
    if not os.path.exists(projects_dir):
        return 0, 0, 0

    # Count all JSONL files
    for project_name in os.listdir(projects_dir):
        project_path = os.path.join(projects_dir, project_name)
        if os.path.isdir(project_path) and not project_name.startswith('.'):
            for file in os.listdir(project_path):
                if file.endswith('.jsonl'):
                    total_files += 1
                    file_path = os.path.join(project_path, file)

                    # Check if already imported
                    if not (project_name in self.state["projects"] and
                            file_path in self.state["projects"][project_name]):
                        new_files += 1

                        # Estimate chunks based on file size
                        try:
                            file_size = os.path.getsize(file_path)
                            # Rough estimate: 1 chunk per 10KB
                            estimated_chunks += max(1, file_size // 10240)
                        except OSError:
                            # File vanished or is unreadable between listdir
                            # and stat; fall back to a flat guess. (Was a
                            # bare except:, which also swallowed
                            # KeyboardInterrupt/SystemExit.)
                            estimated_chunks += 5  # Default estimate

    return total_files, new_files, estimated_chunks
|
|
221
|
-
def _estimate_cost(self, text_count: int) -> float:
|
|
222
|
-
"""Estimate API cost for embeddings.
|
|
223
|
-
|
|
224
|
-
Voyage AI pricing (as of 2024):
|
|
225
|
-
- voyage-3.5-lite: $0.02 per 1M tokens
|
|
226
|
-
- Estimated 500 tokens per chunk average
|
|
227
|
-
"""
|
|
228
|
-
estimated_tokens = text_count * 500 # Average tokens per chunk
|
|
229
|
-
cost_per_million = 0.02 # $0.02 per 1M tokens for voyage-3.5-lite
|
|
230
|
-
return (estimated_tokens / 1_000_000) * cost_per_million
|
|
231
|
-
|
|
232
|
-
@backoff.on_exception(
    backoff.expo,
    Exception,
    max_tries=5,
    on_backoff=lambda details: None  # Silent backoff — progress bars own the terminal
)
def _generate_embeddings(self, texts: List[str], progress_bar=None) -> List[List[float]]:
    """Generate embeddings for *texts* via the Voyage AI API with retries.

    In dry-run mode no network call is made: stats are updated and
    zero-vectors of the right dimensionality are returned instead.

    Raises:
        Exception: On a non-200 API response (re-raised so the backoff
            decorator retries with exponential delay, up to 5 tries).
    """
    if self.dry_run:
        # In dry-run mode, simulate embeddings
        if progress_bar:
            progress_bar.set_description("[DRY-RUN] Simulating embeddings...")

        # Update cost estimation
        self.stats['estimated_cost'] += self._estimate_cost(len(texts))
        self.stats['api_calls'] += 1
        self.stats['embeddings_generated'] += len(texts)

        # Return fake embeddings
        return [[0.0] * EMBEDDING_DIMENSIONS for _ in texts]

    try:
        if progress_bar:
            progress_bar.set_description("š¤ Generating embeddings...")

        response = requests.post(
            VOYAGE_API_URL,
            headers=self.voyage_headers,
            json={
                "input": texts,
                "model": EMBEDDING_MODEL,
                "input_type": "document"
            }
        )

        # NOTE(review): api_calls is incremented on every attempt, so
        # backoff retries inflate this counter — confirm that is intended.
        self.stats['api_calls'] += 1

        if response.status_code != 200:
            raise Exception(f"Voyage API error: {response.status_code} - {response.text}")

        data = response.json()
        embeddings = [item["embedding"] for item in data["data"]]
        self.stats['embeddings_generated'] += len(embeddings)

        return embeddings
    except Exception as e:
        if progress_bar:
            progress_bar.set_description(f"ā Embedding error: {str(e)[:30]}...")
        # Re-raise so @backoff retries; callers see the final failure.
        raise
|
|
282
|
-
def _process_jsonl_file(self, file_path: str, progress_bar=None) -> List[Dict[str, Any]]:
    """Extract chat messages from one JSONL conversation log.

    Each valid line contributes a dict with role, content, file_path,
    line_number, and timestamp. Invalid JSON lines and lines without a
    usable message are skipped silently; a whole-file read failure is
    logged and yields an empty list.
    """
    messages = []
    file_size = os.path.getsize(file_path)
    self.stats['bytes_processed'] += file_size

    if progress_bar:
        progress_bar.set_description(f"š Reading {os.path.basename(file_path)[:30]}...")

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue

                try:
                    data = json.loads(line)

                    # Extract message if present
                    if 'message' in data and data['message']:
                        msg = data['message']
                        if msg.get('role') and msg.get('content'):
                            content = msg['content']
                            # Dict-form content is flattened to its 'text'
                            # field (or its JSON dump). NOTE(review):
                            # list-form content passes through unchanged —
                            # presumably stringified later; verify.
                            if isinstance(content, dict):
                                content = content.get('text', json.dumps(content))

                            messages.append({
                                'role': msg['role'],
                                'content': content,
                                'file_path': file_path,
                                'line_number': line_num,
                                # Fall back to "now" when the log line
                                # carries no timestamp.
                                'timestamp': data.get('timestamp', datetime.now().isoformat())
                            })
                            self.stats['messages_processed'] += 1
                except json.JSONDecodeError:
                    pass  # Skip invalid JSON
                except Exception as e:
                    logger.debug(f"Error processing line {line_num}: {e}")

    except Exception as e:
        logger.error(f"Failed to read file {file_path}: {e}")

    return messages
|
|
327
|
-
def _create_conversation_chunks(self, messages: List[Dict[str, Any]], progress_bar=None) -> List[Dict[str, Any]]:
    """Group messages into conversation chunks for embedding.

    Messages are windowed CHUNK_SIZE at a time; each window becomes one
    chunk: {'id', 'text', 'metadata'}. Text is "ROLE: content" sections
    joined by blank lines, capped at 100k chars (Voyage's 32k-token limit
    makes large chunks viable). Chunk ids are deterministic MD5s of
    project/conversation/offset, so re-imports upsert the same points.
    """
    chunks = []

    if progress_bar:
        progress_bar.set_description("āļø Creating conversation chunks...")

    for i in range(0, len(messages), CHUNK_SIZE):
        chunk_messages = messages[i:i + CHUNK_SIZE]

        # Create conversation text - Voyage supports 32k tokens
        conversation_parts = []
        total_chars = 0
        max_chars = 100000  # Much larger limit with Voyage!

        for msg in chunk_messages:
            role = msg['role'].upper()
            content = msg['content']

            # Only truncate extremely long messages: keep the head and
            # tail so both the question and the conclusion survive.
            if len(content) > 20000:
                content = content[:15000] + "\n\n[... truncated ...]\n\n" + content[-5000:]

            part = f"{role}: {content}"

            # Check if adding this would exceed limit; if so, include a
            # truncated tail only when meaningful room (>1000 chars) is
            # left, then stop consuming messages for this chunk.
            if total_chars + len(part) > max_chars:
                remaining = max_chars - total_chars
                if remaining > 1000:
                    part = f"{role}: {content[:remaining-100]}..."
                    conversation_parts.append(part)
                break

            conversation_parts.append(part)
            total_chars += len(part) + 2  # +2 for the joining blank line

        conversation_text = "\n\n".join(conversation_parts)

        # Extract metadata from the first message's path:
        # parent dir = project, file stem = conversation id.
        project_name = os.path.basename(os.path.dirname(chunk_messages[0]['file_path']))
        conversation_id = os.path.basename(chunk_messages[0]['file_path']).replace('.jsonl', '')

        # Generate unique, deterministic ID (stable across re-runs).
        chunk_id = hashlib.md5(
            f"{project_name}_{conversation_id}_{i}".encode()
        ).hexdigest()

        chunk_data = {
            'id': chunk_id,
            'text': conversation_text,
            'metadata': {
                'project': project_name,
                'conversation_id': conversation_id,
                'chunk_index': i // CHUNK_SIZE,
                'message_count': len(chunk_messages),
                'start_role': chunk_messages[0]['role'],
                'timestamp': chunk_messages[0]['timestamp'],
                'file': os.path.basename(chunk_messages[0]['file_path'])
            }
        }

        chunks.append(chunk_data)

        # Store sample chunks for preview (first 3 only)
        if self.preview and len(self.stats['sample_chunks']) < 3:
            self.stats['sample_chunks'].append({
                'project': project_name,
                'file': os.path.basename(chunk_messages[0]['file_path']),
                'preview': conversation_text[:500] + '...' if len(conversation_text) > 500 else conversation_text,
                'message_count': len(chunk_messages)
            })

    self.stats['chunks_created'] += len(chunks)
    return chunks
|
|
402
|
-
def _import_chunks_to_qdrant(self, chunks: List[Dict[str, Any]], collection_name: str, file_progress: tqdm):
    """Embed chunks in batches via Voyage and upsert them into Qdrant.

    In dry-run mode only the counters/cost estimate are updated. A failed
    batch is counted in total_errors and skipped so one bad batch does not
    abort the whole file. Sleeps RATE_LIMIT_DELAY between batches to stay
    under the Voyage rate limit.
    """
    if not chunks:
        return

    if self.dry_run:
        # In dry-run mode, simulate the import
        print(f"\n[DRY-RUN] Would import {len(chunks)} chunks to collection: {collection_name}")

        # Simulate progress batch by batch so counters match a real run.
        for i in range(0, len(chunks), BATCH_SIZE):
            batch_size = min(BATCH_SIZE, len(chunks) - i)
            self.stats['api_calls'] += 1
            self.stats['embeddings_generated'] += batch_size
            self.total_imported += batch_size

            # Estimate cost
            self.stats['estimated_cost'] += self._estimate_cost(batch_size)

        return

    # Create sub-progress bar for chunks (nested under the file bar).
    chunk_progress = tqdm(
        total=len(chunks),
        desc="š¦ Uploading chunks",
        unit="chunk",
        leave=False,
        position=2
    )

    # Process in batches
    for i in range(0, len(chunks), BATCH_SIZE):
        batch = chunks[i:i + BATCH_SIZE]
        texts = [chunk['text'] for chunk in batch]

        try:
            # Generate embeddings
            chunk_progress.set_description("š¤ Generating embeddings...")
            embeddings = self._generate_embeddings(texts, chunk_progress)

            # Create points: vector + payload (metadata and a truncated
            # copy of the text for display in search results).
            points = []
            for chunk, embedding in zip(batch, embeddings):
                point = PointStruct(
                    id=chunk['id'],
                    vector=embedding,
                    payload={
                        'text': chunk['text'][:2000],  # Limit text size
                        **chunk['metadata']
                    }
                )
                points.append(point)

            # Upload to Qdrant (upsert: re-runs overwrite, not duplicate)
            chunk_progress.set_description("ā¬ļø Uploading to Qdrant...")
            self.qdrant_client.upsert(
                collection_name=collection_name,
                points=points
            )

            self.total_imported += len(points)
            chunk_progress.update(len(points))

            # Update speed in main progress
            elapsed = time.time() - self.start_time
            speed = self.total_imported / elapsed if elapsed > 0 else 0
            file_progress.set_postfix({
                'chunks/s': f"{speed:.1f}",
                'total': self.total_imported
            })

            # Add delay to respect rate limit (skip after the last batch)
            if i + BATCH_SIZE < len(chunks):
                chunk_progress.set_description(f"ā³ Rate limit delay ({RATE_LIMIT_DELAY}s)...")
                time.sleep(RATE_LIMIT_DELAY)

        except Exception as e:
            chunk_progress.set_description(f"ā Error: {str(e)[:30]}...")
            self.total_errors += 1
            # Continue with next batch instead of failing completely

    chunk_progress.close()
|
|
485
|
-
def import_project(self, project_path: str, project_progress: tqdm = None) -> int:
    """Import every not-yet-imported JSONL file in one project directory.

    Args:
        project_path: Directory containing the project's ``*.jsonl`` logs.
        project_progress: Optional outer progress bar (currently unused
            here; postfix updates happen via the file-level bar).

    Returns:
        Number of chunks produced for this project in this run.
    """
    project_name = os.path.basename(project_path)
    collection_name = self._get_collection_name(project_name)

    # Ensure collection exists
    self._ensure_collection(collection_name)

    # Get list of JSONL files
    jsonl_files = []
    for file in os.listdir(project_path):
        if file.endswith('.jsonl'):
            file_path = os.path.join(project_path, file)

            # Skip already imported files (resume support)
            if (project_name in self.state["projects"] and
                    file_path in self.state["projects"][project_name]):
                continue

            jsonl_files.append(file_path)

    if not jsonl_files:
        return 0

    # Create file progress bar
    file_progress = tqdm(
        total=len(jsonl_files),
        desc=f"š {project_name}",
        unit="file",
        leave=False,
        position=1
    )

    project_total = 0
    for file_path in sorted(jsonl_files):
        file_name = os.path.basename(file_path)
        file_progress.set_description(f"š {project_name}/{file_name[:20]}...")

        # Extract messages
        messages = self._process_jsonl_file(file_path, file_progress)
        if not messages:
            # Empty/unreadable file: count it as done but record nothing,
            # so it will be retried on the next run.
            file_progress.update(1)
            continue

        # Create chunks
        chunks = self._create_conversation_chunks(messages, file_progress)

        # Import to Qdrant
        self._import_chunks_to_qdrant(chunks, collection_name, file_progress)

        # Mark file as imported (only in non-dry-run mode)
        if not self.dry_run:
            if project_name not in self.state["projects"]:
                self.state["projects"][project_name] = []
            self.state["projects"][project_name].append(file_path)

            # Save state after each file so an interrupt loses at most
            # one file's worth of progress.
            self._save_state()

        project_total += len(chunks)
        self.stats['files_processed'] += 1

        file_progress.update(1)

    file_progress.close()
    return project_total
|
|
552
|
-
def validate_setup(self):
    """Run pre-flight checks and print a pass/fail report.

    Checks: Voyage API key (live test call), Qdrant reachability, presence
    of conversation logs, a sample-file format parse, and free disk space.

    Returns:
        True when every check passed, False otherwise.
    """
    print("š Validating setup...")
    print("=" * 60)

    validations = {
        "API Key": False,
        "Qdrant Connection": False,
        "Claude Logs": False,
        "File Format": False,
        "Disk Space": False
    }

    # Check API key with a real (tiny) embedding request.
    if self.voyage_available:
        try:
            # Test with a single embedding
            response = requests.post(
                VOYAGE_API_URL,
                headers=self.voyage_headers,
                json={
                    "input": ["test"],
                    "model": EMBEDDING_MODEL,
                    "input_type": "document"
                }
            )
            if response.status_code == 200:
                validations["API Key"] = True
                print("ā Voyage API key is valid")
            else:
                print(f"ā Voyage API key test failed: {response.status_code}")
        except Exception as e:
            print(f"ā Voyage API connection failed: {e}")
    else:
        print("ā ļø No API key configured")

    # Check Qdrant (client may be absent in validate-only mode).
    if hasattr(self, 'qdrant_client') and self.qdrant_client:
        try:
            collections = self.qdrant_client.get_collections()
            validations["Qdrant Connection"] = True
            print(f"ā Qdrant is accessible ({len(collections.collections)} collections)")
        except Exception as e:
            print(f"ā Qdrant connection failed: {e}")
    else:
        print("ā ļø Qdrant client not initialized")

    # Check Claude logs
    projects_dir = LOGS_DIR
    if os.path.exists(projects_dir):
        total_files, _, _ = self._count_total_work()
        if total_files > 0:
            validations["Claude Logs"] = True
            print(f"ā Found {total_files} conversation files")
        else:
            print("ā ļø No conversation files found")
    else:
        print(f"ā Claude logs directory not found: {projects_dir}")

    # Validate file format by parsing the first JSONL file found.
    if validations["Claude Logs"]:
        sample_validated = False
        for project in os.listdir(projects_dir):
            project_path = os.path.join(projects_dir, project)
            if os.path.isdir(project_path):
                for file in os.listdir(project_path):
                    if file.endswith('.jsonl'):
                        file_path = os.path.join(project_path, file)
                        try:
                            messages = self._process_jsonl_file(file_path)
                            if messages:
                                validations["File Format"] = True
                                sample_validated = True
                                print(f"ā JSONL format validated ({len(messages)} messages in sample)")
                            break
                        except Exception as e:
                            print(f"ā ļø Sample file validation failed: {e}")
                            break
            if sample_validated:
                break

    # Check disk space (need at least ~1 GB free to be comfortable)
    try:
        import shutil
        stat = shutil.disk_usage("/")
        free_gb = stat.free / (1024 ** 3)
        if free_gb > 1:
            validations["Disk Space"] = True
            print(f"ā Sufficient disk space ({free_gb:.1f} GB free)")
        else:
            print(f"ā ļø Low disk space ({free_gb:.1f} GB free)")
    except Exception:
        print("ā ļø Could not check disk space")

    # Summary
    print("\n" + "=" * 60)
    all_valid = all(validations.values())
    if all_valid:
        print("ā All validations passed!")
    else:
        print("ā ļø Some validations failed or have warnings")
        print("\nFailed checks:")
        for check, passed in validations.items():
            if not passed:
                print(f"  ⢠{check}")

    return all_valid
|
|
660
|
-
def import_all(self):
    """Import every project under LOGS_DIR with progress, ETA, and a report.

    In validate-only mode delegates to validate_setup() and returns.
    Otherwise counts pending work, prints an upfront summary (including
    an embedding-cost estimate), imports project by project (errors are
    counted and skipped, not fatal), and prints final statistics.
    """
    if self.validate_only:
        # Only run validation
        self.validate_setup()
        return

    projects_dir = LOGS_DIR

    if not os.path.exists(projects_dir):
        print(f"ā Claude projects directory not found: {projects_dir}")
        return

    # Count total work
    print("š Analyzing conversation history...")
    total_files, new_files, estimated_chunks = self._count_total_work()

    if new_files == 0:
        print("ā All conversations already imported!")
        return

    # Calculate estimated cost
    estimated_cost = self._estimate_cost(estimated_chunks)

    print(f"\nš Import Summary:")
    print(f"  ⢠Total files: {total_files}")
    print(f"  ⢠New files to import: {new_files}")
    print(f"  ⢠Estimated chunks: ~{estimated_chunks}")
    print(f"  ⢠Estimated cost: ${estimated_cost:.4f}")
    print(f"  ⢠Embedding model: {EMBEDDING_MODEL}")
    print(f"  ⢠Batch size: {BATCH_SIZE}")

    if self.dry_run:
        print(f"\nš DRY-RUN MODE - No changes will be made")

    print(f"\nā³ Starting import...\n")

    # Get list of projects (one subdirectory each; hidden dirs skipped)
    projects = [
        d for d in os.listdir(projects_dir)
        if os.path.isdir(os.path.join(projects_dir, d)) and not d.startswith('.')
    ]

    # Main progress bar for projects
    project_progress = tqdm(
        total=len(projects),
        desc="š Overall Progress",
        unit="project",
        position=0
    )

    # Import each project
    self.start_time = time.time()
    for project_name in sorted(projects):
        project_path = os.path.join(projects_dir, project_name)

        try:
            # NOTE(review): the per-project chunk count is returned but
            # not used here — totals come from self.total_imported.
            count = self.import_project(project_path, project_progress)

            # Update progress
            project_progress.update(1)

            # Calculate and display ETA from fraction of projects done.
            elapsed = time.time() - self.start_time
            progress_pct = (project_progress.n / len(projects))
            if progress_pct > 0:
                eta_seconds = (elapsed / progress_pct) - elapsed
                eta_str = humanize.naturaldelta(eta_seconds)
            else:
                eta_str = "calculating..."

            project_progress.set_postfix({
                'ETA': eta_str,
                'chunks': self.total_imported,
                'errors': self.total_errors
            })

        except Exception as e:
            # One bad project should not stop the rest.
            project_progress.set_description(f"ā Error in {project_name}: {str(e)[:30]}...")
            self.total_errors += 1
            continue

    project_progress.close()

    # Final summary
    elapsed_time = time.time() - self.start_time
    print("\n" + "=" * 60)

    if self.dry_run:
        print("ā Dry-Run Complete!")
    else:
        print("ā Import Complete!")

    print("=" * 60)
    print(f"\nš Final Statistics:")
    print(f"  ⢠Time elapsed: {humanize.naturaldelta(elapsed_time)}")

    if self.dry_run:
        print(f"  ⢠Projects to import: {len(projects)}")
    else:
        print(f"  ⢠Projects imported: {len(self.state['projects'])}/{len(projects)}")

    print(f"  ⢠Files processed: {self.stats['files_processed']}")
    print(f"  ⢠Messages processed: {self.stats['messages_processed']:,}")
    print(f"  ⢠Chunks created: {self.stats['chunks_created']:,}")
    print(f"  ⢠Embeddings {'would be' if self.dry_run else ''} generated: {self.stats['embeddings_generated']:,}")
    print(f"  ⢠Total chunks {'would be' if self.dry_run else ''} imported: {self.total_imported:,}")
    print(f"  ⢠API calls {'would be' if self.dry_run else ''} made: {self.stats['api_calls']:,}")
    print(f"  ⢠Data processed: {humanize.naturalsize(self.stats['bytes_processed'])}")

    if elapsed_time > 0:
        print(f"  ⢠Average speed: {self.total_imported/elapsed_time:.1f} chunks/second")

    if self.dry_run:
        print(f"  ⢠š° Estimated cost: ${self.stats['estimated_cost']:.4f}")

    if self.total_errors > 0:
        print(f"  ⢠ā ļø Errors encountered: {self.total_errors}")

    # Show sample chunks in preview mode
    if self.preview and self.stats['sample_chunks']:
        print(f"\nš Sample Chunks Preview:")
        for i, sample in enumerate(self.stats['sample_chunks'], 1):
            print(f"\n--- Sample {i} ---")
            print(f"Project: {sample['project']}")
            print(f"File: {sample['file']}")
            print(f"Messages: {sample['message_count']}")
            print(f"Preview:\n{sample['preview']}")

    # Show collection summary (non-dry-run only; client may be None)
    if not self.dry_run and hasattr(self, 'qdrant_client') and self.qdrant_client:
        print(f"\nš¦ Collection Summary:")
        for col in self.qdrant_client.get_collections().collections:
            if col.name.endswith("_voyage"):
                info = self.qdrant_client.get_collection(col.name)
                print(f"  ⢠{col.name}: {info.points_count:,} vectors")

    print(f"\nš” Next steps:")
    if self.dry_run:
        print(f"  1. Review the statistics above")
        print(f"  2. Run without --dry-run to perform actual import")
        print(f"  3. Consider using --preview to see sample chunks")
    else:
        print(f"  1. Restart Claude Desktop to load the MCP server")
        print(f"  2. Try searching: 'What did we discuss about X?'")
        print(f"  3. Enable continuous import: docker compose --profile watch up -d")
|
|
807
|
-
def main():
    """CLI entry point: parse arguments and run the requested import mode.

    Modes: --validate-only (checks only), --dry-run [--preview]
    (simulate + cost estimate), positional project_path (single project),
    or no arguments (import everything). Ctrl-C is handled gracefully
    because per-file state saves make the import resumable.
    """
    parser = argparse.ArgumentParser(
        description="Import Claude conversation logs to Qdrant vector database",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run in dry-run mode to see what would happen
  python %(prog)s --dry-run

  # Validate setup only
  python %(prog)s --validate-only

  # Dry-run with preview of sample chunks
  python %(prog)s --dry-run --preview

  # Import a specific project
  python %(prog)s /path/to/project

  # Import all projects (normal mode)
  python %(prog)s
"""
    )

    parser.add_argument('project_path', nargs='?', help='Path to specific project to import')
    parser.add_argument('--dry-run', action='store_true',
                        help='Simulate import without making changes')
    parser.add_argument('--validate-only', action='store_true',
                        help='Only validate setup without importing')
    parser.add_argument('--preview', action='store_true',
                        help='Show sample chunks in dry-run mode')

    args = parser.parse_args()

    try:
        importer = EnhancedVoyageImporter(
            dry_run=args.dry_run,
            validate_only=args.validate_only,
            preview=args.preview
        )

        if args.project_path:
            # Import specific project
            if os.path.exists(args.project_path):
                print(f"š Importing single project: {os.path.basename(args.project_path)}")
                importer.import_project(args.project_path)
            else:
                print(f"ā Project path not found: {args.project_path}")
        else:
            # Import all projects
            importer.import_all()
    except KeyboardInterrupt:
        # State is saved after every file, so a rerun resumes cleanly.
        print("\n\nā ļø Import interrupted by user")
        if not args.dry_run and not args.validate_only:
            print("Progress has been saved. Run again to continue where you left off.")
    except Exception as e:
        # Print a friendly line, then re-raise for a full traceback.
        print(f"\nā Fatal error: {e}")
        raise


if __name__ == "__main__":
    main()