claude-self-reflect 2.3.6 → 2.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/scripts/import-conversations-isolated.py +0 -311
- package/scripts/import-conversations-voyage-streaming.py +0 -368
- package/scripts/import-conversations-voyage.py +0 -430
- package/scripts/import-conversations.py +0 -240
- package/scripts/import-current-conversation.py +0 -39
- package/scripts/import-live-conversation.py +0 -154
- package/scripts/import-openai-enhanced.py +0 -867
- package/scripts/import-recent-only.py +0 -33
- package/scripts/import-single-project.py +0 -278
- package/scripts/import-watcher.py +0 -170
--- package/scripts/import-conversations-voyage.py
+++ /dev/null
@@ -1,430 +0,0 @@
-#!/usr/bin/env python3
-"""
-Import Claude conversation logs from JSONL files into Qdrant vector database using Voyage AI embeddings.
-Clean implementation with 32k token context window.
-"""
-
-import json
-import os
-import glob
-import time
-import hashlib
-from datetime import datetime
-from typing import List, Dict, Any, Optional
-import logging
-from qdrant_client import QdrantClient
-from qdrant_client.models import VectorParams, Distance, PointStruct
-import requests
-import backoff
-
-# Configuration
-QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
-LOGS_DIR = os.getenv("LOGS_DIR", os.path.expanduser("~/.claude/projects"))
-STATE_FILE = os.getenv("STATE_FILE", os.path.expanduser("~/.claude-self-reflect/imported-files.json"))
-VOYAGE_API_KEY = os.getenv("VOYAGE_KEY-2") or os.getenv("VOYAGE_KEY")
-BATCH_SIZE = int(os.getenv("BATCH_SIZE", "50"))  # Voyage supports batch embedding
-CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "10"))  # Can use larger chunks with 32k token limit
-RATE_LIMIT_DELAY = 0.1  # 100ms between requests for faster imports
-EMBEDDING_MODEL = "voyage-3.5-lite"
-EMBEDDING_DIMENSIONS = 1024  # Voyage default dimensions
-VOYAGE_API_URL = "https://api.voyageai.com/v1/embeddings"
-
-# Set up logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(message)s'
-)
-logger = logging.getLogger(__name__)
-
-class VoyageConversationImporter:
-    def __init__(self):
-        """Initialize the importer with Qdrant and Voyage AI."""
-        if not VOYAGE_API_KEY:
-            raise ValueError("VOYAGE_KEY environment variable not set")
-
-        self.qdrant_client = QdrantClient(url=QDRANT_URL, timeout=60)
-        self.voyage_headers = {
-            "Authorization": f"Bearer {VOYAGE_API_KEY}",
-            "Content-Type": "application/json"
-        }
-        self.state = self._load_state()
-        self.total_imported = 0
-        self.total_errors = 0
-
-    def _load_state(self) -> Dict[str, Any]:
-        """Load or initialize state."""
-        if os.path.exists(STATE_FILE):
-            try:
-                with open(STATE_FILE, 'r') as f:
-                    data = json.load(f)
-                # Handle old format (files list) vs new format (projects dict)
-                if 'files' in data and 'projects' not in data:
-                    # Convert old format to new format
-                    projects = {}
-                    for file_path in data.get('files', []):
-                        # Extract project name from file path
-                        parts = file_path.split('/')
-                        if len(parts) >= 3:
-                            project_name = parts[2]
-                            if project_name not in projects:
-                                projects[project_name] = []
-                            projects[project_name].append(file_path)
-                    return {
-                        "projects": projects,
-                        "last_updated": data.get('lastUpdated'),
-                        "total_imported": len(data.get('files', []))
-                    }
-                # New format
-                return data
-            except Exception as e:
-                logger.error(f"Failed to load state: {e}")
-
-        return {
-            "projects": {},
-            "last_updated": None,
-            "total_imported": 0
-        }
-
-    def _save_state(self):
-        """Save current state to disk."""
-        try:
-            os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
-            self.state["last_updated"] = datetime.now().isoformat()
-            self.state["total_imported"] = self.total_imported
-
-            with open(STATE_FILE, 'w') as f:
-                json.dump(self.state, f, indent=2)
-        except Exception as e:
-            logger.error(f"Failed to save state: {e}")
-
-    def _get_collection_name(self, project_name: str) -> str:
-        """Generate collection name for project with Voyage suffix."""
-        project_hash = hashlib.md5(project_name.encode()).hexdigest()[:8]
-        return f"conv_{project_hash}_voyage"
-
-    def _ensure_collection(self, collection_name: str):
-        """Ensure collection exists with correct configuration for Voyage embeddings."""
-        collections = [col.name for col in self.qdrant_client.get_collections().collections]
-
-        if collection_name not in collections:
-            logger.info(f"Creating collection: {collection_name} with {EMBEDDING_DIMENSIONS} dimensions")
-            self.qdrant_client.create_collection(
-                collection_name=collection_name,
-                vectors_config=VectorParams(
-                    size=EMBEDDING_DIMENSIONS,
-                    distance=Distance.COSINE
-                )
-            )
-        else:
-            # Verify dimensions
-            info = self.qdrant_client.get_collection(collection_name)
-            if info.config.params.vectors.size != EMBEDDING_DIMENSIONS:
-                logger.error(f"Collection {collection_name} has wrong dimensions: {info.config.params.vectors.size}")
-                raise ValueError(f"Dimension mismatch in collection {collection_name}")
-
-    @backoff.on_exception(
-        backoff.expo,
-        Exception,
-        max_tries=5,
-        on_backoff=lambda details: logger.warning(f"Backing off {details['wait']}s after {details['tries']} tries")
-    )
-    def _generate_embeddings(self, texts: List[str]) -> List[List[float]]:
-        """Generate embeddings using Voyage AI API with retry logic."""
-        try:
-            response = requests.post(
-                VOYAGE_API_URL,
-                headers=self.voyage_headers,
-                json={
-                    "input": texts,
-                    "model": EMBEDDING_MODEL,
-                    "input_type": "document"  # For document embeddings
-                }
-            )
-
-            if response.status_code != 200:
-                raise Exception(f"Voyage API error: {response.status_code} - {response.text}")
-
-            data = response.json()
-            return [item["embedding"] for item in data["data"]]
-        except Exception as e:
-            logger.error(f"Voyage API error: {e}")
-            raise
-
-    def _process_jsonl_file(self, file_path: str) -> List[Dict[str, Any]]:
-        """Extract messages from a JSONL file."""
-        messages = []
-
-        try:
-            with open(file_path, 'r', encoding='utf-8') as f:
-                for line_num, line in enumerate(f, 1):
-                    line = line.strip()
-                    if not line:
-                        continue
-
-                    try:
-                        data = json.loads(line)
-
-                        # Extract message if present
-                        if 'message' in data and data['message']:
-                            msg = data['message']
-                            if msg.get('role') and msg.get('content'):
-                                content = msg['content']
-                                if isinstance(content, dict):
-                                    content = content.get('text', json.dumps(content))
-
-                                messages.append({
-                                    'role': msg['role'],
-                                    'content': content,
-                                    'file_path': file_path,
-                                    'line_number': line_num,
-                                    'timestamp': data.get('timestamp', datetime.now().isoformat())
-                                })
-                    except json.JSONDecodeError:
-                        logger.debug(f"Skipping invalid JSON at line {line_num}")
-                    except Exception as e:
-                        logger.error(f"Error processing line {line_num}: {e}")
-
-        except Exception as e:
-            logger.error(f"Failed to read file {file_path}: {e}")
-
-        return messages
-
-    def _create_conversation_chunks(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """Group messages into conversation chunks for better context."""
-        chunks = []
-
-        for i in range(0, len(messages), CHUNK_SIZE):
-            chunk_messages = messages[i:i + CHUNK_SIZE]
-
-            # Create conversation text - Voyage supports 32k tokens
-            # Rough estimate: ~4 chars per token, so ~128k chars max
-            # We'll use 100k chars to be safe
-            conversation_parts = []
-            total_chars = 0
-            max_chars = 100000  # Much larger limit with Voyage!
-
-            for msg in chunk_messages:
-                role = msg['role'].upper()
-                content = msg['content']
-
-                # Only truncate extremely long messages
-                if len(content) > 20000:
-                    # Keep first 15000 and last 5000 chars
-                    content = content[:15000] + "\n\n[... truncated ...]\n\n" + content[-5000:]
-
-                part = f"{role}: {content}"
-
-                # Check if adding this would exceed limit
-                if total_chars + len(part) > max_chars:
-                    # For the last message, try to fit what we can
-                    remaining = max_chars - total_chars
-                    if remaining > 1000:  # Only add if we can fit meaningful content
-                        part = f"{role}: {content[:remaining-100]}..."
-                        conversation_parts.append(part)
-                    break
-
-                conversation_parts.append(part)
-                total_chars += len(part) + 2  # +2 for newlines
-
-            conversation_text = "\n\n".join(conversation_parts)
-
-            # Extract metadata
-            project_name = os.path.basename(os.path.dirname(chunk_messages[0]['file_path']))
-            conversation_id = os.path.basename(chunk_messages[0]['file_path']).replace('.jsonl', '')
-
-            # Generate unique ID
-            chunk_id = hashlib.md5(
-                f"{project_name}_{conversation_id}_{i}".encode()
-            ).hexdigest()
-
-            chunks.append({
-                'id': chunk_id,
-                'text': conversation_text,
-                'metadata': {
-                    'project': project_name,
-                    'conversation_id': conversation_id,
-                    'chunk_index': i // CHUNK_SIZE,
-                    'message_count': len(chunk_messages),
-                    'start_role': chunk_messages[0]['role'],
-                    'timestamp': chunk_messages[0]['timestamp'],
-                    'file': os.path.basename(chunk_messages[0]['file_path'])
-                }
-            })
-
-        return chunks
-
-    def _import_chunks_to_qdrant(self, chunks: List[Dict[str, Any]], collection_name: str):
-        """Import conversation chunks to Qdrant with batched Voyage embeddings."""
-        if not chunks:
-            return
-
-        # Process in batches
-        for i in range(0, len(chunks), BATCH_SIZE):
-            batch = chunks[i:i + BATCH_SIZE]
-            texts = [chunk['text'] for chunk in batch]
-
-            try:
-                # Generate embeddings
-                embeddings = self._generate_embeddings(texts)
-
-                # Create points
-                points = []
-                for chunk, embedding in zip(batch, embeddings):
-                    point = PointStruct(
-                        id=chunk['id'],
-                        vector=embedding,
-                        payload={
-                            'text': chunk['text'][:2000],  # Limit text size
-                            **chunk['metadata']
-                        }
-                    )
-                    points.append(point)
-
-                # Upload to Qdrant
-                self.qdrant_client.upsert(
-                    collection_name=collection_name,
-                    points=points
-                )
-
-                self.total_imported += len(points)
-                logger.info(f"Imported batch of {len(points)} chunks (total: {self.total_imported})")
-
-                # Pause briefly between batches to respect the API rate limit
-                if i + BATCH_SIZE < len(chunks) and i % 100 == 0:  # Only delay every 100 chunks
-                    logger.info(f"Waiting {RATE_LIMIT_DELAY}s for rate limit...")
-                    time.sleep(RATE_LIMIT_DELAY)
-
-            except Exception as e:
-                logger.error(f"Failed to import batch: {e}")
-                self.total_errors += 1
-                # Continue with next batch instead of failing completely
-
-    def import_project(self, project_path: str) -> int:
-        """Import all JSONL files in a project directory."""
-        project_name = os.path.basename(project_path)
-        collection_name = self._get_collection_name(project_name)
-
-        logger.info(f"📁 Importing project: {project_name} to collection: {collection_name}")
-
-        # Ensure collection exists
-        self._ensure_collection(collection_name)
-
-        # Get list of JSONL files
-        jsonl_files = []
-        for file in os.listdir(project_path):
-            if file.endswith('.jsonl'):
-                file_path = os.path.join(project_path, file)
-
-                # Skip already imported files
-                if (project_name in self.state["projects"] and
-                        file_path in self.state["projects"][project_name]):
-                    logger.debug(f"Skipping already imported: {file}")
-                    continue
-
-                jsonl_files.append(file_path)
-
-        if not jsonl_files:
-            logger.info(f"No new files to import for {project_name}")
-            return 0
-
-        project_total = 0
-        for file_path in sorted(jsonl_files):
-            logger.info(f"Processing: {os.path.basename(file_path)}")
-
-            # Extract messages
-            messages = self._process_jsonl_file(file_path)
-            if not messages:
-                logger.warning(f"No messages found in {file_path}")
-                continue
-
-            # Create chunks
-            chunks = self._create_conversation_chunks(messages)
-
-            # Import to Qdrant
-            self._import_chunks_to_qdrant(chunks, collection_name)
-
-            # Mark file as imported
-            if project_name not in self.state["projects"]:
-                self.state["projects"][project_name] = []
-            self.state["projects"][project_name].append(file_path)
-
-            project_total += len(chunks)
-
-            # Save state after each file
-            self._save_state()
-
-        logger.info(f"✅ Imported {project_total} chunks from {len(jsonl_files)} files")
-        return project_total
-
-    def import_all(self):
-        """Import all Claude projects."""
-        projects_dir = LOGS_DIR
-
-        if not os.path.exists(projects_dir):
-            logger.error(f"Claude projects directory not found: {projects_dir}")
-            logger.error("This usually means Claude Code hasn't created any projects yet.")
-            logger.error("Please open Claude Code and create a conversation first.")
-            return
-
-        # Get list of projects
-        projects = [
-            d for d in os.listdir(projects_dir)
-            if os.path.isdir(os.path.join(projects_dir, d)) and not d.startswith('.')
-        ]
-
-        logger.info(f"Found {len(projects)} projects to import")
-
-        # Import each project
-        start_time = time.time()
-        for idx, project_name in enumerate(sorted(projects), 1):
-            project_path = os.path.join(projects_dir, project_name)
-
-            try:
-                logger.info(f"\n[{idx}/{len(projects)}] Processing: {project_name}")
-                count = self.import_project(project_path)
-
-                # Log progress
-                imported_projects = len(self.state["projects"])
-                progress = (imported_projects / len(projects)) * 100
-                logger.info(
-                    f"Progress: {imported_projects}/{len(projects)} projects "
-                    f"({progress:.1f}%), Total chunks: {self.total_imported}"
-                )
-
-            except Exception as e:
-                logger.error(f"Failed to import project {project_name}: {e}")
-                self.total_errors += 1
-                continue
-
-        # Final summary
-        elapsed_time = time.time() - start_time
-        logger.info("=" * 60)
-        logger.info(f"Import completed in {elapsed_time:.1f} seconds!")
-        logger.info(f"Projects imported: {len(self.state['projects'])}/{len(projects)}")
-        logger.info(f"Total chunks: {self.total_imported}")
-        logger.info(f"Total errors: {self.total_errors}")
-
-        # Show collection summary
-        logger.info("\nCollection summary:")
-        for col in self.qdrant_client.get_collections().collections:
-            if col.name.endswith("_voyage"):
-                info = self.qdrant_client.get_collection(col.name)
-                logger.info(f"  {col.name}: {info.points_count} points")
-
-def main():
-    """Main entry point."""
-    importer = VoyageConversationImporter()
-
-    if len(os.sys.argv) > 1:
-        # Import specific project
-        project_path = os.sys.argv[1]
-        if os.path.exists(project_path):
-            importer.import_project(project_path)
-        else:
-            logger.error(f"Project path not found: {project_path}")
-    else:
-        # Import all projects
-        importer.import_all()
-
-if __name__ == "__main__":
-    main()
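For reference, the Voyage request that the removed _generate_embeddings method wrapped can be reproduced in a few lines. This is a minimal standalone sketch, not part of the package: the endpoint, model name, payload shape, and VOYAGE_KEY variable are taken from the script above, while the function name and error handling are illustrative.

import os
import requests
import backoff

# Retry with exponential backoff, mirroring the removed importer's decorator.
@backoff.on_exception(backoff.expo, requests.RequestException, max_tries=5)
def embed(texts):
    response = requests.post(
        "https://api.voyageai.com/v1/embeddings",
        headers={"Authorization": f"Bearer {os.environ['VOYAGE_KEY']}"},
        json={"input": texts, "model": "voyage-3.5-lite", "input_type": "document"},
    )
    response.raise_for_status()  # HTTPError is a RequestException, so backoff retries it
    return [item["embedding"] for item in response.json()["data"]]

Each returned vector has the 1024 dimensions the script assumed when creating its conv_<hash>_voyage collections.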
--- package/scripts/import-conversations.py
+++ /dev/null
@@ -1,240 +0,0 @@
-#!/usr/bin/env python3
-"""
-Import Claude conversation logs from JSONL files into Qdrant vector database.
-Simplified version focusing on semantic search without complex entity extraction.
-"""
-
-import json
-import os
-import glob
-from datetime import datetime, timedelta
-from typing import List, Dict, Any
-import logging
-from qdrant_client import QdrantClient
-from qdrant_client.models import (
-    VectorParams, Distance, PointStruct,
-    Filter, FieldCondition, MatchValue
-)
-from sentence_transformers import SentenceTransformer
-import hashlib
-
-# Configuration
-QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
-COLLECTION_NAME = os.getenv("COLLECTION_NAME", "conversations")
-LOGS_DIR = os.getenv("LOGS_DIR", "/logs")
-STATE_FILE = os.getenv("STATE_FILE", "/config/imported-files.json")
-EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
-BATCH_SIZE = int(os.getenv("BATCH_SIZE", "100"))
-
-# Set up logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-
-class ConversationImporter:
-    def __init__(self):
-        """Initialize the importer with Qdrant client and embedding model."""
-        self.client = QdrantClient(url=QDRANT_URL)
-        self.encoder = SentenceTransformer(EMBEDDING_MODEL)
-        self.imported_files = self.load_state()
-
-    def load_state(self) -> set:
-        """Load the set of already imported files."""
-        if os.path.exists(STATE_FILE):
-            try:
-                with open(STATE_FILE, 'r') as f:
-                    data = json.load(f)
-                    return set(data.get('files', []))
-            except Exception as e:
-                logger.error(f"Failed to load state: {e}")
-        return set()
-
-    def save_state(self):
-        """Save the set of imported files."""
-        os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
-        with open(STATE_FILE, 'w') as f:
-            json.dump({
-                'files': list(self.imported_files),
-                'last_updated': datetime.now().isoformat()
-            }, f)
-
-    def setup_collection(self):
-        """Create or update the Qdrant collection."""
-        collections = self.client.get_collections().collections
-        exists = any(c.name == COLLECTION_NAME for c in collections)
-
-        if not exists:
-            logger.info(f"Creating collection: {COLLECTION_NAME}")
-            self.client.create_collection(
-                collection_name=COLLECTION_NAME,
-                vectors_config=VectorParams(
-                    size=384,  # all-MiniLM-L6-v2 dimension
-                    distance=Distance.COSINE
-                )
-            )
-        else:
-            logger.info(f"Collection {COLLECTION_NAME} already exists")
-
-    def process_jsonl_file(self, file_path: str) -> List[Dict[str, Any]]:
-        """Extract messages from a JSONL file."""
-        messages = []
-
-        try:
-            with open(file_path, 'r') as f:
-                for line_num, line in enumerate(f, 1):
-                    try:
-                        data = json.loads(line.strip())
-
-                        # Extract message if present
-                        if 'message' in data and data['message']:
-                            msg = data['message']
-                            if 'role' in msg and 'content' in msg:
-                                # Handle content that might be an object
-                                content = msg['content']
-                                if isinstance(content, dict):
-                                    content = content.get('text', json.dumps(content))
-
-                                # Create message document
-                                messages.append({
-                                    'role': msg['role'],
-                                    'content': content,
-                                    'file_path': file_path,
-                                    'line_number': line_num,
-                                    'timestamp': data.get('timestamp', datetime.now().isoformat())
-                                })
-                    except json.JSONDecodeError:
-                        logger.warning(f"Failed to parse line {line_num} in {file_path}")
-                    except Exception as e:
-                        logger.error(f"Error processing line {line_num} in {file_path}: {e}")
-
-        except Exception as e:
-            logger.error(f"Failed to read file {file_path}: {e}")
-
-        return messages
-
-    def create_conversation_chunks(self, messages: List[Dict[str, Any]], chunk_size: int = 5) -> List[Dict[str, Any]]:
-        """Group messages into conversation chunks for better context."""
-        chunks = []
-
-        for i in range(0, len(messages), chunk_size):
-            chunk_messages = messages[i:i + chunk_size]
-
-            # Create a conversation summary
-            conversation_text = "\n\n".join([
-                f"{msg['role'].upper()}: {msg['content'][:500]}..."
-                if len(msg['content']) > 500 else f"{msg['role'].upper()}: {msg['content']}"
-                for msg in chunk_messages
-            ])
-
-            # Extract metadata
-            project_id = os.path.basename(os.path.dirname(os.path.dirname(chunk_messages[0]['file_path'])))
-            conversation_id = os.path.basename(chunk_messages[0]['file_path']).replace('.jsonl', '')
-
-            chunks.append({
-                'id': hashlib.md5(f"{chunk_messages[0]['file_path']}_{i}".encode()).hexdigest(),
-                'text': conversation_text,
-                'metadata': {
-                    'project_id': project_id,
-                    'conversation_id': conversation_id,
-                    'chunk_index': i // chunk_size,
-                    'message_count': len(chunk_messages),
-                    'start_role': chunk_messages[0]['role'],
-                    'timestamp': chunk_messages[0]['timestamp'],
-                    'file_path': chunk_messages[0]['file_path']
-                }
-            })
-
-        return chunks
-
-    def import_to_qdrant(self, chunks: List[Dict[str, Any]]):
-        """Import conversation chunks to Qdrant."""
-        if not chunks:
-            return
-
-        # Generate embeddings
-        texts = [chunk['text'] for chunk in chunks]
-        embeddings = self.encoder.encode(texts, show_progress_bar=True)
-
-        # Create points for Qdrant
-        points = []
-        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
-            points.append(
-                PointStruct(
-                    id=chunk['id'],
-                    vector=embedding.tolist(),
-                    payload={
-                        'text': chunk['text'],
-                        **chunk['metadata']
-                    }
-                )
-            )
-
-        # Upload to Qdrant in batches
-        for i in range(0, len(points), BATCH_SIZE):
-            batch = points[i:i + BATCH_SIZE]
-            self.client.upsert(
-                collection_name=COLLECTION_NAME,
-                points=batch
-            )
-            logger.info(f"Uploaded batch of {len(batch)} points")
-
-    def find_recent_files(self, days: int = 30) -> List[str]:
-        """Find JSONL files modified in the last N days."""
-        cutoff_time = datetime.now() - timedelta(days=days)
-        pattern = os.path.join(LOGS_DIR, "**", "*.jsonl")
-
-        recent_files = []
-        for file_path in glob.glob(pattern, recursive=True):
-            try:
-                mtime = os.path.getmtime(file_path)
-                if datetime.fromtimestamp(mtime) >= cutoff_time:
-                    recent_files.append(file_path)
-            except Exception as e:
-                logger.error(f"Error checking file {file_path}: {e}")
-
-        return recent_files
-
-    def run(self):
-        """Main import process."""
-        logger.info("Starting conversation import to Qdrant")
-
-        # Setup collection
-        self.setup_collection()
-
-        # Find files to import
-        all_files = self.find_recent_files()
-        new_files = [f for f in all_files if f not in self.imported_files]
-
-        logger.info(f"Found {len(all_files)} total files, {len(new_files)} new files to import")
-
-        total_chunks = 0
-        for file_path in new_files:
-            logger.info(f"Processing: {file_path}")
-
-            # Extract messages
-            messages = self.process_jsonl_file(file_path)
-            if not messages:
-                logger.warning(f"No messages found in {file_path}")
-                continue
-
-            # Create conversation chunks
-            chunks = self.create_conversation_chunks(messages)
-
-            # Import to Qdrant
-            self.import_to_qdrant(chunks)
-
-            # Mark file as imported
-            self.imported_files.add(file_path)
-            self.save_state()
-
-            total_chunks += len(chunks)
-            logger.info(f"Imported {len(chunks)} chunks from {file_path}")
-
-        logger.info(f"Import complete: {total_chunks} total chunks imported from {len(new_files)} files")
-
-def main():
-    """Entry point for the importer."""
-    importer = ConversationImporter()
-    importer.run()
-
-if __name__ == "__main__":
-    main()
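For context, everything this removed script imported landed in a single "conversations" collection with 384-dimensional all-MiniLM-L6-v2 vectors. A minimal sketch of querying that collection, assuming the same local Qdrant and embedding model as the script (the query text is illustrative):

from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer

client = QdrantClient(url="http://localhost:6333")
encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Embed the query with the same model used at import time, then search by cosine similarity.
query_vector = encoder.encode("how did we fix the qdrant timeout?").tolist()
for hit in client.search(collection_name="conversations", query_vector=query_vector, limit=5):
    print(hit.score, hit.payload.get("conversation_id"), hit.payload.get("text", "")[:80])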