claude-self-reflect 2.3.6 → 2.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/scripts/import-conversations-isolated.py +0 -311
- package/scripts/import-conversations-voyage-streaming.py +0 -368
- package/scripts/import-conversations-voyage.py +0 -430
- package/scripts/import-conversations.py +0 -240
- package/scripts/import-current-conversation.py +0 -39
- package/scripts/import-live-conversation.py +0 -154
- package/scripts/import-openai-enhanced.py +0 -867
- package/scripts/import-recent-only.py +0 -33
- package/scripts/import-single-project.py +0 -278
- package/scripts/import-watcher.py +0 -170
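Both removed importers track ingestion progress in a small JSON state file (imported-files.json) keyed by project. A minimal sketch of that layout, inferred from the load/save code in the removed scripts shown below; the project and file names here are placeholders:

# Sketch of the imported-files.json layout used by the removed importers
# (keys inferred from their load_state/save_state code; values are placeholders).
import json
from datetime import datetime

state = {
    "projects": {
        # project name -> list of already-imported JSONL paths (placeholder values)
        "example-project": ["/logs/example-project/abc123.jsonl"],
    },
    "last_updated": datetime.now().isoformat(),
    "mode": "isolated",
}

print(json.dumps(state, indent=2))

The streaming variant additionally persists a running total_imported counter alongside these keys.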
package/package.json
CHANGED

package/scripts/import-conversations-isolated.py
REMOVED
@@ -1,311 +0,0 @@
-#!/usr/bin/env python3
-"""
-Import Claude conversation logs with project isolation support.
-Each project gets its own collection for complete isolation.
-"""
-
-import json
-import os
-import glob
-import hashlib
-from datetime import datetime, timedelta
-from typing import List, Dict, Any, Set
-import logging
-from qdrant_client import QdrantClient
-from qdrant_client.models import (
-    VectorParams, Distance, PointStruct,
-    Filter, FieldCondition, MatchValue
-)
-from sentence_transformers import SentenceTransformer
-
-# Configuration
-QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
-LOGS_DIR = os.getenv("LOGS_DIR", "/logs")
-STATE_FILE = os.getenv("STATE_FILE", "/config/imported-files.json")
-EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
-BATCH_SIZE = int(os.getenv("BATCH_SIZE", "100"))
-ISOLATION_MODE = os.getenv("ISOLATION_MODE", "isolated")  # isolated, shared, hybrid
-
-# Set up logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-
-class ProjectAwareImporter:
-    def __init__(self):
-        """Initialize the importer with Qdrant client and embedding model."""
-        self.client = QdrantClient(url=QDRANT_URL)
-        self.encoder = SentenceTransformer(EMBEDDING_MODEL)
-        self.imported_files = self.load_state()
-        self.project_collections: Set[str] = set()
-
-    def load_state(self) -> Dict[str, Set[str]]:
-        """Load the set of already imported files per project."""
-        if os.path.exists(STATE_FILE):
-            try:
-                with open(STATE_FILE, 'r') as f:
-                    data = json.load(f)
-                    # Convert to per-project tracking
-                    if isinstance(data.get('files'), list):
-                        # Legacy format - convert to new format
-                        return {'_legacy': set(data['files'])}
-                    else:
-                        # New format with per-project tracking
-                        return {k: set(v) for k, v in data.get('projects', {}).items()}
-            except Exception as e:
-                logger.error(f"Failed to load state: {e}")
-        return {}
-
-    def save_state(self):
-        """Save the set of imported files per project."""
-        os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
-        with open(STATE_FILE, 'w') as f:
-            json.dump({
-                'projects': {k: list(v) for k, v in self.imported_files.items()},
-                'last_updated': datetime.now().isoformat(),
-                'mode': ISOLATION_MODE
-            }, f, indent=2)
-
-    def get_collection_name(self, project_name: str) -> str:
-        """Get collection name based on isolation mode."""
-        if ISOLATION_MODE == "isolated":
-            # Create project-specific collection name
-            project_hash = hashlib.md5(project_name.encode()).hexdigest()[:8]
-            return f"conv_{project_hash}"
-        else:
-            # Shared collection mode
-            return "conversations"
-
-    def setup_collection(self, project_name: str):
-        """Create or update the Qdrant collection for a project."""
-        collection_name = self.get_collection_name(project_name)
-
-        # Skip if already set up in this session
-        if collection_name in self.project_collections:
-            return collection_name
-
-        collections = self.client.get_collections().collections
-        exists = any(c.name == collection_name for c in collections)
-
-        if not exists:
-            logger.info(f"Creating collection: {collection_name} for project: {project_name}")
-            self.client.create_collection(
-                collection_name=collection_name,
-                vectors_config=VectorParams(
-                    size=384,  # all-MiniLM-L6-v2 dimension
-                    distance=Distance.COSINE
-                )
-            )
-        else:
-            logger.info(f"Collection {collection_name} already exists for project: {project_name}")
-
-        self.project_collections.add(collection_name)
-        return collection_name
-
-    def extract_project_name(self, file_path: str) -> str:
-        """Extract project name from file path."""
-        # Expected path: /logs/<project-name>/<conversation-id>.jsonl
-        parts = file_path.split('/')
-        if len(parts) >= 3 and parts[-2] != 'logs':
-            return parts[-2]
-        return 'unknown'
-
-    def process_jsonl_file(self, file_path: str) -> List[Dict[str, Any]]:
-        """Extract messages from a JSONL file."""
-        messages = []
-
-        try:
-            with open(file_path, 'r') as f:
-                for line_num, line in enumerate(f, 1):
-                    try:
-                        data = json.loads(line.strip())
-
-                        # Extract message if present
-                        if 'message' in data and data['message']:
-                            msg = data['message']
-                            if 'role' in msg and 'content' in msg:
-                                # Handle content that might be an object
-                                content = msg['content']
-                                if isinstance(content, dict):
-                                    content = content.get('text', json.dumps(content))
-
-                                # Create message document
-                                messages.append({
-                                    'role': msg['role'],
-                                    'content': content,
-                                    'file_path': file_path,
-                                    'line_number': line_num,
-                                    'timestamp': data.get('timestamp', datetime.now().isoformat())
-                                })
-                    except json.JSONDecodeError:
-                        logger.warning(f"Failed to parse line {line_num} in {file_path}")
-                    except Exception as e:
-                        logger.error(f"Error processing line {line_num} in {file_path}: {e}")
-
-        except Exception as e:
-            logger.error(f"Failed to read file {file_path}: {e}")
-
-        return messages
-
-    def create_conversation_chunks(self, messages: List[Dict[str, Any]], chunk_size: int = 5) -> List[Dict[str, Any]]:
-        """Group messages into conversation chunks for better context."""
-        chunks = []
-
-        for i in range(0, len(messages), chunk_size):
-            chunk_messages = messages[i:i + chunk_size]
-
-            # Create a conversation summary
-            conversation_text = "\n\n".join([
-                f"{msg['role'].upper()}: {msg['content'][:500]}..."
-                if len(msg['content']) > 500 else f"{msg['role'].upper()}: {msg['content']}"
-                for msg in chunk_messages
-            ])
-
-            # Extract metadata
-            project_id = self.extract_project_name(chunk_messages[0]['file_path'])
-            conversation_id = os.path.basename(chunk_messages[0]['file_path']).replace('.jsonl', '')
-
-            chunks.append({
-                'id': hashlib.md5(f"{chunk_messages[0]['file_path']}_{i}".encode()).hexdigest(),
-                'text': conversation_text,
-                'metadata': {
-                    'project_id': project_id,
-                    'project_name': project_id,  # Add both for compatibility
-                    'conversation_id': conversation_id,
-                    'chunk_index': i // chunk_size,
-                    'message_count': len(chunk_messages),
-                    'start_role': chunk_messages[0]['role'],
-                    'timestamp': chunk_messages[0]['timestamp'],
-                    'file_path': chunk_messages[0]['file_path']
-                }
-            })
-
-        return chunks
-
-    def import_to_qdrant(self, chunks: List[Dict[str, Any]], collection_name: str):
-        """Import conversation chunks to a specific Qdrant collection."""
-        if not chunks:
-            return
-
-        # Generate embeddings
-        texts = [chunk['text'] for chunk in chunks]
-        embeddings = self.encoder.encode(texts, show_progress_bar=True)
-
-        # Create points for Qdrant
-        points = []
-        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
-            points.append(
-                PointStruct(
-                    id=chunk['id'],
-                    vector=embedding.tolist(),
-                    payload={
-                        'text': chunk['text'],
-                        **chunk['metadata']
-                    }
-                )
-            )
-
-        # Upload to Qdrant in batches
-        for i in range(0, len(points), BATCH_SIZE):
-            batch = points[i:i + BATCH_SIZE]
-            self.client.upsert(
-                collection_name=collection_name,
-                points=batch
-            )
-            logger.info(f"Uploaded batch of {len(batch)} points to {collection_name}")
-
-    def find_recent_files(self, days: int = 30) -> List[str]:
-        """Find JSONL files modified in the last N days."""
-        cutoff_time = datetime.now() - timedelta(days=days)
-        pattern = os.path.join(LOGS_DIR, "**", "*.jsonl")
-
-        recent_files = []
-        for file_path in glob.glob(pattern, recursive=True):
-            try:
-                mtime = os.path.getmtime(file_path)
-                if datetime.fromtimestamp(mtime) >= cutoff_time:
-                    recent_files.append(file_path)
-            except Exception as e:
-                logger.error(f"Error checking file {file_path}: {e}")
-
-        return recent_files
-
-    def run(self):
-        """Main import process with project isolation."""
-        logger.info(f"Starting conversation import to Qdrant (mode: {ISOLATION_MODE})")
-
-        # Find files to import
-        all_files = self.find_recent_files()
-        logger.info(f"Found {len(all_files)} total files")
-
-        # Group files by project
-        files_by_project: Dict[str, List[str]] = {}
-        for file_path in all_files:
-            project_name = self.extract_project_name(file_path)
-            if project_name not in files_by_project:
-                files_by_project[project_name] = []
-            files_by_project[project_name].append(file_path)
-
-        logger.info(f"Found {len(files_by_project)} projects to process")
-
-        total_chunks = 0
-        for project_name, project_files in files_by_project.items():
-            logger.info(f"\nProcessing project: {project_name}")
-
-            # Get imported files for this project
-            project_imported = self.imported_files.get(project_name, set())
-            new_files = [f for f in project_files if f not in project_imported]
-
-            if not new_files:
-                logger.info(f"No new files for project {project_name}")
-                continue
-
-            logger.info(f"Found {len(new_files)} new files for project {project_name}")
-
-            # Setup collection for this project
-            collection_name = self.setup_collection(project_name)
-
-            project_chunks = 0
-            for file_path in new_files:
-                logger.info(f"Processing: {file_path}")
-
-                # Extract messages
-                messages = self.process_jsonl_file(file_path)
-                if not messages:
-                    logger.warning(f"No messages found in {file_path}")
-                    continue
-
-                # Create conversation chunks
-                chunks = self.create_conversation_chunks(messages)
-
-                # Import to project-specific collection
-                self.import_to_qdrant(chunks, collection_name)
-
-                # Mark file as imported for this project
-                if project_name not in self.imported_files:
-                    self.imported_files[project_name] = set()
-                self.imported_files[project_name].add(file_path)
-                self.save_state()
-
-                project_chunks += len(chunks)
-                logger.info(f"Imported {len(chunks)} chunks from {file_path}")
-
-            total_chunks += project_chunks
-            logger.info(f"Project {project_name} complete: {project_chunks} chunks imported")
-
-        logger.info(f"\nImport complete: {total_chunks} total chunks imported")
-
-        # Show collection summary
-        logger.info("\nCollection summary:")
-        collections = self.client.get_collections().collections
-        for collection in collections:
-            if collection.name.startswith('conv_') or collection.name == 'conversations':
-                count = self.client.get_collection(collection.name).points_count
-                logger.info(f" {collection.name}: {count} points")
-
-def main():
-    """Entry point for the importer."""
-    importer = ProjectAwareImporter()
-    importer.run()
-
-if __name__ == "__main__":
-    main()
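For reference, the isolation scheme above maps each project to its own Qdrant collection named conv_ followed by the first eight hex characters of the MD5 of the project name. A minimal sketch, assuming a local Qdrant at http://localhost:6333 and a placeholder project name, of reproducing that mapping and checking a collection's point count the way the removed script's summary step did:

# Sketch: reproduce the removed script's project -> collection mapping and
# inspect the resulting collection. The project name is a placeholder.
import hashlib
from qdrant_client import QdrantClient

def collection_for(project_name: str) -> str:
    # Same scheme as the removed importer: "conv_" + first 8 hex chars of the MD5.
    return f"conv_{hashlib.md5(project_name.encode()).hexdigest()[:8]}"

client = QdrantClient(url="http://localhost:6333")
name = collection_for("example-project")  # placeholder project name
existing = [c.name for c in client.get_collections().collections]
if name in existing:
    print(f"{name}: {client.get_collection(name).points_count} points")
else:
    print(f"{name}: not created yet")

The Voyage-based streaming importer below uses the same hash but appends a _voyage suffix to the collection name.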
package/scripts/import-conversations-voyage-streaming.py
REMOVED
@@ -1,368 +0,0 @@
-#!/usr/bin/env python3
-"""
-Streaming import for large Claude conversation logs.
-Processes files in chunks without loading entire file into memory.
-"""
-
-import json
-import os
-import sys
-import time
-import hashlib
-from datetime import datetime
-from typing import List, Dict, Any, Generator
-import logging
-import backoff
-import requests
-from qdrant_client import QdrantClient
-from qdrant_client.models import VectorParams, Distance, PointStruct
-from pathlib import Path
-
-# Set up logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-
-# Configuration
-QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
-STATE_FILE = os.getenv("STATE_FILE", "./config-isolated/imported-files.json")
-LOGS_DIR = os.getenv("LOGS_DIR", os.path.expanduser("~/.claude/projects"))
-VOYAGE_API_KEY = os.getenv("VOYAGE_KEY_2") or os.getenv("VOYAGE_KEY")
-BATCH_SIZE = int(os.getenv("BATCH_SIZE", "50"))
-CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "10"))
-STREAMING_BUFFER_SIZE = 100  # Process every 100 messages
-RATE_LIMIT_DELAY = 0.1
-EMBEDDING_MODEL = "voyage-3-large"
-EMBEDDING_DIMENSIONS = 1024
-VOYAGE_API_URL = "https://api.voyageai.com/v1/embeddings"
-
-class StreamingVoyageImporter:
-    def __init__(self):
-        """Initialize the streaming importer."""
-        if not VOYAGE_API_KEY:
-            raise ValueError("VOYAGE_KEY environment variable not set")
-
-        self.qdrant_client = QdrantClient(url=QDRANT_URL)
-        self.state = self._load_state()
-        self.total_imported = 0
-        self.total_errors = 0
-
-        logger.info(f"Connected to Qdrant at {QDRANT_URL}")
-
-    def _load_state(self) -> Dict[str, Any]:
-        """Load import state from file."""
-        default_state = {
-            "projects": {},
-            "last_updated": None,
-            "mode": "isolated",
-            "total_imported": 0
-        }
-
-        if os.path.exists(STATE_FILE):
-            try:
-                with open(STATE_FILE, 'r') as f:
-                    return json.load(f)
-            except Exception as e:
-                logger.error(f"Failed to load state: {e}")
-
-        return default_state
-
-    def _save_state(self):
-        """Save import state to file."""
-        self.state["last_updated"] = datetime.now().isoformat()
-        self.state["total_imported"] = self.total_imported
-
-        os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
-        with open(STATE_FILE, 'w') as f:
-            json.dump(self.state, f, indent=2)
-
-    @backoff.on_exception(
-        backoff.expo,
-        Exception,
-        max_tries=5,
-        on_backoff=lambda details: logger.warning(f"Backing off {details['wait']}s after {details['tries']} tries")
-    )
-    def _generate_embeddings(self, texts: List[str]) -> List[List[float]]:
-        """Generate embeddings using Voyage AI API with batching."""
-        headers = {
-            "Authorization": f"Bearer {VOYAGE_API_KEY}",
-            "Content-Type": "application/json"
-        }
-
-        payload = {
-            "input": texts,
-            "model": EMBEDDING_MODEL,
-            "input_type": "document"
-        }
-
-        try:
-            response = requests.post(VOYAGE_API_URL, headers=headers, json=payload, timeout=30)
-            response.raise_for_status()
-
-            data = response.json()
-            embeddings = [item["embedding"] for item in data["data"]]
-
-            # Add small delay to respect rate limits
-            time.sleep(RATE_LIMIT_DELAY)
-
-            return embeddings
-
-        except requests.Timeout:
-            logger.error("Voyage API request timed out after 30 seconds")
-            raise
-        except Exception as e:
-            logger.error(f"Voyage API error: {e}")
-            raise
-
-    def stream_jsonl_messages(self, file_path: str, buffer_size: int = STREAMING_BUFFER_SIZE) -> Generator[List[Dict[str, Any]], None, None]:
-        """Stream messages from JSONL file in buffers without loading entire file."""
-        buffer = []
-        line_count = 0
-        total_lines = 0
-        skipped_lines = 0
-
-        # Extract expected session ID from filename
-        expected_session_id = os.path.splitext(os.path.basename(file_path))[0]
-        logger.info(f"Starting to stream file: {os.path.basename(file_path)} (expecting session: {expected_session_id})")
-
-        try:
-            with open(file_path, 'r', encoding='utf-8') as f:
-                for line_num, line in enumerate(f, 1):
-                    total_lines = line_num
-                    line = line.strip()
-                    if not line:
-                        continue
-
-                    try:
-                        data = json.loads(line)
-
-                        # Check session ID matches expected
-                        session_id = data.get('sessionId', '')
-                        if session_id != expected_session_id:
-                            skipped_lines += 1
-                            logger.debug(f"Skipping line {line_num}: different session ID ({session_id})")
-                            continue
-
-                        # Extract message if present
-                        if 'message' in data and data['message']:
-                            msg = data['message']
-                            if msg.get('role') and msg.get('content'):
-                                content = msg['content']
-                                # Handle content array (common in Claude messages)
-                                if isinstance(content, list) and len(content) > 0:
-                                    # Extract text from first content item
-                                    content_item = content[0]
-                                    if isinstance(content_item, dict):
-                                        content = content_item.get('text', str(content_item))
-                                elif isinstance(content, dict):
-                                    content = content.get('text', json.dumps(content))
-
-                                buffer.append({
-                                    'role': msg['role'],
-                                    'content': content,
-                                    'file_path': file_path,
-                                    'line_number': line_num,
-                                    'timestamp': data.get('timestamp', datetime.now().isoformat())
-                                })
-                                line_count += 1
-
-                                # Yield buffer when it reaches the specified size
-                                if len(buffer) >= buffer_size:
-                                    logger.info(f"Buffer full, yielding {len(buffer)} messages (total so far: {line_count})")
-                                    yield buffer
-                                    buffer = []
-
-                    except json.JSONDecodeError:
-                        logger.debug(f"Skipping invalid JSON at line {line_num}")
-                        skipped_lines += 1
-                    except Exception as e:
-                        logger.error(f"Error processing line {line_num}: {e}")
-                        skipped_lines += 1
-
-            # Yield any remaining messages
-            if buffer:
-                logger.info(f"Yielding final buffer with {len(buffer)} messages")
-                yield buffer
-
-            logger.info(f"Completed streaming file: processed {total_lines} lines, {line_count} messages, {skipped_lines} skipped")
-
-        except Exception as e:
-            logger.error(f"Failed to read file {file_path}: {e}")
-
-    def process_message_buffer(self, messages: List[Dict[str, Any]], project_name: str, collection_name: str, conversation_id: str):
-        """Process a buffer of messages into chunks and import them."""
-        chunks = []
-
-        # Create chunks from message buffer
-        for i in range(0, len(messages), CHUNK_SIZE):
-            chunk_messages = messages[i:i + CHUNK_SIZE]
-
-            # Create conversation text
-            conversation_text = "\n\n".join([
-                f"{msg['role'].upper()}: {msg['content'][:500]}"
-                for msg in chunk_messages
-            ])
-
-            # Add metadata
-            timestamps = [msg['timestamp'] for msg in chunk_messages]
-            first_timestamp = min(timestamps) if timestamps else datetime.now().isoformat()
-
-            chunk_id = hashlib.md5(
-                f"{conversation_id}_{first_timestamp}_{len(chunks)}".encode()
-            ).hexdigest()
-
-            chunks.append({
-                'id': chunk_id,
-                'text': conversation_text,
-                'metadata': {
-                    'project': project_name,
-                    'conversation_id': conversation_id,
-                    'timestamp': first_timestamp,
-                    'chunk_index': len(chunks),
-                    'message_count': len(chunk_messages),
-                    'roles': list(set(msg['role'] for msg in chunk_messages))
-                }
-            })
-
-        # Import chunks if we have any
-        if chunks:
-            self._import_chunks_to_qdrant(chunks, collection_name)
-
-    def _import_chunks_to_qdrant(self, chunks: List[Dict[str, Any]], collection_name: str):
-        """Import conversation chunks to Qdrant."""
-        if not chunks:
-            return
-
-        # Process in batches
-        for i in range(0, len(chunks), BATCH_SIZE):
-            batch = chunks[i:i + BATCH_SIZE]
-            texts = [chunk['text'] for chunk in batch]
-
-            try:
-                # Generate embeddings
-                embeddings = self._generate_embeddings(texts)
-
-                # Create points
-                points = []
-                for chunk, embedding in zip(batch, embeddings):
-                    # Include both text and metadata in payload
-                    payload = chunk['metadata'].copy()
-                    payload['text'] = chunk['text']
-
-                    points.append(PointStruct(
-                        id=chunk['id'],
-                        vector=embedding,
-                        payload=payload
-                    ))
-
-                # Upsert to Qdrant
-                self.qdrant_client.upsert(
-                    collection_name=collection_name,
-                    points=points,
-                    wait=True
-                )
-
-                self.total_imported += len(points)
-                logger.info(f"Imported batch of {len(points)} chunks (total: {self.total_imported})")
-
-            except Exception as e:
-                logger.error(f"Failed to import batch: {e}")
-                self.total_errors += 1
-
-    def import_large_file(self, file_path: str, project_name: str):
-        """Import a large JSONL file using streaming."""
-        logger.info(f"🚀 Starting streaming import of {os.path.basename(file_path)}")
-
-        # Get collection name
-        project_hash = hashlib.md5(project_name.encode()).hexdigest()[:8]
-        collection_name = f"conv_{project_hash}_voyage"
-
-        # Ensure collection exists
-        collections = [c.name for c in self.qdrant_client.get_collections().collections]
-        if collection_name not in collections:
-            logger.info(f"Creating collection: {collection_name}")
-            self.qdrant_client.create_collection(
-                collection_name=collection_name,
-                vectors_config=VectorParams(size=EMBEDDING_DIMENSIONS, distance=Distance.COSINE)
-            )
-
-        # Extract conversation ID from filename
-        conversation_id = os.path.splitext(os.path.basename(file_path))[0]
-
-        # Stream and process the file
-        chunk_count = 0
-        message_count = 0
-
-        try:
-            logger.info(f"Starting to process chunks from generator")
-            for message_buffer in self.stream_jsonl_messages(file_path):
-                logger.info(f"Received buffer with {len(message_buffer)} messages")
-                self.process_message_buffer(message_buffer, project_name, collection_name, conversation_id)
-                chunk_count += 1
-                message_count += len(message_buffer)
-                logger.info(f"Processed chunk {chunk_count} with {len(message_buffer)} messages (total: {message_count})")
-
-                # Save state periodically
-                if chunk_count % 10 == 0:
-                    self._save_state()
-
-            # Log final statistics
-            logger.info(f"Finished processing {chunk_count} chunks with {message_count} total messages")
-
-            # Mark file as imported
-            if project_name not in self.state["projects"]:
-                self.state["projects"][project_name] = []
-            if file_path not in self.state["projects"][project_name]:
-                self.state["projects"][project_name].append(file_path)
-
-            self._save_state()
-            logger.info(f"✅ Completed streaming import of {os.path.basename(file_path)} - {chunk_count} chunks, {message_count} messages, {self.total_imported} vectors")
-
-        except Exception as e:
-            logger.error(f"Error during streaming import: {e}")
-            raise
-
-def main():
-    """Main entry point for streaming import."""
-    import sys
-    import argparse
-
-    parser = argparse.ArgumentParser(description="Streaming import for large conversation files")
-    parser.add_argument("--project", help="Project directory path")
-    parser.add_argument("--limit", type=int, help="Limit number of files to process")
-    args = parser.parse_args()
-
-    importer = StreamingVoyageImporter()
-
-    # If project path is provided via command line
-    if args.project and os.path.exists(args.project):
-        project_name = os.path.basename(args.project)
-        files_processed = 0
-
-        # Find all JSONL files in the project
-        for file_path in Path(args.project).glob("*.jsonl"):
-            if args.limit and files_processed >= args.limit:
-                break
-
-            file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
-            logger.info(f"Processing {file_path.name} ({file_size_mb:.1f} MB)")
-            importer.import_large_file(str(file_path), project_name)
-            files_processed += 1
-    else:
-        # No specific project specified - scan for all projects
-        base_path = os.getenv("LOGS_PATH", "/logs")
-        if os.path.exists(base_path):
-            # Scan for all project directories
-            for project_dir in Path(base_path).iterdir():
-                if project_dir.is_dir() and not project_dir.name.startswith('.'):
-                    # Look for JSONL files in this project
-                    jsonl_files = list(project_dir.glob("*.jsonl"))
-                    if jsonl_files:
-                        for jsonl_file in jsonl_files:
-                            file_size_mb = jsonl_file.stat().st_size / (1024 * 1024)
-                            logger.info(f"Processing {jsonl_file.name} ({file_size_mb:.1f} MB) from project {project_dir.name}")
-                            importer.import_large_file(str(jsonl_file), project_dir.name)
-
-    logger.info(f"Streaming import complete! Total chunks: {importer.total_imported}")
-
-if __name__ == "__main__":
-    main()
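The memory-saving core of the removed streaming importer is its buffered generator: lines are parsed one at a time and yielded in fixed-size batches rather than reading the whole JSONL file at once. A stripped-down sketch of that pattern, with a placeholder path and without the session filtering or Claude-specific content handling of the original:

# Stripped-down sketch of the buffered JSONL streaming pattern used above.
# The path and buffer size are placeholders.
import json
from typing import Any, Dict, Generator, List

def stream_jsonl(path: str, buffer_size: int = 100) -> Generator[List[Dict[str, Any]], None, None]:
    buffer: List[Dict[str, Any]] = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                buffer.append(json.loads(line))
            except json.JSONDecodeError:
                continue  # skip malformed lines, as the importer did
            if len(buffer) >= buffer_size:
                yield buffer
                buffer = []
    if buffer:  # flush whatever is left
        yield buffer

# Example usage with a placeholder path:
# for batch in stream_jsonl("/logs/example-project/abc123.jsonl", buffer_size=100):
#     print(len(batch), "records")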