claude-self-reflect 2.3.6 → 2.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,368 +0,0 @@
- #!/usr/bin/env python3
- """
- Streaming import for large Claude conversation logs.
- Processes files in chunks without loading entire file into memory.
- """
-
- import json
- import os
- import sys
- import time
- import hashlib
- from datetime import datetime
- from typing import List, Dict, Any, Generator
- import logging
- import backoff
- import requests
- from qdrant_client import QdrantClient
- from qdrant_client.models import VectorParams, Distance, PointStruct
- from pathlib import Path
-
- # Set up logging
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
- logger = logging.getLogger(__name__)
-
- # Configuration
- QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
- STATE_FILE = os.getenv("STATE_FILE", "./config-isolated/imported-files.json")
- LOGS_DIR = os.getenv("LOGS_DIR", os.path.expanduser("~/.claude/projects"))
- VOYAGE_API_KEY = os.getenv("VOYAGE_KEY_2") or os.getenv("VOYAGE_KEY")
- BATCH_SIZE = int(os.getenv("BATCH_SIZE", "50"))
- CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "10"))
- STREAMING_BUFFER_SIZE = 100  # Process every 100 messages
- RATE_LIMIT_DELAY = 0.1
- EMBEDDING_MODEL = "voyage-3-large"
- EMBEDDING_DIMENSIONS = 1024
- VOYAGE_API_URL = "https://api.voyageai.com/v1/embeddings"
-
- class StreamingVoyageImporter:
-     def __init__(self):
-         """Initialize the streaming importer."""
-         if not VOYAGE_API_KEY:
-             raise ValueError("VOYAGE_KEY environment variable not set")
-
-         self.qdrant_client = QdrantClient(url=QDRANT_URL)
-         self.state = self._load_state()
-         self.total_imported = 0
-         self.total_errors = 0
-
-         logger.info(f"Connected to Qdrant at {QDRANT_URL}")
-
-     def _load_state(self) -> Dict[str, Any]:
-         """Load import state from file."""
-         default_state = {
-             "projects": {},
-             "last_updated": None,
-             "mode": "isolated",
-             "total_imported": 0
-         }
-
-         if os.path.exists(STATE_FILE):
-             try:
-                 with open(STATE_FILE, 'r') as f:
-                     return json.load(f)
-             except Exception as e:
-                 logger.error(f"Failed to load state: {e}")
-
-         return default_state
-
-     def _save_state(self):
-         """Save import state to file."""
-         self.state["last_updated"] = datetime.now().isoformat()
-         self.state["total_imported"] = self.total_imported
-
-         os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
-         with open(STATE_FILE, 'w') as f:
-             json.dump(self.state, f, indent=2)
-
-     @backoff.on_exception(
-         backoff.expo,
-         Exception,
-         max_tries=5,
-         on_backoff=lambda details: logger.warning(f"Backing off {details['wait']}s after {details['tries']} tries")
-     )
-     def _generate_embeddings(self, texts: List[str]) -> List[List[float]]:
-         """Generate embeddings using Voyage AI API with batching."""
-         headers = {
-             "Authorization": f"Bearer {VOYAGE_API_KEY}",
-             "Content-Type": "application/json"
-         }
-
-         payload = {
-             "input": texts,
-             "model": EMBEDDING_MODEL,
-             "input_type": "document"
-         }
-
-         try:
-             response = requests.post(VOYAGE_API_URL, headers=headers, json=payload, timeout=30)
-             response.raise_for_status()
-
-             data = response.json()
-             embeddings = [item["embedding"] for item in data["data"]]
-
-             # Add small delay to respect rate limits
-             time.sleep(RATE_LIMIT_DELAY)
-
-             return embeddings
-
-         except requests.Timeout:
-             logger.error("Voyage API request timed out after 30 seconds")
-             raise
-         except Exception as e:
-             logger.error(f"Voyage API error: {e}")
-             raise
-
-     def stream_jsonl_messages(self, file_path: str, buffer_size: int = STREAMING_BUFFER_SIZE) -> Generator[List[Dict[str, Any]], None, None]:
-         """Stream messages from JSONL file in buffers without loading entire file."""
-         buffer = []
-         line_count = 0
-         total_lines = 0
-         skipped_lines = 0
-
-         # Extract expected session ID from filename
-         expected_session_id = os.path.splitext(os.path.basename(file_path))[0]
-         logger.info(f"Starting to stream file: {os.path.basename(file_path)} (expecting session: {expected_session_id})")
-
-         try:
-             with open(file_path, 'r', encoding='utf-8') as f:
-                 for line_num, line in enumerate(f, 1):
-                     total_lines = line_num
-                     line = line.strip()
-                     if not line:
-                         continue
-
-                     try:
-                         data = json.loads(line)
-
-                         # Check session ID matches expected
-                         session_id = data.get('sessionId', '')
-                         if session_id != expected_session_id:
-                             skipped_lines += 1
-                             logger.debug(f"Skipping line {line_num}: different session ID ({session_id})")
-                             continue
-
-                         # Extract message if present
-                         if 'message' in data and data['message']:
-                             msg = data['message']
-                             if msg.get('role') and msg.get('content'):
-                                 content = msg['content']
-                                 # Handle content array (common in Claude messages)
-                                 if isinstance(content, list) and len(content) > 0:
-                                     # Extract text from first content item
-                                     content_item = content[0]
-                                     if isinstance(content_item, dict):
-                                         content = content_item.get('text', str(content_item))
-                                 elif isinstance(content, dict):
-                                     content = content.get('text', json.dumps(content))
-
-                                 buffer.append({
-                                     'role': msg['role'],
-                                     'content': content,
-                                     'file_path': file_path,
-                                     'line_number': line_num,
-                                     'timestamp': data.get('timestamp', datetime.now().isoformat())
-                                 })
-                                 line_count += 1
-
-                                 # Yield buffer when it reaches the specified size
-                                 if len(buffer) >= buffer_size:
-                                     logger.info(f"Buffer full, yielding {len(buffer)} messages (total so far: {line_count})")
-                                     yield buffer
-                                     buffer = []
-
-                     except json.JSONDecodeError:
-                         logger.debug(f"Skipping invalid JSON at line {line_num}")
-                         skipped_lines += 1
-                     except Exception as e:
-                         logger.error(f"Error processing line {line_num}: {e}")
-                         skipped_lines += 1
-
-             # Yield any remaining messages
-             if buffer:
-                 logger.info(f"Yielding final buffer with {len(buffer)} messages")
-                 yield buffer
-
-             logger.info(f"Completed streaming file: processed {total_lines} lines, {line_count} messages, {skipped_lines} skipped")
-
-         except Exception as e:
-             logger.error(f"Failed to read file {file_path}: {e}")
-
-     def process_message_buffer(self, messages: List[Dict[str, Any]], project_name: str, collection_name: str, conversation_id: str):
-         """Process a buffer of messages into chunks and import them."""
-         chunks = []
-
-         # Create chunks from message buffer
-         for i in range(0, len(messages), CHUNK_SIZE):
-             chunk_messages = messages[i:i + CHUNK_SIZE]
-
-             # Create conversation text
-             conversation_text = "\n\n".join([
-                 f"{msg['role'].upper()}: {msg['content'][:500]}"
-                 for msg in chunk_messages
-             ])
-
-             # Add metadata
-             timestamps = [msg['timestamp'] for msg in chunk_messages]
-             first_timestamp = min(timestamps) if timestamps else datetime.now().isoformat()
-
-             chunk_id = hashlib.md5(
-                 f"{conversation_id}_{first_timestamp}_{len(chunks)}".encode()
-             ).hexdigest()
-
-             chunks.append({
-                 'id': chunk_id,
-                 'text': conversation_text,
-                 'metadata': {
-                     'project': project_name,
-                     'conversation_id': conversation_id,
-                     'timestamp': first_timestamp,
-                     'chunk_index': len(chunks),
-                     'message_count': len(chunk_messages),
-                     'roles': list(set(msg['role'] for msg in chunk_messages))
-                 }
-             })
-
-         # Import chunks if we have any
-         if chunks:
-             self._import_chunks_to_qdrant(chunks, collection_name)
-
-     def _import_chunks_to_qdrant(self, chunks: List[Dict[str, Any]], collection_name: str):
-         """Import conversation chunks to Qdrant."""
-         if not chunks:
-             return
-
-         # Process in batches
-         for i in range(0, len(chunks), BATCH_SIZE):
-             batch = chunks[i:i + BATCH_SIZE]
-             texts = [chunk['text'] for chunk in batch]
-
-             try:
-                 # Generate embeddings
-                 embeddings = self._generate_embeddings(texts)
-
-                 # Create points
-                 points = []
-                 for chunk, embedding in zip(batch, embeddings):
-                     # Include both text and metadata in payload
-                     payload = chunk['metadata'].copy()
-                     payload['text'] = chunk['text']
-
-                     points.append(PointStruct(
-                         id=chunk['id'],
-                         vector=embedding,
-                         payload=payload
-                     ))
-
-                 # Upsert to Qdrant
-                 self.qdrant_client.upsert(
-                     collection_name=collection_name,
-                     points=points,
-                     wait=True
-                 )
-
-                 self.total_imported += len(points)
-                 logger.info(f"Imported batch of {len(points)} chunks (total: {self.total_imported})")
-
-             except Exception as e:
-                 logger.error(f"Failed to import batch: {e}")
-                 self.total_errors += 1
-
-     def import_large_file(self, file_path: str, project_name: str):
-         """Import a large JSONL file using streaming."""
-         logger.info(f"🚀 Starting streaming import of {os.path.basename(file_path)}")
-
-         # Get collection name
-         project_hash = hashlib.md5(project_name.encode()).hexdigest()[:8]
-         collection_name = f"conv_{project_hash}_voyage"
-
-         # Ensure collection exists
-         collections = [c.name for c in self.qdrant_client.get_collections().collections]
-         if collection_name not in collections:
-             logger.info(f"Creating collection: {collection_name}")
-             self.qdrant_client.create_collection(
-                 collection_name=collection_name,
-                 vectors_config=VectorParams(size=EMBEDDING_DIMENSIONS, distance=Distance.COSINE)
-             )
-
-         # Extract conversation ID from filename
-         conversation_id = os.path.splitext(os.path.basename(file_path))[0]
-
-         # Stream and process the file
-         chunk_count = 0
-         message_count = 0
-
-         try:
-             logger.info(f"Starting to process chunks from generator")
-             for message_buffer in self.stream_jsonl_messages(file_path):
-                 logger.info(f"Received buffer with {len(message_buffer)} messages")
-                 self.process_message_buffer(message_buffer, project_name, collection_name, conversation_id)
-                 chunk_count += 1
-                 message_count += len(message_buffer)
-                 logger.info(f"Processed chunk {chunk_count} with {len(message_buffer)} messages (total: {message_count})")
-
-                 # Save state periodically
-                 if chunk_count % 10 == 0:
-                     self._save_state()
-
-             # Log final statistics
-             logger.info(f"Finished processing {chunk_count} chunks with {message_count} total messages")
-
-             # Mark file as imported
-             if project_name not in self.state["projects"]:
-                 self.state["projects"][project_name] = []
-             if file_path not in self.state["projects"][project_name]:
-                 self.state["projects"][project_name].append(file_path)
-
-             self._save_state()
-             logger.info(f"✅ Completed streaming import of {os.path.basename(file_path)} - {chunk_count} chunks, {message_count} messages, {self.total_imported} vectors")
-
-         except Exception as e:
-             logger.error(f"Error during streaming import: {e}")
-             raise
-
- def main():
-     """Main entry point for streaming import."""
-     import sys
-     import argparse
-
-     parser = argparse.ArgumentParser(description="Streaming import for large conversation files")
-     parser.add_argument("--project", help="Project directory path")
-     parser.add_argument("--limit", type=int, help="Limit number of files to process")
-     args = parser.parse_args()
-
-     importer = StreamingVoyageImporter()
-
-     # If project path is provided via command line
-     if args.project and os.path.exists(args.project):
-         project_name = os.path.basename(args.project)
-         files_processed = 0
-
-         # Find all JSONL files in the project
-         for file_path in Path(args.project).glob("*.jsonl"):
-             if args.limit and files_processed >= args.limit:
-                 break
-
-             file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
-             logger.info(f"Processing {file_path.name} ({file_size_mb:.1f} MB)")
-             importer.import_large_file(str(file_path), project_name)
-             files_processed += 1
-     else:
-         # No specific project specified - scan for all projects
-         base_path = os.getenv("LOGS_PATH", "/logs")
-         if os.path.exists(base_path):
-             # Scan for all project directories
-             for project_dir in Path(base_path).iterdir():
-                 if project_dir.is_dir() and not project_dir.name.startswith('.'):
-                     # Look for JSONL files in this project
-                     jsonl_files = list(project_dir.glob("*.jsonl"))
-                     if jsonl_files:
-                         for jsonl_file in jsonl_files:
-                             file_size_mb = jsonl_file.stat().st_size / (1024 * 1024)
-                             logger.info(f"Processing {jsonl_file.name} ({file_size_mb:.1f} MB) from project {project_dir.name}")
-                             importer.import_large_file(str(jsonl_file), project_dir.name)
-
-     logger.info(f"Streaming import complete! Total chunks: {importer.total_imported}")
-
- if __name__ == "__main__":
-     main()
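
For reference, the removed script streams each JSONL session file without loading it into memory: it buffers up to 100 parsed messages at a time (STREAMING_BUFFER_SIZE), joins every 10 messages into one chunk of text (CHUNK_SIZE), embeds chunks in batches of 50 (BATCH_SIZE) via the Voyage AI API, and upserts the resulting vectors into a per-project Qdrant collection named conv_<md5-prefix>_voyage. The sketch below is a minimal, self-contained illustration of only the buffer-and-chunk portion of that flow; it is not part of the package, and the file name session.jsonl and the helper name stream_messages are placeholders.

import json
from pathlib import Path
from typing import Any, Dict, Generator, List

BUFFER_SIZE = 100  # mirrors STREAMING_BUFFER_SIZE in the removed script
CHUNK_SIZE = 10    # mirrors CHUNK_SIZE in the removed script


def stream_messages(path: Path, buffer_size: int = BUFFER_SIZE) -> Generator[List[Dict[str, Any]], None, None]:
    """Yield buffers of parsed JSONL records without reading the whole file into memory."""
    buffer: List[Dict[str, Any]] = []
    with path.open(encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                buffer.append(json.loads(line))
            except json.JSONDecodeError:
                continue  # skip malformed lines, as the removed importer does
            if len(buffer) >= buffer_size:
                yield buffer
                buffer = []
    if buffer:
        yield buffer  # flush the final partial buffer


if __name__ == "__main__":
    sample = Path("session.jsonl")  # placeholder path, not shipped with the package
    if sample.exists():
        for buf in stream_messages(sample):
            # The removed importer joins every CHUNK_SIZE records into one text blob,
            # embeds each batch with Voyage AI, and upserts the vectors into Qdrant.
            chunks = [buf[i:i + CHUNK_SIZE] for i in range(0, len(buf), CHUNK_SIZE)]
            print(f"buffer of {len(buf)} records -> {len(chunks)} chunks")

Pointed at a real session file (by default the importer reads them from ~/.claude/projects), each printed line corresponds to one "Buffer full, yielding N messages" entry that the removed script would have logged.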