claude-self-reflect 2.6.0 → 2.7.2

This diff reflects the changes between publicly released package versions as they appear in their respective public registries and is provided for informational purposes only.
@@ -0,0 +1,374 @@
+ #!/usr/bin/env python3
+ """
+ Streaming importer with true line-by-line processing to prevent OOM.
+ Processes JSONL files without loading the entire file into memory.
+ """
+
+ import json
+ import os
+ import sys
+ import hashlib
+ import gc
+ from pathlib import Path
+ from datetime import datetime
+ from typing import List, Dict, Any, Optional
+ import logging
+
+ # Add the project root to the Python path
+ project_root = Path(__file__).parent.parent
+ sys.path.insert(0, str(project_root))
+
+ from qdrant_client import QdrantClient
+ from qdrant_client.models import PointStruct, Distance, VectorParams
+
+ # Set up logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+ # Environment variables
+ QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
+ STATE_FILE = os.getenv("STATE_FILE", "/config/imported-files.json")
+ PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "true").lower() == "true"
+ VOYAGE_API_KEY = os.getenv("VOYAGE_KEY")
+ MAX_CHUNK_SIZE = int(os.getenv("MAX_CHUNK_SIZE", "50"))  # Messages per chunk
+
+ # Initialize Qdrant client
+ client = QdrantClient(url=QDRANT_URL)
+
+ # Initialize embedding provider
+ embedding_provider = None
+ embedding_dimension = None
+
+ if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
+     logger.info("Using local embeddings (fastembed)")
+     from fastembed import TextEmbedding
+     embedding_provider = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
+     embedding_dimension = 384
+     collection_suffix = "local"
+ else:
+     logger.info("Using Voyage AI embeddings")
+     import voyageai
+     embedding_provider = voyageai.Client(api_key=VOYAGE_API_KEY)
+     embedding_dimension = 1024
+     collection_suffix = "voyage"
+
+ def normalize_project_name(project_name: str) -> str:
+     """Normalize project name for consistency."""
+     return project_name.replace("-Users-ramakrishnanannaswamy-projects-", "").replace("-", "_").lower()
+
+ def get_collection_name(project_path: Path) -> str:
+     """Generate collection name from project path."""
+     normalized = normalize_project_name(project_path.name)
+     name_hash = hashlib.md5(normalized.encode()).hexdigest()[:8]
+     return f"conv_{name_hash}_{collection_suffix}"
+
+ def ensure_collection(collection_name: str):
+     """Ensure collection exists with correct configuration."""
+     collections = client.get_collections().collections
+     if not any(c.name == collection_name for c in collections):
+         logger.info(f"Creating collection: {collection_name}")
+         client.create_collection(
+             collection_name=collection_name,
+             vectors_config=VectorParams(size=embedding_dimension, distance=Distance.COSINE)
+         )
+
+ def generate_embeddings(texts: List[str]) -> List[List[float]]:
+     """Generate embeddings for texts."""
+     if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
+         embeddings = list(embedding_provider.passage_embed(texts))
+         return [emb.tolist() if hasattr(emb, 'tolist') else emb for emb in embeddings]
+     else:
+         response = embedding_provider.embed(texts, model="voyage-3")
+         return response.embeddings
+
+ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
+                              conversation_id: str, created_at: str,
+                              metadata: Dict[str, Any], collection_name: str,
+                              project_path: Path) -> int:
+     """Process and immediately upload a single chunk."""
+     if not messages:
+         return 0
+
+     # Extract text content
+     texts = []
+     for msg in messages:
+         role = msg.get("role", "unknown")
+         content = msg.get("content", "")
+         if content:
+             texts.append(f"{role.upper()}: {content}")
+
+     if not texts:
+         return 0
+
+     chunk_text = "\n".join(texts)
+
+     try:
+         # Generate embedding
+         embeddings = generate_embeddings([chunk_text])
+
+         # Create point ID
+         point_id = hashlib.md5(
+             f"{conversation_id}_{chunk_index}".encode()
+         ).hexdigest()[:16]
+
+         # Create payload
+         payload = {
+             "text": chunk_text,
+             "conversation_id": conversation_id,
+             "chunk_index": chunk_index,
+             "timestamp": created_at,
+             "project": normalize_project_name(project_path.name),
+             "start_role": messages[0].get("role", "unknown") if messages else "unknown",
+             "message_count": len(messages)
+         }
+
+         # Add metadata
+         if metadata:
+             payload.update(metadata)
+
+         # Create point
+         point = PointStruct(
+             id=int(point_id, 16) % (2**63),
+             vector=embeddings[0],
+             payload=payload
+         )
+
+         # Upload immediately
+         client.upsert(
+             collection_name=collection_name,
+             points=[point],
+             wait=True
+         )
+
+         return 1
+
+     except Exception as e:
+         logger.error(f"Error processing chunk {chunk_index}: {e}")
+         return 0
+
+ def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str]:
+     """Extract metadata in a single pass, return metadata and first timestamp."""
+     metadata = {
+         "files_analyzed": [],
+         "files_edited": [],
+         "tools_used": [],
+         "concepts": []
+     }
+
+     first_timestamp = None
+
+     try:
+         with open(file_path, 'r', encoding='utf-8') as f:
+             for line in f:
+                 if not line.strip():
+                     continue
+
+                 try:
+                     data = json.loads(line)
+
+                     # Get timestamp from first valid entry
+                     if first_timestamp is None and 'timestamp' in data:
+                         first_timestamp = data.get('timestamp')
+
+                     # Extract tool usage from messages
+                     if 'message' in data and data['message']:
+                         msg = data['message']
+                         if msg.get('content'):
+                             content = msg['content']
+                             if isinstance(content, list):
+                                 for item in content:
+                                     if isinstance(item, dict) and item.get('type') == 'tool_use':
+                                         tool_name = item.get('name', '')
+                                         if tool_name and tool_name not in metadata['tools_used']:
+                                             metadata['tools_used'].append(tool_name)
+
+                                         # Extract file references
+                                         if 'input' in item:
+                                             input_data = item['input']
+                                             if isinstance(input_data, dict):
+                                                 if 'file_path' in input_data:
+                                                     file_ref = input_data['file_path']
+                                                     if file_ref not in metadata['files_analyzed']:
+                                                         metadata['files_analyzed'].append(file_ref)
+                                                 if 'path' in input_data:
+                                                     file_ref = input_data['path']
+                                                     if file_ref not in metadata['files_analyzed']:
+                                                         metadata['files_analyzed'].append(file_ref)
+
+                 except json.JSONDecodeError:
+                     continue
+                 except Exception:
+                     continue
+
+     except Exception as e:
+         logger.warning(f"Error extracting metadata: {e}")
+
+     return metadata, first_timestamp or datetime.now().isoformat()
+
+ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Path) -> int:
+     """Stream import a single JSONL file without loading it into memory."""
+     logger.info(f"Streaming import of {jsonl_file.name}")
+
+     # Extract metadata in first pass (lightweight)
+     metadata, created_at = extract_metadata_single_pass(str(jsonl_file))
+
+     # Stream messages and process in chunks
+     chunk_buffer = []
+     chunk_index = 0
+     total_chunks = 0
+     conversation_id = jsonl_file.stem
+
+     try:
+         with open(jsonl_file, 'r', encoding='utf-8') as f:
+             for line_num, line in enumerate(f, 1):
+                 line = line.strip()
+                 if not line:
+                     continue
+
+                 try:
+                     data = json.loads(line)
+
+                     # Skip non-message lines
+                     if data.get('type') == 'summary':
+                         continue
+
+                     # Extract message if present
+                     if 'message' in data and data['message']:
+                         msg = data['message']
+                         if msg.get('role') and msg.get('content'):
+                             # Extract content
+                             content = msg['content']
+                             if isinstance(content, list):
+                                 text_parts = []
+                                 for item in content:
+                                     if isinstance(item, dict) and item.get('type') == 'text':
+                                         text_parts.append(item.get('text', ''))
+                                     elif isinstance(item, str):
+                                         text_parts.append(item)
+                                 content = '\n'.join(text_parts)
+
+                             if content:
+                                 chunk_buffer.append({
+                                     'role': msg['role'],
+                                     'content': content
+                                 })
+
+                                 # Process chunk when buffer reaches MAX_CHUNK_SIZE
+                                 if len(chunk_buffer) >= MAX_CHUNK_SIZE:
+                                     chunks = process_and_upload_chunk(
+                                         chunk_buffer, chunk_index, conversation_id,
+                                         created_at, metadata, collection_name, project_path
+                                     )
+                                     total_chunks += chunks
+                                     chunk_buffer = []
+                                     chunk_index += 1
+
+                                     # Force garbage collection after each chunk
+                                     gc.collect()
+
+                                     # Log progress
+                                     if chunk_index % 10 == 0:
+                                         logger.info(f"Processed {chunk_index} chunks from {jsonl_file.name}")
+
+                 except json.JSONDecodeError:
+                     logger.debug(f"Skipping invalid JSON at line {line_num}")
+                 except Exception as e:
+                     logger.debug(f"Error processing line {line_num}: {e}")
+
+         # Process remaining messages
+         if chunk_buffer:
+             chunks = process_and_upload_chunk(
+                 chunk_buffer, chunk_index, conversation_id,
+                 created_at, metadata, collection_name, project_path
+             )
+             total_chunks += chunks
+
+         logger.info(f"Imported {total_chunks} chunks from {jsonl_file.name}")
+         return total_chunks
+
+     except Exception as e:
+         logger.error(f"Failed to import {jsonl_file}: {e}")
+         return 0
+
+ def load_state() -> dict:
+     """Load import state."""
+     if os.path.exists(STATE_FILE):
+         try:
+             with open(STATE_FILE, 'r') as f:
+                 return json.load(f)
+         except Exception:  # Corrupt or unreadable state file; start fresh
+             pass
+     return {"imported_files": {}}
+
+ def save_state(state: dict):
+     """Save import state."""
+     os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
+     with open(STATE_FILE, 'w') as f:
+         json.dump(state, f, indent=2)
+
+ def should_import_file(file_path: Path, state: dict) -> bool:
+     """Check if file should be imported."""
+     file_str = str(file_path)
+     if file_str in state.get("imported_files", {}):
+         file_info = state["imported_files"][file_str]
+         last_modified = file_path.stat().st_mtime
+         if file_info.get("last_modified") == last_modified:
+             logger.info(f"Skipping unchanged file: {file_path.name}")
+             return False
+     return True
+
+ def update_file_state(file_path: Path, state: dict, chunks: int):
+     """Update state for imported file."""
+     file_str = str(file_path)
+     state["imported_files"][file_str] = {
+         "imported_at": datetime.now().isoformat(),
+         "last_modified": file_path.stat().st_mtime,
+         "chunks": chunks
+     }
+
+ def main():
+     """Main import function."""
+     # Load state
+     state = load_state()
+     logger.info(f"Loaded state with {len(state.get('imported_files', {}))} previously imported files")
+
+     # Find all projects
+     logs_dir = Path(os.getenv("LOGS_DIR", "/logs"))
+     project_dirs = [d for d in logs_dir.iterdir() if d.is_dir()]
+     logger.info(f"Found {len(project_dirs)} projects to import")
+
+     total_imported = 0
+
+     for project_dir in project_dirs:
+         # Get collection name
+         collection_name = get_collection_name(project_dir)
+         logger.info(f"Importing project: {project_dir.name} -> {collection_name}")
+
+         # Ensure collection exists
+         ensure_collection(collection_name)
+
+         # Find JSONL files
+         jsonl_files = sorted(project_dir.glob("*.jsonl"))
+
+         # Limit files per cycle if specified
+         max_files = int(os.getenv("MAX_FILES_PER_CYCLE", "1000"))
+         jsonl_files = jsonl_files[:max_files]
+
+         for jsonl_file in jsonl_files:
+             if should_import_file(jsonl_file, state):
+                 chunks = stream_import_file(jsonl_file, collection_name, project_dir)
+                 if chunks > 0:
+                     update_file_state(jsonl_file, state, chunks)
+                     save_state(state)
+                     total_imported += 1
+
+             # Force GC after each file
+             gc.collect()
+
+     logger.info(f"Import complete: processed {total_imported} files")
+
+ if __name__ == "__main__":
+     main()
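
The importer above is configured entirely through environment variables (QDRANT_URL, STATE_FILE, PREFER_LOCAL_EMBEDDINGS, VOYAGE_KEY, MAX_CHUNK_SIZE, LOGS_DIR, MAX_FILES_PER_CYCLE), with defaults that assume a containerized layout (/logs, /config). As a minimal sketch of running it outside that environment, the wrapper below sets the variables and invokes the script as a subprocess; the script filename "streaming_importer.py" and the paths shown are illustrative assumptions, not values defined by the package.

# Hypothetical wrapper: run the streaming importer against a local Qdrant
# instance using local (fastembed) embeddings. Filename and paths are assumptions.
import os
import subprocess

env = dict(
    os.environ,
    QDRANT_URL="http://localhost:6333",                   # Qdrant endpoint the importer connects to
    PREFER_LOCAL_EMBEDDINGS="true",                       # use fastembed instead of Voyage AI
    LOGS_DIR=os.path.expanduser("~/.claude/projects"),    # assumed location of Claude JSONL logs
    STATE_FILE="./config/imported-files.json",            # keep import state in a local file
    MAX_CHUNK_SIZE="50",                                  # messages per uploaded chunk
    MAX_FILES_PER_CYCLE="100",                            # cap on files processed per run
)

# "streaming_importer.py" is a placeholder for wherever this script is installed.
subprocess.run(["python", "streaming_importer.py"], env=env, check=True)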