claude-self-reflect 2.6.0 → 2.7.2
This diff reflects the changes between publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
- package/.env.example +19 -18
- package/Dockerfile.importer +6 -2
- package/Dockerfile.safe-watcher +44 -0
- package/README.md +3 -1
- package/docker-compose.yaml +43 -11
- package/mcp-server/pyproject.toml +1 -1
- package/mcp-server/src/project_resolver.py +527 -0
- package/mcp-server/src/server.py +14 -10
- package/mcp-server/src/utils.py +20 -3
- package/package.json +2 -2
- package/scripts/import-conversations-unified.backup.py +374 -0
- package/scripts/import-conversations-unified.py +297 -723
- package/scripts/import-latest.py +124 -0
package/scripts/import-conversations-unified.backup.py
@@ -0,0 +1,374 @@
+#!/usr/bin/env python3
+"""
+Streaming importer with true line-by-line processing to prevent OOM.
+Processes JSONL files without loading entire file into memory.
+"""
+
+import json
+import os
+import sys
+import hashlib
+import gc
+from pathlib import Path
+from datetime import datetime
+from typing import List, Dict, Any, Optional
+import logging
+
+# Add the project root to the Python path
+project_root = Path(__file__).parent.parent
+sys.path.insert(0, str(project_root))
+
+from qdrant_client import QdrantClient
+from qdrant_client.models import PointStruct, Distance, VectorParams
+
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+# Environment variables
+QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
+STATE_FILE = os.getenv("STATE_FILE", "/config/imported-files.json")
+PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "true").lower() == "true"
+VOYAGE_API_KEY = os.getenv("VOYAGE_KEY")
+MAX_CHUNK_SIZE = int(os.getenv("MAX_CHUNK_SIZE", "50"))  # Messages per chunk
+
+# Initialize Qdrant client
+client = QdrantClient(url=QDRANT_URL)
+
+# Initialize embedding provider
+embedding_provider = None
+embedding_dimension = None
+
+if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
+    logger.info("Using local embeddings (fastembed)")
+    from fastembed import TextEmbedding
+    embedding_provider = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    embedding_dimension = 384
+    collection_suffix = "local"
+else:
+    logger.info("Using Voyage AI embeddings")
+    import voyageai
+    embedding_provider = voyageai.Client(api_key=VOYAGE_API_KEY)
+    embedding_dimension = 1024
+    collection_suffix = "voyage"
+
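
A minimal sketch of the provider selection above, assuming the same environment variables: the local fastembed path wins unless PREFER_LOCAL_EMBEDDINGS is explicitly set to "false" and a VOYAGE_KEY is present.

    import os

    # Mirrors the toggle above (a sketch, with hypothetical env values):
    use_local = (
        os.getenv("PREFER_LOCAL_EMBEDDINGS", "true").lower() == "true"
        or not os.getenv("VOYAGE_KEY")
    )
    # use_local -> 384-dim MiniLM vectors, collections suffixed "_local"
    # otherwise -> 1024-dim voyage-3 vectors, collections suffixed "_voyage"
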
+def normalize_project_name(project_name: str) -> str:
+    """Normalize project name for consistency."""
+    return project_name.replace("-Users-ramakrishnanannaswamy-projects-", "").replace("-", "_").lower()
+
+def get_collection_name(project_path: Path) -> str:
+    """Generate collection name from project path."""
+    normalized = normalize_project_name(project_path.name)
+    name_hash = hashlib.md5(normalized.encode()).hexdigest()[:8]
+    return f"conv_{name_hash}_{collection_suffix}"
+
+def ensure_collection(collection_name: str):
+    """Ensure collection exists with correct configuration."""
+    collections = client.get_collections().collections
+    if not any(c.name == collection_name for c in collections):
+        logger.info(f"Creating collection: {collection_name}")
+        client.create_collection(
+            collection_name=collection_name,
+            vectors_config=VectorParams(size=embedding_dimension, distance=Distance.COSINE)
+        )
+
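
Because the name is derived from an MD5 digest of the normalized project name, the mapping is deterministic; a sketch with a hypothetical project name:

    import hashlib

    normalized = "my_project"  # hypothetical normalized project name
    name_hash = hashlib.md5(normalized.encode()).hexdigest()[:8]
    print(f"conv_{name_hash}_local")  # same project always yields the same collection
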
+def generate_embeddings(texts: List[str]) -> List[List[float]]:
+    """Generate embeddings for texts."""
+    if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
+        embeddings = list(embedding_provider.passage_embed(texts))
+        return [emb.tolist() if hasattr(emb, 'tolist') else emb for emb in embeddings]
+    else:
+        response = embedding_provider.embed(texts, model="voyage-3")
+        return response.embeddings
+
+def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
+                             conversation_id: str, created_at: str,
+                             metadata: Dict[str, Any], collection_name: str,
+                             project_path: Path) -> int:
+    """Process and immediately upload a single chunk."""
+    if not messages:
+        return 0
+
+    # Extract text content
+    texts = []
+    for msg in messages:
+        role = msg.get("role", "unknown")
+        content = msg.get("content", "")
+        if content:
+            texts.append(f"{role.upper()}: {content}")
+
+    if not texts:
+        return 0
+
+    chunk_text = "\n".join(texts)
+
+    try:
+        # Generate embedding
+        embeddings = generate_embeddings([chunk_text])
+
+        # Create point ID
+        point_id = hashlib.md5(
+            f"{conversation_id}_{chunk_index}".encode()
+        ).hexdigest()[:16]
+
+        # Create payload
+        payload = {
+            "text": chunk_text,
+            "conversation_id": conversation_id,
+            "chunk_index": chunk_index,
+            "timestamp": created_at,
+            "project": normalize_project_name(project_path.name),
+            "start_role": messages[0].get("role", "unknown") if messages else "unknown",
+            "message_count": len(messages)
+        }
+
+        # Add metadata
+        if metadata:
+            payload.update(metadata)
+
+        # Create point
+        point = PointStruct(
+            id=int(point_id, 16) % (2**63),
+            vector=embeddings[0],
+            payload=payload
+        )
+
+        # Upload immediately
+        client.upsert(
+            collection_name=collection_name,
+            points=[point],
+            wait=True
+        )
+
+        return 1
+
+    except Exception as e:
+        logger.error(f"Error processing chunk {chunk_index}: {e}")
+        return 0
+
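
The point ID above is deterministic, so re-importing the same chunk upserts over the existing point rather than duplicating it; a sketch of the derivation with hypothetical values:

    import hashlib

    conversation_id, chunk_index = "session-1234", 0  # hypothetical values
    digest = hashlib.md5(f"{conversation_id}_{chunk_index}".encode()).hexdigest()[:16]
    point_id = int(digest, 16) % (2**63)  # keeps the ID in a positive 63-bit range
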
+def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str]:
+    """Extract metadata in a single pass, return metadata and first timestamp."""
+    metadata = {
+        "files_analyzed": [],
+        "files_edited": [],
+        "tools_used": [],
+        "concepts": []
+    }
+
+    first_timestamp = None
+
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                if not line.strip():
+                    continue
+
+                try:
+                    data = json.loads(line)
+
+                    # Get timestamp from first valid entry
+                    if first_timestamp is None and 'timestamp' in data:
+                        first_timestamp = data.get('timestamp')
+
+                    # Extract tool usage from messages
+                    if 'message' in data and data['message']:
+                        msg = data['message']
+                        if msg.get('content'):
+                            content = msg['content']
+                            if isinstance(content, list):
+                                for item in content:
+                                    if isinstance(item, dict) and item.get('type') == 'tool_use':
+                                        tool_name = item.get('name', '')
+                                        if tool_name and tool_name not in metadata['tools_used']:
+                                            metadata['tools_used'].append(tool_name)
+
+                                        # Extract file references
+                                        if 'input' in item:
+                                            input_data = item['input']
+                                            if isinstance(input_data, dict):
+                                                if 'file_path' in input_data:
+                                                    file_ref = input_data['file_path']
+                                                    if file_ref not in metadata['files_analyzed']:
+                                                        metadata['files_analyzed'].append(file_ref)
+                                                if 'path' in input_data:
+                                                    file_ref = input_data['path']
+                                                    if file_ref not in metadata['files_analyzed']:
+                                                        metadata['files_analyzed'].append(file_ref)
+
+                except json.JSONDecodeError:
+                    continue
+                except Exception:
+                    continue
+
+    except Exception as e:
+        logger.warning(f"Error extracting metadata: {e}")
+
+    return metadata, first_timestamp or datetime.now().isoformat()
+
+def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Path) -> int:
+    """Stream import a single JSONL file without loading it into memory."""
+    logger.info(f"Streaming import of {jsonl_file.name}")
+
+    # Extract metadata in first pass (lightweight)
+    metadata, created_at = extract_metadata_single_pass(str(jsonl_file))
+
+    # Stream messages and process in chunks
+    chunk_buffer = []
+    chunk_index = 0
+    total_chunks = 0
+    conversation_id = jsonl_file.stem
+
+    try:
+        with open(jsonl_file, 'r', encoding='utf-8') as f:
+            for line_num, line in enumerate(f, 1):
+                line = line.strip()
+                if not line:
+                    continue
+
+                try:
+                    data = json.loads(line)
+
+                    # Skip non-message lines
+                    if data.get('type') == 'summary':
+                        continue
+
+                    # Extract message if present
+                    if 'message' in data and data['message']:
+                        msg = data['message']
+                        if msg.get('role') and msg.get('content'):
+                            # Extract content
+                            content = msg['content']
+                            if isinstance(content, list):
+                                text_parts = []
+                                for item in content:
+                                    if isinstance(item, dict) and item.get('type') == 'text':
+                                        text_parts.append(item.get('text', ''))
+                                    elif isinstance(item, str):
+                                        text_parts.append(item)
+                                content = '\n'.join(text_parts)
+
+                            if content:
+                                chunk_buffer.append({
+                                    'role': msg['role'],
+                                    'content': content
+                                })
+
+                            # Process chunk when buffer reaches MAX_CHUNK_SIZE
+                            if len(chunk_buffer) >= MAX_CHUNK_SIZE:
+                                chunks = process_and_upload_chunk(
+                                    chunk_buffer, chunk_index, conversation_id,
+                                    created_at, metadata, collection_name, project_path
+                                )
+                                total_chunks += chunks
+                                chunk_buffer = []
+                                chunk_index += 1
+
+                                # Force garbage collection after each chunk
+                                gc.collect()
+
+                                # Log progress
+                                if chunk_index % 10 == 0:
+                                    logger.info(f"Processed {chunk_index} chunks from {jsonl_file.name}")
+
+                except json.JSONDecodeError:
+                    logger.debug(f"Skipping invalid JSON at line {line_num}")
+                except Exception as e:
+                    logger.debug(f"Error processing line {line_num}: {e}")
+
+        # Process remaining messages
+        if chunk_buffer:
+            chunks = process_and_upload_chunk(
+                chunk_buffer, chunk_index, conversation_id,
+                created_at, metadata, collection_name, project_path
+            )
+            total_chunks += chunks
+
+        logger.info(f"Imported {total_chunks} chunks from {jsonl_file.name}")
+        return total_chunks
+
+    except Exception as e:
+        logger.error(f"Failed to import {jsonl_file}: {e}")
+        return 0
+
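
With the default MAX_CHUNK_SIZE of 50, the loop above uploads one point per 50 extracted messages plus one for any remainder; for example:

    extracted_messages = 123                     # hypothetical message count
    full, remainder = divmod(extracted_messages, 50)
    points = full + (1 if remainder else 0)      # -> 3 points
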
+def load_state() -> dict:
+    """Load import state."""
+    if os.path.exists(STATE_FILE):
+        try:
+            with open(STATE_FILE, 'r') as f:
+                return json.load(f)
+        except:
+            pass
+    return {"imported_files": {}}
+
+def save_state(state: dict):
+    """Save import state."""
+    os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
+    with open(STATE_FILE, 'w') as f:
+        json.dump(state, f, indent=2)
+
+def should_import_file(file_path: Path, state: dict) -> bool:
+    """Check if file should be imported."""
+    file_str = str(file_path)
+    if file_str in state.get("imported_files", {}):
+        file_info = state["imported_files"][file_str]
+        last_modified = file_path.stat().st_mtime
+        if file_info.get("last_modified") == last_modified:
+            logger.info(f"Skipping unchanged file: {file_path.name}")
+            return False
+    return True
+
+def update_file_state(file_path: Path, state: dict, chunks: int):
+    """Update state for imported file."""
+    file_str = str(file_path)
+    state["imported_files"][file_str] = {
+        "imported_at": datetime.now().isoformat(),
+        "last_modified": file_path.stat().st_mtime,
+        "chunks": chunks
+    }
+
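
The state helpers above round-trip a JSON document of roughly this shape (hypothetical path and values):

    state = {
        "imported_files": {
            "/logs/my-project/session.jsonl": {
                "imported_at": "2025-01-01T00:00:00",
                "last_modified": 1735689600.0,
                "chunks": 3
            }
        }
    }
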
+def main():
+    """Main import function."""
+    # Load state
+    state = load_state()
+    logger.info(f"Loaded state with {len(state.get('imported_files', {}))} previously imported files")
+
+    # Find all projects
+    logs_dir = Path(os.getenv("LOGS_DIR", "/logs"))
+    project_dirs = [d for d in logs_dir.iterdir() if d.is_dir()]
+    logger.info(f"Found {len(project_dirs)} projects to import")
+
+    total_imported = 0
+
+    for project_dir in project_dirs:
+        # Get collection name
+        collection_name = get_collection_name(project_dir)
+        logger.info(f"Importing project: {project_dir.name} -> {collection_name}")
+
+        # Ensure collection exists
+        ensure_collection(collection_name)
+
+        # Find JSONL files
+        jsonl_files = sorted(project_dir.glob("*.jsonl"))
+
+        # Limit files per cycle if specified
+        max_files = int(os.getenv("MAX_FILES_PER_CYCLE", "1000"))
+        jsonl_files = jsonl_files[:max_files]
+
+        for jsonl_file in jsonl_files:
+            if should_import_file(jsonl_file, state):
+                chunks = stream_import_file(jsonl_file, collection_name, project_dir)
+                if chunks > 0:
+                    update_file_state(jsonl_file, state, chunks)
+                    save_state(state)
+                    total_imported += 1
+
+                # Force GC after each file
+                gc.collect()
+
+    logger.info(f"Import complete: processed {total_imported} files")
+
+if __name__ == "__main__":
+    main()
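
A minimal way to exercise the script, assuming a reachable Qdrant instance and the default directory layout (paths here are illustrative):

    import os, runpy

    os.environ.setdefault("QDRANT_URL", "http://localhost:6333")
    os.environ.setdefault("LOGS_DIR", "/logs")        # one subdirectory per project
    os.environ.setdefault("STATE_FILE", "/config/imported-files.json")
    runpy.run_path("package/scripts/import-conversations-unified.backup.py",
                   run_name="__main__")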