claude-self-reflect 2.5.19 → 2.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +19 -18
- package/Dockerfile.importer +6 -2
- package/Dockerfile.safe-watcher +44 -0
- package/README.md +31 -1
- package/docker-compose.yaml +43 -11
- package/mcp-server/pyproject.toml +1 -1
- package/mcp-server/src/project_resolver.py +527 -0
- package/mcp-server/src/server.py +14 -10
- package/mcp-server/src/utils.py +20 -3
- package/package.json +7 -1
- package/scripts/import-conversations-unified.backup.py +374 -0
- package/scripts/import-conversations-unified.py +305 -560
- package/scripts/import-latest.py +124 -0
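The headline change in the diff below is the rewrite of package/scripts/import-conversations-unified.py from a two-pass, whole-file importer into a streaming importer. As a rough sketch of the buffering pattern the new script adopts, the snippet below reads one JSONL line at a time and flushes every MAX_CHUNK_SIZE messages; upload_chunk is a hypothetical stand-in for the script's process_and_upload_chunk, and the Qdrant/embedding wiring is intentionally omitted:

import json
from pathlib import Path

MAX_CHUNK_SIZE = 50  # messages per chunk, mirroring the new script's default

def upload_chunk(messages, chunk_index):
    # Stand-in for process_and_upload_chunk(): the real script embeds the
    # chunk text and upserts a single point into Qdrant before moving on.
    print(f"chunk {chunk_index}: {len(messages)} messages")

def stream_file(jsonl_file: Path) -> int:
    """Read one JSONL line at a time, flushing every MAX_CHUNK_SIZE messages."""
    buffer, chunk_index = [], 0
    with open(jsonl_file, "r", encoding="utf-8") as f:
        for line in f:  # never hold the whole file in memory
            line = line.strip()
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue
            msg = data.get("message") or {}
            if msg.get("role") and msg.get("content"):
                buffer.append({"role": msg["role"], "content": msg["content"]})
            if len(buffer) >= MAX_CHUNK_SIZE:
                upload_chunk(buffer, chunk_index)
                buffer, chunk_index = [], chunk_index + 1
    if buffer:  # flush the tail
        upload_chunk(buffer, chunk_index)
        chunk_index += 1
    return chunk_index
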
@@ -1,629 +1,374 @@
 #!/usr/bin/env python3
 """
-
+Streaming importer with true line-by-line processing to prevent OOM.
+Processes JSONL files without loading entire file into memory.
 """

+import json
 import os
 import sys
-import json
-import glob
 import hashlib
 import gc
-import
+from pathlib import Path
 from datetime import datetime
-from typing import List, Dict, Any,
+from typing import List, Dict, Any, Optional
 import logging
-from pathlib import Path

-# Add the
-
-
+# Add the project root to the Python path
+project_root = Path(__file__).parent.parent
+sys.path.insert(0, str(project_root))

 from qdrant_client import QdrantClient
-from qdrant_client.models import
-    VectorParams, Distance, PointStruct,
-    Filter, FieldCondition, MatchValue
-)
+from qdrant_client.models import PointStruct, Distance, VectorParams

-
-
-
-
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
 )
+logger = logging.getLogger(__name__)

-#
+# Environment variables
 QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
-
-
-default_state_file = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "config", "imported-files.json")
-STATE_FILE = os.getenv("STATE_FILE", default_state_file)
-BATCH_SIZE = int(os.getenv("BATCH_SIZE", "10"))  # Reduced from 100 to prevent OOM
-PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "false").lower() == "true"
+STATE_FILE = os.getenv("STATE_FILE", "/config/imported-files.json")
+PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "true").lower() == "true"
 VOYAGE_API_KEY = os.getenv("VOYAGE_KEY")
-
-
-# Set up logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
+MAX_CHUNK_SIZE = int(os.getenv("MAX_CHUNK_SIZE", "50"))  # Messages per chunk

-#
-
-def normalize_path_for_metadata(path: str) -> str:
-    """Normalize file paths for consistency in metadata."""
-    if not path:
-        return ""
-
-    # Remove common prefixes
-    path = path.replace("/Users/", "~/")
-    path = path.replace("\\Users\\", "~\\")
-
-    # Convert to forward slashes
-    path = path.replace("\\", "/")
-
-    # Remove duplicate slashes
-    path = re.sub(r'/+', '/', path)
-
-    return path
-
-def extract_concepts(text: str, tool_usage: Dict[str, Any]) -> Set[str]:
-    """Extract high-level concepts from conversation and tool usage."""
-    concepts = set()
-
-    # Common development concepts with patterns
-    concept_patterns = {
-        'security': r'(security|vulnerability|CVE|injection|sanitize|escape|auth|token|JWT)',
-        'performance': r'(performance|optimization|speed|memory|efficient|benchmark|latency)',
-        'testing': r'(test|pytest|unittest|coverage|TDD|spec|assert)',
-        'docker': r'(docker|container|compose|dockerfile|kubernetes|k8s)',
-        'api': r'(API|REST|GraphQL|endpoint|webhook|http|request)',
-        'database': r'(database|SQL|query|migration|schema|postgres|mysql|mongodb|qdrant)',
-        'authentication': r'(auth|login|token|JWT|session|oauth|permission)',
-        'debugging': r'(debug|error|exception|traceback|log|stack|trace)',
-        'refactoring': r'(refactor|cleanup|improve|restructure|optimize|technical debt)',
-        'deployment': r'(deploy|CI/CD|release|production|staging|rollout)',
-        'git': r'(git|commit|branch|merge|pull request|PR|rebase)',
-        'architecture': r'(architecture|design|pattern|structure|component|module)',
-        'mcp': r'(MCP|claude-self-reflect|tool|agent|claude code)',
-        'embeddings': r'(embedding|vector|semantic|similarity|fastembed|voyage)',
-        'search': r'(search|query|find|filter|match|relevance)'
-    }
-
-    # Check text content (limit to first 10000 chars for performance)
-    combined_text = text[:10000].lower() if text else ""
-    for concept, pattern in concept_patterns.items():
-        if re.search(pattern, combined_text, re.IGNORECASE):
-            concepts.add(concept)
-
-    # Check tool usage patterns
-    if tool_usage.get('grep_searches'):
-        concepts.add('search')
-    if tool_usage.get('files_edited') or tool_usage.get('files_created'):
-        concepts.add('development')
-    if any('test' in str(f).lower() for f in tool_usage.get('files_read', [])):
-        concepts.add('testing')
-    if any('docker' in str(cmd).lower() for cmd in tool_usage.get('bash_commands', [])):
-        concepts.add('docker')
-
-    return concepts
-
-def extract_files_from_git_output(output_text: str) -> List[str]:
-    """Extract file paths from git command outputs (diff, show, status, etc)."""
-    files = set()
-
-    # Patterns for different git output formats
-    patterns = [
-        r'diff --git a/(.*?) b/',  # git diff format
-        r'^\+\+\+ b/(.+)$',  # diff new file
-        r'^--- a/(.+)$',  # diff old file
-        r'^modified:\s+(.+)$',  # git status
-        r'^deleted:\s+(.+)$',  # git status
-        r'^new file:\s+(.+)$',  # git status
-        r'^renamed:\s+(.+) -> (.+)$',  # git status (captures both)
-    ]
-
-    for pattern in patterns:
-        matches = re.findall(pattern, output_text, re.MULTILINE)
-        for match in matches:
-            if isinstance(match, tuple):
-                # Handle renamed files (captures both old and new)
-                for f in match:
-                    if f:
-                        files.add(normalize_path_for_metadata(f))
-            else:
-                files.add(normalize_path_for_metadata(match))
-
-    return list(files)[:20]  # Limit to 20 files
-
-def extract_tool_data_from_message(tool_use: Dict[str, Any], usage_dict: Dict[str, Any], tool_output: str = None):
-    """Extract tool usage data from a tool_use object in a message, including outputs."""
-    tool_name = tool_use.get('name', '')
-    inputs = tool_use.get('input', {})
-
-    # Track tool in summary
-    usage_dict['tools_summary'][tool_name] = usage_dict['tools_summary'].get(tool_name, 0) + 1
-
-    # Handle Read tool
-    if tool_name == 'Read':
-        file_path = inputs.get('file_path')
-        if file_path:
-            normalized = normalize_path_for_metadata(file_path)
-            if normalized not in usage_dict['files_read']:
-                usage_dict['files_read'].append(normalized)
-
-    # Handle Edit and MultiEdit tools
-    elif tool_name in ['Edit', 'MultiEdit']:
-        path = inputs.get('file_path')
-        if path:
-            normalized = normalize_path_for_metadata(path)
-            if normalized not in usage_dict['files_edited']:
-                usage_dict['files_edited'].append(normalized)
-
-    # Handle Write tool
-    elif tool_name == 'Write':
-        path = inputs.get('file_path')
-        if path:
-            normalized = normalize_path_for_metadata(path)
-            if normalized not in usage_dict['files_created']:
-                usage_dict['files_created'].append(normalized)
-
-    # Handle Grep tool
-    elif tool_name == 'Grep':
-        pattern = inputs.get('pattern')
-        if pattern and len(usage_dict['grep_searches']) < 10:  # Limit
-            usage_dict['grep_searches'].append(pattern[:100])  # Truncate long patterns
-
-    # Handle Bash tool - Extract both command and output
-    elif tool_name == 'Bash':
-        command = inputs.get('command')
-        if command and len(usage_dict['bash_commands']) < 10:
-            usage_dict['bash_commands'].append(command[:200])  # Truncate
-
-        # Process tool output for git commands
-        if tool_output and any(cmd in command for cmd in ['git diff', 'git show', 'git status']):
-            git_files = extract_files_from_git_output(tool_output)
-            for file_path in git_files:
-                if file_path not in usage_dict['git_file_changes']:
-                    usage_dict['git_file_changes'].append(file_path)
-
-    # Store tool output preview (for any tool)
-    if tool_output and len(usage_dict['tool_outputs']) < 15:
-        usage_dict['tool_outputs'].append({
-            'tool': tool_name,
-            'command': inputs.get('command', inputs.get('pattern', ''))[:100],
-            'output_preview': tool_output[:500],  # First 500 chars
-            'output_length': len(tool_output)
-        })
-
-def extract_metadata_from_jsonl(file_path: str) -> Dict[str, Any]:
-    """Extract metadata from a JSONL conversation file."""
-    tool_usage = {
-        "files_read": [],
-        "files_edited": [],
-        "files_created": [],
-        "grep_searches": [],
-        "bash_commands": [],
-        "tools_summary": {},
-        "git_file_changes": [],  # NEW: Files from git outputs
-        "tool_outputs": []  # NEW: Tool output previews
-    }
-
-    conversation_text = ""
-    tool_outputs = {}  # Map tool_use_id to output text
-
-    try:
-        # First pass: collect tool outputs
-        with open(file_path, 'r', encoding='utf-8') as f:
-            for line in f:
-                if line.strip():
-                    try:
-                        data = json.loads(line)
-                        if 'message' in data and data['message']:
-                            msg = data['message']
-                            if msg.get('content') and isinstance(msg['content'], list):
-                                for item in msg['content']:
-                                    if isinstance(item, dict) and item.get('type') == 'tool_result':
-                                        # Capture tool output
-                                        tool_id = item.get('tool_use_id')
-                                        output_content = item.get('content', '')
-                                        if tool_id and output_content:
-                                            tool_outputs[tool_id] = output_content
-                        # Also check for toolUseResult in data
-                        if 'toolUseResult' in data:
-                            result = data['toolUseResult']
-                            if isinstance(result, dict):
-                                tool_outputs['last_result'] = json.dumps(result)[:1000]
-                    except:
-                        continue
-
-        # Second pass: extract tool uses and text with outputs available
-        with open(file_path, 'r', encoding='utf-8') as f:
-            for line in f:
-                if line.strip():
-                    try:
-                        data = json.loads(line)
-                        if 'message' in data and data['message']:
-                            msg = data['message']
-                            # Extract text
-                            if msg.get('content'):
-                                if isinstance(msg['content'], str):
-                                    conversation_text += msg['content'] + "\n"
-                                elif isinstance(msg['content'], list):
-                                    for item in msg['content']:
-                                        if isinstance(item, dict):
-                                            if item.get('type') == 'text' and item.get('text'):
-                                                conversation_text += item['text'] + "\n"
-                                            elif item.get('type') == 'tool_use':
-                                                # Process tool use with output now available
-                                                tool_id = item.get('id', '')
-                                                output = tool_outputs.get(tool_id, '')
-                                                extract_tool_data_from_message(item, tool_usage, output)
-                    except:
-                        continue
-    except Exception as e:
-        logger.warning(f"Error extracting metadata from {file_path}: {e}")
-
-    # Extract concepts from text
-    concepts = extract_concepts(conversation_text, tool_usage)
-
-    # Build metadata
-    metadata = {
-        "files_analyzed": tool_usage['files_read'][:20],  # Limit to 20
-        "files_edited": tool_usage['files_edited'][:10],  # Limit to 10
-        "files_created": tool_usage['files_created'][:10],
-        "tools_used": list(tool_usage['tools_summary'].keys())[:20],
-        "tool_summary": dict(list(tool_usage['tools_summary'].items())[:10]),
-        "concepts": list(concepts)[:15],  # Limit to 15
-        "search_patterns": tool_usage['grep_searches'][:10],
-        "git_file_changes": tool_usage['git_file_changes'][:20],  # NEW: Git file changes
-        "tool_outputs": tool_usage['tool_outputs'][:15],  # NEW: Tool output previews
-        "analysis_only": len(tool_usage['files_edited']) == 0 and len(tool_usage['files_created']) == 0,
-        "has_file_metadata": True,
-        "metadata_version": CURRENT_METADATA_VERSION,
-        "metadata_extracted_at": datetime.now().isoformat()
-    }
-
-    return metadata
-
-# ============= End Metadata Extraction Functions =============
-
-# State management functions
-def load_state():
-    """Load the import state from file."""
-    if os.path.exists(STATE_FILE):
-        try:
-            with open(STATE_FILE, 'r') as f:
-                state = json.load(f)
-                # Ensure the expected structure exists
-                if "imported_files" not in state:
-                    state["imported_files"] = {}
-                return state
-        except Exception as e:
-            logger.warning(f"Failed to load state file: {e}")
-    return {"imported_files": {}}
-
-def save_state(state):
-    """Save the import state to file."""
-    try:
-        # Ensure directory exists
-        os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
-        # Write atomically by using a temp file
-        temp_file = STATE_FILE + ".tmp"
-        with open(temp_file, 'w') as f:
-            json.dump(state, f, indent=2)
-        os.replace(temp_file, STATE_FILE)
-        logger.debug(f"Saved state with {len(state['imported_files'])} files")
-    except Exception as e:
-        logger.error(f"Failed to save state file: {e}")
-
-def should_import_file(file_path, state):
-    """Check if a file should be imported based on modification time."""
-    str_path = str(file_path)
-    file_mtime = os.path.getmtime(file_path)
-
-    if str_path in state["imported_files"]:
-        file_state = state["imported_files"][str_path]
-
-        # Handle both old string format and new dict format
-        if isinstance(file_state, str):
-            # Old format (just timestamp string) - treat as needs reimport
-            logger.info(f"Found old format state for {file_path.name}, will reimport")
-            return True
-        else:
-            # New format with dictionary
-            last_imported = file_state.get("last_imported", 0)
-            last_modified = file_state.get("last_modified", 0)
-
-            # Skip if file hasn't been modified since last import
-            if file_mtime <= last_modified and last_imported > 0:
-                logger.info(f"Skipping unchanged file: {file_path.name}")
-                return False
-
-    return True
-
-def update_file_state(file_path, state, chunks_imported):
-    """Update the state for an imported file."""
-    str_path = str(file_path)
-    state["imported_files"][str_path] = {
-        "last_modified": os.path.getmtime(file_path),
-        "last_imported": datetime.now().timestamp(),
-        "chunks_imported": chunks_imported
-    }
+# Initialize Qdrant client
+client = QdrantClient(url=QDRANT_URL)

 # Initialize embedding provider
 embedding_provider = None
 embedding_dimension = None
-collection_suffix = None

 if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
-    # Use local embeddings
     logger.info("Using local embeddings (fastembed)")
     from fastembed import TextEmbedding
-    embedding_provider = TextEmbedding("sentence-transformers/all-MiniLM-L6-v2")
+    embedding_provider = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
     embedding_dimension = 384
-    collection_suffix = "
+    collection_suffix = "local"
 else:
-    # Use Voyage AI
     logger.info("Using Voyage AI embeddings")
     import voyageai
-
+    embedding_provider = voyageai.Client(api_key=VOYAGE_API_KEY)
     embedding_dimension = 1024
-    collection_suffix = "
-
-# Initialize Qdrant client
-client = QdrantClient(url=QDRANT_URL)
+    collection_suffix = "voyage"

+def normalize_project_name(project_name: str) -> str:
+    """Normalize project name for consistency."""
+    return project_name.replace("-Users-ramakrishnanannaswamy-projects-", "").replace("-", "_").lower()

-def
-
-
+def get_collection_name(project_path: Path) -> str:
+    """Generate collection name from project path."""
+    normalized = normalize_project_name(project_path.name)
+    name_hash = hashlib.md5(normalized.encode()).hexdigest()[:8]
+    return f"conv_{name_hash}_{collection_suffix}"

-
-
-
+def ensure_collection(collection_name: str):
+    """Ensure collection exists with correct configuration."""
+    collections = client.get_collections().collections
+    if not any(c.name == collection_name for c in collections):
+        logger.info(f"Creating collection: {collection_name}")
+        client.create_collection(
+            collection_name=collection_name,
+            vectors_config=VectorParams(size=embedding_dimension, distance=Distance.COSINE)
+        )

 def generate_embeddings(texts: List[str]) -> List[List[float]]:
-    """Generate embeddings for
+    """Generate embeddings for texts."""
     if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
-        # Local embeddings using FastEmbed
         embeddings = list(embedding_provider.passage_embed(texts))
-        return [
+        return [emb.tolist() if hasattr(emb, 'tolist') else emb for emb in embeddings]
     else:
-
-
-            texts=texts,
-            model="voyage-3-large",
-            input_type="document"
-        )
-        return result.embeddings
+        response = embedding_provider.embed(texts, model="voyage-3")
+        return response.embeddings

-def
-
-
-
-
-
-
-    # Extract text content
-    texts = []
-    for msg in chunk_messages:
-        role = msg.get("role", "unknown")
-        content = msg.get("content", "")
-
-        if isinstance(content, list):
-            # Handle structured content
-            text_parts = []
-            for item in content:
-                if isinstance(item, dict) and item.get("type") == "text":
-                    text_parts.append(item.get("text", ""))
-                elif isinstance(item, str):
-                    text_parts.append(item)
-            content = " ".join(text_parts)
-
-        if content:
-            texts.append(f"{role.upper()}: {content}")
-
-    if texts:
-        chunks.append({
-            "text": "\n".join(texts),
-            "messages": chunk_messages,
-            "chunk_index": i // chunk_size,
-            "start_role": chunk_messages[0].get("role", "unknown") if chunk_messages else "unknown"
-        })
+def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
+                             conversation_id: str, created_at: str,
+                             metadata: Dict[str, Any], collection_name: str,
+                             project_path: Path) -> int:
+    """Process and immediately upload a single chunk."""
+    if not messages:
+        return 0

-
-
-
-
-
+    # Extract text content
+    texts = []
+    for msg in messages:
+        role = msg.get("role", "unknown")
+        content = msg.get("content", "")
+        if content:
+            texts.append(f"{role.upper()}: {content}")

-    if not
-        logger.warning(f"No JSONL files found in {project_path}")
+    if not texts:
         return 0

-
-
-
-
-
+    chunk_text = "\n".join(texts)
+
+    try:
+        # Generate embedding
+        embeddings = generate_embeddings([chunk_text])
+
+        # Create point ID
+        point_id = hashlib.md5(
+            f"{conversation_id}_{chunk_index}".encode()
+        ).hexdigest()[:16]
+
+        # Create payload
+        payload = {
+            "text": chunk_text,
+            "conversation_id": conversation_id,
+            "chunk_index": chunk_index,
+            "timestamp": created_at,
+            "project": normalize_project_name(project_path.name),
+            "start_role": messages[0].get("role", "unknown") if messages else "unknown",
+            "message_count": len(messages)
+        }
+
+        # Add metadata
+        if metadata:
+            payload.update(metadata)
+
+        # Create point
+        point = PointStruct(
+            id=int(point_id, 16) % (2**63),
+            vector=embeddings[0],
+            payload=payload
+        )
+
+        # Upload immediately
+        client.upsert(
             collection_name=collection_name,
-
-
-            distance=Distance.COSINE
-        )
+            points=[point],
+            wait=True
         )
+
+        return 1
+
+    except Exception as e:
+        logger.error(f"Error processing chunk {chunk_index}: {e}")
+        return 0
+
+def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str]:
+    """Extract metadata in a single pass, return metadata and first timestamp."""
+    metadata = {
+        "files_analyzed": [],
+        "files_edited": [],
+        "tools_used": [],
+        "concepts": []
+    }
+
+    first_timestamp = None
+
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                if not line.strip():
+                    continue
+
+                try:
+                    data = json.loads(line)
+
+                    # Get timestamp from first valid entry
+                    if first_timestamp is None and 'timestamp' in data:
+                        first_timestamp = data.get('timestamp')
+
+                    # Extract tool usage from messages
+                    if 'message' in data and data['message']:
+                        msg = data['message']
+                        if msg.get('content'):
+                            content = msg['content']
+                            if isinstance(content, list):
+                                for item in content:
+                                    if isinstance(item, dict) and item.get('type') == 'tool_use':
+                                        tool_name = item.get('name', '')
+                                        if tool_name and tool_name not in metadata['tools_used']:
+                                            metadata['tools_used'].append(tool_name)
+
+                                        # Extract file references
+                                        if 'input' in item:
+                                            input_data = item['input']
+                                            if isinstance(input_data, dict):
+                                                if 'file_path' in input_data:
+                                                    file_ref = input_data['file_path']
+                                                    if file_ref not in metadata['files_analyzed']:
+                                                        metadata['files_analyzed'].append(file_ref)
+                                                if 'path' in input_data:
+                                                    file_ref = input_data['path']
+                                                    if file_ref not in metadata['files_analyzed']:
+                                                        metadata['files_analyzed'].append(file_ref)
+
+                except json.JSONDecodeError:
+                    continue
+                except Exception:
+                    continue
+
+    except Exception as e:
+        logger.warning(f"Error extracting metadata: {e}")
+
+    return metadata, first_timestamp or datetime.now().isoformat()
+
+def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Path) -> int:
+    """Stream import a single JSONL file without loading it into memory."""
+    logger.info(f"Streaming import of {jsonl_file.name}")
+
+    # Extract metadata in first pass (lightweight)
+    metadata, created_at = extract_metadata_single_pass(str(jsonl_file))

+    # Stream messages and process in chunks
+    chunk_buffer = []
+    chunk_index = 0
     total_chunks = 0
+    conversation_id = jsonl_file.stem

-
-
-
-
-
-
-
-
-
-
-
-
-        for line_num, line in enumerate(f, 1):
-            line = line.strip()
-            if not line:
+    try:
+        with open(jsonl_file, 'r', encoding='utf-8') as f:
+            for line_num, line in enumerate(f, 1):
+                line = line.strip()
+                if not line:
+                    continue
+
+                try:
+                    data = json.loads(line)
+
+                    # Skip non-message lines
+                    if data.get('type') == 'summary':
                         continue

-
-
-
-
-
-
-
-
-
-
+                    # Extract message if present
+                    if 'message' in data and data['message']:
+                        msg = data['message']
+                        if msg.get('role') and msg.get('content'):
+                            # Extract content
+                            content = msg['content']
+                            if isinstance(content, list):
+                                text_parts = []
+                                for item in content:
+                                    if isinstance(item, dict) and item.get('type') == 'text':
+                                        text_parts.append(item.get('text', ''))
+                                    elif isinstance(item, str):
+                                        text_parts.append(item)
+                                content = '\n'.join(text_parts)

-
-
-
-
-
-            content = msg['content']
-            if isinstance(content, list):
-                text_parts = []
-                for item in content:
-                    if isinstance(item, dict) and item.get('type') == 'text':
-                        text_parts.append(item.get('text', ''))
-                    elif isinstance(item, str):
-                        text_parts.append(item)
-                content = '\n'.join(text_parts)
+                            if content:
+                                chunk_buffer.append({
+                                    'role': msg['role'],
+                                    'content': content
+                                })

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            update_file_state(jsonl_file, state, file_chunks)
-
-            # Save state after each file to prevent loss on OOM
-            save_state(state)
-
-            # Force garbage collection to free memory
-            gc.collect()
-
-        except Exception as e:
-            logger.error(f"Failed to import {jsonl_file}: {e}")
-            import traceback
-            logger.error(traceback.format_exc())
-
-    return total_chunks
+                    # Process chunk when buffer reaches MAX_CHUNK_SIZE
+                    if len(chunk_buffer) >= MAX_CHUNK_SIZE:
+                        chunks = process_and_upload_chunk(
+                            chunk_buffer, chunk_index, conversation_id,
+                            created_at, metadata, collection_name, project_path
+                        )
+                        total_chunks += chunks
+                        chunk_buffer = []
+                        chunk_index += 1
+
+                        # Force garbage collection after each chunk
+                        gc.collect()
+
+                        # Log progress
+                        if chunk_index % 10 == 0:
+                            logger.info(f"Processed {chunk_index} chunks from {jsonl_file.name}")
+
+                except json.JSONDecodeError:
+                    logger.debug(f"Skipping invalid JSON at line {line_num}")
+                except Exception as e:
+                    logger.debug(f"Error processing line {line_num}: {e}")
+
+        # Process remaining messages
+        if chunk_buffer:
+            chunks = process_and_upload_chunk(
+                chunk_buffer, chunk_index, conversation_id,
+                created_at, metadata, collection_name, project_path
+            )
+            total_chunks += chunks
+
+        logger.info(f"Imported {total_chunks} chunks from {jsonl_file.name}")
+        return total_chunks
+
+    except Exception as e:
+        logger.error(f"Failed to import {jsonl_file}: {e}")
+        return 0
+
+def load_state() -> dict:
+    """Load import state."""
+    if os.path.exists(STATE_FILE):
+        try:
+            with open(STATE_FILE, 'r') as f:
+                return json.load(f)
+        except:
+            pass
+    return {"imported_files": {}}
+
+def save_state(state: dict):
+    """Save import state."""
+    os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
+    with open(STATE_FILE, 'w') as f:
+        json.dump(state, f, indent=2)
+
+def should_import_file(file_path: Path, state: dict) -> bool:
+    """Check if file should be imported."""
+    file_str = str(file_path)
+    if file_str in state.get("imported_files", {}):
+        file_info = state["imported_files"][file_str]
+        last_modified = file_path.stat().st_mtime
+        if file_info.get("last_modified") == last_modified:
+            logger.info(f"Skipping unchanged file: {file_path.name}")
+            return False
+    return True
+
+def update_file_state(file_path: Path, state: dict, chunks: int):
+    """Update state for imported file."""
+    file_str = str(file_path)
+    state["imported_files"][file_str] = {
+        "imported_at": datetime.now().isoformat(),
+        "last_modified": file_path.stat().st_mtime,
+        "chunks": chunks
+    }

 def main():
     """Main import function."""
-
-
-    if not logs_path.exists():
-        logger.error(f"Logs directory not found: {LOGS_DIR}")
-        return
-
-    # Load existing state
+    # Load state
     state = load_state()
-    logger.info(f"Loaded state with {len(state
-
-    # Find all project directories
-    project_dirs = [d for d in logs_path.iterdir() if d.is_dir()]
-
-    if not project_dirs:
-        logger.warning("No project directories found")
-        return
+    logger.info(f"Loaded state with {len(state.get('imported_files', {}))} previously imported files")

+    # Find all projects
+    logs_dir = Path(os.getenv("LOGS_DIR", "/logs"))
+    project_dirs = [d for d in logs_dir.iterdir() if d.is_dir()]
     logger.info(f"Found {len(project_dirs)} projects to import")

-    # Import each project
     total_imported = 0
+
     for project_dir in project_dirs:
-        #
-
-
+        # Get collection name
+        collection_name = get_collection_name(project_dir)
+        logger.info(f"Importing project: {project_dir.name} -> {collection_name}")

-
-
-        total_imported += chunks
-        logger.info(f"Imported {chunks} chunks from {project_dir.name}")
+        # Ensure collection exists
+        ensure_collection(collection_name)

-        #
-
-
-
-
+        # Find JSONL files
+        jsonl_files = sorted(project_dir.glob("*.jsonl"))
+
+        # Limit files per cycle if specified
+        max_files = int(os.getenv("MAX_FILES_PER_CYCLE", "1000"))
+        jsonl_files = jsonl_files[:max_files]
+
+        for jsonl_file in jsonl_files:
+            if should_import_file(jsonl_file, state):
+                chunks = stream_import_file(jsonl_file, collection_name, project_dir)
+                if chunks > 0:
+                    update_file_state(jsonl_file, state, chunks)
+                    save_state(state)
+                    total_imported += 1
+
+                # Force GC after each file
+                gc.collect()

-    logger.info(f"Import complete
+    logger.info(f"Import complete: processed {total_imported} files")

 if __name__ == "__main__":
     main()