claude-self-reflect 2.6.0 → 2.7.2
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
- package/.env.example +19 -18
- package/Dockerfile.importer +6 -2
- package/Dockerfile.safe-watcher +44 -0
- package/README.md +3 -1
- package/docker-compose.yaml +43 -11
- package/mcp-server/pyproject.toml +1 -1
- package/mcp-server/src/project_resolver.py +527 -0
- package/mcp-server/src/server.py +14 -10
- package/mcp-server/src/utils.py +20 -3
- package/package.json +2 -2
- package/scripts/import-conversations-unified.backup.py +374 -0
- package/scripts/import-conversations-unified.py +297 -723 (streaming rewrite; see the sketch and full diff below)
- package/scripts/import-latest.py +124 -0
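
The headline change is the rewrite of scripts/import-conversations-unified.py into a streaming importer: it reads each JSONL line individually, buffers at most MAX_CHUNK_SIZE messages, embeds and upserts that chunk to Qdrant, then clears the buffer and forces garbage collection, so memory use is bounded by one chunk rather than by the whole conversation file. Below is a minimal sketch of that buffer-and-flush pattern, reduced from the diff that follows; stream_file and upload_chunk are illustrative stand-ins for the script's stream_import_file and process_and_upload_chunk, which also embed the chunk text and attach metadata.

import gc
import json
import os
from pathlib import Path

MAX_CHUNK_SIZE = int(os.getenv("MAX_CHUNK_SIZE", "50"))  # messages per chunk, as in the importer

def upload_chunk(messages, chunk_index):
    # Stand-in for process_and_upload_chunk: the real importer embeds the
    # chunk text and upserts a single point to Qdrant here.
    print(f"chunk {chunk_index}: {len(messages)} messages")

def stream_file(jsonl_path: Path) -> int:
    buffer, chunk_index = [], 0
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:  # one line in memory at a time
            line = line.strip()
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue  # skip malformed lines, as the importer does
            msg = data.get("message") or {}
            if msg.get("role") and msg.get("content"):
                buffer.append({"role": msg["role"], "content": msg["content"]})
            if len(buffer) >= MAX_CHUNK_SIZE:
                upload_chunk(buffer, chunk_index)
                buffer, chunk_index = [], chunk_index + 1
                gc.collect()  # free memory between chunks
    if buffer:  # flush the final partial chunk
        upload_chunk(buffer, chunk_index)
        chunk_index += 1
    return chunk_index

Compared with 2.6.0, the token-aware batching, two-pass metadata extraction, and per-batch embedding are replaced by this per-chunk upload loop plus a lighter single-pass metadata scan, as the diff below shows.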

--- package/scripts/import-conversations-unified.py (2.6.0)
+++ package/scripts/import-conversations-unified.py (2.7.2)
@@ -1,800 +1,374 @@
 #!/usr/bin/env python3
 """
-
+Streaming importer with true line-by-line processing to prevent OOM.
+Processes JSONL files without loading entire file into memory.
 """
 
+import json
 import os
 import sys
-import json
-import glob
 import hashlib
 import gc
-import
+from pathlib import Path
 from datetime import datetime
-from typing import List, Dict, Any,
+from typing import List, Dict, Any, Optional
 import logging
-from pathlib import Path
 
-# Add the
-
-
+# Add the project root to the Python path
+project_root = Path(__file__).parent.parent
+sys.path.insert(0, str(project_root))
 
 from qdrant_client import QdrantClient
-from qdrant_client.models import
-    VectorParams, Distance, PointStruct,
-    Filter, FieldCondition, MatchValue
-)
+from qdrant_client.models import PointStruct, Distance, VectorParams
 
-
-
-
-
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
 )
+logger = logging.getLogger(__name__)
 
-#
+# Environment variables
 QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
-
-
-default_state_file = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "config", "imported-files.json")
-STATE_FILE = os.getenv("STATE_FILE", default_state_file)
-BATCH_SIZE = int(os.getenv("BATCH_SIZE", "10"))  # Reduced from 100 to prevent OOM
-PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "false").lower() == "true"
+STATE_FILE = os.getenv("STATE_FILE", "/config/imported-files.json")
+PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "true").lower() == "true"
 VOYAGE_API_KEY = os.getenv("VOYAGE_KEY")
-
-
-# Token limit configuration for Voyage AI
-MAX_TOKENS_PER_BATCH = int(os.getenv("MAX_TOKENS_PER_BATCH", "100000"))  # Safe limit (120k - 20k buffer)
-if MAX_TOKENS_PER_BATCH > 120000 or MAX_TOKENS_PER_BATCH < 1000:
-    logger.warning(f"MAX_TOKENS_PER_BATCH={MAX_TOKENS_PER_BATCH} outside safe range [1000, 120000], using 100000")
-    MAX_TOKENS_PER_BATCH = 100000
-
-TOKEN_ESTIMATION_RATIO = int(os.getenv("TOKEN_ESTIMATION_RATIO", "3"))  # chars per token estimate
-if TOKEN_ESTIMATION_RATIO < 2 or TOKEN_ESTIMATION_RATIO > 10:
-    logger.warning(f"TOKEN_ESTIMATION_RATIO={TOKEN_ESTIMATION_RATIO} outside normal range [2, 10], using 3")
-    TOKEN_ESTIMATION_RATIO = 3
+MAX_CHUNK_SIZE = int(os.getenv("MAX_CHUNK_SIZE", "50"))  # Messages per chunk
 
-
-
-
-# Set up logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-
-# ============= Metadata Extraction Functions =============
-
-def normalize_path_for_metadata(path: str) -> str:
-    """Normalize file paths for consistency in metadata."""
-    if not path:
-        return ""
-
-    # Remove common prefixes
-    path = path.replace("/Users/", "~/")
-    path = path.replace("\\Users\\", "~\\")
-
-    # Convert to forward slashes
-    path = path.replace("\\", "/")
-
-    # Remove duplicate slashes
-    path = re.sub(r'/+', '/', path)
-
-    return path
-
-def extract_concepts(text: str, tool_usage: Dict[str, Any]) -> Set[str]:
-    """Extract high-level concepts from conversation and tool usage."""
-    concepts = set()
-
-    # Common development concepts with patterns
-    concept_patterns = {
-        'security': r'(security|vulnerability|CVE|injection|sanitize|escape|auth|token|JWT)',
-        'performance': r'(performance|optimization|speed|memory|efficient|benchmark|latency)',
-        'testing': r'(test|pytest|unittest|coverage|TDD|spec|assert)',
-        'docker': r'(docker|container|compose|dockerfile|kubernetes|k8s)',
-        'api': r'(API|REST|GraphQL|endpoint|webhook|http|request)',
-        'database': r'(database|SQL|query|migration|schema|postgres|mysql|mongodb|qdrant)',
-        'authentication': r'(auth|login|token|JWT|session|oauth|permission)',
-        'debugging': r'(debug|error|exception|traceback|log|stack|trace)',
-        'refactoring': r'(refactor|cleanup|improve|restructure|optimize|technical debt)',
-        'deployment': r'(deploy|CI/CD|release|production|staging|rollout)',
-        'git': r'(git|commit|branch|merge|pull request|PR|rebase)',
-        'architecture': r'(architecture|design|pattern|structure|component|module)',
-        'mcp': r'(MCP|claude-self-reflect|tool|agent|claude code)',
-        'embeddings': r'(embedding|vector|semantic|similarity|fastembed|voyage)',
-        'search': r'(search|query|find|filter|match|relevance)'
-    }
-
-    # Check text content (limit to first 10000 chars for performance)
-    combined_text = text[:10000].lower() if text else ""
-    for concept, pattern in concept_patterns.items():
-        if re.search(pattern, combined_text, re.IGNORECASE):
-            concepts.add(concept)
-
-    # Check tool usage patterns
-    if tool_usage.get('grep_searches'):
-        concepts.add('search')
-    if tool_usage.get('files_edited') or tool_usage.get('files_created'):
-        concepts.add('development')
-    if any('test' in str(f).lower() for f in tool_usage.get('files_read', [])):
-        concepts.add('testing')
-    if any('docker' in str(cmd).lower() for cmd in tool_usage.get('bash_commands', [])):
-        concepts.add('docker')
-
-    return concepts
-
-def extract_files_from_git_output(output_text: str) -> List[str]:
-    """Extract file paths from git command outputs (diff, show, status, etc)."""
-    files = set()
-
-    # Patterns for different git output formats
-    patterns = [
-        r'diff --git a/(.*?) b/',  # git diff format
-        r'^\+\+\+ b/(.+)$',  # diff new file
-        r'^--- a/(.+)$',  # diff old file
-        r'^modified:\s+(.+)$',  # git status
-        r'^deleted:\s+(.+)$',  # git status
-        r'^new file:\s+(.+)$',  # git status
-        r'^renamed:\s+(.+) -> (.+)$',  # git status (captures both)
-    ]
-
-    for pattern in patterns:
-        matches = re.findall(pattern, output_text, re.MULTILINE)
-        for match in matches:
-            if isinstance(match, tuple):
-                # Handle renamed files (captures both old and new)
-                for f in match:
-                    if f:
-                        files.add(normalize_path_for_metadata(f))
-            else:
-                files.add(normalize_path_for_metadata(match))
-
-    return list(files)[:20]  # Limit to 20 files
-
-def extract_tool_data_from_message(tool_use: Dict[str, Any], usage_dict: Dict[str, Any], tool_output: str = None):
-    """Extract tool usage data from a tool_use object in a message, including outputs."""
-    tool_name = tool_use.get('name', '')
-    inputs = tool_use.get('input', {})
-
-    # Track tool in summary
-    usage_dict['tools_summary'][tool_name] = usage_dict['tools_summary'].get(tool_name, 0) + 1
-
-    # Handle Read tool
-    if tool_name == 'Read':
-        file_path = inputs.get('file_path')
-        if file_path:
-            normalized = normalize_path_for_metadata(file_path)
-            if normalized not in usage_dict['files_read']:
-                usage_dict['files_read'].append(normalized)
-
-    # Handle Edit and MultiEdit tools
-    elif tool_name in ['Edit', 'MultiEdit']:
-        path = inputs.get('file_path')
-        if path:
-            normalized = normalize_path_for_metadata(path)
-            if normalized not in usage_dict['files_edited']:
-                usage_dict['files_edited'].append(normalized)
-
-    # Handle Write tool
-    elif tool_name == 'Write':
-        path = inputs.get('file_path')
-        if path:
-            normalized = normalize_path_for_metadata(path)
-            if normalized not in usage_dict['files_created']:
-                usage_dict['files_created'].append(normalized)
-
-    # Handle Grep tool
-    elif tool_name == 'Grep':
-        pattern = inputs.get('pattern')
-        if pattern and len(usage_dict['grep_searches']) < 10:  # Limit
-            usage_dict['grep_searches'].append(pattern[:100])  # Truncate long patterns
-
-    # Handle Bash tool - Extract both command and output
-    elif tool_name == 'Bash':
-        command = inputs.get('command')
-        if command and len(usage_dict['bash_commands']) < 10:
-            usage_dict['bash_commands'].append(command[:200])  # Truncate
-
-        # Process tool output for git commands
-        if tool_output and any(cmd in command for cmd in ['git diff', 'git show', 'git status']):
-            git_files = extract_files_from_git_output(tool_output)
-            for file_path in git_files:
-                if file_path not in usage_dict['git_file_changes']:
-                    usage_dict['git_file_changes'].append(file_path)
-
-    # Store tool output preview (for any tool)
-    if tool_output and len(usage_dict['tool_outputs']) < 15:
-        usage_dict['tool_outputs'].append({
-            'tool': tool_name,
-            'command': inputs.get('command', inputs.get('pattern', ''))[:100],
-            'output_preview': tool_output[:500],  # First 500 chars
-            'output_length': len(tool_output)
-        })
-
-def extract_metadata_from_jsonl(file_path: str) -> Dict[str, Any]:
-    """Extract metadata from a JSONL conversation file."""
-    tool_usage = {
-        "files_read": [],
-        "files_edited": [],
-        "files_created": [],
-        "grep_searches": [],
-        "bash_commands": [],
-        "tools_summary": {},
-        "git_file_changes": [],  # NEW: Files from git outputs
-        "tool_outputs": []  # NEW: Tool output previews
-    }
-
-    conversation_text = ""
-    tool_outputs = {}  # Map tool_use_id to output text
-
-    try:
-        # First pass: collect tool outputs
-        with open(file_path, 'r', encoding='utf-8') as f:
-            for line in f:
-                if line.strip():
-                    try:
-                        data = json.loads(line)
-                        if 'message' in data and data['message']:
-                            msg = data['message']
-                            if msg.get('content') and isinstance(msg['content'], list):
-                                for item in msg['content']:
-                                    if isinstance(item, dict) and item.get('type') == 'tool_result':
-                                        # Capture tool output
-                                        tool_id = item.get('tool_use_id')
-                                        output_content = item.get('content', '')
-                                        if tool_id and output_content:
-                                            tool_outputs[tool_id] = output_content
-                        # Also check for toolUseResult in data
-                        if 'toolUseResult' in data:
-                            result = data['toolUseResult']
-                            if isinstance(result, dict):
-                                tool_outputs['last_result'] = json.dumps(result)[:1000]
-                    except:
-                        continue
-
-        # Second pass: extract tool uses and text with outputs available
-        with open(file_path, 'r', encoding='utf-8') as f:
-            for line in f:
-                if line.strip():
-                    try:
-                        data = json.loads(line)
-                        if 'message' in data and data['message']:
-                            msg = data['message']
-                            # Extract text
-                            if msg.get('content'):
-                                if isinstance(msg['content'], str):
-                                    conversation_text += msg['content'] + "\n"
-                                elif isinstance(msg['content'], list):
-                                    for item in msg['content']:
-                                        if isinstance(item, dict):
-                                            if item.get('type') == 'text' and item.get('text'):
-                                                conversation_text += item['text'] + "\n"
-                                            elif item.get('type') == 'tool_use':
-                                                # Process tool use with output now available
-                                                tool_id = item.get('id', '')
-                                                output = tool_outputs.get(tool_id, '')
-                                                extract_tool_data_from_message(item, tool_usage, output)
-                    except:
-                        continue
-    except Exception as e:
-        logger.warning(f"Error extracting metadata from {file_path}: {e}")
-
-    # Extract concepts from text
-    concepts = extract_concepts(conversation_text, tool_usage)
-
-    # Build metadata
-    metadata = {
-        "files_analyzed": tool_usage['files_read'][:20],  # Limit to 20
-        "files_edited": tool_usage['files_edited'][:10],  # Limit to 10
-        "files_created": tool_usage['files_created'][:10],
-        "tools_used": list(tool_usage['tools_summary'].keys())[:20],
-        "tool_summary": dict(list(tool_usage['tools_summary'].items())[:10]),
-        "concepts": list(concepts)[:15],  # Limit to 15
-        "search_patterns": tool_usage['grep_searches'][:10],
-        "git_file_changes": tool_usage['git_file_changes'][:20],  # NEW: Git file changes
-        "tool_outputs": tool_usage['tool_outputs'][:15],  # NEW: Tool output previews
-        "analysis_only": len(tool_usage['files_edited']) == 0 and len(tool_usage['files_created']) == 0,
-        "has_file_metadata": True,
-        "metadata_version": CURRENT_METADATA_VERSION,
-        "metadata_extracted_at": datetime.now().isoformat()
-    }
-
-    return metadata
-
-# ============= End Metadata Extraction Functions =============
-
-# State management functions
-def load_state():
-    """Load the import state from file."""
-    if os.path.exists(STATE_FILE):
-        try:
-            with open(STATE_FILE, 'r') as f:
-                state = json.load(f)
-                # Ensure the expected structure exists
-                if "imported_files" not in state:
-                    state["imported_files"] = {}
-                return state
-        except Exception as e:
-            logger.warning(f"Failed to load state file: {e}")
-    return {"imported_files": {}}
-
-def save_state(state):
-    """Save the import state to file."""
-    try:
-        # Ensure directory exists
-        os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
-        # Write atomically by using a temp file
-        temp_file = STATE_FILE + ".tmp"
-        with open(temp_file, 'w') as f:
-            json.dump(state, f, indent=2)
-        os.replace(temp_file, STATE_FILE)
-        logger.debug(f"Saved state with {len(state['imported_files'])} files")
-    except Exception as e:
-        logger.error(f"Failed to save state file: {e}")
-
-def should_import_file(file_path, state):
-    """Check if a file should be imported based on modification time."""
-    str_path = str(file_path)
-    file_mtime = os.path.getmtime(file_path)
-
-    if str_path in state["imported_files"]:
-        file_state = state["imported_files"][str_path]
-
-        # Handle both old string format and new dict format
-        if isinstance(file_state, str):
-            # Old format (just timestamp string) - treat as needs reimport
-            logger.info(f"Found old format state for {file_path.name}, will reimport")
-            return True
-        else:
-            # New format with dictionary
-            last_imported = file_state.get("last_imported", 0)
-            last_modified = file_state.get("last_modified", 0)
-
-            # Skip if file hasn't been modified since last import
-            if file_mtime <= last_modified and last_imported > 0:
-                logger.info(f"Skipping unchanged file: {file_path.name}")
-                return False
-
-    return True
-
-def update_file_state(file_path, state, chunks_imported):
-    """Update the state for an imported file."""
-    str_path = str(file_path)
-    state["imported_files"][str_path] = {
-        "last_modified": os.path.getmtime(file_path),
-        "last_imported": datetime.now().timestamp(),
-        "chunks_imported": chunks_imported
-    }
+# Initialize Qdrant client
+client = QdrantClient(url=QDRANT_URL)
 
 # Initialize embedding provider
 embedding_provider = None
 embedding_dimension = None
-collection_suffix = None
 
 if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
-    # Use local embeddings
     logger.info("Using local embeddings (fastembed)")
     from fastembed import TextEmbedding
-    embedding_provider = TextEmbedding("sentence-transformers/all-MiniLM-L6-v2")
+    embedding_provider = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
    embedding_dimension = 384
-    collection_suffix = "
+    collection_suffix = "local"
 else:
-    # Use Voyage AI
     logger.info("Using Voyage AI embeddings")
     import voyageai
-
+    embedding_provider = voyageai.Client(api_key=VOYAGE_API_KEY)
     embedding_dimension = 1024
-    collection_suffix = "
-
-# Initialize Qdrant client
-client = QdrantClient(url=QDRANT_URL)
+    collection_suffix = "voyage"
 
+def normalize_project_name(project_name: str) -> str:
+    """Normalize project name for consistency."""
+    return project_name.replace("-Users-ramakrishnanannaswamy-projects-", "").replace("-", "_").lower()
 
-def
-
-
+def get_collection_name(project_path: Path) -> str:
+    """Generate collection name from project path."""
+    normalized = normalize_project_name(project_path.name)
+    name_hash = hashlib.md5(normalized.encode()).hexdigest()[:8]
+    return f"conv_{name_hash}_{collection_suffix}"
 
-
-
-
-
-
-
-
-
-
-    base_tokens = len(text) // TOKEN_ESTIMATION_RATIO
-
-    # Adjust for code/JSON content (typically more tokens per char)
-    # Count indicators of structured content
-    structure_indicators = text.count('{') + text.count('[') + text.count('```')
-    if structure_indicators > 10:  # Likely JSON/code
-        base_tokens = int(base_tokens * 1.3)
-
-    # Add 10% safety margin
-    return int(base_tokens * 1.1)
-
-def extract_message_content(msg: Dict[str, Any]) -> str:
-    """Extract text content from a message."""
-    content = msg.get("content", "")
-
-    if isinstance(content, list):
-        # Handle structured content
-        text_parts = []
-        for item in content:
-            if isinstance(item, dict) and item.get("type") == "text":
-                text_parts.append(item.get("text", ""))
-            elif isinstance(item, str):
-                text_parts.append(item)
-        content = " ".join(text_parts)
-
-    return content
+def ensure_collection(collection_name: str):
+    """Ensure collection exists with correct configuration."""
+    collections = client.get_collections().collections
+    if not any(c.name == collection_name for c in collections):
+        logger.info(f"Creating collection: {collection_name}")
+        client.create_collection(
+            collection_name=collection_name,
+            vectors_config=VectorParams(size=embedding_dimension, distance=Distance.COSINE)
+        )
 
 def generate_embeddings(texts: List[str]) -> List[List[float]]:
-    """Generate embeddings for
+    """Generate embeddings for texts."""
     if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
-        # Local embeddings using FastEmbed
         embeddings = list(embedding_provider.passage_embed(texts))
-        return [
+        return [emb.tolist() if hasattr(emb, 'tolist') else emb for emb in embeddings]
     else:
-
-
-
-
-
-
-
-
-
-
-        chunks = []
+        response = embedding_provider.embed(texts, model="voyage-3")
+        return response.embeddings
+
+def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
+                             conversation_id: str, created_at: str,
+                             metadata: Dict[str, Any], collection_name: str,
+                             project_path: Path) -> int:
+    """Process and immediately upload a single chunk."""
+    if not messages:
+        return 0
 
-
-
-
-
-
-
-
-            content = msg.get("content", "")
-
-            if isinstance(content, list):
-                # Handle structured content
-                text_parts = []
-                for item in content:
-                    if isinstance(item, dict) and item.get("type") == "text":
-                        text_parts.append(item.get("text", ""))
-                    elif isinstance(item, str):
-                        text_parts.append(item)
-                content = " ".join(text_parts)
-
-            if content:
-                texts.append(f"{role.upper()}: {content}")
-
-        if texts:
-            chunks.append({
-                "text": "\n".join(texts),
-                "messages": chunk_messages,
-                "chunk_index": i // chunk_size,
-                "start_role": chunk_messages[0].get("role", "unknown") if chunk_messages else "unknown"
-            })
+    # Extract text content
+    texts = []
+    for msg in messages:
+        role = msg.get("role", "unknown")
+        content = msg.get("content", "")
+        if content:
+            texts.append(f"{role.upper()}: {content}")
 
-
-
-def split_large_chunk(chunk: Dict[str, Any], max_tokens: int, depth: int = 0) -> List[Dict[str, Any]]:
-    """Split a large chunk into smaller pieces that fit token limit."""
-    # Check recursion depth to prevent stack overflow
-    if depth >= MAX_RECURSION_DEPTH:
-        logger.error(f"Max recursion depth {MAX_RECURSION_DEPTH} reached while splitting chunk")
-        # Force truncate as last resort
-        max_chars = max_tokens * TOKEN_ESTIMATION_RATIO
-        chunk["text"] = chunk["text"][:max_chars] + "\n[TRUNCATED - MAX DEPTH REACHED]"
-        chunk["was_truncated"] = True
-        return [chunk]
+    if not texts:
+        return 0
 
-
-    messages = chunk["messages"]
+    chunk_text = "\n".join(texts)
 
-
-
-
-    mid = len(messages) // 2
-    chunk1_messages = messages[:mid]
-    chunk2_messages = messages[mid:]
+    try:
+        # Generate embedding
+        embeddings = generate_embeddings([chunk_text])
 
-    #
-
-
+        # Create point ID
+        point_id = hashlib.md5(
+            f"{conversation_id}_{chunk_index}".encode()
+        ).hexdigest()[:16]
 
-
-
-
-
-
+        # Create payload
+        payload = {
+            "text": chunk_text,
+            "conversation_id": conversation_id,
+            "chunk_index": chunk_index,
+            "timestamp": created_at,
+            "project": normalize_project_name(project_path.name),
+            "start_role": messages[0].get("role", "unknown") if messages else "unknown",
+            "message_count": len(messages)
+        }
 
-
-
-
-            if content:
-                texts2.append(f"{role.upper()}: {content}")
+        # Add metadata
+        if metadata:
+            payload.update(metadata)
 
-
-
-
-
-
-
-                "start_role": chunk["start_role"]
-            })
-        if texts2:
-            split_chunks.append({
-                "text": "\n".join(texts2),
-                "messages": chunk2_messages,
-                "chunk_index": f"{chunk['chunk_index']}_b",
-                "start_role": chunk2_messages[0].get("role", "unknown") if chunk2_messages else "unknown"
-            })
+        # Create point
+        point = PointStruct(
+            id=int(point_id, 16) % (2**63),
+            vector=embeddings[0],
+            payload=payload
+        )
 
-    #
-
-
-
-
-
-
-        return
-    else:
-        # Single message too large - truncate with warning
-        max_chars = max_tokens * TOKEN_ESTIMATION_RATIO
-        if len(text) > max_chars:
-            truncated_size = len(text) - max_chars
-            logger.warning(f"Single message exceeds token limit, truncating {truncated_size} chars from {len(text)} total")
-            chunk["text"] = text[:max_chars] + f"\n[TRUNCATED {truncated_size} CHARS]"
-            chunk["was_truncated"] = True
-            chunk["original_size"] = len(text)
-        return [chunk]
-
-def create_token_aware_batches(chunks: List[Dict[str, Any]], max_tokens: int = MAX_TOKENS_PER_BATCH) -> List[List[Dict[str, Any]]]:
-    """Create batches that respect token limits."""
-    if not USE_TOKEN_AWARE_BATCHING:
-        # Fall back to old batching method
-        batches = []
-        for i in range(0, len(chunks), BATCH_SIZE):
-            batches.append(chunks[i:i + BATCH_SIZE])
-        return batches
-
-    batches = []
-    current_batch = []
-    current_tokens = 0
-
-    for chunk in chunks:
-        chunk_tokens = estimate_tokens(chunk["text"])
+        # Upload immediately
+        client.upsert(
+            collection_name=collection_name,
+            points=[point],
+            wait=True
+        )
+
+        return 1
 
-
-
-
-
-
-
-
-
-
-
-
-
-            batches.append(current_batch)
-            current_batch = [chunk]
-            current_tokens = chunk_tokens
-        else:
-            current_batch.append(chunk)
-            current_tokens += chunk_tokens
+    except Exception as e:
+        logger.error(f"Error processing chunk {chunk_index}: {e}")
+        return 0
+
+def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str]:
+    """Extract metadata in a single pass, return metadata and first timestamp."""
+    metadata = {
+        "files_analyzed": [],
+        "files_edited": [],
+        "tools_used": [],
+        "concepts": []
+    }
 
-
-    batches.append(current_batch)
+    first_timestamp = None
 
-
-
-
-
-
-
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                if not line.strip():
+                    continue
+
+                try:
+                    data = json.loads(line)
+
+                    # Get timestamp from first valid entry
+                    if first_timestamp is None and 'timestamp' in data:
+                        first_timestamp = data.get('timestamp')
+
+                    # Extract tool usage from messages
+                    if 'message' in data and data['message']:
+                        msg = data['message']
+                        if msg.get('content'):
+                            content = msg['content']
+                            if isinstance(content, list):
+                                for item in content:
+                                    if isinstance(item, dict) and item.get('type') == 'tool_use':
+                                        tool_name = item.get('name', '')
+                                        if tool_name and tool_name not in metadata['tools_used']:
+                                            metadata['tools_used'].append(tool_name)
+
+                                        # Extract file references
+                                        if 'input' in item:
+                                            input_data = item['input']
+                                            if isinstance(input_data, dict):
+                                                if 'file_path' in input_data:
+                                                    file_ref = input_data['file_path']
+                                                    if file_ref not in metadata['files_analyzed']:
+                                                        metadata['files_analyzed'].append(file_ref)
+                                                if 'path' in input_data:
+                                                    file_ref = input_data['path']
+                                                    if file_ref not in metadata['files_analyzed']:
+                                                        metadata['files_analyzed'].append(file_ref)
+
+                except json.JSONDecodeError:
+                    continue
+                except Exception:
+                    continue
+
+    except Exception as e:
+        logger.warning(f"Error extracting metadata: {e}")
 
-    return
+    return metadata, first_timestamp or datetime.now().isoformat()
 
-def
-    """
-
-
-    if not jsonl_files:
-        logger.warning(f"No JSONL files found in {project_path}")
-        return 0
+def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Path) -> int:
+    """Stream import a single JSONL file without loading it into memory."""
+    logger.info(f"Streaming import of {jsonl_file.name}")
 
-    #
-
-    if collection_name not in [c.name for c in collections]:
-        logger.info(f"Creating collection: {collection_name}")
-        client.create_collection(
-            collection_name=collection_name,
-            vectors_config=VectorParams(
-                size=embedding_dimension,
-                distance=Distance.COSINE
-            )
-        )
+    # Extract metadata in first pass (lightweight)
+    metadata, created_at = extract_metadata_single_pass(str(jsonl_file))
 
+    # Stream messages and process in chunks
+    chunk_buffer = []
+    chunk_index = 0
     total_chunks = 0
+    conversation_id = jsonl_file.stem
 
-
-
-
-
-
-
-
-
-
-
-
-
-        for line_num, line in enumerate(f, 1):
-            line = line.strip()
-            if not line:
+    try:
+        with open(jsonl_file, 'r', encoding='utf-8') as f:
+            for line_num, line in enumerate(f, 1):
+                line = line.strip()
+                if not line:
+                    continue
+
+                try:
+                    data = json.loads(line)
+
+                    # Skip non-message lines
+                    if data.get('type') == 'summary':
                         continue
 
-
-
-
-
-
-
-
-
-
-
+                    # Extract message if present
+                    if 'message' in data and data['message']:
+                        msg = data['message']
+                        if msg.get('role') and msg.get('content'):
+                            # Extract content
+                            content = msg['content']
+                            if isinstance(content, list):
+                                text_parts = []
+                                for item in content:
+                                    if isinstance(item, dict) and item.get('type') == 'text':
+                                        text_parts.append(item.get('text', ''))
+                                    elif isinstance(item, str):
+                                        text_parts.append(item)
+                                content = '\n'.join(text_parts)
 
-
-
-
-
-
-            content = msg['content']
-            if isinstance(content, list):
-                text_parts = []
-                for item in content:
-                    if isinstance(item, dict) and item.get('type') == 'text':
-                        text_parts.append(item.get('text', ''))
-                    elif isinstance(item, str):
-                        text_parts.append(item)
-                content = '\n'.join(text_parts)
+                            if content:
+                                chunk_buffer.append({
+                                    'role': msg['role'],
+                                    'content': content
+                                })
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            total_chunks += len(points)
-
-        file_chunks = len(chunks)
-        logger.info(f"Imported {file_chunks} chunks from {jsonl_file.name}")
-
-        # Update state for this file
-        update_file_state(jsonl_file, state, file_chunks)
-
-        # Save state after each file to prevent loss on OOM
-        save_state(state)
-
-        # Force garbage collection to free memory
-        gc.collect()
-
-    except Exception as e:
-        logger.error(f"Failed to import {jsonl_file}: {e}")
-        import traceback
-        logger.error(traceback.format_exc())
-
-    return total_chunks
+                            # Process chunk when buffer reaches MAX_CHUNK_SIZE
+                            if len(chunk_buffer) >= MAX_CHUNK_SIZE:
+                                chunks = process_and_upload_chunk(
+                                    chunk_buffer, chunk_index, conversation_id,
+                                    created_at, metadata, collection_name, project_path
+                                )
+                                total_chunks += chunks
+                                chunk_buffer = []
+                                chunk_index += 1
+
+                                # Force garbage collection after each chunk
+                                gc.collect()
+
+                                # Log progress
+                                if chunk_index % 10 == 0:
+                                    logger.info(f"Processed {chunk_index} chunks from {jsonl_file.name}")
+
+                except json.JSONDecodeError:
+                    logger.debug(f"Skipping invalid JSON at line {line_num}")
+                except Exception as e:
+                    logger.debug(f"Error processing line {line_num}: {e}")
+
+        # Process remaining messages
+        if chunk_buffer:
+            chunks = process_and_upload_chunk(
+                chunk_buffer, chunk_index, conversation_id,
+                created_at, metadata, collection_name, project_path
+            )
+            total_chunks += chunks
+
+        logger.info(f"Imported {total_chunks} chunks from {jsonl_file.name}")
+        return total_chunks
+
+    except Exception as e:
+        logger.error(f"Failed to import {jsonl_file}: {e}")
+        return 0
+
+def load_state() -> dict:
+    """Load import state."""
+    if os.path.exists(STATE_FILE):
+        try:
+            with open(STATE_FILE, 'r') as f:
+                return json.load(f)
+        except:
+            pass
+    return {"imported_files": {}}
+
+def save_state(state: dict):
+    """Save import state."""
+    os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
+    with open(STATE_FILE, 'w') as f:
+        json.dump(state, f, indent=2)
+
+def should_import_file(file_path: Path, state: dict) -> bool:
+    """Check if file should be imported."""
+    file_str = str(file_path)
+    if file_str in state.get("imported_files", {}):
+        file_info = state["imported_files"][file_str]
+        last_modified = file_path.stat().st_mtime
+        if file_info.get("last_modified") == last_modified:
+            logger.info(f"Skipping unchanged file: {file_path.name}")
+            return False
+    return True
+
+def update_file_state(file_path: Path, state: dict, chunks: int):
+    """Update state for imported file."""
+    file_str = str(file_path)
+    state["imported_files"][file_str] = {
+        "imported_at": datetime.now().isoformat(),
+        "last_modified": file_path.stat().st_mtime,
+        "chunks": chunks
+    }
 
 def main():
     """Main import function."""
-
-
-    if not logs_path.exists():
-        logger.error(f"Logs directory not found: {LOGS_DIR}")
-        return
-
-    # Load existing state
+    # Load state
     state = load_state()
-    logger.info(f"Loaded state with {len(state
-
-    # Find all project directories
-    project_dirs = [d for d in logs_path.iterdir() if d.is_dir()]
-
-    if not project_dirs:
-        logger.warning("No project directories found")
-        return
+    logger.info(f"Loaded state with {len(state.get('imported_files', {}))} previously imported files")
 
+    # Find all projects
+    logs_dir = Path(os.getenv("LOGS_DIR", "/logs"))
+    project_dirs = [d for d in logs_dir.iterdir() if d.is_dir()]
     logger.info(f"Found {len(project_dirs)} projects to import")
 
-    # Import each project
     total_imported = 0
+
     for project_dir in project_dirs:
-        #
-
-
+        # Get collection name
+        collection_name = get_collection_name(project_dir)
+        logger.info(f"Importing project: {project_dir.name} -> {collection_name}")
 
-
-
-        total_imported += chunks
-        logger.info(f"Imported {chunks} chunks from {project_dir.name}")
+        # Ensure collection exists
+        ensure_collection(collection_name)
 
-    #
-
-
-
-
+        # Find JSONL files
+        jsonl_files = sorted(project_dir.glob("*.jsonl"))
+
+        # Limit files per cycle if specified
+        max_files = int(os.getenv("MAX_FILES_PER_CYCLE", "1000"))
+        jsonl_files = jsonl_files[:max_files]
+
+        for jsonl_file in jsonl_files:
+            if should_import_file(jsonl_file, state):
+                chunks = stream_import_file(jsonl_file, collection_name, project_dir)
+                if chunks > 0:
+                    update_file_state(jsonl_file, state, chunks)
+                    save_state(state)
+                    total_imported += 1
+
+                # Force GC after each file
+                gc.collect()
 
-    logger.info(f"Import complete
+    logger.info(f"Import complete: processed {total_imported} files")
 
 if __name__ == "__main__":
     main()