claude-self-reflect 2.5.18 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/claude-self-reflect-test.md +76 -0
- package/.claude/agents/mcp-integration.md +45 -0
- package/.claude/agents/qdrant-specialist.md +41 -0
- package/README.md +54 -1
- package/installer/setup-wizard-docker.js +65 -1
- package/mcp-server/pyproject.toml +1 -1
- package/mcp-server/src/server.py +91 -32
- package/mcp-server/src/status.py +2 -2
- package/package.json +9 -1
- package/scripts/delta-metadata-update-safe.py +442 -0
- package/scripts/force-metadata-recovery.py +305 -0
- package/scripts/import-conversations-unified.py +174 -3
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Force metadata recovery script for Claude Self-Reflect.
|
|
4
|
+
Fixes conversations that were marked as updated but don't actually have metadata.
|
|
5
|
+
This addresses the point ID mismatch bug in delta-metadata-update.py.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
import sys
|
|
10
|
+
import json
|
|
11
|
+
import hashlib
|
|
12
|
+
import re
|
|
13
|
+
import asyncio
|
|
14
|
+
from datetime import datetime
|
|
15
|
+
from typing import List, Dict, Any, Set, Optional
|
|
16
|
+
import logging
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
from qdrant_client import QdrantClient
|
|
20
|
+
|
|
21
|
+
# Set up logging first so that configuration problems below can be reported.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Configuration (every value can be overridden through the environment)
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
LOGS_DIR = os.getenv("LOGS_DIR", os.path.expanduser("~/.claude/projects"))
DRY_RUN = os.getenv("DRY_RUN", "false").lower() == "true"

# BATCH_SIZE is passed as the page size (`limit`) of Qdrant scroll requests,
# so it has to be a positive integer; anything else falls back to 100 instead
# of crashing at import time.
try:
    BATCH_SIZE = int(os.getenv("BATCH_SIZE", "100"))
except ValueError:
    logger.warning("BATCH_SIZE is not an integer, using 100")
    BATCH_SIZE = 100
if BATCH_SIZE < 1:
    logger.warning(f"BATCH_SIZE={BATCH_SIZE} must be positive, using 100")
    BATCH_SIZE = 100

# Initialize Qdrant client
client = QdrantClient(url=QDRANT_URL, timeout=30)
|
|
33
|
+
|
|
34
|
+
def normalize_path(path: str) -> str:
    r"""Return *path* in the canonical form used for stored file metadata.

    A falsy *path* yields an empty string. Otherwise each ``/Users/`` segment
    is replaced by ``~/`` (and ``\Users\`` by ``~\``), backslashes are turned
    into forward slashes and runs of slashes are collapsed into a single one.
    """
    if not path:
        return ""

    home_collapsed = path.replace("/Users/", "~/").replace("\\Users\\", "~\\")
    forward_slashed = home_collapsed.replace("\\", "/")
    return re.sub(r'/+', '/', forward_slashed)
|
|
42
|
+
|
|
43
|
+
def extract_concepts(text: str) -> Set[str]:
    """Tag *text* with the coarse topic labels whose keywords occur in it.

    Matching is a case-insensitive substring search, so a keyword also matches
    inside longer words (for example ``test`` inside ``latest``).

    Returns:
        The set of labels (``'security'``, ``'docker'``, ...) with at least
        one keyword hit; an empty set when nothing matches.
    """
    keyword_patterns = {
        'security': r'(security|vulnerability|CVE|injection|auth)',
        'docker': r'(docker|container|compose|kubernetes)',
        'testing': r'(test|pytest|unittest|coverage)',
        'api': r'(API|REST|GraphQL|endpoint)',
        'database': r'(database|SQL|query|migration|qdrant)',
        'debugging': r'(debug|error|exception|traceback)',
        'git': r'(git|commit|branch|merge|pull request)',
        'mcp': r'(MCP|claude-self-reflect|tool|agent)',
        'embeddings': r'(embedding|vector|semantic|similarity)',
    }

    lowered = text.lower()
    return {
        label
        for label, pattern in keyword_patterns.items()
        if re.search(pattern, lowered, re.IGNORECASE) is not None
    }
|
|
65
|
+
|
|
66
|
+
def extract_metadata_from_jsonl(jsonl_path: str) -> Dict[str, Any]:
    """Extract tool, file and concept metadata from a JSONL conversation file.

    Only the first 200 lines of the file are examined. Records that are not
    valid JSON, are not JSON objects, or whose ``message`` is not an object
    are skipped individually, so one malformed record no longer stops the
    rest of the file from being processed.

    Args:
        jsonl_path: Path of the conversation transcript to read.

    Returns:
        A dict with four list values:
        ``files_analyzed`` (paths given to ``Read``, at most 20),
        ``files_edited`` (paths given to ``Edit``/``Write``, at most 10),
        ``tools_used`` (tool names, sorted, at most 20) and
        ``concepts`` (labels from ``extract_concepts``, sorted, at most 15).
        File lists keep first-seen order with duplicates removed. If the file
        cannot be read the error is logged and whatever was collected so far
        is returned.
    """
    max_lines = 200  # Limit processing
    files_analyzed: List[str] = []
    files_edited: List[str] = []
    tools_used: Set[str] = set()
    text_parts: List[str] = []

    try:
        with open(jsonl_path, 'r', encoding='utf-8') as f:
            for line_number, line in enumerate(f, start=1):
                if line_number > max_lines:
                    break
                if not line.strip():
                    continue

                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    continue

                msg = data.get('message') if isinstance(data, dict) else None
                if not isinstance(msg, dict):
                    continue

                content = msg.get('content')
                if not content:
                    continue

                # Extract text for concept analysis.
                # NOTE: only plain-string content is sampled; text items inside
                # structured (list) content are not part of the sample.
                if isinstance(content, str):
                    text_parts.append(content[:500] + "\n")
                    continue

                # Extract tool usage (assistant messages with structured content)
                if msg.get('role') != 'assistant' or not isinstance(content, list):
                    continue

                for item in content:
                    if not isinstance(item, dict) or item.get('type') != 'tool_use':
                        continue

                    tool_name = item.get('name', '')
                    if isinstance(tool_name, str) and tool_name:
                        tools_used.add(tool_name)

                    inputs = item.get('input')
                    if not isinstance(inputs, dict):
                        continue
                    file_path = inputs.get('file_path')
                    if not isinstance(file_path, str) or not file_path:
                        continue

                    if tool_name == 'Read':
                        files_analyzed.append(normalize_path(file_path))
                    elif tool_name in ('Edit', 'Write'):
                        files_edited.append(normalize_path(file_path))

    except Exception as e:
        # Best effort: an unreadable file yields whatever was gathered so far.
        logger.error(f"Error reading {jsonl_path}: {e}")

    # Extract concepts from collected text
    text_sample = "".join(text_parts)
    concepts = extract_concepts(text_sample[:5000]) if text_sample else set()

    # Deduplicate deterministically and apply the size limits. The text sample
    # itself is intentionally not returned (it is not stored in Qdrant).
    return {
        "files_analyzed": list(dict.fromkeys(files_analyzed))[:20],
        "files_edited": list(dict.fromkeys(files_edited))[:10],
        "tools_used": sorted(tools_used)[:20],
        "concepts": sorted(concepts)[:15],
    }
|
|
136
|
+
|
|
137
|
+
async def find_conversations_without_metadata(collection_name: str) -> List[str]:
    """Find all unique conversation IDs that don't have metadata.

    A point counts as lacking metadata when its payload has no truthy
    ``concepts`` value or no truthy ``has_file_metadata`` flag. Points without
    a payload or without a ``conversation_id`` are counted but cannot be
    attributed to a conversation.

    Args:
        collection_name: Qdrant collection to scan page by page.

    Returns:
        The distinct conversation IDs found, in no particular order.
    """
    # Only these payload keys are inspected, so only these are requested.
    needed_fields = ['concepts', 'has_file_metadata', 'conversation_id']
    conversations_without_metadata = set()
    total_checked = 0
    offset = None

    # NOTE: `client` is used synchronously here, so each scroll call blocks
    # until the page has been received.
    while True:
        points, offset = client.scroll(
            collection_name=collection_name,
            limit=BATCH_SIZE,
            offset=offset,
            with_payload=needed_fields,
            with_vectors=False,
        )

        if not points:
            break

        for point in points:
            payload = point.payload or {}  # payload can be absent on a point
            if payload.get('concepts') and payload.get('has_file_metadata'):
                continue
            conv_id = payload.get('conversation_id')
            if conv_id:
                conversations_without_metadata.add(conv_id)

        total_checked += len(points)

        # A missing next-page offset marks the last page.
        if offset is None:
            break

    logger.info(f" Checked {total_checked} points, found {len(conversations_without_metadata)} conversations without metadata")
    return list(conversations_without_metadata)
|
|
171
|
+
|
|
172
|
+
async def update_conversation_points(collection_name: str, conversation_id: str, metadata: Dict[str, Any]) -> int:
    """Update all points for a conversation with metadata.

    The points are selected by a server-side filter on the
    ``conversation_id`` payload field instead of scanning the whole
    collection, and each page of matches is patched with a single
    ``set_payload`` request. ``set_payload`` only sets the given keys, so the
    rest of each point's payload is left as it is.

    Args:
        collection_name: Qdrant collection holding the conversation's points.
        conversation_id: Value of the ``conversation_id`` payload field to match.
        metadata: Payload keys to write onto every matching point.

    Returns:
        Number of matching points. When ``DRY_RUN`` is set the points are
        counted but nothing is written.
    """
    # Local import: only the client class is imported at module level.
    from qdrant_client.models import FieldCondition, Filter, MatchValue

    conversation_filter = Filter(
        must=[FieldCondition(key="conversation_id", match=MatchValue(value=conversation_id))]
    )
    payload_patch = {
        **metadata,
        'has_file_metadata': True,
        'metadata_updated_at': datetime.now().isoformat(),
    }

    updated_count = 0
    offset = None
    while True:
        points, offset = client.scroll(
            collection_name=collection_name,
            scroll_filter=conversation_filter,
            limit=BATCH_SIZE,
            offset=offset,
            with_payload=False,  # only the point ids are needed
            with_vectors=False,
        )

        if not points:
            break

        if not DRY_RUN:
            client.set_payload(
                collection_name=collection_name,
                payload=payload_patch,
                points=[point.id for point in points],
                wait=False,
            )
        updated_count += len(points)

        # A missing next-page offset marks the last page.
        if offset is None:
            break

    return updated_count
|
|
213
|
+
|
|
214
|
+
async def process_collection(collection_name: str, max_conversations: Optional[int] = 10) -> int:
    """Process a single collection to add missing metadata.

    For every conversation lacking metadata, the matching
    ``<conversation_id>.jsonl`` file is looked up recursively under
    ``LOGS_DIR``, metadata is extracted from it and written to the
    conversation's points.

    Args:
        collection_name: Qdrant collection to repair.
        max_conversations: Upper bound on the number of conversations handled
            in this call (default 10, the previous hard-coded limit). Pass
            ``None`` to process all of them. Conversations beyond the limit
            are reported in the log instead of being dropped silently.

    Returns:
        Number of conversations whose points were updated.

    Raises:
        ValueError: If ``max_conversations`` is negative.
    """
    if max_conversations is not None and max_conversations < 0:
        raise ValueError("max_conversations must be non-negative or None")

    logger.info(f"\nProcessing collection: {collection_name}")

    # Find conversations without metadata
    conversations_without_metadata = await find_conversations_without_metadata(collection_name)

    if not conversations_without_metadata:
        logger.info(f" ✓ All conversations have metadata")
        return 0

    logger.info(f" Found {len(conversations_without_metadata)} conversations needing metadata")

    if max_conversations is None:
        pending = conversations_without_metadata
    else:
        pending = conversations_without_metadata[:max_conversations]
    skipped = len(conversations_without_metadata) - len(pending)
    if skipped:
        logger.warning(f" Limit of {max_conversations} reached: {skipped} conversations left unprocessed")

    # Process each conversation
    success_count = 0
    failed_count = 0
    logs_root = Path(LOGS_DIR)

    for conv_id in pending:
        # Find the JSONL file (first match anywhere below LOGS_DIR)
        jsonl_file = next(logs_root.glob(f"**/{conv_id}.jsonl"), None)

        if jsonl_file is None:
            logger.warning(f" Cannot find JSONL for {conv_id}")
            failed_count += 1
            continue

        logger.info(f" Processing {conv_id}")

        # Extract metadata
        metadata = extract_metadata_from_jsonl(str(jsonl_file))

        if not metadata['concepts'] and not metadata['files_analyzed']:
            logger.warning(f" No metadata extracted from {conv_id}")
            failed_count += 1
            continue

        # Update points
        updated_points = await update_conversation_points(collection_name, conv_id, metadata)

        if updated_points > 0:
            logger.info(f" ✓ Updated {updated_points} points with {len(metadata['concepts'])} concepts")
            success_count += 1
        else:
            logger.warning(f" No points updated for {conv_id}")
            failed_count += 1

    logger.info(f" Collection complete: {success_count} fixed, {failed_count} failed")
    return success_count
|
|
264
|
+
|
|
265
|
+
async def main():
    """Run the recovery over the Qdrant collections named ``conv_*``.

    Collections are handled one after another through ``process_collection``
    and the number of fixed conversations is summed up and logged. At most
    five of the ``conv_*`` collections are processed per run.
    """
    logger.info("=== Force Metadata Recovery ===")
    logger.info(f"Qdrant URL: {QDRANT_URL}")
    logger.info(f"Dry run: {DRY_RUN}")

    # Nothing is classified as "priority" here, so this list stays empty and
    # every conv_* collection ends up in the "other" group.
    priority_collections = []
    other_collections = [
        info.name
        for info in client.get_collections().collections
        if info.name.startswith('conv_')
    ]

    logger.info(f"Found {len(priority_collections)} priority collections")
    logger.info(f"Found {len(other_collections)} other collections")

    # Priority collections first, then a sample of five of the others.
    work_queue = priority_collections + other_collections[:5]

    total_fixed = 0
    for collection_name in work_queue:
        total_fixed += await process_collection(collection_name)

    logger.info(f"\n=== Recovery Complete ===")
    logger.info(f"Total conversations fixed: {total_fixed}")

    if DRY_RUN:
        logger.info("This was a DRY RUN - no actual updates were made")


if __name__ == "__main__":
    asyncio.run(main())
|
|
@@ -42,6 +42,20 @@ PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "false").lower()
|
|
|
42
42
|
VOYAGE_API_KEY = os.getenv("VOYAGE_KEY")
|
|
43
43
|
CURRENT_METADATA_VERSION = 2 # Version 2: Added tool output extraction
|
|
44
44
|
|
|
45
|
+
# Token limit configuration for Voyage AI
# NOTE: the module-level `logger` is only assigned further down, after this
# block, so warnings here go through logging.getLogger(__name__) directly;
# referring to `logger` at this point raises NameError during import.
try:
    MAX_TOKENS_PER_BATCH = int(os.getenv("MAX_TOKENS_PER_BATCH", "100000"))  # Safe limit (120k - 20k buffer)
except ValueError:
    logging.getLogger(__name__).warning("MAX_TOKENS_PER_BATCH is not an integer, using 100000")
    MAX_TOKENS_PER_BATCH = 100000
if MAX_TOKENS_PER_BATCH > 120000 or MAX_TOKENS_PER_BATCH < 1000:
    logging.getLogger(__name__).warning(f"MAX_TOKENS_PER_BATCH={MAX_TOKENS_PER_BATCH} outside safe range [1000, 120000], using 100000")
    MAX_TOKENS_PER_BATCH = 100000

try:
    TOKEN_ESTIMATION_RATIO = int(os.getenv("TOKEN_ESTIMATION_RATIO", "3"))  # chars per token estimate
except ValueError:
    logging.getLogger(__name__).warning("TOKEN_ESTIMATION_RATIO is not an integer, using 3")
    TOKEN_ESTIMATION_RATIO = 3
if TOKEN_ESTIMATION_RATIO < 2 or TOKEN_ESTIMATION_RATIO > 10:
    logging.getLogger(__name__).warning(f"TOKEN_ESTIMATION_RATIO={TOKEN_ESTIMATION_RATIO} outside normal range [2, 10], using 3")
    TOKEN_ESTIMATION_RATIO = 3

USE_TOKEN_AWARE_BATCHING = os.getenv("USE_TOKEN_AWARE_BATCHING", "true").lower() == "true"
MAX_RECURSION_DEPTH = 10  # Maximum depth for recursive chunk splitting
|
|
58
|
+
|
|
45
59
|
# Set up logging
|
|
46
60
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
47
61
|
logger = logging.getLogger(__name__)
|
|
@@ -381,6 +395,38 @@ def log_retry_state(retry_state):
|
|
|
381
395
|
def embed_with_backoff(**kwargs):
|
|
382
396
|
return voyage_client.embed(**kwargs)
|
|
383
397
|
|
|
398
|
+
def estimate_tokens(text: str) -> int:
    """Return a deliberately pessimistic token estimate for *text*.

    The character count is divided by ``TOKEN_ESTIMATION_RATIO``. Text with
    more than ten occurrences of ``{``, ``[`` or a code fence is treated as
    code/JSON and scaled by 1.3. A 10% safety margin is applied in all cases.
    """
    estimate = len(text) // TOKEN_ESTIMATION_RATIO

    # Structured content (code/JSON) typically needs more tokens per character.
    structure_markers = sum(text.count(marker) for marker in ('{', '[', '```'))
    if structure_markers > 10:
        estimate = int(estimate * 1.3)

    # Add 10% safety margin
    return int(estimate * 1.1)
|
|
413
|
+
|
|
414
|
+
def extract_message_content(msg: Dict[str, Any]) -> str:
    """Extract text content from a message.

    Args:
        msg: Message dict; its ``content`` may be a string or a list of items.

    Returns:
        Always a string. String content is returned unchanged. For list
        content, the ``text`` of every ``{"type": "text"}`` item and every
        bare string item are joined with single spaces; other items are
        ignored and a missing or non-string ``text`` counts as empty. Missing,
        ``None`` or otherwise falsy content gives ``""``; any other value is
        converted with ``str()``.
    """
    content = msg.get("content", "")

    if isinstance(content, str):
        return content

    if isinstance(content, list):
        # Handle structured content
        text_parts = []
        for item in content:
            if isinstance(item, dict) and item.get("type") == "text":
                text = item.get("text", "")
                # A null/non-string text would make the join below fail.
                text_parts.append(text if isinstance(text, str) else "")
            elif isinstance(item, str):
                text_parts.append(item)
        return " ".join(text_parts)

    # Keep the declared return type for None and other unexpected values.
    return str(content) if content else ""
|
|
429
|
+
|
|
384
430
|
def generate_embeddings(texts: List[str]) -> List[List[float]]:
|
|
385
431
|
"""Generate embeddings for a list of texts."""
|
|
386
432
|
if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
|
|
@@ -432,6 +478,125 @@ def chunk_conversation(messages: List[Dict[str, Any]], chunk_size: int = 10) ->
|
|
|
432
478
|
|
|
433
479
|
return chunks
|
|
434
480
|
|
|
481
|
+
def split_large_chunk(chunk: Dict[str, Any], max_tokens: int, depth: int = 0) -> List[Dict[str, Any]]:
    """Split a large chunk into smaller pieces that fit token limit.

    A chunk with several messages is halved by message count and each half
    gets its text rebuilt from its messages; halves that are still too large
    are split again recursively. A chunk that cannot be split any further
    (one message, recursion limit reached, or no text recoverable from its
    messages) is truncated in place instead of being dropped.

    Args:
        chunk: Chunk dict; ``text``, ``messages``, ``chunk_index`` and
            ``start_role`` are read.
        max_tokens: Limit each returned piece should satisfy according to
            ``estimate_tokens``.
        depth: Current recursion depth (callers normally leave the default).

    Returns:
        One or more chunk dicts. Split pieces get ``chunk_index`` suffixed
        with ``_a`` / ``_b``; truncated chunks carry ``was_truncated`` and
        ``original_size``.
    """
    # Check recursion depth to prevent stack overflow
    if depth >= MAX_RECURSION_DEPTH:
        logger.error(f"Max recursion depth {MAX_RECURSION_DEPTH} reached while splitting chunk")
        # Force truncate as last resort
        return [_truncate_chunk_to_limit(chunk, max_tokens, "\n[TRUNCATED - MAX DEPTH REACHED]")]

    messages = chunk["messages"]

    # Single message too large - truncate with warning
    if len(messages) <= 1:
        return [_truncate_chunk_to_limit(chunk, max_tokens)]

    # Split the messages into two groups and recreate the text of each group
    mid = len(messages) // 2
    split_chunks = []
    for suffix, part in (("a", messages[:mid]), ("b", messages[mid:])):
        lines = []
        for msg in part:
            role = msg.get("role", "unknown")
            content = extract_message_content(msg)
            if content:
                lines.append(f"{role.upper()}: {content}")
        if not lines:
            continue
        split_chunks.append({
            "text": "\n".join(lines),
            "messages": part,
            "chunk_index": f"{chunk['chunk_index']}_{suffix}",
            "start_role": chunk["start_role"] if suffix == "a" else part[0].get("role", "unknown"),
        })

    if not split_chunks:
        # No text could be rebuilt from the messages although the chunk text
        # itself is oversized: keep the chunk (truncated) rather than lose it.
        return [_truncate_chunk_to_limit(chunk, max_tokens)]

    # Recursively split if still too large
    result = []
    for split_chunk in split_chunks:
        if estimate_tokens(split_chunk["text"]) > max_tokens:
            result.extend(split_large_chunk(split_chunk, max_tokens, depth + 1))
        else:
            result.append(split_chunk)
    return result


def _truncate_chunk_to_limit(chunk: Dict[str, Any], max_tokens: int, marker: Optional[str] = None) -> Dict[str, Any]:
    """Shorten ``chunk["text"]`` in place until ``estimate_tokens`` accepts it.

    ``marker`` is appended to the kept text; when omitted, a marker stating
    the number of removed characters is used. A chunk already within the
    limit is returned untouched.
    """
    text = chunk["text"]
    if estimate_tokens(text) <= max_tokens:
        return chunk

    original_size = len(text)
    keep = max(0, min(original_size, max_tokens * TOKEN_ESTIMATION_RATIO))
    while True:
        suffix = marker if marker is not None else f"\n[TRUNCATED {original_size - keep} CHARS]"
        candidate = text[:keep] + suffix
        # estimate_tokens inflates chars/ratio by its safety factors, so the
        # plain max_tokens * ratio budget can still be too large: shrink by
        # 10% per round until the estimate fits (or nothing is left to cut).
        if keep == 0 or estimate_tokens(candidate) <= max_tokens:
            break
        keep = int(keep * 0.9)

    if marker is None:
        logger.warning(f"Single message exceeds token limit, truncating {original_size - keep} chars from {original_size} total")
    chunk["text"] = candidate
    chunk["was_truncated"] = True
    chunk["original_size"] = original_size
    return chunk
|
|
552
|
+
|
|
553
|
+
def create_token_aware_batches(chunks: List[Dict[str, Any]], max_tokens: int = MAX_TOKENS_PER_BATCH) -> List[List[Dict[str, Any]]]:
    """Create batches that respect token limits.

    With ``USE_TOKEN_AWARE_BATCHING`` disabled, the chunks are simply cut into
    groups of ``BATCH_SIZE``. Otherwise chunks are packed greedily, in their
    original order, so that the estimated token total of a batch stays within
    ``max_tokens``. A chunk that exceeds the limit on its own is first split
    with ``split_large_chunk`` and its pieces are packed like any other chunk
    at the position of the original chunk. A piece that still exceeds the
    limit is reported and ends up in a batch of its own.

    Args:
        chunks: Chunk dicts; each must have a ``text`` entry.
        max_tokens: Estimated-token budget per batch.

    Returns:
        List of batches, each a non-empty list of chunk dicts.
    """
    if not USE_TOKEN_AWARE_BATCHING:
        # Fall back to old batching method
        return [chunks[i:i + BATCH_SIZE] for i in range(0, len(chunks), BATCH_SIZE)]

    batches = []
    batch_tokens = []  # estimated token total of each finished batch
    current_batch = []
    current_tokens = 0

    for chunk in chunks:
        chunk_tokens = estimate_tokens(chunk["text"])

        # If single chunk exceeds limit, split it
        if chunk_tokens > max_tokens:
            logger.warning(f"Chunk with {chunk_tokens} estimated tokens exceeds limit of {max_tokens}, splitting...")
            pieces = [
                (piece, estimate_tokens(piece["text"]))
                for piece in split_large_chunk(chunk, max_tokens)
            ]
        else:
            pieces = [(chunk, chunk_tokens)]

        for piece, piece_tokens in pieces:
            if piece_tokens > max_tokens:
                logger.error(f"Split chunk still exceeds limit: {piece_tokens} tokens")

            # If adding the piece would exceed limit, start new batch
            if current_batch and current_tokens + piece_tokens > max_tokens:
                batches.append(current_batch)
                batch_tokens.append(current_tokens)
                current_batch = []
                current_tokens = 0

            current_batch.append(piece)
            current_tokens += piece_tokens

    if current_batch:
        batches.append(current_batch)
        batch_tokens.append(current_tokens)

    # Log batch statistics (totals were tracked while packing, nothing is re-estimated)
    if batches:
        batch_sizes = [len(batch) for batch in batches]
        logger.debug(f"Created {len(batches)} batches, chunk counts: min={min(batch_sizes)}, max={max(batch_sizes)}, "
                     f"estimated tokens: min={min(batch_tokens)}, max={max(batch_tokens)}, avg={sum(batch_tokens)//len(batches)}")

    return batches
|
|
599
|
+
|
|
435
600
|
def import_project(project_path: Path, collection_name: str, state: dict) -> int:
|
|
436
601
|
"""Import all conversations from a project."""
|
|
437
602
|
jsonl_files = list(project_path.glob("*.jsonl"))
|
|
@@ -524,11 +689,17 @@ def import_project(project_path: Path, collection_name: str, state: dict) -> int
|
|
|
524
689
|
if not chunks:
|
|
525
690
|
continue
|
|
526
691
|
|
|
527
|
-
# Process in batches
|
|
528
|
-
|
|
529
|
-
|
|
692
|
+
# Process in batches (token-aware if enabled)
|
|
693
|
+
token_aware_batches = create_token_aware_batches(chunks)
|
|
694
|
+
|
|
695
|
+
for batch_idx, batch in enumerate(token_aware_batches):
|
|
530
696
|
texts = [chunk["text"] for chunk in batch]
|
|
531
697
|
|
|
698
|
+
# Log batch info for debugging
|
|
699
|
+
if USE_TOKEN_AWARE_BATCHING:
|
|
700
|
+
total_tokens = sum(estimate_tokens(text) for text in texts)
|
|
701
|
+
logger.debug(f"Batch {batch_idx + 1}/{len(token_aware_batches)}: {len(texts)} chunks, ~{total_tokens} estimated tokens")
|
|
702
|
+
|
|
532
703
|
# Generate embeddings
|
|
533
704
|
embeddings = generate_embeddings(texts)
|
|
534
705
|
|