claude-self-reflect 2.5.18 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/claude-self-reflect-test.md +76 -0
- package/.claude/agents/mcp-integration.md +45 -0
- package/.claude/agents/qdrant-specialist.md +41 -0
- package/README.md +54 -1
- package/installer/setup-wizard-docker.js +65 -1
- package/mcp-server/pyproject.toml +1 -1
- package/mcp-server/src/server.py +91 -32
- package/mcp-server/src/status.py +2 -2
- package/package.json +9 -1
- package/scripts/delta-metadata-update-safe.py +442 -0
- package/scripts/force-metadata-recovery.py +305 -0
- package/scripts/import-conversations-unified.py +174 -3
|
@@ -0,0 +1,442 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Safe delta metadata update script for Claude Self-Reflect.
|
|
4
|
+
Updates existing Qdrant points with tool usage metadata without overwhelming the system.
|
|
5
|
+
Includes rate limiting, batch processing, and proper error recovery.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
import sys
|
|
10
|
+
import json
|
|
11
|
+
import hashlib
|
|
12
|
+
import re
|
|
13
|
+
import time
|
|
14
|
+
import asyncio
|
|
15
|
+
from datetime import datetime, timedelta
|
|
16
|
+
from typing import List, Dict, Any, Set, Tuple, Optional
|
|
17
|
+
import logging
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from collections import defaultdict
|
|
20
|
+
|
|
21
|
+
from qdrant_client import QdrantClient
|
|
22
|
+
from qdrant_client.models import Filter, FieldCondition, MatchValue
|
|
23
|
+
|
|
24
|
+
# ---------------------------------------------------------------------------
# Configuration
#
# Every setting below is read once, at import time, from the environment and
# falls back to the default shown when the variable is not set.
# ---------------------------------------------------------------------------

# Where to reach Qdrant and where to find conversation logs / persisted state.
QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")
LOGS_DIR = os.environ.get("LOGS_DIR", os.path.expanduser("~/.claude/projects"))
STATE_FILE = os.environ.get("STATE_FILE", "./config/delta-update-state.json")

# Boolean switches: only the (case-insensitive) string "true" enables them.
PREFER_LOCAL_EMBEDDINGS = os.environ.get("PREFER_LOCAL_EMBEDDINGS", "true").lower() == "true"
DRY_RUN = os.environ.get("DRY_RUN", "false").lower() == "true"

# Numeric tuning knobs.
DAYS_TO_UPDATE = int(os.environ.get("DAYS_TO_UPDATE", "7"))
BATCH_SIZE = int(os.environ.get("BATCH_SIZE", "10"))  # Process N conversations at a time
RATE_LIMIT_DELAY = float(os.environ.get("RATE_LIMIT_DELAY", "0.1"))  # Delay between updates
MAX_CONCURRENT_UPDATES = int(os.environ.get("MAX_CONCURRENT_UPDATES", "5"))  # Max parallel updates

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Initialize Qdrant client (module-level, shared by every function below).
client = QdrantClient(url=QDRANT_URL, timeout=30)  # Increased timeout
|
|
41
|
+
|
|
42
|
+
def get_collection_suffix():
    """Return the collection-name suffix implied by the embedding preference.

    Gives "_local" when PREFER_LOCAL_EMBEDDINGS is enabled and "_voyage"
    otherwise. Intended for collections that do not exist yet.
    """
    if PREFER_LOCAL_EMBEDDINGS:
        return "_local"
    return "_voyage"
|
|
45
|
+
|
|
46
|
+
def get_existing_collection_suffix(project_hash: str, max_retries: int = 3) -> str:
    """Detect which collection type actually exists for this project.

    Lists the collections known to Qdrant and returns the suffix of the one
    that matches this project. The Voyage collection is checked first, so it
    wins when both exist. The configured preference is used only when neither
    collection exists or when Qdrant could not be queried.

    Args:
        project_hash: The MD5 hash of the normalized project name.
        max_retries: Maximum number of attempts at listing collections.
            Values below 1 are treated as 1 so that a suffix is always
            returned.

    Returns:
        "_voyage" if the Voyage collection exists, "_local" if the local
        collection exists, otherwise the preference-based suffix.
    """
    voyage_name = f"conv_{project_hash}_voyage"
    local_name = f"conv_{project_hash}_local"

    # Always try at least once: with zero iterations the loop would fall
    # through and the function would implicitly return None.
    attempts = max(1, max_retries)

    for attempt in range(attempts):
        try:
            collection_names = {c.name for c in client.get_collections().collections}
        except Exception as e:
            if attempt < attempts - 1:
                # Linear backoff: 0.5s, 1.0s, 1.5s, ...
                wait_time = 0.5 * (attempt + 1)
                logger.debug(f"Error checking collections (attempt {attempt + 1}/{attempts}): {e}, retrying in {wait_time}s")
                time.sleep(wait_time)
                continue
            logger.warning(f"Error checking collections after {attempts} attempts: {e}, falling back to preference")
            return get_collection_suffix()

        # Return the actual collection type that exists.
        if voyage_name in collection_names:
            logger.debug(f"Found existing Voyage collection: {voyage_name}")
            return "_voyage"
        if local_name in collection_names:
            logger.debug(f"Found existing Local collection: {local_name}")
            return "_local"

        # No existing collection - use preference for new ones.
        suffix = get_collection_suffix()
        logger.debug(f"No existing collection for hash {project_hash}, using preference: {suffix}")
        return suffix

    # Unreachable (the loop always returns); kept so the return type holds.
    return get_collection_suffix()
|
|
90
|
+
|
|
91
|
+
def normalize_project_name(project_name: str) -> str:
    """Strip the path-like prefix from a dash-encoded project name.

    Names that start with "-" are split on "-"; everything after the first
    "projects" segment is returned, re-joined with "-". The name is returned
    unchanged when it does not start with "-", contains no "projects"
    segment, or has "projects" as its final segment.
    """
    if not project_name.startswith("-"):
        return project_name

    segments = project_name.split("-")
    try:
        marker = segments.index("projects")
    except ValueError:
        return project_name

    remainder = segments[marker + 1:]
    if not remainder:
        # "projects" was the last segment: nothing follows it to keep.
        return project_name
    return "-".join(remainder)
|
|
99
|
+
|
|
100
|
+
def normalize_path(path: str) -> str:
    """Rewrite a file path into the normalized form used in stored metadata.

    Applies, in order: "/Users/" -> "~/", "\\Users\\" -> "~\\", every
    backslash -> "/", and finally collapses runs of "/" into a single "/".
    A falsy *path* yields the empty string.
    """
    if not path:
        return ""

    # Order matters: the Users prefixes must be rewritten before the
    # backslashes they contain are turned into forward slashes.
    substitutions = (
        ("/Users/", "~/"),
        ("\\Users\\", "~\\"),
        ("\\", "/"),
    )
    for old, new in substitutions:
        path = path.replace(old, new)

    return re.sub(r'/+', '/', path)
|
|
108
|
+
|
|
109
|
+
def extract_concepts(text: str, tool_usage: Dict[str, Any]) -> Set[str]:
    """Derive a set of high-level concept tags for a conversation.

    A concept is added when its regular expression matches anywhere in the
    lower-cased *text* (case-insensitive substring match), and further tags
    are added from *tool_usage*: grep searches imply "search", edited or
    created files imply "development", a read file whose path contains
    "test" implies "testing", and a bash command containing "docker"
    implies "docker".
    """
    concept_patterns = {
        'security': r'(security|vulnerability|CVE|injection|sanitize|escape|auth|token|JWT)',
        'performance': r'(performance|optimization|speed|memory|efficient|benchmark|latency)',
        'testing': r'(test|pytest|unittest|coverage|TDD|spec|assert)',
        'docker': r'(docker|container|compose|dockerfile|kubernetes|k8s)',
        'api': r'(API|REST|GraphQL|endpoint|webhook|http|request)',
        'database': r'(database|SQL|query|migration|schema|postgres|mysql|mongodb|qdrant)',
        'authentication': r'(auth|login|token|JWT|session|oauth|permission)',
        'debugging': r'(debug|error|exception|traceback|log|stack|trace)',
        'refactoring': r'(refactor|cleanup|improve|restructure|optimize|technical debt)',
        'deployment': r'(deploy|CI/CD|release|production|staging|rollout)',
        'git': r'(git|commit|branch|merge|pull request|PR|rebase)',
        'architecture': r'(architecture|design|pattern|structure|component|module)',
        'mcp': r'(MCP|claude-self-reflect|tool|agent|claude code)',
        'embeddings': r'(embedding|vector|semantic|similarity|fastembed|voyage)',
        'search': r'(search|query|find|filter|match|relevance)'
    }

    # Text-derived concepts.
    haystack = text.lower()
    found = {
        name
        for name, regex in concept_patterns.items()
        if re.search(regex, haystack, re.IGNORECASE) is not None
    }

    # Tool-usage-derived concepts.
    if tool_usage.get('grep_searches'):
        found.add('search')

    wrote_files = tool_usage.get('files_edited') or tool_usage.get('files_created')
    if wrote_files:
        found.add('development')

    read_paths = tool_usage.get('files_read', [])
    if any('test' in str(entry).lower() for entry in read_paths):
        found.add('testing')

    shell_commands = tool_usage.get('bash_commands', [])
    if any('docker' in str(entry).lower() for entry in shell_commands):
        found.add('docker')

    return found
|
|
147
|
+
|
|
148
|
+
def _record_tool_use(tool_usage: Dict[str, Any], item: Dict[str, Any]) -> None:
    """Fold a single ``tool_use`` content item into the *tool_usage* accumulator."""
    tool_name = item.get('name', '')
    if not isinstance(tool_name, str):
        # A non-string name cannot be matched against any known tool.
        return

    # A missing or null "input" is treated as "no arguments" so that the
    # tool is still counted and later items on the same line are not lost.
    inputs = item.get('input')
    if not isinstance(inputs, dict):
        inputs = {}

    summary = tool_usage['tools_summary']
    summary[tool_name] = summary.get(tool_name, 0) + 1

    if tool_name == 'Read':
        file_path = inputs.get('file_path')
        if isinstance(file_path, str) and file_path:
            tool_usage['files_read'].append(normalize_path(file_path))
    elif tool_name in ('Edit', 'Write', 'MultiEdit'):
        file_path = inputs.get('file_path')
        if isinstance(file_path, str) and file_path:
            tool_usage['files_edited'].append(normalize_path(file_path))
    elif tool_name == 'Grep':
        pattern = inputs.get('pattern')
        if isinstance(pattern, str) and pattern:
            tool_usage['grep_searches'].append({'pattern': pattern[:100]})
    elif tool_name == 'Bash':
        command = inputs.get('command')
        if isinstance(command, str) and command:
            tool_usage['bash_commands'].append(command[:200])


def extract_tool_usage_from_jsonl(jsonl_path: str) -> Dict[str, Any]:
    """Extract tool usage from a conversation transcript in JSONL form.

    Only ``tool_use`` items inside assistant messages whose content is a
    list are considered. Blank, unparsable or unexpectedly shaped lines are
    skipped (best effort); a failure to read the file is logged and whatever
    was collected so far is returned.

    Args:
        jsonl_path: Path of the JSONL file to scan.

    Returns:
        A dict with these keys:
          - "files_read": normalized paths from Read calls, de-duplicated in
            first-seen order, at most 20.
          - "files_edited": normalized paths from Edit/Write/MultiEdit calls,
            de-duplicated in first-seen order, at most 10.
          - "files_created": always empty here (nothing in this function
            populates it).
          - "grep_searches": one ``{"pattern": ...}`` per Grep call, with the
            pattern cut to 100 characters.
          - "bash_commands": Bash commands, each cut to 200 characters.
          - "tools_summary": tool name -> number of invocations.
    """
    tool_usage = {
        "files_read": [],
        "files_edited": [],
        "files_created": [],
        "grep_searches": [],
        "bash_commands": [],
        "tools_summary": {}
    }

    try:
        with open(jsonl_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue
                try:
                    data = json.loads(line)
                except (ValueError, RecursionError):
                    # Deliberately best-effort: one corrupt line must not
                    # prevent the rest of the transcript from being read.
                    continue

                msg = data.get('message') if isinstance(data, dict) else None
                if not isinstance(msg, dict) or msg.get('role') != 'assistant':
                    continue

                content = msg.get('content')
                if not isinstance(content, list):
                    continue

                for item in content:
                    if isinstance(item, dict) and item.get('type') == 'tool_use':
                        _record_tool_use(tool_usage, item)
    except Exception as e:
        logger.error(f"Error reading JSONL file {jsonl_path}: {e}")

    # Deduplicate while keeping first-seen order, so the entries that survive
    # truncation are the same on every run.
    tool_usage['files_read'] = list(dict.fromkeys(tool_usage['files_read']))[:20]
    tool_usage['files_edited'] = list(dict.fromkeys(tool_usage['files_edited']))[:10]

    return tool_usage
|
|
206
|
+
|
|
207
|
+
def load_state() -> Dict[str, Any]:
    """Load the persisted update state from STATE_FILE.

    Returns:
        A dict that always contains "last_update", "updated_conversations"
        and "failed_conversations". Values found in the state file override
        the defaults. If the file is missing, unreadable, not valid JSON or
        not a JSON object, the defaults are returned (a warning is logged
        for everything except a missing file).
    """
    state: Dict[str, Any] = {
        "last_update": None,
        "updated_conversations": {},
        "failed_conversations": {}
    }

    state_path = Path(STATE_FILE)
    if not state_path.exists():
        return state

    try:
        with open(state_path, 'r', encoding='utf-8') as f:
            loaded = json.load(f)
    except Exception as e:
        logger.warning(f"Could not load state: {e}")
        return state

    if not isinstance(loaded, dict):
        logger.warning(f"Could not load state: expected a JSON object, got {type(loaded).__name__}")
        return state

    state.update(loaded)

    # Callers index into these two mappings directly, so make sure an old or
    # hand-edited file cannot leave them missing or of the wrong type.
    for key in ("updated_conversations", "failed_conversations"):
        if not isinstance(state.get(key), dict):
            state[key] = {}

    return state
|
|
221
|
+
|
|
222
|
+
def save_state(state: Dict[str, Any]):
    """Persist *state* to STATE_FILE as indented JSON.

    Stamps ``state["last_update"]`` with the current local time (ISO format)
    before writing. The data is first written to a sibling ".tmp" file and
    then moved over the real file, so an interrupted run cannot leave a
    half-written state file behind. Any failure is logged and swallowed:
    saving state is best-effort and must not abort the update run.

    Args:
        state: The state dict to persist; it is modified in place
            ("last_update" is set).
    """
    state_path = Path(STATE_FILE)
    tmp_path = state_path.with_name(state_path.name + ".tmp")

    state["last_update"] = datetime.now().isoformat()

    try:
        state_path.parent.mkdir(parents=True, exist_ok=True)
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(state, f, indent=2)
        # Atomic on the same filesystem: readers see either the old or the
        # new file, never a partial one.
        os.replace(tmp_path, state_path)
    except Exception as e:
        logger.error(f"Could not save state: {e}")
        # Do not leave a stale temporary file around; it may not exist if
        # the failure happened before it was created.
        try:
            tmp_path.unlink()
        except OSError:
            pass
|
|
234
|
+
|
|
235
|
+
async def update_point_metadata_batch(updates: List[Tuple[str, int, Dict, str]]) -> int:
    """Apply payload updates to points one at a time, pausing between calls.

    Args:
        updates: Tuples of (conversation_id, chunk_index, metadata,
            collection_name). The point ID is derived from the first two.

    Returns:
        The number of entries for which the update call (or, in dry-run
        mode, the simulated update) completed without raising.
    """
    updated = 0

    for conv_id, chunk_idx, payload, target_collection in updates:
        try:
            # Point ID: first 16 hex digits of md5("<conversation>_<chunk>"),
            # reduced modulo 2**63.
            digest = hashlib.md5(f"{conv_id}_{chunk_idx}".encode()).hexdigest()
            point_id = int(digest[:16], 16) % (2**63)

            if DRY_RUN:
                logger.info(f"[DRY RUN] Would update point {point_id}")
                updated += 1
                continue

            # NOTE(review): with wait=False this presumably only confirms the
            # request was accepted, not that the point exists - confirm
            # against the Qdrant API before relying on the returned count.
            client.set_payload(
                collection_name=target_collection,
                payload=payload,
                points=[point_id],
                wait=False
            )
            updated += 1

            # Rate limit to avoid overwhelming Qdrant
            await asyncio.sleep(RATE_LIMIT_DELAY)
        except Exception as e:
            logger.debug(f"Failed to update point {conv_id}_{chunk_idx}: {e}")

    return updated
|
|
267
|
+
|
|
268
|
+
def _read_conversation_text(jsonl_file: Path, last_line_index: int = 100,
                            max_chars_per_message: int = 500) -> str:
    """Concatenate the leading string-valued message contents of a transcript.

    Only lines with index 0..last_line_index are inspected, only messages
    whose "content" is a plain string contribute, and each contribution is
    cut to max_chars_per_message characters and followed by a newline.
    Lines that cannot be parsed are logged at debug level and skipped.
    """
    pieces = []
    with open(jsonl_file, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i > last_line_index:  # Limit lines to avoid memory issues
                break
            if not line.strip():
                continue
            try:
                data = json.loads(line)
                if 'message' in data and data['message']:
                    content = data['message'].get('content')
                    if content and isinstance(content, str):
                        pieces.append(content[:max_chars_per_message] + "\n")
            except Exception as e:
                logger.debug(f"Parse error in {jsonl_file}: {e}")
                continue
    return "".join(pieces)


async def process_conversation_async(jsonl_file: Path, state: Dict[str, Any]) -> bool:
    """Process a single conversation file and push its metadata to Qdrant.

    Steps: skip the file if the state says it was updated at or after the
    file's modification time, or if it has failed more than 3 times; extract
    tool usage and concepts; locate the project's collection; then request a
    payload update for chunk indices 0-19 of the conversation.

    Args:
        jsonl_file: Path of the conversation transcript. The file stem is
            the conversation ID and the parent directory name is the project.
        state: Shared state dict; "updated_conversations" and
            "failed_conversations" are modified in place.

    Returns:
        True if the conversation was updated or was already up to date,
        False if it was skipped because of failures, its collection was
        missing, no chunk was updated, or any error occurred (errors are
        logged, never raised).
    """
    try:
        conversation_id = jsonl_file.stem
        project_name = jsonl_file.parent.name

        # Check if already updated
        previous = (state.get("updated_conversations") or {}).get(conversation_id)
        if previous:
            last_updated = previous.get("updated_at")
            if last_updated and last_updated >= jsonl_file.stat().st_mtime:
                logger.debug(f"Skipping {conversation_id} - already updated")
                return True

        # Check if previously failed too many times
        failed_info = (state.get("failed_conversations") or {}).get(conversation_id, {})
        if failed_info.get("retry_count", 0) > 3:
            logger.debug(f"Skipping {conversation_id} - too many failures")
            return False

        logger.info(f"Processing: {conversation_id}")

        # Extract metadata
        tool_usage = extract_tool_usage_from_jsonl(str(jsonl_file))

        # Read conversation text (limited)
        conversation_text = _read_conversation_text(jsonl_file)

        # Extract concepts
        concepts = extract_concepts(conversation_text[:10000], tool_usage)

        # Prepare metadata. Concepts are sorted so that the stored list, and
        # which entries survive the cap of 15, is the same on every run
        # (iteration order of a set of strings is not stable across runs).
        metadata_update = {
            "files_analyzed": tool_usage.get('files_read', [])[:20],
            "files_edited": tool_usage.get('files_edited', [])[:10],
            "tools_used": list(tool_usage.get('tools_summary', {}).keys())[:20],
            "concepts": sorted(concepts)[:15],
            "has_file_metadata": True,
            "metadata_updated_at": datetime.now().isoformat()
        }

        # Determine collection
        project_hash = hashlib.md5(normalize_project_name(project_name).encode()).hexdigest()[:8]
        # Use smart detection to find the actual collection type
        collection_suffix = get_existing_collection_suffix(project_hash)
        collection_name = f"conv_{project_hash}{collection_suffix}"

        # Check if collection exists
        try:
            existing_names = {c.name for c in client.get_collections().collections}
            if collection_name not in existing_names:
                logger.warning(f"Collection {collection_name} not found")
                return False
        except Exception as e:
            logger.error(f"Error checking collection: {e}")
            # Record failure
            state.setdefault("failed_conversations", {})[conversation_id] = {
                "error": str(e),
                "retry_count": failed_info.get("retry_count", 0) + 1,
                "last_attempt": time.time()
            }
            return False

        # Prepare batch updates.
        # NOTE(review): chunk indices are assumed to be 0..19; conversations
        # with more chunks would not get their later chunks updated - confirm
        # against the importer's chunking.
        max_chunks = 20  # Most conversations have < 20 chunks
        updates = [
            (conversation_id, chunk_index, metadata_update, collection_name)
            for chunk_index in range(max_chunks)
        ]

        # Process in batch with rate limiting
        success_count = await update_point_metadata_batch(updates)

        if success_count > 0:
            logger.info(f"Updated {success_count} chunks for {conversation_id}")
            # setdefault mirrors the failure path above and avoids a KeyError
            # when the loaded state lacks this mapping.
            state.setdefault("updated_conversations", {})[conversation_id] = {
                "updated_at": time.time(),
                "chunks_updated": success_count,
                "project": project_name
            }
            # A successful update supersedes any earlier failure record, so
            # old retry counts do not accumulate towards the skip threshold.
            failures = state.get("failed_conversations")
            if isinstance(failures, dict):
                failures.pop(conversation_id, None)
            return True

        logger.warning(f"No chunks updated for {conversation_id}")
        return False

    except Exception as e:
        logger.error(f"Failed to process {jsonl_file}: {e}")
        return False
|
|
369
|
+
|
|
370
|
+
async def main_async():
    """Run the delta metadata update over recently modified conversations.

    Collects every ``*.jsonl`` file under LOGS_DIR modified within the last
    DAYS_TO_UPDATE days, processes them in batches of BATCH_SIZE with at most
    MAX_CONCURRENT_UPDATES conversations in flight at once, saves the state
    after every batch, pauses one second between batches, and logs a summary.
    """
    logger.info("=== Starting Safe Delta Metadata Update ===")
    logger.info("Configuration:")
    logger.info(f" Qdrant URL: {QDRANT_URL}")
    logger.info(f" Days to update: {DAYS_TO_UPDATE}")
    logger.info(f" Batch size: {BATCH_SIZE}")
    logger.info(f" Rate limit delay: {RATE_LIMIT_DELAY}s")
    logger.info(f" Max concurrent: {MAX_CONCURRENT_UPDATES}")

    # Load state
    state = load_state()

    # Get recent files
    recent_files = []
    cutoff_time = datetime.now() - timedelta(days=DAYS_TO_UPDATE)
    logs_path = Path(LOGS_DIR)

    if logs_path.exists():
        for jsonl_file in logs_path.glob("**/*.jsonl"):
            try:
                mtime = datetime.fromtimestamp(jsonl_file.stat().st_mtime)
            except (OSError, OverflowError, ValueError):
                # The file vanished, is unreadable, or has an unusable
                # timestamp; skip it rather than abort the scan.
                continue
            if mtime >= cutoff_time:
                recent_files.append(jsonl_file)

    logger.info(f"Found {len(recent_files)} conversations from the past {DAYS_TO_UPDATE} days")

    # Guard against zero/negative settings: range() rejects a step of 0 and
    # a semaphore needs a positive value.
    batch_size = max(1, BATCH_SIZE)
    limiter = asyncio.Semaphore(max(1, MAX_CONCURRENT_UPDATES))

    async def process_with_limit(path: Path) -> bool:
        # Enforce MAX_CONCURRENT_UPDATES across the tasks of a batch.
        async with limiter:
            return await process_conversation_async(path, state)

    # Process in batches
    success_count = 0
    failed_count = 0
    total_batches = (len(recent_files) + batch_size - 1) // batch_size

    for i in range(0, len(recent_files), batch_size):
        batch = recent_files[i:i + batch_size]
        logger.info(f"Processing batch {i//batch_size + 1}/{total_batches}")

        # Create tasks for concurrent processing
        tasks = [asyncio.create_task(process_with_limit(jsonl_file)) for jsonl_file in batch]

        # Wait for batch to complete
        results = await asyncio.gather(*tasks)

        # Count results
        batch_success = sum(1 for r in results if r)
        success_count += batch_success
        failed_count += len(results) - batch_success

        # Save state after each batch
        save_state(state)

        # Add delay between batches to avoid overwhelming the system
        if i + batch_size < len(recent_files):
            await asyncio.sleep(1.0)

    # Final save
    save_state(state)

    logger.info("=== Delta Update Complete ===")
    logger.info(f"Successfully updated: {success_count} conversations")
    logger.info(f"Failed: {failed_count} conversations")
    logger.info(f"Total conversations in state: {len(state.get('updated_conversations') or {})}")
|
|
436
|
+
|
|
437
|
+
def main():
    """Synchronous entry point: drive the async update to completion."""
    update_run = main_async()
    asyncio.run(update_run)


# Execute the update only when run as a script, not when imported.
if __name__ == "__main__":
    main()
|