claude-self-reflect 2.6.0 → 2.7.2

This diff compares two publicly released versions of the package as they were published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in that registry.
@@ -1,800 +1,374 @@
1
1
  #!/usr/bin/env python3
2
2
  """
3
- Unified import script that supports both local and Voyage AI embeddings.
3
+ Streaming importer with true line-by-line processing to prevent OOM.
4
+ Processes JSONL files without loading entire file into memory.
4
5
  """
5
6
 
7
+ import json
6
8
  import os
7
9
  import sys
8
- import json
9
- import glob
10
10
  import hashlib
11
11
  import gc
12
- import re
12
+ from pathlib import Path
13
13
  from datetime import datetime
14
- from typing import List, Dict, Any, Set
14
+ from typing import List, Dict, Any, Optional
15
15
  import logging
16
- from pathlib import Path
17
16
 
18
- # Add the mcp-server/src directory to the Python path
19
- sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'mcp-server', 'src'))
20
- from utils import normalize_project_name
17
+ # Add the project root to the Python path
18
+ project_root = Path(__file__).parent.parent
19
+ sys.path.insert(0, str(project_root))
21
20
 
22
21
  from qdrant_client import QdrantClient
23
- from qdrant_client.models import (
24
- VectorParams, Distance, PointStruct,
25
- Filter, FieldCondition, MatchValue
26
- )
22
+ from qdrant_client.models import PointStruct, Distance, VectorParams
27
23
 
28
- from tenacity import (
29
- retry,
30
- stop_after_attempt,
31
- wait_random_exponential,
24
+ # Set up logging
25
+ logging.basicConfig(
26
+ level=logging.INFO,
27
+ format='%(asctime)s - %(levelname)s - %(message)s'
32
28
  )
29
+ logger = logging.getLogger(__name__)
33
30
 
34
- # Configuration
31
+ # Environment variables
35
32
  QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
36
- LOGS_DIR = os.getenv("LOGS_DIR", "/logs")
37
- # Default to project config directory for state file
38
- default_state_file = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "config", "imported-files.json")
39
- STATE_FILE = os.getenv("STATE_FILE", default_state_file)
40
- BATCH_SIZE = int(os.getenv("BATCH_SIZE", "10")) # Reduced from 100 to prevent OOM
41
- PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "false").lower() == "true"
33
+ STATE_FILE = os.getenv("STATE_FILE", "/config/imported-files.json")
34
+ PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "true").lower() == "true"
42
35
  VOYAGE_API_KEY = os.getenv("VOYAGE_KEY")
43
- CURRENT_METADATA_VERSION = 2 # Version 2: Added tool output extraction
44
-
45
- # Token limit configuration for Voyage AI
46
- MAX_TOKENS_PER_BATCH = int(os.getenv("MAX_TOKENS_PER_BATCH", "100000")) # Safe limit (120k - 20k buffer)
47
- if MAX_TOKENS_PER_BATCH > 120000 or MAX_TOKENS_PER_BATCH < 1000:
48
- logger.warning(f"MAX_TOKENS_PER_BATCH={MAX_TOKENS_PER_BATCH} outside safe range [1000, 120000], using 100000")
49
- MAX_TOKENS_PER_BATCH = 100000
50
-
51
- TOKEN_ESTIMATION_RATIO = int(os.getenv("TOKEN_ESTIMATION_RATIO", "3")) # chars per token estimate
52
- if TOKEN_ESTIMATION_RATIO < 2 or TOKEN_ESTIMATION_RATIO > 10:
53
- logger.warning(f"TOKEN_ESTIMATION_RATIO={TOKEN_ESTIMATION_RATIO} outside normal range [2, 10], using 3")
54
- TOKEN_ESTIMATION_RATIO = 3
36
+ MAX_CHUNK_SIZE = int(os.getenv("MAX_CHUNK_SIZE", "50")) # Messages per chunk
55
37
 
56
- USE_TOKEN_AWARE_BATCHING = os.getenv("USE_TOKEN_AWARE_BATCHING", "true").lower() == "true"
57
- MAX_RECURSION_DEPTH = 10 # Maximum depth for recursive chunk splitting
58
-
59
- # Set up logging
60
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
61
- logger = logging.getLogger(__name__)
62
-
63
- # ============= Metadata Extraction Functions =============
64
-
65
- def normalize_path_for_metadata(path: str) -> str:
66
- """Normalize file paths for consistency in metadata."""
67
- if not path:
68
- return ""
69
-
70
- # Remove common prefixes
71
- path = path.replace("/Users/", "~/")
72
- path = path.replace("\\Users\\", "~\\")
73
-
74
- # Convert to forward slashes
75
- path = path.replace("\\", "/")
76
-
77
- # Remove duplicate slashes
78
- path = re.sub(r'/+', '/', path)
79
-
80
- return path
81
-
82
- def extract_concepts(text: str, tool_usage: Dict[str, Any]) -> Set[str]:
83
- """Extract high-level concepts from conversation and tool usage."""
84
- concepts = set()
85
-
86
- # Common development concepts with patterns
87
- concept_patterns = {
88
- 'security': r'(security|vulnerability|CVE|injection|sanitize|escape|auth|token|JWT)',
89
- 'performance': r'(performance|optimization|speed|memory|efficient|benchmark|latency)',
90
- 'testing': r'(test|pytest|unittest|coverage|TDD|spec|assert)',
91
- 'docker': r'(docker|container|compose|dockerfile|kubernetes|k8s)',
92
- 'api': r'(API|REST|GraphQL|endpoint|webhook|http|request)',
93
- 'database': r'(database|SQL|query|migration|schema|postgres|mysql|mongodb|qdrant)',
94
- 'authentication': r'(auth|login|token|JWT|session|oauth|permission)',
95
- 'debugging': r'(debug|error|exception|traceback|log|stack|trace)',
96
- 'refactoring': r'(refactor|cleanup|improve|restructure|optimize|technical debt)',
97
- 'deployment': r'(deploy|CI/CD|release|production|staging|rollout)',
98
- 'git': r'(git|commit|branch|merge|pull request|PR|rebase)',
99
- 'architecture': r'(architecture|design|pattern|structure|component|module)',
100
- 'mcp': r'(MCP|claude-self-reflect|tool|agent|claude code)',
101
- 'embeddings': r'(embedding|vector|semantic|similarity|fastembed|voyage)',
102
- 'search': r'(search|query|find|filter|match|relevance)'
103
- }
104
-
105
- # Check text content (limit to first 10000 chars for performance)
106
- combined_text = text[:10000].lower() if text else ""
107
- for concept, pattern in concept_patterns.items():
108
- if re.search(pattern, combined_text, re.IGNORECASE):
109
- concepts.add(concept)
110
-
111
- # Check tool usage patterns
112
- if tool_usage.get('grep_searches'):
113
- concepts.add('search')
114
- if tool_usage.get('files_edited') or tool_usage.get('files_created'):
115
- concepts.add('development')
116
- if any('test' in str(f).lower() for f in tool_usage.get('files_read', [])):
117
- concepts.add('testing')
118
- if any('docker' in str(cmd).lower() for cmd in tool_usage.get('bash_commands', [])):
119
- concepts.add('docker')
120
-
121
- return concepts
122
-
123
- def extract_files_from_git_output(output_text: str) -> List[str]:
124
- """Extract file paths from git command outputs (diff, show, status, etc)."""
125
- files = set()
126
-
127
- # Patterns for different git output formats
128
- patterns = [
129
- r'diff --git a/(.*?) b/', # git diff format
130
- r'^\+\+\+ b/(.+)$', # diff new file
131
- r'^--- a/(.+)$', # diff old file
132
- r'^modified:\s+(.+)$', # git status
133
- r'^deleted:\s+(.+)$', # git status
134
- r'^new file:\s+(.+)$', # git status
135
- r'^renamed:\s+(.+) -> (.+)$', # git status (captures both)
136
- ]
137
-
138
- for pattern in patterns:
139
- matches = re.findall(pattern, output_text, re.MULTILINE)
140
- for match in matches:
141
- if isinstance(match, tuple):
142
- # Handle renamed files (captures both old and new)
143
- for f in match:
144
- if f:
145
- files.add(normalize_path_for_metadata(f))
146
- else:
147
- files.add(normalize_path_for_metadata(match))
148
-
149
- return list(files)[:20] # Limit to 20 files
150
-
151
- def extract_tool_data_from_message(tool_use: Dict[str, Any], usage_dict: Dict[str, Any], tool_output: str = None):
152
- """Extract tool usage data from a tool_use object in a message, including outputs."""
153
- tool_name = tool_use.get('name', '')
154
- inputs = tool_use.get('input', {})
155
-
156
- # Track tool in summary
157
- usage_dict['tools_summary'][tool_name] = usage_dict['tools_summary'].get(tool_name, 0) + 1
158
-
159
- # Handle Read tool
160
- if tool_name == 'Read':
161
- file_path = inputs.get('file_path')
162
- if file_path:
163
- normalized = normalize_path_for_metadata(file_path)
164
- if normalized not in usage_dict['files_read']:
165
- usage_dict['files_read'].append(normalized)
166
-
167
- # Handle Edit and MultiEdit tools
168
- elif tool_name in ['Edit', 'MultiEdit']:
169
- path = inputs.get('file_path')
170
- if path:
171
- normalized = normalize_path_for_metadata(path)
172
- if normalized not in usage_dict['files_edited']:
173
- usage_dict['files_edited'].append(normalized)
174
-
175
- # Handle Write tool
176
- elif tool_name == 'Write':
177
- path = inputs.get('file_path')
178
- if path:
179
- normalized = normalize_path_for_metadata(path)
180
- if normalized not in usage_dict['files_created']:
181
- usage_dict['files_created'].append(normalized)
182
-
183
- # Handle Grep tool
184
- elif tool_name == 'Grep':
185
- pattern = inputs.get('pattern')
186
- if pattern and len(usage_dict['grep_searches']) < 10: # Limit
187
- usage_dict['grep_searches'].append(pattern[:100]) # Truncate long patterns
188
-
189
- # Handle Bash tool - Extract both command and output
190
- elif tool_name == 'Bash':
191
- command = inputs.get('command')
192
- if command and len(usage_dict['bash_commands']) < 10:
193
- usage_dict['bash_commands'].append(command[:200]) # Truncate
194
-
195
- # Process tool output for git commands
196
- if tool_output and any(cmd in command for cmd in ['git diff', 'git show', 'git status']):
197
- git_files = extract_files_from_git_output(tool_output)
198
- for file_path in git_files:
199
- if file_path not in usage_dict['git_file_changes']:
200
- usage_dict['git_file_changes'].append(file_path)
201
-
202
- # Store tool output preview (for any tool)
203
- if tool_output and len(usage_dict['tool_outputs']) < 15:
204
- usage_dict['tool_outputs'].append({
205
- 'tool': tool_name,
206
- 'command': inputs.get('command', inputs.get('pattern', ''))[:100],
207
- 'output_preview': tool_output[:500], # First 500 chars
208
- 'output_length': len(tool_output)
209
- })
210
-
211
- def extract_metadata_from_jsonl(file_path: str) -> Dict[str, Any]:
212
- """Extract metadata from a JSONL conversation file."""
213
- tool_usage = {
214
- "files_read": [],
215
- "files_edited": [],
216
- "files_created": [],
217
- "grep_searches": [],
218
- "bash_commands": [],
219
- "tools_summary": {},
220
- "git_file_changes": [], # NEW: Files from git outputs
221
- "tool_outputs": [] # NEW: Tool output previews
222
- }
223
-
224
- conversation_text = ""
225
- tool_outputs = {} # Map tool_use_id to output text
226
-
227
- try:
228
- # First pass: collect tool outputs
229
- with open(file_path, 'r', encoding='utf-8') as f:
230
- for line in f:
231
- if line.strip():
232
- try:
233
- data = json.loads(line)
234
- if 'message' in data and data['message']:
235
- msg = data['message']
236
- if msg.get('content') and isinstance(msg['content'], list):
237
- for item in msg['content']:
238
- if isinstance(item, dict) and item.get('type') == 'tool_result':
239
- # Capture tool output
240
- tool_id = item.get('tool_use_id')
241
- output_content = item.get('content', '')
242
- if tool_id and output_content:
243
- tool_outputs[tool_id] = output_content
244
- # Also check for toolUseResult in data
245
- if 'toolUseResult' in data:
246
- result = data['toolUseResult']
247
- if isinstance(result, dict):
248
- tool_outputs['last_result'] = json.dumps(result)[:1000]
249
- except:
250
- continue
251
-
252
- # Second pass: extract tool uses and text with outputs available
253
- with open(file_path, 'r', encoding='utf-8') as f:
254
- for line in f:
255
- if line.strip():
256
- try:
257
- data = json.loads(line)
258
- if 'message' in data and data['message']:
259
- msg = data['message']
260
- # Extract text
261
- if msg.get('content'):
262
- if isinstance(msg['content'], str):
263
- conversation_text += msg['content'] + "\n"
264
- elif isinstance(msg['content'], list):
265
- for item in msg['content']:
266
- if isinstance(item, dict):
267
- if item.get('type') == 'text' and item.get('text'):
268
- conversation_text += item['text'] + "\n"
269
- elif item.get('type') == 'tool_use':
270
- # Process tool use with output now available
271
- tool_id = item.get('id', '')
272
- output = tool_outputs.get(tool_id, '')
273
- extract_tool_data_from_message(item, tool_usage, output)
274
- except:
275
- continue
276
- except Exception as e:
277
- logger.warning(f"Error extracting metadata from {file_path}: {e}")
278
-
279
- # Extract concepts from text
280
- concepts = extract_concepts(conversation_text, tool_usage)
281
-
282
- # Build metadata
283
- metadata = {
284
- "files_analyzed": tool_usage['files_read'][:20], # Limit to 20
285
- "files_edited": tool_usage['files_edited'][:10], # Limit to 10
286
- "files_created": tool_usage['files_created'][:10],
287
- "tools_used": list(tool_usage['tools_summary'].keys())[:20],
288
- "tool_summary": dict(list(tool_usage['tools_summary'].items())[:10]),
289
- "concepts": list(concepts)[:15], # Limit to 15
290
- "search_patterns": tool_usage['grep_searches'][:10],
291
- "git_file_changes": tool_usage['git_file_changes'][:20], # NEW: Git file changes
292
- "tool_outputs": tool_usage['tool_outputs'][:15], # NEW: Tool output previews
293
- "analysis_only": len(tool_usage['files_edited']) == 0 and len(tool_usage['files_created']) == 0,
294
- "has_file_metadata": True,
295
- "metadata_version": CURRENT_METADATA_VERSION,
296
- "metadata_extracted_at": datetime.now().isoformat()
297
- }
298
-
299
- return metadata
300
-
301
- # ============= End Metadata Extraction Functions =============
302
-
303
- # State management functions
304
- def load_state():
305
- """Load the import state from file."""
306
- if os.path.exists(STATE_FILE):
307
- try:
308
- with open(STATE_FILE, 'r') as f:
309
- state = json.load(f)
310
- # Ensure the expected structure exists
311
- if "imported_files" not in state:
312
- state["imported_files"] = {}
313
- return state
314
- except Exception as e:
315
- logger.warning(f"Failed to load state file: {e}")
316
- return {"imported_files": {}}
317
-
318
- def save_state(state):
319
- """Save the import state to file."""
320
- try:
321
- # Ensure directory exists
322
- os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
323
- # Write atomically by using a temp file
324
- temp_file = STATE_FILE + ".tmp"
325
- with open(temp_file, 'w') as f:
326
- json.dump(state, f, indent=2)
327
- os.replace(temp_file, STATE_FILE)
328
- logger.debug(f"Saved state with {len(state['imported_files'])} files")
329
- except Exception as e:
330
- logger.error(f"Failed to save state file: {e}")
331
-
332
- def should_import_file(file_path, state):
333
- """Check if a file should be imported based on modification time."""
334
- str_path = str(file_path)
335
- file_mtime = os.path.getmtime(file_path)
336
-
337
- if str_path in state["imported_files"]:
338
- file_state = state["imported_files"][str_path]
339
-
340
- # Handle both old string format and new dict format
341
- if isinstance(file_state, str):
342
- # Old format (just timestamp string) - treat as needs reimport
343
- logger.info(f"Found old format state for {file_path.name}, will reimport")
344
- return True
345
- else:
346
- # New format with dictionary
347
- last_imported = file_state.get("last_imported", 0)
348
- last_modified = file_state.get("last_modified", 0)
349
-
350
- # Skip if file hasn't been modified since last import
351
- if file_mtime <= last_modified and last_imported > 0:
352
- logger.info(f"Skipping unchanged file: {file_path.name}")
353
- return False
354
-
355
- return True
356
-
357
- def update_file_state(file_path, state, chunks_imported):
358
- """Update the state for an imported file."""
359
- str_path = str(file_path)
360
- state["imported_files"][str_path] = {
361
- "last_modified": os.path.getmtime(file_path),
362
- "last_imported": datetime.now().timestamp(),
363
- "chunks_imported": chunks_imported
364
- }
38
+ # Initialize Qdrant client
39
+ client = QdrantClient(url=QDRANT_URL)
365
40
 
366
41
  # Initialize embedding provider
367
42
  embedding_provider = None
368
43
  embedding_dimension = None
369
- collection_suffix = None
370
44
 
371
45
  if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
372
- # Use local embeddings
373
46
  logger.info("Using local embeddings (fastembed)")
374
47
  from fastembed import TextEmbedding
375
- embedding_provider = TextEmbedding("sentence-transformers/all-MiniLM-L6-v2")
48
+ embedding_provider = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
376
49
  embedding_dimension = 384
377
- collection_suffix = "_local"
50
+ collection_suffix = "local"
378
51
  else:
379
- # Use Voyage AI
380
52
  logger.info("Using Voyage AI embeddings")
381
53
  import voyageai
382
- voyage_client = voyageai.Client(api_key=VOYAGE_API_KEY)
54
+ embedding_provider = voyageai.Client(api_key=VOYAGE_API_KEY)
383
55
  embedding_dimension = 1024
384
- collection_suffix = "_voyage"
385
-
386
- # Initialize Qdrant client
387
- client = QdrantClient(url=QDRANT_URL)
56
+ collection_suffix = "voyage"
388
57
 
58
+ def normalize_project_name(project_name: str) -> str:
59
+ """Normalize project name for consistency."""
60
+ return project_name.replace("-Users-ramakrishnanannaswamy-projects-", "").replace("-", "_").lower()
389
61
 
390
- def log_retry_state(retry_state):
391
- print(f"Retrying function '{retry_state.fn.__name__}' for the {retry_state.attempt_number} time.")
392
- print(f"----> Waiting for {retry_state.next_action.sleep} seconds before next attempt.")
62
+ def get_collection_name(project_path: Path) -> str:
63
+ """Generate collection name from project path."""
64
+ normalized = normalize_project_name(project_path.name)
65
+ name_hash = hashlib.md5(normalized.encode()).hexdigest()[:8]
66
+ return f"conv_{name_hash}_{collection_suffix}"
393
67
 
394
- @retry(wait=wait_random_exponential(multiplier=2, min=30, max=120), stop=stop_after_attempt(6), before_sleep=log_retry_state)
395
- def embed_with_backoff(**kwargs):
396
- return voyage_client.embed(**kwargs)
397
-
398
- def estimate_tokens(text: str) -> int:
399
- """Estimate token count for text with content-aware heuristics.
400
- Base estimate uses TOKEN_ESTIMATION_RATIO, adjusted for content type.
401
- """
402
- # Base estimate
403
- base_tokens = len(text) // TOKEN_ESTIMATION_RATIO
404
-
405
- # Adjust for code/JSON content (typically more tokens per char)
406
- # Count indicators of structured content
407
- structure_indicators = text.count('{') + text.count('[') + text.count('```')
408
- if structure_indicators > 10: # Likely JSON/code
409
- base_tokens = int(base_tokens * 1.3)
410
-
411
- # Add 10% safety margin
412
- return int(base_tokens * 1.1)
413
-
414
- def extract_message_content(msg: Dict[str, Any]) -> str:
415
- """Extract text content from a message."""
416
- content = msg.get("content", "")
417
-
418
- if isinstance(content, list):
419
- # Handle structured content
420
- text_parts = []
421
- for item in content:
422
- if isinstance(item, dict) and item.get("type") == "text":
423
- text_parts.append(item.get("text", ""))
424
- elif isinstance(item, str):
425
- text_parts.append(item)
426
- content = " ".join(text_parts)
427
-
428
- return content
68
+ def ensure_collection(collection_name: str):
69
+ """Ensure collection exists with correct configuration."""
70
+ collections = client.get_collections().collections
71
+ if not any(c.name == collection_name for c in collections):
72
+ logger.info(f"Creating collection: {collection_name}")
73
+ client.create_collection(
74
+ collection_name=collection_name,
75
+ vectors_config=VectorParams(size=embedding_dimension, distance=Distance.COSINE)
76
+ )
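
To see the create-then-upsert flow end to end without the Docker service, qdrant-client's local in-memory mode can be used; a minimal sketch follows (collection name and vector values are made up, and the 4-dimensional vector is only for brevity).

# Self-contained sketch of ensure-collection + upsert using an in-memory Qdrant instance.
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams

demo = QdrantClient(":memory:")             # local mode, no server required
name = "conv_demo_local"
if not any(c.name == name for c in demo.get_collections().collections):
    demo.create_collection(
        collection_name=name,
        vectors_config=VectorParams(size=4, distance=Distance.COSINE),
    )
demo.upsert(
    collection_name=name,
    points=[PointStruct(id=1, vector=[0.1, 0.2, 0.3, 0.4], payload={"text": "hi"})],
)
print(demo.count(name).count)               # -> 1
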
429
77
 
430
78
  def generate_embeddings(texts: List[str]) -> List[List[float]]:
431
- """Generate embeddings for a list of texts."""
79
+ """Generate embeddings for texts."""
432
80
  if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
433
- # Local embeddings using FastEmbed
434
81
  embeddings = list(embedding_provider.passage_embed(texts))
435
- return [embedding.tolist() for embedding in embeddings]
82
+ return [emb.tolist() if hasattr(emb, 'tolist') else emb for emb in embeddings]
436
83
  else:
437
- # Voyage AI embeddings
438
- result = embed_with_backoff(
439
- texts=texts,
440
- model="voyage-3-large",
441
- input_type="document"
442
- )
443
- return result.embeddings
444
-
445
- def chunk_conversation(messages: List[Dict[str, Any]], chunk_size: int = 10) -> List[Dict[str, Any]]:
446
- """Chunk conversation into smaller segments."""
447
- chunks = []
84
+ response = embedding_provider.embed(texts, model="voyage-3")
85
+ return response.embeddings
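
A minimal sketch of the local embedding path used above (requires pip install fastembed; the model is downloaded on first use). It also shows why the .tolist() conversion exists: passage_embed yields numpy arrays.

# Local embeddings with fastembed, mirroring the call above.
from fastembed import TextEmbedding

model = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectors = [v.tolist() for v in model.passage_embed(["USER: hello", "ASSISTANT: hi there"])]
print(len(vectors), len(vectors[0]))        # 2 vectors, 384 dimensions each
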
86
+
87
+ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
88
+ conversation_id: str, created_at: str,
89
+ metadata: Dict[str, Any], collection_name: str,
90
+ project_path: Path) -> int:
91
+ """Process and immediately upload a single chunk."""
92
+ if not messages:
93
+ return 0
448
94
 
449
- for i in range(0, len(messages), chunk_size):
450
- chunk_messages = messages[i:i + chunk_size]
451
-
452
- # Extract text content
453
- texts = []
454
- for msg in chunk_messages:
455
- role = msg.get("role", "unknown")
456
- content = msg.get("content", "")
457
-
458
- if isinstance(content, list):
459
- # Handle structured content
460
- text_parts = []
461
- for item in content:
462
- if isinstance(item, dict) and item.get("type") == "text":
463
- text_parts.append(item.get("text", ""))
464
- elif isinstance(item, str):
465
- text_parts.append(item)
466
- content = " ".join(text_parts)
467
-
468
- if content:
469
- texts.append(f"{role.upper()}: {content}")
470
-
471
- if texts:
472
- chunks.append({
473
- "text": "\n".join(texts),
474
- "messages": chunk_messages,
475
- "chunk_index": i // chunk_size,
476
- "start_role": chunk_messages[0].get("role", "unknown") if chunk_messages else "unknown"
477
- })
95
+ # Extract text content
96
+ texts = []
97
+ for msg in messages:
98
+ role = msg.get("role", "unknown")
99
+ content = msg.get("content", "")
100
+ if content:
101
+ texts.append(f"{role.upper()}: {content}")
478
102
 
479
- return chunks
480
-
481
- def split_large_chunk(chunk: Dict[str, Any], max_tokens: int, depth: int = 0) -> List[Dict[str, Any]]:
482
- """Split a large chunk into smaller pieces that fit token limit."""
483
- # Check recursion depth to prevent stack overflow
484
- if depth >= MAX_RECURSION_DEPTH:
485
- logger.error(f"Max recursion depth {MAX_RECURSION_DEPTH} reached while splitting chunk")
486
- # Force truncate as last resort
487
- max_chars = max_tokens * TOKEN_ESTIMATION_RATIO
488
- chunk["text"] = chunk["text"][:max_chars] + "\n[TRUNCATED - MAX DEPTH REACHED]"
489
- chunk["was_truncated"] = True
490
- return [chunk]
103
+ if not texts:
104
+ return 0
491
105
 
492
- text = chunk["text"]
493
- messages = chunk["messages"]
106
+ chunk_text = "\n".join(texts)
494
107
 
495
- # First, check if we can split by messages
496
- if len(messages) > 1:
497
- # Try splitting messages into smaller groups
498
- mid = len(messages) // 2
499
- chunk1_messages = messages[:mid]
500
- chunk2_messages = messages[mid:]
108
+ try:
109
+ # Generate embedding
110
+ embeddings = generate_embeddings([chunk_text])
501
111
 
502
- # Recreate text for each split
503
- texts1 = []
504
- texts2 = []
112
+ # Create point ID
113
+ point_id = hashlib.md5(
114
+ f"{conversation_id}_{chunk_index}".encode()
115
+ ).hexdigest()[:16]
505
116
 
506
- for msg in chunk1_messages:
507
- role = msg.get("role", "unknown")
508
- content = extract_message_content(msg)
509
- if content:
510
- texts1.append(f"{role.upper()}: {content}")
117
+ # Create payload
118
+ payload = {
119
+ "text": chunk_text,
120
+ "conversation_id": conversation_id,
121
+ "chunk_index": chunk_index,
122
+ "timestamp": created_at,
123
+ "project": normalize_project_name(project_path.name),
124
+ "start_role": messages[0].get("role", "unknown") if messages else "unknown",
125
+ "message_count": len(messages)
126
+ }
511
127
 
512
- for msg in chunk2_messages:
513
- role = msg.get("role", "unknown")
514
- content = extract_message_content(msg)
515
- if content:
516
- texts2.append(f"{role.upper()}: {content}")
128
+ # Add metadata
129
+ if metadata:
130
+ payload.update(metadata)
517
131
 
518
- split_chunks = []
519
- if texts1:
520
- split_chunks.append({
521
- "text": "\n".join(texts1),
522
- "messages": chunk1_messages,
523
- "chunk_index": f"{chunk['chunk_index']}_a",
524
- "start_role": chunk["start_role"]
525
- })
526
- if texts2:
527
- split_chunks.append({
528
- "text": "\n".join(texts2),
529
- "messages": chunk2_messages,
530
- "chunk_index": f"{chunk['chunk_index']}_b",
531
- "start_role": chunk2_messages[0].get("role", "unknown") if chunk2_messages else "unknown"
532
- })
132
+ # Create point
133
+ point = PointStruct(
134
+ id=int(point_id, 16) % (2**63),
135
+ vector=embeddings[0],
136
+ payload=payload
137
+ )
533
138
 
534
- # Recursively split if still too large
535
- result = []
536
- for split_chunk in split_chunks:
537
- if estimate_tokens(split_chunk["text"]) > max_tokens:
538
- result.extend(split_large_chunk(split_chunk, max_tokens, depth + 1))
539
- else:
540
- result.append(split_chunk)
541
- return result
542
- else:
543
- # Single message too large - truncate with warning
544
- max_chars = max_tokens * TOKEN_ESTIMATION_RATIO
545
- if len(text) > max_chars:
546
- truncated_size = len(text) - max_chars
547
- logger.warning(f"Single message exceeds token limit, truncating {truncated_size} chars from {len(text)} total")
548
- chunk["text"] = text[:max_chars] + f"\n[TRUNCATED {truncated_size} CHARS]"
549
- chunk["was_truncated"] = True
550
- chunk["original_size"] = len(text)
551
- return [chunk]
552
-
553
- def create_token_aware_batches(chunks: List[Dict[str, Any]], max_tokens: int = MAX_TOKENS_PER_BATCH) -> List[List[Dict[str, Any]]]:
554
- """Create batches that respect token limits."""
555
- if not USE_TOKEN_AWARE_BATCHING:
556
- # Fall back to old batching method
557
- batches = []
558
- for i in range(0, len(chunks), BATCH_SIZE):
559
- batches.append(chunks[i:i + BATCH_SIZE])
560
- return batches
561
-
562
- batches = []
563
- current_batch = []
564
- current_tokens = 0
565
-
566
- for chunk in chunks:
567
- chunk_tokens = estimate_tokens(chunk["text"])
139
+ # Upload immediately
140
+ client.upsert(
141
+ collection_name=collection_name,
142
+ points=[point],
143
+ wait=True
144
+ )
145
+
146
+ return 1
568
147
 
569
- # If single chunk exceeds limit, split it
570
- if chunk_tokens > max_tokens:
571
- logger.warning(f"Chunk with {chunk_tokens} estimated tokens exceeds limit of {max_tokens}, splitting...")
572
- split_chunks = split_large_chunk(chunk, max_tokens)
573
- for split_chunk in split_chunks:
574
- split_tokens = estimate_tokens(split_chunk["text"])
575
- if split_tokens > max_tokens:
576
- logger.error(f"Split chunk still exceeds limit: {split_tokens} tokens")
577
- batches.append([split_chunk])
578
- # If adding chunk would exceed limit, start new batch
579
- elif current_tokens + chunk_tokens > max_tokens:
580
- if current_batch:
581
- batches.append(current_batch)
582
- current_batch = [chunk]
583
- current_tokens = chunk_tokens
584
- else:
585
- current_batch.append(chunk)
586
- current_tokens += chunk_tokens
148
+ except Exception as e:
149
+ logger.error(f"Error processing chunk {chunk_index}: {e}")
150
+ return 0
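
Because the point ID above is derived from (conversation_id, chunk_index), re-importing the same chunk overwrites the existing point rather than creating a duplicate. A short sketch of that derivation:

# Deterministic point IDs: the same inputs always map to the same 63-bit integer,
# so repeated imports upsert in place instead of duplicating points.
import hashlib

def point_id_for(conversation_id: str, chunk_index: int) -> int:
    digest = hashlib.md5(f"{conversation_id}_{chunk_index}".encode()).hexdigest()[:16]
    return int(digest, 16) % (2**63)        # keeps the ID inside Qdrant's unsigned 64-bit range

assert point_id_for("abc123", 0) == point_id_for("abc123", 0)
assert point_id_for("abc123", 0) != point_id_for("abc123", 1)
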
151
+
152
+ def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str]:
153
+ """Extract metadata in a single pass, return metadata and first timestamp."""
154
+ metadata = {
155
+ "files_analyzed": [],
156
+ "files_edited": [],
157
+ "tools_used": [],
158
+ "concepts": []
159
+ }
587
160
 
588
- if current_batch:
589
- batches.append(current_batch)
161
+ first_timestamp = None
590
162
 
591
- # Log batch statistics
592
- if batches:
593
- batch_sizes = [len(batch) for batch in batches]
594
- batch_tokens = [sum(estimate_tokens(chunk["text"]) for chunk in batch) for batch in batches]
595
- logger.debug(f"Created {len(batches)} batches, chunk counts: min={min(batch_sizes)}, max={max(batch_sizes)}, "
596
- f"estimated tokens: min={min(batch_tokens)}, max={max(batch_tokens)}, avg={sum(batch_tokens)//len(batches)}")
163
+ try:
164
+ with open(file_path, 'r', encoding='utf-8') as f:
165
+ for line in f:
166
+ if not line.strip():
167
+ continue
168
+
169
+ try:
170
+ data = json.loads(line)
171
+
172
+ # Get timestamp from first valid entry
173
+ if first_timestamp is None and 'timestamp' in data:
174
+ first_timestamp = data.get('timestamp')
175
+
176
+ # Extract tool usage from messages
177
+ if 'message' in data and data['message']:
178
+ msg = data['message']
179
+ if msg.get('content'):
180
+ content = msg['content']
181
+ if isinstance(content, list):
182
+ for item in content:
183
+ if isinstance(item, dict) and item.get('type') == 'tool_use':
184
+ tool_name = item.get('name', '')
185
+ if tool_name and tool_name not in metadata['tools_used']:
186
+ metadata['tools_used'].append(tool_name)
187
+
188
+ # Extract file references
189
+ if 'input' in item:
190
+ input_data = item['input']
191
+ if isinstance(input_data, dict):
192
+ if 'file_path' in input_data:
193
+ file_ref = input_data['file_path']
194
+ if file_ref not in metadata['files_analyzed']:
195
+ metadata['files_analyzed'].append(file_ref)
196
+ if 'path' in input_data:
197
+ file_ref = input_data['path']
198
+ if file_ref not in metadata['files_analyzed']:
199
+ metadata['files_analyzed'].append(file_ref)
200
+
201
+ except json.JSONDecodeError:
202
+ continue
203
+ except Exception:
204
+ continue
205
+
206
+ except Exception as e:
207
+ logger.warning(f"Error extracting metadata: {e}")
597
208
 
598
- return batches
209
+ return metadata, first_timestamp or datetime.now().isoformat()
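
For reference, an illustrative record in the shape this parser expects (field names are taken from the code above; all values are made up), together with the core of the extraction logic:

# A made-up JSONL record with a tool_use entry, and a simplified extraction pass.
sample_record = {
    "timestamp": "2025-01-01T12:00:00Z",
    "message": {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "Let me read that file."},
            {"type": "tool_use", "name": "Read", "input": {"file_path": "src/app.py"}},
        ],
    },
}

tools, files = [], []
for item in sample_record["message"]["content"]:
    if isinstance(item, dict) and item.get("type") == "tool_use":
        tools.append(item.get("name", ""))
        file_ref = item.get("input", {}).get("file_path")
        if file_ref:
            files.append(file_ref)
print(tools, files)                         # ['Read'] ['src/app.py']
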
599
210
 
600
- def import_project(project_path: Path, collection_name: str, state: dict) -> int:
601
- """Import all conversations from a project."""
602
- jsonl_files = list(project_path.glob("*.jsonl"))
603
-
604
- if not jsonl_files:
605
- logger.warning(f"No JSONL files found in {project_path}")
606
- return 0
211
+ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Path) -> int:
212
+ """Stream import a single JSONL file without loading it into memory."""
213
+ logger.info(f"Streaming import of {jsonl_file.name}")
607
214
 
608
- # Check if collection exists
609
- collections = client.get_collections().collections
610
- if collection_name not in [c.name for c in collections]:
611
- logger.info(f"Creating collection: {collection_name}")
612
- client.create_collection(
613
- collection_name=collection_name,
614
- vectors_config=VectorParams(
615
- size=embedding_dimension,
616
- distance=Distance.COSINE
617
- )
618
- )
215
+ # Extract metadata in first pass (lightweight)
216
+ metadata, created_at = extract_metadata_single_pass(str(jsonl_file))
619
217
 
218
+ # Stream messages and process in chunks
219
+ chunk_buffer = []
220
+ chunk_index = 0
620
221
  total_chunks = 0
222
+ conversation_id = jsonl_file.stem
621
223
 
622
- for jsonl_file in jsonl_files:
623
- # Check if file should be imported
624
- if not should_import_file(jsonl_file, state):
625
- continue
626
-
627
- logger.info(f"Processing file: {jsonl_file.name}")
628
- try:
629
- # Read JSONL file and extract messages
630
- messages = []
631
- created_at = None
632
-
633
- with open(jsonl_file, 'r', encoding='utf-8') as f:
634
- for line_num, line in enumerate(f, 1):
635
- line = line.strip()
636
- if not line:
224
+ try:
225
+ with open(jsonl_file, 'r', encoding='utf-8') as f:
226
+ for line_num, line in enumerate(f, 1):
227
+ line = line.strip()
228
+ if not line:
229
+ continue
230
+
231
+ try:
232
+ data = json.loads(line)
233
+
234
+ # Skip non-message lines
235
+ if data.get('type') == 'summary':
637
236
  continue
638
237
 
639
- try:
640
- data = json.loads(line)
641
-
642
- # Extract timestamp from first message
643
- if created_at is None and 'timestamp' in data:
644
- created_at = data.get('timestamp')
645
-
646
- # Skip non-message lines (summaries, etc.)
647
- if data.get('type') == 'summary':
648
- continue
238
+ # Extract message if present
239
+ if 'message' in data and data['message']:
240
+ msg = data['message']
241
+ if msg.get('role') and msg.get('content'):
242
+ # Extract content
243
+ content = msg['content']
244
+ if isinstance(content, list):
245
+ text_parts = []
246
+ for item in content:
247
+ if isinstance(item, dict) and item.get('type') == 'text':
248
+ text_parts.append(item.get('text', ''))
249
+ elif isinstance(item, str):
250
+ text_parts.append(item)
251
+ content = '\n'.join(text_parts)
649
252
 
650
- # Extract message if present
651
- if 'message' in data and data['message']:
652
- msg = data['message']
653
- if msg.get('role') and msg.get('content'):
654
- # Handle content that's an array of objects
655
- content = msg['content']
656
- if isinstance(content, list):
657
- text_parts = []
658
- for item in content:
659
- if isinstance(item, dict) and item.get('type') == 'text':
660
- text_parts.append(item.get('text', ''))
661
- elif isinstance(item, str):
662
- text_parts.append(item)
663
- content = '\n'.join(text_parts)
253
+ if content:
254
+ chunk_buffer.append({
255
+ 'role': msg['role'],
256
+ 'content': content
257
+ })
664
258
 
665
- if content:
666
- messages.append({
667
- 'role': msg['role'],
668
- 'content': content
669
- })
670
- except json.JSONDecodeError:
671
- logger.debug(f"Skipping invalid JSON at line {line_num}")
672
- except Exception as e:
673
- logger.error(f"Error processing line {line_num}: {e}")
674
-
675
- if not messages:
676
- continue
677
-
678
- # Extract metadata
679
- if created_at is None:
680
- created_at = datetime.now().isoformat()
681
- conversation_id = jsonl_file.stem
682
-
683
- # Extract tool usage metadata from the file
684
- metadata = extract_metadata_from_jsonl(str(jsonl_file))
685
-
686
- # Chunk the conversation
687
- chunks = chunk_conversation(messages)
688
-
689
- if not chunks:
690
- continue
691
-
692
- # Process in batches (token-aware if enabled)
693
- token_aware_batches = create_token_aware_batches(chunks)
694
-
695
- for batch_idx, batch in enumerate(token_aware_batches):
696
- texts = [chunk["text"] for chunk in batch]
697
-
698
- # Log batch info for debugging
699
- if USE_TOKEN_AWARE_BATCHING:
700
- total_tokens = sum(estimate_tokens(text) for text in texts)
701
- logger.debug(f"Batch {batch_idx + 1}/{len(token_aware_batches)}: {len(texts)} chunks, ~{total_tokens} estimated tokens")
702
-
703
- # Generate embeddings
704
- embeddings = generate_embeddings(texts)
705
-
706
- # Create points
707
- points = []
708
- for chunk, embedding in zip(batch, embeddings):
709
- point_id = hashlib.md5(
710
- f"{conversation_id}_{chunk['chunk_index']}".encode()
711
- ).hexdigest()[:16]
712
-
713
- # Combine basic payload with metadata
714
- payload = {
715
- "text": chunk["text"],
716
- "conversation_id": conversation_id,
717
- "chunk_index": chunk["chunk_index"],
718
- "timestamp": created_at,
719
- "project": normalize_project_name(project_path.name),
720
- "start_role": chunk["start_role"]
721
- }
722
- # Add metadata fields
723
- payload.update(metadata)
724
-
725
- points.append(PointStruct(
726
- id=int(point_id, 16) % (2**63), # Convert to valid integer ID
727
- vector=embedding,
728
- payload=payload
729
- ))
730
-
731
- # Upload to Qdrant
732
- client.upsert(
733
- collection_name=collection_name,
734
- points=points
735
- )
736
-
737
- total_chunks += len(points)
738
-
739
- file_chunks = len(chunks)
740
- logger.info(f"Imported {file_chunks} chunks from {jsonl_file.name}")
741
-
742
- # Update state for this file
743
- update_file_state(jsonl_file, state, file_chunks)
744
-
745
- # Save state after each file to prevent loss on OOM
746
- save_state(state)
747
-
748
- # Force garbage collection to free memory
749
- gc.collect()
750
-
751
- except Exception as e:
752
- logger.error(f"Failed to import {jsonl_file}: {e}")
753
- import traceback
754
- logger.error(traceback.format_exc())
755
-
756
- return total_chunks
259
+ # Process chunk when buffer reaches MAX_CHUNK_SIZE
260
+ if len(chunk_buffer) >= MAX_CHUNK_SIZE:
261
+ chunks = process_and_upload_chunk(
262
+ chunk_buffer, chunk_index, conversation_id,
263
+ created_at, metadata, collection_name, project_path
264
+ )
265
+ total_chunks += chunks
266
+ chunk_buffer = []
267
+ chunk_index += 1
268
+
269
+ # Force garbage collection after each chunk
270
+ gc.collect()
271
+
272
+ # Log progress
273
+ if chunk_index % 10 == 0:
274
+ logger.info(f"Processed {chunk_index} chunks from {jsonl_file.name}")
275
+
276
+ except json.JSONDecodeError:
277
+ logger.debug(f"Skipping invalid JSON at line {line_num}")
278
+ except Exception as e:
279
+ logger.debug(f"Error processing line {line_num}: {e}")
280
+
281
+ # Process remaining messages
282
+ if chunk_buffer:
283
+ chunks = process_and_upload_chunk(
284
+ chunk_buffer, chunk_index, conversation_id,
285
+ created_at, metadata, collection_name, project_path
286
+ )
287
+ total_chunks += chunks
288
+
289
+ logger.info(f"Imported {total_chunks} chunks from {jsonl_file.name}")
290
+ return total_chunks
291
+
292
+ except Exception as e:
293
+ logger.error(f"Failed to import {jsonl_file}: {e}")
294
+ return 0
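
The buffer-and-flush pattern above, factored into a standalone generator for clarity (illustrative only; the importer keeps it inline so it can log progress and force garbage collection between chunks):

# Accumulate items until the batch is full, yield it, then start a new one;
# the trailing partial batch is flushed at the end.
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")

def batched(items: Iterable[T], size: int) -> Iterator[List[T]]:
    buffer: List[T] = []
    for item in items:
        buffer.append(item)
        if len(buffer) >= size:
            yield buffer
            buffer = []
    if buffer:
        yield buffer

print(list(batched(range(7), 3)))           # [[0, 1, 2], [3, 4, 5], [6]]
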
295
+
296
+ def load_state() -> dict:
297
+ """Load import state."""
298
+ if os.path.exists(STATE_FILE):
299
+ try:
300
+ with open(STATE_FILE, 'r') as f:
301
+ return json.load(f)
302
+ except:
303
+ pass
304
+ return {"imported_files": {}}
305
+
306
+ def save_state(state: dict):
307
+ """Save import state."""
308
+ os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
309
+ with open(STATE_FILE, 'w') as f:
310
+ json.dump(state, f, indent=2)
311
+
312
+ def should_import_file(file_path: Path, state: dict) -> bool:
313
+ """Check if file should be imported."""
314
+ file_str = str(file_path)
315
+ if file_str in state.get("imported_files", {}):
316
+ file_info = state["imported_files"][file_str]
317
+ last_modified = file_path.stat().st_mtime
318
+ if file_info.get("last_modified") == last_modified:
319
+ logger.info(f"Skipping unchanged file: {file_path.name}")
320
+ return False
321
+ return True
322
+
323
+ def update_file_state(file_path: Path, state: dict, chunks: int):
324
+ """Update state for imported file."""
325
+ file_str = str(file_path)
326
+ state["imported_files"][file_str] = {
327
+ "imported_at": datetime.now().isoformat(),
328
+ "last_modified": file_path.stat().st_mtime,
329
+ "chunks": chunks
330
+ }
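
An illustrative view of what the state file ends up containing (the path and numbers below are invented). should_import_file skips a file only when the stored last_modified exactly equals the file's current st_mtime.

# Hypothetical contents of imported-files.json after one import.
example_state = {
    "imported_files": {
        "/logs/my-project/session-1234.jsonl": {
            "imported_at": "2025-01-01T12:05:00",
            "last_modified": 1735732800.0,   # st_mtime captured at import time
            "chunks": 12,
        }
    }
}
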
757
331
 
758
332
  def main():
759
333
  """Main import function."""
760
- logs_path = Path(LOGS_DIR)
761
-
762
- if not logs_path.exists():
763
- logger.error(f"Logs directory not found: {LOGS_DIR}")
764
- return
765
-
766
- # Load existing state
334
+ # Load state
767
335
  state = load_state()
768
- logger.info(f"Loaded state with {len(state['imported_files'])} previously imported files")
769
-
770
- # Find all project directories
771
- project_dirs = [d for d in logs_path.iterdir() if d.is_dir()]
772
-
773
- if not project_dirs:
774
- logger.warning("No project directories found")
775
- return
336
+ logger.info(f"Loaded state with {len(state.get('imported_files', {}))} previously imported files")
776
337
 
338
+ # Find all projects
339
+ logs_dir = Path(os.getenv("LOGS_DIR", "/logs"))
340
+ project_dirs = [d for d in logs_dir.iterdir() if d.is_dir()]
777
341
  logger.info(f"Found {len(project_dirs)} projects to import")
778
342
 
779
- # Import each project
780
343
  total_imported = 0
344
+
781
345
  for project_dir in project_dirs:
782
- # Create collection name from normalized project name
783
- normalized_name = normalize_project_name(project_dir.name)
784
- collection_name = f"conv_{hashlib.md5(normalized_name.encode()).hexdigest()[:8]}{collection_suffix}"
346
+ # Get collection name
347
+ collection_name = get_collection_name(project_dir)
348
+ logger.info(f"Importing project: {project_dir.name} -> {collection_name}")
785
349
 
786
- logger.info(f"Importing project: {project_dir.name} (normalized: {normalized_name}) -> {collection_name}")
787
- chunks = import_project(project_dir, collection_name, state)
788
- total_imported += chunks
789
- logger.info(f"Imported {chunks} chunks from {project_dir.name}")
350
+ # Ensure collection exists
351
+ ensure_collection(collection_name)
790
352
 
791
- # Save state after each project to avoid losing progress
792
- save_state(state)
793
-
794
- # Final save (redundant but ensures state is saved)
795
- save_state(state)
353
+ # Find JSONL files
354
+ jsonl_files = sorted(project_dir.glob("*.jsonl"))
355
+
356
+ # Limit files per cycle if specified
357
+ max_files = int(os.getenv("MAX_FILES_PER_CYCLE", "1000"))
358
+ jsonl_files = jsonl_files[:max_files]
359
+
360
+ for jsonl_file in jsonl_files:
361
+ if should_import_file(jsonl_file, state):
362
+ chunks = stream_import_file(jsonl_file, collection_name, project_dir)
363
+ if chunks > 0:
364
+ update_file_state(jsonl_file, state, chunks)
365
+ save_state(state)
366
+ total_imported += 1
367
+
368
+ # Force GC after each file
369
+ gc.collect()
796
370
 
797
- logger.info(f"Import complete! Total chunks imported: {total_imported}")
371
+ logger.info(f"Import complete: processed {total_imported} files")
798
372
 
799
373
  if __name__ == "__main__":
800
374
  main()
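
A hypothetical local invocation, shown only as a sketch: the script's filename does not appear in this diff, so streaming_importer.py below is an assumption, and the environment variable names come from the configuration block near the top of the file.

# Run the importer against a local Qdrant with local embeddings (all values illustrative).
import os
import subprocess

env = dict(
    os.environ,
    QDRANT_URL="http://localhost:6333",
    LOGS_DIR=os.path.expanduser("~/claude-logs"),
    STATE_FILE="./config/imported-files.json",
    PREFER_LOCAL_EMBEDDINGS="true",
    MAX_CHUNK_SIZE="50",
)
subprocess.run(["python", "streaming_importer.py"], env=env, check=True)
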