claude-self-reflect 2.5.19 → 2.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,629 +1,374 @@
  #!/usr/bin/env python3
  """
- Unified import script that supports both local and Voyage AI embeddings.
+ Streaming importer with true line-by-line processing to prevent OOM.
+ Processes JSONL files without loading entire file into memory.
  """

+ import json
  import os
  import sys
- import json
- import glob
  import hashlib
  import gc
- import re
+ from pathlib import Path
  from datetime import datetime
- from typing import List, Dict, Any, Set
+ from typing import List, Dict, Any, Optional
  import logging
- from pathlib import Path

- # Add the mcp-server/src directory to the Python path
- sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'mcp-server', 'src'))
- from utils import normalize_project_name
+ # Add the project root to the Python path
+ project_root = Path(__file__).parent.parent
+ sys.path.insert(0, str(project_root))

  from qdrant_client import QdrantClient
- from qdrant_client.models import (
- VectorParams, Distance, PointStruct,
- Filter, FieldCondition, MatchValue
- )
+ from qdrant_client.models import PointStruct, Distance, VectorParams

- from tenacity import (
- retry,
- stop_after_attempt,
- wait_random_exponential,
+ # Set up logging
+ logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(levelname)s - %(message)s'
  )
+ logger = logging.getLogger(__name__)

- # Configuration
+ # Environment variables
  QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
- LOGS_DIR = os.getenv("LOGS_DIR", "/logs")
- # Default to project config directory for state file
- default_state_file = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "config", "imported-files.json")
- STATE_FILE = os.getenv("STATE_FILE", default_state_file)
- BATCH_SIZE = int(os.getenv("BATCH_SIZE", "10")) # Reduced from 100 to prevent OOM
- PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "false").lower() == "true"
+ STATE_FILE = os.getenv("STATE_FILE", "/config/imported-files.json")
+ PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "true").lower() == "true"
  VOYAGE_API_KEY = os.getenv("VOYAGE_KEY")
- CURRENT_METADATA_VERSION = 2 # Version 2: Added tool output extraction
-
- # Set up logging
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
- logger = logging.getLogger(__name__)
+ MAX_CHUNK_SIZE = int(os.getenv("MAX_CHUNK_SIZE", "50")) # Messages per chunk

- # ============= Metadata Extraction Functions =============
-
- def normalize_path_for_metadata(path: str) -> str:
- """Normalize file paths for consistency in metadata."""
- if not path:
- return ""
-
- # Remove common prefixes
- path = path.replace("/Users/", "~/")
- path = path.replace("\\Users\\", "~\\")
-
- # Convert to forward slashes
- path = path.replace("\\", "/")
-
- # Remove duplicate slashes
- path = re.sub(r'/+', '/', path)
-
- return path
-
- def extract_concepts(text: str, tool_usage: Dict[str, Any]) -> Set[str]:
- """Extract high-level concepts from conversation and tool usage."""
- concepts = set()
-
- # Common development concepts with patterns
- concept_patterns = {
- 'security': r'(security|vulnerability|CVE|injection|sanitize|escape|auth|token|JWT)',
- 'performance': r'(performance|optimization|speed|memory|efficient|benchmark|latency)',
- 'testing': r'(test|pytest|unittest|coverage|TDD|spec|assert)',
- 'docker': r'(docker|container|compose|dockerfile|kubernetes|k8s)',
- 'api': r'(API|REST|GraphQL|endpoint|webhook|http|request)',
- 'database': r'(database|SQL|query|migration|schema|postgres|mysql|mongodb|qdrant)',
- 'authentication': r'(auth|login|token|JWT|session|oauth|permission)',
- 'debugging': r'(debug|error|exception|traceback|log|stack|trace)',
- 'refactoring': r'(refactor|cleanup|improve|restructure|optimize|technical debt)',
- 'deployment': r'(deploy|CI/CD|release|production|staging|rollout)',
- 'git': r'(git|commit|branch|merge|pull request|PR|rebase)',
- 'architecture': r'(architecture|design|pattern|structure|component|module)',
- 'mcp': r'(MCP|claude-self-reflect|tool|agent|claude code)',
- 'embeddings': r'(embedding|vector|semantic|similarity|fastembed|voyage)',
- 'search': r'(search|query|find|filter|match|relevance)'
- }
-
- # Check text content (limit to first 10000 chars for performance)
- combined_text = text[:10000].lower() if text else ""
- for concept, pattern in concept_patterns.items():
- if re.search(pattern, combined_text, re.IGNORECASE):
- concepts.add(concept)
-
- # Check tool usage patterns
- if tool_usage.get('grep_searches'):
- concepts.add('search')
- if tool_usage.get('files_edited') or tool_usage.get('files_created'):
- concepts.add('development')
- if any('test' in str(f).lower() for f in tool_usage.get('files_read', [])):
- concepts.add('testing')
- if any('docker' in str(cmd).lower() for cmd in tool_usage.get('bash_commands', [])):
- concepts.add('docker')
-
- return concepts
-
- def extract_files_from_git_output(output_text: str) -> List[str]:
- """Extract file paths from git command outputs (diff, show, status, etc)."""
- files = set()
-
- # Patterns for different git output formats
- patterns = [
- r'diff --git a/(.*?) b/', # git diff format
- r'^\+\+\+ b/(.+)$', # diff new file
- r'^--- a/(.+)$', # diff old file
- r'^modified:\s+(.+)$', # git status
- r'^deleted:\s+(.+)$', # git status
- r'^new file:\s+(.+)$', # git status
- r'^renamed:\s+(.+) -> (.+)$', # git status (captures both)
- ]
-
- for pattern in patterns:
- matches = re.findall(pattern, output_text, re.MULTILINE)
- for match in matches:
- if isinstance(match, tuple):
- # Handle renamed files (captures both old and new)
- for f in match:
- if f:
- files.add(normalize_path_for_metadata(f))
- else:
- files.add(normalize_path_for_metadata(match))
-
- return list(files)[:20] # Limit to 20 files
-
- def extract_tool_data_from_message(tool_use: Dict[str, Any], usage_dict: Dict[str, Any], tool_output: str = None):
- """Extract tool usage data from a tool_use object in a message, including outputs."""
- tool_name = tool_use.get('name', '')
- inputs = tool_use.get('input', {})
-
- # Track tool in summary
- usage_dict['tools_summary'][tool_name] = usage_dict['tools_summary'].get(tool_name, 0) + 1
-
- # Handle Read tool
- if tool_name == 'Read':
- file_path = inputs.get('file_path')
- if file_path:
- normalized = normalize_path_for_metadata(file_path)
- if normalized not in usage_dict['files_read']:
- usage_dict['files_read'].append(normalized)
-
- # Handle Edit and MultiEdit tools
- elif tool_name in ['Edit', 'MultiEdit']:
- path = inputs.get('file_path')
- if path:
- normalized = normalize_path_for_metadata(path)
- if normalized not in usage_dict['files_edited']:
- usage_dict['files_edited'].append(normalized)
-
- # Handle Write tool
- elif tool_name == 'Write':
- path = inputs.get('file_path')
- if path:
- normalized = normalize_path_for_metadata(path)
- if normalized not in usage_dict['files_created']:
- usage_dict['files_created'].append(normalized)
-
- # Handle Grep tool
- elif tool_name == 'Grep':
- pattern = inputs.get('pattern')
- if pattern and len(usage_dict['grep_searches']) < 10: # Limit
- usage_dict['grep_searches'].append(pattern[:100]) # Truncate long patterns
-
- # Handle Bash tool - Extract both command and output
- elif tool_name == 'Bash':
- command = inputs.get('command')
- if command and len(usage_dict['bash_commands']) < 10:
- usage_dict['bash_commands'].append(command[:200]) # Truncate
-
- # Process tool output for git commands
- if tool_output and any(cmd in command for cmd in ['git diff', 'git show', 'git status']):
- git_files = extract_files_from_git_output(tool_output)
- for file_path in git_files:
- if file_path not in usage_dict['git_file_changes']:
- usage_dict['git_file_changes'].append(file_path)
-
- # Store tool output preview (for any tool)
- if tool_output and len(usage_dict['tool_outputs']) < 15:
- usage_dict['tool_outputs'].append({
- 'tool': tool_name,
- 'command': inputs.get('command', inputs.get('pattern', ''))[:100],
- 'output_preview': tool_output[:500], # First 500 chars
- 'output_length': len(tool_output)
- })
-
- def extract_metadata_from_jsonl(file_path: str) -> Dict[str, Any]:
- """Extract metadata from a JSONL conversation file."""
- tool_usage = {
- "files_read": [],
- "files_edited": [],
- "files_created": [],
- "grep_searches": [],
- "bash_commands": [],
- "tools_summary": {},
- "git_file_changes": [], # NEW: Files from git outputs
- "tool_outputs": [] # NEW: Tool output previews
- }
-
- conversation_text = ""
- tool_outputs = {} # Map tool_use_id to output text
-
- try:
- # First pass: collect tool outputs
- with open(file_path, 'r', encoding='utf-8') as f:
- for line in f:
- if line.strip():
- try:
- data = json.loads(line)
- if 'message' in data and data['message']:
- msg = data['message']
- if msg.get('content') and isinstance(msg['content'], list):
- for item in msg['content']:
- if isinstance(item, dict) and item.get('type') == 'tool_result':
- # Capture tool output
- tool_id = item.get('tool_use_id')
- output_content = item.get('content', '')
- if tool_id and output_content:
- tool_outputs[tool_id] = output_content
- # Also check for toolUseResult in data
- if 'toolUseResult' in data:
- result = data['toolUseResult']
- if isinstance(result, dict):
- tool_outputs['last_result'] = json.dumps(result)[:1000]
- except:
- continue
-
- # Second pass: extract tool uses and text with outputs available
- with open(file_path, 'r', encoding='utf-8') as f:
- for line in f:
- if line.strip():
- try:
- data = json.loads(line)
- if 'message' in data and data['message']:
- msg = data['message']
- # Extract text
- if msg.get('content'):
- if isinstance(msg['content'], str):
- conversation_text += msg['content'] + "\n"
- elif isinstance(msg['content'], list):
- for item in msg['content']:
- if isinstance(item, dict):
- if item.get('type') == 'text' and item.get('text'):
- conversation_text += item['text'] + "\n"
- elif item.get('type') == 'tool_use':
- # Process tool use with output now available
- tool_id = item.get('id', '')
- output = tool_outputs.get(tool_id, '')
- extract_tool_data_from_message(item, tool_usage, output)
- except:
- continue
- except Exception as e:
- logger.warning(f"Error extracting metadata from {file_path}: {e}")
-
- # Extract concepts from text
- concepts = extract_concepts(conversation_text, tool_usage)
-
- # Build metadata
- metadata = {
- "files_analyzed": tool_usage['files_read'][:20], # Limit to 20
- "files_edited": tool_usage['files_edited'][:10], # Limit to 10
- "files_created": tool_usage['files_created'][:10],
- "tools_used": list(tool_usage['tools_summary'].keys())[:20],
- "tool_summary": dict(list(tool_usage['tools_summary'].items())[:10]),
- "concepts": list(concepts)[:15], # Limit to 15
- "search_patterns": tool_usage['grep_searches'][:10],
- "git_file_changes": tool_usage['git_file_changes'][:20], # NEW: Git file changes
- "tool_outputs": tool_usage['tool_outputs'][:15], # NEW: Tool output previews
- "analysis_only": len(tool_usage['files_edited']) == 0 and len(tool_usage['files_created']) == 0,
- "has_file_metadata": True,
- "metadata_version": CURRENT_METADATA_VERSION,
- "metadata_extracted_at": datetime.now().isoformat()
- }
-
- return metadata
-
- # ============= End Metadata Extraction Functions =============
-
- # State management functions
- def load_state():
- """Load the import state from file."""
- if os.path.exists(STATE_FILE):
- try:
- with open(STATE_FILE, 'r') as f:
- state = json.load(f)
- # Ensure the expected structure exists
- if "imported_files" not in state:
- state["imported_files"] = {}
- return state
- except Exception as e:
- logger.warning(f"Failed to load state file: {e}")
- return {"imported_files": {}}
-
- def save_state(state):
- """Save the import state to file."""
- try:
- # Ensure directory exists
- os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
- # Write atomically by using a temp file
- temp_file = STATE_FILE + ".tmp"
- with open(temp_file, 'w') as f:
- json.dump(state, f, indent=2)
- os.replace(temp_file, STATE_FILE)
- logger.debug(f"Saved state with {len(state['imported_files'])} files")
- except Exception as e:
- logger.error(f"Failed to save state file: {e}")
-
- def should_import_file(file_path, state):
- """Check if a file should be imported based on modification time."""
- str_path = str(file_path)
- file_mtime = os.path.getmtime(file_path)
-
- if str_path in state["imported_files"]:
- file_state = state["imported_files"][str_path]
-
- # Handle both old string format and new dict format
- if isinstance(file_state, str):
- # Old format (just timestamp string) - treat as needs reimport
- logger.info(f"Found old format state for {file_path.name}, will reimport")
- return True
- else:
- # New format with dictionary
- last_imported = file_state.get("last_imported", 0)
- last_modified = file_state.get("last_modified", 0)
-
- # Skip if file hasn't been modified since last import
- if file_mtime <= last_modified and last_imported > 0:
- logger.info(f"Skipping unchanged file: {file_path.name}")
- return False
-
- return True
-
- def update_file_state(file_path, state, chunks_imported):
- """Update the state for an imported file."""
- str_path = str(file_path)
- state["imported_files"][str_path] = {
- "last_modified": os.path.getmtime(file_path),
- "last_imported": datetime.now().timestamp(),
- "chunks_imported": chunks_imported
- }
+ # Initialize Qdrant client
+ client = QdrantClient(url=QDRANT_URL)

  # Initialize embedding provider
  embedding_provider = None
  embedding_dimension = None
- collection_suffix = None

  if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
- # Use local embeddings
  logger.info("Using local embeddings (fastembed)")
  from fastembed import TextEmbedding
- embedding_provider = TextEmbedding("sentence-transformers/all-MiniLM-L6-v2")
+ embedding_provider = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
  embedding_dimension = 384
- collection_suffix = "_local"
+ collection_suffix = "local"
  else:
- # Use Voyage AI
  logger.info("Using Voyage AI embeddings")
  import voyageai
- voyage_client = voyageai.Client(api_key=VOYAGE_API_KEY)
+ embedding_provider = voyageai.Client(api_key=VOYAGE_API_KEY)
  embedding_dimension = 1024
- collection_suffix = "_voyage"
-
- # Initialize Qdrant client
- client = QdrantClient(url=QDRANT_URL)
+ collection_suffix = "voyage"

+ def normalize_project_name(project_name: str) -> str:
+ """Normalize project name for consistency."""
+ return project_name.replace("-Users-ramakrishnanannaswamy-projects-", "").replace("-", "_").lower()

- def log_retry_state(retry_state):
- print(f"Retrying function '{retry_state.fn.__name__}' for the {retry_state.attempt_number} time.")
- print(f"----> Waiting for {retry_state.next_action.sleep} seconds before next attempt.")
+ def get_collection_name(project_path: Path) -> str:
+ """Generate collection name from project path."""
+ normalized = normalize_project_name(project_path.name)
+ name_hash = hashlib.md5(normalized.encode()).hexdigest()[:8]
+ return f"conv_{name_hash}_{collection_suffix}"

- @retry(wait=wait_random_exponential(multiplier=2, min=30, max=120), stop=stop_after_attempt(6), before_sleep=log_retry_state)
- def embed_with_backoff(**kwargs):
- return voyage_client.embed(**kwargs)
+ def ensure_collection(collection_name: str):
+ """Ensure collection exists with correct configuration."""
+ collections = client.get_collections().collections
+ if not any(c.name == collection_name for c in collections):
+ logger.info(f"Creating collection: {collection_name}")
+ client.create_collection(
+ collection_name=collection_name,
+ vectors_config=VectorParams(size=embedding_dimension, distance=Distance.COSINE)
+ )

  def generate_embeddings(texts: List[str]) -> List[List[float]]:
- """Generate embeddings for a list of texts."""
+ """Generate embeddings for texts."""
  if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
- # Local embeddings using FastEmbed
  embeddings = list(embedding_provider.passage_embed(texts))
- return [embedding.tolist() for embedding in embeddings]
+ return [emb.tolist() if hasattr(emb, 'tolist') else emb for emb in embeddings]
  else:
- # Voyage AI embeddings
- result = embed_with_backoff(
- texts=texts,
- model="voyage-3-large",
- input_type="document"
- )
- return result.embeddings
+ response = embedding_provider.embed(texts, model="voyage-3")
+ return response.embeddings

- def chunk_conversation(messages: List[Dict[str, Any]], chunk_size: int = 10) -> List[Dict[str, Any]]:
- """Chunk conversation into smaller segments."""
- chunks = []
-
- for i in range(0, len(messages), chunk_size):
- chunk_messages = messages[i:i + chunk_size]
-
- # Extract text content
- texts = []
- for msg in chunk_messages:
- role = msg.get("role", "unknown")
- content = msg.get("content", "")
-
- if isinstance(content, list):
- # Handle structured content
- text_parts = []
- for item in content:
- if isinstance(item, dict) and item.get("type") == "text":
- text_parts.append(item.get("text", ""))
- elif isinstance(item, str):
- text_parts.append(item)
- content = " ".join(text_parts)
-
- if content:
- texts.append(f"{role.upper()}: {content}")
-
- if texts:
- chunks.append({
- "text": "\n".join(texts),
- "messages": chunk_messages,
- "chunk_index": i // chunk_size,
- "start_role": chunk_messages[0].get("role", "unknown") if chunk_messages else "unknown"
- })
+ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
+ conversation_id: str, created_at: str,
+ metadata: Dict[str, Any], collection_name: str,
+ project_path: Path) -> int:
+ """Process and immediately upload a single chunk."""
+ if not messages:
+ return 0

- return chunks
-
- def import_project(project_path: Path, collection_name: str, state: dict) -> int:
- """Import all conversations from a project."""
- jsonl_files = list(project_path.glob("*.jsonl"))
+ # Extract text content
+ texts = []
+ for msg in messages:
+ role = msg.get("role", "unknown")
+ content = msg.get("content", "")
+ if content:
+ texts.append(f"{role.upper()}: {content}")

- if not jsonl_files:
- logger.warning(f"No JSONL files found in {project_path}")
+ if not texts:
  return 0

- # Check if collection exists
- collections = client.get_collections().collections
- if collection_name not in [c.name for c in collections]:
- logger.info(f"Creating collection: {collection_name}")
- client.create_collection(
+ chunk_text = "\n".join(texts)
+
+ try:
+ # Generate embedding
+ embeddings = generate_embeddings([chunk_text])
+
+ # Create point ID
+ point_id = hashlib.md5(
+ f"{conversation_id}_{chunk_index}".encode()
+ ).hexdigest()[:16]
+
+ # Create payload
+ payload = {
+ "text": chunk_text,
+ "conversation_id": conversation_id,
+ "chunk_index": chunk_index,
+ "timestamp": created_at,
+ "project": normalize_project_name(project_path.name),
+ "start_role": messages[0].get("role", "unknown") if messages else "unknown",
+ "message_count": len(messages)
+ }
+
+ # Add metadata
+ if metadata:
+ payload.update(metadata)
+
+ # Create point
+ point = PointStruct(
+ id=int(point_id, 16) % (2**63),
+ vector=embeddings[0],
+ payload=payload
+ )
+
+ # Upload immediately
+ client.upsert(
  collection_name=collection_name,
- vectors_config=VectorParams(
- size=embedding_dimension,
- distance=Distance.COSINE
- )
+ points=[point],
+ wait=True
  )
+
+ return 1
+
+ except Exception as e:
+ logger.error(f"Error processing chunk {chunk_index}: {e}")
+ return 0
+
+ def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str]:
+ """Extract metadata in a single pass, return metadata and first timestamp."""
+ metadata = {
+ "files_analyzed": [],
+ "files_edited": [],
+ "tools_used": [],
+ "concepts": []
+ }
+
+ first_timestamp = None
+
+ try:
+ with open(file_path, 'r', encoding='utf-8') as f:
+ for line in f:
+ if not line.strip():
+ continue
+
+ try:
+ data = json.loads(line)
+
+ # Get timestamp from first valid entry
+ if first_timestamp is None and 'timestamp' in data:
+ first_timestamp = data.get('timestamp')
+
+ # Extract tool usage from messages
+ if 'message' in data and data['message']:
+ msg = data['message']
+ if msg.get('content'):
+ content = msg['content']
+ if isinstance(content, list):
+ for item in content:
+ if isinstance(item, dict) and item.get('type') == 'tool_use':
+ tool_name = item.get('name', '')
+ if tool_name and tool_name not in metadata['tools_used']:
+ metadata['tools_used'].append(tool_name)
+
+ # Extract file references
+ if 'input' in item:
+ input_data = item['input']
+ if isinstance(input_data, dict):
+ if 'file_path' in input_data:
+ file_ref = input_data['file_path']
+ if file_ref not in metadata['files_analyzed']:
+ metadata['files_analyzed'].append(file_ref)
+ if 'path' in input_data:
+ file_ref = input_data['path']
+ if file_ref not in metadata['files_analyzed']:
+ metadata['files_analyzed'].append(file_ref)
+
+ except json.JSONDecodeError:
+ continue
+ except Exception:
+ continue
+
+ except Exception as e:
+ logger.warning(f"Error extracting metadata: {e}")
+
+ return metadata, first_timestamp or datetime.now().isoformat()
+
+ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Path) -> int:
+ """Stream import a single JSONL file without loading it into memory."""
+ logger.info(f"Streaming import of {jsonl_file.name}")
+
+ # Extract metadata in first pass (lightweight)
+ metadata, created_at = extract_metadata_single_pass(str(jsonl_file))

+ # Stream messages and process in chunks
+ chunk_buffer = []
+ chunk_index = 0
  total_chunks = 0
+ conversation_id = jsonl_file.stem

- for jsonl_file in jsonl_files:
- # Check if file should be imported
- if not should_import_file(jsonl_file, state):
- continue
-
- logger.info(f"Processing file: {jsonl_file.name}")
- try:
- # Read JSONL file and extract messages
- messages = []
- created_at = None
-
- with open(jsonl_file, 'r', encoding='utf-8') as f:
- for line_num, line in enumerate(f, 1):
- line = line.strip()
- if not line:
+ try:
+ with open(jsonl_file, 'r', encoding='utf-8') as f:
+ for line_num, line in enumerate(f, 1):
+ line = line.strip()
+ if not line:
+ continue
+
+ try:
+ data = json.loads(line)
+
+ # Skip non-message lines
+ if data.get('type') == 'summary':
  continue

- try:
- data = json.loads(line)
-
- # Extract timestamp from first message
- if created_at is None and 'timestamp' in data:
- created_at = data.get('timestamp')
-
- # Skip non-message lines (summaries, etc.)
- if data.get('type') == 'summary':
- continue
+ # Extract message if present
+ if 'message' in data and data['message']:
+ msg = data['message']
+ if msg.get('role') and msg.get('content'):
+ # Extract content
+ content = msg['content']
+ if isinstance(content, list):
+ text_parts = []
+ for item in content:
+ if isinstance(item, dict) and item.get('type') == 'text':
+ text_parts.append(item.get('text', ''))
+ elif isinstance(item, str):
+ text_parts.append(item)
+ content = '\n'.join(text_parts)

- # Extract message if present
- if 'message' in data and data['message']:
- msg = data['message']
- if msg.get('role') and msg.get('content'):
- # Handle content that's an array of objects
- content = msg['content']
- if isinstance(content, list):
- text_parts = []
- for item in content:
- if isinstance(item, dict) and item.get('type') == 'text':
- text_parts.append(item.get('text', ''))
- elif isinstance(item, str):
- text_parts.append(item)
- content = '\n'.join(text_parts)
+ if content:
+ chunk_buffer.append({
+ 'role': msg['role'],
+ 'content': content
+ })

- if content:
- messages.append({
- 'role': msg['role'],
- 'content': content
- })
- except json.JSONDecodeError:
- logger.debug(f"Skipping invalid JSON at line {line_num}")
- except Exception as e:
- logger.error(f"Error processing line {line_num}: {e}")
-
- if not messages:
- continue
-
- # Extract metadata
- if created_at is None:
- created_at = datetime.now().isoformat()
- conversation_id = jsonl_file.stem
-
- # Extract tool usage metadata from the file
- metadata = extract_metadata_from_jsonl(str(jsonl_file))
-
- # Chunk the conversation
- chunks = chunk_conversation(messages)
-
- if not chunks:
- continue
-
- # Process in batches
- for batch_start in range(0, len(chunks), BATCH_SIZE):
- batch = chunks[batch_start:batch_start + BATCH_SIZE]
- texts = [chunk["text"] for chunk in batch]
-
- # Generate embeddings
- embeddings = generate_embeddings(texts)
-
- # Create points
- points = []
- for chunk, embedding in zip(batch, embeddings):
- point_id = hashlib.md5(
- f"{conversation_id}_{chunk['chunk_index']}".encode()
- ).hexdigest()[:16]
-
- # Combine basic payload with metadata
- payload = {
- "text": chunk["text"],
- "conversation_id": conversation_id,
- "chunk_index": chunk["chunk_index"],
- "timestamp": created_at,
- "project": normalize_project_name(project_path.name),
- "start_role": chunk["start_role"]
- }
- # Add metadata fields
- payload.update(metadata)
-
- points.append(PointStruct(
- id=int(point_id, 16) % (2**63), # Convert to valid integer ID
- vector=embedding,
- payload=payload
- ))
-
- # Upload to Qdrant
- client.upsert(
- collection_name=collection_name,
- points=points
- )
-
- total_chunks += len(points)
-
- file_chunks = len(chunks)
- logger.info(f"Imported {file_chunks} chunks from {jsonl_file.name}")
-
- # Update state for this file
- update_file_state(jsonl_file, state, file_chunks)
-
- # Save state after each file to prevent loss on OOM
- save_state(state)
-
- # Force garbage collection to free memory
- gc.collect()
-
- except Exception as e:
- logger.error(f"Failed to import {jsonl_file}: {e}")
- import traceback
- logger.error(traceback.format_exc())
-
- return total_chunks
+ # Process chunk when buffer reaches MAX_CHUNK_SIZE
+ if len(chunk_buffer) >= MAX_CHUNK_SIZE:
+ chunks = process_and_upload_chunk(
+ chunk_buffer, chunk_index, conversation_id,
+ created_at, metadata, collection_name, project_path
+ )
+ total_chunks += chunks
+ chunk_buffer = []
+ chunk_index += 1
+
+ # Force garbage collection after each chunk
+ gc.collect()
+
+ # Log progress
+ if chunk_index % 10 == 0:
+ logger.info(f"Processed {chunk_index} chunks from {jsonl_file.name}")
+
+ except json.JSONDecodeError:
+ logger.debug(f"Skipping invalid JSON at line {line_num}")
+ except Exception as e:
+ logger.debug(f"Error processing line {line_num}: {e}")
+
+ # Process remaining messages
+ if chunk_buffer:
+ chunks = process_and_upload_chunk(
+ chunk_buffer, chunk_index, conversation_id,
+ created_at, metadata, collection_name, project_path
+ )
+ total_chunks += chunks
+
+ logger.info(f"Imported {total_chunks} chunks from {jsonl_file.name}")
+ return total_chunks
+
+ except Exception as e:
+ logger.error(f"Failed to import {jsonl_file}: {e}")
+ return 0
+
+ def load_state() -> dict:
+ """Load import state."""
+ if os.path.exists(STATE_FILE):
+ try:
+ with open(STATE_FILE, 'r') as f:
+ return json.load(f)
+ except:
+ pass
+ return {"imported_files": {}}
+
+ def save_state(state: dict):
+ """Save import state."""
+ os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
+ with open(STATE_FILE, 'w') as f:
+ json.dump(state, f, indent=2)
+
+ def should_import_file(file_path: Path, state: dict) -> bool:
+ """Check if file should be imported."""
+ file_str = str(file_path)
+ if file_str in state.get("imported_files", {}):
+ file_info = state["imported_files"][file_str]
+ last_modified = file_path.stat().st_mtime
+ if file_info.get("last_modified") == last_modified:
+ logger.info(f"Skipping unchanged file: {file_path.name}")
+ return False
+ return True
+
+ def update_file_state(file_path: Path, state: dict, chunks: int):
+ """Update state for imported file."""
+ file_str = str(file_path)
+ state["imported_files"][file_str] = {
+ "imported_at": datetime.now().isoformat(),
+ "last_modified": file_path.stat().st_mtime,
+ "chunks": chunks
+ }

  def main():
  """Main import function."""
- logs_path = Path(LOGS_DIR)
-
- if not logs_path.exists():
- logger.error(f"Logs directory not found: {LOGS_DIR}")
- return
-
- # Load existing state
+ # Load state
  state = load_state()
- logger.info(f"Loaded state with {len(state['imported_files'])} previously imported files")
-
- # Find all project directories
- project_dirs = [d for d in logs_path.iterdir() if d.is_dir()]
-
- if not project_dirs:
- logger.warning("No project directories found")
- return
+ logger.info(f"Loaded state with {len(state.get('imported_files', {}))} previously imported files")

+ # Find all projects
+ logs_dir = Path(os.getenv("LOGS_DIR", "/logs"))
+ project_dirs = [d for d in logs_dir.iterdir() if d.is_dir()]
  logger.info(f"Found {len(project_dirs)} projects to import")

- # Import each project
  total_imported = 0
+
  for project_dir in project_dirs:
- # Create collection name from normalized project name
- normalized_name = normalize_project_name(project_dir.name)
- collection_name = f"conv_{hashlib.md5(normalized_name.encode()).hexdigest()[:8]}{collection_suffix}"
+ # Get collection name
+ collection_name = get_collection_name(project_dir)
+ logger.info(f"Importing project: {project_dir.name} -> {collection_name}")

- logger.info(f"Importing project: {project_dir.name} (normalized: {normalized_name}) -> {collection_name}")
- chunks = import_project(project_dir, collection_name, state)
- total_imported += chunks
- logger.info(f"Imported {chunks} chunks from {project_dir.name}")
+ # Ensure collection exists
+ ensure_collection(collection_name)

- # Save state after each project to avoid losing progress
- save_state(state)
-
- # Final save (redundant but ensures state is saved)
- save_state(state)
+ # Find JSONL files
+ jsonl_files = sorted(project_dir.glob("*.jsonl"))
+
+ # Limit files per cycle if specified
+ max_files = int(os.getenv("MAX_FILES_PER_CYCLE", "1000"))
+ jsonl_files = jsonl_files[:max_files]
+
+ for jsonl_file in jsonl_files:
+ if should_import_file(jsonl_file, state):
+ chunks = stream_import_file(jsonl_file, collection_name, project_dir)
+ if chunks > 0:
+ update_file_state(jsonl_file, state, chunks)
+ save_state(state)
+ total_imported += 1
+
+ # Force GC after each file
+ gc.collect()

- logger.info(f"Import complete! Total chunks imported: {total_imported}")
+ logger.info(f"Import complete: processed {total_imported} files")

  if __name__ == "__main__":
  main()
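
For context on the pattern the new importer adopts: the added `stream_import_file` reads each JSONL transcript line by line, buffers at most `MAX_CHUNK_SIZE` parsed messages, and uploads each chunk before reading further, so peak memory tracks the chunk size rather than the conversation size. The sketch below is a minimal, self-contained illustration of that buffering pattern only; it is not the package's code, and the example path and the `handle_chunk` callback are hypothetical stand-ins for the embedding-and-upsert step shown in the diff.

```python
import json
from pathlib import Path
from typing import Callable, Dict, List

MAX_CHUNK_SIZE = 50  # messages per chunk, mirroring the env var used above


def stream_chunks(jsonl_path: Path,
                  handle_chunk: Callable[[List[Dict[str, str]], int], None],
                  chunk_size: int = MAX_CHUNK_SIZE) -> int:
    """Read a JSONL transcript line by line and hand off fixed-size
    message chunks without loading the whole file into memory."""
    buffer: List[Dict[str, str]] = []
    chunk_index = 0

    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue  # skip malformed lines, as the importer does

            msg = data.get("message") or {}
            role, content = msg.get("role"), msg.get("content")
            if not role or not isinstance(content, str) or not content:
                continue  # the real importer also flattens list-style content

            buffer.append({"role": role, "content": content})
            if len(buffer) >= chunk_size:
                handle_chunk(buffer, chunk_index)  # e.g. embed + upsert here
                buffer = []
                chunk_index += 1

    if buffer:  # flush the tail
        handle_chunk(buffer, chunk_index)
        chunk_index += 1
    return chunk_index


if __name__ == "__main__":
    # Hypothetical usage: print chunk sizes instead of embedding/upserting.
    example = Path("conversation.jsonl")  # placeholder path
    if example.exists():
        n = stream_chunks(example, lambda msgs, i: print(f"chunk {i}: {len(msgs)} messages"))
        print(f"processed {n} chunks")
```

Compared with the 2.5.19 flow (read every message, chunk, then batch-upload), holding only one chunk in memory at a time is the OOM mitigation the new module docstring describes.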