claude-self-reflect 5.0.4 → 5.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,241 @@
1
+ """
2
+ Embedding service abstraction to handle both local and cloud embeddings.
3
+ Reduces complexity by separating embedding concerns from import logic.
4
+ """
5
+
6
+ import os
7
+ import logging
8
+ from abc import ABC, abstractmethod
9
+ from typing import List, Optional
10
+ from pathlib import Path
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class EmbeddingProvider(ABC):
    """Interface that every embedding backend must implement."""

    @abstractmethod
    def generate_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Produce one embedding vector for each input text."""
        ...

    @abstractmethod
    def get_dimension(self) -> int:
        """Return the length of the vectors this backend emits."""
        ...

    @abstractmethod
    def get_collection_suffix(self) -> str:
        """Return the suffix appended to collection names for this backend."""
        ...
32
+
33
+
34
class LocalEmbeddingProvider(EmbeddingProvider):
    """Embedding backend that runs FastEmbed on the local machine."""

    def __init__(self):
        self.model = None  # populated by _initialize_model on success
        self.dimension = 384  # output size of BAAI/bge-small-en-v1.5
        self._initialize_model()

    def _initialize_model(self):
        """Load the FastEmbed text-embedding model; re-raises on any failure."""
        try:
            from fastembed import TextEmbedding
            self.model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")
            logger.info("Initialized local FastEmbed model (384 dimensions)")
        except ImportError:
            logger.error("FastEmbed not installed. Install with: pip install fastembed")
            raise
        except Exception as e:
            logger.exception(f"Failed to initialize FastEmbed: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Embed *texts* locally; raises RuntimeError if the model never loaded."""
        if not self.model:
            raise RuntimeError("FastEmbed model not initialized")

        try:
            return [list(vector) for vector in self.model.embed(texts)]
        except Exception as e:
            logger.error(f"Failed to generate local embeddings: {e}")
            raise

    def get_dimension(self) -> int:
        """Return the embedding dimension (384 for FastEmbed)."""
        return self.dimension

    def get_collection_suffix(self) -> str:
        """Return the collection suffix used for local embeddings."""
        return "local_384d"
74
+
75
+
76
class CloudEmbeddingProvider(EmbeddingProvider):
    """Embedding backend that calls the Voyage AI hosted API."""

    def __init__(self, api_key: str):
        # The key is handed straight to the client and never stored on self.
        self.client = None
        self.dimension = 1024  # output size of the voyage-2 model
        self._initialize_client(api_key)

    def _initialize_client(self, api_key: str):
        """Build the Voyage AI client; re-raises on any failure."""
        try:
            import voyageai
            self.client = voyageai.Client(api_key=api_key)
            logger.info("Initialized Voyage AI client (1024 dimensions)")
        except ImportError:
            logger.error("voyageai not installed. Install with: pip install voyageai")
            raise
        except Exception as e:
            logger.exception(f"Failed to initialize Voyage AI: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Embed *texts* remotely; raises RuntimeError if there is no client."""
        if not self.client:
            raise RuntimeError("Voyage AI client not initialized")

        try:
            response = self.client.embed(texts, model="voyage-2")
            return response.embeddings
        except Exception as e:
            logger.error(f"Failed to generate cloud embeddings: {e}")
            raise

    def get_dimension(self) -> int:
        """Return the embedding dimension (1024 for Voyage)."""
        return self.dimension

    def get_collection_suffix(self) -> str:
        """Return the collection suffix used for cloud embeddings."""
        return "cloud_1024d"
117
+
118
+
119
class EmbeddingService:
    """
    Service to manage embedding generation with automatic provider selection.

    Tries the preferred provider first (local FastEmbed by default) and falls
    back to the other backend when initialization fails, so callers interact
    with a single object regardless of which backend ended up active.
    """

    def __init__(self, prefer_local: bool = True, voyage_api_key: Optional[str] = None):
        """
        Initialize embedding service.

        Args:
            prefer_local: Whether to prefer local embeddings when available
            voyage_api_key: API key for Voyage AI (if using cloud embeddings)

        Raises:
            RuntimeError: If no embedding provider could be initialized.
        """
        self.prefer_local = prefer_local
        self.voyage_api_key = voyage_api_key
        self.provider = None
        self._initialize_provider()

    def _initialize_provider(self):
        """Initialize the appropriate embedding provider, with fallback."""
        # Local is attempted first both when explicitly preferred and when no
        # API key exists (cloud would be impossible anyway).
        if self.prefer_local or not self.voyage_api_key:
            try:
                self.provider = LocalEmbeddingProvider()
                logger.info("Using local embedding provider (FastEmbed)")
            except Exception as e:
                logger.warning(f"Failed to initialize local provider: {e}")
                if self.voyage_api_key:
                    self._fallback_to_cloud()
                else:
                    # Chain the original failure so callers see the root cause.
                    raise RuntimeError("No embedding provider available") from e
        else:
            try:
                self.provider = CloudEmbeddingProvider(self.voyage_api_key)
                logger.info("Using cloud embedding provider (Voyage AI)")
            except Exception as e:
                logger.warning(f"Failed to initialize cloud provider: {e}")
                self._fallback_to_local()

    def _fallback_to_cloud(self):
        """Fallback to the cloud provider.

        Raises:
            RuntimeError: If no API key is available or the client fails.
        """
        if not self.voyage_api_key:
            raise RuntimeError("No Voyage API key available for cloud fallback")
        try:
            self.provider = CloudEmbeddingProvider(self.voyage_api_key)
            logger.info("Fallback to cloud embedding provider")
            # Clear the key after use so the service does not retain it.
            # NOTE(review): the primary cloud path above does NOT clear the
            # key — confirm whether that asymmetry is intentional.
            self.voyage_api_key = None
        except Exception as e:
            raise RuntimeError(f"Failed to initialize any embedding provider: {e}") from e

    def _fallback_to_local(self):
        """Fallback to the local provider.

        Raises:
            RuntimeError: If the local provider also fails to initialize.
        """
        try:
            self.provider = LocalEmbeddingProvider()
            logger.info("Fallback to local embedding provider")
        except Exception as e:
            raise RuntimeError(f"Failed to initialize any embedding provider: {e}") from e

    def generate_embeddings(self, texts: List[str]) -> List[List[float]]:
        """
        Generate embeddings for texts using the configured provider.

        Args:
            texts: List of texts to embed

        Returns:
            List of embedding vectors.  NOTE: empty/whitespace-only texts are
            dropped before embedding, so the result may be shorter than the
            input and aligns with the filtered list, not the original one.

        Raises:
            RuntimeError: If no provider was initialized.
        """
        if not self.provider:
            raise RuntimeError("No embedding provider initialized")

        # Filter out empty texts; providers typically reject blank input.
        non_empty_texts = [t for t in texts if t and t.strip()]
        if not non_empty_texts:
            return []

        return self.provider.generate_embeddings(non_empty_texts)

    def get_dimension(self) -> int:
        """Return the dimension of embeddings from the active provider."""
        if not self.provider:
            raise RuntimeError("No embedding provider initialized")
        return self.provider.get_dimension()

    def get_collection_suffix(self) -> str:
        """Return the collection suffix for the active provider."""
        if not self.provider:
            raise RuntimeError("No embedding provider initialized")
        return self.provider.get_collection_suffix()

    def get_provider_name(self) -> str:
        """Return a human-readable name for the active provider."""
        if isinstance(self.provider, LocalEmbeddingProvider):
            return "FastEmbed (Local)"
        elif isinstance(self.provider, CloudEmbeddingProvider):
            return "Voyage AI (Cloud)"
        else:
            return "Unknown"
218
+
219
+
220
# Convenience factory
def create_embedding_service(
    prefer_local: Optional[bool] = None,
    voyage_api_key: Optional[str] = None
) -> EmbeddingService:
    """
    Create an embedding service, filling unset arguments from the environment.

    Args:
        prefer_local: Override for PREFER_LOCAL_EMBEDDINGS env var
        voyage_api_key: Override for VOYAGE_KEY env var

    Returns:
        Configured EmbeddingService instance
    """
    if prefer_local is None:
        flag = os.getenv("PREFER_LOCAL_EMBEDDINGS", "true")
        prefer_local = flag.lower() == "true"

    if voyage_api_key is None:
        voyage_api_key = os.getenv("VOYAGE_KEY")

    return EmbeddingService(prefer_local, voyage_api_key)
@@ -0,0 +1,344 @@
1
+ """
2
+ Import strategies using Strategy pattern to reduce complexity of stream_import_file.
3
+ """
4
+
5
+ import json
6
+ import gc
7
+ import os
8
+ import logging
9
+ from abc import ABC, abstractmethod
10
+ from pathlib import Path
11
+ from typing import Dict, Any, List, Optional, Generator
12
+ from datetime import datetime
13
+
14
+ from message_processors import MessageProcessorFactory
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class ImportStrategy(ABC):
    """Common interface for the different JSONL import strategies."""

    @abstractmethod
    def import_file(self, jsonl_file: Path, collection_name: str, project_path: Path) -> int:
        """Import one JSONL file; return the number of chunks stored."""
        ...
26
+
27
+
28
class ChunkBuffer:
    """Accumulates messages until a chunk is full, truncating huge content."""

    def __init__(self, max_size: int = 50):
        self.buffer: List[Dict[str, Any]] = []
        self.max_size = max_size
        self.current_index = 0
        # Cap on stored message content, overridable via the environment, to
        # keep per-chunk memory bounded.
        self.max_content_length = int(os.getenv('MAX_MESSAGE_CONTENT_LENGTH', '5000'))

    def add(self, message: Dict[str, Any]) -> bool:
        """Append *message* (truncated if oversized); True when the buffer is full."""
        if 'content' in message and len(message['content']) > self.max_content_length:
            # Work on a shallow copy so the caller's dict is never mutated.
            clipped = dict(message)
            clipped['content'] = clipped['content'][:self.max_content_length] + '...[truncated]'
            self.buffer.append(clipped)
        else:
            self.buffer.append(message)
        return len(self.buffer) >= self.max_size

    def get_and_clear(self) -> List[Dict[str, Any]]:
        """Return everything buffered so far and empty the buffer."""
        drained, self.buffer = self.buffer, []
        return drained

    def has_content(self) -> bool:
        """True while at least one message is buffered."""
        return bool(self.buffer)
56
+
57
+
58
class MessageStreamReader:
    """Streams processed messages out of Claude JSONL transcript files."""

    def __init__(self):
        self.processor_factory = MessageProcessorFactory()
        self.current_message_index = 0  # running index across one file

    def read_messages(self, file_path: Path) -> Generator[Dict[str, Any], None, None]:
        """Yield one processed message dict per usable line in *file_path*."""
        self.current_message_index = 0

        with open(file_path, 'r', encoding='utf-8') as handle:
            for line_num, raw in enumerate(handle, 1):
                stripped = raw.strip()
                if not stripped:
                    continue
                parsed = self._parse_line(stripped, line_num)
                if parsed:
                    yield parsed

    def _parse_line(self, line: str, line_num: int) -> Optional[Dict[str, Any]]:
        """Parse one JSONL line; return a message dict or None to skip it."""
        try:
            data = json.loads(line)

            # Summary lines carry no conversation content.
            if data.get('type') == 'summary':
                return None

            # Regular conversation entries wrap the payload under 'message'.
            if data.get('message'):
                return self._process_message(data['message'])

            # Some transcripts put tool activity at the top level instead.
            entry_type = data.get('type')
            if entry_type in ('tool_result', 'tool_use'):
                return self._process_tool_entry(data, entry_type)

        except json.JSONDecodeError:
            logger.debug(f"Skipping invalid JSON at line {line_num}")
        except (KeyError, TypeError, ValueError) as e:
            logger.debug(f"Error processing data at line {line_num}: {e}")
        except Exception as e:
            logger.warning(f"Unexpected error at line {line_num}: {e}")

        return None

    def _process_message(self, message: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Turn a raw message entry into a role/content/index dict, or None."""
        role = message.get('role')
        content = message.get('content')
        if not role or not content:
            return None

        text_content = self._extract_text_content(content)
        if not text_content:
            return None

        # Only user/assistant turns advance the conversation index; other
        # roles are pinned at index 0.
        if role in ('user', 'assistant'):
            message_idx = self.current_message_index
            self.current_message_index += 1
        else:
            message_idx = 0

        return {
            'role': role,
            'content': text_content,
            'message_index': message_idx
        }

    def _extract_text_content(self, content: Any) -> str:
        """Flatten string or list-of-parts content into one text string."""
        if isinstance(content, str):
            return content
        if not isinstance(content, list):
            return ''

        pieces = []
        for part in content:
            if isinstance(part, dict):
                rendered = self._process_content_item(part)
                if rendered:
                    pieces.append(rendered)
            elif isinstance(part, str):
                pieces.append(part)
        return '\n'.join(pieces)

    def _process_content_item(self, item: Dict[str, Any]) -> Optional[str]:
        """Render one structured content item as text, or None to drop it."""
        kind = item.get('type', '')

        if kind == 'text':
            return item.get('text', '')
        if kind == 'thinking':
            # Thinking blocks are capped at 1000 chars to bound chunk size.
            thought = item.get('thinking', '')
            return f"[Thinking] {thought[:1000]}" if thought else None
        if kind == 'tool_use':
            name = item.get('name', 'unknown')
            args = str(item.get('input', ''))[:500]
            return f"[Tool: {name}] {args}"
        if kind == 'tool_result':
            body = str(item.get('content', ''))[:1000]
            return f"[Result] {body}"

        return None

    def _process_tool_entry(self, data: Dict[str, Any], entry_type: str) -> Optional[Dict[str, Any]]:
        """Convert a top-level tool_use/tool_result entry into a message dict."""
        if entry_type == 'tool_use':
            name = data.get('name', 'unknown')
            args = str(data.get('input', ''))[:500]
            content = f"[Tool: {name}] {args}"
        elif entry_type == 'tool_result':
            result = self._extract_tool_result(data)
            content = f"[Result] {result[:1000]}"
        else:
            content = ''

        if not content:
            return None

        # Tool entries always consume a conversation index.
        message_idx = self.current_message_index
        self.current_message_index += 1

        return {
            'role': entry_type,
            'content': content,
            'message_index': message_idx
        }

    def _extract_tool_result(self, data: Dict[str, Any]) -> str:
        """Pull the textual result out of a tool_result entry."""
        result = data.get('content')

        if isinstance(result, list):
            texts = []
            for item in result:
                if isinstance(item, dict) and item.get('type') == 'text':
                    texts.append(item.get('text', ''))
                elif isinstance(item, str):
                    texts.append(item)
            result = "\n".join(texts)

        if not result:
            # Some entries put the payload under 'result' instead of 'content'.
            result = data.get('result', '')

        return str(result)
213
+
214
+
215
class StreamImportStrategy(ImportStrategy):
    """
    Strategy for streaming import with chunked processing.

    Messages are read lazily from the JSONL file, buffered into fixed-size
    chunks, stored via ``process_chunk_fn``, and stale points left behind by
    a previous import of the same conversation are cleaned up afterwards.
    """

    def __init__(self, client, process_chunk_fn, state_manager, max_chunk_size: int = 50,
                 cleanup_tolerance: Optional[int] = None):
        """
        Args:
            client: Qdrant client used for counting and deleting points.
            process_chunk_fn: Callable that stores one chunk of messages and
                returns the number of chunks it created.
            state_manager: Object used to record failed files.
            max_chunk_size: Number of messages buffered per chunk.
            cleanup_tolerance: Extra stale points tolerated before a cleanup
                delete is issued; defaults to the CLEANUP_TOLERANCE env var (5).
        """
        self.client = client
        self.process_chunk_fn = process_chunk_fn
        self.state_manager = state_manager
        self.max_chunk_size = max_chunk_size
        # Fix: the previous `cleanup_tolerance or int(...)` treated an explicit
        # 0 as "unset" and silently fell back to the env default; compare
        # against None so a caller-supplied 0 is honored.
        if cleanup_tolerance is None:
            cleanup_tolerance = int(os.getenv('CLEANUP_TOLERANCE', '5'))
        self.cleanup_tolerance = cleanup_tolerance
        self.stream_reader = MessageStreamReader()

    def import_file(self, jsonl_file: Path, collection_name: str, project_path: Path) -> int:
        """Import a JSONL file using the streaming strategy.

        Returns the number of chunks imported (0 on failure; failures are
        recorded via the state manager rather than raised).
        """
        logger.info(f"Streaming import of {jsonl_file.name}")

        conversation_id = jsonl_file.stem

        # Extract metadata first (lightweight pass over the file).
        from metadata_extractor import MetadataExtractor
        extractor = MetadataExtractor()
        metadata, created_at, total_messages = extractor.extract_metadata_from_file(str(jsonl_file))

        # Initialize chunk processing
        chunk_buffer = ChunkBuffer(self.max_chunk_size)
        chunk_index = 0
        total_chunks = 0

        try:
            # Stream and process messages
            for message in self.stream_reader.read_messages(jsonl_file):
                if chunk_buffer.add(message):
                    # Buffer is full, flush it as one chunk.
                    chunks = self._process_buffer(
                        chunk_buffer, chunk_index, conversation_id,
                        created_at, metadata, collection_name, project_path, total_messages
                    )
                    total_chunks += chunks
                    chunk_index += 1

                    # Force garbage collection so memory stays flat per chunk.
                    gc.collect()

                    # Log progress
                    if chunk_index % 10 == 0:
                        logger.info(f"Processed {chunk_index} chunks from {jsonl_file.name}")

            # Flush whatever is left in the buffer.
            if chunk_buffer.has_content():
                chunks = self._process_buffer(
                    chunk_buffer, chunk_index, conversation_id,
                    created_at, metadata, collection_name, project_path, total_messages
                )
                total_chunks += chunks

            # Clean up old points after successful import
            if total_chunks > 0:
                self._cleanup_old_points(conversation_id, collection_name, total_chunks)

            logger.info(f"Imported {total_chunks} chunks from {jsonl_file.name}")
            return total_chunks

        except (IOError, OSError) as e:
            logger.error(f"Failed to read file {jsonl_file}: {e}")
            self._mark_failed(jsonl_file, str(e))
            return 0
        except json.JSONDecodeError as e:
            logger.error(f"Invalid JSON in {jsonl_file}: {e}")
            self._mark_failed(jsonl_file, str(e))
            return 0
        except Exception as e:
            logger.error(f"Unexpected error importing {jsonl_file}: {e}")
            self._mark_failed(jsonl_file, str(e))
            return 0

    def _process_buffer(self, chunk_buffer: ChunkBuffer, chunk_index: int,
                        conversation_id: str, created_at: str, metadata: Dict[str, Any],
                        collection_name: str, project_path: Path, total_messages: int) -> int:
        """Drain the buffer into process_chunk_fn; return chunks created."""
        messages = chunk_buffer.get_and_clear()
        return self.process_chunk_fn(
            messages, chunk_index, conversation_id,
            created_at, metadata, collection_name, project_path, total_messages
        )

    def _cleanup_old_points(self, conversation_id: str, collection_name: str, total_chunks: int):
        """Delete stale points for *conversation_id* left by earlier imports.

        Best-effort: any failure is logged and swallowed so the import itself
        still counts as successful.
        """
        try:
            from qdrant_client.models import Filter, FieldCondition, MatchValue

            conversation_filter = Filter(
                must=[FieldCondition(key="conversation_id", match=MatchValue(value=conversation_id))]
            )

            # Exact count of points currently stored for this conversation.
            old_count = self.client.count(
                collection_name=collection_name,
                count_filter=conversation_filter,
                exact=True
            ).count

            # Only delete when clearly more points exist than were just
            # written; the tolerance avoids churn from small differences.
            if old_count > total_chunks + self.cleanup_tolerance:
                # NOTE(review): this filter matches ALL points for the
                # conversation — including the ones just imported. Verify that
                # new points are distinguishable (or re-imported) before
                # relying on this cleanup.
                self.client.delete(
                    collection_name=collection_name,
                    points_selector=Filter(
                        must=[FieldCondition(key="conversation_id", match=MatchValue(value=conversation_id))]
                    ),
                    wait=True
                )
                logger.info(f"Deleted {old_count - total_chunks} old points for conversation {conversation_id}")

        except ImportError as e:
            logger.debug(f"Qdrant client import error: {e}")
        except Exception as e:
            logger.warning(f"Could not clean up old points for {conversation_id}: {e}")

    def _mark_failed(self, jsonl_file: Path, error: str):
        """Record *jsonl_file* as failed in the state manager (best-effort)."""
        try:
            self.state_manager.mark_file_failed(str(jsonl_file), error)
        except AttributeError as e:
            logger.debug(f"State manager method not available: {e}")
        except Exception as e:
            logger.warning(f"Unexpected error marking file as failed: {e}")