claude-self-reflect 3.0.0 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,393 @@
1
+ """Main orchestrator with dependency injection."""
2
+
3
+ import logging
4
+ import time
5
+ from pathlib import Path
6
+ from typing import List, Optional, Dict, Any
7
+ from dependency_injector import containers, providers
8
+
9
+ from .core import (
10
+ ImportConfig,
11
+ Message,
12
+ ConversationChunk,
13
+ ProcessedPoint,
14
+ ImportResult,
15
+ ImportStats
16
+ )
17
+ from .core.exceptions import ImportError, ParseError, ValidationError
18
+ from .embeddings import EmbeddingProvider, FastEmbedProvider
19
+ try:
20
+ from .embeddings import VoyageEmbeddingProvider
21
+ VOYAGE_AVAILABLE = True
22
+ except ImportError:
23
+ VoyageEmbeddingProvider = None
24
+ VOYAGE_AVAILABLE = False
25
+ from .processors import (
26
+ ConversationParser,
27
+ Chunker,
28
+ ASTExtractor,
29
+ ConceptExtractor,
30
+ ToolUsageExtractor
31
+ )
32
+ from .storage import QdrantStorage
33
+ from .state import StateManager
34
+ from .utils import ProjectNormalizer, setup_logging
35
+
36
+ logger = logging.getLogger(__name__)
37
+
38
+
39
+ class ConversationProcessor:
40
+ """
41
+ Main orchestrator for processing conversations.
42
+
43
+ Follows dependency injection pattern with all dependencies
44
+ injected through constructor.
45
+ """
46
+
47
+ def __init__(
48
+ self,
49
+ config: ImportConfig,
50
+ embedding_provider: EmbeddingProvider,
51
+ storage: QdrantStorage,
52
+ parser: ConversationParser,
53
+ chunker: Chunker,
54
+ extractors: List[Any],
55
+ state_manager: StateManager,
56
+ normalizer: ProjectNormalizer
57
+ ):
58
+ self.config = config
59
+ self.embedding_provider = embedding_provider
60
+ self.storage = storage
61
+ self.parser = parser
62
+ self.chunker = chunker
63
+ self.extractors = extractors
64
+ self.state = state_manager
65
+ self.normalizer = normalizer
66
+ self.stats = ImportStats()
67
+
68
+ def process_file(self, file_path: Path) -> ImportResult:
69
+ """
70
+ Process a single JSONL file.
71
+
72
+ Returns:
73
+ ImportResult with processing details
74
+ """
75
+ start_time = time.time()
76
+ result = ImportResult(file_path=str(file_path), success=False)
77
+
78
+ try:
79
+ # Check if already processed
80
+ if not self.config.force_reimport and self.state.is_processed(file_path):
81
+ logger.info(f"Skipping already processed: {file_path}")
82
+ result.success = True
83
+ return result
84
+
85
+ # Parse conversation
86
+ logger.debug(f"Parsing conversation: {file_path}")
87
+ messages = self.parser.parse_file(file_path)
88
+ if not messages:
89
+ raise ParseError(str(file_path), reason="No messages found")
90
+
91
+ # Create chunks
92
+ logger.debug(f"Creating chunks for {len(messages)} messages")
93
+ chunks = self.chunker.create_chunks(messages, str(file_path))
94
+ result.chunks_processed = len(chunks)
95
+
96
+ # Extract metadata
97
+ logger.debug("Extracting metadata")
98
+ self._enrich_chunks(chunks)
99
+
100
+ # Generate embeddings
101
+ logger.debug("Generating embeddings")
102
+ texts = [chunk.text for chunk in chunks]
103
+ # Use embed_batch for proper token-aware batching with Voyage
104
+ if hasattr(self.embedding_provider, 'embed_batch'):
105
+ embeddings = self.embedding_provider.embed_batch(texts)
106
+ else:
107
+ embeddings = self.embedding_provider.embed(texts)
108
+
109
+ # Build points
110
+ logger.debug("Building points")
111
+ points = self._build_points(chunks, embeddings, file_path)
112
+
113
+ # Store in Qdrant
114
+ logger.debug(f"Storing {len(points)} points")
115
+ collection_name = self._get_collection_name(file_path)
116
+ stored = self.storage.upsert_points(collection_name, points)
117
+ result.points_created = stored
118
+
119
+ # Update state
120
+ self.state.mark_processed(file_path, stored)
121
+
122
+ result.success = True
123
+ logger.info(f"Successfully processed {file_path}: {stored} points")
124
+
125
+ except Exception as e:
126
+ logger.error(f"Failed to process {file_path}: {e}")
127
+ result.error = str(e)
128
+ self.state.mark_failed(file_path, str(e))
129
+ if not isinstance(e, ImportError):
130
+ raise ImportError(f"Processing failed: {e}")
131
+
132
+ finally:
133
+ result.duration_seconds = time.time() - start_time
134
+ self.stats.add_result(result)
135
+
136
+ return result
137
+
138
+ def _enrich_chunks(self, chunks: List[ConversationChunk]) -> None:
139
+ """Add metadata to chunks using extractors."""
140
+ for chunk in chunks:
141
+ for extractor in self.extractors:
142
+ try:
143
+ metadata = extractor.extract(chunk.text)
144
+ for key, value in metadata.items():
145
+ chunk.add_metadata(key, value)
146
+ except Exception as e:
147
+ logger.warning(f"Extractor {extractor.__class__.__name__} failed: {e}")
148
+
149
+ def _build_points(
150
+ self,
151
+ chunks: List[ConversationChunk],
152
+ embeddings: List[List[float]],
153
+ file_path: Path
154
+ ) -> List[ProcessedPoint]:
155
+ """Build Qdrant points from chunks and embeddings."""
156
+ points = []
157
+ project_name = self.normalizer.get_project_name(file_path)
158
+
159
+ for chunk, embedding in zip(chunks, embeddings):
160
+ # Generate unique point ID
161
+ point_id = f"{project_name}_{chunk.unique_id}"
162
+
163
+ # Build payload
164
+ payload = {
165
+ "text": chunk.text,
166
+ "project": project_name,
167
+ "file_path": str(file_path),
168
+ "chunk_index": chunk.chunk_index,
169
+ "total_chunks": chunk.total_chunks,
170
+ "message_indices": chunk.message_indices,
171
+ **chunk.metadata
172
+ }
173
+
174
+ point = ProcessedPoint(
175
+ id=point_id,
176
+ vector=embedding,
177
+ payload=payload
178
+ )
179
+
180
+ # Validate dimension
181
+ if not point.validate_dimension(self.embedding_provider.get_dimension()):
182
+ raise ValidationError(
183
+ "embedding",
184
+ len(embedding),
185
+ f"Expected dimension {self.embedding_provider.get_dimension()}"
186
+ )
187
+
188
+ points.append(point)
189
+
190
+ return points
191
+
192
+ def _get_collection_name(self, file_path: Path) -> str:
193
+ """Generate collection name for file."""
194
+ return self.normalizer.get_collection_name(file_path)
195
+
196
+ def get_stats(self) -> ImportStats:
197
+ """Get import statistics."""
198
+ return self.stats
199
+
200
+
201
+ class ImporterContainer(containers.DeclarativeContainer):
202
+ """
203
+ Dependency injection container using dependency-injector library.
204
+
205
+ This provides sophisticated dependency management as recommended
206
+ in the code review.
207
+ """
208
+
209
+ # Configuration provider
210
+ config = providers.Singleton(ImportConfig.from_env)
211
+
212
+ # Logging setup
213
+ logger_setup = providers.Resource(
214
+ setup_logging,
215
+ level=config.provided.log_level
216
+ )
217
+
218
+ # Core services
219
+ normalizer = providers.Singleton(ProjectNormalizer)
220
+
221
+ state_manager = providers.Singleton(
222
+ StateManager,
223
+ state_file=config.provided.state_file_path
224
+ )
225
+
226
+ # Embedding provider with selector
227
+ def get_embedding_provider(config_obj):
228
+ """Factory function to select embedding provider based on config."""
229
+ if config_obj.use_voyage and config_obj.voyage_api_key:
230
+ if not VOYAGE_AVAILABLE:
231
+ logger.warning("Voyage requested but not available, falling back to FastEmbed")
232
+ return FastEmbedProvider()
233
+ return VoyageEmbeddingProvider(
234
+ api_key=config_obj.voyage_api_key,
235
+ model_name="voyage-2"
236
+ )
237
+ return FastEmbedProvider()
238
+
239
+ embedding_provider = providers.Factory(
240
+ get_embedding_provider,
241
+ config_obj=config
242
+ )
243
+
244
+ # Storage
245
+ storage = providers.Singleton(
246
+ QdrantStorage,
247
+ url=config.provided.qdrant_url,
248
+ api_key=config.provided.qdrant_api_key
249
+ )
250
+
251
+ # Processors
252
+ parser = providers.Singleton(ConversationParser)
253
+
254
+ chunker = providers.Singleton(
255
+ Chunker,
256
+ chunk_size=config.provided.chunk_size,
257
+ chunk_overlap=config.provided.chunk_overlap
258
+ )
259
+
260
+ # Metadata extractors
261
+ ast_extractor = providers.Singleton(
262
+ ASTExtractor,
263
+ max_elements=config.provided.max_ast_elements
264
+ )
265
+
266
+ concept_extractor = providers.Singleton(ConceptExtractor)
267
+
268
+ tool_extractor = providers.Singleton(ToolUsageExtractor)
269
+
270
+ extractors = providers.List(
271
+ ast_extractor,
272
+ concept_extractor,
273
+ tool_extractor
274
+ )
275
+
276
+ # Main processor
277
+ processor = providers.Factory(
278
+ ConversationProcessor,
279
+ config=config,
280
+ embedding_provider=embedding_provider,
281
+ storage=storage,
282
+ parser=parser,
283
+ chunker=chunker,
284
+ extractors=extractors,
285
+ state_manager=state_manager,
286
+ normalizer=normalizer
287
+ )
288
+
289
+
290
+ def create_processor(config: Optional[ImportConfig] = None) -> ConversationProcessor:
291
+ """
292
+ Factory function to create a configured processor.
293
+
294
+ Args:
295
+ config: Optional configuration, uses environment if not provided
296
+
297
+ Returns:
298
+ Configured ConversationProcessor instance
299
+ """
300
+ container = ImporterContainer()
301
+
302
+ if config:
303
+ container.config.override(config)
304
+
305
+ # Get processor instance
306
+ processor = container.processor()
307
+
308
+ # Note: Providers are already initialized by the container
309
+ # No need to call initialize methods
310
+
311
+ return processor
312
+
313
+
314
+ def process_files(
315
+ files: List[Path],
316
+ config: Optional[ImportConfig] = None,
317
+ progress_callback: Optional[Any] = None
318
+ ) -> ImportStats:
319
+ """
320
+ Process multiple files with progress tracking.
321
+
322
+ Args:
323
+ files: List of JSONL files to process
324
+ config: Optional configuration
325
+ progress_callback: Optional callback for progress updates
326
+
327
+ Returns:
328
+ ImportStats with aggregate results
329
+ """
330
+ processor = create_processor(config)
331
+
332
+ for i, file_path in enumerate(files):
333
+ if progress_callback:
334
+ progress_callback(i, len(files), file_path)
335
+
336
+ try:
337
+ result = processor.process_file(file_path)
338
+ logger.info(
339
+ f"[{i+1}/{len(files)}] Processed {file_path.name}: "
340
+ f"{result.points_created} points"
341
+ )
342
+ except Exception as e:
343
+ logger.error(f"Failed to process {file_path}: {e}")
344
+
345
+ return processor.get_stats()
346
+
347
+
348
+ def main():
349
+ """Main entry point for CLI execution."""
350
+ import argparse
351
+
352
+ parser = argparse.ArgumentParser(description="Import Claude conversations to Qdrant")
353
+ parser.add_argument("--limit", type=int, help="Limit number of files to process")
354
+ parser.add_argument("--dry-run", action="store_true", help="Dry run without importing")
355
+ parser.add_argument("--force", action="store_true", help="Force reimport all files")
356
+ parser.add_argument("--voyage", action="store_true", help="Use Voyage AI embeddings")
357
+ parser.add_argument("--log-level", default="INFO", help="Logging level")
358
+
359
+ args = parser.parse_args()
360
+
361
+ # Setup logging
362
+ setup_logging(args.log_level)
363
+
364
+ # Create config from environment with CLI overrides
365
+ config_dict = {}
366
+ if args.dry_run:
367
+ config_dict["dry_run"] = True
368
+ if args.force:
369
+ config_dict["force_reimport"] = True
370
+ if args.voyage:
371
+ config_dict["use_voyage"] = True
372
+ if args.limit:
373
+ config_dict["file_limit"] = args.limit
374
+
375
+ config = ImportConfig.from_env()
376
+ if config_dict:
377
+ # Override with CLI args
378
+ config = ImportConfig.from_dict({**config.__dict__, **config_dict})
379
+
380
+ # Find all JSONL files
381
+ base_path = Path.home() / ".claude" / "projects"
382
+ files = list(base_path.glob("*/*.jsonl"))
383
+
384
+ if args.limit:
385
+ files = files[:args.limit]
386
+
387
+ logger.info(f"Processing {len(files)} files...")
388
+
389
+ # Process files
390
+ stats = process_files(files, config)
391
+
392
+ logger.info(f"Import complete: {stats}")
393
+ return 0 if stats.failed_files == 0 else 1
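
For orientation, the two module-level helpers near the end of this file, create_processor and process_files, are the intended entry points. The sketch below is illustrative only and is not part of the released package: the import path "importer" is a placeholder for wherever this module lives inside claude-self-reflect, and it assumes ImportConfig.from_env() reads the Qdrant and embedding settings from the environment, as the container wiring above suggests.

    # Hypothetical usage sketch -- the module path is a placeholder, not taken from the diff.
    from pathlib import Path
    from importer import ImportConfig, process_files  # placeholder import path

    files = sorted((Path.home() / ".claude" / "projects").glob("*/*.jsonl"))

    def report(done, total, current):
        # Matches the callback shape used above: (index, total, file_path).
        print(f"[{done + 1}/{total}] {current.name}")

    stats = process_files(files, config=ImportConfig.from_env(), progress_callback=report)
    print(stats)

The container-based create_processor() path is equivalent when driving single files: build one processor, then call process_file(path) per conversation and read get_stats() at the end.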
@@ -0,0 +1,15 @@
1
+ """Processors for parsing and extracting metadata from conversations."""
2
+
3
+ from .conversation_parser import ConversationParser
4
+ from .chunker import Chunker
5
+ from .ast_extractor import ASTExtractor
6
+ from .concept_extractor import ConceptExtractor
7
+ from .tool_extractor import ToolUsageExtractor
8
+
9
+ __all__ = [
10
+ "ConversationParser",
11
+ "Chunker",
12
+ "ASTExtractor",
13
+ "ConceptExtractor",
14
+ "ToolUsageExtractor"
15
+ ]
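
These exports can also be composed by hand, mirroring what ConversationProcessor does internally in the first hunk. The sketch below is illustrative and not part of the diff; the Chunker constructor arguments (chunk_size, chunk_overlap) come from the container wiring above, and the concrete values and import path are placeholders.

    # Hypothetical manual composition -- import path and values are placeholders.
    from pathlib import Path
    from processors import ConversationParser, Chunker, ConceptExtractor

    parser = ConversationParser()
    chunker = Chunker(chunk_size=10, chunk_overlap=2)   # values are illustrative
    extractor = ConceptExtractor()

    path = Path("conversation.jsonl")                    # placeholder file
    messages = parser.parse_file(path)                   # same call the orchestrator makes
    chunks = chunker.create_chunks(messages, str(path))
    for chunk in chunks:
        for key, value in extractor.extract(chunk.text).items():
            chunk.add_metadata(key, value)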
@@ -0,0 +1,197 @@
1
+ """Extract AST elements from code blocks."""
2
+
3
+ import ast
4
+ import re
5
+ import logging
6
+ from typing import Dict, Any, Set, List
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
+ class ASTExtractor:
12
+ """
13
+ Extract Abstract Syntax Tree elements from code.
14
+
15
+ Implements the critical fixes identified in code review:
16
+ 1. More permissive code fence regex
17
+ 2. Python regex fallback for partial code
18
+ 3. Bounded extraction with MAX_AST_ELEMENTS
19
+ """
20
+
21
+ def __init__(self, max_elements: int = 100):
22
+ self.max_elements = max_elements
23
+
24
+ # FIX: More permissive code fence regex to handle various formats
25
+ # Matches: ```python, ```py, ```javascript, ```ts, etc.
26
+ self.code_fence_pattern = re.compile(
27
+ r'```[^\n]*\n?(.*?)```',
28
+ re.DOTALL
29
+ )
30
+
31
+ # Python patterns for fallback extraction
32
+ self.python_patterns = {
33
+ 'function': re.compile(r'^\s*def\s+([A-Za-z_]\w*)\s*\(', re.MULTILINE),
34
+ 'async_function': re.compile(r'^\s*async\s+def\s+([A-Za-z_]\w*)\s*\(', re.MULTILINE),
35
+ 'class': re.compile(r'^\s*class\s+([A-Za-z_]\w*)\s*[:\(]', re.MULTILINE),
36
+ 'method': re.compile(r'^\s+def\s+([A-Za-z_]\w*)\s*\(self', re.MULTILINE),
37
+ 'static_method': re.compile(r'@staticmethod.*?\n\s*def\s+([A-Za-z_]\w*)\s*\(', re.MULTILINE | re.DOTALL),
38
+ 'class_method': re.compile(r'@classmethod.*?\n\s*def\s+([A-Za-z_]\w*)\s*\(', re.MULTILINE | re.DOTALL)
39
+ }
40
+
41
+ # JavaScript/TypeScript patterns
42
+ self.js_patterns = {
43
+ 'function': re.compile(r'function\s+([A-Za-z_$][\w$]*)\s*\('),
44
+ 'arrow': re.compile(r'(?:const|let|var)\s+([A-Za-z_$][\w$]*)\s*=\s*(?:\([^)]*\)|[A-Za-z_$][\w$]*)\s*=>'),
45
+ 'async_function': re.compile(r'async\s+function\s+([A-Za-z_$][\w$]*)\s*\('),
46
+ 'class': re.compile(r'class\s+([A-Za-z_$][\w$]*)\s*(?:extends\s+[A-Za-z_$][\w$]*)?\s*\{'),
47
+ 'method': re.compile(r'([A-Za-z_$][\w$]*)\s*\([^)]*\)\s*\{'),
48
+ 'export_function': re.compile(r'export\s+(?:default\s+)?(?:async\s+)?function\s+([A-Za-z_$][\w$]*)\s*\('),
49
+ 'export_const': re.compile(r'export\s+const\s+([A-Za-z_$][\w$]*)\s*=')
50
+ }
51
+
52
+ def extract(self, text: str) -> Dict[str, Any]:
53
+ """
54
+ Extract AST elements from text.
55
+
56
+ Returns:
57
+ Dictionary with ast_elements and code_blocks keys
58
+ """
59
+ elements = set()
60
+ has_code = False
61
+
62
+ # Extract code blocks using permissive regex
63
+ code_blocks = self.code_fence_pattern.findall(text)
64
+
65
+ for code_block in code_blocks[:10]: # Limit processing
66
+ has_code = True
67
+
68
+ # Try to detect language from content
69
+ if self._looks_like_python(code_block):
70
+ python_elements = self._extract_python_ast(code_block)
71
+ elements.update(python_elements)
72
+ elif self._looks_like_javascript(code_block):
73
+ js_elements = self._extract_javascript_patterns(code_block)
74
+ elements.update(js_elements)
75
+ else:
76
+ # Try both as fallback
77
+ elements.update(self._extract_python_ast(code_block))
78
+ elements.update(self._extract_javascript_patterns(code_block))
79
+
80
+ # FIX: Enforce max elements limit
81
+ if len(elements) >= self.max_elements:
82
+ logger.debug(f"Reached max AST elements limit: {self.max_elements}")
83
+ break
84
+
85
+ # Also check for inline code patterns outside of fences
86
+ if not has_code:
87
+ # Look for function/class definitions in plain text
88
+ elements.update(self._extract_inline_patterns(text))
89
+
90
+ return {
91
+ "ast_elements": list(elements)[:self.max_elements],
92
+ "has_code_blocks": has_code
93
+ }
94
+
95
+ def _extract_python_ast(self, code: str) -> Set[str]:
96
+ """Extract Python AST elements with fallback to regex."""
97
+ elements = set()
98
+
99
+ try:
100
+ # Try proper AST parsing first
101
+ tree = ast.parse(code)
102
+
103
+ for node in ast.walk(tree):
104
+ if len(elements) >= self.max_elements:
105
+ break
106
+
107
+ if isinstance(node, ast.FunctionDef):
108
+ elements.add(f"func:{node.name}")
109
+ elif isinstance(node, ast.AsyncFunctionDef):
110
+ elements.add(f"func:{node.name}")
111
+ elif isinstance(node, ast.ClassDef):
112
+ elements.add(f"class:{node.name}")
113
+ # Extract methods
114
+ for item in node.body:
115
+ if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
116
+ elements.add(f"method:{node.name}.{item.name}")
117
+ if len(elements) >= self.max_elements:
118
+ break
119
+
120
+ except (SyntaxError, ValueError) as e:
121
+ # FIX: Python regex fallback for partial code fragments
122
+ logger.debug(f"AST parsing failed, using regex fallback: {e}")
123
+
124
+ for pattern_type, pattern in self.python_patterns.items():
125
+ for match in pattern.finditer(code):
126
+ if len(elements) >= self.max_elements:
127
+ break
128
+
129
+ name = match.group(1)
130
+ if 'method' in pattern_type:
131
+ elements.add(f"method:{name}")
132
+ elif 'class' in pattern_type:
133
+ elements.add(f"class:{name}")
134
+ else:
135
+ elements.add(f"func:{name}")
136
+
137
+ return elements
138
+
139
+ def _extract_javascript_patterns(self, code: str) -> Set[str]:
140
+ """Extract JavaScript/TypeScript patterns."""
141
+ elements = set()
142
+
143
+ for pattern_type, pattern in self.js_patterns.items():
144
+ for match in pattern.finditer(code):
145
+ if len(elements) >= self.max_elements:
146
+ break
147
+
148
+ name = match.group(1)
149
+ if 'class' in pattern_type:
150
+ elements.add(f"class:{name}")
151
+ elif 'method' in pattern_type and name not in ['constructor', 'if', 'for', 'while']:
152
+ elements.add(f"method:{name}")
153
+ else:
154
+ elements.add(f"func:{name}")
155
+
156
+ return elements
157
+
158
+ def _extract_inline_patterns(self, text: str) -> Set[str]:
159
+ """Extract patterns from inline code mentions."""
160
+ elements = set()
161
+
162
+ # Look for backtick-wrapped function/class names
163
+ inline_pattern = re.compile(r'`([A-Za-z_][\w]*(?:\.[A-Za-z_][\w]*)*)`')
164
+
165
+ for match in inline_pattern.finditer(text):
166
+ if len(elements) >= self.max_elements:
167
+ break
168
+
169
+ name = match.group(1)
170
+ # Heuristic: if contains dot, likely a method
171
+ if '.' in name:
172
+ elements.add(f"method:{name}")
173
+ # Heuristic: PascalCase likely a class
174
+ elif name[0].isupper():
175
+ elements.add(f"class:{name}")
176
+ # Otherwise assume function
177
+ else:
178
+ elements.add(f"func:{name}")
179
+
180
+ return elements
181
+
182
+ def _looks_like_python(self, code: str) -> bool:
183
+ """Heuristic to detect Python code."""
184
+ python_indicators = [
185
+ 'def ', 'import ', 'from ', 'class ', 'self.', 'self,',
186
+ '__init__', '__name__', 'if __name__', 'print(', 'async def'
187
+ ]
188
+ return any(indicator in code for indicator in python_indicators)
189
+
190
+ def _looks_like_javascript(self, code: str) -> bool:
191
+ """Heuristic to detect JavaScript/TypeScript."""
192
+ js_indicators = [
193
+ 'function ', 'const ', 'let ', 'var ', '=>', 'export ',
194
+ 'import ', 'class ', 'constructor(', 'this.', 'async function',
195
+ 'interface ', 'type ', 'namespace ', 'enum '
196
+ ]
197
+ return any(indicator in code for indicator in js_indicators)
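
To make the extractor's behavior concrete, here is a small illustrative sketch (not part of the diff). The import path is a placeholder; the result shape follows the extract() docstring above.

    # Hypothetical usage sketch -- import path is a placeholder.
    from ast_extractor import ASTExtractor

    extractor = ASTExtractor(max_elements=50)
    sample = (
        "Here is the helper we discussed:\n"
        "```python\n"
        "class Loader:\n"
        "    def read(self):\n"
        "        return 42\n"
        "```\n"
    )
    result = extractor.extract(sample)
    # ast.parse succeeds here, so the walk typically yields something like
    # {"ast_elements": ["class:Loader", "method:Loader.read", "func:read"], "has_code_blocks": True}
    # (order of ast_elements is not guaranteed). If the snippet were truncated mid-statement,
    # the SyntaxError branch would fall back to the regex patterns instead.
    print(result)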