claude-self-reflect 3.0.0 → 3.0.1
- package/mcp-server/pyproject.toml +1 -0
- package/package.json +2 -1
- package/scripts/importer/__init__.py +25 -0
- package/scripts/importer/__main__.py +14 -0
- package/scripts/importer/core/__init__.py +25 -0
- package/scripts/importer/core/config.py +120 -0
- package/scripts/importer/core/exceptions.py +52 -0
- package/scripts/importer/core/models.py +184 -0
- package/scripts/importer/embeddings/__init__.py +22 -0
- package/scripts/importer/embeddings/base.py +141 -0
- package/scripts/importer/embeddings/fastembed_provider.py +164 -0
- package/scripts/importer/embeddings/validator.py +136 -0
- package/scripts/importer/embeddings/voyage_provider.py +251 -0
- package/scripts/importer/main.py +393 -0
- package/scripts/importer/processors/__init__.py +15 -0
- package/scripts/importer/processors/ast_extractor.py +197 -0
- package/scripts/importer/processors/chunker.py +157 -0
- package/scripts/importer/processors/concept_extractor.py +109 -0
- package/scripts/importer/processors/conversation_parser.py +181 -0
- package/scripts/importer/processors/tool_extractor.py +165 -0
- package/scripts/importer/state/__init__.py +5 -0
- package/scripts/importer/state/state_manager.py +190 -0
- package/scripts/importer/storage/__init__.py +5 -0
- package/scripts/importer/storage/qdrant_storage.py +250 -0
- package/scripts/importer/utils/__init__.py +9 -0
- package/scripts/importer/utils/logger.py +87 -0
- package/scripts/importer/utils/project_normalizer.py +120 -0
package/scripts/importer/main.py
@@ -0,0 +1,393 @@
+"""Main orchestrator with dependency injection."""
+
+import logging
+import time
+from pathlib import Path
+from typing import List, Optional, Dict, Any
+from dependency_injector import containers, providers
+
+from .core import (
+    ImportConfig,
+    Message,
+    ConversationChunk,
+    ProcessedPoint,
+    ImportResult,
+    ImportStats
+)
+from .core.exceptions import ImportError, ParseError, ValidationError
+from .embeddings import EmbeddingProvider, FastEmbedProvider
+try:
+    from .embeddings import VoyageEmbeddingProvider
+    VOYAGE_AVAILABLE = True
+except ImportError:
+    VoyageEmbeddingProvider = None
+    VOYAGE_AVAILABLE = False
+from .processors import (
+    ConversationParser,
+    Chunker,
+    ASTExtractor,
+    ConceptExtractor,
+    ToolUsageExtractor
+)
+from .storage import QdrantStorage
+from .state import StateManager
+from .utils import ProjectNormalizer, setup_logging
+
+logger = logging.getLogger(__name__)
+
+
+class ConversationProcessor:
+    """
+    Main orchestrator for processing conversations.
+
+    Follows dependency injection pattern with all dependencies
+    injected through constructor.
+    """
+
+    def __init__(
+        self,
+        config: ImportConfig,
+        embedding_provider: EmbeddingProvider,
+        storage: QdrantStorage,
+        parser: ConversationParser,
+        chunker: Chunker,
+        extractors: List[Any],
+        state_manager: StateManager,
+        normalizer: ProjectNormalizer
+    ):
+        self.config = config
+        self.embedding_provider = embedding_provider
+        self.storage = storage
+        self.parser = parser
+        self.chunker = chunker
+        self.extractors = extractors
+        self.state = state_manager
+        self.normalizer = normalizer
+        self.stats = ImportStats()
+
+    def process_file(self, file_path: Path) -> ImportResult:
+        """
+        Process a single JSONL file.
+
+        Returns:
+            ImportResult with processing details
+        """
+        start_time = time.time()
+        result = ImportResult(file_path=str(file_path), success=False)
+
+        try:
+            # Check if already processed
+            if not self.config.force_reimport and self.state.is_processed(file_path):
+                logger.info(f"Skipping already processed: {file_path}")
+                result.success = True
+                return result
+
+            # Parse conversation
+            logger.debug(f"Parsing conversation: {file_path}")
+            messages = self.parser.parse_file(file_path)
+            if not messages:
+                raise ParseError(str(file_path), reason="No messages found")
+
+            # Create chunks
+            logger.debug(f"Creating chunks for {len(messages)} messages")
+            chunks = self.chunker.create_chunks(messages, str(file_path))
+            result.chunks_processed = len(chunks)
+
+            # Extract metadata
+            logger.debug("Extracting metadata")
+            self._enrich_chunks(chunks)
+
+            # Generate embeddings
+            logger.debug("Generating embeddings")
+            texts = [chunk.text for chunk in chunks]
+            # Use embed_batch for proper token-aware batching with Voyage
+            if hasattr(self.embedding_provider, 'embed_batch'):
+                embeddings = self.embedding_provider.embed_batch(texts)
+            else:
+                embeddings = self.embedding_provider.embed(texts)
+
+            # Build points
+            logger.debug("Building points")
+            points = self._build_points(chunks, embeddings, file_path)
+
+            # Store in Qdrant
+            logger.debug(f"Storing {len(points)} points")
+            collection_name = self._get_collection_name(file_path)
+            stored = self.storage.upsert_points(collection_name, points)
+            result.points_created = stored
+
+            # Update state
+            self.state.mark_processed(file_path, stored)
+
+            result.success = True
+            logger.info(f"Successfully processed {file_path}: {stored} points")
+
+        except Exception as e:
+            logger.error(f"Failed to process {file_path}: {e}")
+            result.error = str(e)
+            self.state.mark_failed(file_path, str(e))
+            if not isinstance(e, ImportError):
+                raise ImportError(f"Processing failed: {e}")
+
+        finally:
+            result.duration_seconds = time.time() - start_time
+            self.stats.add_result(result)
+
+        return result
+
+    def _enrich_chunks(self, chunks: List[ConversationChunk]) -> None:
+        """Add metadata to chunks using extractors."""
+        for chunk in chunks:
+            for extractor in self.extractors:
+                try:
+                    metadata = extractor.extract(chunk.text)
+                    for key, value in metadata.items():
+                        chunk.add_metadata(key, value)
+                except Exception as e:
+                    logger.warning(f"Extractor {extractor.__class__.__name__} failed: {e}")
+
+    def _build_points(
+        self,
+        chunks: List[ConversationChunk],
+        embeddings: List[List[float]],
+        file_path: Path
+    ) -> List[ProcessedPoint]:
+        """Build Qdrant points from chunks and embeddings."""
+        points = []
+        project_name = self.normalizer.get_project_name(file_path)
+
+        for chunk, embedding in zip(chunks, embeddings):
+            # Generate unique point ID
+            point_id = f"{project_name}_{chunk.unique_id}"
+
+            # Build payload
+            payload = {
+                "text": chunk.text,
+                "project": project_name,
+                "file_path": str(file_path),
+                "chunk_index": chunk.chunk_index,
+                "total_chunks": chunk.total_chunks,
+                "message_indices": chunk.message_indices,
+                **chunk.metadata
+            }
+
+            point = ProcessedPoint(
+                id=point_id,
+                vector=embedding,
+                payload=payload
+            )
+
+            # Validate dimension
+            if not point.validate_dimension(self.embedding_provider.get_dimension()):
+                raise ValidationError(
+                    "embedding",
+                    len(embedding),
+                    f"Expected dimension {self.embedding_provider.get_dimension()}"
+                )
+
+            points.append(point)
+
+        return points
+
+    def _get_collection_name(self, file_path: Path) -> str:
+        """Generate collection name for file."""
+        return self.normalizer.get_collection_name(file_path)
+
+    def get_stats(self) -> ImportStats:
+        """Get import statistics."""
+        return self.stats
+
+
+class ImporterContainer(containers.DeclarativeContainer):
+    """
+    Dependency injection container using dependency-injector library.
+
+    This provides sophisticated dependency management as recommended
+    in the code review.
+    """
+
+    # Configuration provider
+    config = providers.Singleton(ImportConfig.from_env)
+
+    # Logging setup
+    logger_setup = providers.Resource(
+        setup_logging,
+        level=config.provided.log_level
+    )
+
+    # Core services
+    normalizer = providers.Singleton(ProjectNormalizer)
+
+    state_manager = providers.Singleton(
+        StateManager,
+        state_file=config.provided.state_file_path
+    )
+
+    # Embedding provider with selector
+    def get_embedding_provider(config_obj):
+        """Factory function to select embedding provider based on config."""
+        if config_obj.use_voyage and config_obj.voyage_api_key:
+            if not VOYAGE_AVAILABLE:
+                logger.warning("Voyage requested but not available, falling back to FastEmbed")
+                return FastEmbedProvider()
+            return VoyageEmbeddingProvider(
+                api_key=config_obj.voyage_api_key,
+                model_name="voyage-2"
+            )
+        return FastEmbedProvider()
+
+    embedding_provider = providers.Factory(
+        get_embedding_provider,
+        config_obj=config
+    )
+
+    # Storage
+    storage = providers.Singleton(
+        QdrantStorage,
+        url=config.provided.qdrant_url,
+        api_key=config.provided.qdrant_api_key
+    )
+
+    # Processors
+    parser = providers.Singleton(ConversationParser)
+
+    chunker = providers.Singleton(
+        Chunker,
+        chunk_size=config.provided.chunk_size,
+        chunk_overlap=config.provided.chunk_overlap
+    )
+
+    # Metadata extractors
+    ast_extractor = providers.Singleton(
+        ASTExtractor,
+        max_elements=config.provided.max_ast_elements
+    )
+
+    concept_extractor = providers.Singleton(ConceptExtractor)
+
+    tool_extractor = providers.Singleton(ToolUsageExtractor)
+
+    extractors = providers.List(
+        ast_extractor,
+        concept_extractor,
+        tool_extractor
+    )
+
+    # Main processor
+    processor = providers.Factory(
+        ConversationProcessor,
+        config=config,
+        embedding_provider=embedding_provider,
+        storage=storage,
+        parser=parser,
+        chunker=chunker,
+        extractors=extractors,
+        state_manager=state_manager,
+        normalizer=normalizer
+    )
+
+
+def create_processor(config: Optional[ImportConfig] = None) -> ConversationProcessor:
+    """
+    Factory function to create a configured processor.
+
+    Args:
+        config: Optional configuration, uses environment if not provided
+
+    Returns:
+        Configured ConversationProcessor instance
+    """
+    container = ImporterContainer()
+
+    if config:
+        container.config.override(config)
+
+    # Get processor instance
+    processor = container.processor()
+
+    # Note: Providers are already initialized by the container
+    # No need to call initialize methods
+
+    return processor
+
+
+def process_files(
+    files: List[Path],
+    config: Optional[ImportConfig] = None,
+    progress_callback: Optional[Any] = None
+) -> ImportStats:
+    """
+    Process multiple files with progress tracking.
+
+    Args:
+        files: List of JSONL files to process
+        config: Optional configuration
+        progress_callback: Optional callback for progress updates
+
+    Returns:
+        ImportStats with aggregate results
+    """
+    processor = create_processor(config)
+
+    for i, file_path in enumerate(files):
+        if progress_callback:
+            progress_callback(i, len(files), file_path)
+
+        try:
+            result = processor.process_file(file_path)
+            logger.info(
+                f"[{i+1}/{len(files)}] Processed {file_path.name}: "
+                f"{result.points_created} points"
+            )
+        except Exception as e:
+            logger.error(f"Failed to process {file_path}: {e}")
+
+    return processor.get_stats()
+
+
+def main():
+    """Main entry point for CLI execution."""
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Import Claude conversations to Qdrant")
+    parser.add_argument("--limit", type=int, help="Limit number of files to process")
+    parser.add_argument("--dry-run", action="store_true", help="Dry run without importing")
+    parser.add_argument("--force", action="store_true", help="Force reimport all files")
+    parser.add_argument("--voyage", action="store_true", help="Use Voyage AI embeddings")
+    parser.add_argument("--log-level", default="INFO", help="Logging level")
+
+    args = parser.parse_args()
+
+    # Setup logging
+    setup_logging(args.log_level)
+
+    # Create config from environment with CLI overrides
+    config_dict = {}
+    if args.dry_run:
+        config_dict["dry_run"] = True
+    if args.force:
+        config_dict["force_reimport"] = True
+    if args.voyage:
+        config_dict["use_voyage"] = True
+    if args.limit:
+        config_dict["file_limit"] = args.limit
+
+    config = ImportConfig.from_env()
+    if config_dict:
+        # Override with CLI args
+        config = ImportConfig.from_dict({**config.__dict__, **config_dict})
+
+    # Find all JSONL files
+    base_path = Path.home() / ".claude" / "projects"
+    files = list(base_path.glob("*/*.jsonl"))
+
+    if args.limit:
+        files = files[:args.limit]
+
+    logger.info(f"Processing {len(files)} files...")
+
+    # Process files
+    stats = process_files(files, config)
+
+    logger.info(f"Import complete: {stats}")
+    return 0 if stats.failed_files == 0 else 1

package/scripts/importer/processors/__init__.py
@@ -0,0 +1,15 @@
+"""Processors for parsing and extracting metadata from conversations."""
+
+from .conversation_parser import ConversationParser
+from .chunker import Chunker
+from .ast_extractor import ASTExtractor
+from .concept_extractor import ConceptExtractor
+from .tool_extractor import ToolUsageExtractor
+
+__all__ = [
+    "ConversationParser",
+    "Chunker",
+    "ASTExtractor",
+    "ConceptExtractor",
+    "ToolUsageExtractor"
+]

package/scripts/importer/processors/ast_extractor.py
@@ -0,0 +1,197 @@
+"""Extract AST elements from code blocks."""
+
+import ast
+import re
+import logging
+from typing import Dict, Any, Set, List
+
+logger = logging.getLogger(__name__)
+
+
+class ASTExtractor:
+    """
+    Extract Abstract Syntax Tree elements from code.
+
+    Implements the critical fixes identified in code review:
+    1. More permissive code fence regex
+    2. Python regex fallback for partial code
+    3. Bounded extraction with MAX_AST_ELEMENTS
+    """
+
+    def __init__(self, max_elements: int = 100):
+        self.max_elements = max_elements
+
+        # FIX: More permissive code fence regex to handle various formats
+        # Matches: ```python, ```py, ```javascript, ```ts, etc.
+        self.code_fence_pattern = re.compile(
+            r'```[^\n]*\n?(.*?)```',
+            re.DOTALL
+        )
+
+        # Python patterns for fallback extraction
+        self.python_patterns = {
+            'function': re.compile(r'^\s*def\s+([A-Za-z_]\w*)\s*\(', re.MULTILINE),
+            'async_function': re.compile(r'^\s*async\s+def\s+([A-Za-z_]\w*)\s*\(', re.MULTILINE),
+            'class': re.compile(r'^\s*class\s+([A-Za-z_]\w*)\s*[:\(]', re.MULTILINE),
+            'method': re.compile(r'^\s+def\s+([A-Za-z_]\w*)\s*\(self', re.MULTILINE),
+            'static_method': re.compile(r'@staticmethod.*?\n\s*def\s+([A-Za-z_]\w*)\s*\(', re.MULTILINE | re.DOTALL),
+            'class_method': re.compile(r'@classmethod.*?\n\s*def\s+([A-Za-z_]\w*)\s*\(', re.MULTILINE | re.DOTALL)
+        }
+
+        # JavaScript/TypeScript patterns
+        self.js_patterns = {
+            'function': re.compile(r'function\s+([A-Za-z_$][\w$]*)\s*\('),
+            'arrow': re.compile(r'(?:const|let|var)\s+([A-Za-z_$][\w$]*)\s*=\s*(?:\([^)]*\)|[A-Za-z_$][\w$]*)\s*=>'),
+            'async_function': re.compile(r'async\s+function\s+([A-Za-z_$][\w$]*)\s*\('),
+            'class': re.compile(r'class\s+([A-Za-z_$][\w$]*)\s*(?:extends\s+[A-Za-z_$][\w$]*)?\s*\{'),
+            'method': re.compile(r'([A-Za-z_$][\w$]*)\s*\([^)]*\)\s*\{'),
+            'export_function': re.compile(r'export\s+(?:default\s+)?(?:async\s+)?function\s+([A-Za-z_$][\w$]*)\s*\('),
+            'export_const': re.compile(r'export\s+const\s+([A-Za-z_$][\w$]*)\s*=')
+        }
+
+    def extract(self, text: str) -> Dict[str, Any]:
+        """
+        Extract AST elements from text.
+
+        Returns:
+            Dictionary with ast_elements and has_code_blocks keys
+        """
+        elements = set()
+        has_code = False
+
+        # Extract code blocks using permissive regex
+        code_blocks = self.code_fence_pattern.findall(text)
+
+        for code_block in code_blocks[:10]:  # Limit processing
+            has_code = True
+
+            # Try to detect language from content
+            if self._looks_like_python(code_block):
+                python_elements = self._extract_python_ast(code_block)
+                elements.update(python_elements)
+            elif self._looks_like_javascript(code_block):
+                js_elements = self._extract_javascript_patterns(code_block)
+                elements.update(js_elements)
+            else:
+                # Try both as fallback
+                elements.update(self._extract_python_ast(code_block))
+                elements.update(self._extract_javascript_patterns(code_block))
+
+            # FIX: Enforce max elements limit
+            if len(elements) >= self.max_elements:
+                logger.debug(f"Reached max AST elements limit: {self.max_elements}")
+                break
+
+        # Also check for inline code patterns outside of fences
+        if not has_code:
+            # Look for function/class definitions in plain text
+            elements.update(self._extract_inline_patterns(text))
+
+        return {
+            "ast_elements": list(elements)[:self.max_elements],
+            "has_code_blocks": has_code
+        }
+
+    def _extract_python_ast(self, code: str) -> Set[str]:
+        """Extract Python AST elements with fallback to regex."""
+        elements = set()
+
+        try:
+            # Try proper AST parsing first
+            tree = ast.parse(code)
+
+            for node in ast.walk(tree):
+                if len(elements) >= self.max_elements:
+                    break
+
+                if isinstance(node, ast.FunctionDef):
+                    elements.add(f"func:{node.name}")
+                elif isinstance(node, ast.AsyncFunctionDef):
+                    elements.add(f"func:{node.name}")
+                elif isinstance(node, ast.ClassDef):
+                    elements.add(f"class:{node.name}")
+                    # Extract methods
+                    for item in node.body:
+                        if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                            elements.add(f"method:{node.name}.{item.name}")
+                            if len(elements) >= self.max_elements:
+                                break
+
+        except (SyntaxError, ValueError) as e:
+            # FIX: Python regex fallback for partial code fragments
+            logger.debug(f"AST parsing failed, using regex fallback: {e}")
+
+            for pattern_type, pattern in self.python_patterns.items():
+                for match in pattern.finditer(code):
+                    if len(elements) >= self.max_elements:
+                        break
+
+                    name = match.group(1)
+                    if 'method' in pattern_type:
+                        elements.add(f"method:{name}")
+                    elif 'class' in pattern_type:
+                        elements.add(f"class:{name}")
+                    else:
+                        elements.add(f"func:{name}")
+
+        return elements
+
+    def _extract_javascript_patterns(self, code: str) -> Set[str]:
+        """Extract JavaScript/TypeScript patterns."""
+        elements = set()
+
+        for pattern_type, pattern in self.js_patterns.items():
+            for match in pattern.finditer(code):
+                if len(elements) >= self.max_elements:
+                    break
+
+                name = match.group(1)
+                if 'class' in pattern_type:
+                    elements.add(f"class:{name}")
+                elif 'method' in pattern_type and name not in ['constructor', 'if', 'for', 'while']:
+                    elements.add(f"method:{name}")
+                else:
+                    elements.add(f"func:{name}")
+
+        return elements
+
+    def _extract_inline_patterns(self, text: str) -> Set[str]:
+        """Extract patterns from inline code mentions."""
+        elements = set()
+
+        # Look for backtick-wrapped function/class names
+        inline_pattern = re.compile(r'`([A-Za-z_][\w]*(?:\.[A-Za-z_][\w]*)*)`')
+
+        for match in inline_pattern.finditer(text):
+            if len(elements) >= self.max_elements:
+                break
+
+            name = match.group(1)
+            # Heuristic: if contains dot, likely a method
+            if '.' in name:
+                elements.add(f"method:{name}")
+            # Heuristic: PascalCase likely a class
+            elif name[0].isupper():
+                elements.add(f"class:{name}")
+            # Otherwise assume function
+            else:
+                elements.add(f"func:{name}")
+
+        return elements
+
+    def _looks_like_python(self, code: str) -> bool:
+        """Heuristic to detect Python code."""
+        python_indicators = [
+            'def ', 'import ', 'from ', 'class ', 'self.', 'self,',
+            '__init__', '__name__', 'if __name__', 'print(', 'async def'
+        ]
+        return any(indicator in code for indicator in python_indicators)
+
+    def _looks_like_javascript(self, code: str) -> bool:
+        """Heuristic to detect JavaScript/TypeScript."""
+        js_indicators = [
+            'function ', 'const ', 'let ', 'var ', '=>', 'export ',
+            'import ', 'class ', 'constructor(', 'this.', 'async function',
+            'interface ', 'type ', 'namespace ', 'enum '
+        ]
+        return any(indicator in code for indicator in js_indicators)