claude-self-reflect 3.0.0 → 3.0.2
This diff shows the changes between two publicly released versions of the package, as they appear in the supported public registries. It is provided for informational purposes only.
- package/.claude/agents/claude-self-reflect-test.md +110 -66
- package/README.md +1 -1
- package/installer/setup-wizard.js +4 -2
- package/mcp-server/pyproject.toml +1 -0
- package/mcp-server/src/server.py +84 -0
- package/package.json +2 -1
- package/scripts/import-conversations-unified.py +225 -44
- package/scripts/importer/__init__.py +25 -0
- package/scripts/importer/__main__.py +14 -0
- package/scripts/importer/core/__init__.py +25 -0
- package/scripts/importer/core/config.py +120 -0
- package/scripts/importer/core/exceptions.py +52 -0
- package/scripts/importer/core/models.py +184 -0
- package/scripts/importer/embeddings/__init__.py +22 -0
- package/scripts/importer/embeddings/base.py +141 -0
- package/scripts/importer/embeddings/fastembed_provider.py +164 -0
- package/scripts/importer/embeddings/validator.py +136 -0
- package/scripts/importer/embeddings/voyage_provider.py +251 -0
- package/scripts/importer/main.py +393 -0
- package/scripts/importer/processors/__init__.py +15 -0
- package/scripts/importer/processors/ast_extractor.py +197 -0
- package/scripts/importer/processors/chunker.py +157 -0
- package/scripts/importer/processors/concept_extractor.py +109 -0
- package/scripts/importer/processors/conversation_parser.py +181 -0
- package/scripts/importer/processors/tool_extractor.py +165 -0
- package/scripts/importer/state/__init__.py +5 -0
- package/scripts/importer/state/state_manager.py +190 -0
- package/scripts/importer/storage/__init__.py +5 -0
- package/scripts/importer/storage/qdrant_storage.py +250 -0
- package/scripts/importer/utils/__init__.py +9 -0
- package/scripts/importer/utils/logger.py +87 -0
- package/scripts/importer/utils/project_normalizer.py +120 -0
package/scripts/importer/embeddings/voyage_provider.py (new file):

@@ -0,0 +1,251 @@
+"""Voyage AI embedding provider with conditional import support."""
+
+import logging
+from typing import List, Optional
+
+from .base import EmbeddingProvider
+
+logger = logging.getLogger(__name__)
+
+# Conditional import to avoid dependency when not using Voyage
+try:
+    import voyageai
+    VOYAGE_AVAILABLE = True
+except ImportError:
+    voyageai = None
+    VOYAGE_AVAILABLE = False
+    logger.debug("Voyage AI not installed. Install with: pip install voyageai")
+
+
+class VoyageEmbeddingProvider(EmbeddingProvider):
+    """
+    Voyage AI cloud embedding provider.
+
+    Supports multiple models with different dimensions:
+    - voyage-2: 1024 dimensions (default)
+    - voyage-large-2: 1536 dimensions
+    - voyage-3: 1024 dimensions
+    - voyage-3-lite: 512 dimensions
+    """
+
+    SUPPORTED_MODELS = {
+        "voyage-2": 1024,
+        "voyage-large-2": 1536,
+        "voyage-3": 1024,
+        "voyage-3-lite": 512,
+    }
+
+    def __init__(
+        self,
+        api_key: str,
+        model_name: str = "voyage-2",
+        batch_size: int = 128,
+        max_tokens_per_batch: int = 100000,  # 20k buffer from 120k limit
+        token_estimation_ratio: int = 3  # chars per token
+    ):
+        """
+        Initialize Voyage AI provider.
+
+        Args:
+            api_key: Voyage AI API key
+            model_name: Model to use (default: voyage-2)
+            batch_size: Maximum batch size for embedding
+            max_tokens_per_batch: Maximum tokens per batch (default: 100000, 20k buffer from 120k limit)
+            token_estimation_ratio: Characters per token estimate (default: 3)
+
+        Raises:
+            ImportError: If voyageai package is not installed
+            ValueError: If API key is empty or model is unsupported
+        """
+        if not VOYAGE_AVAILABLE:
+            raise ImportError(
+                "Voyage AI is not installed. "
+                "Install with: pip install voyageai"
+            )
+
+        if not api_key:
+            raise ValueError("Voyage API key is required")
+
+        if model_name not in self.SUPPORTED_MODELS:
+            raise ValueError(
+                f"Unsupported model: {model_name}. "
+                f"Supported models: {list(self.SUPPORTED_MODELS.keys())}"
+            )
+
+        self.api_key = api_key
+        self.model_name = model_name
+        self.batch_size = batch_size
+        self.dimension = self.SUPPORTED_MODELS[model_name]
+        self.max_tokens_per_batch = max_tokens_per_batch
+        self.token_estimation_ratio = token_estimation_ratio
+
+        # Initialize client
+        self.client = voyageai.Client(api_key=api_key)
+        logger.info(f"Initialized Voyage AI with model {model_name} ({self.dimension} dims)")
+        # Note: Never log the API key for security
+
+    def embed(self, texts: List[str]) -> List[List[float]]:
+        """
+        Generate embeddings for texts using Voyage AI.
+
+        Args:
+            texts: List of texts to embed
+
+        Returns:
+            List of embedding vectors
+
+        Raises:
+            ValueError: If request is invalid
+            Exception: If API call fails
+        """
+        if not texts:
+            return []
+
+        try:
+            # Voyage AI expects a list of texts
+            result = self.client.embed(
+                texts=texts,
+                model=self.model_name
+            )
+
+            # Extract embeddings from response
+            embeddings = result.embeddings
+
+            # Validate dimensions
+            for i, embedding in enumerate(embeddings):
+                if len(embedding) != self.dimension:
+                    raise ValueError(
+                        f"Embedding {i} has wrong dimension: "
+                        f"expected {self.dimension}, got {len(embedding)}"
+                    )
+
+            logger.debug(f"Generated {len(embeddings)} embeddings with Voyage AI")
+            return embeddings
+
+        except AttributeError as e:
+            # Handle voyageai-specific errors if available
+            if 'RateLimitError' in str(type(e).__name__):
+                logger.error(f"Rate limit exceeded, retry with backoff")
+                raise
+            elif 'InvalidRequestError' in str(type(e).__name__):
+                logger.error(f"Invalid request to Voyage API")
+                raise ValueError(f"Invalid embedding request: {e}")
+            else:
+                logger.error(f"Voyage API error: {e}")
+                raise
+        except Exception as e:
+            logger.error(f"Unexpected error during embedding: {type(e).__name__}")
+            raise
+
+    def estimate_tokens(self, text: str) -> int:
+        """
+        Estimate token count for text.
+        Conservative estimate: 3 characters = 1 token.
+
+        Args:
+            text: Text to estimate tokens for
+
+        Returns:
+            Estimated token count
+        """
+        return len(text) // self.token_estimation_ratio
+
+    def embed_batch(self, texts: List[str]) -> List[List[float]]:
+        """
+        Generate embeddings in token-aware batches to respect API limits.
+
+        This implements the critical fix for issue #38 - prevents
+        "max allowed tokens per batch is 120000" errors.
+
+        Args:
+            texts: List of texts to embed
+
+        Returns:
+            List of embedding vectors
+        """
+        if not texts:
+            return []
+
+        all_embeddings = []
+        current_batch = []
+        current_tokens = 0
+
+        for text in texts:
+            # Estimate tokens for this text
+            text_tokens = self.estimate_tokens(text)
+
+            # Check if single text exceeds limit
+            if text_tokens > self.max_tokens_per_batch:
+                logger.warning(
+                    f"Single text with {text_tokens} estimated tokens exceeds "
+                    f"limit of {self.max_tokens_per_batch}. Truncating."
+                )
+                # Truncate text to fit within limit
+                max_chars = self.max_tokens_per_batch * self.token_estimation_ratio
+                text = text[:max_chars]
+                text_tokens = self.estimate_tokens(text)
+
+            # Check if adding this text would exceed batch limit
+            if current_batch and (current_tokens + text_tokens) > self.max_tokens_per_batch:
+                # Process current batch
+                logger.debug(
+                    f"Processing batch with {len(current_batch)} texts, "
+                    f"~{current_tokens} tokens"
+                )
+                embeddings = self.embed(current_batch)
+                all_embeddings.extend(embeddings)
+
+                # Start new batch
+                current_batch = [text]
+                current_tokens = text_tokens
+            else:
+                # Add to current batch
+                current_batch.append(text)
+                current_tokens += text_tokens
+
+        # Process final batch
+        if current_batch:
+            logger.debug(
+                f"Processing final batch with {len(current_batch)} texts, "
+                f"~{current_tokens} tokens"
+            )
+            embeddings = self.embed(current_batch)
+            all_embeddings.extend(embeddings)
+
+        return all_embeddings
+
+    def get_dimension(self) -> int:
+        """Get embedding dimension for current model."""
+        return self.dimension
+
+    def get_model_name(self) -> str:
+        """Get the model name being used."""
+        return self.model_name
+
+    def validate_api_key(self) -> bool:
+        """
+        Validate that the API key works.
+
+        Returns:
+            True if API key is valid
+        """
+        try:
+            # Test with a simple embedding
+            test_result = self.client.embed(
+                texts=["test"],
+                model=self.model_name
+            )
+            return len(test_result.embeddings) > 0
+        except Exception as e:
+            logger.error(f"API key validation failed: {e}")
+            return False
+
+    @classmethod
+    def is_available(cls) -> bool:
+        """Check if Voyage AI is available for use."""
+        return VOYAGE_AVAILABLE
+
+    @classmethod
+    def get_supported_models(cls) -> dict:
+        """Get dictionary of supported models and their dimensions."""
+        return cls.SUPPORTED_MODELS.copy()
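The embed_batch method above is the token-aware batching fix referenced for issue #38: texts are grouped by estimated token count (len(text) // 3) rather than item count, so each request stays under the 100,000-token budget, 20k below Voyage's 120,000-per-batch limit. A minimal usage sketch; the VOYAGE_API_KEY environment variable name and the importer import path are assumptions for illustration, not taken from the diff:

    import os
    from importer.embeddings import VoyageEmbeddingProvider

    # Assumption: the key is exported in the environment as VOYAGE_API_KEY.
    provider = VoyageEmbeddingProvider(api_key=os.environ["VOYAGE_API_KEY"])

    # The chars/3 heuristic: a 9,000-character chunk is budgeted as ~3,000 tokens,
    # so roughly 33 such chunks fit in one 100,000-token batch.
    assert provider.estimate_tokens("x" * 9000) == 3000

    texts = ["How do I batch Voyage embeddings?", "Qdrant stores one vector per chunk."]
    vectors = provider.embed_batch(texts)  # split into <=100k-token API calls
    assert all(len(v) == provider.get_dimension() for v in vectors)  # 1024 for voyage-2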
package/scripts/importer/main.py (new file):

@@ -0,0 +1,393 @@
+"""Main orchestrator with dependency injection."""
+
+import logging
+import time
+from pathlib import Path
+from typing import List, Optional, Dict, Any
+from dependency_injector import containers, providers
+
+from .core import (
+    ImportConfig,
+    Message,
+    ConversationChunk,
+    ProcessedPoint,
+    ImportResult,
+    ImportStats
+)
+from .core.exceptions import ImportError, ParseError, ValidationError
+from .embeddings import EmbeddingProvider, FastEmbedProvider
+try:
+    from .embeddings import VoyageEmbeddingProvider
+    VOYAGE_AVAILABLE = True
+except ImportError:
+    VoyageEmbeddingProvider = None
+    VOYAGE_AVAILABLE = False
+from .processors import (
+    ConversationParser,
+    Chunker,
+    ASTExtractor,
+    ConceptExtractor,
+    ToolUsageExtractor
+)
+from .storage import QdrantStorage
+from .state import StateManager
+from .utils import ProjectNormalizer, setup_logging
+
+logger = logging.getLogger(__name__)
+
+
+class ConversationProcessor:
+    """
+    Main orchestrator for processing conversations.
+
+    Follows dependency injection pattern with all dependencies
+    injected through constructor.
+    """
+
+    def __init__(
+        self,
+        config: ImportConfig,
+        embedding_provider: EmbeddingProvider,
+        storage: QdrantStorage,
+        parser: ConversationParser,
+        chunker: Chunker,
+        extractors: List[Any],
+        state_manager: StateManager,
+        normalizer: ProjectNormalizer
+    ):
+        self.config = config
+        self.embedding_provider = embedding_provider
+        self.storage = storage
+        self.parser = parser
+        self.chunker = chunker
+        self.extractors = extractors
+        self.state = state_manager
+        self.normalizer = normalizer
+        self.stats = ImportStats()
+
+    def process_file(self, file_path: Path) -> ImportResult:
+        """
+        Process a single JSONL file.
+
+        Returns:
+            ImportResult with processing details
+        """
+        start_time = time.time()
+        result = ImportResult(file_path=str(file_path), success=False)
+
+        try:
+            # Check if already processed
+            if not self.config.force_reimport and self.state.is_processed(file_path):
+                logger.info(f"Skipping already processed: {file_path}")
+                result.success = True
+                return result
+
+            # Parse conversation
+            logger.debug(f"Parsing conversation: {file_path}")
+            messages = self.parser.parse_file(file_path)
+            if not messages:
+                raise ParseError(str(file_path), reason="No messages found")
+
+            # Create chunks
+            logger.debug(f"Creating chunks for {len(messages)} messages")
+            chunks = self.chunker.create_chunks(messages, str(file_path))
+            result.chunks_processed = len(chunks)
+
+            # Extract metadata
+            logger.debug("Extracting metadata")
+            self._enrich_chunks(chunks)
+
+            # Generate embeddings
+            logger.debug("Generating embeddings")
+            texts = [chunk.text for chunk in chunks]
+            # Use embed_batch for proper token-aware batching with Voyage
+            if hasattr(self.embedding_provider, 'embed_batch'):
+                embeddings = self.embedding_provider.embed_batch(texts)
+            else:
+                embeddings = self.embedding_provider.embed(texts)
+
+            # Build points
+            logger.debug("Building points")
+            points = self._build_points(chunks, embeddings, file_path)
+
+            # Store in Qdrant
+            logger.debug(f"Storing {len(points)} points")
+            collection_name = self._get_collection_name(file_path)
+            stored = self.storage.upsert_points(collection_name, points)
+            result.points_created = stored
+
+            # Update state
+            self.state.mark_processed(file_path, stored)
+
+            result.success = True
+            logger.info(f"Successfully processed {file_path}: {stored} points")
+
+        except Exception as e:
+            logger.error(f"Failed to process {file_path}: {e}")
+            result.error = str(e)
+            self.state.mark_failed(file_path, str(e))
+            if not isinstance(e, ImportError):
+                raise ImportError(f"Processing failed: {e}")
+
+        finally:
+            result.duration_seconds = time.time() - start_time
+            self.stats.add_result(result)
+
+        return result
+
+    def _enrich_chunks(self, chunks: List[ConversationChunk]) -> None:
+        """Add metadata to chunks using extractors."""
+        for chunk in chunks:
+            for extractor in self.extractors:
+                try:
+                    metadata = extractor.extract(chunk.text)
+                    for key, value in metadata.items():
+                        chunk.add_metadata(key, value)
+                except Exception as e:
+                    logger.warning(f"Extractor {extractor.__class__.__name__} failed: {e}")
+
+    def _build_points(
+        self,
+        chunks: List[ConversationChunk],
+        embeddings: List[List[float]],
+        file_path: Path
+    ) -> List[ProcessedPoint]:
+        """Build Qdrant points from chunks and embeddings."""
+        points = []
+        project_name = self.normalizer.get_project_name(file_path)
+
+        for chunk, embedding in zip(chunks, embeddings):
+            # Generate unique point ID
+            point_id = f"{project_name}_{chunk.unique_id}"
+
+            # Build payload
+            payload = {
+                "text": chunk.text,
+                "project": project_name,
+                "file_path": str(file_path),
+                "chunk_index": chunk.chunk_index,
+                "total_chunks": chunk.total_chunks,
+                "message_indices": chunk.message_indices,
+                **chunk.metadata
+            }
+
+            point = ProcessedPoint(
+                id=point_id,
+                vector=embedding,
+                payload=payload
+            )
+
+            # Validate dimension
+            if not point.validate_dimension(self.embedding_provider.get_dimension()):
+                raise ValidationError(
+                    "embedding",
+                    len(embedding),
+                    f"Expected dimension {self.embedding_provider.get_dimension()}"
+                )
+
+            points.append(point)
+
+        return points
+
+    def _get_collection_name(self, file_path: Path) -> str:
+        """Generate collection name for file."""
+        return self.normalizer.get_collection_name(file_path)
+
+    def get_stats(self) -> ImportStats:
+        """Get import statistics."""
+        return self.stats
+
+
+class ImporterContainer(containers.DeclarativeContainer):
+    """
+    Dependency injection container using dependency-injector library.
+
+    This provides sophisticated dependency management as recommended
+    in the code review.
+    """
+
+    # Configuration provider
+    config = providers.Singleton(ImportConfig.from_env)
+
+    # Logging setup
+    logger_setup = providers.Resource(
+        setup_logging,
+        level=config.provided.log_level
+    )
+
+    # Core services
+    normalizer = providers.Singleton(ProjectNormalizer)
+
+    state_manager = providers.Singleton(
+        StateManager,
+        state_file=config.provided.state_file_path
+    )
+
+    # Embedding provider with selector
+    def get_embedding_provider(config_obj):
+        """Factory function to select embedding provider based on config."""
+        if config_obj.use_voyage and config_obj.voyage_api_key:
+            if not VOYAGE_AVAILABLE:
+                logger.warning("Voyage requested but not available, falling back to FastEmbed")
+                return FastEmbedProvider()
+            return VoyageEmbeddingProvider(
+                api_key=config_obj.voyage_api_key,
+                model_name="voyage-2"
+            )
+        return FastEmbedProvider()
+
+    embedding_provider = providers.Factory(
+        get_embedding_provider,
+        config_obj=config
+    )
+
+    # Storage
+    storage = providers.Singleton(
+        QdrantStorage,
+        url=config.provided.qdrant_url,
+        api_key=config.provided.qdrant_api_key
+    )
+
+    # Processors
+    parser = providers.Singleton(ConversationParser)
+
+    chunker = providers.Singleton(
+        Chunker,
+        chunk_size=config.provided.chunk_size,
+        chunk_overlap=config.provided.chunk_overlap
+    )
+
+    # Metadata extractors
+    ast_extractor = providers.Singleton(
+        ASTExtractor,
+        max_elements=config.provided.max_ast_elements
+    )
+
+    concept_extractor = providers.Singleton(ConceptExtractor)
+
+    tool_extractor = providers.Singleton(ToolUsageExtractor)
+
+    extractors = providers.List(
+        ast_extractor,
+        concept_extractor,
+        tool_extractor
+    )
+
+    # Main processor
+    processor = providers.Factory(
+        ConversationProcessor,
+        config=config,
+        embedding_provider=embedding_provider,
+        storage=storage,
+        parser=parser,
+        chunker=chunker,
+        extractors=extractors,
+        state_manager=state_manager,
+        normalizer=normalizer
+    )
+
+
+def create_processor(config: Optional[ImportConfig] = None) -> ConversationProcessor:
+    """
+    Factory function to create a configured processor.
+
+    Args:
+        config: Optional configuration, uses environment if not provided
+
+    Returns:
+        Configured ConversationProcessor instance
+    """
+    container = ImporterContainer()
+
+    if config:
+        container.config.override(config)
+
+    # Get processor instance
+    processor = container.processor()
+
+    # Note: Providers are already initialized by the container
+    # No need to call initialize methods
+
+    return processor
+
+
+def process_files(
+    files: List[Path],
+    config: Optional[ImportConfig] = None,
+    progress_callback: Optional[Any] = None
+) -> ImportStats:
+    """
+    Process multiple files with progress tracking.
+
+    Args:
+        files: List of JSONL files to process
+        config: Optional configuration
+        progress_callback: Optional callback for progress updates
+
+    Returns:
+        ImportStats with aggregate results
+    """
+    processor = create_processor(config)
+
+    for i, file_path in enumerate(files):
+        if progress_callback:
+            progress_callback(i, len(files), file_path)
+
+        try:
+            result = processor.process_file(file_path)
+            logger.info(
+                f"[{i+1}/{len(files)}] Processed {file_path.name}: "
+                f"{result.points_created} points"
+            )
+        except Exception as e:
+            logger.error(f"Failed to process {file_path}: {e}")
+
+    return processor.get_stats()
+
+
+def main():
+    """Main entry point for CLI execution."""
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Import Claude conversations to Qdrant")
+    parser.add_argument("--limit", type=int, help="Limit number of files to process")
+    parser.add_argument("--dry-run", action="store_true", help="Dry run without importing")
+    parser.add_argument("--force", action="store_true", help="Force reimport all files")
+    parser.add_argument("--voyage", action="store_true", help="Use Voyage AI embeddings")
+    parser.add_argument("--log-level", default="INFO", help="Logging level")
+
+    args = parser.parse_args()
+
+    # Setup logging
+    setup_logging(args.log_level)
+
+    # Create config from environment with CLI overrides
+    config_dict = {}
+    if args.dry_run:
+        config_dict["dry_run"] = True
+    if args.force:
+        config_dict["force_reimport"] = True
+    if args.voyage:
+        config_dict["use_voyage"] = True
+    if args.limit:
+        config_dict["file_limit"] = args.limit
+
+    config = ImportConfig.from_env()
+    if config_dict:
+        # Override with CLI args
+        config = ImportConfig.from_dict({**config.__dict__, **config_dict})
+
+    # Find all JSONL files
+    base_path = Path.home() / ".claude" / "projects"
+    files = list(base_path.glob("*/*.jsonl"))
+
+    if args.limit:
+        files = files[:args.limit]
+
+    logger.info(f"Processing {len(files)} files...")
+
+    # Process files
+    stats = process_files(files, config)
+
+    logger.info(f"Import complete: {stats}")
+    return 0 if stats.failed_files == 0 else 1
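Besides the main() CLI entry point, the hunk above exposes a programmatic path through create_processor and process_files. A short sketch of that path, assuming the importer package is importable and that ImportConfig.from_env() resolves Qdrant settings from the environment:

    from pathlib import Path
    from importer.main import process_files

    # Matches the callback invocation in process_files: (index, total, file_path).
    def on_progress(done: int, total: int, path: Path) -> None:
        print(f"[{done}/{total}] {path.name}")

    # Same discovery pattern main() uses for Claude conversation logs.
    files = sorted((Path.home() / ".claude" / "projects").glob("*/*.jsonl"))

    stats = process_files(files, progress_callback=on_progress)  # config read from env
    print(stats)  # aggregate ImportStats; main() exits non-zero if failed_files > 0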
package/scripts/importer/processors/__init__.py (new file):

@@ -0,0 +1,15 @@
+"""Processors for parsing and extracting metadata from conversations."""
+
+from .conversation_parser import ConversationParser
+from .chunker import Chunker
+from .ast_extractor import ASTExtractor
+from .concept_extractor import ConceptExtractor
+from .tool_extractor import ToolUsageExtractor
+
+__all__ = [
+    "ConversationParser",
+    "Chunker",
+    "ASTExtractor",
+    "ConceptExtractor",
+    "ToolUsageExtractor"
+]
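The extractors exported here plug into ConversationProcessor._enrich_chunks through the same implicit contract: an extract(text) method returning a dict whose items are merged into the chunk's metadata (and ultimately the Qdrant payload). A hypothetical extra extractor to illustrate that shape; the class and the "urls" key are illustrative, not part of the package:

    import re
    from typing import Dict, List

    class UrlExtractor:
        """Hypothetical extractor: collects URLs mentioned in a chunk."""

        URL_RE = re.compile(r"https?://\S+")

        def extract(self, text: str) -> Dict[str, List[str]]:
            # Returned items are added via chunk.add_metadata(key, value),
            # so they end up as payload fields alongside text/project/etc.
            urls = self.URL_RE.findall(text)
            return {"urls": urls} if urls else {}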