claude-self-reflect 3.0.0 → 3.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/mcp-server/pyproject.toml +1 -0
- package/package.json +2 -1
- package/scripts/importer/__init__.py +25 -0
- package/scripts/importer/__main__.py +14 -0
- package/scripts/importer/core/__init__.py +25 -0
- package/scripts/importer/core/config.py +120 -0
- package/scripts/importer/core/exceptions.py +52 -0
- package/scripts/importer/core/models.py +184 -0
- package/scripts/importer/embeddings/__init__.py +22 -0
- package/scripts/importer/embeddings/base.py +141 -0
- package/scripts/importer/embeddings/fastembed_provider.py +164 -0
- package/scripts/importer/embeddings/validator.py +136 -0
- package/scripts/importer/embeddings/voyage_provider.py +251 -0
- package/scripts/importer/main.py +393 -0
- package/scripts/importer/processors/__init__.py +15 -0
- package/scripts/importer/processors/ast_extractor.py +197 -0
- package/scripts/importer/processors/chunker.py +157 -0
- package/scripts/importer/processors/concept_extractor.py +109 -0
- package/scripts/importer/processors/conversation_parser.py +181 -0
- package/scripts/importer/processors/tool_extractor.py +165 -0
- package/scripts/importer/state/__init__.py +5 -0
- package/scripts/importer/state/state_manager.py +190 -0
- package/scripts/importer/storage/__init__.py +5 -0
- package/scripts/importer/storage/qdrant_storage.py +250 -0
- package/scripts/importer/utils/__init__.py +9 -0
- package/scripts/importer/utils/logger.py +87 -0
- package/scripts/importer/utils/project_normalizer.py +120 -0
package/scripts/importer/embeddings/fastembed_provider.py

@@ -0,0 +1,164 @@
+"""FastEmbed provider for local embeddings."""
+
+from typing import List, Any
+import logging
+import statistics
+from .base import EmbeddingProvider
+from ..core.exceptions import EmbeddingError
+
+logger = logging.getLogger(__name__)
+
+
+class FastEmbedProvider(EmbeddingProvider):
+    """
+    FastEmbed provider for generating embeddings locally.
+
+    Uses sentence-transformers/all-MiniLM-L6-v2 model by default.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.model = None
+        self.model_name = None
+        self.dimension = None
+
+    def initialize(self, config: Any) -> None:
+        """Initialize FastEmbed with the specified model."""
+        try:
+            from fastembed import TextEmbedding
+
+            # CRITICAL: Use the correct model name
+            self.model_name = config.embedding_model
+
+            # FastEmbed uses specific model names
+            if self.model_name == "sentence-transformers/all-MiniLM-L6-v2":
+                # This is the correct model we must use
+                fastembed_model = "sentence-transformers/all-MiniLM-L6-v2"
+            else:
+                fastembed_model = self.model_name
+
+            logger.info(f"Initializing FastEmbed with model: {fastembed_model}")
+
+            self.model = TextEmbedding(model_name=fastembed_model)
+            self.dimension = config.embedding_dimension
+            self._initialized = True
+
+            logger.info(f"FastEmbed initialized successfully with dimension {self.dimension}")
+
+        except ImportError as e:
+            error = EmbeddingError(
+                "FastEmbed not installed. Install with: pip install fastembed",
+                provider="FastEmbed"
+            )
+            self.handle_initialization_error(error)
+            raise error
+        except Exception as e:
+            error = EmbeddingError(
+                f"Failed to initialize FastEmbed: {str(e)}",
+                provider="FastEmbed"
+            )
+            self.handle_initialization_error(error)
+            raise error
+
+    def embed(self, texts: List[str]) -> List[List[float]]:
+        """Generate embeddings for texts using FastEmbed."""
+        if not self._initialized:
+            raise EmbeddingError("FastEmbed not initialized", provider="FastEmbed")
+
+        try:
+            # FastEmbed returns a generator, convert to list
+            embeddings = list(self.model.embed(texts))
+
+            # Convert to regular Python lists with safe indexing
+            result = []
+            for i, embedding in enumerate(embeddings):
+                # Convert numpy array or similar to list
+                if hasattr(embedding, 'tolist'):
+                    emb_list = embedding.tolist()
+                else:
+                    emb_list = list(embedding)
+
+                # Validate each embedding
+                if not self.validate_embedding(emb_list):
+                    # Safe indexing - use i which is guaranteed to be valid
+                    text_len = len(texts[i]) if i < len(texts) else 0
+                    raise EmbeddingError(
+                        f"Invalid embedding generated for text {i} of length {text_len}",
+                        provider="FastEmbed"
+                    )
+
+                result.append(emb_list)
+
+            return result
+
+        except Exception as e:
+            if not isinstance(e, EmbeddingError):
+                e = EmbeddingError(
+                    f"Failed to generate embeddings: {str(e)}",
+                    provider="FastEmbed"
+                )
+            self._last_error = e
+            raise e
+
+    def get_dimension(self) -> int:
+        """Get embedding dimension."""
+        if not self._initialized:
+            raise EmbeddingError("FastEmbed not initialized", provider="FastEmbed")
+        return self.dimension
+
+    def validate_embedding(self, embedding: List[float]) -> bool:
+        """
+        Validate embedding quality.
+
+        Checks:
+        1. Non-empty
+        2. Correct dimension
+        3. Not degenerate (all same values)
+        4. Has reasonable variance
+        """
+        if not embedding:
+            logger.error("Empty embedding detected")
+            return False
+
+        # Check dimension
+        if len(embedding) != self.dimension:
+            logger.error(
+                f"Dimension mismatch: expected {self.dimension}, got {len(embedding)}"
+            )
+            return False
+
+        # Check for degenerate embedding (all values identical)
+        unique_values = len(set(embedding))
+        if unique_values == 1:
+            logger.error(f"Degenerate embedding detected (all values are {embedding[0]})")
+            return False
+
+        # Check variance is above threshold
+        try:
+            variance = statistics.variance(embedding)
+            if variance < 1e-6:
+                logger.warning(f"Low variance embedding detected: {variance}")
+                # Don't fail on low variance, just warn
+        except statistics.StatisticsError:
+            # Less than 2 data points
+            pass
+
+        # Check for NaN or Inf values
+        if any(not isinstance(x, (int, float)) or x != x or abs(x) == float('inf')
+               for x in embedding):
+            logger.error("Embedding contains NaN or Inf values")
+            return False
+
+        return True
+
+    def handle_initialization_error(self, error: Exception) -> None:
+        """Handle and log initialization errors."""
+        self._last_error = error
+        self._initialized = False
+        logger.error(f"FastEmbed initialization failed: {error}")
+
+        # Could implement retry logic or fallback here
+        if "not installed" in str(error):
+            logger.info("Try: pip install fastembed")
+        elif "model" in str(error).lower():
+            logger.info(f"Model {self.model_name} may need to be downloaded first")
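For orientation while reading the diff, a minimal usage sketch of the new local provider follows. This is an editorial illustration, not part of the released files: the import path assumes the package root is on sys.path, and the SimpleNamespace is a hypothetical stand-in for the config object defined in scripts/importer/core/config.py (only the two attributes read by initialize() are supplied).

# Editorial sketch, not part of the diff. Assumes the package root is on
# sys.path; the SimpleNamespace stands in for the real importer config.
from types import SimpleNamespace

from scripts.importer.embeddings.fastembed_provider import FastEmbedProvider

provider = FastEmbedProvider()
provider.initialize(SimpleNamespace(
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",
    embedding_dimension=384,  # all-MiniLM-L6-v2 emits 384-dim vectors
))
vectors = provider.embed(["hello world", "claude self reflect"])
print(len(vectors), provider.get_dimension())  # 2 384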
package/scripts/importer/embeddings/validator.py

@@ -0,0 +1,136 @@
+"""Embedding validation utilities."""
+
+import statistics
+from typing import List, Tuple, Optional
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class EmbeddingValidator:
+    """
+    Comprehensive embedding validation.
+
+    Performs multiple checks to ensure embedding quality.
+    """
+
+    def __init__(
+        self,
+        expected_dimension: int,
+        min_variance: float = 1e-6,
+        max_magnitude: float = 100.0
+    ):
+        self.expected_dimension = expected_dimension
+        self.min_variance = min_variance
+        self.max_magnitude = max_magnitude
+
+    def validate(self, embedding: List[float]) -> Tuple[bool, Optional[str]]:
+        """
+        Validate an embedding vector.
+
+        Returns:
+            Tuple of (is_valid, error_message)
+        """
+        # Check empty
+        if not embedding:
+            return False, "Empty embedding"
+
+        # Check dimension
+        if len(embedding) != self.expected_dimension:
+            return False, f"Dimension mismatch: expected {self.expected_dimension}, got {len(embedding)}"
+
+        # Check for NaN/Inf
+        for i, val in enumerate(embedding):
+            if not isinstance(val, (int, float)):
+                return False, f"Non-numeric value at index {i}: {type(val)}"
+            if val != val:  # NaN check
+                return False, f"NaN value at index {i}"
+            if abs(val) == float('inf'):
+                return False, f"Infinite value at index {i}"
+
+        # Check for degenerate (all same)
+        unique_count = len(set(embedding))
+        if unique_count == 1:
+            return False, f"Degenerate embedding (all values are {embedding[0]})"
+
+        # Check variance
+        if len(embedding) > 1:
+            try:
+                variance = statistics.variance(embedding)
+                if variance < self.min_variance:
+                    # Warning, not error
+                    logger.warning(f"Low variance: {variance}")
+            except Exception as e:
+                logger.warning(f"Could not calculate variance: {e}")
+
+        # Check magnitude
+        max_val = max(abs(v) for v in embedding)
+        if max_val > self.max_magnitude:
+            return False, f"Value exceeds maximum magnitude: {max_val}"
+
+        # Check for mostly zeros
+        zero_count = sum(1 for v in embedding if abs(v) < 1e-10)
+        if zero_count > len(embedding) * 0.9:
+            return False, f"Embedding is mostly zeros ({zero_count}/{len(embedding)})"
+
+        return True, None
+
+    def validate_batch(
+        self,
+        embeddings: List[List[float]]
+    ) -> List[Tuple[int, str]]:
+        """
+        Validate a batch of embeddings.
+
+        Returns:
+            List of (index, error_message) for invalid embeddings
+        """
+        errors = []
+        for i, embedding in enumerate(embeddings):
+            valid, error = self.validate(embedding)
+            if not valid:
+                errors.append((i, error))
+        return errors
+
+    def check_similarity(
+        self,
+        embeddings: List[List[float]]
+    ) -> bool:
+        """
+        Check if embeddings in a batch are too similar.
+
+        This can indicate a problem with the embedding model.
+        """
+        if len(embeddings) < 2:
+            return True
+
+        # Calculate pairwise cosine similarities
+        from math import sqrt
+
+        def cosine_similarity(a: List[float], b: List[float]) -> float:
+            dot_product = sum(x * y for x, y in zip(a, b))
+            norm_a = sqrt(sum(x * x for x in a))
+            norm_b = sqrt(sum(y * y for y in b))
+            if norm_a == 0 or norm_b == 0:
+                return 0
+            return dot_product / (norm_a * norm_b)
+
+        # Check if all embeddings are too similar
+        high_similarity_count = 0
+        total_pairs = 0
+
+        for i in range(len(embeddings)):
+            for j in range(i + 1, min(i + 5, len(embeddings))):  # Check first 5 pairs
+                similarity = cosine_similarity(embeddings[i], embeddings[j])
+                if similarity > 0.99:  # Nearly identical
+                    high_similarity_count += 1
+                total_pairs += 1
+
+        if total_pairs > 0 and high_similarity_count / total_pairs > 0.8:
+            logger.warning(
+                f"High similarity detected: {high_similarity_count}/{total_pairs} "
+                f"pairs have >0.99 similarity"
+            )
+            return False
+
+        return True
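A short editorial sketch of how EmbeddingValidator reports problems (not part of the diff); the three-dimensional vectors are made up for illustration.

# Editorial sketch, not part of the diff.
from scripts.importer.embeddings.validator import EmbeddingValidator

validator = EmbeddingValidator(expected_dimension=3)

ok, err = validator.validate([0.1, -0.2, 0.3])
print(ok, err)   # True None

ok, err = validator.validate([0.5, 0.5, 0.5])
print(err)       # Degenerate embedding (all values are 0.5)

# validate_batch returns (index, message) pairs for the bad entries only
print(validator.validate_batch([[0.1, -0.2, 0.3], [float("nan"), 0.0, 0.0]]))
# [(1, 'NaN value at index 0')]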
package/scripts/importer/embeddings/voyage_provider.py

@@ -0,0 +1,251 @@
+"""Voyage AI embedding provider with conditional import support."""
+
+import logging
+from typing import List, Optional
+
+from .base import EmbeddingProvider
+
+logger = logging.getLogger(__name__)
+
+# Conditional import to avoid dependency when not using Voyage
+try:
+    import voyageai
+    VOYAGE_AVAILABLE = True
+except ImportError:
+    voyageai = None
+    VOYAGE_AVAILABLE = False
+    logger.debug("Voyage AI not installed. Install with: pip install voyageai")
+
+
+class VoyageEmbeddingProvider(EmbeddingProvider):
+    """
+    Voyage AI cloud embedding provider.
+
+    Supports multiple models with different dimensions:
+    - voyage-2: 1024 dimensions (default)
+    - voyage-large-2: 1536 dimensions
+    - voyage-3: 1024 dimensions
+    - voyage-3-lite: 512 dimensions
+    """
+
+    SUPPORTED_MODELS = {
+        "voyage-2": 1024,
+        "voyage-large-2": 1536,
+        "voyage-3": 1024,
+        "voyage-3-lite": 512,
+    }
+
+    def __init__(
+        self,
+        api_key: str,
+        model_name: str = "voyage-2",
+        batch_size: int = 128,
+        max_tokens_per_batch: int = 100000,  # 20k buffer from 120k limit
+        token_estimation_ratio: int = 3  # chars per token
+    ):
+        """
+        Initialize Voyage AI provider.
+
+        Args:
+            api_key: Voyage AI API key
+            model_name: Model to use (default: voyage-2)
+            batch_size: Maximum batch size for embedding
+            max_tokens_per_batch: Maximum tokens per batch (default: 100000, 20k buffer from 120k limit)
+            token_estimation_ratio: Characters per token estimate (default: 3)
+
+        Raises:
+            ImportError: If voyageai package is not installed
+            ValueError: If API key is empty or model is unsupported
+        """
+        if not VOYAGE_AVAILABLE:
+            raise ImportError(
+                "Voyage AI is not installed. "
+                "Install with: pip install voyageai"
+            )
+
+        if not api_key:
+            raise ValueError("Voyage API key is required")
+
+        if model_name not in self.SUPPORTED_MODELS:
+            raise ValueError(
+                f"Unsupported model: {model_name}. "
+                f"Supported models: {list(self.SUPPORTED_MODELS.keys())}"
+            )
+
+        self.api_key = api_key
+        self.model_name = model_name
+        self.batch_size = batch_size
+        self.dimension = self.SUPPORTED_MODELS[model_name]
+        self.max_tokens_per_batch = max_tokens_per_batch
+        self.token_estimation_ratio = token_estimation_ratio
+
+        # Initialize client
+        self.client = voyageai.Client(api_key=api_key)
+        logger.info(f"Initialized Voyage AI with model {model_name} ({self.dimension} dims)")
+        # Note: Never log the API key for security
+
+    def embed(self, texts: List[str]) -> List[List[float]]:
+        """
+        Generate embeddings for texts using Voyage AI.
+
+        Args:
+            texts: List of texts to embed
+
+        Returns:
+            List of embedding vectors
+
+        Raises:
+            ValueError: If request is invalid
+            Exception: If API call fails
+        """
+        if not texts:
+            return []
+
+        try:
+            # Voyage AI expects a list of texts
+            result = self.client.embed(
+                texts=texts,
+                model=self.model_name
+            )
+
+            # Extract embeddings from response
+            embeddings = result.embeddings
+
+            # Validate dimensions
+            for i, embedding in enumerate(embeddings):
+                if len(embedding) != self.dimension:
+                    raise ValueError(
+                        f"Embedding {i} has wrong dimension: "
+                        f"expected {self.dimension}, got {len(embedding)}"
+                    )
+
+            logger.debug(f"Generated {len(embeddings)} embeddings with Voyage AI")
+            return embeddings
+
+        except AttributeError as e:
+            # Handle voyageai-specific errors if available
+            if 'RateLimitError' in str(type(e).__name__):
+                logger.error(f"Rate limit exceeded, retry with backoff")
+                raise
+            elif 'InvalidRequestError' in str(type(e).__name__):
+                logger.error(f"Invalid request to Voyage API")
+                raise ValueError(f"Invalid embedding request: {e}")
+            else:
+                logger.error(f"Voyage API error: {e}")
+                raise
+        except Exception as e:
+            logger.error(f"Unexpected error during embedding: {type(e).__name__}")
+            raise
+
+    def estimate_tokens(self, text: str) -> int:
+        """
+        Estimate token count for text.
+        Conservative estimate: 3 characters = 1 token.
+
+        Args:
+            text: Text to estimate tokens for
+
+        Returns:
+            Estimated token count
+        """
+        return len(text) // self.token_estimation_ratio
+
+    def embed_batch(self, texts: List[str]) -> List[List[float]]:
+        """
+        Generate embeddings in token-aware batches to respect API limits.
+
+        This implements the critical fix for issue #38 - prevents
+        "max allowed tokens per batch is 120000" errors.
+
+        Args:
+            texts: List of texts to embed
+
+        Returns:
+            List of embedding vectors
+        """
+        if not texts:
+            return []
+
+        all_embeddings = []
+        current_batch = []
+        current_tokens = 0
+
+        for text in texts:
+            # Estimate tokens for this text
+            text_tokens = self.estimate_tokens(text)
+
+            # Check if single text exceeds limit
+            if text_tokens > self.max_tokens_per_batch:
+                logger.warning(
+                    f"Single text with {text_tokens} estimated tokens exceeds "
+                    f"limit of {self.max_tokens_per_batch}. Truncating."
+                )
+                # Truncate text to fit within limit
+                max_chars = self.max_tokens_per_batch * self.token_estimation_ratio
+                text = text[:max_chars]
+                text_tokens = self.estimate_tokens(text)
+
+            # Check if adding this text would exceed batch limit
+            if current_batch and (current_tokens + text_tokens) > self.max_tokens_per_batch:
+                # Process current batch
+                logger.debug(
+                    f"Processing batch with {len(current_batch)} texts, "
+                    f"~{current_tokens} tokens"
+                )
+                embeddings = self.embed(current_batch)
+                all_embeddings.extend(embeddings)
+
+                # Start new batch
+                current_batch = [text]
+                current_tokens = text_tokens
+            else:
+                # Add to current batch
+                current_batch.append(text)
+                current_tokens += text_tokens
+
+        # Process final batch
+        if current_batch:
+            logger.debug(
+                f"Processing final batch with {len(current_batch)} texts, "
+                f"~{current_tokens} tokens"
+            )
+            embeddings = self.embed(current_batch)
+            all_embeddings.extend(embeddings)
+
+        return all_embeddings
+
+    def get_dimension(self) -> int:
+        """Get embedding dimension for current model."""
+        return self.dimension
+
+    def get_model_name(self) -> str:
+        """Get the model name being used."""
+        return self.model_name
+
+    def validate_api_key(self) -> bool:
+        """
+        Validate that the API key works.
+
+        Returns:
+            True if API key is valid
+        """
+        try:
+            # Test with a simple embedding
+            test_result = self.client.embed(
+                texts=["test"],
+                model=self.model_name
+            )
+            return len(test_result.embeddings) > 0
+        except Exception as e:
+            logger.error(f"API key validation failed: {e}")
+            return False
+
+    @classmethod
+    def is_available(cls) -> bool:
+        """Check if Voyage AI is available for use."""
+        return VOYAGE_AVAILABLE
+
+    @classmethod
+    def get_supported_models(cls) -> dict:
+        """Get dictionary of supported models and their dimensions."""
+        return cls.SUPPORTED_MODELS.copy()
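To make the token-aware batching added for issue #38 concrete, a hedged usage sketch follows (editorial, not part of the diff). With the default estimate of 3 characters per token, a 9,000-character chunk counts as roughly 3,000 tokens, and embed_batch() flushes a request before the running estimate crosses the 100,000-token soft limit (a 20k buffer under the API's 120k cap). The environment variable name is hypothetical.

# Editorial sketch, not part of the diff. Requires a real Voyage AI key;
# every call below hits the network.
import os

from scripts.importer.embeddings.voyage_provider import VoyageEmbeddingProvider

provider = VoyageEmbeddingProvider(
    api_key=os.environ["VOYAGE_API_KEY"],  # hypothetical variable name
    model_name="voyage-3-lite",            # 512-dimensional vectors
)

# estimate_tokens() is len(text) // 3, so ~9,000 chars ≈ 3,000 tokens
print(provider.estimate_tokens("x" * 9000))  # 3000

# embed_batch() groups texts so each request stays under ~100,000
# estimated tokens instead of sending one oversized request
docs = ["some conversation chunk " * 200 for _ in range(50)]
vectors = provider.embed_batch(docs)
assert len(vectors) == len(docs)
assert all(len(v) == provider.get_dimension() for v in vectors)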