claude-self-reflect 5.0.4 → 5.0.5
This diff compares the contents of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
- package/.claude/agents/csr-validator.md +43 -0
- package/.claude/agents/open-source-maintainer.md +77 -0
- package/package.json +6 -1
- package/scripts/doctor.py +342 -0
- package/scripts/embedding_service.py +241 -0
- package/scripts/import_strategies.py +344 -0
- package/scripts/message_processors.py +248 -0
- package/scripts/metadata_extractor.py +262 -0
package/scripts/embedding_service.py (new file):

@@ -0,0 +1,241 @@

```python
"""
Embedding service abstraction to handle both local and cloud embeddings.
Reduces complexity by separating embedding concerns from import logic.
"""

import os
import logging
from abc import ABC, abstractmethod
from typing import List, Optional
from pathlib import Path

logger = logging.getLogger(__name__)


class EmbeddingProvider(ABC):
    """Abstract base class for embedding providers."""

    @abstractmethod
    def generate_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings for a list of texts."""
        pass

    @abstractmethod
    def get_dimension(self) -> int:
        """Get the dimension of embeddings produced by this provider."""
        pass

    @abstractmethod
    def get_collection_suffix(self) -> str:
        """Get the suffix for collection naming."""
        pass


class LocalEmbeddingProvider(EmbeddingProvider):
    """Local embedding provider using FastEmbed."""

    def __init__(self):
        self.model = None
        self.dimension = 384
        self._initialize_model()

    def _initialize_model(self):
        """Initialize the FastEmbed model."""
        try:
            from fastembed import TextEmbedding
            self.model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")
            logger.info("Initialized local FastEmbed model (384 dimensions)")
        except ImportError as e:
            logger.error("FastEmbed not installed. Install with: pip install fastembed")
            raise
        except Exception as e:
            logger.exception(f"Failed to initialize FastEmbed: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings using FastEmbed."""
        if not self.model:
            raise RuntimeError("FastEmbed model not initialized")

        try:
            embeddings = list(self.model.embed(texts))
            return [list(emb) for emb in embeddings]
        except Exception as e:
            logger.error(f"Failed to generate local embeddings: {e}")
            raise

    def get_dimension(self) -> int:
        """Get embedding dimension (384 for FastEmbed)."""
        return self.dimension

    def get_collection_suffix(self) -> str:
        """Get collection suffix for local embeddings."""
        return "local_384d"


class CloudEmbeddingProvider(EmbeddingProvider):
    """Cloud embedding provider using Voyage AI."""

    def __init__(self, api_key: str):
        # Don't store API key directly, use it only for client initialization
        self.client = None
        self.dimension = 1024
        self._initialize_client(api_key)

    def _initialize_client(self, api_key: str):
        """Initialize the Voyage AI client."""
        try:
            import voyageai
            self.client = voyageai.Client(api_key=api_key)
            logger.info("Initialized Voyage AI client (1024 dimensions)")
        except ImportError as e:
            logger.error("voyageai not installed. Install with: pip install voyageai")
            raise
        except Exception as e:
            logger.exception(f"Failed to initialize Voyage AI: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings using Voyage AI."""
        if not self.client:
            raise RuntimeError("Voyage AI client not initialized")

        try:
            result = self.client.embed(texts, model="voyage-2")
            return result.embeddings
        except Exception as e:
            logger.error(f"Failed to generate cloud embeddings: {e}")
            raise

    def get_dimension(self) -> int:
        """Get embedding dimension (1024 for Voyage)."""
        return self.dimension

    def get_collection_suffix(self) -> str:
        """Get collection suffix for cloud embeddings."""
        return "cloud_1024d"


class EmbeddingService:
    """
    Service to manage embedding generation with automatic provider selection.
    Reduces complexity by encapsulating embedding logic.
    """

    def __init__(self, prefer_local: bool = True, voyage_api_key: Optional[str] = None):
        """
        Initialize embedding service.

        Args:
            prefer_local: Whether to prefer local embeddings when available
            voyage_api_key: API key for Voyage AI (if using cloud embeddings)
        """
        self.prefer_local = prefer_local
        self.voyage_api_key = voyage_api_key
        self.provider = None
        self._initialize_provider()

    def _initialize_provider(self):
        """Initialize the appropriate embedding provider."""
        if self.prefer_local or not self.voyage_api_key:
            try:
                self.provider = LocalEmbeddingProvider()
                logger.info("Using local embedding provider (FastEmbed)")
            except Exception as e:
                logger.warning(f"Failed to initialize local provider: {e}")
                if self.voyage_api_key:
                    self._fallback_to_cloud()
                else:
                    raise RuntimeError("No embedding provider available")
        else:
            try:
                self.provider = CloudEmbeddingProvider(self.voyage_api_key)
                logger.info("Using cloud embedding provider (Voyage AI)")
            except Exception as e:
                logger.warning(f"Failed to initialize cloud provider: {e}")
                self._fallback_to_local()

    def _fallback_to_cloud(self):
        """Fallback to cloud provider."""
        if not self.voyage_api_key:
            raise RuntimeError("No Voyage API key available for cloud fallback")
        try:
            self.provider = CloudEmbeddingProvider(self.voyage_api_key)
            logger.info("Fallback to cloud embedding provider")
            # Clear the key after use
            self.voyage_api_key = None
        except Exception as e:
            raise RuntimeError(f"Failed to initialize any embedding provider: {e}")

    def _fallback_to_local(self):
        """Fallback to local provider."""
        try:
            self.provider = LocalEmbeddingProvider()
            logger.info("Fallback to local embedding provider")
        except Exception as e:
            raise RuntimeError(f"Failed to initialize any embedding provider: {e}")

    def generate_embeddings(self, texts: List[str]) -> List[List[float]]:
        """
        Generate embeddings for texts using the configured provider.

        Args:
            texts: List of texts to embed

        Returns:
            List of embedding vectors
        """
        if not self.provider:
            raise RuntimeError("No embedding provider initialized")

        # Filter out empty texts
        non_empty_texts = [t for t in texts if t and t.strip()]
        if not non_empty_texts:
            return []

        return self.provider.generate_embeddings(non_empty_texts)

    def get_dimension(self) -> int:
        """Get the dimension of embeddings."""
        if not self.provider:
            raise RuntimeError("No embedding provider initialized")
        return self.provider.get_dimension()

    def get_collection_suffix(self) -> str:
        """Get the collection suffix for current provider."""
        if not self.provider:
            raise RuntimeError("No embedding provider initialized")
        return self.provider.get_collection_suffix()

    def get_provider_name(self) -> str:
        """Get the name of the current provider."""
        if isinstance(self.provider, LocalEmbeddingProvider):
            return "FastEmbed (Local)"
        elif isinstance(self.provider, CloudEmbeddingProvider):
            return "Voyage AI (Cloud)"
        else:
            return "Unknown"


# Factory function for convenience
def create_embedding_service(
    prefer_local: Optional[bool] = None,
    voyage_api_key: Optional[str] = None
) -> EmbeddingService:
    """
    Create an embedding service with environment variable defaults.

    Args:
        prefer_local: Override for PREFER_LOCAL_EMBEDDINGS env var
        voyage_api_key: Override for VOYAGE_KEY env var

    Returns:
        Configured EmbeddingService instance
    """
    if prefer_local is None:
        prefer_local = os.getenv("PREFER_LOCAL_EMBEDDINGS", "true").lower() == "true"

    if voyage_api_key is None:
        voyage_api_key = os.getenv("VOYAGE_KEY")

    return EmbeddingService(prefer_local, voyage_api_key)
```
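For orientation, here is a minimal usage sketch of the new module. It is not part of the diff, and the surrounding setup (the module on the import path, fastembed installed or VOYAGE_KEY set) is assumed:

```python
# Hypothetical usage sketch, not part of the published package diff.
from embedding_service import create_embedding_service

# With no arguments the factory reads PREFER_LOCAL_EMBEDDINGS (default
# "true") and VOYAGE_KEY from the environment, then picks a provider,
# falling back to the other one if initialization fails.
service = create_embedding_service()

vectors = service.generate_embeddings(["first text", "second text", "   "])
# Whitespace-only entries are filtered out, so len(vectors) == 2 here.

print(service.get_provider_name())      # "FastEmbed (Local)" or "Voyage AI (Cloud)"
print(service.get_dimension())          # 384 locally, 1024 via Voyage AI
print(service.get_collection_suffix())  # "local_384d" or "cloud_1024d"
```

One caveat worth noting: because `generate_embeddings` silently drops empty or whitespace-only texts, the returned list can be shorter than the input, so callers that need index alignment must pre-filter their texts themselves.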
package/scripts/import_strategies.py (new file):

@@ -0,0 +1,344 @@

```python
"""
Import strategies using Strategy pattern to reduce complexity of stream_import_file.
"""

import json
import gc
import os
import logging
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, Any, List, Optional, Generator
from datetime import datetime

from message_processors import MessageProcessorFactory

logger = logging.getLogger(__name__)


class ImportStrategy(ABC):
    """Abstract base class for import strategies."""

    @abstractmethod
    def import_file(self, jsonl_file: Path, collection_name: str, project_path: Path) -> int:
        """Import a JSONL file using the specific strategy."""
        pass


class ChunkBuffer:
    """Manages buffering and processing of message chunks."""

    def __init__(self, max_size: int = 50):
        self.buffer: List[Dict[str, Any]] = []
        self.max_size = max_size
        self.current_index = 0
        # Add memory limit for message content
        self.max_content_length = int(os.getenv('MAX_MESSAGE_CONTENT_LENGTH', '5000'))

    def add(self, message: Dict[str, Any]) -> bool:
        """Add a message to the buffer. Returns True if buffer is full."""
        # Truncate long content to prevent memory issues
        if 'content' in message and len(message['content']) > self.max_content_length:
            message = message.copy()
            message['content'] = message['content'][:self.max_content_length] + '...[truncated]'
        self.buffer.append(message)
        return len(self.buffer) >= self.max_size

    def get_and_clear(self) -> List[Dict[str, Any]]:
        """Get buffer contents and clear it."""
        contents = self.buffer.copy()
        self.buffer.clear()
        return contents

    def has_content(self) -> bool:
        """Check if buffer has any content."""
        return len(self.buffer) > 0


class MessageStreamReader:
    """Handles reading and parsing messages from JSONL files."""

    def __init__(self):
        self.processor_factory = MessageProcessorFactory()
        self.current_message_index = 0

    def read_messages(self, file_path: Path) -> Generator[Dict[str, Any], None, None]:
        """Generator that yields processed messages from a JSONL file."""
        self.current_message_index = 0

        with open(file_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue

                message = self._parse_line(line, line_num)
                if message:
                    yield message

    def _parse_line(self, line: str, line_num: int) -> Optional[Dict[str, Any]]:
        """Parse a single line and extract message if present."""
        try:
            data = json.loads(line)

            # Skip summary lines
            if data.get('type') == 'summary':
                return None

            # Handle message entries
            if 'message' in data and data['message']:
                return self._process_message(data['message'])

            # Handle top-level tool entries
            entry_type = data.get('type')
            if entry_type in ('tool_result', 'tool_use'):
                return self._process_tool_entry(data, entry_type)

        except json.JSONDecodeError:
            logger.debug(f"Skipping invalid JSON at line {line_num}")
        except (KeyError, TypeError, ValueError) as e:
            logger.debug(f"Error processing data at line {line_num}: {e}")
        except Exception as e:
            logger.warning(f"Unexpected error at line {line_num}: {e}")

        return None

    def _process_message(self, message: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Process a message entry."""
        role = message.get('role')
        content = message.get('content')

        if not role or not content:
            return None

        # Process content
        text_content = self._extract_text_content(content)

        if not text_content:
            return None

        # Track message index for user/assistant messages
        if role in ['user', 'assistant']:
            message_idx = self.current_message_index
            self.current_message_index += 1
        else:
            message_idx = 0

        return {
            'role': role,
            'content': text_content,
            'message_index': message_idx
        }

    def _extract_text_content(self, content: Any) -> str:
        """Extract text content from various content formats."""
        if isinstance(content, str):
            return content

        if isinstance(content, list):
            text_parts = []
            for item in content:
                if isinstance(item, dict):
                    text = self._process_content_item(item)
                    if text:
                        text_parts.append(text)
                elif isinstance(item, str):
                    text_parts.append(item)
            return '\n'.join(text_parts)

        return ''

    def _process_content_item(self, item: Dict[str, Any]) -> Optional[str]:
        """Process a single content item."""
        item_type = item.get('type', '')

        if item_type == 'text':
            return item.get('text', '')
        elif item_type == 'thinking':
            thinking_content = item.get('thinking', '')
            return f"[Thinking] {thinking_content[:1000]}" if thinking_content else None
        elif item_type == 'tool_use':
            tool_name = item.get('name', 'unknown')
            tool_input = str(item.get('input', ''))[:500]
            return f"[Tool: {tool_name}] {tool_input}"
        elif item_type == 'tool_result':
            result_content = str(item.get('content', ''))[:1000]
            return f"[Result] {result_content}"

        return None

    def _process_tool_entry(self, data: Dict[str, Any], entry_type: str) -> Optional[Dict[str, Any]]:
        """Process a top-level tool entry."""
        text_parts = []

        if entry_type == 'tool_use':
            tool_name = data.get('name', 'unknown')
            tool_input = str(data.get('input', ''))[:500]
            text_parts.append(f"[Tool: {tool_name}] {tool_input}")

        elif entry_type == 'tool_result':
            result_content = self._extract_tool_result(data)
            text_parts.append(f"[Result] {result_content[:1000]}")

        content = "\n".join(text_parts)
        if not content:
            return None

        message_idx = self.current_message_index
        self.current_message_index += 1

        return {
            'role': entry_type,
            'content': content,
            'message_index': message_idx
        }

    def _extract_tool_result(self, data: Dict[str, Any]) -> str:
        """Extract result content from tool result data."""
        result_content = data.get('content')

        if isinstance(result_content, list):
            flat = []
            for item in result_content:
                if isinstance(item, dict) and item.get('type') == 'text':
                    flat.append(item.get('text', ''))
                elif isinstance(item, str):
                    flat.append(item)
            result_content = "\n".join(flat)

        if not result_content:
            result_content = data.get('result', '')

        return str(result_content)


class StreamImportStrategy(ImportStrategy):
    """
    Strategy for streaming import with chunked processing.
    This is the main refactored implementation.
    """

    def __init__(self, client, process_chunk_fn, state_manager, max_chunk_size: int = 50,
                 cleanup_tolerance: int = None):
        self.client = client
        self.process_chunk_fn = process_chunk_fn
        self.state_manager = state_manager
        self.max_chunk_size = max_chunk_size
        # Make cleanup tolerance configurable via environment variable
        self.cleanup_tolerance = cleanup_tolerance or int(os.getenv('CLEANUP_TOLERANCE', '5'))
        self.stream_reader = MessageStreamReader()

    def import_file(self, jsonl_file: Path, collection_name: str, project_path: Path) -> int:
        """Import a JSONL file using streaming strategy."""
        logger.info(f"Streaming import of {jsonl_file.name}")

        conversation_id = jsonl_file.stem

        # Extract metadata first (lightweight)
        from metadata_extractor import MetadataExtractor
        extractor = MetadataExtractor()
        metadata, created_at, total_messages = extractor.extract_metadata_from_file(str(jsonl_file))

        # Initialize chunk processing
        chunk_buffer = ChunkBuffer(self.max_chunk_size)
        chunk_index = 0
        total_chunks = 0

        try:
            # Stream and process messages
            for message in self.stream_reader.read_messages(jsonl_file):
                if chunk_buffer.add(message):
                    # Buffer is full, process chunk
                    chunks = self._process_buffer(
                        chunk_buffer, chunk_index, conversation_id,
                        created_at, metadata, collection_name, project_path, total_messages
                    )
                    total_chunks += chunks
                    chunk_index += 1

                    # Force garbage collection after each chunk
                    gc.collect()

                    # Log progress
                    if chunk_index % 10 == 0:
                        logger.info(f"Processed {chunk_index} chunks from {jsonl_file.name}")

            # Process remaining messages
            if chunk_buffer.has_content():
                chunks = self._process_buffer(
                    chunk_buffer, chunk_index, conversation_id,
                    created_at, metadata, collection_name, project_path, total_messages
                )
                total_chunks += chunks

            # Clean up old points after successful import
            if total_chunks > 0:
                self._cleanup_old_points(conversation_id, collection_name, total_chunks)

            logger.info(f"Imported {total_chunks} chunks from {jsonl_file.name}")
            return total_chunks

        except (IOError, OSError) as e:
            logger.error(f"Failed to read file {jsonl_file}: {e}")
            self._mark_failed(jsonl_file, str(e))
            return 0
        except json.JSONDecodeError as e:
            logger.error(f"Invalid JSON in {jsonl_file}: {e}")
            self._mark_failed(jsonl_file, str(e))
            return 0
        except Exception as e:
            logger.error(f"Unexpected error importing {jsonl_file}: {e}")
            self._mark_failed(jsonl_file, str(e))
            return 0

    def _process_buffer(self, chunk_buffer: ChunkBuffer, chunk_index: int,
                        conversation_id: str, created_at: str, metadata: Dict[str, Any],
                        collection_name: str, project_path: Path, total_messages: int) -> int:
        """Process a buffer of messages and return number of chunks created."""
        messages = chunk_buffer.get_and_clear()
        return self.process_chunk_fn(
            messages, chunk_index, conversation_id,
            created_at, metadata, collection_name, project_path, total_messages
        )

    def _cleanup_old_points(self, conversation_id: str, collection_name: str, total_chunks: int):
        """Clean up old points after successful import."""
        try:
            from qdrant_client.models import Filter, FieldCondition, MatchValue

            # Count old points using count API
            old_count_filter = Filter(
                must=[FieldCondition(key="conversation_id", match=MatchValue(value=conversation_id))]
            )

            # Use count API to get actual count
            old_count = self.client.count(
                collection_name=collection_name,
                count_filter=old_count_filter,
                exact=True
            ).count

            if old_count > total_chunks + self.cleanup_tolerance:
                # Use filter parameter for delete
                self.client.delete(
                    collection_name=collection_name,
                    points_selector=Filter(
                        must=[FieldCondition(key="conversation_id", match=MatchValue(value=conversation_id))]
                    ),
                    wait=True
                )
                logger.info(f"Deleted {old_count - total_chunks} old points for conversation {conversation_id}")

        except ImportError as e:
            logger.debug(f"Qdrant client import error: {e}")
        except Exception as e:
            logger.warning(f"Could not clean up old points for {conversation_id}: {e}")

    def _mark_failed(self, jsonl_file: Path, error: str):
        """Mark a file as failed in state manager."""
        try:
            self.state_manager.mark_file_failed(str(jsonl_file), error)
        except AttributeError as e:
            logger.debug(f"State manager method not available: {e}")
        except Exception as e:
            logger.warning(f"Unexpected error marking file as failed: {e}")
```
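Likewise, a hedged wiring sketch for the streaming importer. None of this appears in the diff; the Qdrant URL, the `process_chunk` stub, and the no-op state manager are illustrative assumptions based only on the signatures above:

```python
# Hypothetical wiring sketch, not part of the published package diff.
from pathlib import Path

from qdrant_client import QdrantClient

from import_strategies import StreamImportStrategy


def process_chunk(messages, chunk_index, conversation_id, created_at,
                  metadata, collection_name, project_path, total_messages) -> int:
    # A real implementation would embed `messages` and upsert points into
    # Qdrant; it must return the number of chunks written, which the
    # strategy sums into its running total.
    return 1


class NoopStateManager:
    def mark_file_failed(self, path: str, error: str) -> None:
        pass  # a real manager would persist the failure for retry logic


strategy = StreamImportStrategy(
    client=QdrantClient(url="http://localhost:6333"),
    process_chunk_fn=process_chunk,
    state_manager=NoopStateManager(),
    max_chunk_size=50,  # messages buffered per flushed chunk
)
imported_chunks = strategy.import_file(
    Path.home() / ".claude" / "projects" / "example" / "session.jsonl",
    collection_name="conv_example_local_384d",
    project_path=Path.home() / "projects" / "example",
)
```

A small caveat in the constructor: `cleanup_tolerance or int(os.getenv('CLEANUP_TOLERANCE', '5'))` treats an explicit `0` as falsy, so passing `cleanup_tolerance=0` silently falls back to the environment default.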