claude-self-reflect 2.3.6 → 2.3.7

This diff shows the contents of publicly available package versions as they have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "claude-self-reflect",
-  "version": "2.3.6",
+  "version": "2.3.7",
   "description": "Give Claude perfect memory of all your conversations - Installation wizard for Python MCP server",
   "keywords": [
     "claude",
@@ -1,311 +0,0 @@
-#!/usr/bin/env python3
-"""
-Import Claude conversation logs with project isolation support.
-Each project gets its own collection for complete isolation.
-"""
-
-import json
-import os
-import glob
-import hashlib
-from datetime import datetime, timedelta
-from typing import List, Dict, Any, Set
-import logging
-from qdrant_client import QdrantClient
-from qdrant_client.models import (
-    VectorParams, Distance, PointStruct,
-    Filter, FieldCondition, MatchValue
-)
-from sentence_transformers import SentenceTransformer
-
-# Configuration
-QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
-LOGS_DIR = os.getenv("LOGS_DIR", "/logs")
-STATE_FILE = os.getenv("STATE_FILE", "/config/imported-files.json")
-EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
-BATCH_SIZE = int(os.getenv("BATCH_SIZE", "100"))
-ISOLATION_MODE = os.getenv("ISOLATION_MODE", "isolated") # isolated, shared, hybrid
-
-# Set up logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-
-class ProjectAwareImporter:
-    def __init__(self):
-        """Initialize the importer with Qdrant client and embedding model."""
-        self.client = QdrantClient(url=QDRANT_URL)
-        self.encoder = SentenceTransformer(EMBEDDING_MODEL)
-        self.imported_files = self.load_state()
-        self.project_collections: Set[str] = set()
-
-    def load_state(self) -> Dict[str, Set[str]]:
-        """Load the set of already imported files per project."""
-        if os.path.exists(STATE_FILE):
-            try:
-                with open(STATE_FILE, 'r') as f:
-                    data = json.load(f)
-                    # Convert to per-project tracking
-                    if isinstance(data.get('files'), list):
-                        # Legacy format - convert to new format
-                        return {'_legacy': set(data['files'])}
-                    else:
-                        # New format with per-project tracking
-                        return {k: set(v) for k, v in data.get('projects', {}).items()}
-            except Exception as e:
-                logger.error(f"Failed to load state: {e}")
-        return {}
-
-    def save_state(self):
-        """Save the set of imported files per project."""
-        os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
-        with open(STATE_FILE, 'w') as f:
-            json.dump({
-                'projects': {k: list(v) for k, v in self.imported_files.items()},
-                'last_updated': datetime.now().isoformat(),
-                'mode': ISOLATION_MODE
-            }, f, indent=2)
-
-    def get_collection_name(self, project_name: str) -> str:
-        """Get collection name based on isolation mode."""
-        if ISOLATION_MODE == "isolated":
-            # Create project-specific collection name
-            project_hash = hashlib.md5(project_name.encode()).hexdigest()[:8]
-            return f"conv_{project_hash}"
-        else:
-            # Shared collection mode
-            return "conversations"
-
-    def setup_collection(self, project_name: str):
-        """Create or update the Qdrant collection for a project."""
-        collection_name = self.get_collection_name(project_name)
-
-        # Skip if already set up in this session
-        if collection_name in self.project_collections:
-            return collection_name
-
-        collections = self.client.get_collections().collections
-        exists = any(c.name == collection_name for c in collections)
-
-        if not exists:
-            logger.info(f"Creating collection: {collection_name} for project: {project_name}")
-            self.client.create_collection(
-                collection_name=collection_name,
-                vectors_config=VectorParams(
-                    size=384, # all-MiniLM-L6-v2 dimension
-                    distance=Distance.COSINE
-                )
-            )
-        else:
-            logger.info(f"Collection {collection_name} already exists for project: {project_name}")
-
-        self.project_collections.add(collection_name)
-        return collection_name
-
-    def extract_project_name(self, file_path: str) -> str:
-        """Extract project name from file path."""
-        # Expected path: /logs/<project-name>/<conversation-id>.jsonl
-        parts = file_path.split('/')
-        if len(parts) >= 3 and parts[-2] != 'logs':
-            return parts[-2]
-        return 'unknown'
-
-    def process_jsonl_file(self, file_path: str) -> List[Dict[str, Any]]:
-        """Extract messages from a JSONL file."""
-        messages = []
-
-        try:
-            with open(file_path, 'r') as f:
-                for line_num, line in enumerate(f, 1):
-                    try:
-                        data = json.loads(line.strip())
-
-                        # Extract message if present
-                        if 'message' in data and data['message']:
-                            msg = data['message']
-                            if 'role' in msg and 'content' in msg:
-                                # Handle content that might be an object
-                                content = msg['content']
-                                if isinstance(content, dict):
-                                    content = content.get('text', json.dumps(content))
-
-                                # Create message document
-                                messages.append({
-                                    'role': msg['role'],
-                                    'content': content,
-                                    'file_path': file_path,
-                                    'line_number': line_num,
-                                    'timestamp': data.get('timestamp', datetime.now().isoformat())
-                                })
-                    except json.JSONDecodeError:
-                        logger.warning(f"Failed to parse line {line_num} in {file_path}")
-                    except Exception as e:
-                        logger.error(f"Error processing line {line_num} in {file_path}: {e}")
-
-        except Exception as e:
-            logger.error(f"Failed to read file {file_path}: {e}")
-
-        return messages
-
-    def create_conversation_chunks(self, messages: List[Dict[str, Any]], chunk_size: int = 5) -> List[Dict[str, Any]]:
-        """Group messages into conversation chunks for better context."""
-        chunks = []
-
-        for i in range(0, len(messages), chunk_size):
-            chunk_messages = messages[i:i + chunk_size]
-
-            # Create a conversation summary
-            conversation_text = "\n\n".join([
-                f"{msg['role'].upper()}: {msg['content'][:500]}..."
-                if len(msg['content']) > 500 else f"{msg['role'].upper()}: {msg['content']}"
-                for msg in chunk_messages
-            ])
-
-            # Extract metadata
-            project_id = self.extract_project_name(chunk_messages[0]['file_path'])
-            conversation_id = os.path.basename(chunk_messages[0]['file_path']).replace('.jsonl', '')
-
-            chunks.append({
-                'id': hashlib.md5(f"{chunk_messages[0]['file_path']}_{i}".encode()).hexdigest(),
-                'text': conversation_text,
-                'metadata': {
-                    'project_id': project_id,
-                    'project_name': project_id, # Add both for compatibility
-                    'conversation_id': conversation_id,
-                    'chunk_index': i // chunk_size,
-                    'message_count': len(chunk_messages),
-                    'start_role': chunk_messages[0]['role'],
-                    'timestamp': chunk_messages[0]['timestamp'],
-                    'file_path': chunk_messages[0]['file_path']
-                }
-            })
-
-        return chunks
-
-    def import_to_qdrant(self, chunks: List[Dict[str, Any]], collection_name: str):
-        """Import conversation chunks to a specific Qdrant collection."""
-        if not chunks:
-            return
-
-        # Generate embeddings
-        texts = [chunk['text'] for chunk in chunks]
-        embeddings = self.encoder.encode(texts, show_progress_bar=True)
-
-        # Create points for Qdrant
-        points = []
-        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
-            points.append(
-                PointStruct(
-                    id=chunk['id'],
-                    vector=embedding.tolist(),
-                    payload={
-                        'text': chunk['text'],
-                        **chunk['metadata']
-                    }
-                )
-            )
-
-        # Upload to Qdrant in batches
-        for i in range(0, len(points), BATCH_SIZE):
-            batch = points[i:i + BATCH_SIZE]
-            self.client.upsert(
-                collection_name=collection_name,
-                points=batch
-            )
-            logger.info(f"Uploaded batch of {len(batch)} points to {collection_name}")
-
-    def find_recent_files(self, days: int = 30) -> List[str]:
-        """Find JSONL files modified in the last N days."""
-        cutoff_time = datetime.now() - timedelta(days=days)
-        pattern = os.path.join(LOGS_DIR, "**", "*.jsonl")
-
-        recent_files = []
-        for file_path in glob.glob(pattern, recursive=True):
-            try:
-                mtime = os.path.getmtime(file_path)
-                if datetime.fromtimestamp(mtime) >= cutoff_time:
-                    recent_files.append(file_path)
-            except Exception as e:
-                logger.error(f"Error checking file {file_path}: {e}")
-
-        return recent_files
-
-    def run(self):
-        """Main import process with project isolation."""
-        logger.info(f"Starting conversation import to Qdrant (mode: {ISOLATION_MODE})")
-
-        # Find files to import
-        all_files = self.find_recent_files()
-        logger.info(f"Found {len(all_files)} total files")
-
-        # Group files by project
-        files_by_project: Dict[str, List[str]] = {}
-        for file_path in all_files:
-            project_name = self.extract_project_name(file_path)
-            if project_name not in files_by_project:
-                files_by_project[project_name] = []
-            files_by_project[project_name].append(file_path)
-
-        logger.info(f"Found {len(files_by_project)} projects to process")
-
-        total_chunks = 0
-        for project_name, project_files in files_by_project.items():
-            logger.info(f"\nProcessing project: {project_name}")
-
-            # Get imported files for this project
-            project_imported = self.imported_files.get(project_name, set())
-            new_files = [f for f in project_files if f not in project_imported]
-
-            if not new_files:
-                logger.info(f"No new files for project {project_name}")
-                continue
-
-            logger.info(f"Found {len(new_files)} new files for project {project_name}")
-
-            # Setup collection for this project
-            collection_name = self.setup_collection(project_name)
-
-            project_chunks = 0
-            for file_path in new_files:
-                logger.info(f"Processing: {file_path}")
-
-                # Extract messages
-                messages = self.process_jsonl_file(file_path)
-                if not messages:
-                    logger.warning(f"No messages found in {file_path}")
-                    continue
-
-                # Create conversation chunks
-                chunks = self.create_conversation_chunks(messages)
-
-                # Import to project-specific collection
-                self.import_to_qdrant(chunks, collection_name)
-
-                # Mark file as imported for this project
-                if project_name not in self.imported_files:
-                    self.imported_files[project_name] = set()
-                self.imported_files[project_name].add(file_path)
-                self.save_state()
-
-                project_chunks += len(chunks)
-                logger.info(f"Imported {len(chunks)} chunks from {file_path}")
-
-            total_chunks += project_chunks
-            logger.info(f"Project {project_name} complete: {project_chunks} chunks imported")
-
-        logger.info(f"\nImport complete: {total_chunks} total chunks imported")
-
-        # Show collection summary
-        logger.info("\nCollection summary:")
-        collections = self.client.get_collections().collections
-        for collection in collections:
-            if collection.name.startswith('conv_') or collection.name == 'conversations':
-                count = self.client.get_collection(collection.name).points_count
-                logger.info(f" {collection.name}: {count} points")
-
-def main():
-    """Entry point for the importer."""
-    importer = ProjectAwareImporter()
-    importer.run()
-
-if __name__ == "__main__":
-    main()
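
For context on the isolation scheme in the removed script above: each project maps deterministically to its own Qdrant collection, named `conv_` plus the first 8 hex characters of the MD5 of the project name, holding 384-dimensional cosine vectors from all-MiniLM-L6-v2. The following is a minimal, hypothetical sketch of searching such a collection, not code from the package; it assumes a local Qdrant at http://localhost:6333, and the project name and query text are placeholders.

```python
# Illustrative only; mirrors the naming scheme used by ProjectAwareImporter
# in "isolated" mode. Assumes Qdrant is reachable on localhost:6333 and the
# collection was built with the same all-MiniLM-L6-v2 embeddings.
import hashlib

from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer


def collection_for(project_name: str) -> str:
    # conv_ + first 8 hex chars of MD5(project name), as in get_collection_name()
    return f"conv_{hashlib.md5(project_name.encode()).hexdigest()[:8]}"


client = QdrantClient(url="http://localhost:6333")
encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# "my-project" and the query string are hypothetical placeholders.
query_vector = encoder.encode("how did we set up the importer?").tolist()
hits = client.search(
    collection_name=collection_for("my-project"),
    query_vector=query_vector,
    limit=5,
)
for hit in hits:
    print(hit.score, hit.payload.get("conversation_id"))
```
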
@@ -1,368 +0,0 @@
-#!/usr/bin/env python3
-"""
-Streaming import for large Claude conversation logs.
-Processes files in chunks without loading entire file into memory.
-"""
-
-import json
-import os
-import sys
-import time
-import hashlib
-from datetime import datetime
-from typing import List, Dict, Any, Generator
-import logging
-import backoff
-import requests
-from qdrant_client import QdrantClient
-from qdrant_client.models import VectorParams, Distance, PointStruct
-from pathlib import Path
-
-# Set up logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-
-# Configuration
-QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
-STATE_FILE = os.getenv("STATE_FILE", "./config-isolated/imported-files.json")
-LOGS_DIR = os.getenv("LOGS_DIR", os.path.expanduser("~/.claude/projects"))
-VOYAGE_API_KEY = os.getenv("VOYAGE_KEY_2") or os.getenv("VOYAGE_KEY")
-BATCH_SIZE = int(os.getenv("BATCH_SIZE", "50"))
-CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "10"))
-STREAMING_BUFFER_SIZE = 100 # Process every 100 messages
-RATE_LIMIT_DELAY = 0.1
-EMBEDDING_MODEL = "voyage-3-large"
-EMBEDDING_DIMENSIONS = 1024
-VOYAGE_API_URL = "https://api.voyageai.com/v1/embeddings"
-
-class StreamingVoyageImporter:
-    def __init__(self):
-        """Initialize the streaming importer."""
-        if not VOYAGE_API_KEY:
-            raise ValueError("VOYAGE_KEY environment variable not set")
-
-        self.qdrant_client = QdrantClient(url=QDRANT_URL)
-        self.state = self._load_state()
-        self.total_imported = 0
-        self.total_errors = 0
-
-        logger.info(f"Connected to Qdrant at {QDRANT_URL}")
-
-    def _load_state(self) -> Dict[str, Any]:
-        """Load import state from file."""
-        default_state = {
-            "projects": {},
-            "last_updated": None,
-            "mode": "isolated",
-            "total_imported": 0
-        }
-
-        if os.path.exists(STATE_FILE):
-            try:
-                with open(STATE_FILE, 'r') as f:
-                    return json.load(f)
-            except Exception as e:
-                logger.error(f"Failed to load state: {e}")
-
-        return default_state
-
-    def _save_state(self):
-        """Save import state to file."""
-        self.state["last_updated"] = datetime.now().isoformat()
-        self.state["total_imported"] = self.total_imported
-
-        os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
-        with open(STATE_FILE, 'w') as f:
-            json.dump(self.state, f, indent=2)
-
-    @backoff.on_exception(
-        backoff.expo,
-        Exception,
-        max_tries=5,
-        on_backoff=lambda details: logger.warning(f"Backing off {details['wait']}s after {details['tries']} tries")
-    )
-    def _generate_embeddings(self, texts: List[str]) -> List[List[float]]:
-        """Generate embeddings using Voyage AI API with batching."""
-        headers = {
-            "Authorization": f"Bearer {VOYAGE_API_KEY}",
-            "Content-Type": "application/json"
-        }
-
-        payload = {
-            "input": texts,
-            "model": EMBEDDING_MODEL,
-            "input_type": "document"
-        }
-
-        try:
-            response = requests.post(VOYAGE_API_URL, headers=headers, json=payload, timeout=30)
-            response.raise_for_status()
-
-            data = response.json()
-            embeddings = [item["embedding"] for item in data["data"]]
-
-            # Add small delay to respect rate limits
-            time.sleep(RATE_LIMIT_DELAY)
-
-            return embeddings
-
-        except requests.Timeout:
-            logger.error("Voyage API request timed out after 30 seconds")
-            raise
-        except Exception as e:
-            logger.error(f"Voyage API error: {e}")
-            raise
-
-    def stream_jsonl_messages(self, file_path: str, buffer_size: int = STREAMING_BUFFER_SIZE) -> Generator[List[Dict[str, Any]], None, None]:
-        """Stream messages from JSONL file in buffers without loading entire file."""
-        buffer = []
-        line_count = 0
-        total_lines = 0
-        skipped_lines = 0
-
-        # Extract expected session ID from filename
-        expected_session_id = os.path.splitext(os.path.basename(file_path))[0]
-        logger.info(f"Starting to stream file: {os.path.basename(file_path)} (expecting session: {expected_session_id})")
-
-        try:
-            with open(file_path, 'r', encoding='utf-8') as f:
-                for line_num, line in enumerate(f, 1):
-                    total_lines = line_num
-                    line = line.strip()
-                    if not line:
-                        continue
-
-                    try:
-                        data = json.loads(line)
-
-                        # Check session ID matches expected
-                        session_id = data.get('sessionId', '')
-                        if session_id != expected_session_id:
-                            skipped_lines += 1
-                            logger.debug(f"Skipping line {line_num}: different session ID ({session_id})")
-                            continue
-
-                        # Extract message if present
-                        if 'message' in data and data['message']:
-                            msg = data['message']
-                            if msg.get('role') and msg.get('content'):
-                                content = msg['content']
-                                # Handle content array (common in Claude messages)
-                                if isinstance(content, list) and len(content) > 0:
-                                    # Extract text from first content item
-                                    content_item = content[0]
-                                    if isinstance(content_item, dict):
-                                        content = content_item.get('text', str(content_item))
-                                elif isinstance(content, dict):
-                                    content = content.get('text', json.dumps(content))
-
-                                buffer.append({
-                                    'role': msg['role'],
-                                    'content': content,
-                                    'file_path': file_path,
-                                    'line_number': line_num,
-                                    'timestamp': data.get('timestamp', datetime.now().isoformat())
-                                })
-                                line_count += 1
-
-                                # Yield buffer when it reaches the specified size
-                                if len(buffer) >= buffer_size:
-                                    logger.info(f"Buffer full, yielding {len(buffer)} messages (total so far: {line_count})")
-                                    yield buffer
-                                    buffer = []
-
-                    except json.JSONDecodeError:
-                        logger.debug(f"Skipping invalid JSON at line {line_num}")
-                        skipped_lines += 1
-                    except Exception as e:
-                        logger.error(f"Error processing line {line_num}: {e}")
-                        skipped_lines += 1
-
-            # Yield any remaining messages
-            if buffer:
-                logger.info(f"Yielding final buffer with {len(buffer)} messages")
-                yield buffer
-
-            logger.info(f"Completed streaming file: processed {total_lines} lines, {line_count} messages, {skipped_lines} skipped")
-
-        except Exception as e:
-            logger.error(f"Failed to read file {file_path}: {e}")
-
-    def process_message_buffer(self, messages: List[Dict[str, Any]], project_name: str, collection_name: str, conversation_id: str):
-        """Process a buffer of messages into chunks and import them."""
-        chunks = []
-
-        # Create chunks from message buffer
-        for i in range(0, len(messages), CHUNK_SIZE):
-            chunk_messages = messages[i:i + CHUNK_SIZE]
-
-            # Create conversation text
-            conversation_text = "\n\n".join([
-                f"{msg['role'].upper()}: {msg['content'][:500]}"
-                for msg in chunk_messages
-            ])
-
-            # Add metadata
-            timestamps = [msg['timestamp'] for msg in chunk_messages]
-            first_timestamp = min(timestamps) if timestamps else datetime.now().isoformat()
-
-            chunk_id = hashlib.md5(
-                f"{conversation_id}_{first_timestamp}_{len(chunks)}".encode()
-            ).hexdigest()
-
-            chunks.append({
-                'id': chunk_id,
-                'text': conversation_text,
-                'metadata': {
-                    'project': project_name,
-                    'conversation_id': conversation_id,
-                    'timestamp': first_timestamp,
-                    'chunk_index': len(chunks),
-                    'message_count': len(chunk_messages),
-                    'roles': list(set(msg['role'] for msg in chunk_messages))
-                }
-            })
-
-        # Import chunks if we have any
-        if chunks:
-            self._import_chunks_to_qdrant(chunks, collection_name)
-
-    def _import_chunks_to_qdrant(self, chunks: List[Dict[str, Any]], collection_name: str):
-        """Import conversation chunks to Qdrant."""
-        if not chunks:
-            return
-
-        # Process in batches
-        for i in range(0, len(chunks), BATCH_SIZE):
-            batch = chunks[i:i + BATCH_SIZE]
-            texts = [chunk['text'] for chunk in batch]
-
-            try:
-                # Generate embeddings
-                embeddings = self._generate_embeddings(texts)
-
-                # Create points
-                points = []
-                for chunk, embedding in zip(batch, embeddings):
-                    # Include both text and metadata in payload
-                    payload = chunk['metadata'].copy()
-                    payload['text'] = chunk['text']
-
-                    points.append(PointStruct(
-                        id=chunk['id'],
-                        vector=embedding,
-                        payload=payload
-                    ))
-
-                # Upsert to Qdrant
-                self.qdrant_client.upsert(
-                    collection_name=collection_name,
-                    points=points,
-                    wait=True
-                )
-
-                self.total_imported += len(points)
-                logger.info(f"Imported batch of {len(points)} chunks (total: {self.total_imported})")
-
-            except Exception as e:
-                logger.error(f"Failed to import batch: {e}")
-                self.total_errors += 1
-
-    def import_large_file(self, file_path: str, project_name: str):
-        """Import a large JSONL file using streaming."""
-        logger.info(f"🚀 Starting streaming import of {os.path.basename(file_path)}")
-
-        # Get collection name
-        project_hash = hashlib.md5(project_name.encode()).hexdigest()[:8]
-        collection_name = f"conv_{project_hash}_voyage"
-
-        # Ensure collection exists
-        collections = [c.name for c in self.qdrant_client.get_collections().collections]
-        if collection_name not in collections:
-            logger.info(f"Creating collection: {collection_name}")
-            self.qdrant_client.create_collection(
-                collection_name=collection_name,
-                vectors_config=VectorParams(size=EMBEDDING_DIMENSIONS, distance=Distance.COSINE)
-            )
-
-        # Extract conversation ID from filename
-        conversation_id = os.path.splitext(os.path.basename(file_path))[0]
-
-        # Stream and process the file
-        chunk_count = 0
-        message_count = 0
-
-        try:
-            logger.info(f"Starting to process chunks from generator")
-            for message_buffer in self.stream_jsonl_messages(file_path):
-                logger.info(f"Received buffer with {len(message_buffer)} messages")
-                self.process_message_buffer(message_buffer, project_name, collection_name, conversation_id)
-                chunk_count += 1
-                message_count += len(message_buffer)
-                logger.info(f"Processed chunk {chunk_count} with {len(message_buffer)} messages (total: {message_count})")
-
-                # Save state periodically
-                if chunk_count % 10 == 0:
-                    self._save_state()
-
-            # Log final statistics
-            logger.info(f"Finished processing {chunk_count} chunks with {message_count} total messages")
-
-            # Mark file as imported
-            if project_name not in self.state["projects"]:
-                self.state["projects"][project_name] = []
-            if file_path not in self.state["projects"][project_name]:
-                self.state["projects"][project_name].append(file_path)
-
-            self._save_state()
-            logger.info(f"✅ Completed streaming import of {os.path.basename(file_path)} - {chunk_count} chunks, {message_count} messages, {self.total_imported} vectors")
-
-        except Exception as e:
-            logger.error(f"Error during streaming import: {e}")
-            raise
-
-def main():
-    """Main entry point for streaming import."""
-    import sys
-    import argparse
-
-    parser = argparse.ArgumentParser(description="Streaming import for large conversation files")
-    parser.add_argument("--project", help="Project directory path")
-    parser.add_argument("--limit", type=int, help="Limit number of files to process")
-    args = parser.parse_args()
-
-    importer = StreamingVoyageImporter()
-
-    # If project path is provided via command line
-    if args.project and os.path.exists(args.project):
-        project_name = os.path.basename(args.project)
-        files_processed = 0
-
-        # Find all JSONL files in the project
-        for file_path in Path(args.project).glob("*.jsonl"):
-            if args.limit and files_processed >= args.limit:
-                break
-
-            file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
-            logger.info(f"Processing {file_path.name} ({file_size_mb:.1f} MB)")
-            importer.import_large_file(str(file_path), project_name)
-            files_processed += 1
-    else:
-        # No specific project specified - scan for all projects
-        base_path = os.getenv("LOGS_PATH", "/logs")
-        if os.path.exists(base_path):
-            # Scan for all project directories
-            for project_dir in Path(base_path).iterdir():
-                if project_dir.is_dir() and not project_dir.name.startswith('.'):
-                    # Look for JSONL files in this project
-                    jsonl_files = list(project_dir.glob("*.jsonl"))
-                    if jsonl_files:
-                        for jsonl_file in jsonl_files:
-                            file_size_mb = jsonl_file.stat().st_size / (1024 * 1024)
-                            logger.info(f"Processing {jsonl_file.name} ({file_size_mb:.1f} MB) from project {project_dir.name}")
-                            importer.import_large_file(str(jsonl_file), project_dir.name)
-
-    logger.info(f"Streaming import complete! Total chunks: {importer.total_imported}")
-
-if __name__ == "__main__":
-    main()
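
The removed streaming importer above obtains embeddings by POSTing batches of chunk texts to the Voyage AI REST endpoint with Bearer authentication, model `voyage-3-large`, and `input_type: document`, then reads the vectors from the response's `data` array. Below is a minimal sketch of that request shape, not code from the package; it assumes a `VOYAGE_KEY` environment variable is set, and the sample input text is a placeholder.

```python
# Illustrative sketch mirroring StreamingVoyageImporter._generate_embeddings;
# requires the VOYAGE_KEY environment variable (assumed, as in the removed script).
import os

import requests

VOYAGE_API_URL = "https://api.voyageai.com/v1/embeddings"

response = requests.post(
    VOYAGE_API_URL,
    headers={
        "Authorization": f"Bearer {os.environ['VOYAGE_KEY']}",
        "Content-Type": "application/json",
    },
    json={
        "input": ["USER: example chunk text"],  # placeholder chunk text
        "model": "voyage-3-large",
        "input_type": "document",
    },
    timeout=30,
)
response.raise_for_status()
embeddings = [item["embedding"] for item in response.json()["data"]]
# The removed script creates its Qdrant collections with 1024-dimensional vectors,
# matching this model's output size.
print(len(embeddings), len(embeddings[0]))
```
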