claude-self-reflect 5.0.7 → 6.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/open-source-maintainer.md +1 -1
- package/.claude/agents/reflection-specialist.md +2 -2
- package/Dockerfile.async-importer +6 -4
- package/Dockerfile.importer +6 -6
- package/Dockerfile.safe-watcher +8 -8
- package/Dockerfile.streaming-importer +8 -1
- package/Dockerfile.watcher +8 -16
- package/README.md +0 -3
- package/docker-compose.yaml +2 -6
- package/installer/.claude/agents/README.md +138 -0
- package/package.json +5 -26
- package/src/__init__.py +0 -0
- package/src/cli/__init__.py +0 -0
- package/src/runtime/__init__.py +0 -0
- package/src/runtime/import-latest.py +124 -0
- package/{scripts → src/runtime}/precompact-hook.sh +1 -1
- package/src/runtime/streaming-importer.py +995 -0
- package/{scripts → src/runtime}/watcher-loop.sh +1 -1
- package/.claude/agents/claude-self-reflect-test.md +0 -1274
- package/.claude/agents/reflect-tester.md +0 -300
- package/scripts/add-timestamp-indexes.py +0 -134
- package/scripts/ast_grep_final_analyzer.py +0 -338
- package/scripts/ast_grep_unified_registry.py +0 -710
- package/scripts/check-collections.py +0 -29
- package/scripts/debug-august-parsing.py +0 -80
- package/scripts/debug-import-single.py +0 -91
- package/scripts/debug-project-resolver.py +0 -82
- package/scripts/debug-temporal-tools.py +0 -135
- package/scripts/import-conversations-enhanced.py +0 -672
- package/scripts/migrate-to-unified-state.py +0 -426
- package/scripts/session_quality_tracker.py +0 -671
- package/scripts/update_patterns.py +0 -334
- package/{scripts → src}/importer/__init__.py +0 -0
- package/{scripts → src}/importer/__main__.py +0 -0
- package/{scripts → src}/importer/core/__init__.py +0 -0
- package/{scripts → src}/importer/core/config.py +0 -0
- package/{scripts → src}/importer/core/exceptions.py +0 -0
- package/{scripts → src}/importer/core/models.py +0 -0
- package/{scripts → src}/importer/embeddings/__init__.py +0 -0
- package/{scripts → src}/importer/embeddings/base.py +0 -0
- package/{scripts → src}/importer/embeddings/fastembed_provider.py +0 -0
- package/{scripts → src}/importer/embeddings/validator.py +0 -0
- package/{scripts → src}/importer/embeddings/voyage_provider.py +0 -0
- package/{scripts → src}/importer/main.py +0 -0
- package/{scripts → src}/importer/processors/__init__.py +0 -0
- package/{scripts → src}/importer/processors/ast_extractor.py +0 -0
- package/{scripts → src}/importer/processors/chunker.py +0 -0
- package/{scripts → src}/importer/processors/concept_extractor.py +0 -0
- package/{scripts → src}/importer/processors/conversation_parser.py +0 -0
- package/{scripts → src}/importer/processors/tool_extractor.py +0 -0
- package/{scripts → src}/importer/state/__init__.py +0 -0
- package/{scripts → src}/importer/state/state_manager.py +0 -0
- package/{scripts → src}/importer/storage/__init__.py +0 -0
- package/{scripts → src}/importer/storage/qdrant_storage.py +0 -0
- package/{scripts → src}/importer/utils/__init__.py +0 -0
- package/{scripts → src}/importer/utils/logger.py +0 -0
- package/{scripts → src}/importer/utils/project_normalizer.py +0 -0
- package/{scripts → src/runtime}/delta-metadata-update-safe.py +0 -0
- package/{scripts → src/runtime}/delta-metadata-update.py +0 -0
- package/{scripts → src/runtime}/doctor.py +0 -0
- package/{scripts → src/runtime}/embedding_service.py +0 -0
- package/{scripts → src/runtime}/force-metadata-recovery.py +0 -0
- package/{scripts → src/runtime}/import-conversations-unified.py +0 -0
- package/{scripts → src/runtime}/import_strategies.py +0 -0
- package/{scripts → src/runtime}/message_processors.py +0 -0
- package/{scripts → src/runtime}/metadata_extractor.py +0 -0
- package/{scripts → src/runtime}/streaming-watcher.py +0 -0
- package/{scripts → src/runtime}/unified_state_manager.py +0 -0
- package/{scripts → src/runtime}/utils.py +0 -0
package/src/runtime/streaming-importer.py
@@ -0,0 +1,995 @@
#!/usr/bin/env python3
"""
Production-Ready Streaming Importer v2.5.17 FINAL
Addresses all critical issues from Opus 4.1 and GPT-5 code reviews:
1. Fixed signal handler race condition
2. Fixed CPU monitoring initialization
3. Fixed queue overflow data loss
4. Fixed state persistence across restarts
5. Fixed cgroup-aware CPU detection
6. Fixed async operation cancellation
7. Fixed atomic file operations with fsync
"""

import asyncio
import json
import os
import time
import hashlib
import re
import gc
import ctypes
import platform
from pathlib import Path
from typing import Dict, List, Optional, Any, Set, Tuple, Generator
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field
import logging
from collections import deque

from qdrant_client import AsyncQdrantClient, models
from qdrant_client.http.exceptions import UnexpectedResponse
from fastembed import TextEmbedding
import psutil

# Import normalize_project_name
import sys
sys.path.insert(0, str(Path(__file__).parent))
from utils import normalize_project_name

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration from environment
@dataclass
class Config:
    """Production configuration with proper defaults."""
    qdrant_url: str = field(default_factory=lambda: os.getenv("QDRANT_URL", "http://localhost:6333"))
    voyage_api_key: Optional[str] = field(default_factory=lambda: os.getenv("VOYAGE_API_KEY"))
    prefer_local_embeddings: bool = field(default_factory=lambda: os.getenv("PREFER_LOCAL_EMBEDDINGS", "true").lower() == "true")
    embedding_model: str = field(default_factory=lambda: os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2"))

    logs_dir: Path = field(default_factory=lambda: Path(os.getenv("LOGS_DIR", "~/.claude/projects")).expanduser())
    # FIXED: Use writable location with fallback
    state_file: Path = field(default_factory=lambda: Path(os.getenv("STATE_FILE", "~/.claude/streaming-state.json")).expanduser())
    collection_prefix: str = "conv"
    vector_size: int = 384  # FastEmbed all-MiniLM-L6-v2 (will be 1024 for Voyage)

    # Production throttling controls
    import_frequency: int = field(default_factory=lambda: int(os.getenv("IMPORT_FREQUENCY", "10")))  # Check every 10s
    batch_size: int = field(default_factory=lambda: int(os.getenv("BATCH_SIZE", "10")))  # Increased from 5
    memory_limit_mb: int = field(default_factory=lambda: int(os.getenv("MEMORY_LIMIT_MB", "2048")))  # Increased from 400MB to 2GB

    # CPU management - properly scaled for multi-core
    max_cpu_percent_per_core: float = field(default_factory=lambda: float(os.getenv("MAX_CPU_PERCENT_PER_CORE", "50")))
    max_concurrent_embeddings: int = field(default_factory=lambda: int(os.getenv("MAX_CONCURRENT_EMBEDDINGS", "2")))
    max_concurrent_qdrant: int = field(default_factory=lambda: int(os.getenv("MAX_CONCURRENT_QDRANT", "3")))

    # Queue management
    max_queue_size: int = field(default_factory=lambda: int(os.getenv("MAX_QUEUE_SIZE", "100")))  # Max files in queue
    max_backlog_hours: int = field(default_factory=lambda: int(os.getenv("MAX_BACKLOG_HOURS", "24")))  # Alert if older

    # Reliability settings
    qdrant_timeout_s: float = field(default_factory=lambda: float(os.getenv("QDRANT_TIMEOUT", "10")))
    max_retries: int = field(default_factory=lambda: int(os.getenv("MAX_RETRIES", "3")))
    retry_delay_s: float = field(default_factory=lambda: float(os.getenv("RETRY_DELAY", "1")))

    # Collection cache settings
    collection_cache_ttl: int = field(default_factory=lambda: int(os.getenv("COLLECTION_CACHE_TTL", "3600")))  # 1 hour
    collection_cache_max_size: int = field(default_factory=lambda: int(os.getenv("COLLECTION_CACHE_MAX_SIZE", "100")))
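
# All Config fields are read from environment variables when the dataclass is
# instantiated. A hypothetical invocation overriding the throttling defaults
# (variable names as defined above; values purely illustrative):
#
#   MEMORY_LIMIT_MB=4096 BATCH_SIZE=20 MAX_QUEUE_SIZE=200 \
#   IMPORT_FREQUENCY=30 python streaming-importer.py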

# Check if malloc_trim is available
try:
    libc = ctypes.CDLL("libc.so.6")
    malloc_trim = libc.malloc_trim
    malloc_trim.argtypes = [ctypes.c_size_t]
    malloc_trim.restype = ctypes.c_int
    MALLOC_TRIM_AVAILABLE = True
except:
    MALLOC_TRIM_AVAILABLE = False
    logger.debug("malloc_trim not available on this platform")


def get_effective_cpus() -> float:
    """Get effective CPU count considering cgroup limits."""
    # Try to get from environment first
    effective_cores_env = os.getenv("EFFECTIVE_CORES")
    if effective_cores_env:
        try:
            return float(effective_cores_env)
        except ValueError:
            pass

    # cgroup v2
    cpu_max = Path("/sys/fs/cgroup/cpu.max")
    # cgroup v1
    cpu_quota = Path("/sys/fs/cgroup/cpu/cpu.cfs_quota_us")
    cpu_period = Path("/sys/fs/cgroup/cpu/cpu.cfs_period_us")

    try:
        if cpu_max.exists():
            # format: "<quota> <period>" or "max <period>"
            content = cpu_max.read_text().strip().split()
            if content[0] != "max":
                quota, period = int(content[0]), int(content[1])
                if period > 0:
                    return max(1.0, quota / period)
        elif cpu_quota.exists() and cpu_period.exists():
            quota = int(cpu_quota.read_text())
            period = int(cpu_period.read_text())
            if quota > 0 and period > 0:
                return max(1.0, quota / period)
    except Exception:
        pass

    return float(psutil.cpu_count() or 1)
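
# Worked example (illustrative numbers): a container limited via cgroup v2 to
# "200000 100000" in /sys/fs/cgroup/cpu.max has quota=200000us per
# period=100000us, so get_effective_cpus() returns 200000 / 100000 = 2.0
# effective cores even when psutil.cpu_count() reports more host cores.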


def extract_tool_usage_from_conversation(messages: List[Dict]) -> Dict[str, Any]:
    """Extract tool usage metadata from conversation messages."""
    tool_usage = {
        'files_analyzed': [],
        'files_edited': [],
        'tools_used': set()
    }

    for msg in messages:
        content = msg.get('content', '')

        # Handle different content types
        if isinstance(content, str):
            text = content
        elif isinstance(content, list):
            text_parts = []
            for item in content:
                if isinstance(item, str):
                    text_parts.append(item)
                elif isinstance(item, dict):
                    if item.get('type') == 'text':
                        text_parts.append(item.get('text', ''))
                    elif item.get('type') == 'tool_use':
                        # Extract tool information
                        tool_name = item.get('name', '')
                        tool_usage['tools_used'].add(tool_name)

                        # Extract file paths from tool inputs
                        if 'input' in item:
                            tool_input = item['input']
                            if isinstance(tool_input, dict):
                                # Check for file paths in common tool parameters
                                if 'file_path' in tool_input:
                                    file_path = tool_input['file_path']
                                    if tool_name in ['Read', 'Grep', 'Glob', 'LS']:
                                        tool_usage['files_analyzed'].append(file_path)
                                    elif tool_name in ['Edit', 'Write', 'MultiEdit']:
                                        tool_usage['files_edited'].append(file_path)

                                # Handle multiple files
                                if 'files' in tool_input:
                                    files = tool_input['files']
                                    if isinstance(files, list):
                                        tool_usage['files_analyzed'].extend(files)
            text = ' '.join(text_parts)
        else:
            text = str(content)

        # Extract file paths from text content using regex
        file_patterns = [
            r'`([/\w\-\.]+\.\w+)`',
            r'File: ([/\w\-\.]+\.\w+)',
            r'(?:^|\s)(/[\w\-\./]+\.\w+)',
            r'(?:^|\s)([\w\-]+\.\w+)',
        ]

        for pattern in file_patterns:
            matches = re.findall(pattern, text[:5000])  # Limit regex to first 5k chars
            for match in matches[:10]:  # Limit matches
                if match and not match.startswith('http'):
                    if any(keyword in text.lower() for keyword in ['edit', 'modify', 'update', 'write', 'create']):
                        tool_usage['files_edited'].append(match)
                    else:
                        tool_usage['files_analyzed'].append(match)

    # Convert sets to lists and deduplicate
    tool_usage['tools_used'] = list(tool_usage['tools_used'])
    tool_usage['files_analyzed'] = list(set(tool_usage['files_analyzed']))[:20]
    tool_usage['files_edited'] = list(set(tool_usage['files_edited']))[:20]

    return tool_usage


def extract_concepts(text: str, tool_usage: Dict[str, Any]) -> List[str]:
    """Extract development concepts from conversation text."""
    concepts = set()

    # Limit text for concept extraction
    text_sample = text[:50000] if len(text) > 50000 else text

    concept_patterns = {
        'docker': r'\b(?:docker|container|compose|dockerfile)\b',
        'testing': r'\b(?:test|testing|unittest|pytest)\b',
        'database': r'\b(?:database|sql|postgres|mysql|mongodb)\b',
        'api': r'\b(?:api|rest|graphql|endpoint)\b',
        'security': r'\b(?:security|auth|authentication)\b',
        'performance': r'\b(?:performance|optimization|cache)\b',
        'debugging': r'\b(?:debug|debugging|error|bug)\b',
        'deployment': r'\b(?:deploy|deployment|ci\/cd)\b',
    }

    text_lower = text_sample.lower()

    for concept, pattern in concept_patterns.items():
        if re.search(pattern, text_lower, re.IGNORECASE):
            concepts.add(concept)

    # Add concepts based on tools used
    if 'Docker' in tool_usage.get('tools_used', []):
        concepts.add('docker')
    if 'Bash' in tool_usage.get('tools_used', []):
        concepts.add('scripting')

    return list(concepts)[:15]
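
# Illustrative inputs/outputs for the two extractors above (values are
# hypothetical, not from a real conversation):
#   extract_tool_usage_from_conversation(messages) could yield
#     {'files_analyzed': ['src/app.py'], 'files_edited': ['README.md'],
#      'tools_used': ['Read', 'Edit']}
#   and extract_concepts("fixed a pytest bug in the docker build", tool_usage)
#   would match the 'testing', 'debugging', and 'docker' patterns.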


class EmbeddingProvider:
    """Base class for embedding providers."""

    async def embed_documents(self, texts: List[str]) -> List[List[float]]:
        raise NotImplementedError

    async def close(self):
        """Cleanup resources."""
        pass


class FastEmbedProvider(EmbeddingProvider):
    """FastEmbed provider with proper resource management."""

    def __init__(self, model_name: str, max_concurrent: int = 2):
        self.model = TextEmbedding(model_name)
        self.executor = ThreadPoolExecutor(max_workers=1)
        self.semaphore = asyncio.Semaphore(max_concurrent)

    async def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings with concurrency control and retry."""
        async with self.semaphore:
            loop = asyncio.get_event_loop()
            embeddings = await loop.run_in_executor(
                self.executor,
                lambda: list(self.model.embed(texts))
            )
            return [embedding.tolist() for embedding in embeddings]

    async def close(self):
        """Shutdown executor properly."""
        # FIXED: Use proper shutdown with wait=True
        self.executor.shutdown(wait=True, cancel_futures=True)


class VoyageProvider(EmbeddingProvider):
    """Voyage AI provider for cloud embeddings."""

    def __init__(self, api_key: str, max_concurrent: int = 2):
        import voyageai
        self.client = voyageai.Client(api_key=api_key)
        self.semaphore = asyncio.Semaphore(max_concurrent)

    async def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings using Voyage AI."""
        async with self.semaphore:
            loop = asyncio.get_event_loop()
            # Run Voyage API call in executor to avoid blocking
            result = await loop.run_in_executor(
                None,
                lambda: self.client.embed(
                    texts=texts,
                    model="voyage-3",
                    input_type="document"
                )
            )
            return result.embeddings

    async def close(self):
        """Cleanup resources."""
        pass  # Voyage client doesn't need explicit cleanup
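
# Both providers implement the EmbeddingProvider interface, so callers can
# swap backends without touching call sites. A minimal sketch (must run inside
# a coroutine; the model name is the FastEmbed default from Config above):
#
#   provider = FastEmbedProvider("sentence-transformers/all-MiniLM-L6-v2")
#   vectors = await provider.embed_documents(["hello world"])
#   assert len(vectors[0]) == 384  # all-MiniLM-L6-v2 dimension
#   await provider.close()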


class QdrantService:
    """Qdrant service with proper backpressure and retries."""

    def __init__(self, config: Config, embedding_provider: EmbeddingProvider):
        self.config = config
        self.client = AsyncQdrantClient(url=config.qdrant_url)
        self.embedding_provider = embedding_provider
        self._collection_cache: Dict[str, float] = {}  # name -> timestamp
        self.request_semaphore = asyncio.Semaphore(config.max_concurrent_qdrant)

    async def ensure_collection(self, collection_name: str) -> None:
        """Ensure collection exists with TTL cache."""
        now = time.time()

        # Check cache with TTL
        if collection_name in self._collection_cache:
            if now - self._collection_cache[collection_name] < self.config.collection_cache_ttl:
                return

        # Enforce cache size limit
        if len(self._collection_cache) >= self.config.collection_cache_max_size:
            # Remove oldest entry
            oldest = min(self._collection_cache.items(), key=lambda x: x[1])
            del self._collection_cache[oldest[0]]

        async with self.request_semaphore:
            try:
                await asyncio.wait_for(
                    self.client.get_collection(collection_name),
                    timeout=self.config.qdrant_timeout_s
                )
                self._collection_cache[collection_name] = now
                logger.debug(f"Collection {collection_name} exists")
            except (UnexpectedResponse, asyncio.TimeoutError):
                # Create collection
                vector_size = 1024 if "_voyage" in collection_name else self.config.vector_size

                try:
                    await asyncio.wait_for(
                        self.client.create_collection(
                            collection_name=collection_name,
                            vectors_config=models.VectorParams(
                                size=vector_size,
                                distance=models.Distance.COSINE
                            ),
                            optimizers_config=models.OptimizersConfigDiff(
                                indexing_threshold=100
                            )
                        ),
                        timeout=self.config.qdrant_timeout_s
                    )
                    self._collection_cache[collection_name] = now
                    logger.info(f"Created collection {collection_name}")
                except UnexpectedResponse as e:
                    if "already exists" in str(e):
                        self._collection_cache[collection_name] = now
                    else:
                        raise

    async def store_points_with_retry(
        self,
        collection_name: str,
        points: List[models.PointStruct]
    ) -> bool:
        """Store points with retry logic and proper acknowledgment."""
        if not points:
            return True

        for attempt in range(self.config.max_retries):
            try:
                async with self.request_semaphore:
                    # FIXED: Create task for proper cancellation on timeout
                    task = asyncio.create_task(
                        self.client.upsert(
                            collection_name=collection_name,
                            points=points,
                            wait=True  # CRITICAL: Wait for acknowledgment
                        )
                    )
                    await asyncio.wait_for(task, timeout=self.config.qdrant_timeout_s)
                    logger.debug(f"Stored {len(points)} points in {collection_name}")
                    return True

            except asyncio.TimeoutError:
                # FIXED: Cancel the background operation
                task.cancel()
                try:
                    await task
                except asyncio.CancelledError:
                    pass
                logger.warning(f"Timeout storing points (attempt {attempt + 1}/{self.config.max_retries})")
                if attempt < self.config.max_retries - 1:
                    await asyncio.sleep(self.config.retry_delay_s * (2 ** attempt))  # Exponential backoff
            except Exception as e:
                logger.error(f"Error storing points: {e}")
                if attempt < self.config.max_retries - 1:
                    await asyncio.sleep(self.config.retry_delay_s)

        return False

    async def close(self):
        """Close client connection."""
        # AsyncQdrantClient doesn't have explicit close, but we can clear cache
        self._collection_cache.clear()
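
# With the defaults above (MAX_RETRIES=3, RETRY_DELAY=1), a timing-out upsert
# in store_points_with_retry sleeps retry_delay_s * 2**attempt between tries:
# 1s after the first timeout, 2s after the second, then gives up and returns
# False so the caller can record the failure.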


class TokenAwareChunker:
    """Memory-efficient streaming chunker."""

    def __init__(self, chunk_size_tokens: int = 400, chunk_overlap_tokens: int = 75):
        self.chunk_size_chars = chunk_size_tokens * 4
        self.chunk_overlap_chars = chunk_overlap_tokens * 4
        logger.info(f"TokenAwareChunker: {chunk_size_tokens} tokens (~{self.chunk_size_chars} chars)")

    def chunk_text_stream(self, text: str) -> Generator[str, None, None]:
        """Stream chunks without holding all in memory."""
        if not text:
            return

        if len(text) <= self.chunk_size_chars:
            yield text
            return

        start = 0
        while start < len(text):
            end = min(start + self.chunk_size_chars, len(text))

            if end < len(text):
                # Find natural boundary
                for separator in ['. ', '.\n', '! ', '? ', '\n\n', '\n', ' ']:
                    last_sep = text.rfind(separator, start, end)
                    if last_sep > start + (self.chunk_size_chars // 2):
                        end = last_sep + len(separator)
                        break

            chunk = text[start:end].strip()
            if chunk:
                yield chunk

            if end >= len(text):
                break
            start = max(start + 1, end - self.chunk_overlap_chars)
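
# Sizing uses the rough heuristic of ~4 characters per token: the defaults of
# 400 tokens with 75 tokens of overlap give windows of about 1600 chars that
# advance ~1300 chars per step, with each boundary snapped back to the nearest
# sentence or whitespace break beyond the half-window mark.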


class CPUMonitor:
    """Non-blocking CPU monitoring with cgroup awareness."""

    def __init__(self, max_cpu_per_core: float):
        self.process = psutil.Process()
        # FIXED: Use cgroup-aware CPU count
        effective_cores = get_effective_cpus()
        self.max_total_cpu = max_cpu_per_core * effective_cores
        logger.info(f"CPU Monitor: {effective_cores:.1f} effective cores, {self.max_total_cpu:.1f}% limit")

        # FIXED: Initialize CPU tracking properly
        self.process.cpu_percent(interval=None)  # First call to initialize
        time.sleep(0.01)  # Brief pause
        self.last_check = time.time()
        self.last_cpu = self.process.cpu_percent(interval=None)

    def get_cpu_nowait(self) -> float:
        """Get CPU without blocking (uses cached value)."""
        now = time.time()
        if now - self.last_check > 1.0:  # Update every second
            val = self.process.cpu_percent(interval=None)
            # FIXED: Guard against 0.0 from uninitialized reads
            if val == 0.0 and self.last_cpu == 0.0:
                # Best effort quick second sample
                time.sleep(0.01)
                val = self.process.cpu_percent(interval=None)
            self.last_cpu = val
            self.last_check = now
        return self.last_cpu

    def should_throttle(self) -> bool:
        """Check if we should throttle based on CPU."""
        return self.get_cpu_nowait() > self.max_total_cpu
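
# Example budget (illustrative): with MAX_CPU_PERCENT_PER_CORE=50 in a cgroup
# allowing 2.0 effective cores, max_total_cpu = 50 * 2.0 = 100%, i.e. the
# importer starts throttling once it consumes the equivalent of one full core.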


class QueueManager:
    """Manage file processing queue with limits."""

    def __init__(self, max_size: int, max_age_hours: int):
        self.max_size = max_size
        self.max_age = timedelta(hours=max_age_hours)
        self.queue: deque = deque(maxlen=max_size)
        self.processed_count = 0
        self.deferred_count = 0  # FIXED: Track deferred vs dropped

    def add_files(self, files: List[Tuple[Path, datetime]]) -> int:
        """Add files to queue, return number added."""
        added = 0
        overflow = []

        for file_path, mod_time in files:
            if len(self.queue) >= self.max_size:
                overflow.append((file_path, mod_time))
            else:
                self.queue.append((file_path, mod_time))
                added += 1

        # FIXED: More accurate logging and alerting
        if overflow:
            self.deferred_count += len(overflow)
            oldest = min(overflow, key=lambda x: x[1])
            logger.critical(f"QUEUE OVERFLOW: {len(overflow)} files deferred to next cycle. "
                            f"Oldest: {oldest[0].name} ({(datetime.now() - oldest[1]).total_seconds() / 3600:.1f}h old). "
                            f"Consider increasing MAX_QUEUE_SIZE or BATCH_SIZE")

        return added

    def get_batch(self, batch_size: int) -> List[Path]:
        """Get next batch of files, prioritizing oldest."""
        batch = []
        now = datetime.now()

        # Check for stale files
        if self.queue:
            oldest_time = self.queue[0][1]
            if now - oldest_time > self.max_age:
                logger.warning(f"BACKLOG ALERT: Oldest file is {(now - oldest_time).total_seconds() / 3600:.1f} hours old")

        # Get batch (process oldest first)
        for _ in range(min(batch_size, len(self.queue))):
            if self.queue:
                file_path, _ = self.queue.popleft()
                batch.append(file_path)
                self.processed_count += 1

        return batch

    def get_metrics(self) -> Dict[str, Any]:
        """Get queue metrics."""
        return {
            "queue_size": len(self.queue),
            "processed": self.processed_count,
            "deferred": self.deferred_count,  # FIXED: Use deferred instead of dropped
            "oldest_age_hours": self._get_oldest_age()
        }

    def _get_oldest_age(self) -> float:
        """Get age of oldest item in hours."""
        if not self.queue:
            return 0
        oldest_time = self.queue[0][1]
        return (datetime.now() - oldest_time).total_seconds() / 3600


class StreamingImporter:
    """Production-ready streaming importer."""

    def __init__(self, config: Config):
        self.config = config
        self.state: Dict[str, Any] = {}
        self.embedding_provider = self._create_embedding_provider()

        # Update vector_size based on embedding provider
        if isinstance(self.embedding_provider, VoyageProvider):
            self.config.vector_size = 1024  # Voyage uses 1024 dimensions

        self.qdrant_service = QdrantService(config, self.embedding_provider)
        self.chunker = TokenAwareChunker()
        self.cpu_monitor = CPUMonitor(config.max_cpu_percent_per_core)
        self.queue_manager = QueueManager(config.max_queue_size, config.max_backlog_hours)

        self.stats = {
            "files_processed": 0,
            "chunks_processed": 0,
            "failures": 0,
            "start_time": time.time()
        }

        self.shutdown_event = asyncio.Event()

    def _create_embedding_provider(self) -> EmbeddingProvider:
        """Create embedding provider with config."""
        if not self.config.prefer_local_embeddings and self.config.voyage_api_key:
            logger.info("Using Voyage AI for cloud embeddings")
            return VoyageProvider(
                self.config.voyage_api_key,
                self.config.max_concurrent_embeddings
            )
        else:
            logger.info(f"Using FastEmbed: {self.config.embedding_model}")
            return FastEmbedProvider(
                self.config.embedding_model,
                self.config.max_concurrent_embeddings
            )

    async def load_state(self) -> None:
        """Load persisted state."""
        if self.config.state_file.exists():
            try:
                with open(self.config.state_file, 'r') as f:
                    self.state = json.load(f)
                logger.info(f"Loaded state with {len(self.state.get('imported_files', {}))} files")
            except Exception as e:
                logger.error(f"Error loading state: {e}")
                self.state = {}

        if "imported_files" not in self.state:
            self.state["imported_files"] = {}
        if "high_water_mark" not in self.state:
            self.state["high_water_mark"] = 0

    async def save_state(self) -> None:
        """Save state atomically with fsync."""
        try:
            self.config.state_file.parent.mkdir(parents=True, exist_ok=True)
            temp_file = self.config.state_file.with_suffix('.tmp')

            # FIXED: Write with fsync for durability
            with open(temp_file, 'w') as f:
                json.dump(self.state, f, indent=2)
                f.flush()
                os.fsync(f.fileno())

            # FIXED: Platform-specific atomic replacement
            if platform.system() == 'Windows':
                # Windows requires explicit removal
                if self.config.state_file.exists():
                    self.config.state_file.unlink()
                temp_file.rename(self.config.state_file)
            else:
                # POSIX atomic rename
                os.replace(temp_file, self.config.state_file)

            # Optionally fsync directory for stronger guarantees
            try:
                dir_fd = os.open(str(self.config.state_file.parent), os.O_DIRECTORY)
                os.fsync(dir_fd)
                os.close(dir_fd)
            except:
                pass  # Directory fsync is best-effort

        except Exception as e:
            logger.error(f"Error saving state: {e}")

    def get_memory_usage_mb(self) -> float:
        """Get current memory usage."""
        process = psutil.Process()
        return process.memory_info().rss / 1024 / 1024

    async def memory_cleanup(self) -> None:
        """Perform memory cleanup."""
        collected = gc.collect()
        if MALLOC_TRIM_AVAILABLE:
            malloc_trim(0)
        logger.debug(f"Memory cleanup: collected {collected} objects")

    def get_collection_name(self, project_path: str) -> str:
        """Get collection name for project."""
        normalized = normalize_project_name(project_path)
        project_hash = hashlib.md5(normalized.encode()).hexdigest()[:8]
        suffix = "_local" if self.config.prefer_local_embeddings else "_voyage"
        return f"{self.config.collection_prefix}_{project_hash}{suffix}"

    def _extract_message_text(self, content: Any) -> str:
        """Extract text from message content."""
        if isinstance(content, str):
            return content
        elif isinstance(content, list):
            text_parts = []
            for item in content:
                if isinstance(item, str):
                    text_parts.append(item)
                elif isinstance(item, dict):
                    if item.get('type') == 'text':
                        text_parts.append(item.get('text', ''))
            return ' '.join(text_parts)
        return str(content) if content else ''

    async def process_file(self, file_path: Path) -> bool:
        """Process a single file with proper error handling."""
        try:
            # FIXED: Memory check with GC overhead buffer
            memory_usage = self.get_memory_usage_mb()
            memory_threshold = self.config.memory_limit_mb * 0.85  # 15% buffer
            if memory_usage > memory_threshold:
                await self.memory_cleanup()
                if self.get_memory_usage_mb() > memory_threshold:
                    logger.warning(f"Memory limit exceeded ({memory_usage:.1f}MB > {memory_threshold:.1f}MB), skipping {file_path}")
                    return False

            project_path = file_path.parent.name  # Use just the project directory name, not full path
            collection_name = self.get_collection_name(project_path)
            conversation_id = file_path.stem

            logger.info(f"Processing: {file_path.name}")

            await self.qdrant_service.ensure_collection(collection_name)

            # Read messages
            all_messages = []
            with open(file_path, 'r') as f:
                for line in f:
                    if line.strip():
                        try:
                            data = json.loads(line)

                            # Skip summary messages
                            if data.get('type') == 'summary':
                                continue

                            # Handle messages with type user/assistant that have nested message
                            if data.get('type') in ['user', 'assistant'] and 'message' in data:
                                all_messages.append(data['message'])
                            elif 'message' in data and data['message']:
                                all_messages.append(data['message'])
                            elif 'role' in data and 'content' in data:
                                all_messages.append(data)
                        except json.JSONDecodeError:
                            continue

            if not all_messages:
                logger.warning(f"No messages in {file_path}")
                return True  # Mark as processed

            # Extract metadata
            tool_usage = extract_tool_usage_from_conversation(all_messages)

            # Build text efficiently
            text_parts = []
            for msg in all_messages:
                role = msg.get('role', 'unknown')
                content = msg.get('content', '')
                text = self._extract_message_text(content)
                if text:
                    text_parts.append(f"{role}: {text}")

            combined_text = "\n\n".join(text_parts)
            if not combined_text.strip():
                return True

            concepts = extract_concepts(combined_text, tool_usage)

            # Process chunks in streaming fashion
            chunks_processed = 0
            chunk_index = 0

            for chunk_text in self.chunker.chunk_text_stream(combined_text):
                # Check for shutdown
                if self.shutdown_event.is_set():
                    return False

                # CPU throttling
                if self.cpu_monitor.should_throttle():
                    await asyncio.sleep(0.5)

                # FIXED: Generate embedding with retry
                embeddings = None
                for attempt in range(self.config.max_retries):
                    try:
                        embeddings = await self.embedding_provider.embed_documents([chunk_text])
                        break
                    except Exception as e:
                        logger.warning(f"Embed failed (attempt {attempt+1}/{self.config.max_retries}): {e}")
                        if attempt < self.config.max_retries - 1:
                            await asyncio.sleep(self.config.retry_delay_s * (2 ** attempt))

                if not embeddings:
                    logger.error(f"Failed to embed chunk {chunk_index} for {conversation_id}")
                    self.stats["failures"] += 1
                    continue  # Skip this chunk but continue with others

                # Create payload
                payload = {
                    "text": chunk_text[:10000],  # Limit text size
                    "conversation_id": conversation_id,
                    "chunk_index": chunk_index,
                    "message_count": len(all_messages),
                    "project": normalize_project_name(project_path),
                    "timestamp": datetime.now().isoformat(),
                    "total_length": len(chunk_text),
                    "chunking_version": "v2",
                    "concepts": concepts,
                    "files_analyzed": tool_usage['files_analyzed'],
                    "files_edited": tool_usage['files_edited'],
                    "tools_used": tool_usage['tools_used']
                }

                # Create point
                point_id_str = hashlib.md5(
                    f"{conversation_id}_{chunk_index}".encode()
                ).hexdigest()[:16]
                point_id = int(point_id_str, 16) % (2**63)

                point = models.PointStruct(
                    id=point_id,
                    vector=embeddings[0],
                    payload=payload
                )

                # Store with retry
                success = await self.qdrant_service.store_points_with_retry(
                    collection_name,
                    [point]
                )

                if not success:
                    logger.error(f"Failed to store chunk {chunk_index} for {conversation_id}")
                    self.stats["failures"] += 1
                else:
                    chunks_processed += 1

                chunk_index += 1

                # Memory check mid-file
                if chunk_index % 10 == 0:
                    if self.get_memory_usage_mb() > memory_threshold:
                        await self.memory_cleanup()

            # Critical fix: Only mark as imported if we actually processed chunks
            if chunks_processed > 0:
                # Update state with cached timestamp for efficiency
                self.state["imported_files"][str(file_path)] = {
                    "imported_at": datetime.now().isoformat(),
                    "_parsed_time": datetime.now().timestamp(),  # FIXED: Cache parsed timestamp
                    "chunks": chunks_processed,
                    "collection": collection_name
                }

                self.stats["files_processed"] += 1
                self.stats["chunks_processed"] += chunks_processed

                logger.info(f"Completed: {file_path.name} ({chunks_processed} chunks)")
                return True
            else:
                logger.warning(f"File produced 0 chunks, not marking as imported: {file_path.name}")
                return False

        except Exception as e:
            logger.error(f"Error processing {file_path}: {e}")
            self.stats["failures"] += 1
            return False
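
    # Point ids are derived deterministically from (conversation_id,
    # chunk_index) via md5 truncated to 63 bits, so re-importing the same
    # file upserts the same points instead of creating duplicates.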

    async def find_new_files(self) -> List[Tuple[Path, datetime]]:
        """Find new files efficiently using high water mark."""
        # FIXED: Guard against missing logs_dir
        if not self.config.logs_dir.exists():
            logger.warning(f"Logs dir not found: {self.config.logs_dir}")
            return []

        new_files = []
        high_water_mark = self.state.get("high_water_mark", 0)
        new_high_water = high_water_mark

        try:
            for project_dir in self.config.logs_dir.iterdir():
                if not project_dir.is_dir():
                    continue

                try:
                    for jsonl_file in project_dir.glob("*.jsonl"):
                        file_mtime = jsonl_file.stat().st_mtime
                        new_high_water = max(new_high_water, file_mtime)

                        # Skip if already processed (using cached timestamp)
                        if str(jsonl_file) in self.state["imported_files"]:
                            stored = self.state["imported_files"][str(jsonl_file)]
                            # FIXED: Use cached parsed timestamp for efficiency
                            if "_parsed_time" in stored:
                                if file_mtime <= stored["_parsed_time"]:
                                    continue
                            elif "imported_at" in stored:
                                import_time = datetime.fromisoformat(stored["imported_at"]).timestamp()
                                stored["_parsed_time"] = import_time  # Cache for next time
                                if file_mtime <= import_time:
                                    continue

                        # Add to queue
                        new_files.append((jsonl_file, datetime.fromtimestamp(file_mtime)))
                except Exception as e:
                    logger.error(f"Error scanning project dir {project_dir}: {e}")

        except Exception as e:
            logger.error(f"Error scanning logs dir {self.config.logs_dir}: {e}")

        # Update high water mark
        self.state["high_water_mark"] = new_high_water

        # Sort by age (oldest first) to prevent starvation
        new_files.sort(key=lambda x: x[1])

        return new_files
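
    # The persisted state that drives this scan is the JSON written by
    # save_state; shape per the code above, values illustrative:
    #   {"high_water_mark": 1723456789.0,
    #    "imported_files": {"/path/to/session.jsonl": {
    #        "imported_at": "2025-08-12T10:00:00",
    #        "_parsed_time": 1723456800.0,
    #        "chunks": 12, "collection": "conv_1a2b3c4d_local"}}}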

    async def run_continuous(self) -> None:
        """Main loop with proper shutdown handling."""
        logger.info("Starting production streaming importer v2.5.17 FINAL")
        logger.info(f"CPU limit: {self.cpu_monitor.max_total_cpu:.1f}%")
        logger.info(f"Queue size: {self.config.max_queue_size}")
        logger.info(f"State file: {self.config.state_file}")

        await self.load_state()

        try:
            while not self.shutdown_event.is_set():
                try:
                    # Find new files
                    new_files = await self.find_new_files()

                    if new_files:
                        added = self.queue_manager.add_files(new_files)
                        logger.info(f"Added {added} files to queue")

                    # Process batch
                    batch = self.queue_manager.get_batch(self.config.batch_size)

                    for file_path in batch:
                        if self.shutdown_event.is_set():
                            break

                        success = await self.process_file(file_path)

                        # Save state after each file for durability
                        if success:
                            await self.save_state()

                    # Log metrics
                    if batch:
                        metrics = self.queue_manager.get_metrics()
                        cpu = self.cpu_monitor.get_cpu_nowait()
                        mem = self.get_memory_usage_mb()
                        logger.info(f"Metrics: Queue={metrics['queue_size']}, "
                                    f"CPU={cpu:.1f}%, Mem={mem:.1f}MB, "
                                    f"Processed={self.stats['files_processed']}, "
                                    f"Failures={self.stats['failures']}")

                        if metrics['oldest_age_hours'] > self.config.max_backlog_hours:
                            logger.error(f"BACKLOG ALERT: Oldest file is {metrics['oldest_age_hours']:.1f} hours old")

                    # Wait before next cycle
                    await asyncio.sleep(self.config.import_frequency)

                except Exception as e:
                    logger.error(f"Error in main loop: {e}")
                    await asyncio.sleep(self.config.import_frequency)

        except asyncio.CancelledError:
            # FIXED: Handle CancelledError properly
            logger.info("Main task cancelled")
            raise
        finally:
            # Cleanup
            logger.info("Shutting down...")
            await self.save_state()
            await self.embedding_provider.close()
            await self.qdrant_service.close()
            logger.info("Shutdown complete")

    async def shutdown(self):
        """Trigger graceful shutdown."""
        logger.info("Shutdown requested")
        self.shutdown_event.set()


async def main():
    """Main entry point with signal handling."""
    config = Config()
    importer = StreamingImporter(config)

    # FIXED: Setup signal handlers using asyncio-native approach
    import signal

    loop = asyncio.get_running_loop()

    # Define shutdown handler
    def shutdown_handler():
        logger.info("Received shutdown signal")
        importer.shutdown_event.set()

    # Use asyncio-native signal handling on Unix
    if hasattr(loop, "add_signal_handler"):
        for sig in (signal.SIGINT, signal.SIGTERM):
            loop.add_signal_handler(sig, shutdown_handler)
    else:
        # Fallback for Windows
        def signal_handler(sig, frame):
            logger.info(f"Received signal {sig}")
            # Set the shutdown event directly - it's thread-safe
            importer.shutdown_event.set()

        signal.signal(signal.SIGINT, signal_handler)
        signal.signal(signal.SIGTERM, signal_handler)

    try:
        await importer.run_continuous()
    except (KeyboardInterrupt, asyncio.CancelledError):
        await importer.shutdown()


if __name__ == "__main__":
    asyncio.run(main())
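
# Example (hypothetical) local run, assuming a Qdrant instance is already
# listening on localhost:6333 and local FastEmbed embeddings are preferred:
#
#   QDRANT_URL=http://localhost:6333 PREFER_LOCAL_EMBEDDINGS=true \
#   python streaming-importer.py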