claude-self-reflect 5.0.7 → 6.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/.claude/agents/open-source-maintainer.md +1 -1
  2. package/.claude/agents/reflection-specialist.md +2 -2
  3. package/Dockerfile.async-importer +6 -4
  4. package/Dockerfile.importer +6 -6
  5. package/Dockerfile.safe-watcher +8 -8
  6. package/Dockerfile.streaming-importer +8 -1
  7. package/Dockerfile.watcher +8 -16
  8. package/README.md +0 -3
  9. package/docker-compose.yaml +2 -6
  10. package/installer/.claude/agents/README.md +138 -0
  11. package/package.json +5 -26
  12. package/src/__init__.py +0 -0
  13. package/src/cli/__init__.py +0 -0
  14. package/src/runtime/__init__.py +0 -0
  15. package/src/runtime/import-latest.py +124 -0
  16. package/{scripts → src/runtime}/precompact-hook.sh +1 -1
  17. package/src/runtime/streaming-importer.py +995 -0
  18. package/{scripts → src/runtime}/watcher-loop.sh +1 -1
  19. package/.claude/agents/claude-self-reflect-test.md +0 -1274
  20. package/.claude/agents/reflect-tester.md +0 -300
  21. package/scripts/add-timestamp-indexes.py +0 -134
  22. package/scripts/ast_grep_final_analyzer.py +0 -338
  23. package/scripts/ast_grep_unified_registry.py +0 -710
  24. package/scripts/check-collections.py +0 -29
  25. package/scripts/debug-august-parsing.py +0 -80
  26. package/scripts/debug-import-single.py +0 -91
  27. package/scripts/debug-project-resolver.py +0 -82
  28. package/scripts/debug-temporal-tools.py +0 -135
  29. package/scripts/import-conversations-enhanced.py +0 -672
  30. package/scripts/migrate-to-unified-state.py +0 -426
  31. package/scripts/session_quality_tracker.py +0 -671
  32. package/scripts/update_patterns.py +0 -334
  33. package/{scripts → src}/importer/__init__.py +0 -0
  34. package/{scripts → src}/importer/__main__.py +0 -0
  35. package/{scripts → src}/importer/core/__init__.py +0 -0
  36. package/{scripts → src}/importer/core/config.py +0 -0
  37. package/{scripts → src}/importer/core/exceptions.py +0 -0
  38. package/{scripts → src}/importer/core/models.py +0 -0
  39. package/{scripts → src}/importer/embeddings/__init__.py +0 -0
  40. package/{scripts → src}/importer/embeddings/base.py +0 -0
  41. package/{scripts → src}/importer/embeddings/fastembed_provider.py +0 -0
  42. package/{scripts → src}/importer/embeddings/validator.py +0 -0
  43. package/{scripts → src}/importer/embeddings/voyage_provider.py +0 -0
  44. package/{scripts → src}/importer/main.py +0 -0
  45. package/{scripts → src}/importer/processors/__init__.py +0 -0
  46. package/{scripts → src}/importer/processors/ast_extractor.py +0 -0
  47. package/{scripts → src}/importer/processors/chunker.py +0 -0
  48. package/{scripts → src}/importer/processors/concept_extractor.py +0 -0
  49. package/{scripts → src}/importer/processors/conversation_parser.py +0 -0
  50. package/{scripts → src}/importer/processors/tool_extractor.py +0 -0
  51. package/{scripts → src}/importer/state/__init__.py +0 -0
  52. package/{scripts → src}/importer/state/state_manager.py +0 -0
  53. package/{scripts → src}/importer/storage/__init__.py +0 -0
  54. package/{scripts → src}/importer/storage/qdrant_storage.py +0 -0
  55. package/{scripts → src}/importer/utils/__init__.py +0 -0
  56. package/{scripts → src}/importer/utils/logger.py +0 -0
  57. package/{scripts → src}/importer/utils/project_normalizer.py +0 -0
  58. package/{scripts → src/runtime}/delta-metadata-update-safe.py +0 -0
  59. package/{scripts → src/runtime}/delta-metadata-update.py +0 -0
  60. package/{scripts → src/runtime}/doctor.py +0 -0
  61. package/{scripts → src/runtime}/embedding_service.py +0 -0
  62. package/{scripts → src/runtime}/force-metadata-recovery.py +0 -0
  63. package/{scripts → src/runtime}/import-conversations-unified.py +0 -0
  64. package/{scripts → src/runtime}/import_strategies.py +0 -0
  65. package/{scripts → src/runtime}/message_processors.py +0 -0
  66. package/{scripts → src/runtime}/metadata_extractor.py +0 -0
  67. package/{scripts → src/runtime}/streaming-watcher.py +0 -0
  68. package/{scripts → src/runtime}/unified_state_manager.py +0 -0
  69. package/{scripts → src/runtime}/utils.py +0 -0
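
Most of this release is a repackaging: everything under package/scripts/ moves into package/src/, with library code under src/importer/ and operational entry points under src/runtime/ (items 33–69 above, plus the hook and watcher scripts). For the files that moved, a minimal sketch of what this means for anything that invokes the old script paths; the mapping is an editor's illustration derived from the rename table above, not an API shipped by the package:

    # Editor's sketch (hypothetical helper, not shipped by the package):
    # remap 5.0.7 script paths to their 6.0.1 locations per the rename table.
    MOVED_PREFIXES = {
        "scripts/importer/": "src/importer/",  # library code
        "scripts/": "src/runtime/",            # runtime scripts and hooks
    }

    def new_path(old: str) -> str:
        """Return the 6.0.1 location of a 5.0.7 path (unchanged if it did not move)."""
        for old_prefix, new_prefix in MOVED_PREFIXES.items():
            if old.startswith(old_prefix):
                return new_prefix + old[len(old_prefix):]
        return old

    print(new_path("scripts/doctor.py"))  # -> src/runtime/doctor.py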
package/src/runtime/streaming-importer.py
@@ -0,0 +1,995 @@
+ #!/usr/bin/env python3
+ """
+ Production-Ready Streaming Importer v2.5.17 FINAL
+ Addresses all critical issues from Opus 4.1 and GPT-5 code reviews:
+ 1. Fixed signal handler race condition
+ 2. Fixed CPU monitoring initialization
+ 3. Fixed queue overflow data loss
+ 4. Fixed state persistence across restarts
+ 5. Fixed cgroup-aware CPU detection
+ 6. Fixed async operation cancellation
+ 7. Fixed atomic file operations with fsync
+ """
+
+ import asyncio
+ import json
+ import os
+ import time
+ import hashlib
+ import re
+ import gc
+ import ctypes
+ import platform
+ from pathlib import Path
+ from typing import Dict, List, Optional, Any, Set, Tuple, Generator
+ from datetime import datetime, timedelta
+ from concurrent.futures import ThreadPoolExecutor
+ from dataclasses import dataclass, field
+ import logging
+ from collections import deque
+
+ from qdrant_client import AsyncQdrantClient, models
+ from qdrant_client.http.exceptions import UnexpectedResponse
+ from fastembed import TextEmbedding
+ import psutil
+
+ # Import normalize_project_name
+ import sys
+ sys.path.insert(0, str(Path(__file__).parent))
+ from utils import normalize_project_name
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+ # Configuration from environment
+ @dataclass
+ class Config:
+     """Production configuration with proper defaults."""
+     qdrant_url: str = field(default_factory=lambda: os.getenv("QDRANT_URL", "http://localhost:6333"))
+     voyage_api_key: Optional[str] = field(default_factory=lambda: os.getenv("VOYAGE_API_KEY"))
+     prefer_local_embeddings: bool = field(default_factory=lambda: os.getenv("PREFER_LOCAL_EMBEDDINGS", "true").lower() == "true")
+     embedding_model: str = field(default_factory=lambda: os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2"))
+
+     logs_dir: Path = field(default_factory=lambda: Path(os.getenv("LOGS_DIR", "~/.claude/projects")).expanduser())
+     # FIXED: Use writable location with fallback
+     state_file: Path = field(default_factory=lambda: Path(os.getenv("STATE_FILE", "~/.claude/streaming-state.json")).expanduser())
+     collection_prefix: str = "conv"
+     vector_size: int = 384  # FastEmbed all-MiniLM-L6-v2 (will be 1024 for Voyage)
+
+     # Production throttling controls
+     import_frequency: int = field(default_factory=lambda: int(os.getenv("IMPORT_FREQUENCY", "10")))  # Check every 10s
+     batch_size: int = field(default_factory=lambda: int(os.getenv("BATCH_SIZE", "10")))  # Increased from 5
+     memory_limit_mb: int = field(default_factory=lambda: int(os.getenv("MEMORY_LIMIT_MB", "2048")))  # Increased from 400MB to 2GB
+
+     # CPU management - properly scaled for multi-core
+     max_cpu_percent_per_core: float = field(default_factory=lambda: float(os.getenv("MAX_CPU_PERCENT_PER_CORE", "50")))
+     max_concurrent_embeddings: int = field(default_factory=lambda: int(os.getenv("MAX_CONCURRENT_EMBEDDINGS", "2")))
+     max_concurrent_qdrant: int = field(default_factory=lambda: int(os.getenv("MAX_CONCURRENT_QDRANT", "3")))
+
+     # Queue management
+     max_queue_size: int = field(default_factory=lambda: int(os.getenv("MAX_QUEUE_SIZE", "100")))  # Max files in queue
+     max_backlog_hours: int = field(default_factory=lambda: int(os.getenv("MAX_BACKLOG_HOURS", "24")))  # Alert if older
+
+     # Reliability settings
+     qdrant_timeout_s: float = field(default_factory=lambda: float(os.getenv("QDRANT_TIMEOUT", "10")))
+     max_retries: int = field(default_factory=lambda: int(os.getenv("MAX_RETRIES", "3")))
+     retry_delay_s: float = field(default_factory=lambda: float(os.getenv("RETRY_DELAY", "1")))
+
+     # Collection cache settings
+     collection_cache_ttl: int = field(default_factory=lambda: int(os.getenv("COLLECTION_CACHE_TTL", "3600")))  # 1 hour
+     collection_cache_max_size: int = field(default_factory=lambda: int(os.getenv("COLLECTION_CACHE_MAX_SIZE", "100")))
+
+
+ # Check if malloc_trim is available
+ try:
+     libc = ctypes.CDLL("libc.so.6")
+     malloc_trim = libc.malloc_trim
+     malloc_trim.argtypes = [ctypes.c_size_t]
+     malloc_trim.restype = ctypes.c_int
+     MALLOC_TRIM_AVAILABLE = True
+ except (OSError, AttributeError):
+     MALLOC_TRIM_AVAILABLE = False
+     logger.debug("malloc_trim not available on this platform")
+
+
+ def get_effective_cpus() -> float:
+     """Get effective CPU count considering cgroup limits."""
+     # Try to get from environment first
+     effective_cores_env = os.getenv("EFFECTIVE_CORES")
+     if effective_cores_env:
+         try:
+             return float(effective_cores_env)
+         except ValueError:
+             pass
+
+     # cgroup v2
+     cpu_max = Path("/sys/fs/cgroup/cpu.max")
+     # cgroup v1
+     cpu_quota = Path("/sys/fs/cgroup/cpu/cpu.cfs_quota_us")
+     cpu_period = Path("/sys/fs/cgroup/cpu/cpu.cfs_period_us")
+
+     try:
+         if cpu_max.exists():
+             # format: "<quota> <period>" or "max <period>"
+             content = cpu_max.read_text().strip().split()
+             if content[0] != "max":
+                 quota, period = int(content[0]), int(content[1])
+                 if period > 0:
+                     return max(1.0, quota / period)
+         elif cpu_quota.exists() and cpu_period.exists():
+             quota = int(cpu_quota.read_text())
+             period = int(cpu_period.read_text())
+             if quota > 0 and period > 0:
+                 return max(1.0, quota / period)
+     except Exception:
+         pass
+
+     return float(psutil.cpu_count() or 1)
+
+
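# Editor's note (not part of the diff): a worked example of the cgroup v2
# branch above. A container capped at two CPUs typically exposes
#     /sys/fs/cgroup/cpu.max -> "200000 100000"
# so quota/period = 200000/100000 = 2.0 effective cores, while "max 100000"
# means uncapped and falls through to psutil.cpu_count().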
+ def extract_tool_usage_from_conversation(messages: List[Dict]) -> Dict[str, Any]:
+     """Extract tool usage metadata from conversation messages."""
+     tool_usage = {
+         'files_analyzed': [],
+         'files_edited': [],
+         'tools_used': set()
+     }
+
+     for msg in messages:
+         content = msg.get('content', '')
+
+         # Handle different content types
+         if isinstance(content, str):
+             text = content
+         elif isinstance(content, list):
+             text_parts = []
+             for item in content:
+                 if isinstance(item, str):
+                     text_parts.append(item)
+                 elif isinstance(item, dict):
+                     if item.get('type') == 'text':
+                         text_parts.append(item.get('text', ''))
+                     elif item.get('type') == 'tool_use':
+                         # Extract tool information
+                         tool_name = item.get('name', '')
+                         tool_usage['tools_used'].add(tool_name)
+
+                         # Extract file paths from tool inputs
+                         if 'input' in item:
+                             tool_input = item['input']
+                             if isinstance(tool_input, dict):
+                                 # Check for file paths in common tool parameters
+                                 if 'file_path' in tool_input:
+                                     file_path = tool_input['file_path']
+                                     if tool_name in ['Read', 'Grep', 'Glob', 'LS']:
+                                         tool_usage['files_analyzed'].append(file_path)
+                                     elif tool_name in ['Edit', 'Write', 'MultiEdit']:
+                                         tool_usage['files_edited'].append(file_path)
+
+                                 # Handle multiple files
+                                 if 'files' in tool_input:
+                                     files = tool_input['files']
+                                     if isinstance(files, list):
+                                         tool_usage['files_analyzed'].extend(files)
+             text = ' '.join(text_parts)
+         else:
+             text = str(content)
+
+         # Extract file paths from text content using regex
+         file_patterns = [
+             r'`([/\w\-\.]+\.\w+)`',
+             r'File: ([/\w\-\.]+\.\w+)',
+             r'(?:^|\s)(/[\w\-\./]+\.\w+)',
+             r'(?:^|\s)([\w\-]+\.\w+)',
+         ]
+
+         for pattern in file_patterns:
+             matches = re.findall(pattern, text[:5000])  # Limit regex to first 5k chars
+             for match in matches[:10]:  # Limit matches
+                 if match and not match.startswith('http'):
+                     if any(keyword in text.lower() for keyword in ['edit', 'modify', 'update', 'write', 'create']):
+                         tool_usage['files_edited'].append(match)
+                     else:
+                         tool_usage['files_analyzed'].append(match)
+
+     # Convert sets to lists and deduplicate
+     tool_usage['tools_used'] = list(tool_usage['tools_used'])
+     tool_usage['files_analyzed'] = list(set(tool_usage['files_analyzed']))[:20]
+     tool_usage['files_edited'] = list(set(tool_usage['files_edited']))[:20]
+
+     return tool_usage
+
+
+ def extract_concepts(text: str, tool_usage: Dict[str, Any]) -> List[str]:
+     """Extract development concepts from conversation text."""
+     concepts = set()
+
+     # Limit text for concept extraction
+     text_sample = text[:50000] if len(text) > 50000 else text
+
+     concept_patterns = {
+         'docker': r'\b(?:docker|container|compose|dockerfile)\b',
+         'testing': r'\b(?:test|testing|unittest|pytest)\b',
+         'database': r'\b(?:database|sql|postgres|mysql|mongodb)\b',
+         'api': r'\b(?:api|rest|graphql|endpoint)\b',
+         'security': r'\b(?:security|auth|authentication)\b',
+         'performance': r'\b(?:performance|optimization|cache)\b',
+         'debugging': r'\b(?:debug|debugging|error|bug)\b',
+         'deployment': r'\b(?:deploy|deployment|ci\/cd)\b',
+     }
+
+     text_lower = text_sample.lower()
+
+     for concept, pattern in concept_patterns.items():
+         if re.search(pattern, text_lower, re.IGNORECASE):
+             concepts.add(concept)
+
+     # Add concepts based on tools used
+     if 'Docker' in tool_usage.get('tools_used', []):
+         concepts.add('docker')
+     if 'Bash' in tool_usage.get('tools_used', []):
+         concepts.add('scripting')
+
+     return list(concepts)[:15]
+
+
+ class EmbeddingProvider:
+     """Base class for embedding providers."""
+
+     async def embed_documents(self, texts: List[str]) -> List[List[float]]:
+         raise NotImplementedError
+
+     async def close(self):
+         """Cleanup resources."""
+         pass
+
+
+ class FastEmbedProvider(EmbeddingProvider):
+     """FastEmbed provider with proper resource management."""
+
+     def __init__(self, model_name: str, max_concurrent: int = 2):
+         self.model = TextEmbedding(model_name)
+         self.executor = ThreadPoolExecutor(max_workers=1)
+         self.semaphore = asyncio.Semaphore(max_concurrent)
+
+     async def embed_documents(self, texts: List[str]) -> List[List[float]]:
+         """Generate embeddings with concurrency control and retry."""
+         async with self.semaphore:
+             loop = asyncio.get_event_loop()
+             embeddings = await loop.run_in_executor(
+                 self.executor,
+                 lambda: list(self.model.embed(texts))
+             )
+             return [embedding.tolist() for embedding in embeddings]
+
+     async def close(self):
+         """Shutdown executor properly."""
+         # FIXED: Use proper shutdown with wait=True
+         self.executor.shutdown(wait=True, cancel_futures=True)
+
+
+ class VoyageProvider(EmbeddingProvider):
+     """Voyage AI provider for cloud embeddings."""
+
+     def __init__(self, api_key: str, max_concurrent: int = 2):
+         import voyageai
+         self.client = voyageai.Client(api_key=api_key)
+         self.semaphore = asyncio.Semaphore(max_concurrent)
+
+     async def embed_documents(self, texts: List[str]) -> List[List[float]]:
+         """Generate embeddings using Voyage AI."""
+         async with self.semaphore:
+             loop = asyncio.get_event_loop()
+             # Run Voyage API call in executor to avoid blocking
+             result = await loop.run_in_executor(
+                 None,
+                 lambda: self.client.embed(
+                     texts=texts,
+                     model="voyage-3",
+                     input_type="document"
+                 )
+             )
+             return result.embeddings
+
+     async def close(self):
+         """Cleanup resources."""
+         pass  # Voyage client doesn't need explicit cleanup
+
+
+ class QdrantService:
+     """Qdrant service with proper backpressure and retries."""
+
+     def __init__(self, config: Config, embedding_provider: EmbeddingProvider):
+         self.config = config
+         self.client = AsyncQdrantClient(url=config.qdrant_url)
+         self.embedding_provider = embedding_provider
+         self._collection_cache: Dict[str, float] = {}  # name -> timestamp
+         self.request_semaphore = asyncio.Semaphore(config.max_concurrent_qdrant)
+
+     async def ensure_collection(self, collection_name: str) -> None:
+         """Ensure collection exists with TTL cache."""
+         now = time.time()
+
+         # Check cache with TTL
+         if collection_name in self._collection_cache:
+             if now - self._collection_cache[collection_name] < self.config.collection_cache_ttl:
+                 return
+
+         # Enforce cache size limit
+         if len(self._collection_cache) >= self.config.collection_cache_max_size:
+             # Remove oldest entry
+             oldest = min(self._collection_cache.items(), key=lambda x: x[1])
+             del self._collection_cache[oldest[0]]
+
+         async with self.request_semaphore:
+             try:
+                 await asyncio.wait_for(
+                     self.client.get_collection(collection_name),
+                     timeout=self.config.qdrant_timeout_s
+                 )
+                 self._collection_cache[collection_name] = now
+                 logger.debug(f"Collection {collection_name} exists")
+             except (UnexpectedResponse, asyncio.TimeoutError):
+                 # Create collection
+                 vector_size = 1024 if "_voyage" in collection_name else self.config.vector_size
+
+                 try:
+                     await asyncio.wait_for(
+                         self.client.create_collection(
+                             collection_name=collection_name,
+                             vectors_config=models.VectorParams(
+                                 size=vector_size,
+                                 distance=models.Distance.COSINE
+                             ),
+                             optimizers_config=models.OptimizersConfigDiff(
+                                 indexing_threshold=100
+                             )
+                         ),
+                         timeout=self.config.qdrant_timeout_s
+                     )
+                     self._collection_cache[collection_name] = now
+                     logger.info(f"Created collection {collection_name}")
+                 except UnexpectedResponse as e:
+                     if "already exists" in str(e):
+                         self._collection_cache[collection_name] = now
+                     else:
+                         raise
+
+     async def store_points_with_retry(
+         self,
+         collection_name: str,
+         points: List[models.PointStruct]
+     ) -> bool:
+         """Store points with retry logic and proper acknowledgment."""
+         if not points:
+             return True
+
+         for attempt in range(self.config.max_retries):
+             try:
+                 async with self.request_semaphore:
+                     # FIXED: Create task for proper cancellation on timeout
+                     task = asyncio.create_task(
+                         self.client.upsert(
+                             collection_name=collection_name,
+                             points=points,
+                             wait=True  # CRITICAL: Wait for acknowledgment
+                         )
+                     )
+                     await asyncio.wait_for(task, timeout=self.config.qdrant_timeout_s)
+                     logger.debug(f"Stored {len(points)} points in {collection_name}")
+                     return True
+
+             except asyncio.TimeoutError:
+                 # FIXED: Cancel the background operation
+                 task.cancel()
+                 try:
+                     await task
+                 except asyncio.CancelledError:
+                     pass
+                 logger.warning(f"Timeout storing points (attempt {attempt + 1}/{self.config.max_retries})")
+                 if attempt < self.config.max_retries - 1:
+                     await asyncio.sleep(self.config.retry_delay_s * (2 ** attempt))  # Exponential backoff
+             except Exception as e:
+                 logger.error(f"Error storing points: {e}")
+                 if attempt < self.config.max_retries - 1:
+                     await asyncio.sleep(self.config.retry_delay_s)
+
+         return False
+
+     async def close(self):
+         """Close client connection."""
+         # AsyncQdrantClient doesn't have explicit close, but we can clear cache
+         self._collection_cache.clear()
+
+
+ class TokenAwareChunker:
+     """Memory-efficient streaming chunker."""
+
+     def __init__(self, chunk_size_tokens: int = 400, chunk_overlap_tokens: int = 75):
+         self.chunk_size_chars = chunk_size_tokens * 4
+         self.chunk_overlap_chars = chunk_overlap_tokens * 4
+         logger.info(f"TokenAwareChunker: {chunk_size_tokens} tokens (~{self.chunk_size_chars} chars)")
+
+     def chunk_text_stream(self, text: str) -> Generator[str, None, None]:
+         """Stream chunks without holding all in memory."""
+         if not text:
+             return
+
+         if len(text) <= self.chunk_size_chars:
+             yield text
+             return
+
+         start = 0
+         while start < len(text):
+             end = min(start + self.chunk_size_chars, len(text))
+
+             if end < len(text):
+                 # Find natural boundary
+                 for separator in ['. ', '.\n', '! ', '? ', '\n\n', '\n', ' ']:
+                     last_sep = text.rfind(separator, start, end)
+                     if last_sep > start + (self.chunk_size_chars // 2):
+                         end = last_sep + len(separator)
+                         break
+
+             chunk = text[start:end].strip()
+             if chunk:
+                 yield chunk
+
+             if end >= len(text):
+                 break
+             start = max(start + 1, end - self.chunk_overlap_chars)
+
+
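# Editor's note (not part of the diff): a usage sketch of the chunker above,
# under its own 4-characters-per-token approximation. With the defaults,
# chunks top out near 1,600 characters with roughly 300 characters of overlap:
#     chunker = TokenAwareChunker()
#     for chunk in chunker.chunk_text_stream(long_text):  # long_text: any str
#         handle(chunk)  # handle() is a placeholder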
+ class CPUMonitor:
+     """Non-blocking CPU monitoring with cgroup awareness."""
+
+     def __init__(self, max_cpu_per_core: float):
+         self.process = psutil.Process()
+         # FIXED: Use cgroup-aware CPU count
+         effective_cores = get_effective_cpus()
+         self.max_total_cpu = max_cpu_per_core * effective_cores
+         logger.info(f"CPU Monitor: {effective_cores:.1f} effective cores, {self.max_total_cpu:.1f}% limit")
+
+         # FIXED: Initialize CPU tracking properly
+         self.process.cpu_percent(interval=None)  # First call to initialize
+         time.sleep(0.01)  # Brief pause
+         self.last_check = time.time()
+         self.last_cpu = self.process.cpu_percent(interval=None)
+
+     def get_cpu_nowait(self) -> float:
+         """Get CPU without blocking (uses cached value)."""
+         now = time.time()
+         if now - self.last_check > 1.0:  # Update every second
+             val = self.process.cpu_percent(interval=None)
+             # FIXED: Guard against 0.0 from uninitialized reads
+             if val == 0.0 and self.last_cpu == 0.0:
+                 # Best effort quick second sample
+                 time.sleep(0.01)
+                 val = self.process.cpu_percent(interval=None)
+             self.last_cpu = val
+             self.last_check = now
+         return self.last_cpu
+
+     def should_throttle(self) -> bool:
+         """Check if we should throttle based on CPU."""
+         return self.get_cpu_nowait() > self.max_total_cpu
+
+
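# Editor's note (not part of the diff): psutil reports per-process CPU as a
# percentage of a single core, so a multi-threaded process can exceed 100%.
# Hence the threshold above is max_cpu_per_core * effective_cores, e.g.
# 50% per core on 2 effective cores yields a 100% throttle limit.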
+ class QueueManager:
+     """Manage file processing queue with limits."""
+
+     def __init__(self, max_size: int, max_age_hours: int):
+         self.max_size = max_size
+         self.max_age = timedelta(hours=max_age_hours)
+         self.queue: deque = deque(maxlen=max_size)
+         self.processed_count = 0
+         self.deferred_count = 0  # FIXED: Track deferred vs dropped
+
+     def add_files(self, files: List[Tuple[Path, datetime]]) -> int:
+         """Add files to queue, return number added."""
+         added = 0
+         overflow = []
+
+         for file_path, mod_time in files:
+             if len(self.queue) >= self.max_size:
+                 overflow.append((file_path, mod_time))
+             else:
+                 self.queue.append((file_path, mod_time))
+                 added += 1
+
+         # FIXED: More accurate logging and alerting
+         if overflow:
+             self.deferred_count += len(overflow)
+             oldest = min(overflow, key=lambda x: x[1])
+             logger.critical(f"QUEUE OVERFLOW: {len(overflow)} files deferred to next cycle. "
+                             f"Oldest: {oldest[0].name} ({(datetime.now() - oldest[1]).total_seconds() / 3600:.1f}h old). "
+                             f"Consider increasing MAX_QUEUE_SIZE or BATCH_SIZE")
+
+         return added
+
+     def get_batch(self, batch_size: int) -> List[Path]:
+         """Get next batch of files, prioritizing oldest."""
+         batch = []
+         now = datetime.now()
+
+         # Check for stale files
+         if self.queue:
+             oldest_time = self.queue[0][1]
+             if now - oldest_time > self.max_age:
+                 logger.warning(f"BACKLOG ALERT: Oldest file is {(now - oldest_time).total_seconds() / 3600:.1f} hours old")
+
+         # Get batch (process oldest first)
+         for _ in range(min(batch_size, len(self.queue))):
+             if self.queue:
+                 file_path, _ = self.queue.popleft()
+                 batch.append(file_path)
+                 self.processed_count += 1
+
+         return batch
+
+     def get_metrics(self) -> Dict[str, Any]:
+         """Get queue metrics."""
+         return {
+             "queue_size": len(self.queue),
+             "processed": self.processed_count,
+             "deferred": self.deferred_count,  # FIXED: Use deferred instead of dropped
+             "oldest_age_hours": self._get_oldest_age()
+         }
+
+     def _get_oldest_age(self) -> float:
+         """Get age of oldest item in hours."""
+         if not self.queue:
+             return 0
+         oldest_time = self.queue[0][1]
+         return (datetime.now() - oldest_time).total_seconds() / 3600
+
+
+ class StreamingImporter:
+     """Production-ready streaming importer."""
+
+     def __init__(self, config: Config):
+         self.config = config
+         self.state: Dict[str, Any] = {}
+         self.embedding_provider = self._create_embedding_provider()
+
+         # Update vector_size based on embedding provider
+         if isinstance(self.embedding_provider, VoyageProvider):
+             self.config.vector_size = 1024  # Voyage uses 1024 dimensions
+
+         self.qdrant_service = QdrantService(config, self.embedding_provider)
+         self.chunker = TokenAwareChunker()
+         self.cpu_monitor = CPUMonitor(config.max_cpu_percent_per_core)
+         self.queue_manager = QueueManager(config.max_queue_size, config.max_backlog_hours)
+
+         self.stats = {
+             "files_processed": 0,
+             "chunks_processed": 0,
+             "failures": 0,
+             "start_time": time.time()
+         }
+
+         self.shutdown_event = asyncio.Event()
+
+     def _create_embedding_provider(self) -> EmbeddingProvider:
+         """Create embedding provider with config."""
+         if not self.config.prefer_local_embeddings and self.config.voyage_api_key:
+             logger.info("Using Voyage AI for cloud embeddings")
+             return VoyageProvider(
+                 self.config.voyage_api_key,
+                 self.config.max_concurrent_embeddings
+             )
+         else:
+             logger.info(f"Using FastEmbed: {self.config.embedding_model}")
+             return FastEmbedProvider(
+                 self.config.embedding_model,
+                 self.config.max_concurrent_embeddings
+             )
+
+     async def load_state(self) -> None:
+         """Load persisted state."""
+         if self.config.state_file.exists():
+             try:
+                 with open(self.config.state_file, 'r') as f:
+                     self.state = json.load(f)
+                 logger.info(f"Loaded state with {len(self.state.get('imported_files', {}))} files")
+             except Exception as e:
+                 logger.error(f"Error loading state: {e}")
+                 self.state = {}
+
+         if "imported_files" not in self.state:
+             self.state["imported_files"] = {}
+         if "high_water_mark" not in self.state:
+             self.state["high_water_mark"] = 0
+
+     async def save_state(self) -> None:
+         """Save state atomically with fsync."""
+         try:
+             self.config.state_file.parent.mkdir(parents=True, exist_ok=True)
+             temp_file = self.config.state_file.with_suffix('.tmp')
+
+             # FIXED: Write with fsync for durability
+             with open(temp_file, 'w') as f:
+                 json.dump(self.state, f, indent=2)
+                 f.flush()
+                 os.fsync(f.fileno())
+
+             # FIXED: Platform-specific atomic replacement
+             if platform.system() == 'Windows':
+                 # Windows requires explicit removal
+                 if self.config.state_file.exists():
+                     self.config.state_file.unlink()
+                 temp_file.rename(self.config.state_file)
+             else:
+                 # POSIX atomic rename
+                 os.replace(temp_file, self.config.state_file)
+
+             # Optionally fsync directory for stronger guarantees
+             try:
+                 dir_fd = os.open(str(self.config.state_file.parent), os.O_DIRECTORY)
+                 os.fsync(dir_fd)
+                 os.close(dir_fd)
+             except (OSError, AttributeError):
+                 pass  # Directory fsync is best-effort
+
+         except Exception as e:
+             logger.error(f"Error saving state: {e}")
+
+     def get_memory_usage_mb(self) -> float:
+         """Get current memory usage."""
+         process = psutil.Process()
+         return process.memory_info().rss / 1024 / 1024
+
+     async def memory_cleanup(self) -> None:
+         """Perform memory cleanup."""
+         collected = gc.collect()
+         if MALLOC_TRIM_AVAILABLE:
+             malloc_trim(0)
+         logger.debug(f"Memory cleanup: collected {collected} objects")
+
+     def get_collection_name(self, project_path: str) -> str:
+         """Get collection name for project."""
+         normalized = normalize_project_name(project_path)
+         project_hash = hashlib.md5(normalized.encode()).hexdigest()[:8]
+         suffix = "_local" if self.config.prefer_local_embeddings else "_voyage"
+         return f"{self.config.collection_prefix}_{project_hash}{suffix}"
+
+     def _extract_message_text(self, content: Any) -> str:
+         """Extract text from message content."""
+         if isinstance(content, str):
+             return content
+         elif isinstance(content, list):
+             text_parts = []
+             for item in content:
+                 if isinstance(item, str):
+                     text_parts.append(item)
+                 elif isinstance(item, dict):
+                     if item.get('type') == 'text':
+                         text_parts.append(item.get('text', ''))
+             return ' '.join(text_parts)
+         return str(content) if content else ''
+
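# Editor's note (not part of the diff): collection names are derived, not
# configured. Given get_collection_name above, a project's collection can be
# reproduced outside the importer with:
#     digest = hashlib.md5(normalize_project_name(project).encode()).hexdigest()[:8]
#     name = f"conv_{digest}_local"  # "_voyage" when cloud embeddings are active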
+     async def process_file(self, file_path: Path) -> bool:
+         """Process a single file with proper error handling."""
+         try:
+             # FIXED: Memory check with GC overhead buffer
+             memory_usage = self.get_memory_usage_mb()
+             memory_threshold = self.config.memory_limit_mb * 0.85  # 15% buffer
+             if memory_usage > memory_threshold:
+                 await self.memory_cleanup()
+                 if self.get_memory_usage_mb() > memory_threshold:
+                     logger.warning(f"Memory limit exceeded ({memory_usage:.1f}MB > {memory_threshold:.1f}MB), skipping {file_path}")
+                     return False
+
+             project_path = file_path.parent.name  # Use just the project directory name, not full path
+             collection_name = self.get_collection_name(project_path)
+             conversation_id = file_path.stem
+
+             logger.info(f"Processing: {file_path.name}")
+
+             await self.qdrant_service.ensure_collection(collection_name)
+
+             # Read messages
+             all_messages = []
+             with open(file_path, 'r') as f:
+                 for line in f:
+                     if line.strip():
+                         try:
+                             data = json.loads(line)
+
+                             # Skip summary messages
+                             if data.get('type') == 'summary':
+                                 continue
+
+                             # Handle messages with type user/assistant that have nested message
+                             if data.get('type') in ['user', 'assistant'] and 'message' in data:
+                                 all_messages.append(data['message'])
+                             elif 'message' in data and data['message']:
+                                 all_messages.append(data['message'])
+                             elif 'role' in data and 'content' in data:
+                                 all_messages.append(data)
+                         except json.JSONDecodeError:
+                             continue
+
+             if not all_messages:
+                 logger.warning(f"No messages in {file_path}")
+                 return True  # Mark as processed
+
+             # Extract metadata
+             tool_usage = extract_tool_usage_from_conversation(all_messages)
+
+             # Build text efficiently
+             text_parts = []
+             for msg in all_messages:
+                 role = msg.get('role', 'unknown')
+                 content = msg.get('content', '')
+                 text = self._extract_message_text(content)
+                 if text:
+                     text_parts.append(f"{role}: {text}")
+
+             combined_text = "\n\n".join(text_parts)
+             if not combined_text.strip():
+                 return True
+
+             concepts = extract_concepts(combined_text, tool_usage)
+
+             # Process chunks in streaming fashion
+             chunks_processed = 0
+             chunk_index = 0
+
+             for chunk_text in self.chunker.chunk_text_stream(combined_text):
+                 # Check for shutdown
+                 if self.shutdown_event.is_set():
+                     return False
+
+                 # CPU throttling
+                 if self.cpu_monitor.should_throttle():
+                     await asyncio.sleep(0.5)
+
+                 # FIXED: Generate embedding with retry
+                 embeddings = None
+                 for attempt in range(self.config.max_retries):
+                     try:
+                         embeddings = await self.embedding_provider.embed_documents([chunk_text])
+                         break
+                     except Exception as e:
+                         logger.warning(f"Embed failed (attempt {attempt+1}/{self.config.max_retries}): {e}")
+                         if attempt < self.config.max_retries - 1:
+                             await asyncio.sleep(self.config.retry_delay_s * (2 ** attempt))
+
+                 if not embeddings:
+                     logger.error(f"Failed to embed chunk {chunk_index} for {conversation_id}")
+                     self.stats["failures"] += 1
+                     continue  # Skip this chunk but continue with others
+
+                 # Create payload
+                 payload = {
+                     "text": chunk_text[:10000],  # Limit text size
+                     "conversation_id": conversation_id,
+                     "chunk_index": chunk_index,
+                     "message_count": len(all_messages),
+                     "project": normalize_project_name(project_path),
+                     "timestamp": datetime.now().isoformat(),
+                     "total_length": len(chunk_text),
+                     "chunking_version": "v2",
+                     "concepts": concepts,
+                     "files_analyzed": tool_usage['files_analyzed'],
+                     "files_edited": tool_usage['files_edited'],
+                     "tools_used": tool_usage['tools_used']
+                 }
+
+                 # Create point
+                 point_id_str = hashlib.md5(
+                     f"{conversation_id}_{chunk_index}".encode()
+                 ).hexdigest()[:16]
+                 point_id = int(point_id_str, 16) % (2**63)
+
+                 point = models.PointStruct(
+                     id=point_id,
+                     vector=embeddings[0],
+                     payload=payload
+                 )
+
+                 # Store with retry
+                 success = await self.qdrant_service.store_points_with_retry(
+                     collection_name,
+                     [point]
+                 )
+
+                 if not success:
+                     logger.error(f"Failed to store chunk {chunk_index} for {conversation_id}")
+                     self.stats["failures"] += 1
+                 else:
+                     chunks_processed += 1
+
+                 chunk_index += 1
+
+                 # Memory check mid-file
+                 if chunk_index % 10 == 0:
+                     if self.get_memory_usage_mb() > memory_threshold:
+                         await self.memory_cleanup()
+
+             # Critical fix: Only mark as imported if we actually processed chunks
+             if chunks_processed > 0:
+                 # Update state with cached timestamp for efficiency
+                 self.state["imported_files"][str(file_path)] = {
+                     "imported_at": datetime.now().isoformat(),
+                     "_parsed_time": datetime.now().timestamp(),  # FIXED: Cache parsed timestamp
+                     "chunks": chunks_processed,
+                     "collection": collection_name
+                 }
+
+                 self.stats["files_processed"] += 1
+                 self.stats["chunks_processed"] += chunks_processed
+
+                 logger.info(f"Completed: {file_path.name} ({chunks_processed} chunks)")
+                 return True
+             else:
+                 logger.warning(f"File produced 0 chunks, not marking as imported: {file_path.name}")
+                 return False
+
+         except Exception as e:
+             logger.error(f"Error processing {file_path}: {e}")
+             self.stats["failures"] += 1
+             return False
+
+     async def find_new_files(self) -> List[Tuple[Path, datetime]]:
+         """Find new files efficiently using high water mark."""
+         # FIXED: Guard against missing logs_dir
+         if not self.config.logs_dir.exists():
+             logger.warning(f"Logs dir not found: {self.config.logs_dir}")
+             return []
+
+         new_files = []
+         high_water_mark = self.state.get("high_water_mark", 0)
+         new_high_water = high_water_mark
+
+         try:
+             for project_dir in self.config.logs_dir.iterdir():
+                 if not project_dir.is_dir():
+                     continue
+
+                 try:
+                     for jsonl_file in project_dir.glob("*.jsonl"):
+                         file_mtime = jsonl_file.stat().st_mtime
+                         new_high_water = max(new_high_water, file_mtime)
+
+                         # Skip if already processed (using cached timestamp)
+                         if str(jsonl_file) in self.state["imported_files"]:
+                             stored = self.state["imported_files"][str(jsonl_file)]
+                             # FIXED: Use cached parsed timestamp for efficiency
+                             if "_parsed_time" in stored:
+                                 if file_mtime <= stored["_parsed_time"]:
+                                     continue
+                             elif "imported_at" in stored:
+                                 import_time = datetime.fromisoformat(stored["imported_at"]).timestamp()
+                                 stored["_parsed_time"] = import_time  # Cache for next time
+                                 if file_mtime <= import_time:
+                                     continue
+
+                         # Add to queue
+                         new_files.append((jsonl_file, datetime.fromtimestamp(file_mtime)))
+                 except Exception as e:
+                     logger.error(f"Error scanning project dir {project_dir}: {e}")
+
+         except Exception as e:
+             logger.error(f"Error scanning logs dir {self.config.logs_dir}: {e}")
+
+         # Update high water mark
+         self.state["high_water_mark"] = new_high_water
+
+         # Sort by age (oldest first) to prevent starvation
+         new_files.sort(key=lambda x: x[1])
+
+         return new_files
+
+     async def run_continuous(self) -> None:
+         """Main loop with proper shutdown handling."""
+         logger.info("Starting production streaming importer v2.5.17 FINAL")
+         logger.info(f"CPU limit: {self.cpu_monitor.max_total_cpu:.1f}%")
+         logger.info(f"Queue size: {self.config.max_queue_size}")
+         logger.info(f"State file: {self.config.state_file}")
+
+         await self.load_state()
+
+         try:
+             while not self.shutdown_event.is_set():
+                 try:
+                     # Find new files
+                     new_files = await self.find_new_files()
+
+                     if new_files:
+                         added = self.queue_manager.add_files(new_files)
+                         logger.info(f"Added {added} files to queue")
+
+                     # Process batch
+                     batch = self.queue_manager.get_batch(self.config.batch_size)
+
+                     for file_path in batch:
+                         if self.shutdown_event.is_set():
+                             break
+
+                         success = await self.process_file(file_path)
+
+                         # Save state after each file for durability
+                         if success:
+                             await self.save_state()
+
+                     # Log metrics
+                     if batch:
+                         metrics = self.queue_manager.get_metrics()
+                         cpu = self.cpu_monitor.get_cpu_nowait()
+                         mem = self.get_memory_usage_mb()
+                         logger.info(f"Metrics: Queue={metrics['queue_size']}, "
+                                     f"CPU={cpu:.1f}%, Mem={mem:.1f}MB, "
+                                     f"Processed={self.stats['files_processed']}, "
+                                     f"Failures={self.stats['failures']}")
+
+                         if metrics['oldest_age_hours'] > self.config.max_backlog_hours:
+                             logger.error(f"BACKLOG ALERT: Oldest file is {metrics['oldest_age_hours']:.1f} hours old")
+
+                     # Wait before next cycle
+                     await asyncio.sleep(self.config.import_frequency)
+
+                 except Exception as e:
+                     logger.error(f"Error in main loop: {e}")
+                     await asyncio.sleep(self.config.import_frequency)
+
+         except asyncio.CancelledError:
+             # FIXED: Handle CancelledError properly
+             logger.info("Main task cancelled")
+             raise
+         finally:
+             # Cleanup
+             logger.info("Shutting down...")
+             await self.save_state()
+             await self.embedding_provider.close()
+             await self.qdrant_service.close()
+             logger.info("Shutdown complete")
+
+     async def shutdown(self):
+         """Trigger graceful shutdown."""
+         logger.info("Shutdown requested")
+         self.shutdown_event.set()
+
+
+ async def main():
+     """Main entry point with signal handling."""
+     config = Config()
+     importer = StreamingImporter(config)
+
+     # FIXED: Setup signal handlers using asyncio-native approach
+     import signal
+
+     loop = asyncio.get_running_loop()
+
+     # Define shutdown handler
+     def shutdown_handler():
+         logger.info("Received shutdown signal")
+         importer.shutdown_event.set()
+
+     # Use asyncio-native signal handling on Unix
+     try:
+         for sig in (signal.SIGINT, signal.SIGTERM):
+             loop.add_signal_handler(sig, shutdown_handler)
+     except NotImplementedError:
+         # Fallback for Windows, where the event loop does not implement
+         # add_signal_handler
+         def signal_handler(sig, frame):
+             logger.info(f"Received signal {sig}")
+             # Set the shutdown event directly - it's thread-safe
+             importer.shutdown_event.set()
+
+         signal.signal(signal.SIGINT, signal_handler)
+         signal.signal(signal.SIGTERM, signal_handler)
+
+     try:
+         await importer.run_continuous()
+     except (KeyboardInterrupt, asyncio.CancelledError):
+         await importer.shutdown()
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
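
The importer takes all of its tuning from the environment variables read by Config at the top of the file. A minimal launch sketch, assuming the 6.0.1 layout shown in the file list and a local Qdrant; every value below is illustrative, not a recommendation:

    # Editor's sketch (not shipped by the package): running the streaming
    # importer with explicit settings for its Config environment variables.
    import os
    import subprocess

    env = dict(
        os.environ,
        QDRANT_URL="http://localhost:6333",   # where points are upserted
        PREFER_LOCAL_EMBEDDINGS="true",       # FastEmbed rather than Voyage AI
        LOGS_DIR=os.path.expanduser("~/.claude/projects"),
        MEMORY_LIMIT_MB="2048",
        MAX_QUEUE_SIZE="100",
        IMPORT_FREQUENCY="10",                # seconds between scan cycles
    )
    subprocess.run(
        ["python", "src/runtime/streaming-importer.py"],
        env=env,
        check=True,
    )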