claude-self-reflect 5.0.6 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. package/.claude/agents/open-source-maintainer.md +1 -1
  2. package/.claude/agents/reflection-specialist.md +2 -2
  3. package/Dockerfile.async-importer +6 -4
  4. package/Dockerfile.importer +6 -6
  5. package/Dockerfile.safe-watcher +8 -8
  6. package/Dockerfile.streaming-importer +8 -1
  7. package/Dockerfile.watcher +8 -16
  8. package/docker-compose.yaml +12 -6
  9. package/installer/.claude/agents/README.md +138 -0
  10. package/package.json +5 -26
  11. package/src/__init__.py +0 -0
  12. package/src/cli/__init__.py +0 -0
  13. package/src/runtime/__init__.py +0 -0
  14. package/src/runtime/import-latest.py +124 -0
  15. package/{scripts → src/runtime}/precompact-hook.sh +1 -1
  16. package/src/runtime/streaming-importer.py +995 -0
  17. package/{scripts → src/runtime}/watcher-loop.sh +1 -1
  18. package/.claude/agents/claude-self-reflect-test.md +0 -1274
  19. package/.claude/agents/reflect-tester.md +0 -300
  20. package/scripts/add-timestamp-indexes.py +0 -134
  21. package/scripts/ast_grep_final_analyzer.py +0 -338
  22. package/scripts/ast_grep_unified_registry.py +0 -710
  23. package/scripts/check-collections.py +0 -29
  24. package/scripts/debug-august-parsing.py +0 -80
  25. package/scripts/debug-import-single.py +0 -91
  26. package/scripts/debug-project-resolver.py +0 -82
  27. package/scripts/debug-temporal-tools.py +0 -135
  28. package/scripts/import-conversations-enhanced.py +0 -672
  29. package/scripts/migrate-to-unified-state.py +0 -426
  30. package/scripts/session_quality_tracker.py +0 -671
  31. package/scripts/update_patterns.py +0 -334
  32. package/{scripts → src}/importer/__init__.py +0 -0
  33. package/{scripts → src}/importer/__main__.py +0 -0
  34. package/{scripts → src}/importer/core/__init__.py +0 -0
  35. package/{scripts → src}/importer/core/config.py +0 -0
  36. package/{scripts → src}/importer/core/exceptions.py +0 -0
  37. package/{scripts → src}/importer/core/models.py +0 -0
  38. package/{scripts → src}/importer/embeddings/__init__.py +0 -0
  39. package/{scripts → src}/importer/embeddings/base.py +0 -0
  40. package/{scripts → src}/importer/embeddings/fastembed_provider.py +0 -0
  41. package/{scripts → src}/importer/embeddings/validator.py +0 -0
  42. package/{scripts → src}/importer/embeddings/voyage_provider.py +0 -0
  43. package/{scripts → src}/importer/main.py +0 -0
  44. package/{scripts → src}/importer/processors/__init__.py +0 -0
  45. package/{scripts → src}/importer/processors/ast_extractor.py +0 -0
  46. package/{scripts → src}/importer/processors/chunker.py +0 -0
  47. package/{scripts → src}/importer/processors/concept_extractor.py +0 -0
  48. package/{scripts → src}/importer/processors/conversation_parser.py +0 -0
  49. package/{scripts → src}/importer/processors/tool_extractor.py +0 -0
  50. package/{scripts → src}/importer/state/__init__.py +0 -0
  51. package/{scripts → src}/importer/state/state_manager.py +0 -0
  52. package/{scripts → src}/importer/storage/__init__.py +0 -0
  53. package/{scripts → src}/importer/storage/qdrant_storage.py +0 -0
  54. package/{scripts → src}/importer/utils/__init__.py +0 -0
  55. package/{scripts → src}/importer/utils/logger.py +0 -0
  56. package/{scripts → src}/importer/utils/project_normalizer.py +0 -0
  57. package/{scripts → src/runtime}/delta-metadata-update-safe.py +0 -0
  58. package/{scripts → src/runtime}/delta-metadata-update.py +0 -0
  59. package/{scripts → src/runtime}/doctor.py +0 -0
  60. package/{scripts → src/runtime}/embedding_service.py +0 -0
  61. package/{scripts → src/runtime}/force-metadata-recovery.py +0 -0
  62. package/{scripts → src/runtime}/import-conversations-unified.py +0 -0
  63. package/{scripts → src/runtime}/import_strategies.py +0 -0
  64. package/{scripts → src/runtime}/message_processors.py +0 -0
  65. package/{scripts → src/runtime}/metadata_extractor.py +0 -0
  66. package/{scripts → src/runtime}/streaming-watcher.py +0 -0
  67. package/{scripts → src/runtime}/unified_state_manager.py +0 -0
  68. package/{scripts → src/runtime}/utils.py +0 -0
package/scripts/import-conversations-enhanced.py (deleted)
@@ -1,672 +0,0 @@
- #!/usr/bin/env python3
- """
- Enhanced import script that extracts tool usage metadata from conversations.
- Supports both local and Voyage AI embeddings with tool tracking.
- """
-
- import os
- import sys
- import json
- import glob
- import hashlib
- import gc
- import re
- import time
- from datetime import datetime, timedelta
- from typing import List, Dict, Any, Set, Tuple
- import logging
- from pathlib import Path
-
- from qdrant_client import QdrantClient
- from qdrant_client.models import (
-     VectorParams, Distance, PointStruct,
-     Filter, FieldCondition, MatchValue
- )
-
- from tenacity import (
-     retry,
-     stop_after_attempt,
-     wait_random_exponential,
- )
-
- # Configuration
- QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
- LOGS_DIR = os.getenv("LOGS_DIR", "/logs")
- STATE_FILE = os.getenv("STATE_FILE", "./config/imported-files-enhanced.json")
- BATCH_SIZE = int(os.getenv("BATCH_SIZE", "10"))
- PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "false").lower() == "true"
- VOYAGE_API_KEY = os.getenv("VOYAGE_KEY")
- DRY_RUN = os.getenv("DRY_RUN", "false").lower() == "true"
-
- # Set up logging
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
- logger = logging.getLogger(__name__)
-
- # Import timing stats
- timing_stats = {
-     "extract": [],
-     "chunk": [],
-     "embed": [],
-     "store": [],
-     "total": []
- }
-
- def normalize_path(path: str) -> str:
-     """Normalize file paths for consistency across platforms."""
-     if not path:
-         return ""
-
-     # Remove common prefixes
-     path = path.replace("/Users/", "~/")
-     path = path.replace("\\Users\\", "~\\")
-
-     # Convert to forward slashes
-     path = path.replace("\\", "/")
-
-     # Remove duplicate slashes
-     path = re.sub(r'/+', '/', path)
-
-     return path
-
- def extract_concepts(text: str, tool_usage: Dict[str, Any]) -> Set[str]:
-     """Extract high-level concepts from conversation and tool usage."""
-     concepts = set()
-
-     # Common development concepts with patterns
-     concept_patterns = {
-         'security': r'(security|vulnerability|CVE|injection|sanitize|escape|auth|token|JWT)',
-         'performance': r'(performance|optimization|speed|memory|efficient|benchmark|latency)',
-         'testing': r'(test|pytest|unittest|coverage|TDD|spec|assert)',
-         'docker': r'(docker|container|compose|dockerfile|kubernetes|k8s)',
-         'api': r'(API|REST|GraphQL|endpoint|webhook|http|request)',
-         'database': r'(database|SQL|query|migration|schema|postgres|mysql|mongodb)',
-         'authentication': r'(auth|login|token|JWT|session|oauth|permission)',
-         'debugging': r'(debug|error|exception|traceback|log|stack|trace)',
-         'refactoring': r'(refactor|cleanup|improve|restructure|optimize|technical debt)',
-         'deployment': r'(deploy|CI/CD|release|production|staging|rollout)',
-         'git': r'(git|commit|branch|merge|pull request|PR|rebase)',
-         'architecture': r'(architecture|design|pattern|structure|component|module)',
-         'mcp': r'(MCP|claude-self-reflect|tool|agent|claude code)',
-         'embeddings': r'(embedding|vector|semantic|similarity|fastembed|voyage)',
-         'search': r'(search|query|find|filter|match|relevance)'
-     }
-
-     # Check text content
-     combined_text = text.lower()
-     for concept, pattern in concept_patterns.items():
-         if re.search(pattern, combined_text, re.IGNORECASE):
-             concepts.add(concept)
-
-     # Check tool usage patterns
-     tool_text = json.dumps(tool_usage).lower()
-     for concept, pattern in concept_patterns.items():
-         if re.search(pattern, tool_text, re.IGNORECASE):
-             concepts.add(concept)
-
-     # Add concepts based on specific tool usage
-     if tool_usage.get('grep_searches'):
-         concepts.add('search')
-     if tool_usage.get('files_edited') or tool_usage.get('files_created'):
-         concepts.add('development')
-     if any('test' in str(f).lower() for f in tool_usage.get('files_read', [])):
-         concepts.add('testing')
-     if any('docker' in str(cmd).lower() for cmd in tool_usage.get('bash_commands', [])):
-         concepts.add('docker')
-
-     return concepts
-
- def extract_tool_usage_from_jsonl(jsonl_path: str) -> Dict[str, Any]:
-     """Extract all tool usage from a conversation."""
-     tool_usage = {
-         "files_read": [],
-         "files_edited": [],
-         "files_created": [],
-         "grep_searches": [],
-         "bash_commands": [],
-         "glob_patterns": [],
-         "task_calls": [],
-         "mcp_calls": [],
-         "tools_summary": {},
-         "concepts": set(),
-         "timing": {},
-         "errors": [],
-         "tool_results": {}
-     }
-
-     start_time = time.time()
-
-     with open(jsonl_path, 'r', encoding='utf-8') as f:
-         for line_num, line in enumerate(f, 1):
-             line = line.strip()
-             if not line:
-                 continue
-
-             try:
-                 data = json.loads(line)
-
-                 # Skip API error messages
-                 if data.get('isApiErrorMessage'):
-                     continue
-
-                 # Process message content
-                 if 'message' in data and 'content' in data['message']:
-                     content = data['message']['content']
-
-                     # Handle content array (where tool_use lives)
-                     if isinstance(content, list):
-                         for item in content:
-                             if isinstance(item, dict) and item.get('type') == 'tool_use':
-                                 extract_single_tool_use(item, tool_usage)
-
-             except json.JSONDecodeError as e:
-                 logger.debug(f"Skipping invalid JSON at line {line_num}: {e}")
-             except Exception as e:
-                 logger.error(f"Error processing line {line_num}: {e}")
-                 tool_usage["errors"].append({"line": line_num, "error": str(e)})
-
-     # Calculate timing
-     tool_usage["timing"]["extract_ms"] = int((time.time() - start_time) * 1000)
-
-     # Convert sets to lists for JSON serialization
-     tool_usage["concepts"] = list(tool_usage["concepts"])
-
-     return tool_usage
-
- def extract_single_tool_use(tool_data: Dict[str, Any], usage_dict: Dict[str, Any]) -> None:
-     """Parse individual tool usage with enhanced metadata extraction."""
-     tool_name = tool_data.get('name')
-     inputs = tool_data.get('input', {})
-     tool_id = tool_data.get('id')
-
-     # Track tool frequency
-     usage_dict['tools_summary'][tool_name] = usage_dict['tools_summary'].get(tool_name, 0) + 1
-
-     # Extract based on tool type
-     if tool_name == 'Read':
-         path = inputs.get('file_path')
-         if path:
-             usage_dict['files_read'].append({
-                 'path': normalize_path(path),
-                 'offset': inputs.get('offset', 0),
-                 'limit': inputs.get('limit', -1),
-                 'tool_id': tool_id
-             })
-
-     elif tool_name == 'Grep':
-         pattern = inputs.get('pattern')
-         if pattern:
-             usage_dict['grep_searches'].append({
-                 'pattern': pattern[:100],  # Limit pattern length
-                 'path': normalize_path(inputs.get('path', '.')),
-                 'glob': inputs.get('glob'),
-                 'output_mode': inputs.get('output_mode', 'files_with_matches'),
-                 'case_insensitive': inputs.get('-i', False)
-             })
-             # Add search concept
-             usage_dict['concepts'].add('search')
-
-     elif tool_name == 'Edit' or tool_name == 'MultiEdit':
-         path = inputs.get('file_path')
-         if path:
-             usage_dict['files_edited'].append({
-                 'path': normalize_path(path),
-                 'operation': tool_name.lower()
-             })
-
-     elif tool_name == 'Write':
-         path = inputs.get('file_path')
-         if path:
-             usage_dict['files_created'].append(normalize_path(path))
-
-     elif tool_name == 'Bash':
-         cmd = inputs.get('command', '')
-         if cmd:
-             # Extract command name
-             cmd_parts = cmd.split()
-             cmd_name = cmd_parts[0] if cmd_parts else 'unknown'
-
-             usage_dict['bash_commands'].append({
-                 'command': cmd_name,
-                 'description': inputs.get('description', '')[:100]
-             })
-
-             # Add concepts based on commands
-             if 'docker' in cmd.lower():
-                 usage_dict['concepts'].add('docker')
-             if 'git' in cmd.lower():
-                 usage_dict['concepts'].add('git')
-             if 'test' in cmd.lower() or 'pytest' in cmd.lower():
-                 usage_dict['concepts'].add('testing')
-
-     elif tool_name == 'Glob':
-         pattern = inputs.get('pattern')
-         if pattern:
-             usage_dict['glob_patterns'].append({
-                 'pattern': pattern,
-                 'path': normalize_path(inputs.get('path', '.'))
-             })
-
-     elif tool_name == 'Task':
-         usage_dict['task_calls'].append({
-             'description': inputs.get('description', '')[:100],
-             'subagent_type': inputs.get('subagent_type')
-         })
-
-     # Handle MCP tools
-     elif tool_name and tool_name.startswith('mcp__'):
-         usage_dict['mcp_calls'].append({
-             'tool': tool_name,
-             'params': list(inputs.keys()) if inputs else []
-         })
-         usage_dict['concepts'].add('mcp')
-
- def create_enhanced_chunk(messages: List[Dict], chunk_index: int, tool_usage: Dict[str, Any],
-                           conversation_metadata: Dict[str, Any]) -> Dict[str, Any]:
-     """Create chunk with tool usage metadata."""
-     # Extract text from messages
-     chunk_text = "\n\n".join([
-         f"{msg['role'].upper()}: {msg['content']}"
-         for msg in messages
-     ])
-
-     # Extract concepts from chunk text and tool usage
-     concepts = extract_concepts(chunk_text, tool_usage)
-
-     # Deduplicate and clean file paths
-     all_file_items = tool_usage.get('files_read', []) + tool_usage.get('files_edited', [])
-     files_analyzed = list(set([
-         item['path'] if isinstance(item, dict) else item
-         for item in all_file_items
-         if (isinstance(item, dict) and item.get('path')) or isinstance(item, str)
-     ]))[:20]  # Limit to 20 files
-
-     files_edited = list(set([
-         item['path'] if isinstance(item, dict) else item
-         for item in tool_usage.get('files_edited', [])
-         if (isinstance(item, dict) and item.get('path')) or isinstance(item, str)
-     ]))[:10]  # Limit to 10 files
-
-     # Build enhanced chunk
-     chunk = {
-         "text": chunk_text,
-         "conversation_id": conversation_metadata['id'],
-         "chunk_index": chunk_index,
-         "timestamp": conversation_metadata['timestamp'],
-         "project": conversation_metadata['project'],
-         "start_role": messages[0]['role'] if messages else 'unknown',
-
-         # Tool usage metadata
-         "files_analyzed": files_analyzed,
-         "files_edited": files_edited,
-         "search_patterns": [s['pattern'] for s in tool_usage.get('grep_searches', [])][:10],
-         "concepts": list(concepts)[:15],
-         "tool_summary": dict(list(tool_usage.get('tools_summary', {}).items())[:10]),
-         "analysis_only": len(tool_usage.get('files_edited', [])) == 0 and len(tool_usage.get('files_created', [])) == 0,
-
-         # Additional context
-         "commands_used": list(set([c['command'] for c in tool_usage.get('bash_commands', [])]))[:10],
-         "has_security_check": 'security' in concepts,
-         "has_performance_check": 'performance' in concepts,
-         "mcp_tools_used": list(set([m['tool'].split('__')[1] if '__' in m['tool'] else m['tool']
-                                     for m in tool_usage.get('mcp_calls', [])]))[:5]
-     }
-
-     return chunk
-
- # Import state management functions (same as original)
- def load_state():
-     """Load the import state from file."""
-     if os.path.exists(STATE_FILE):
-         try:
-             with open(STATE_FILE, 'r') as f:
-                 state = json.load(f)
-                 if "imported_files" not in state:
-                     state["imported_files"] = {}
-                 return state
-         except Exception as e:
-             logger.warning(f"Failed to load state file: {e}")
-     return {"imported_files": {}}
-
- def save_state(state):
-     """Save the import state to file."""
-     try:
-         os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
-         temp_file = STATE_FILE + ".tmp"
-         with open(temp_file, 'w') as f:
-             json.dump(state, f, indent=2)
-         os.replace(temp_file, STATE_FILE)
-         logger.debug(f"Saved state with {len(state['imported_files'])} files")
-     except Exception as e:
-         logger.error(f"Failed to save state file: {e}")
-
- def should_import_file(file_path, state):
-     """Check if a file should be imported based on modification time."""
-     str_path = str(file_path)
-     file_mtime = os.path.getmtime(file_path)
-
-     if str_path in state["imported_files"]:
-         last_imported = state["imported_files"][str_path].get("last_imported", 0)
-         last_modified = state["imported_files"][str_path].get("last_modified", 0)
-
-         if file_mtime <= last_modified and last_imported > 0:
-             logger.info(f"Skipping unchanged file: {file_path.name}")
-             return False
-
-     return True
-
- def update_file_state(file_path, state, chunks_imported, tool_stats=None):
-     """Update the state for an imported file with tool usage stats."""
-     str_path = str(file_path)
-     state["imported_files"][str_path] = {
-         "last_modified": os.path.getmtime(file_path),
-         "last_imported": datetime.now().timestamp(),
-         "chunks_imported": chunks_imported,
-         "tool_stats": tool_stats or {}
-     }
-
- # Initialize embedding provider
- embedding_provider = None
- embedding_dimension = None
- collection_suffix = None
-
- if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
-     logger.info("Using local FastEmbed embeddings")
-     from fastembed import TextEmbedding
-     embedding_provider = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
-     embedding_dimension = 384
-     collection_suffix = "_local"
- else:
-     logger.info("Using Voyage AI embeddings")
-     import voyageai
-     vo = voyageai.Client(api_key=VOYAGE_API_KEY)
-     embedding_provider = vo
-     embedding_dimension = 1024
-     collection_suffix = "_voyage"
-
- # Initialize Qdrant client
- client = QdrantClient(url=QDRANT_URL)
-
- def chunk_conversation(messages: List[Dict], chunk_size: int = 10) -> List[Dict]:
-     """Split conversation into chunks of messages."""
-     chunks = []
-     for i in range(0, len(messages), chunk_size):
-         chunk_messages = messages[i:i + chunk_size]
-         chunks.append({
-             "messages": chunk_messages,
-             "chunk_index": i // chunk_size
-         })
-     return chunks
-
- @retry(stop=stop_after_attempt(3), wait=wait_random_exponential(min=1, max=20))
- def generate_embeddings(texts: List[str]) -> List[List[float]]:
-     """Generate embeddings for texts with retry logic."""
-     if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
-         embeddings = list(embedding_provider.embed(texts))
-         return [emb.tolist() if hasattr(emb, 'tolist') else emb for emb in embeddings]
-     else:
-         result = embedding_provider.embed(texts, model="voyage-3", input_type="document")
-         return result.embeddings
-
- def import_project(project_path: Path, state: Dict) -> int:
-     """Import conversations from a single project with tool usage extraction."""
-     total_chunks = 0
-     jsonl_files = list(project_path.glob("*.jsonl"))
-
-     if not jsonl_files:
-         return 0
-
-     # Create or verify collection
-     collection_name = f"conv_{hashlib.md5(project_path.name.encode()).hexdigest()[:8]}{collection_suffix}"
-
-     try:
-         collections = [c.name for c in client.get_collections().collections]
-         if collection_name not in collections:
-             client.create_collection(
-                 collection_name=collection_name,
-                 vectors_config=VectorParams(size=embedding_dimension, distance=Distance.COSINE)
-             )
-             logger.info(f"Created collection: {collection_name}")
-     except Exception as e:
-         logger.error(f"Failed to create/verify collection {collection_name}: {e}")
-         return 0
-
-     for jsonl_file in jsonl_files:
-         if not should_import_file(jsonl_file, state):
-             continue
-
-         logger.info(f"Processing file: {jsonl_file.name}")
-
-         try:
-             file_start_time = time.time()
-
-             # Extract tool usage
-             extract_start = time.time()
-             tool_usage = extract_tool_usage_from_jsonl(str(jsonl_file))
-             extract_time = time.time() - extract_start
-             timing_stats["extract"].append(extract_time)
-
-             # Read and process messages (original logic)
-             messages = []
-             created_at = None
-
-             with open(jsonl_file, 'r', encoding='utf-8') as f:
-                 for line_num, line in enumerate(f, 1):
-                     line = line.strip()
-                     if not line:
-                         continue
-
-                     try:
-                         data = json.loads(line)
-
-                         if created_at is None and 'timestamp' in data:
-                             created_at = data.get('timestamp')
-
-                         if data.get('type') == 'summary':
-                             continue
-
-                         if 'message' in data and data['message']:
-                             msg = data['message']
-                             if msg.get('role') and msg.get('content'):
-                                 content = msg['content']
-                                 if isinstance(content, list):
-                                     text_parts = []
-                                     for item in content:
-                                         if isinstance(item, dict) and item.get('type') == 'text':
-                                             text_parts.append(item.get('text', ''))
-                                         elif isinstance(item, str):
-                                             text_parts.append(item)
-                                     content = '\n'.join(text_parts)
-
-                                 if content:
-                                     messages.append({
-                                         'role': msg['role'],
-                                         'content': content
-                                     })
-                     except Exception as e:
-                         logger.error(f"Error processing line {line_num}: {e}")
-
-             if not messages:
-                 continue
-
-             # Prepare metadata
-             if created_at is None:
-                 created_at = datetime.now().isoformat()
-             conversation_id = jsonl_file.stem
-
-             conversation_metadata = {
-                 'id': conversation_id,
-                 'timestamp': created_at,
-                 'project': project_path.name
-             }
-
-             # Chunk the conversation
-             chunk_start = time.time()
-             chunks_data = chunk_conversation(messages)
-             enhanced_chunks = []
-
-             for chunk_data in chunks_data:
-                 enhanced_chunk = create_enhanced_chunk(
-                     chunk_data["messages"],
-                     chunk_data["chunk_index"],
-                     tool_usage,
-                     conversation_metadata
-                 )
-                 enhanced_chunks.append(enhanced_chunk)
-
-             chunk_time = time.time() - chunk_start
-             timing_stats["chunk"].append(chunk_time)
-
-             if not enhanced_chunks:
-                 continue
-
-             # Process in batches
-             for batch_start in range(0, len(enhanced_chunks), BATCH_SIZE):
-                 batch = enhanced_chunks[batch_start:batch_start + BATCH_SIZE]
-                 texts = [chunk["text"] for chunk in batch]
-
-                 # Generate embeddings
-                 embed_start = time.time()
-                 embeddings = generate_embeddings(texts)
-                 embed_time = time.time() - embed_start
-                 timing_stats["embed"].append(embed_time)
-
-                 # Create points
-                 points = []
-                 for chunk, embedding in zip(batch, embeddings):
-                     point_id = hashlib.md5(
-                         f"{conversation_id}_{chunk['chunk_index']}".encode()
-                     ).hexdigest()[:16]
-
-                     points.append(PointStruct(
-                         id=int(point_id, 16) % (2**63),
-                         vector=embedding,
-                         payload=chunk
-                     ))
-
-                 # Upload to Qdrant (unless dry run)
-                 if not DRY_RUN:
-                     store_start = time.time()
-                     client.upsert(
-                         collection_name=collection_name,
-                         points=points
-                     )
-                     store_time = time.time() - store_start
-                     timing_stats["store"].append(store_time)
-                 else:
-                     logger.info(f"[DRY RUN] Would upload {len(points)} points to {collection_name}")
-
-                 total_chunks += len(points)
-
-             file_chunks = len(enhanced_chunks)
-             total_time = time.time() - file_start_time
-             timing_stats["total"].append(total_time)
-
-             logger.info(f"Imported {file_chunks} chunks from {jsonl_file.name} "
-                         f"(extract: {extract_time:.2f}s, chunk: {chunk_time:.2f}s, total: {total_time:.2f}s)")
-
-             # Update state with tool stats
-             tool_stats = {
-                 "tools_used": list(tool_usage['tools_summary'].keys()),
-                 "files_analyzed": len(enhanced_chunks[0].get('files_analyzed', [])) if enhanced_chunks else 0,
-                 "concepts": list(tool_usage.get('concepts', []))[:10]
-             }
-             update_file_state(jsonl_file, state, file_chunks, tool_stats)
-
-             # Save state after each file
-             if not DRY_RUN:
-                 save_state(state)
-
-             gc.collect()
-
-         except Exception as e:
-             logger.error(f"Failed to import {jsonl_file}: {e}")
-             import traceback
-             logger.error(traceback.format_exc())
-
-     return total_chunks
-
- def main():
-     """Main import function with enhanced features."""
-     import argparse
-
-     parser = argparse.ArgumentParser(description='Import conversations with tool usage extraction')
-     parser.add_argument('--days', type=int, help='Import only files from last N days')
-     parser.add_argument('--limit', type=int, help='Limit number of files to import')
-     parser.add_argument('--dry-run', action='store_true', help='Run without actually importing')
-     parser.add_argument('--project', type=str, help='Import only specific project')
-
-     args = parser.parse_args()
-
-     if args.dry_run:
-         global DRY_RUN
-         DRY_RUN = True
-         logger.info("Running in DRY RUN mode - no data will be imported")
-
-     logs_path = Path(LOGS_DIR)
-
-     # Handle local development vs Docker paths
-     if not logs_path.exists():
-         # Try local development path
-         home_logs = Path.home() / '.claude' / 'projects'
-         if home_logs.exists():
-             logs_path = home_logs
-             logger.info(f"Using local logs directory: {logs_path}")
-         else:
-             logger.error(f"Logs directory not found: {LOGS_DIR}")
-             return
-
-     # Load existing state
-     state = load_state()
-     logger.info(f"Loaded state with {len(state['imported_files'])} previously imported files")
-
-     # Find project directories
-     if args.project:
-         project_dirs = [d for d in logs_path.iterdir() if d.is_dir() and args.project in d.name]
-     else:
-         project_dirs = [d for d in logs_path.iterdir() if d.is_dir()]
-
-     if not project_dirs:
-         logger.warning("No project directories found")
-         return
-
-     # Filter by date if specified
-     if args.days:
-         cutoff_date = datetime.now() - timedelta(days=args.days)
-         filtered_dirs = []
-         for project_dir in project_dirs:
-             jsonl_files = list(project_dir.glob("*.jsonl"))
-             recent_files = [f for f in jsonl_files if datetime.fromtimestamp(f.stat().st_mtime) > cutoff_date]
-             if recent_files:
-                 filtered_dirs.append(project_dir)
-         project_dirs = filtered_dirs
-         logger.info(f"Filtered to {len(project_dirs)} projects with files from last {args.days} days")
-
-     # Apply limit if specified
-     if args.limit:
-         project_dirs = project_dirs[:args.limit]
-
-     logger.info(f"Found {len(project_dirs)} projects to import")
-
-     # Import each project
-     total_imported = 0
-     for project_dir in project_dirs:
-         logger.info(f"Importing project: {project_dir.name}")
-         chunks = import_project(project_dir, state)
-         total_imported += chunks
-
-     # Print timing statistics
-     logger.info("\n=== Import Performance Summary ===")
-     logger.info(f"Total chunks imported: {total_imported}")
-
-     if timing_stats["total"]:
-         logger.info(f"\nTiming averages:")
-         logger.info(f" Extract: {sum(timing_stats['extract'])/len(timing_stats['extract']):.2f}s")
-         logger.info(f" Chunk: {sum(timing_stats['chunk'])/len(timing_stats['chunk']):.2f}s")
-         if timing_stats['embed']:
-             logger.info(f" Embed: {sum(timing_stats['embed'])/len(timing_stats['embed']):.2f}s")
-         if timing_stats['store']:
-             logger.info(f" Store: {sum(timing_stats['store'])/len(timing_stats['store']):.2f}s")
-         logger.info(f" Total: {sum(timing_stats['total'])/len(timing_stats['total']):.2f}s per file")
-
- if __name__ == "__main__":
-     main()