claude-self-reflect 3.0.2 → 3.2.1

This diff shows the publicly available contents of these package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
@@ -11,6 +11,8 @@ import hashlib
  import gc
  import ast
  import re
+ import fcntl
+ import time
  from pathlib import Path
  from datetime import datetime
  from typing import List, Dict, Any, Optional, Set
@@ -45,6 +47,10 @@ MAX_CONCEPTS = 10
  MAX_AST_ELEMENTS = 30
  MAX_CODE_BLOCKS = 5
  MAX_ELEMENTS_PER_BLOCK = 10
+ MAX_FILES_ANALYZED = 20
+ MAX_FILES_EDITED = 20
+ MAX_TOOLS_USED = 15
+ MAX_CONCEPT_MESSAGES = 50

  # Robust cross-platform state file resolution
  def get_default_state_file():
@@ -171,7 +177,7 @@ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
  # Check variance is above threshold
  import statistics
  variance = statistics.variance(embedding)
- if variance < 1e-6:
+ if variance < 1e-4: # Less strict threshold for valid embeddings
  logger.warning(f"Low variance embedding detected: {variance}")

  # Validate dimension
@@ -194,7 +200,7 @@ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
  "start_role": messages[0].get("role", "unknown") if messages else "unknown",
  "message_count": len(messages),
  "total_messages": total_messages,
- "message_index": message_indices[0] if message_indices else 0,
+ "message_index": message_indices[0] if message_indices else None,
  "message_indices": message_indices # Store all indices in this chunk
  }

@@ -205,16 +211,22 @@ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
  # Create point
  point = PointStruct(
  id=int(point_id, 16) % (2**63),
- vector=embeddings[0],
+ vector=embedding, # Use validated embedding variable
  payload=payload
  )

- # Upload immediately (no wait for better throughput)
- client.upsert(
+ # Upload with wait to ensure persistence (with retries)
+ result = _with_retries(lambda: client.upsert(
  collection_name=collection_name,
  points=[point],
- wait=False # Don't wait for indexing to complete
- )
+ wait=True # Ensure operation completed before continuing
+ ))
+
+ # Verify the operation completed successfully (handle enum or string representations)
+ status = getattr(result, 'status', None)
+ if status and 'completed' not in str(status).lower():
+ logger.error(f"Upsert not completed for {conversation_id}:{chunk_index}, status={status}")
+ return 0

  return 1

@@ -333,15 +345,15 @@ def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str, i
  if '```' in item.get('text', ''):
  metadata['has_code_blocks'] = True
  # Extract code for AST analysis with bounds checking
- if len(metadata['ast_elements']) < 30:
+ if len(metadata['ast_elements']) < MAX_AST_ELEMENTS:
  # Fix: More permissive regex to handle various fence formats
- code_blocks = re.findall(r'```[^\n]*\n?(.*?)```', item.get('text', ''), re.DOTALL)
- for code_block in code_blocks[:5]: # Limit to 5 blocks
- if len(metadata['ast_elements']) >= 30:
+ code_blocks = re.findall(r'```[^`]*?\n(.*?)```', item.get('text', ''), re.DOTALL)
+ for code_block in code_blocks[:MAX_CODE_BLOCKS]: # Use defined constant
+ if len(metadata['ast_elements']) >= MAX_AST_ELEMENTS:
  break
  ast_elems = extract_ast_elements(code_block)
- for elem in list(ast_elems)[:10]: # Limit elements per block
- if elem not in metadata['ast_elements'] and len(metadata['ast_elements']) < 30:
+ for elem in list(ast_elems)[:MAX_ELEMENTS_PER_BLOCK]: # Use defined constant
+ if elem not in metadata['ast_elements'] and len(metadata['ast_elements']) < MAX_AST_ELEMENTS:
  metadata['ast_elements'].append(elem)

  elif item.get('type') == 'tool_use':
@@ -388,17 +400,17 @@ def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str, i

  # Extract concepts from collected text
  if all_text:
- combined_text = ' '.join(all_text[:50]) # Limit to first 50 messages
+ combined_text = ' '.join(all_text[:MAX_CONCEPT_MESSAGES]) # Limit messages for concept extraction
  metadata['concepts'] = extract_concepts(combined_text)

  # Set total messages
  metadata['total_messages'] = message_count

  # Limit arrays
- metadata['files_analyzed'] = metadata['files_analyzed'][:20]
- metadata['files_edited'] = metadata['files_edited'][:20]
- metadata['tools_used'] = metadata['tools_used'][:15]
- metadata['ast_elements'] = metadata['ast_elements'][:30]
+ metadata['files_analyzed'] = metadata['files_analyzed'][:MAX_FILES_ANALYZED]
+ metadata['files_edited'] = metadata['files_edited'][:MAX_FILES_EDITED]
+ metadata['tools_used'] = metadata['tools_used'][:MAX_TOOLS_USED]
+ metadata['ast_elements'] = metadata['ast_elements'][:MAX_AST_ELEMENTS]

  return metadata, first_timestamp or datetime.now().isoformat(), message_count

@@ -406,15 +418,32 @@ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Pat
  """Stream import a single JSONL file without loading it into memory."""
  logger.info(f"Streaming import of {jsonl_file.name}")

+ # Delete existing points for this conversation to prevent stale data
+ conversation_id = jsonl_file.stem
+ try:
+ from qdrant_client.models import Filter, FieldCondition, MatchValue
+ client.delete(
+ collection_name=collection_name,
+ points_selector=Filter(
+ must=[FieldCondition(key="conversation_id", match=MatchValue(value=conversation_id))]
+ ),
+ wait=True
+ )
+ logger.info(f"Deleted existing points for conversation {conversation_id}")
+ except Exception as e:
+ logger.warning(f"Could not delete existing points for {conversation_id}: {e}")
+
  # Extract metadata in first pass (lightweight)
  metadata, created_at, total_messages = extract_metadata_single_pass(str(jsonl_file))

+ # Reset counters for each conversation (critical for correct indexing)
+ current_message_index = 0 # Must be reset before processing each conversation
+
  # Stream messages and process in chunks
  chunk_buffer = []
  chunk_index = 0
  total_chunks = 0
  conversation_id = jsonl_file.stem
- current_message_index = 0

  try:
  with open(jsonl_file, 'r', encoding='utf-8') as f:
@@ -434,13 +463,24 @@ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Pat
  if 'message' in data and data['message']:
  msg = data['message']
  if msg.get('role') and msg.get('content'):
- # Extract content
+ # Extract content from various message types
  content = msg['content']
  if isinstance(content, list):
  text_parts = []
  for item in content:
- if isinstance(item, dict) and item.get('type') == 'text':
- text_parts.append(item.get('text', ''))
+ if isinstance(item, dict):
+ item_type = item.get('type', '')
+ if item_type == 'text':
+ text_parts.append(item.get('text', ''))
+ elif item_type == 'tool_use':
+ # Include tool use information
+ tool_name = item.get('name', 'unknown')
+ tool_input = str(item.get('input', ''))[:500] # Limit size
+ text_parts.append(f"[Tool: {tool_name}] {tool_input}")
+ elif item_type == 'tool_result':
+ # Include tool results
+ result_content = str(item.get('content', ''))[:1000] # Limit size
+ text_parts.append(f"[Result] {result_content}")
  elif isinstance(item, str):
  text_parts.append(item)
  content = '\n'.join(text_parts)
@@ -448,8 +488,8 @@ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Pat
  if content:
  # Track message index for user/assistant messages
  if msg['role'] in ['user', 'assistant']:
- current_message_index += 1
  message_idx = current_message_index
+ current_message_index += 1
  else:
  message_idx = 0

@@ -475,6 +515,51 @@ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Pat
  # Log progress
  if chunk_index % 10 == 0:
  logger.info(f"Processed {chunk_index} chunks from {jsonl_file.name}")
+
+ # Handle top-level tool_result/tool_use events (no message wrapper)
+ entry_type = data.get('type')
+ if entry_type in ('tool_result', 'tool_use'):
+ text_parts = []
+ if entry_type == 'tool_use':
+ tool_name = data.get('name', 'unknown')
+ tool_input = str(data.get('input', ''))[:500]
+ text_parts.append(f"[Tool: {tool_name}] {tool_input}")
+ elif entry_type == 'tool_result':
+ # Common structures: either 'content' (list/str) or 'result'
+ result_content = data.get('content')
+ if isinstance(result_content, list):
+ # flatten to text
+ flat = []
+ for itm in result_content:
+ if isinstance(itm, dict) and itm.get('type') == 'text':
+ flat.append(itm.get('text', ''))
+ elif isinstance(itm, str):
+ flat.append(itm)
+ result_content = "\n".join(flat)
+ if not result_content:
+ result_content = data.get('result', '') # fallback key used by some tools
+ text_parts.append(f"[Result] {str(result_content)[:1000]}")
+
+ content = "\n".join([p for p in text_parts if p])
+ if content:
+ # Track message index for summary format too
+ message_idx = current_message_index
+ current_message_index += 1
+
+ chunk_buffer.append({
+ 'role': entry_type,
+ 'content': content,
+ 'message_index': message_idx
+ })
+ if len(chunk_buffer) >= MAX_CHUNK_SIZE:
+ chunks = process_and_upload_chunk(
+ chunk_buffer, chunk_index, conversation_id,
+ created_at, metadata, collection_name, project_path, total_messages
+ )
+ total_chunks += chunks
+ chunk_buffer = []
+ chunk_index += 1
+ gc.collect()

  except json.JSONDecodeError:
  logger.debug(f"Skipping invalid JSON at line {line_num}")
@@ -496,14 +581,35 @@ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Pat
  logger.error(f"Failed to import {jsonl_file}: {e}")
  return 0

+ def _locked_open(path, mode):
+ """Open file with exclusive lock for concurrent safety."""
+ f = open(path, mode)
+ try:
+ fcntl.flock(f.fileno(), fcntl.LOCK_EX)
+ except Exception:
+ f.close()
+ raise
+ return f
+
+ def _with_retries(fn, attempts=3, base_sleep=0.5):
+ """Execute function with retries and exponential backoff."""
+ for i in range(attempts):
+ try:
+ return fn()
+ except Exception as e:
+ if i == attempts - 1:
+ raise
+ time.sleep(base_sleep * (2 ** i))
+ logger.debug(f"Retrying after error: {e}")
+
  def load_state() -> dict:
- """Load import state."""
+ """Load import state with file locking."""
  if os.path.exists(STATE_FILE):
  try:
- with open(STATE_FILE, 'r') as f:
+ with _locked_open(STATE_FILE, 'r') as f:
  return json.load(f)
- except:
- pass
+ except Exception as e:
+ logger.warning(f"Failed to load state: {e}")
  return {"imported_files": {}}

  def save_state(state: dict):
@@ -513,10 +619,12 @@ def save_state(state: dict):
  if state_dir:
  os.makedirs(state_dir, exist_ok=True)

- # Use atomic write to prevent corruption during crashes
+ # Use atomic write with locking to prevent corruption
  temp_file = f"{STATE_FILE}.tmp"
- with open(temp_file, 'w') as f:
+ with _locked_open(temp_file, 'w') as f:
  json.dump(state, f, indent=2)
+ f.flush()
+ os.fsync(f.fileno())

  # Atomic rename (on POSIX systems)
  os.replace(temp_file, STATE_FILE)
@@ -527,9 +635,23 @@ def should_import_file(file_path: Path, state: dict) -> bool:
  if file_str in state.get("imported_files", {}):
  file_info = state["imported_files"][file_str]
  last_modified = file_path.stat().st_mtime
- if file_info.get("last_modified") == last_modified:
- logger.info(f"Skipping unchanged file: {file_path.name}")
- return False
+
+ # Check if file has been modified
+ if file_info.get("last_modified") != last_modified:
+ logger.info(f"File modified, will re-import: {file_path.name}")
+ return True
+
+ # Check for suspiciously low chunk counts (likely failed imports)
+ chunks = file_info.get("chunks", 0)
+ file_size_kb = file_path.stat().st_size / 1024
+
+ # Heuristic: Files > 10KB should have more than 2 chunks
+ if file_size_kb > 10 and chunks <= 2:
+ logger.warning(f"File has suspiciously low chunks ({chunks}) for size {file_size_kb:.1f}KB, will re-import: {file_path.name}")
+ return True
+
+ logger.info(f"Skipping unchanged file: {file_path.name}")
+ return False
  return True

  def update_file_state(file_path: Path, state: dict, chunks: int):
@@ -585,12 +707,37 @@ def main():
  if should_import_file(jsonl_file, state):
  chunks = stream_import_file(jsonl_file, collection_name, project_dir)
  if chunks > 0:
- update_file_state(jsonl_file, state, chunks)
- save_state(state)
- total_imported += 1
+ # Verify data is actually in Qdrant before marking as imported
+ from qdrant_client.models import Filter, FieldCondition, MatchValue
+ try:
+ conversation_id = jsonl_file.stem
+ count_result = _with_retries(lambda: client.count(
+ collection_name=collection_name,
+ count_filter=Filter(
+ must=[FieldCondition(key="conversation_id",
+ match=MatchValue(value=conversation_id))]
+ ),
+ exact=True # Ensure exact count, not approximation
+ ))
+ actual_count = count_result.count if hasattr(count_result, 'count') else 0
+
+ if actual_count > 0:
+ logger.info(f"Verified {actual_count} points in Qdrant for {conversation_id}")
+ update_file_state(jsonl_file, state, chunks)
+ save_state(state)
+ total_imported += 1
+ else:
+ logger.error(f"No points found in Qdrant for {conversation_id} despite {chunks} chunks processed - not marking as imported")
+ except Exception as e:
+ logger.error(f"Failed to verify Qdrant points for {jsonl_file.name}: {e}")
+ # Don't mark as imported if we can't verify

  # Force GC after each file
  gc.collect()
+ else:
+ # Critical fix: Don't mark files with 0 chunks as imported
+ # This allows retry on next run
+ logger.warning(f"File produced 0 chunks, not marking as imported: {jsonl_file.name}")

  logger.info(f"Import complete: processed {total_imported} files")