dasein-core 0.2.14__py3-none-any.whl → 0.2.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dasein/pipecleaner.py CHANGED
@@ -211,7 +211,8 @@ class RunScopedCorpus:
 
  # Batching state
  self.batch_queue: List[str] = [] # [prompt_ids] waiting for barrier
- self.batch_lock = threading.Lock()
+ self.batch_lock = threading.Lock() # Protects batch_queue, batch_timer, etc.
+ self.processing_lock = threading.Lock() # CRITICAL: Ensures only ONE batch processes at a time
  self.batch_timer: Optional[threading.Timer] = None
  self.batch_start_time: Optional[float] = None
  self.barrier_duration: float = 5.0 # Start at 5s (min wait)
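
The new processing_lock serializes whole-batch processing, while the existing batch_lock continues to guard only the shared queue and timer state. A minimal sketch of this two-lock pattern, with illustrative names rather than the package's actual API:

    import threading

    queue_lock = threading.Lock()       # guards the shared queue only
    processing_lock = threading.Lock()  # allows only one batch to be processed at a time
    batch_queue = []

    def process_batch():
        with processing_lock:            # serialize the whole batch
            with queue_lock:             # hold the queue lock just long enough to snapshot + clear
                if not batch_queue:
                    return
                batch = batch_queue.copy()
                batch_queue.clear()
            # heavy work (dedupe, embeddings) runs outside queue_lock,
            # so producers can keep appending while this batch is processed
            print(f"processing {len(batch)} prompts")

Holding the outer lock for the whole batch is what the CRITICAL comment above refers to: only one batch can be in the dedupe-and-emit phase at a time.
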
@@ -219,6 +220,7 @@ class RunScopedCorpus:
  self.barrier_cap: float = 10.0 # Max 10s
  self.batch_ready = threading.Event() # Signal when batch is processed
  self.prompt_events: Dict[str, asyncio.Event] = {} # Per-prompt events for ASYNC sequential release
+ self.prompt_loops: Dict[str, asyncio.AbstractEventLoop] = {} # Event loops for thread-safe signaling
 
  # Sequence tracking
  self.next_seq = 0
@@ -381,7 +383,9 @@ class RunScopedCorpus:
  # Add to batch queue and manage barrier
  # Create per-prompt ASYNC event for sequential release
  prompt_ready = asyncio.Event()
+ loop = asyncio.get_running_loop()
  self.prompt_events[prompt_id] = prompt_ready
+ self.prompt_loops[prompt_id] = loop
 
  with self.batch_lock:
  self.batch_queue.append(prompt_id)
@@ -436,334 +440,342 @@ class RunScopedCorpus:
 
  def _process_batch(self):
  """Process current batch: cross-prompt dedupe, entity coverage check, emit (synchronous)."""
- with self.batch_lock:
- if not self.batch_queue:
- # No prompts to process, just return (shouldn't happen)
- return
+ # CRITICAL: Acquire processing lock to prevent multiple batches from processing simultaneously
+ with self.processing_lock:
+ with self.batch_lock:
+ if not self.batch_queue:
+ # No prompts to process, just return (shouldn't happen)
+ return
 
- batch_prompts = self.batch_queue.copy()
- self.batch_queue.clear()
- self.batch_timer = None
+ batch_prompts = self.batch_queue.copy()
+ self.batch_queue.clear()
+ self.batch_timer = None
 
- batch_duration_ms = (time.time() - self.batch_start_time) * 1000
- self.telemetry.barrier_times.append(batch_duration_ms)
- self.telemetry.batches_processed += 1
+ batch_duration_ms = (time.time() - self.batch_start_time) * 1000
+ self.telemetry.barrier_times.append(batch_duration_ms)
+ self.telemetry.batches_processed += 1
 
- # Always show batch summary (key metric)
- print(f"\n[CORPUS] 🔄 Processing batch: {len(batch_prompts)} prompts, barrier={batch_duration_ms:.0f}ms")
+ # Always show batch summary (key metric)
+ print(f"\n[CORPUS] 🔄 Processing batch: {len(batch_prompts)} prompts, barrier={batch_duration_ms:.0f}ms")
 
- # Step 0: Compute embeddings for NEW prompts in this batch (BATCHED operation!)
- # This is done ONCE for the entire batch, allowing parallel arrivals
- _vprint(f"[CORPUS] 🧮 Computing embeddings for {len(batch_prompts)} new prompts...", self.verbose)
- model = _get_embedding_model()
+ # Step 0: Compute embeddings for NEW prompts in this batch (BATCHED operation!)
+ # This is done ONCE for the entire batch, allowing parallel arrivals
+ _vprint(f"[CORPUS] 🧮 Computing embeddings for {len(batch_prompts)} new prompts...", self.verbose)
+ model = _get_embedding_model()
 
- for prompt_id in batch_prompts:
- prompt_state = self.prompt_registry[prompt_id]
+ for prompt_id in batch_prompts:
+ prompt_state = self.prompt_registry[prompt_id]
 
- if not prompt_state.cluster_ids: # Only process if not yet clustered
- # Compute embeddings for all sentences in this prompt (batch operation)
- sentence_embeddings = model.encode(prompt_state.sentences, show_progress_bar=False, normalize_embeddings=True)
+ if not prompt_state.cluster_ids: # Only process if not yet clustered
+ # Compute embeddings for all sentences in this prompt (batch operation)
+ sentence_embeddings = model.encode(prompt_state.sentences, show_progress_bar=False, normalize_embeddings=True)
 
- # Match/create clusters for each sentence
- cluster_ids = []
- for i, sentence in enumerate(prompt_state.sentences):
- # Compute salience
- salience = len(sentence) / 100.0
- salience += len(re.findall(r'\b[A-Z][a-z]+', sentence)) * 0.1
+ # Match/create clusters for each sentence
+ cluster_ids = []
+ for i, sentence in enumerate(prompt_state.sentences):
+ # Compute salience
+ salience = len(sentence) / 100.0
+ salience += len(re.findall(r'\b[A-Z][a-z]+', sentence)) * 0.1
 
- # Extract entities
- entities, numbers = extract_entities_regex(sentence)
+ # Extract entities
+ entities, numbers = extract_entities_regex(sentence)
 
- # Match against existing clusters
- cluster_id = self.find_matching_cluster(0, sentence, sentence_embeddings[i])
+ # Match against existing clusters
+ cluster_id = self.find_matching_cluster(0, sentence, sentence_embeddings[i])
 
- if cluster_id is None:
- # Create new cluster
- with self.batch_lock:
- cluster_id = self._generate_cluster_id()
- simhash = compute_simhash(sentence)
+ if cluster_id is None:
+ # Create new cluster
+ with self.batch_lock:
+ cluster_id = self._generate_cluster_id()
+ simhash = compute_simhash(sentence)
 
- cluster = SentenceCluster(
- cluster_id=cluster_id,
- canonical_sentence=sentence,
- owner_prompt_id=prompt_id,
- simhash=simhash,
- salience=salience,
- entities=entities | numbers,
- first_seen_seq=self.next_seq,
- length=len(sentence),
- embedding=sentence_embeddings[i]
- )
+ cluster = SentenceCluster(
+ cluster_id=cluster_id,
+ canonical_sentence=sentence,
+ owner_prompt_id=prompt_id,
+ simhash=simhash,
+ salience=salience,
+ entities=entities | numbers,
+ first_seen_seq=self.next_seq,
+ length=len(sentence),
+ embedding=sentence_embeddings[i]
+ )
 
- self.clusters[cluster_id] = cluster
- self.next_seq += 1
- self.telemetry.clusters_total += 1
+ self.clusters[cluster_id] = cluster
+ self.next_seq += 1
+ self.telemetry.clusters_total += 1
 
- cluster_ids.append(cluster_id)
+ cluster_ids.append(cluster_id)
 
- # Update prompt state with cluster_ids
- prompt_state.cluster_ids = cluster_ids
+ # Update prompt state with cluster_ids
+ prompt_state.cluster_ids = cluster_ids
 
- _vprint(f"[CORPUS] ✅ Embeddings computed and clusters assigned", self.verbose)
+ _vprint(f"[CORPUS] ✅ Embeddings computed and clusters assigned", self.verbose)
 
- # Step 1: Collect ALL sentences from THE ENTIRE RUN (not just current batch!)
- # This is critical for true run-scoped deduplication
- all_sentences = []
- sentence_to_prompt = {} # Map sentence_id → (prompt_id, index)
- locked_sentences = set() # Sentences from previous batches (already emitted, can't remove)
+ # Step 1: Collect ALL sentences from THE ENTIRE RUN (not just current batch!)
+ # This is critical for true run-scoped deduplication
+ all_sentences = []
+ sentence_to_prompt = {} # Map sentence_id → (prompt_id, index)
+ locked_sentences = set() # Sentences from previous batches (already emitted, can't remove)
 
- # Iterate over ALL prompts in registry (including previous batches)
- for prompt_id, prompt_state in self.prompt_registry.items():
- is_previous_batch = prompt_id not in batch_prompts
+ # Iterate over ALL prompts in registry (including previous batches)
+ for prompt_id, prompt_state in self.prompt_registry.items():
+ is_previous_batch = prompt_id not in batch_prompts
 
- for idx, (sentence_text, cluster_id) in enumerate(zip(prompt_state.sentences, prompt_state.cluster_ids)):
- cluster = self.clusters.get(cluster_id)
- if not cluster:
- continue
+ for idx, (sentence_text, cluster_id) in enumerate(zip(prompt_state.sentences, prompt_state.cluster_ids)):
+ cluster = self.clusters.get(cluster_id)
+ if not cluster:
+ continue
 
- # Create Sentence object for greedy algorithm
- sent_id = f"{prompt_id}_{idx}"
- sent_obj = Sentence(
- id=sent_id,
- text=sentence_text,
- embedding=cluster.embedding,
- entities=cluster.entities, # Keep ALL entities for accurate coverage tracking
- numbers=set(), # Already in entities
- salience=cluster.salience,
- position=cluster.first_seen_seq
- )
- all_sentences.append(sent_obj)
- sentence_to_prompt[sent_id] = (prompt_id, idx)
+ # Create Sentence object for greedy algorithm
+ sent_id = f"{prompt_id}_{idx}"
+ sent_obj = Sentence(
+ id=sent_id,
+ text=sentence_text,
+ embedding=cluster.embedding,
+ entities=cluster.entities, # Keep ALL entities for accurate coverage tracking
+ numbers=set(), # Already in entities
+ salience=cluster.salience,
+ position=cluster.first_seen_seq
+ )
+ all_sentences.append(sent_obj)
+ sentence_to_prompt[sent_id] = (prompt_id, idx)
 
- # Lock sentences from previous batches (already emitted to user)
- if is_previous_batch:
- locked_sentences.add(sent_id)
+ # Lock sentences from previous batches (already emitted to user)
+ if is_previous_batch:
+ locked_sentences.add(sent_id)
 
- _vprint(f"[CORPUS] 🌐 Run-scoped MIS: {len(all_sentences)} total sentences ({len(locked_sentences)} locked from previous batches, {len(all_sentences)-len(locked_sentences)} new)", self.verbose)
- _vprint(f"[CORPUS] 🧮 Running greedy max-independent-set on {len(all_sentences)} sentences", self.verbose)
+ _vprint(f"[CORPUS] 🌐 Run-scoped MIS: {len(all_sentences)} total sentences ({len(locked_sentences)} locked from previous batches, {len(all_sentences)-len(locked_sentences)} new)", self.verbose)
+ _vprint(f"[CORPUS] 🧮 Running greedy max-independent-set on {len(all_sentences)} sentences", self.verbose)
 
- # Step 2: Compute degree map (needed for isolates pass later)
- degree_map = {}
- for sent in all_sentences:
- degree = 0
- for other in all_sentences:
- if sent.id != other.id:
- if are_sentences_similar(sent, other, semantic_threshold=0.60):
- degree += 1
- degree_map[sent.id] = degree
-
- # Sanity checks
- isolates_before = [s for s in all_sentences if degree_map[s.id] == 0]
- non_isolates = [s for s in all_sentences if degree_map[s.id] > 0]
- pct_isolates = len(isolates_before) / len(all_sentences) * 100 if all_sentences else 0
- avg_degree_non_iso = sum(degree_map[s.id] for s in non_isolates) / len(non_isolates) if non_isolates else 0
- print(f"[CORPUS] 📊 Graph: isolates={pct_isolates:.1f}% (expect <20%), non-isolate avg degree={avg_degree_non_iso:.1f} (expect >3)")
-
- # Step 3: Run greedy maximum-independent-set selection
- # Start with LOCKED sentences (from previous batches, already emitted)
- # Then run MIS only on NEW sentences (current batch)
- selected_sentences = [s for s in all_sentences if s.id in locked_sentences]
- selected_ids = locked_sentences.copy()
-
- print(f"[CORPUS] 🔒 Pre-seeded MIS with {len(locked_sentences)} locked sentences from previous batches")
-
- # Now run MIS on NEW sentences only (exclude locked)
- new_sentences = [s for s in all_sentences if s.id not in locked_sentences]
-
- if new_sentences:
- # Run MIS on new sentences, considering locked ones as neighbors
- new_selected = greedy_max_independent_set(
- new_sentences,
- similarity_threshold=0.60,
- verbose=False, # Set to True for debugging
- precomputed_degree_map=degree_map # Pass precomputed degrees
- )
+ # Step 2: Compute degree map (needed for isolates pass later)
+ degree_map = {}
+ for sent in all_sentences:
+ degree = 0
+ for other in all_sentences:
+ if sent.id != other.id:
+ if are_sentences_similar(sent, other, semantic_threshold=0.60):
+ degree += 1
+ degree_map[sent.id] = degree
+
+ # Sanity checks
+ isolates_before = [s for s in all_sentences if degree_map[s.id] == 0]
+ non_isolates = [s for s in all_sentences if degree_map[s.id] > 0]
+ pct_isolates = len(isolates_before) / len(all_sentences) * 100 if all_sentences else 0
+ avg_degree_non_iso = sum(degree_map[s.id] for s in non_isolates) / len(non_isolates) if non_isolates else 0
+ print(f"[CORPUS] 📊 Graph: isolates={pct_isolates:.1f}% (expect <20%), non-isolate avg degree={avg_degree_non_iso:.1f} (expect >3)")
+
+ # Step 3: Run greedy maximum-independent-set selection
+ # Start with LOCKED sentences (from previous batches, already emitted)
+ # Then run MIS only on NEW sentences (current batch)
+ selected_sentences = [s for s in all_sentences if s.id in locked_sentences]
+ selected_ids = locked_sentences.copy()
+
+ print(f"[CORPUS] 🔒 Pre-seeded MIS with {len(locked_sentences)} locked sentences from previous batches")
+
+ # Now run MIS on NEW sentences only (exclude locked)
+ new_sentences = [s for s in all_sentences if s.id not in locked_sentences]
+
+ if new_sentences:
+ # Run MIS on new sentences, considering locked ones as neighbors
+ new_selected = greedy_max_independent_set(
+ new_sentences,
+ similarity_threshold=0.60,
+ verbose=False, # Set to True for debugging
+ precomputed_degree_map=degree_map # Pass precomputed degrees
+ )
 
- # Add newly selected sentences
- selected_sentences.extend(new_selected)
- selected_ids.update(s.id for s in new_selected)
-
- _vprint(f"[CORPUS] ✅ MIS complete: {len(selected_ids)} total kept ({len(locked_sentences)} locked + {len(selected_ids)-len(locked_sentences)} new)", self.verbose)
-
- # Step 3: Compute NODE COVERAGE (align universe for backfill)
- # covered_nodes = S ∪ N(S) (selected + their neighbors)
- covered_nodes = set(selected_ids)
- sentence_map = {s.id: s for s in all_sentences}
-
- for selected_id in selected_ids:
- selected_sent = sentence_map[selected_id]
- # Add all neighbors (similar nodes)
- for other in all_sentences:
- if other.id != selected_id:
- if are_sentences_similar(selected_sent, other, semantic_threshold=0.60):
- covered_nodes.add(other.id)
-
- total_nodes = len(all_sentences)
- node_coverage_before = len(covered_nodes) / total_nodes if total_nodes > 0 else 0.0
-
- _vprint(f"[CORPUS] 📊 After MIS: nodes={len(selected_ids)}/{total_nodes} kept, coverage (S∪N(S))={len(covered_nodes)}/{total_nodes} ({node_coverage_before*100:.1f}%)", self.verbose)
-
- # Step 4: Backfill = GREEDY SET COVER over NODES (no independence constraint!)
- # Goal: Maximize node coverage (S ∪ N(S)) by re-adding removed nodes with highest gain
- # gain(u) = |({u} ∪ N(u)) \ covered_nodes|
- backfill_added = 0
- isolates_added = 0
- target_coverage = 0.90 # 90% node coverage target
-
- if node_coverage_before < target_coverage:
- uncovered_count = total_nodes - len(covered_nodes)
- _vprint(f"[CORPUS] 🔧 Backfill: {uncovered_count} uncovered nodes, targeting {target_coverage*100:.0f}% coverage", self.verbose)
+ # Add newly selected sentences
+ selected_sentences.extend(new_selected)
+ selected_ids.update(s.id for s in new_selected)
+
+ _vprint(f"[CORPUS] ✅ MIS complete: {len(selected_ids)} total kept ({len(locked_sentences)} locked + {len(selected_ids)-len(locked_sentences)} new)", self.verbose)
+
+ # Step 3: Compute NODE COVERAGE (align universe for backfill)
+ # covered_nodes = S ∪ N(S) (selected + their neighbors)
+ covered_nodes = set(selected_ids)
+ sentence_map = {s.id: s for s in all_sentences}
+
+ for selected_id in selected_ids:
+ selected_sent = sentence_map[selected_id]
+ # Add all neighbors (similar nodes)
+ for other in all_sentences:
+ if other.id != selected_id:
+ if are_sentences_similar(selected_sent, other, semantic_threshold=0.60):
+ covered_nodes.add(other.id)
+
+ total_nodes = len(all_sentences)
+ node_coverage_before = len(covered_nodes) / total_nodes if total_nodes > 0 else 0.0
+
+ _vprint(f"[CORPUS] 📊 After MIS: nodes={len(selected_ids)}/{total_nodes} kept, coverage (S∪N(S))={len(covered_nodes)}/{total_nodes} ({node_coverage_before*100:.1f}%)", self.verbose)
+
+ # Step 4: Backfill = GREEDY SET COVER over NODES (no independence constraint!)
+ # Goal: Maximize node coverage (S ∪ N(S)) by re-adding removed nodes with highest gain
+ # gain(u) = |({u} ∪ N(u)) \ covered_nodes|
+ backfill_added = 0
+ isolates_added = 0
+ target_coverage = 0.90 # 90% node coverage target
+
+ if node_coverage_before < target_coverage:
+ uncovered_count = total_nodes - len(covered_nodes)
+ _vprint(f"[CORPUS] 🔧 Backfill: {uncovered_count} uncovered nodes, targeting {target_coverage*100:.0f}% coverage", self.verbose)
 
- # Get ALL removed sentences (candidates for backfill)
- removed_sentences = [sent for sent in all_sentences if sent.id not in selected_ids]
+ # Get ALL removed sentences (candidates for backfill)
+ removed_sentences = [sent for sent in all_sentences if sent.id not in selected_ids]
 
- # Helper: compute node gain for a candidate
- def compute_node_gain(sent):
- """Compute how many uncovered nodes this sentence + its neighbors would cover."""
- candidate_coverage = {sent.id}
- # Add neighbors
- for other in all_sentences:
- if other.id != sent.id:
- if are_sentences_similar(sent, other, semantic_threshold=0.60):
- candidate_coverage.add(other.id)
- # Gain = new nodes not already covered
- return len(candidate_coverage - covered_nodes)
+ # Helper: compute node gain for a candidate
+ def compute_node_gain(sent):
+ """Compute how many uncovered nodes this sentence + its neighbors would cover."""
+ candidate_coverage = {sent.id}
+ # Add neighbors
+ for other in all_sentences:
+ if other.id != sent.id:
+ if are_sentences_similar(sent, other, semantic_threshold=0.60):
+ candidate_coverage.add(other.id)
+ # Gain = new nodes not already covered
+ return len(candidate_coverage - covered_nodes)
 
- # Debug: Print top-5 candidates by gain (first iteration only)
- if removed_sentences:
- gains = [(sent, compute_node_gain(sent)) for sent in removed_sentences[:20]] # Sample first 20 for speed
- gains.sort(key=lambda x: x[1], reverse=True)
- _vprint(f"[CORPUS] Top-5 backfill candidates by gain:", self.verbose)
- for sent, gain in gains[:5]:
- _vprint(f" gain={gain}: '{sent.text[:60]}...'", self.verbose)
+ # Debug: Print top-5 candidates by gain (first iteration only)
+ if removed_sentences:
+ gains = [(sent, compute_node_gain(sent)) for sent in removed_sentences[:20]] # Sample first 20 for speed
+ gains.sort(key=lambda x: x[1], reverse=True)
+ _vprint(f"[CORPUS] Top-5 backfill candidates by gain:", self.verbose)
+ for sent, gain in gains[:5]:
+ _vprint(f" gain={gain}: '{sent.text[:60]}...'", self.verbose)
 
- # GREEDY SET COVER: repeatedly pick sentence with max gain
- iteration = 0
- while node_coverage_before < target_coverage and removed_sentences and iteration < 100:
- # Find best candidate
- best_sent = None
- best_gain = 0
+ # GREEDY SET COVER: repeatedly pick sentence with max gain
+ iteration = 0
+ while node_coverage_before < target_coverage and removed_sentences and iteration < 100:
+ # Find best candidate
+ best_sent = None
+ best_gain = 0
 
- for sent in removed_sentences:
- gain = compute_node_gain(sent)
- if gain > best_gain:
- best_gain = gain
- best_sent = sent
+ for sent in removed_sentences:
+ gain = compute_node_gain(sent)
+ if gain > best_gain:
+ best_gain = gain
+ best_sent = sent
 
- if best_gain == 0:
- _vprint(f"[CORPUS] Backfill: all remaining candidates have gain=0, stopping", self.verbose)
- break
+ if best_gain == 0:
+ _vprint(f"[CORPUS] Backfill: all remaining candidates have gain=0, stopping", self.verbose)
+ break
 
- # Add best sentence back
- selected_ids.add(best_sent.id)
- selected_sentences.append(best_sent)
+ # Add best sentence back
+ selected_ids.add(best_sent.id)
+ selected_sentences.append(best_sent)
 
- # Update covered_nodes: add this node + its neighbors
- covered_nodes.add(best_sent.id)
- for other in all_sentences:
- if other.id != best_sent.id:
- if are_sentences_similar(best_sent, other, semantic_threshold=0.60):
- covered_nodes.add(other.id)
+ # Update covered_nodes: add this node + its neighbors
+ covered_nodes.add(best_sent.id)
+ for other in all_sentences:
+ if other.id != best_sent.id:
+ if are_sentences_similar(best_sent, other, semantic_threshold=0.60):
+ covered_nodes.add(other.id)
 
- removed_sentences.remove(best_sent)
- backfill_added += 1
+ removed_sentences.remove(best_sent)
+ backfill_added += 1
 
- # Update coverage
- node_coverage_before = len(covered_nodes) / total_nodes
- iteration += 1
+ # Update coverage
+ node_coverage_before = len(covered_nodes) / total_nodes
+ iteration += 1
 
- if backfill_added <= 5:
- _vprint(f"[CORPUS] ✅ Backfill +{best_gain} nodes: '{best_sent.text[:60]}...' (coverage now {node_coverage_before*100:.1f}%)", self.verbose)
+ if backfill_added <= 5:
+ _vprint(f"[CORPUS] ✅ Backfill +{best_gain} nodes: '{best_sent.text[:60]}...' (coverage now {node_coverage_before*100:.1f}%)", self.verbose)
 
- _vprint(f"[CORPUS] 📈 After backfill: +{backfill_added} sentences, node coverage {node_coverage_before*100:.1f}%)", self.verbose)
+ _vprint(f"[CORPUS] 📈 After backfill: +{backfill_added} sentences, node coverage {node_coverage_before*100:.1f}%)", self.verbose)
 
- # Step 5: ISOLATES PASS - add uncovered degree=0 nodes
- # These are unique nodes with no similar neighbors
- uncovered_isolates = [sent for sent in all_sentences
- if sent.id not in covered_nodes and degree_map[sent.id] == 0]
+ # Step 5: ISOLATES PASS - add uncovered degree=0 nodes
+ # These are unique nodes with no similar neighbors
+ uncovered_isolates = [sent for sent in all_sentences
+ if sent.id not in covered_nodes and degree_map[sent.id] == 0]
 
- if uncovered_isolates:
- _vprint(f"[CORPUS] 🔧 Isolates pass: {len(uncovered_isolates)} uncovered isolates (degree=0)", self.verbose)
+ if uncovered_isolates:
+ _vprint(f"[CORPUS] 🔧 Isolates pass: {len(uncovered_isolates)} uncovered isolates (degree=0)", self.verbose)
 
- for sent in uncovered_isolates:
- if node_coverage_before >= target_coverage:
- break
- selected_ids.add(sent.id)
- covered_nodes.add(sent.id)
- isolates_added += 1
- node_coverage_before = len(covered_nodes) / total_nodes
+ for sent in uncovered_isolates:
+ if node_coverage_before >= target_coverage:
+ break
+ selected_ids.add(sent.id)
+ covered_nodes.add(sent.id)
+ isolates_added += 1
+ node_coverage_before = len(covered_nodes) / total_nodes
 
- if isolates_added <= 5:
- _vprint(f"[CORPUS] ✅ Isolate: '{sent.text[:60]}...'", self.verbose)
+ if isolates_added <= 5:
+ _vprint(f"[CORPUS] ✅ Isolate: '{sent.text[:60]}...'", self.verbose)
 
- if isolates_added > 0:
- _vprint(f"[CORPUS] 📈 After isolates: +{isolates_added} sentences, node coverage {node_coverage_before*100:.1f}%", self.verbose)
-
- # Final coverage stats (NODE universe)
- final_selected = len(selected_ids)
- final_covered_nodes = len(covered_nodes)
- final_node_coverage = final_covered_nodes / total_nodes if total_nodes > 0 else 0.0
-
- # Assert denominator is |V| (all nodes, no filtering)
- assert total_nodes == len(all_sentences), f"Denominator mismatch: {total_nodes} != {len(all_sentences)}"
-
- _vprint(f"[CORPUS] ✅ Final: kept={final_selected}/{total_nodes}, covered (S∪N(S))={final_covered_nodes}/{total_nodes} ({final_node_coverage*100:.1f}%)", self.verbose)
- _vprint(f"[CORPUS] 📊 Backfill={backfill_added}, Isolates={isolates_added}", self.verbose)
-
- # Step 6: Map results back to prompts
- results = {}
- for prompt_id in batch_prompts:
- prompt_state = self.prompt_registry[prompt_id]
- kept_sentences = []
- removed_count = 0
+ if isolates_added > 0:
+ _vprint(f"[CORPUS] 📈 After isolates: +{isolates_added} sentences, node coverage {node_coverage_before*100:.1f}%", self.verbose)
+
+ # Final coverage stats (NODE universe)
+ final_selected = len(selected_ids)
+ final_covered_nodes = len(covered_nodes)
+ final_node_coverage = final_covered_nodes / total_nodes if total_nodes > 0 else 0.0
+
+ # Assert denominator is |V| (all nodes, no filtering)
+ assert total_nodes == len(all_sentences), f"Denominator mismatch: {total_nodes} != {len(all_sentences)}"
+
+ _vprint(f"[CORPUS] ✅ Final: kept={final_selected}/{total_nodes}, covered (S∪N(S))={final_covered_nodes}/{total_nodes} ({final_node_coverage*100:.1f}%)", self.verbose)
+ _vprint(f"[CORPUS] 📊 Backfill={backfill_added}, Isolates={isolates_added}", self.verbose)
+
+ # Step 6: Map results back to prompts
+ results = {}
+ for prompt_id in batch_prompts:
+ prompt_state = self.prompt_registry[prompt_id]
+ kept_sentences = []
+ removed_count = 0
 
- for idx, sentence_text in enumerate(prompt_state.sentences):
- sent_id = f"{prompt_id}_{idx}"
- if sent_id in selected_ids:
- kept_sentences.append(sentence_text)
- else:
- removed_count += 1
+ for idx, sentence_text in enumerate(prompt_state.sentences):
+ sent_id = f"{prompt_id}_{idx}"
+ if sent_id in selected_ids:
+ kept_sentences.append(sentence_text)
+ else:
+ removed_count += 1
 
- results[prompt_id] = {
- 'kept': kept_sentences,
- 'removed': removed_count,
- 'original_count': len(prompt_state.sentences)
- }
-
- # Step 7: Store results and emit to prompts
- for prompt_id in batch_prompts:
- prompt_state = self.prompt_registry[prompt_id]
- result = results[prompt_id]
- prompt_state.sentences = result['kept']
+ results[prompt_id] = {
+ 'kept': kept_sentences,
+ 'removed': removed_count,
+ 'original_count': len(prompt_state.sentences)
+ }
+
+ # Step 7: Store results and emit to prompts
+ for prompt_id in batch_prompts:
+ prompt_state = self.prompt_registry[prompt_id]
+ result = results[prompt_id]
+ prompt_state.sentences = result['kept']
 
- reduction_pct = (result['removed'] / result['original_count'] * 100) if result['original_count'] > 0 else 0
- _vprint(f"[CORPUS] Prompt {prompt_id[:8]}: {result['original_count']} → {len(result['kept'])} sentences ({reduction_pct:.1f}% removed)", self.verbose)
-
- # Update telemetry
- self.telemetry.entity_coverage_avg = final_node_coverage * 100 # Now tracking NODE coverage
- # Always show final batch summary (key metric)
- print(f"[CORPUS] ✅ Batch complete: Node coverage {final_node_coverage*100:.1f}%")
-
- # Update telemetry
- if self.telemetry.barrier_times:
- self.telemetry.avg_barrier_ms = sum(self.telemetry.barrier_times) / len(self.telemetry.barrier_times)
- self.telemetry.max_barrier_ms = max(self.telemetry.barrier_times)
-
- self.telemetry.tokens_saved = (self.telemetry.chars_in - self.telemetry.chars_out) // 4
-
- # Release prompts SEQUENTIALLY to avoid race condition in on_llm_start
- _vprint(f"[CORPUS] 🚦 Releasing {len(batch_prompts)} prompts sequentially...", self.verbose)
- for i, prompt_id in enumerate(batch_prompts):
- event = self.prompt_events.get(prompt_id)
- if event:
- event.set() # Wake up this specific thread
- # Longer delay to ensure threads hit on_llm_start one at a time
- if i < len(batch_prompts) - 1: # Don't delay after the last one
- time.sleep(0.5) # 500ms stagger to be safe
-
- # Clean up events to prevent memory leak
- for prompt_id in batch_prompts:
- self.prompt_events.pop(prompt_id, None)
+ reduction_pct = (result['removed'] / result['original_count'] * 100) if result['original_count'] > 0 else 0
+ _vprint(f"[CORPUS] Prompt {prompt_id[:8]}: {result['original_count']} → {len(result['kept'])} sentences ({reduction_pct:.1f}% removed)", self.verbose)
+
+ # Update telemetry
+ self.telemetry.entity_coverage_avg = final_node_coverage * 100 # Now tracking NODE coverage
+ # Always show final batch summary (key metric)
+ print(f"[CORPUS] ✅ Batch complete: Node coverage {final_node_coverage*100:.1f}%")
+
+ # Update telemetry
+ if self.telemetry.barrier_times:
+ self.telemetry.avg_barrier_ms = sum(self.telemetry.barrier_times) / len(self.telemetry.barrier_times)
+ self.telemetry.max_barrier_ms = max(self.telemetry.barrier_times)
+
+ self.telemetry.tokens_saved = (self.telemetry.chars_in - self.telemetry.chars_out) // 4
+
+ # Release prompts SEQUENTIALLY to avoid race condition in on_llm_start
+ _vprint(f"[CORPUS] 🚦 Releasing {len(batch_prompts)} prompts sequentially...", self.verbose)
+ for i, prompt_id in enumerate(batch_prompts):
+ event = self.prompt_events.get(prompt_id)
+ if event:
+ # Signal the asyncio.Event from the original loop thread-safely
+ loop = self.prompt_loops.get(prompt_id)
+ if loop:
+ loop.call_soon_threadsafe(event.set)
+ else:
+ event.set()
+ # Longer delay to ensure threads hit on_llm_start one at a time
+ if i < len(batch_prompts) - 1: # Don't delay after the last one
+ time.sleep(0.5) # 500ms stagger to be safe
+
+ # Clean up events to prevent memory leak
+ for prompt_id in batch_prompts:
+ self.prompt_events.pop(prompt_id, None)
+ self.prompt_loops.pop(prompt_id, None)
 
  def _get_deduplicated_prompt(self, prompt_id: str) -> str:
  """Get deduplicated prompt text."""
@@ -942,23 +954,12 @@ def split_into_sentences(text: str) -> List[str]:
  r'\n\s*\n', # Paragraph breaks
  ]
 
- combined_pattern = '|'.join(f'({p})' for p in patterns)
+ # Use non-capturing groups so delimiters are discarded by re.split
+ combined_pattern = '(?:' + '|'.join(patterns) + ')'
  parts = re.split(combined_pattern, text)
 
- # Reconstruct sentences (filter out delimiters)
- current = ""
- for part in parts:
- if part is None:
- continue
- if re.match(combined_pattern, part):
- if current.strip():
- sentences.append(current.strip())
- current = ""
- else:
- current += part
-
- if current.strip():
- sentences.append(current.strip())
+ # Collect non-empty segments as sentences
+ sentences = [p.strip() for p in parts if p and p.strip()]
 
  # Restore code blocks
  restored = []
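
With capturing groups, re.split returns the matched delimiters (and None for alternatives that did not match) interleaved with the text segments, which is what the removed reconstruction loop had to filter out; a non-capturing group makes re.split discard the delimiters so only the segments remain. A small standalone illustration (the sentence-boundary pattern here is assumed for the example; only the paragraph-break pattern is visible in this hunk):

    import re

    text = "First sentence. Second one!\n\nThird"
    patterns = [r'(?<=[.!?])\s+', r'\n\s*\n']

    capturing = '|'.join(f'({p})' for p in patterns)
    non_capturing = '(?:' + '|'.join(patterns) + ')'

    print(re.split(capturing, text))      # segments mixed with delimiter text and None entries
    print(re.split(non_capturing, text))  # ['First sentence.', 'Second one!', 'Third']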