dasein-core 0.2.14__py3-none-any.whl → 0.2.16__py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- dasein/api.py +4744 -4686
- dasein/capture.py +2 -2
- dasein/pipecleaner.py +302 -301
- {dasein_core-0.2.14.dist-info → dasein_core-0.2.16.dist-info}/METADATA +2 -1
- {dasein_core-0.2.14.dist-info → dasein_core-0.2.16.dist-info}/RECORD +8 -8
- {dasein_core-0.2.14.dist-info → dasein_core-0.2.16.dist-info}/WHEEL +0 -0
- {dasein_core-0.2.14.dist-info → dasein_core-0.2.16.dist-info}/licenses/LICENSE +0 -0
- {dasein_core-0.2.14.dist-info → dasein_core-0.2.16.dist-info}/top_level.txt +0 -0
dasein/pipecleaner.py
CHANGED
@@ -211,7 +211,8 @@ class RunScopedCorpus:
 
         # Batching state
         self.batch_queue: List[str] = []  # [prompt_ids] waiting for barrier
-        self.batch_lock = threading.Lock()
+        self.batch_lock = threading.Lock()  # Protects batch_queue, batch_timer, etc.
+        self.processing_lock = threading.Lock()  # CRITICAL: Ensures only ONE batch processes at a time
         self.batch_timer: Optional[threading.Timer] = None
         self.batch_start_time: Optional[float] = None
         self.barrier_duration: float = 5.0  # Start at 5s (min wait)
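The new `processing_lock` complements `batch_lock`: the batch lock guards brief queue mutations, while the processing lock serializes entire batch runs. A minimal sketch of the two-lock pattern (simplified names, not the package's actual API):

    import threading

    batch_lock = threading.Lock()       # short critical sections: queue mutation only
    processing_lock = threading.Lock()  # long critical section: one batch at a time
    queue = []

    def process_batch():
        with processing_lock:           # only one thread may process a batch at a time
            with batch_lock:            # briefly snapshot and clear the shared queue
                batch = list(queue)
                queue.clear()
            if not batch:
                return
            # Heavy work (embeddings, dedupe) happens here while holding only
            # processing_lock, so producers can keep appending under batch_lock.

Producers then only ever contend on the short-lived `batch_lock`, never on the long-held `processing_lock`.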
@@ -219,6 +220,7 @@ class RunScopedCorpus:
         self.barrier_cap: float = 10.0  # Max 10s
         self.batch_ready = threading.Event()  # Signal when batch is processed
         self.prompt_events: Dict[str, asyncio.Event] = {}  # Per-prompt events for ASYNC sequential release
+        self.prompt_loops: Dict[str, asyncio.AbstractEventLoop] = {}  # Event loops for thread-safe signaling
 
         # Sequence tracking
         self.next_seq = 0
@@ -381,7 +383,9 @@ class RunScopedCorpus:
         # Add to batch queue and manage barrier
         # Create per-prompt ASYNC event for sequential release
         prompt_ready = asyncio.Event()
+        loop = asyncio.get_running_loop()
         self.prompt_events[prompt_id] = prompt_ready
+        self.prompt_loops[prompt_id] = loop
 
         with self.batch_lock:
             self.batch_queue.append(prompt_id)
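Storing the running loop next to each `asyncio.Event` matters because `Event.set()` is not thread-safe: a worker thread must marshal the call back onto the loop that owns the event via `call_soon_threadsafe`, or the awaiting coroutine may never wake. A self-contained sketch of the pattern (hypothetical names, not the package's API):

    import asyncio
    import threading

    async def main():
        loop = asyncio.get_running_loop()   # capture the loop that owns the event
        ready = asyncio.Event()

        def worker():
            # ... background processing in another thread ...
            loop.call_soon_threadsafe(ready.set)  # safe cross-thread signal

        threading.Thread(target=worker).start()
        await ready.wait()   # wakes reliably even though set() came from a thread
        print("released")

    asyncio.run(main())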
@@ -436,334 +440,342 @@ class RunScopedCorpus:
 
     def _process_batch(self):
         """Process current batch: cross-prompt dedupe, entity coverage check, emit (synchronous)."""
-
-
-
-
+        # CRITICAL: Acquire processing lock to prevent multiple batches from processing simultaneously
+        with self.processing_lock:
+            with self.batch_lock:
+                if not self.batch_queue:
+                    # No prompts to process, just return (shouldn't happen)
+                    return
 
-
-
-
+                batch_prompts = self.batch_queue.copy()
+                self.batch_queue.clear()
+                self.batch_timer = None
 
-
-
-
+            batch_duration_ms = (time.time() - self.batch_start_time) * 1000
+            self.telemetry.barrier_times.append(batch_duration_ms)
+            self.telemetry.batches_processed += 1
 
-
-
+            # Always show batch summary (key metric)
+            print(f"\n[CORPUS] 🔄 Processing batch: {len(batch_prompts)} prompts, barrier={batch_duration_ms:.0f}ms")
 
-
-
-
-
+            # Step 0: Compute embeddings for NEW prompts in this batch (BATCHED operation!)
+            # This is done ONCE for the entire batch, allowing parallel arrivals
+            _vprint(f"[CORPUS] 🧮 Computing embeddings for {len(batch_prompts)} new prompts...", self.verbose)
+            model = _get_embedding_model()
 
-
-
+            for prompt_id in batch_prompts:
+                prompt_state = self.prompt_registry[prompt_id]
 
-
-
-
+                if not prompt_state.cluster_ids:  # Only process if not yet clustered
+                    # Compute embeddings for all sentences in this prompt (batch operation)
+                    sentence_embeddings = model.encode(prompt_state.sentences, show_progress_bar=False, normalize_embeddings=True)
 
-
-
-
-
-
-
+                    # Match/create clusters for each sentence
+                    cluster_ids = []
+                    for i, sentence in enumerate(prompt_state.sentences):
+                        # Compute salience
+                        salience = len(sentence) / 100.0
+                        salience += len(re.findall(r'\b[A-Z][a-z]+', sentence)) * 0.1
 
-
-
+                        # Extract entities
+                        entities, numbers = extract_entities_regex(sentence)
 
-
-
+                        # Match against existing clusters
+                        cluster_id = self.find_matching_cluster(0, sentence, sentence_embeddings[i])
 
-
-
-
-
-
+                        if cluster_id is None:
+                            # Create new cluster
+                            with self.batch_lock:
+                                cluster_id = self._generate_cluster_id()
+                                simhash = compute_simhash(sentence)
 
-
-
-
-
-
-
-
-
-
-
-
+                                cluster = SentenceCluster(
+                                    cluster_id=cluster_id,
+                                    canonical_sentence=sentence,
+                                    owner_prompt_id=prompt_id,
+                                    simhash=simhash,
+                                    salience=salience,
+                                    entities=entities | numbers,
+                                    first_seen_seq=self.next_seq,
+                                    length=len(sentence),
+                                    embedding=sentence_embeddings[i]
+                                )
 
-
-
-
+                                self.clusters[cluster_id] = cluster
+                                self.next_seq += 1
+                                self.telemetry.clusters_total += 1
 
-
+                        cluster_ids.append(cluster_id)
 
-
-
+                    # Update prompt state with cluster_ids
+                    prompt_state.cluster_ids = cluster_ids
 
-
+            _vprint(f"[CORPUS] ✅ Embeddings computed and clusters assigned", self.verbose)
 
-
-
-
-
-
+            # Step 1: Collect ALL sentences from THE ENTIRE RUN (not just current batch!)
+            # This is critical for true run-scoped deduplication
+            all_sentences = []
+            sentence_to_prompt = {}  # Map sentence_id → (prompt_id, index)
+            locked_sentences = set()  # Sentences from previous batches (already emitted, can't remove)
 
-
-
-
+            # Iterate over ALL prompts in registry (including previous batches)
+            for prompt_id, prompt_state in self.prompt_registry.items():
+                is_previous_batch = prompt_id not in batch_prompts
 
-
-
-
-
+                for idx, (sentence_text, cluster_id) in enumerate(zip(prompt_state.sentences, prompt_state.cluster_ids)):
+                    cluster = self.clusters.get(cluster_id)
+                    if not cluster:
+                        continue
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    # Create Sentence object for greedy algorithm
+                    sent_id = f"{prompt_id}_{idx}"
+                    sent_obj = Sentence(
+                        id=sent_id,
+                        text=sentence_text,
+                        embedding=cluster.embedding,
+                        entities=cluster.entities,  # Keep ALL entities for accurate coverage tracking
+                        numbers=set(),  # Already in entities
+                        salience=cluster.salience,
+                        position=cluster.first_seen_seq
+                    )
+                    all_sentences.append(sent_obj)
+                    sentence_to_prompt[sent_id] = (prompt_id, idx)
 
-
-
-
+                    # Lock sentences from previous batches (already emitted to user)
+                    if is_previous_batch:
+                        locked_sentences.add(sent_id)
 
-
-
+            _vprint(f"[CORPUS] 🌐 Run-scoped MIS: {len(all_sentences)} total sentences ({len(locked_sentences)} locked from previous batches, {len(all_sentences)-len(locked_sentences)} new)", self.verbose)
+            _vprint(f"[CORPUS] 🧮 Running greedy max-independent-set on {len(all_sentences)} sentences", self.verbose)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Step 2: Compute degree map (needed for isolates pass later)
+            degree_map = {}
+            for sent in all_sentences:
+                degree = 0
+                for other in all_sentences:
+                    if sent.id != other.id:
+                        if are_sentences_similar(sent, other, semantic_threshold=0.60):
+                            degree += 1
+                degree_map[sent.id] = degree
+
+            # Sanity checks
+            isolates_before = [s for s in all_sentences if degree_map[s.id] == 0]
+            non_isolates = [s for s in all_sentences if degree_map[s.id] > 0]
+            pct_isolates = len(isolates_before) / len(all_sentences) * 100 if all_sentences else 0
+            avg_degree_non_iso = sum(degree_map[s.id] for s in non_isolates) / len(non_isolates) if non_isolates else 0
+            print(f"[CORPUS] 📊 Graph: isolates={pct_isolates:.1f}% (expect <20%), non-isolate avg degree={avg_degree_non_iso:.1f} (expect >3)")
+
+            # Step 3: Run greedy maximum-independent-set selection
+            # Start with LOCKED sentences (from previous batches, already emitted)
+            # Then run MIS only on NEW sentences (current batch)
+            selected_sentences = [s for s in all_sentences if s.id in locked_sentences]
+            selected_ids = locked_sentences.copy()
+
+            print(f"[CORPUS] 🔒 Pre-seeded MIS with {len(locked_sentences)} locked sentences from previous batches")
+
+            # Now run MIS on NEW sentences only (exclude locked)
+            new_sentences = [s for s in all_sentences if s.id not in locked_sentences]
+
+            if new_sentences:
+                # Run MIS on new sentences, considering locked ones as neighbors
+                new_selected = greedy_max_independent_set(
+                    new_sentences,
+                    similarity_threshold=0.60,
+                    verbose=False,  # Set to True for debugging
+                    precomputed_degree_map=degree_map  # Pass precomputed degrees
+                )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                # Add newly selected sentences
+                selected_sentences.extend(new_selected)
+                selected_ids.update(s.id for s in new_selected)
+
+            _vprint(f"[CORPUS] ✅ MIS complete: {len(selected_ids)} total kept ({len(locked_sentences)} locked + {len(selected_ids)-len(locked_sentences)} new)", self.verbose)
+
+            # Step 3: Compute NODE COVERAGE (align universe for backfill)
+            # covered_nodes = S ∪ N(S) (selected + their neighbors)
+            covered_nodes = set(selected_ids)
+            sentence_map = {s.id: s for s in all_sentences}
+
+            for selected_id in selected_ids:
+                selected_sent = sentence_map[selected_id]
+                # Add all neighbors (similar nodes)
+                for other in all_sentences:
+                    if other.id != selected_id:
+                        if are_sentences_similar(selected_sent, other, semantic_threshold=0.60):
+                            covered_nodes.add(other.id)
+
+            total_nodes = len(all_sentences)
+            node_coverage_before = len(covered_nodes) / total_nodes if total_nodes > 0 else 0.0
+
+            _vprint(f"[CORPUS] 📊 After MIS: nodes={len(selected_ids)}/{total_nodes} kept, coverage (S∪N(S))={len(covered_nodes)}/{total_nodes} ({node_coverage_before*100:.1f}%)", self.verbose)
+
+            # Step 4: Backfill = GREEDY SET COVER over NODES (no independence constraint!)
+            # Goal: Maximize node coverage (S ∪ N(S)) by re-adding removed nodes with highest gain
+            # gain(u) = |({u} ∪ N(u)) \ covered_nodes|
+            backfill_added = 0
+            isolates_added = 0
+            target_coverage = 0.90  # 90% node coverage target
+
+            if node_coverage_before < target_coverage:
+                uncovered_count = total_nodes - len(covered_nodes)
+                _vprint(f"[CORPUS] 🔧 Backfill: {uncovered_count} uncovered nodes, targeting {target_coverage*100:.0f}% coverage", self.verbose)
 
-
-
+                # Get ALL removed sentences (candidates for backfill)
+                removed_sentences = [sent for sent in all_sentences if sent.id not in selected_ids]
 
-
-
-
-
-
-
-
-
-
-
-
+                # Helper: compute node gain for a candidate
+                def compute_node_gain(sent):
+                    """Compute how many uncovered nodes this sentence + its neighbors would cover."""
+                    candidate_coverage = {sent.id}
+                    # Add neighbors
+                    for other in all_sentences:
+                        if other.id != sent.id:
+                            if are_sentences_similar(sent, other, semantic_threshold=0.60):
+                                candidate_coverage.add(other.id)
+                    # Gain = new nodes not already covered
+                    return len(candidate_coverage - covered_nodes)
 
-
-
-
-
-
-
-
+                # Debug: Print top-5 candidates by gain (first iteration only)
+                if removed_sentences:
+                    gains = [(sent, compute_node_gain(sent)) for sent in removed_sentences[:20]]  # Sample first 20 for speed
+                    gains.sort(key=lambda x: x[1], reverse=True)
+                    _vprint(f"[CORPUS] Top-5 backfill candidates by gain:", self.verbose)
+                    for sent, gain in gains[:5]:
+                        _vprint(f"  gain={gain}: '{sent.text[:60]}...'", self.verbose)
 
-
-
-
-
-
-
+                # GREEDY SET COVER: repeatedly pick sentence with max gain
+                iteration = 0
+                while node_coverage_before < target_coverage and removed_sentences and iteration < 100:
+                    # Find best candidate
+                    best_sent = None
+                    best_gain = 0
 
-
-
-
-
-
+                    for sent in removed_sentences:
+                        gain = compute_node_gain(sent)
+                        if gain > best_gain:
+                            best_gain = gain
+                            best_sent = sent
 
-
-
-
+                    if best_gain == 0:
+                        _vprint(f"[CORPUS] Backfill: all remaining candidates have gain=0, stopping", self.verbose)
+                        break
 
-
-
-
+                    # Add best sentence back
+                    selected_ids.add(best_sent.id)
+                    selected_sentences.append(best_sent)
 
-
-
-
-
-
-
+                    # Update covered_nodes: add this node + its neighbors
+                    covered_nodes.add(best_sent.id)
+                    for other in all_sentences:
+                        if other.id != best_sent.id:
+                            if are_sentences_similar(best_sent, other, semantic_threshold=0.60):
+                                covered_nodes.add(other.id)
 
-
-
+                    removed_sentences.remove(best_sent)
+                    backfill_added += 1
 
-
-
-
+                    # Update coverage
+                    node_coverage_before = len(covered_nodes) / total_nodes
+                    iteration += 1
 
-
-
+                    if backfill_added <= 5:
+                        _vprint(f"[CORPUS] ✅ Backfill +{best_gain} nodes: '{best_sent.text[:60]}...' (coverage now {node_coverage_before*100:.1f}%)", self.verbose)
 
-
+                _vprint(f"[CORPUS] 📈 After backfill: +{backfill_added} sentences, node coverage {node_coverage_before*100:.1f}%)", self.verbose)
 
-
-
-
-
+            # Step 5: ISOLATES PASS - add uncovered degree=0 nodes
+            # These are unique nodes with no similar neighbors
+            uncovered_isolates = [sent for sent in all_sentences
+                                  if sent.id not in covered_nodes and degree_map[sent.id] == 0]
 
-
-
+            if uncovered_isolates:
+                _vprint(f"[CORPUS] 🔧 Isolates pass: {len(uncovered_isolates)} uncovered isolates (degree=0)", self.verbose)
 
-
-
-
-
-
-
-
+                for sent in uncovered_isolates:
+                    if node_coverage_before >= target_coverage:
+                        break
+                    selected_ids.add(sent.id)
+                    covered_nodes.add(sent.id)
+                    isolates_added += 1
+                    node_coverage_before = len(covered_nodes) / total_nodes
 
-
-
+                    if isolates_added <= 5:
+                        _vprint(f"[CORPUS] ✅ Isolate: '{sent.text[:60]}...'", self.verbose)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            if isolates_added > 0:
+                _vprint(f"[CORPUS] 📈 After isolates: +{isolates_added} sentences, node coverage {node_coverage_before*100:.1f}%", self.verbose)
+
+            # Final coverage stats (NODE universe)
+            final_selected = len(selected_ids)
+            final_covered_nodes = len(covered_nodes)
+            final_node_coverage = final_covered_nodes / total_nodes if total_nodes > 0 else 0.0
+
+            # Assert denominator is |V| (all nodes, no filtering)
+            assert total_nodes == len(all_sentences), f"Denominator mismatch: {total_nodes} != {len(all_sentences)}"
+
+            _vprint(f"[CORPUS] ✅ Final: kept={final_selected}/{total_nodes}, covered (S∪N(S))={final_covered_nodes}/{total_nodes} ({final_node_coverage*100:.1f}%)", self.verbose)
+            _vprint(f"[CORPUS] 📊 Backfill={backfill_added}, Isolates={isolates_added}", self.verbose)
+
+            # Step 6: Map results back to prompts
+            results = {}
+            for prompt_id in batch_prompts:
+                prompt_state = self.prompt_registry[prompt_id]
+                kept_sentences = []
+                removed_count = 0
 
-
-
-
-
-
-
+                for idx, sentence_text in enumerate(prompt_state.sentences):
+                    sent_id = f"{prompt_id}_{idx}"
+                    if sent_id in selected_ids:
+                        kept_sentences.append(sentence_text)
+                    else:
+                        removed_count += 1
 
-
-
-
-
-
-
-
-
-
-
-
+                results[prompt_id] = {
+                    'kept': kept_sentences,
+                    'removed': removed_count,
+                    'original_count': len(prompt_state.sentences)
+                }
+
+            # Step 7: Store results and emit to prompts
+            for prompt_id in batch_prompts:
+                prompt_state = self.prompt_registry[prompt_id]
+                result = results[prompt_id]
+                prompt_state.sentences = result['kept']
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                reduction_pct = (result['removed'] / result['original_count'] * 100) if result['original_count'] > 0 else 0
+                _vprint(f"[CORPUS] Prompt {prompt_id[:8]}: {result['original_count']} → {len(result['kept'])} sentences ({reduction_pct:.1f}% removed)", self.verbose)
+
+                # Update telemetry
+                self.telemetry.entity_coverage_avg = final_node_coverage * 100  # Now tracking NODE coverage
+            # Always show final batch summary (key metric)
+            print(f"[CORPUS] ✅ Batch complete: Node coverage {final_node_coverage*100:.1f}%")
+
+            # Update telemetry
+            if self.telemetry.barrier_times:
+                self.telemetry.avg_barrier_ms = sum(self.telemetry.barrier_times) / len(self.telemetry.barrier_times)
+                self.telemetry.max_barrier_ms = max(self.telemetry.barrier_times)
+
+            self.telemetry.tokens_saved = (self.telemetry.chars_in - self.telemetry.chars_out) // 4
+
+            # Release prompts SEQUENTIALLY to avoid race condition in on_llm_start
+            _vprint(f"[CORPUS] 🚦 Releasing {len(batch_prompts)} prompts sequentially...", self.verbose)
+            for i, prompt_id in enumerate(batch_prompts):
+                event = self.prompt_events.get(prompt_id)
+                if event:
+                    # Signal the asyncio.Event from the original loop thread-safely
+                    loop = self.prompt_loops.get(prompt_id)
+                    if loop:
+                        loop.call_soon_threadsafe(event.set)
+                    else:
+                        event.set()
+                # Longer delay to ensure threads hit on_llm_start one at a time
+                if i < len(batch_prompts) - 1:  # Don't delay after the last one
+                    time.sleep(0.5)  # 500ms stagger to be safe
+
+            # Clean up events to prevent memory leak
+            for prompt_id in batch_prompts:
+                self.prompt_events.pop(prompt_id, None)
+                self.prompt_loops.pop(prompt_id, None)
 
     def _get_deduplicated_prompt(self, prompt_id: str) -> str:
         """Get deduplicated prompt text."""
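The backfill in this hunk is a textbook greedy set cover over the similarity graph's nodes: a candidate's gain is how many still-uncovered nodes it and its neighbors would add, and the loop repeatedly takes the highest-gain candidate until the coverage target is reached or no candidate helps. A stripped-down sketch under the same definitions (the similarity graph is passed in as a precomputed `neighbors` dict; this is an illustration, not the package's code):

    def greedy_backfill(candidates, neighbors, covered, target, total):
        """Re-add removed nodes until len(covered) / total reaches target."""
        added = []
        while len(covered) / total < target and candidates:
            # gain(u) = |({u} ∪ N(u)) \ covered|
            best = max(candidates, key=lambda u: len(({u} | neighbors[u]) - covered))
            if len(({best} | neighbors[best]) - covered) == 0:
                break  # no remaining candidate improves coverage
            covered |= {best} | neighbors[best]
            candidates.remove(best)
            added.append(best)
        return added

    # Toy graph: "a" covers itself plus b and c; "d" is an isolate
    nbrs = {"a": {"b", "c"}, "b": {"a"}, "c": {"a"}, "d": set()}
    print(greedy_backfill(["a", "d"], nbrs, set(), target=0.9, total=4))  # ['a', 'd']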
@@ -942,23 +954,12 @@ def split_into_sentences(text: str) -> List[str]:
         r'\n\s*\n',  # Paragraph breaks
     ]
 
-
+    # Use non-capturing groups so delimiters are discarded by re.split
+    combined_pattern = '(?:' + '|'.join(patterns) + ')'
     parts = re.split(combined_pattern, text)
 
-    #
-
-    for part in parts:
-        if part is None:
-            continue
-        if re.match(combined_pattern, part):
-            if current.strip():
-                sentences.append(current.strip())
-            current = ""
-        else:
-            current += part
-
-    if current.strip():
-        sentences.append(current.strip())
+    # Collect non-empty segments as sentences
+    sentences = [p.strip() for p in parts if p and p.strip()]
 
     # Restore code blocks
     restored = []