dasein-core 0.2.15__py3-none-any.whl → 0.2.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dasein/pipecleaner.py CHANGED
@@ -1,1920 +1,1918 @@
1
- """
2
- Pipecleaner: Run-scoped global corpus deduplication for multi-agent systems.
3
-
4
- V2.0: Global ClusterBank with dynamic batching barrier (5-10s) for cross-prompt deduplication.
5
- - Run-scoped corpus: All prompts in a run share a global ClusterBank
6
- - SimHash near-dup matching: Hamming distance ≤6 for 64-bit fingerprints
7
- - Dynamic barrier: 5s min, +2s per arrival (cap 10s), maximizes dedupe by collecting bursts
8
- - Canonical ownership: First prompt to use a cluster owns it, others drop duplicates
9
- - Entity coverage: 95% threshold RUN-LEVEL (cumulative across all batches, not per-batch)
10
-
11
- Algorithm:
12
- 1. Intercept prompt → split sentences → compute SimHash signatures
13
- 2. Match against ClusterBank (Hamming ≤6) → assign cluster_id or create new
14
- 3. Queue prompt into micro-batch, extend barrier (+2s per arrival, cap 10s)
15
- 4. On timer: cross-prompt dedupe (keep only canonical owners)
16
- 5. RUN-LEVEL entity coverage check (95% cumulative across the entire run), re-add if needed
17
- 6. Emit cleaned prompts (original sentence order preserved)
18
-
19
- Expected savings: 50-90% char reduction with 95%+ entity coverage across the entire run.
20
- Later batches are MORE aggressive (earlier batches already covered entities).
21
- """
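
As a concrete illustration of the flow described above, a minimal usage sketch of the run-scoped API defined in this module (illustrative only: the run_id and prompt texts are invented, sentence-transformers must be installed, and the awaits block for the duration of the batching barrier):

    import asyncio
    from dasein.pipecleaner import get_or_create_corpus, cleanup_corpus

    async def main():
        corpus = get_or_create_corpus("demo-run-0001", verbose=True)
        # Two overlapping prompts arriving inside the same 5-10s barrier window
        cleaned = await asyncio.gather(
            corpus.enqueue_prompt("prompt-a", "LangChain is a framework for LLM apps. It ships many integrations."),
            corpus.enqueue_prompt("prompt-b", "LangChain is a framework for building LLM apps. Pricing is free."),
        )
        print(cleaned)                   # deduplicated texts, original sentence order preserved per prompt
        cleanup_corpus("demo-run-0001")  # prints the run-level telemetry summary

    asyncio.run(main())
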
22
-
23
- import re
24
- import hashlib
25
- import threading
26
- import time
27
- from typing import List, Dict, Set, Tuple, Optional, Any
28
- from dataclasses import dataclass, field
29
- from collections import defaultdict
30
- import numpy as np
31
- import asyncio
32
-
33
- # Type alias for return type
34
- DeduplicationResult = Tuple[str, Dict]
35
-
36
- # Lazy imports for performance (only load when needed)
37
- _embedding_model = None
38
- _spacy_nlp = None
39
- _model_lock = threading.Lock() # Thread-safe singleton access
40
-
41
-
42
- def _vprint(message: str, verbose: bool = False, force: bool = False):
43
- """Helper function for verbose printing."""
44
- if force or verbose:
45
- print(message)
46
-
47
-
48
- def _get_embedding_model():
49
- """
50
- Lazy load sentence transformer model (thread-safe singleton).
51
- Forces CPU to avoid meta tensor issues on Win + Py3.13 + Torch.
52
- """
53
- global _embedding_model
54
-
55
- # Double-checked locking pattern for performance
56
- if _embedding_model is None:
57
- with _model_lock:
58
- # Check again inside lock (another thread might have loaded it)
59
- if _embedding_model is None:
60
- try:
61
- from sentence_transformers import SentenceTransformer
62
- print("[PIPECLEANER] Loading embedding model: all-MiniLM-L6-v2 (384-dim, ~80MB)...")
63
- # Force CPU device to avoid meta tensor issues
64
- _embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
65
- print("[PIPECLEANER] ✅ Embedding model loaded successfully (CPU)")
66
- except ImportError:
67
- print("[PIPECLEANER] ⚠️ sentence-transformers not installed. Install: pip install sentence-transformers")
68
- raise
69
- except Exception as e:
70
- print(f"[PIPECLEANER] ⚠️ Failed to load embedding model: {e}")
71
- raise
72
-
73
- return _embedding_model
74
-
75
-
76
- def _get_spacy_model():
77
- """Lazy load spaCy model for entity extraction."""
78
- global _spacy_nlp
79
- if _spacy_nlp is None:
80
- try:
81
- import spacy
82
- print("[PIPECLEANER] Loading spaCy model: en_core_web_sm...")
83
- _spacy_nlp = spacy.load("en_core_web_sm")
84
- print("[PIPECLEANER] ✅ spaCy model loaded successfully")
85
- except ImportError:
86
- print("[PIPECLEANER] ⚠️ spaCy not installed. Using regex fallback for entities.")
87
- _spacy_nlp = "fallback"
88
- except OSError:
89
- print("[PIPECLEANER] ⚠️ spaCy model not found. Using regex fallback for entities.")
90
- _spacy_nlp = "fallback"
91
- return _spacy_nlp
92
-
93
-
94
- # ============================================================================
95
- # Run-Scoped Global Corpus System V2.0
96
- # ============================================================================
97
-
98
- @dataclass
99
- class SentenceCluster:
100
- """Represents a cluster of similar sentences across the run."""
101
- cluster_id: str
102
- canonical_sentence: str
103
- owner_prompt_id: str # First prompt to use this cluster
104
- simhash: int # 64-bit SimHash fingerprint
105
- salience: float
106
- entities: Set[str]
107
- first_seen_seq: int
108
- length: int
109
- embedding: Optional[np.ndarray] = None # Sentence embedding for cosine similarity
110
-
111
- @dataclass
112
- class PromptState:
113
- """State for a single prompt in the batch."""
114
- prompt_id: str
115
- sentences: List[str]
116
- cluster_ids: List[str] # parallel to sentences
117
- original_order: List[int] # track reordering
118
- entities: Set[str]
119
- arrived_at: float
120
-
121
- @dataclass
122
- class RunCorpusTelemetry:
123
- """Run-level statistics for the corpus."""
124
- prompts_total: int = 0
125
- sentences_total: int = 0
126
- clusters_total: int = 0
127
- cross_prompt_dups_removed: int = 0
128
- chars_in: int = 0
129
- chars_out: int = 0
130
- tokens_saved: int = 0
131
- entity_coverage_avg: float = 100.0
132
- batches_processed: int = 0
133
- avg_barrier_ms: float = 0.0
134
- max_barrier_ms: float = 0.0
135
- barrier_times: List[float] = field(default_factory=list)
136
-
137
-
138
- def compute_simhash(text: str, hash_bits: int = 64) -> int:
139
- """
140
- Compute SimHash fingerprint for near-dup detection.
141
-
142
- Args:
143
- text: Input text
144
- hash_bits: Hash size (64-bit default)
145
-
146
- Returns:
147
- Integer hash value
148
- """
149
- # Tokenize and compute feature hashes
150
- tokens = re.findall(r'\b\w+\b', text.lower())
151
- if not tokens:
152
- return 0
153
-
154
- # Initialize bit vector
155
- v = [0] * hash_bits
156
-
157
- for token in tokens:
158
- # Hash each token
159
- h = int(hashlib.md5(token.encode()).hexdigest(), 16)
160
-
161
- # Update bit vector
162
- for i in range(hash_bits):
163
- if h & (1 << i):
164
- v[i] += 1
165
- else:
166
- v[i] -= 1
167
-
168
- # Generate final hash
169
- fingerprint = 0
170
- for i in range(hash_bits):
171
- if v[i] > 0:
172
- fingerprint |= (1 << i)
173
-
174
- return fingerprint
175
-
176
-
177
- def hamming_distance(hash1: int, hash2: int) -> int:
178
- """Count differing bits between two hashes."""
179
- return bin(hash1 ^ hash2).count('1')
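
Illustration of how the two helpers above combine for near-dup detection (invented sentences; exact bit distances depend on the MD5 token hashes, so the comparison against the threshold is the point, not the specific numbers):

    a = compute_simhash("The quick brown fox jumps over the lazy dog")
    b = compute_simhash("The quick brown fox jumped over the lazy dog")
    c = compute_simhash("Quarterly revenue grew by double digits")
    print(hamming_distance(a, b))           # small: the token sets almost fully overlap
    print(hamming_distance(a, c))           # large: mostly disjoint tokens
    near_dup = hamming_distance(a, b) <= 6  # default hamming_threshold on RunScopedCorpus
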
180
-
181
-
182
- class RunScopedCorpus:
183
- """
184
- Global corpus for a single run, with dynamic batching barrier.
185
- All prompts in the run share this corpus for cross-prompt deduplication.
186
-
187
- CONCURRENCY MODEL:
188
- - All shared state (clusters, prompt_registry, run_entities, kept_entities, batch_queue)
189
- is protected by `self.batch_lock` (threading.Lock)
190
- - All reads iterate over snapshots (dict(...), list(...)) to avoid "dict changed size" errors
191
- - All writes are atomic under lock (copy-on-write when possible)
192
- - Re-entrancy guard in caller (DaseinCallbackHandler) prevents nested calls
193
- - Background timer thread (_process_batch) acquires lock before any mutations
194
- """
195
-
196
- def __init__(self, run_id: str, hamming_threshold: int = 6, entity_coverage_min: float = 0.95, verbose: bool = False):
197
- self.run_id = run_id
198
- self.hamming_threshold = hamming_threshold
199
- self.entity_coverage_min = entity_coverage_min
200
- self.verbose = verbose # Gate debug logging
201
-
202
- # Core state
203
- self.clusters: Dict[str, SentenceCluster] = {} # cluster_id → cluster
204
- self.simhash_index: Dict[int, List[str]] = defaultdict(list) # simhash → [cluster_ids]
205
- self.prompt_registry: Dict[str, PromptState] = {} # prompt_id → state
206
- self.entity_index: Dict[str, Set[str]] = defaultdict(set) # entity → {cluster_ids}
207
-
208
- # Run-level entity tracking for global coverage
209
- self.run_entities: Set[str] = set() # All entities seen across entire run
210
- self.kept_entities: Set[str] = set() # All entities kept across all batches
211
-
212
- # Batching state
213
- self.batch_queue: List[str] = [] # [prompt_ids] waiting for barrier
214
- self.batch_lock = threading.Lock() # Protects batch_queue, batch_timer, etc.
215
- self.processing_lock = threading.Lock() # CRITICAL: Ensures only ONE batch processes at a time
216
- self.batch_timer: Optional[threading.Timer] = None
217
- self.batch_start_time: Optional[float] = None
218
- self.barrier_duration: float = 5.0 # Start at 5s (min wait)
219
- self.barrier_increment: float = 2.0 # Add 2s per new arrival
220
- self.barrier_cap: float = 10.0 # Max 10s
221
- self.batch_ready = threading.Event() # Signal when batch is processed
222
- self.prompt_events: Dict[str, asyncio.Event] = {} # Per-prompt events for ASYNC sequential release
223
-
224
- # Sequence tracking
225
- self.next_seq = 0
226
- self.next_cluster_id = 0
227
-
228
- # Telemetry
229
- self.telemetry = RunCorpusTelemetry()
230
-
231
- _vprint(f"[CORPUS] 🏗️ Created run-scoped corpus for run_id={run_id[:8]} (barrier: 5s min, +2s/arrival, 10s cap)", self.verbose)
232
-
233
- def _generate_cluster_id(self) -> str:
234
- """Generate unique cluster ID."""
235
- cluster_id = f"c{self.next_cluster_id:06d}"
236
- self.next_cluster_id += 1
237
- return cluster_id
238
-
239
- def find_matching_cluster(self, simhash: int, sentence: str, sentence_embedding=None) -> Optional[str]:
240
- """
241
- Find existing cluster that matches this sentence using cosine similarity.
242
-
243
- Args:
244
- simhash: SimHash of the sentence (for indexing, not matching)
245
- sentence: Original sentence text
246
- sentence_embedding: Pre-computed embedding for this sentence
247
-
248
- Returns:
249
- cluster_id if match found, None otherwise
250
- """
251
- if sentence_embedding is None:
252
- return None
253
-
254
- # Check all existing clusters for semantic similarity
255
- # Use cosine similarity 0.60 (catches cross-site paraphrases)
256
- best_match_id = None
257
- best_similarity = 0.60 # Threshold for considering duplicate (lowered to catch paraphrases)
258
-
259
- # Snapshot clusters to avoid "dict changed size" errors (thread-safe read)
260
- with self.batch_lock:
261
- clusters_snapshot = dict(self.clusters)
262
-
263
- for cluster_id, cluster in clusters_snapshot.items():
264
- if cluster.canonical_sentence == sentence:
265
- # Exact match
266
- return cluster_id
267
-
268
- # Hybrid similarity: semantic + lexical fallback for short sentences
269
- if hasattr(cluster, 'embedding') and cluster.embedding is not None:
270
- # Semantic similarity
271
- similarity = np.dot(sentence_embedding, cluster.embedding)
272
-
273
- # Lexical fallback for short sentences (boilerplate detection)
274
- max_len = max(len(sentence), len(cluster.canonical_sentence))
275
- if max_len <= 120 and similarity < 0.60:
276
- lexical_sim = compute_char_3gram_jaccard(sentence, cluster.canonical_sentence)
277
- if lexical_sim >= 0.82:
278
- # Boost similarity to indicate match via lexical path
279
- similarity = max(similarity, 0.82)
280
-
281
- if similarity > best_similarity:
282
- best_similarity = similarity
283
- best_match_id = cluster_id
284
-
285
- return best_match_id
286
-
287
- def add_sentence_to_corpus(self, sentence: str, prompt_id: str, salience: float, entities: Set[str]) -> str:
288
- """
289
- Add sentence to corpus or match to existing cluster.
290
-
291
- Args:
292
- sentence: Sentence text
293
- prompt_id: Owner prompt
294
- salience: Importance score
295
- entities: Extracted entities
296
-
297
- Returns:
298
- cluster_id (new or matched)
299
- """
300
- # Compute SimHash
301
- simhash = compute_simhash(sentence)
302
-
303
- # Try to match existing cluster
304
- existing_cluster_id = self.find_matching_cluster(simhash, sentence)
305
-
306
- if existing_cluster_id:
307
- # Matched existing cluster
308
- return existing_cluster_id
309
-
310
- # Create new cluster
311
- cluster_id = self._generate_cluster_id()
312
- cluster = SentenceCluster(
313
- cluster_id=cluster_id,
314
- canonical_sentence=sentence,
315
- owner_prompt_id=prompt_id,
316
- simhash=simhash,
317
- salience=salience,
318
- entities=entities,
319
- first_seen_seq=self.next_seq,
320
- length=len(sentence)
321
- )
322
-
323
- self.clusters[cluster_id] = cluster
324
- self.simhash_index[simhash].append(cluster_id)
325
-
326
- # Update entity index
327
- for entity in entities:
328
- self.entity_index[entity].add(cluster_id)
329
-
330
- self.next_seq += 1
331
- self.telemetry.clusters_total += 1
332
-
333
- return cluster_id
334
-
335
- async def enqueue_prompt(self, prompt_id: str, prompt_text: str) -> str:
336
- """
337
- Enqueue prompt for batched processing with dynamic barrier (ASYNC - allows parallel arrivals).
338
-
339
- Args:
340
- prompt_id: Unique prompt identifier
341
- prompt_text: Full prompt text
342
-
343
- Returns:
344
- Deduplicated prompt text (after barrier)
345
- """
346
- arrival_time = time.time()
347
-
348
- # Split into sentences
349
- sentences = split_into_sentences(prompt_text)
350
-
351
- if not sentences:
352
- return prompt_text
353
-
354
- self.telemetry.prompts_total += 1
355
- self.telemetry.sentences_total += len(sentences)
356
- self.telemetry.chars_in += len(prompt_text)
357
-
358
- # ⚡ CRITICAL: DO NOT compute embeddings here! It blocks async arrivals.
359
- # Store raw sentences and compute embeddings in batch during _process_batch
360
- all_entities = set()
361
-
362
- for sentence in sentences:
363
- # Extract entities (fast, non-blocking)
364
- entities, numbers = extract_entities_regex(sentence)
365
- all_entities.update(entities)
366
- all_entities.update(numbers)
367
-
368
- # Create prompt state (thread-safe mutation)
369
- # NOTE: cluster_ids will be computed during batch processing (after embeddings)
370
- with self.batch_lock:
371
- prompt_state = PromptState(
372
- prompt_id=prompt_id,
373
- sentences=sentences,
374
- cluster_ids=[], # Will be filled during _process_batch
375
- original_order=list(range(len(sentences))),
376
- entities=all_entities,
377
- arrived_at=arrival_time
378
- )
379
-
380
- self.prompt_registry[prompt_id] = prompt_state
381
-
382
- # Add to batch queue and manage barrier
383
- # Create per-prompt ASYNC event for sequential release
384
- prompt_ready = asyncio.Event()
385
- self.prompt_events[prompt_id] = prompt_ready
386
-
387
- with self.batch_lock:
388
- self.batch_queue.append(prompt_id)
389
-
390
- if self.batch_timer is None:
391
- # First prompt in batch, start timer at 5s
392
- self.batch_start_time = arrival_time
393
- self.barrier_duration = 5.0
394
- print(f"[CORPUS] ⏱️ Starting batch barrier: 5.0s (first prompt, min wait)")
395
- self.batch_timer = threading.Timer(self.barrier_duration, self._process_batch)
396
- self.batch_timer.start()
397
- else:
398
- # Extend barrier by +2s per arrival (capped at 10s)
399
- elapsed = arrival_time - self.batch_start_time
400
- new_duration = min(elapsed + self.barrier_increment, self.barrier_cap)
401
-
402
- if new_duration > self.barrier_duration:
403
- # Cancel old timer, start new one
404
- self.batch_timer.cancel()
405
- remaining = new_duration - elapsed
406
- self.barrier_duration = new_duration
407
- _vprint(f"[CORPUS] ⏱️ Extending barrier to {new_duration:.1f}s (+{remaining:.1f}s remaining, +{self.barrier_increment:.1f}s per arrival)", self.verbose)
408
- self.batch_timer = threading.Timer(remaining, self._process_batch)
409
- self.batch_timer.start()
410
-
411
- # ASYNC wait for THIS prompt's individual event (allows other async tasks to proceed)
412
- # Timeout must be generous to account for model loading on first batch
413
- try:
414
- await asyncio.wait_for(prompt_ready.wait(), timeout=30.0) # 30s max wait (model load + processing)
415
- timed_out = False
416
- except asyncio.TimeoutError:
417
- timed_out = True
418
-
419
- if timed_out:
420
- # Fail open: return original text if batch processing hangs
421
- print(f"[CORPUS] ⚠️ Timeout waiting for batch processing, returning original prompt")
422
- self.telemetry.chars_out += len(prompt_text)
423
- return prompt_text
424
-
425
- # Retrieve deduplicated result
426
- deduplicated_text = self._get_deduplicated_prompt(prompt_id)
427
-
428
- if not deduplicated_text:
429
- # Safety: if result is missing, return original
430
- print(f"[CORPUS] ⚠️ Missing deduplicated result for prompt {prompt_id[:8]}, returning original")
431
- self.telemetry.chars_out += len(prompt_text)
432
- return prompt_text
433
-
434
- self.telemetry.chars_out += len(deduplicated_text)
435
-
436
- return deduplicated_text
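
The barrier bookkeeping above reduces to new_duration = min(elapsed + barrier_increment, barrier_cap), applied only when it exceeds the current duration. A worked example with assumed arrival offsets (not taken from a real run):

    barrier_increment, barrier_cap = 2.0, 10.0
    duration = 5.0                        # first prompt opens the 5s minimum window
    for elapsed in (1.0, 4.5, 8.0, 9.5):  # later prompts, seconds after the first arrival
        proposed = min(elapsed + barrier_increment, barrier_cap)
        if proposed > duration:
            duration = proposed
        print(elapsed, duration)          # 1.0→5.0 (no change), 4.5→6.5, 8.0→10.0, 9.5→10.0 (capped)
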
437
-
438
- def _process_batch(self):
439
- """Process current batch: cross-prompt dedupe, entity coverage check, emit (synchronous)."""
440
- # CRITICAL: Acquire processing lock to prevent multiple batches from processing simultaneously
441
- with self.processing_lock:
442
- with self.batch_lock:
443
- if not self.batch_queue:
444
- # No prompts to process, just return (shouldn't happen)
445
- return
446
-
447
- batch_prompts = self.batch_queue.copy()
448
- self.batch_queue.clear()
449
- self.batch_timer = None
450
-
451
- batch_duration_ms = (time.time() - self.batch_start_time) * 1000
452
- self.telemetry.barrier_times.append(batch_duration_ms)
453
- self.telemetry.batches_processed += 1
454
-
455
- # Always show batch summary (key metric)
456
- print(f"\n[CORPUS] 🔄 Processing batch: {len(batch_prompts)} prompts, barrier={batch_duration_ms:.0f}ms")
457
-
458
- # Step 0: Compute embeddings for NEW prompts in this batch (BATCHED operation!)
459
- # This is done ONCE for the entire batch, allowing parallel arrivals
460
- _vprint(f"[CORPUS] 🧮 Computing embeddings for {len(batch_prompts)} new prompts...", self.verbose)
461
- model = _get_embedding_model()
462
-
463
- for prompt_id in batch_prompts:
464
- prompt_state = self.prompt_registry[prompt_id]
465
-
466
- if not prompt_state.cluster_ids: # Only process if not yet clustered
467
- # Compute embeddings for all sentences in this prompt (batch operation)
468
- sentence_embeddings = model.encode(prompt_state.sentences, show_progress_bar=False, normalize_embeddings=True)
469
-
470
- # Match/create clusters for each sentence
471
- cluster_ids = []
472
- for i, sentence in enumerate(prompt_state.sentences):
473
- # Compute salience
474
- salience = len(sentence) / 100.0
475
- salience += len(re.findall(r'\b[A-Z][a-z]+', sentence)) * 0.1
476
-
477
- # Extract entities
478
- entities, numbers = extract_entities_regex(sentence)
479
-
480
- # Match against existing clusters
481
- cluster_id = self.find_matching_cluster(0, sentence, sentence_embeddings[i])
482
-
483
- if cluster_id is None:
484
- # Create new cluster
485
- with self.batch_lock:
486
- cluster_id = self._generate_cluster_id()
487
- simhash = compute_simhash(sentence)
488
-
489
- cluster = SentenceCluster(
490
- cluster_id=cluster_id,
491
- canonical_sentence=sentence,
492
- owner_prompt_id=prompt_id,
493
- simhash=simhash,
494
- salience=salience,
495
- entities=entities | numbers,
496
- first_seen_seq=self.next_seq,
497
- length=len(sentence),
498
- embedding=sentence_embeddings[i]
499
- )
500
-
501
- self.clusters[cluster_id] = cluster
502
- self.next_seq += 1
503
- self.telemetry.clusters_total += 1
504
-
505
- cluster_ids.append(cluster_id)
506
-
507
- # Update prompt state with cluster_ids
508
- prompt_state.cluster_ids = cluster_ids
509
-
510
- _vprint(f"[CORPUS] Embeddings computed and clusters assigned", self.verbose)
511
-
512
- # Step 1: Collect ALL sentences from THE ENTIRE RUN (not just current batch!)
513
- # This is critical for true run-scoped deduplication
514
- all_sentences = []
515
-             sentence_to_prompt = {} # Map sentence_id → (prompt_id, index)
516
- locked_sentences = set() # Sentences from previous batches (already emitted, can't remove)
517
-
518
- # Iterate over ALL prompts in registry (including previous batches)
519
- for prompt_id, prompt_state in self.prompt_registry.items():
520
- is_previous_batch = prompt_id not in batch_prompts
521
-
522
- for idx, (sentence_text, cluster_id) in enumerate(zip(prompt_state.sentences, prompt_state.cluster_ids)):
523
- cluster = self.clusters.get(cluster_id)
524
- if not cluster:
525
- continue
526
-
527
- # Create Sentence object for greedy algorithm
528
- sent_id = f"{prompt_id}_{idx}"
529
- sent_obj = Sentence(
530
- id=sent_id,
531
- text=sentence_text,
532
- embedding=cluster.embedding,
533
- entities=cluster.entities, # Keep ALL entities for accurate coverage tracking
534
- numbers=set(), # Already in entities
535
- salience=cluster.salience,
536
- position=cluster.first_seen_seq
537
- )
538
- all_sentences.append(sent_obj)
539
- sentence_to_prompt[sent_id] = (prompt_id, idx)
540
-
541
- # Lock sentences from previous batches (already emitted to user)
542
- if is_previous_batch:
543
- locked_sentences.add(sent_id)
544
-
545
- _vprint(f"[CORPUS] 🌐 Run-scoped MIS: {len(all_sentences)} total sentences ({len(locked_sentences)} locked from previous batches, {len(all_sentences)-len(locked_sentences)} new)", self.verbose)
546
- _vprint(f"[CORPUS] 🧮 Running greedy max-independent-set on {len(all_sentences)} sentences", self.verbose)
547
-
548
- # Step 2: Compute degree map (needed for isolates pass later)
549
- degree_map = {}
550
- for sent in all_sentences:
551
- degree = 0
552
- for other in all_sentences:
553
- if sent.id != other.id:
554
- if are_sentences_similar(sent, other, semantic_threshold=0.60):
555
- degree += 1
556
- degree_map[sent.id] = degree
557
-
558
- # Sanity checks
559
- isolates_before = [s for s in all_sentences if degree_map[s.id] == 0]
560
- non_isolates = [s for s in all_sentences if degree_map[s.id] > 0]
561
- pct_isolates = len(isolates_before) / len(all_sentences) * 100 if all_sentences else 0
562
- avg_degree_non_iso = sum(degree_map[s.id] for s in non_isolates) / len(non_isolates) if non_isolates else 0
563
- print(f"[CORPUS] 📊 Graph: isolates={pct_isolates:.1f}% (expect <20%), non-isolate avg degree={avg_degree_non_iso:.1f} (expect >3)")
564
-
565
- # Step 3: Run greedy maximum-independent-set selection
566
- # Start with LOCKED sentences (from previous batches, already emitted)
567
- # Then run MIS only on NEW sentences (current batch)
568
- selected_sentences = [s for s in all_sentences if s.id in locked_sentences]
569
- selected_ids = locked_sentences.copy()
570
-
571
- print(f"[CORPUS] 🔒 Pre-seeded MIS with {len(locked_sentences)} locked sentences from previous batches")
572
-
573
- # Now run MIS on NEW sentences only (exclude locked)
574
- new_sentences = [s for s in all_sentences if s.id not in locked_sentences]
575
-
576
- if new_sentences:
577
- # Run MIS on new sentences, considering locked ones as neighbors
578
- new_selected = greedy_max_independent_set(
579
- new_sentences,
580
- similarity_threshold=0.60,
581
- verbose=False, # Set to True for debugging
582
- precomputed_degree_map=degree_map # Pass precomputed degrees
583
- )
584
-
585
- # Add newly selected sentences
586
- selected_sentences.extend(new_selected)
587
- selected_ids.update(s.id for s in new_selected)
588
-
589
- _vprint(f"[CORPUS] ✅ MIS complete: {len(selected_ids)} total kept ({len(locked_sentences)} locked + {len(selected_ids)-len(locked_sentences)} new)", self.verbose)
590
-
591
-             # Step 4: Compute NODE COVERAGE (align universe for backfill)
592
- # covered_nodes = S ∪ N(S) (selected + their neighbors)
593
- covered_nodes = set(selected_ids)
594
- sentence_map = {s.id: s for s in all_sentences}
595
-
596
- for selected_id in selected_ids:
597
- selected_sent = sentence_map[selected_id]
598
- # Add all neighbors (similar nodes)
599
- for other in all_sentences:
600
- if other.id != selected_id:
601
- if are_sentences_similar(selected_sent, other, semantic_threshold=0.60):
602
- covered_nodes.add(other.id)
603
-
604
- total_nodes = len(all_sentences)
605
- node_coverage_before = len(covered_nodes) / total_nodes if total_nodes > 0 else 0.0
606
-
607
- _vprint(f"[CORPUS] 📊 After MIS: nodes={len(selected_ids)}/{total_nodes} kept, coverage (S∪N(S))={len(covered_nodes)}/{total_nodes} ({node_coverage_before*100:.1f}%)", self.verbose)
608
-
609
-             # Step 5: Backfill = GREEDY SET COVER over NODES (no independence constraint!)
610
-             # Goal: Maximize node coverage (S ∪ N(S)) by re-adding removed nodes with highest gain
611
- # gain(u) = |({u} ∪ N(u)) \ covered_nodes|
612
- backfill_added = 0
613
- isolates_added = 0
614
- target_coverage = 0.90 # 90% node coverage target
615
-
616
- if node_coverage_before < target_coverage:
617
- uncovered_count = total_nodes - len(covered_nodes)
618
- _vprint(f"[CORPUS] 🔧 Backfill: {uncovered_count} uncovered nodes, targeting {target_coverage*100:.0f}% coverage", self.verbose)
619
-
620
- # Get ALL removed sentences (candidates for backfill)
621
- removed_sentences = [sent for sent in all_sentences if sent.id not in selected_ids]
622
-
623
- # Helper: compute node gain for a candidate
624
- def compute_node_gain(sent):
625
- """Compute how many uncovered nodes this sentence + its neighbors would cover."""
626
- candidate_coverage = {sent.id}
627
- # Add neighbors
628
- for other in all_sentences:
629
- if other.id != sent.id:
630
- if are_sentences_similar(sent, other, semantic_threshold=0.60):
631
- candidate_coverage.add(other.id)
632
- # Gain = new nodes not already covered
633
- return len(candidate_coverage - covered_nodes)
634
-
635
- # Debug: Print top-5 candidates by gain (first iteration only)
636
- if removed_sentences:
637
- gains = [(sent, compute_node_gain(sent)) for sent in removed_sentences[:20]] # Sample first 20 for speed
638
- gains.sort(key=lambda x: x[1], reverse=True)
639
- _vprint(f"[CORPUS] Top-5 backfill candidates by gain:", self.verbose)
640
- for sent, gain in gains[:5]:
641
- _vprint(f" gain={gain}: '{sent.text[:60]}...'", self.verbose)
642
-
643
- # GREEDY SET COVER: repeatedly pick sentence with max gain
644
- iteration = 0
645
- while node_coverage_before < target_coverage and removed_sentences and iteration < 100:
646
- # Find best candidate
647
- best_sent = None
648
- best_gain = 0
649
-
650
- for sent in removed_sentences:
651
- gain = compute_node_gain(sent)
652
- if gain > best_gain:
653
- best_gain = gain
654
- best_sent = sent
655
-
656
- if best_gain == 0:
657
- _vprint(f"[CORPUS] Backfill: all remaining candidates have gain=0, stopping", self.verbose)
658
- break
659
-
660
- # Add best sentence back
661
- selected_ids.add(best_sent.id)
662
- selected_sentences.append(best_sent)
663
-
664
- # Update covered_nodes: add this node + its neighbors
665
- covered_nodes.add(best_sent.id)
666
- for other in all_sentences:
667
- if other.id != best_sent.id:
668
- if are_sentences_similar(best_sent, other, semantic_threshold=0.60):
669
- covered_nodes.add(other.id)
670
-
671
- removed_sentences.remove(best_sent)
672
- backfill_added += 1
673
-
674
- # Update coverage
675
- node_coverage_before = len(covered_nodes) / total_nodes
676
- iteration += 1
677
-
678
- if backfill_added <= 5:
679
- _vprint(f"[CORPUS] ✅ Backfill +{best_gain} nodes: '{best_sent.text[:60]}...' (coverage now {node_coverage_before*100:.1f}%)", self.verbose)
680
-
681
-                 _vprint(f"[CORPUS] 📈 After backfill: +{backfill_added} sentences, node coverage {node_coverage_before*100:.1f}%", self.verbose)
682
-
683
-             # Step 6: ISOLATES PASS - add uncovered degree=0 nodes
684
- # These are unique nodes with no similar neighbors
685
- uncovered_isolates = [sent for sent in all_sentences
686
- if sent.id not in covered_nodes and degree_map[sent.id] == 0]
687
-
688
- if uncovered_isolates:
689
- _vprint(f"[CORPUS] 🔧 Isolates pass: {len(uncovered_isolates)} uncovered isolates (degree=0)", self.verbose)
690
-
691
- for sent in uncovered_isolates:
692
- if node_coverage_before >= target_coverage:
693
- break
694
- selected_ids.add(sent.id)
695
- covered_nodes.add(sent.id)
696
- isolates_added += 1
697
- node_coverage_before = len(covered_nodes) / total_nodes
698
-
699
- if isolates_added <= 5:
700
- _vprint(f"[CORPUS] ✅ Isolate: '{sent.text[:60]}...'", self.verbose)
701
-
702
- if isolates_added > 0:
703
- _vprint(f"[CORPUS] 📈 After isolates: +{isolates_added} sentences, node coverage {node_coverage_before*100:.1f}%", self.verbose)
704
-
705
- # Final coverage stats (NODE universe)
706
- final_selected = len(selected_ids)
707
- final_covered_nodes = len(covered_nodes)
708
- final_node_coverage = final_covered_nodes / total_nodes if total_nodes > 0 else 0.0
709
-
710
- # Assert denominator is |V| (all nodes, no filtering)
711
- assert total_nodes == len(all_sentences), f"Denominator mismatch: {total_nodes} != {len(all_sentences)}"
712
-
713
- _vprint(f"[CORPUS] Final: kept={final_selected}/{total_nodes}, covered (S∪N(S))={final_covered_nodes}/{total_nodes} ({final_node_coverage*100:.1f}%)", self.verbose)
714
- _vprint(f"[CORPUS] 📊 Backfill={backfill_added}, Isolates={isolates_added}", self.verbose)
715
-
716
-             # Step 7: Map results back to prompts
717
- results = {}
718
- for prompt_id in batch_prompts:
719
- prompt_state = self.prompt_registry[prompt_id]
720
- kept_sentences = []
721
- removed_count = 0
722
-
723
- for idx, sentence_text in enumerate(prompt_state.sentences):
724
- sent_id = f"{prompt_id}_{idx}"
725
- if sent_id in selected_ids:
726
- kept_sentences.append(sentence_text)
727
- else:
728
- removed_count += 1
729
-
730
- results[prompt_id] = {
731
- 'kept': kept_sentences,
732
- 'removed': removed_count,
733
- 'original_count': len(prompt_state.sentences)
734
- }
735
-
736
-             # Step 8: Store results and emit to prompts
737
- for prompt_id in batch_prompts:
738
- prompt_state = self.prompt_registry[prompt_id]
739
- result = results[prompt_id]
740
- prompt_state.sentences = result['kept']
741
-
742
- reduction_pct = (result['removed'] / result['original_count'] * 100) if result['original_count'] > 0 else 0
743
- _vprint(f"[CORPUS] Prompt {prompt_id[:8]}: {result['original_count']} → {len(result['kept'])} sentences ({reduction_pct:.1f}% removed)", self.verbose)
744
-
745
- # Update telemetry
746
- self.telemetry.entity_coverage_avg = final_node_coverage * 100 # Now tracking NODE coverage
747
- # Always show final batch summary (key metric)
748
- print(f"[CORPUS] Batch complete: Node coverage {final_node_coverage*100:.1f}%")
749
-
750
- # Update telemetry
751
- if self.telemetry.barrier_times:
752
- self.telemetry.avg_barrier_ms = sum(self.telemetry.barrier_times) / len(self.telemetry.barrier_times)
753
- self.telemetry.max_barrier_ms = max(self.telemetry.barrier_times)
754
-
755
- self.telemetry.tokens_saved = (self.telemetry.chars_in - self.telemetry.chars_out) // 4
756
-
757
- # Release prompts SEQUENTIALLY to avoid race condition in on_llm_start
758
- _vprint(f"[CORPUS] 🚦 Releasing {len(batch_prompts)} prompts sequentially...", self.verbose)
759
- for i, prompt_id in enumerate(batch_prompts):
760
- event = self.prompt_events.get(prompt_id)
761
- if event:
762
- event.set() # Wake up this specific thread
763
- # Longer delay to ensure threads hit on_llm_start one at a time
764
- if i < len(batch_prompts) - 1: # Don't delay after the last one
765
- time.sleep(0.5) # 500ms stagger to be safe
766
-
767
- # Clean up events to prevent memory leak
768
- for prompt_id in batch_prompts:
769
- self.prompt_events.pop(prompt_id, None)
770
-
771
- def _get_deduplicated_prompt(self, prompt_id: str) -> str:
772
- """Get deduplicated prompt text."""
773
- prompt_state = self.prompt_registry.get(prompt_id)
774
- if not prompt_state:
775
- return ""
776
-
777
- return "\n".join(prompt_state.sentences)
778
-
779
- def get_telemetry_summary(self) -> str:
780
- """Generate human-readable telemetry summary."""
781
- t = self.telemetry
782
- reduction_pct = ((t.chars_in - t.chars_out) / t.chars_in * 100) if t.chars_in > 0 else 0
783
-
784
- summary = f"""
785
- {'='*70}
786
- [CORPUS] 📊 RUN-SCOPED TELEMETRY (run_id={self.run_id[:8]})
787
- {'='*70}
788
- Prompts processed: {t.prompts_total}
789
- Sentences total: {t.sentences_total}
790
- Clusters created: {t.clusters_total}
791
- Cross-prompt dups removed: {t.cross_prompt_dups_removed}
792
- {'='*70}
793
- Chars in: {t.chars_in:,}
794
- Chars out: {t.chars_out:,}
795
- Reduction: {reduction_pct:.1f}%
796
- Tokens saved (est): {t.tokens_saved:,} tokens
797
- {'='*70}
798
- Node Coverage (S∪N(S)): {t.entity_coverage_avg:.1f}%
799
- Batches processed: {t.batches_processed}
800
- Avg barrier: {t.avg_barrier_ms:.0f}ms
801
- Max barrier: {t.max_barrier_ms:.0f}ms
802
- {'='*70}
803
- """
804
- return summary
805
-
806
-
807
- # Global registry of run-scoped corpuses
808
- _run_corpuses: Dict[str, RunScopedCorpus] = {}
809
- _corpus_lock = threading.Lock()
810
-
811
-
812
- def get_or_create_corpus(run_id: str, verbose: bool = False) -> RunScopedCorpus:
813
- """Get or create run-scoped corpus (thread-safe)."""
814
- with _corpus_lock:
815
- if run_id not in _run_corpuses:
816
- _run_corpuses[run_id] = RunScopedCorpus(run_id, verbose=verbose)
817
- return _run_corpuses[run_id]
818
-
819
-
820
- def cleanup_corpus(run_id: str):
821
- """Cleanup corpus when run ends."""
822
- with _corpus_lock:
823
- if run_id in _run_corpuses:
824
- corpus = _run_corpuses[run_id]
825
- print(corpus.get_telemetry_summary())
826
- del _run_corpuses[run_id]
827
- print(f"[CORPUS] 🗑️ Cleaned up corpus for run_id={run_id[:8]}")
828
-
829
-
830
- # ============================================================================
831
- # Legacy Per-Prompt Deduplication (V1.0 - Fallback)
832
- # ============================================================================
833
-
834
- @dataclass
835
- class Sentence:
836
- """Represents a sentence with metadata for deduplication."""
837
- id: str
838
- text: str
839
- embedding: Optional[np.ndarray] = None
840
- entities: Set[str] = None
841
- numbers: Set[str] = None
842
- salience: float = 0.0
843
- position: int = 0
844
-
845
- def __post_init__(self):
846
- if self.entities is None:
847
- self.entities = set()
848
- if self.numbers is None:
849
- self.numbers = set()
850
-
851
- @property
852
- def protected_entities(self) -> Set[str]:
853
- """All entities that must be preserved."""
854
- return self.entities | self.numbers
855
-
856
-
857
- def estimate_tokens(text: str) -> int:
858
- """Estimate token count (roughly chars/4 for English)."""
859
- return len(text) // 4
860
-
861
-
862
- def adaptive_resize_sentences(sentences: List[str]) -> List[str]:
863
- """
864
- Adaptively resize sentences for optimal embedding similarity:
865
- - Long (>120 tokens): Split on commas, semicolons, conjunctions
866
- - Short (<40 tokens): Merge with next sentence
867
- - Mid (40-120 tokens): Keep as-is
868
-
869
- This improves cross-page similarity and reduces false uniqueness.
870
- """
871
- resized = []
872
- i = 0
873
-
874
- while i < len(sentences):
875
- sent = sentences[i]
876
- tokens = estimate_tokens(sent)
877
-
878
- if tokens > 120:
879
- # LONG: Split on commas, semicolons, and conjunctions
880
- # Split points: , ; : and, but, or, however, therefore (preceded by space/comma)
881
- split_pattern = r'(?:,\s+(?:and|but|or|however|therefore|while|although)\s+|[;:])\s+'
882
- chunks = re.split(split_pattern, sent)
883
-
884
- # Ensure each chunk is reasonable (not too tiny)
885
- for chunk in chunks:
886
- if chunk.strip() and estimate_tokens(chunk) >= 20:
887
- resized.append(chunk.strip())
888
- elif resized:
889
- # Merge tiny chunk with previous
890
- resized[-1] += " " + chunk.strip()
891
- i += 1
892
-
893
- elif tokens < 40 and i + 1 < len(sentences):
894
- # SHORT: Merge with next sentence
895
- next_sent = sentences[i + 1]
896
- merged = sent + " " + next_sent
897
- merged_tokens = estimate_tokens(merged)
898
-
899
- # Only merge if result is ≤120 tokens (don't create overly long sentences)
900
- if merged_tokens <= 120:
901
- resized.append(merged)
902
- i += 2 # Skip next sentence (already merged)
903
- else:
904
- # Next sentence would make it too long, keep short one as-is
905
- resized.append(sent)
906
- i += 1
907
-
908
- else:
909
- # MID-RANGE (40-120) or last sentence: Keep as-is
910
- resized.append(sent)
911
- i += 1
912
-
913
- return resized
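
For instance (invented inputs; the merge/split decisions follow the rough token estimates above, so they are heuristic):

    parts = [
        "Quick note.",  # short fragment → merged into the next sentence
        "The service exposes a REST API with token-based authentication and per-user rate limits.",
        "Sentences above the ~120-token mark would instead be split on commas, semicolons and conjunctions.",
    ]
    print(adaptive_resize_sentences(parts))  # 3 inputs come back as 2 chunks
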
914
-
915
-
916
- def split_into_sentences(text: str) -> List[str]:
917
- """
918
- Split text into sentences with special handling for markdown structures,
919
- then adaptively resize for optimal embedding similarity.
920
-
921
- Handles:
922
- - Standard sentences ending with .!?
923
- - Bullet points and numbered lists
924
- - Code blocks (preserve as single units)
925
- - Headers
926
- - Adaptive resizing: long sentences split, short ones merged
927
- """
928
- sentences = []
929
-
930
- # First, protect code blocks
931
- code_block_pattern = r'```[\s\S]*?```'
932
- code_blocks = {}
933
- for i, match in enumerate(re.finditer(code_block_pattern, text)):
934
- placeholder = f"__CODE_BLOCK_{i}__"
935
- code_blocks[placeholder] = match.group()
936
- text = text.replace(match.group(), placeholder)
937
-
938
- # Split on sentence boundaries
939
- # Handle: . ! ? followed by space/newline, or newlines with list markers
940
- patterns = [
941
- r'(?<=[.!?])\s+(?=[A-Z])', # Standard sentences
942
- r'\n\s*[-*•]\s+', # Bullet points
943
- r'\n\s*\d+\.\s+', # Numbered lists
944
- r'\n#{1,6}\s+', # Markdown headers
945
- r'\n\s*\n', # Paragraph breaks
946
- ]
947
-
948
- combined_pattern = '|'.join(f'({p})' for p in patterns)
949
- parts = re.split(combined_pattern, text)
950
-
951
- # Reconstruct sentences (filter out delimiters)
952
- current = ""
953
- for part in parts:
954
- if part is None:
955
- continue
956
- if re.match(combined_pattern, part):
957
- if current.strip():
958
- sentences.append(current.strip())
959
- current = ""
960
- else:
961
- current += part
962
-
963
- if current.strip():
964
- sentences.append(current.strip())
965
-
966
- # Restore code blocks
967
- restored = []
968
- for sent in sentences:
969
- for placeholder, code in code_blocks.items():
970
- sent = sent.replace(placeholder, code)
971
- if sent.strip():
972
- restored.append(sent.strip())
973
-
974
- # ADAPTIVE RESIZING: Split long sentences, merge short ones
975
- resized = adaptive_resize_sentences(restored)
976
-
977
- return resized
978
-
979
-
980
- def extract_entities_regex(text: str) -> Tuple[Set[str], Set[str]]:
981
- """
982
- Fallback regex-based entity extraction.
983
-
984
- Returns:
985
- (entities, numbers) - Sets of extracted entities and numbers
986
- """
987
- entities = set()
988
- numbers = set()
989
-
990
- # Proper nouns: Capitalized words (basic heuristic) - at least 3 chars
991
- proper_nouns = re.findall(r'\b[A-Z][a-z]{2,}(?:\s+[A-Z][a-z]+)*\b', text)
992
- entities.update(proper_nouns)
993
-
994
- # Technical terms: CamelCase, snake_case, package names
995
- technical = re.findall(r'\b[A-Z][a-z]+[A-Z]\w+\b', text) # CamelCase
996
- technical += re.findall(r'\b\w+_\w+\b', text) # snake_case
997
- entities.update(technical)
998
-
999
- # Numbers: MEANINGFUL numbers only (exclude single digits 0-9)
1000
- # Include: multi-digit numbers, floats, percentages, version numbers
1001
- nums = re.findall(r'\b\d{2,}(?:\.\d+)?%?\b', text) # 2+ digits
1002
- nums += re.findall(r'\b\d+\.\d+\b', text) # Floats like 14.4, 2.0
1003
- numbers.update(nums)
1004
-
1005
- # Dates: YYYY-MM-DD, MM/DD/YYYY, etc.
1006
- dates = re.findall(r'\b\d{4}[-/]\d{1,2}[-/]\d{1,4}\b', text) # Full dates
1007
- dates += re.findall(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b', text)
1008
- numbers.update(dates)
1009
-
1010
- # Filter out common non-informative words and malformed entities
1011
- stopwords = {
1012
- # Common words
1013
- 'The', 'This', 'That', 'These', 'Those', 'What', 'Where', 'When', 'Why', 'How', 'Who', 'Which',
1014
- 'Welcome', 'Search', 'Summary', 'Source', 'Url', 'Http', 'Https', 'One', 'Two', 'Three', 'Four', 'Five',
1015
- 'Key', 'Our', 'Its', 'It', 'For', 'With', 'And', 'But', 'Not', 'You', 'All', 'Can', 'Her', 'Was',
1016
- 'She', 'Has', 'Had', 'His', 'Him', 'Are', 'Were', 'Been', 'Being', 'Have', 'Does', 'Did', 'Will',
1017
- # Markup/formatting artifacts
1018
- 'URL', 'Http', 'Https', 'PDF', 'CSV', 'JSON', 'XML', 'HTML',
1019
- }
1020
-
1021
- # Filter entities
1022
- filtered_entities = set()
1023
- for e in entities:
1024
- # Skip short entities
1025
- if len(e) < 3:
1026
- continue
1027
-
1028
- # Skip if contains newlines (malformed extraction)
1029
- if '\n' in e:
1030
- continue
1031
-
1032
- # Skip stopwords (case-insensitive)
1033
- if e in stopwords or e.lower() in {s.lower() for s in stopwords}:
1034
- continue
1035
-
1036
- # Skip if it's just a URL fragment
1037
- if e.lower() in ['url', 'http', 'https', 'www']:
1038
- continue
1039
-
1040
- # Skip if ends with common suffixes that indicate malformed extraction
1041
- if e.endswith('---') or e.endswith('...') or e.endswith('--'):
1042
- continue
1043
-
1044
- filtered_entities.add(e)
1045
-
1046
- # Filter numbers - remove single digits 0-9 (often SOURCE numbers)
1047
- filtered_numbers = {n for n in numbers if len(n) >= 2 or '.' in n or '%' in n}
1048
-
1049
- return filtered_entities, filtered_numbers
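
An illustrative call (invented sentence; the comments show the kind of values the heuristics above produce rather than an exact set):

    ents, nums = extract_entities_regex(
        "LangChain 0.2.16 was released on 2024-06-30 and RunnableConfig handling improved by 14.4%."
    )
    print(ents)  # CamelCase / proper-noun terms, e.g. 'LangChain', 'RunnableConfig'
    print(nums)  # multi-digit numbers, floats and dates, e.g. '2024-06-30', '14.4%'
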
1050
-
1051
-
1052
- def extract_entities_spacy(text: str, nlp) -> Tuple[Set[str], Set[str]]:
1053
- """
1054
- spaCy-based entity extraction (more accurate).
1055
-
1056
- Returns:
1057
- (entities, numbers) - Sets of extracted entities and numbers
1058
- """
1059
- entities = set()
1060
- numbers = set()
1061
-
1062
- doc = nlp(text)
1063
-
1064
- # Named entities
1065
- for ent in doc.ents:
1066
- if ent.label_ in ['PERSON', 'ORG', 'GPE', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW']:
1067
- entities.add(ent.text)
1068
- elif ent.label_ in ['DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']:
1069
- numbers.add(ent.text)
1070
-
1071
- # Also grab technical terms (capitalized noun phrases)
1072
- for chunk in doc.noun_chunks:
1073
- if chunk.text[0].isupper():
1074
- entities.add(chunk.text)
1075
-
1076
- # Apply SAME filtering as regex version
1077
- stopwords = {
1078
- 'The', 'This', 'That', 'These', 'Those', 'What', 'Where', 'When', 'Why', 'How', 'Who', 'Which',
1079
- 'Welcome', 'Search', 'Summary', 'Source', 'Url', 'Http', 'Https', 'One', 'Two', 'Three', 'Four', 'Five',
1080
- 'Key', 'Our', 'Its', 'It', 'For', 'With', 'And', 'But', 'Not', 'You', 'All', 'Can', 'Her', 'Was',
1081
- 'She', 'Has', 'Had', 'His', 'Him', 'Are', 'Were', 'Been', 'Being', 'Have', 'Does', 'Did', 'Will',
1082
- 'URL', 'Http', 'Https', 'PDF', 'CSV', 'JSON', 'XML', 'HTML',
1083
- }
1084
-
1085
- # Filter entities
1086
- filtered_entities = set()
1087
- for e in entities:
1088
- # Skip short entities
1089
- if len(e) < 3:
1090
- continue
1091
-
1092
- # Skip if contains newlines (malformed)
1093
- if '\n' in e:
1094
- continue
1095
-
1096
- # Skip stopwords (case-insensitive)
1097
- if e in stopwords or e.lower() in {s.lower() for s in stopwords}:
1098
- continue
1099
-
1100
- # Skip URL fragments
1101
- if e.lower() in ['url', 'http', 'https', 'www']:
1102
- continue
1103
-
1104
- # Skip malformed endings
1105
- if e.endswith('---') or e.endswith('...') or e.endswith('--') or e.endswith('---\\nURL'):
1106
- continue
1107
-
1108
- filtered_entities.add(e)
1109
-
1110
- # Filter numbers - remove single digits 0-9
1111
- filtered_numbers = {n for n in numbers if len(str(n).strip()) >= 2 or '.' in str(n) or '%' in str(n)}
1112
-
1113
- return filtered_entities, filtered_numbers
1114
-
1115
-
1116
- def extract_entities(text: str) -> Tuple[Set[str], Set[str]]:
1117
- """
1118
- Extract entities and numbers from text.
1119
-
1120
- Uses spaCy if available, falls back to regex.
1121
-
1122
- Returns:
1123
- (entities, numbers) - Sets of protected entities and numbers
1124
- """
1125
- nlp = _get_spacy_model()
1126
-
1127
- if nlp == "fallback":
1128
- return extract_entities_regex(text)
1129
- else:
1130
- return extract_entities_spacy(text, nlp)
1131
-
1132
-
1133
- def compute_salience(sentence: str, position: int, total_sentences: int) -> float:
1134
- """
1135
- Compute salience score for a sentence.
1136
-
1137
- Factors:
1138
- - Position: Earlier sentences weighted higher (first paragraph effect)
1139
- - Length: Moderate length preferred (too short = filler, too long = verbose)
1140
- - Entity density: More entities = more information-dense
1141
- - Numbers: Presence of numbers = factual content
1142
-
1143
- Returns:
1144
- Salience score (0.0 to 1.0, higher = more important)
1145
- """
1146
- score = 0.0
1147
-
1148
- # Position-based (exponential decay)
1149
- position_weight = np.exp(-position / (total_sentences * 0.3))
1150
- score += position_weight * 0.3
1151
-
1152
- # Length-based (optimal ~50-150 chars)
1153
- length = len(sentence)
1154
- if 50 <= length <= 150:
1155
- length_weight = 1.0
1156
- elif length < 50:
1157
- length_weight = length / 50
1158
- else:
1159
- length_weight = 150 / length
1160
- score += length_weight * 0.2
1161
-
1162
- # Entity density (basic heuristic: count capitalized words)
1163
- words = sentence.split()
1164
- cap_words = sum(1 for w in words if w and w[0].isupper())
1165
- entity_density = min(cap_words / max(len(words), 1), 1.0)
1166
- score += entity_density * 0.3
1167
-
1168
- # Number presence
1169
- has_numbers = bool(re.search(r'\d', sentence))
1170
- score += 0.2 if has_numbers else 0.0
1171
-
1172
- return min(score, 1.0)
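
A rough worked example of the weighting (invented sentence; values are approximate):

    s = compute_salience("Acme Corp raised $40M in Series B funding in 2023.",
                         position=0, total_sentences=10)
    # ≈0.82: 0.30 position (first sentence) + 0.20 length (~50 chars)
    #        + 0.12 entity density (4/10 capitalized words × 0.3) + 0.20 number bonus
    print(round(s, 2))
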
1173
-
1174
-
1175
- def compute_char_3gram_jaccard(text1: str, text2: str) -> float:
1176
- """
1177
- Compute character 3-gram Jaccard similarity.
1178
- Captures boilerplate and tight phrasing that embeddings might miss.
1179
-
1180
- Returns:
1181
- Jaccard similarity [0, 1]
1182
- """
1183
- def get_3grams(text):
1184
- text = text.lower()
1185
- return set(text[i:i+3] for i in range(len(text) - 2))
1186
-
1187
- grams1 = get_3grams(text1)
1188
- grams2 = get_3grams(text2)
1189
-
1190
- if not grams1 or not grams2:
1191
- return 0.0
1192
-
1193
- intersection = len(grams1 & grams2)
1194
- union = len(grams1 | grams2)
1195
-
1196
- return intersection / union if union > 0 else 0.0
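
For example, near-identical boilerplate scores close to 1.0 while unrelated text scores near 0 (invented strings):

    same = compute_char_3gram_jaccard("All rights reserved. Terms of use apply.",
                                      "All rights reserved. Terms of use apply!")
    diff = compute_char_3gram_jaccard("All rights reserved. Terms of use apply.",
                                      "Quarterly revenue grew 14% year over year.")
    print(same, diff)  # high vs. low; the lexical fallback below treats >= 0.82 as a match
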
1197
-
1198
-
1199
- def compute_similarity(emb1: np.ndarray, emb2: np.ndarray) -> float:
1200
- """
1201
- Compute cosine similarity between two embeddings.
1202
- Assumes embeddings are L2-normalized (unit vectors), so cosine = dot product.
1203
- """
1204
- return np.dot(emb1, emb2)
1205
-
1206
-
1207
- def are_sentences_similar(sent1: Sentence, sent2: Sentence, semantic_threshold: float = 0.60) -> bool:
1208
- """
1209
- Check if two sentences are similar using semantic + lexical signals.
1210
-
1211
- - Semantic: cosine similarity on embeddings
1212
- - Lexical fallback: 3-gram Jaccard for short sentences (≤120 chars)
1213
-
1214
- Args:
1215
- sent1, sent2: Sentence objects with embeddings
1216
- semantic_threshold: Threshold for semantic similarity
1217
-
1218
- Returns:
1219
- True if similar, False otherwise
1220
- """
1221
- # Primary: semantic similarity
1222
- semantic_sim = compute_similarity(sent1.embedding, sent2.embedding)
1223
- if semantic_sim >= semantic_threshold:
1224
- return True
1225
-
1226
- # Fallback: lexical for short sentences (captures boilerplate)
1227
- max_len = max(len(sent1.text), len(sent2.text))
1228
- if max_len <= 120: # ~30 tokens
1229
- lexical_sim = compute_char_3gram_jaccard(sent1.text, sent2.text)
1230
- if lexical_sim >= 0.82: # High Jaccard = tight phrasing match
1231
- return True
1232
-
1233
- return False
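
A toy check of the hybrid rule using hand-made 2-d unit vectors (real callers pass normalized 384-d MiniLM embeddings):

    import numpy as np
    e1 = np.array([1.0, 0.0])
    e2 = np.array([0.9, 0.1]); e2 = e2 / np.linalg.norm(e2)
    s1 = Sentence(id="a", text="Pricing starts at $10 per month.", embedding=e1)
    s2 = Sentence(id="b", text="Pricing starts at just $10/month.", embedding=e2)
    print(are_sentences_similar(s1, s2))  # True: cosine ≈ 0.99 clears the 0.60 semantic threshold
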
1234
-
1235
-
1236
- def build_sentence_objects(sentences_text: List[str], embeddings: np.ndarray) -> List[Sentence]:
1237
- """
1238
- Build Sentence objects with metadata.
1239
-
1240
- Args:
1241
- sentences_text: List of sentence strings
1242
- embeddings: Numpy array of embeddings (N x 384)
1243
-
1244
- Returns:
1245
- List of Sentence objects with computed metadata
1246
- """
1247
- sentence_objects = []
1248
- total = len(sentences_text)
1249
-
1250
- for i, text in enumerate(sentences_text):
1251
- # Generate ID
1252
- sent_id = hashlib.md5(text.encode()).hexdigest()[:8]
1253
-
1254
- # Extract entities
1255
- entities, numbers = extract_entities(text)
1256
-
1257
- # Compute salience
1258
- salience = compute_salience(text, i, total)
1259
-
1260
- sentence_objects.append(Sentence(
1261
- id=sent_id,
1262
- text=text,
1263
- embedding=embeddings[i],
1264
- entities=entities,
1265
- numbers=numbers,
1266
- salience=salience,
1267
- position=i
1268
- ))
1269
-
1270
- return sentence_objects
1271
-
1272
-
1273
- def greedy_max_independent_set(
1274
- sentences: List[Sentence],
1275
- similarity_threshold: float = 0.60,
1276
- verbose: bool = True,
1277
- precomputed_degree_map: Dict = None
1278
- ) -> List[Sentence]:
1279
- """
1280
- Greedy maximum-independent-set selection with degree×length-aware ordering.
1281
-
1282
- Algorithm:
1283
- 1. Compute degree (# of similar neighbors) for each sentence
1284
-     2. Sort by (token_length × degree) ASCENDING → short, unique sentences come first
1285
-     3. Keep the lowest degree×length sentence (short + unique, cheapest to keep)
1286
- 4. Remove all similar neighbors (similarity > threshold)
1287
- 5. Check removed sentences for unique entities
1288
- 6. If removed sentence has unique entities, re-add it (HARD GUARD)
1289
- 7. Repeat until all sentences processed
1290
-
1291
-     This keeps short, unique sentences and ejects long, redundant ones → bigger trims without raising the similarity bar.
1292
-
1293
- Args:
1294
- sentences: List of Sentence objects
1295
-         similarity_threshold: Cosine similarity threshold for edge creation (default 0.60)
1296
- verbose: Print debug info
1297
-
1298
- Returns:
1299
- List of selected Sentence objects (deduplicated)
1300
- """
1301
- if verbose:
1302
- print(f"\n[PIPECLEANER] Starting degree×length-aware greedy max-independent-set")
1303
- print(f"[PIPECLEANER] Input: {len(sentences)} sentences")
1304
- print(f"[PIPECLEANER] Similarity threshold: {similarity_threshold}")
1305
-
1306
- # Step 1: Use precomputed degree map (or compute if not provided)
1307
- if precomputed_degree_map is None:
1308
- # Compute degree (# of connections) for each sentence
1309
- # Use hybrid similarity: semantic (0.60) OR lexical (0.82 Jaccard for short spans)
1310
- degree_map = {}
1311
- for sent in sentences:
1312
- degree = 0
1313
- for other in sentences:
1314
- if sent.id != other.id:
1315
- # Hybrid check: semantic OR lexical
1316
- if are_sentences_similar(sent, other, semantic_threshold=similarity_threshold):
1317
- degree += 1
1318
- degree_map[sent.id] = degree
1319
-
1320
- # Sanity checks (as requested)
1321
- isolates = [s for s in sentences if degree_map[s.id] == 0]
1322
- non_isolates = [s for s in sentences if degree_map[s.id] > 0]
1323
- pct_isolates = len(isolates) / len(sentences) * 100 if sentences else 0
1324
- avg_degree_non_iso = sum(degree_map[s.id] for s in non_isolates) / len(non_isolates) if non_isolates else 0
1325
-
1326
- if verbose:
1327
- avg_degree = sum(degree_map.values()) / len(degree_map) if degree_map else 0
1328
- print(f"[PIPECLEANER] Degree stats: avg={avg_degree:.1f}, isolates={pct_isolates:.1f}%, non-isolate avg={avg_degree_non_iso:.1f}")
1329
- print(f"[PIPECLEANER] Sanity: isolates {pct_isolates:.0f}% (expect <20%), non-isolate avg {avg_degree_non_iso:.1f} (expect >3)")
1330
- else:
1331
- # Use precomputed degree map (more efficient)
1332
- degree_map = precomputed_degree_map
1333
-
1334
- # Step 2: Sort by (token_length × degree) ASCENDING
1335
- # LOW degree×length = short + unique → keep first (high value)
1336
- # HIGH degree×length = long + redundant → eject (low value)
1337
- def sort_key(s):
1338
- token_len = estimate_tokens(s.text)
1339
- degree = degree_map[s.id]
1340
- return token_len * degree
1341
-
1342
- # Sort ASCENDING - pick short unique sentences first
1343
- sorted_sentences = sorted(sentences, key=sort_key, reverse=False)
1344
-
1345
- if verbose:
1346
- top_5 = sorted_sentences[:5]
1347
- print(f"[PIPECLEANER] Top 5 to keep (low degree×length = short + unique):")
1348
- for i, s in enumerate(top_5, 1):
1349
- score = sort_key(s)
1350
- print(f" {i}. {estimate_tokens(s.text)}tok × {degree_map[s.id]}deg = {score:.0f} | '{s.text[:60]}...'")
1351
-
1352
-
1353
- selected = []
1354
- remaining = sorted_sentences.copy()
1355
- entity_coverage = set()
1356
- iteration = 0
1357
-
1358
- while remaining:
1359
- iteration += 1
1360
-         # Pick the lowest degree×length sentence (short + unique); its redundant neighbors are removed below
1361
- best = remaining[0]
1362
-
1363
- if verbose and iteration <= 5: # Print first 5 iterations
1364
- score = sort_key(best)
1365
- print(f"\n[PIPECLEANER] Iteration {iteration}:")
1366
- print(f" Selected: '{best.text[:80]}...'")
1367
- print(f" Degree×Length: {estimate_tokens(best.text)}tok × {degree_map[best.id]}deg = {score:.0f}")
1368
- print(f" Entities: {best.protected_entities}")
1369
-
1370
- # Add to selected
1371
- selected.append(best)
1372
- entity_coverage |= best.protected_entities
1373
-
1374
- # Remove from remaining
1375
- remaining.remove(best)
1376
-
1377
- # Find similar neighbors to remove (using hybrid similarity)
1378
- to_remove = []
1379
- for candidate in remaining:
1380
- if are_sentences_similar(best, candidate, semantic_threshold=similarity_threshold):
1381
- # Get semantic sim for logging
1382
- sem_sim = compute_similarity(best.embedding, candidate.embedding)
1383
- to_remove.append((candidate, sem_sim))
1384
-
1385
- if verbose and iteration <= 5 and to_remove:
1386
- print(f" Removing {len(to_remove)} similar sentences (similarity >= {similarity_threshold})")
1387
-
1388
- # Remove similar sentences
1389
- for candidate, sim in to_remove:
1390
- remaining.remove(candidate)
1391
-
1392
- # HARD GUARD: Check removed sentences for unique entities
1393
- # Only re-add if they have MULTIPLE (3+) meaningful unique entities
1394
- # This prevents re-adding for trivial differences
1395
- re_added = 0
1396
- for candidate, sim in to_remove:
1397
- unique_entities = candidate.protected_entities - entity_coverage
1398
-
1399
- # Require at least 3 unique entities OR at least 1 unique multi-word entity
1400
- multi_word_entities = {e for e in unique_entities if ' ' in e or len(e) > 10}
1401
- should_readd = len(unique_entities) >= 3 or len(multi_word_entities) >= 1
1402
-
1403
- if should_readd:
1404
- if verbose and iteration <= 5:
1405
- print(f" ⚠️ RE-ADDING sentence with {len(unique_entities)} unique entities: {unique_entities}")
1406
- print(f" Text: '{candidate.text[:80]}...'")
1407
- selected.append(candidate)
1408
- entity_coverage |= candidate.protected_entities
1409
- re_added += 1
1410
-
1411
- if verbose and iteration <= 5 and re_added:
1412
- print(f" Re-added {re_added} sentences to preserve entity coverage")
1413
-
1414
- if verbose:
1415
- print(f"\n[PIPECLEANER] Selection complete:")
1416
- print(f" Input: {len(sentences)} sentences")
1417
- print(f" Output: {len(selected)} sentences")
1418
- print(f" Reduction: {(1 - len(selected)/len(sentences))*100:.1f}%")
1419
- print(f" Entities preserved: {len(entity_coverage)}")
1420
-
1421
- return selected
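
A minimal sketch of calling the selector directly (toy 2-d embeddings, invented sentences, and a throwaway unit() helper; real callers pass Sentence objects built by build_sentence_objects from normalized MiniLM embeddings):

    import numpy as np

    def unit(v):
        v = np.asarray(v, dtype=float)
        return v / np.linalg.norm(v)

    sents = [
        Sentence(id="s1", text="Python 3.13 was released in October 2024.", embedding=unit([1.0, 0.0]), position=0),
        Sentence(id="s2", text="Python 3.13 came out in October 2024.", embedding=unit([0.95, 0.05]), position=1),
        Sentence(id="s3", text="The library has no external dependencies.", embedding=unit([0.0, 1.0]), position=2),
    ]
    kept = greedy_max_independent_set(sents, similarity_threshold=0.60, verbose=False)
    print([s.id for s in kept])  # one of the two near-duplicates survives, alongside s3
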
1422
-
1423
-
1424
- def deduplicate_search_results(
1425
- text: str,
1426
- similarity_threshold: float = 0.60,
1427
- verbose: bool = True,
1428
- cached_model=None
1429
- ) -> Tuple[str, Dict, any]:
1430
- """
1431
- Main entry point: Deduplicate search results using graph-based approach.
1432
-
1433
- Args:
1434
- text: Raw search results text
1435
- similarity_threshold: Cosine similarity threshold (0.60 catches cross-site paraphrases at 0.55-0.68)
1436
- verbose: Print debug info
1437
- cached_model: Optional cached embedding model to reuse
1438
-
1439
- Returns:
1440
- Tuple of (deduplicated_text, stats_dict, embedding_model)
1441
- stats_dict contains: {
1442
- 'original_chars': int,
1443
- 'deduplicated_chars': int,
1444
- 'original_sentences': int,
1445
- 'deduplicated_sentences': int,
1446
- 'prune_pct': float,
1447
- 'original_tokens': int,
1448
- 'deduplicated_tokens': int,
1449
- 'tokens_saved': int,
1450
- 'entity_coverage_pct': float,
1451
- 'entities_total': int,
1452
- 'entities_preserved': int
1453
- }
1454
- """
1455
- if verbose:
1456
- print(f"\n{'='*70}")
1457
- print(f"[PIPECLEANER] DEDUPLICATION STARTED")
1458
- print(f"{'='*70}")
1459
- print(f"[PIPECLEANER] Input text: {len(text)} chars, ~{len(text.split())} words")
1460
-
1461
- # Step 1: Split into sentences
1462
- sentences_text = split_into_sentences(text)
1463
-
1464
- if verbose:
1465
- print(f"[PIPECLEANER] Split into {len(sentences_text)} sentences")
1466
-
1467
- # Initialize stats
1468
- stats = {
1469
- 'original_chars': len(text),
1470
- 'deduplicated_chars': len(text),
1471
- 'original_sentences': len(sentences_text),
1472
- 'deduplicated_sentences': len(sentences_text),
1473
- 'prune_pct': 0.0,
1474
- 'original_tokens': int(len(text) / 4),
1475
- 'deduplicated_tokens': int(len(text) / 4),
1476
- 'tokens_saved': 0,
1477
- 'entity_coverage_pct': 100.0,
1478
- 'entities_total': 0,
1479
- 'entities_preserved': 0
1480
- }
1481
-
1482
- if len(sentences_text) == 0:
1483
- if verbose:
1484
- print(f"[PIPECLEANER] ⚠️ No sentences found, returning original text")
1485
- return text, stats, cached_model
1486
-
1487
- if len(sentences_text) == 1:
1488
- if verbose:
1489
- print(f"[PIPECLEANER] Only 1 sentence, skipping deduplication")
1490
- return text, stats, cached_model
1491
-
1492
- # Step 2: Compute embeddings
1493
- # Always use the thread-safe singleton model
1494
- model = _get_embedding_model()
1495
-
1496
- if verbose:
1497
- print(f"[PIPECLEANER] Computing embeddings...")
1498
-
1499
- # L2 normalize embeddings so cosine similarity = dot product (faster)
1500
- embeddings = model.encode(sentences_text, show_progress_bar=False, normalize_embeddings=True)
1501
-
1502
- if verbose:
1503
- print(f"[PIPECLEANER] Embeddings computed: shape {embeddings.shape}")
1504
-
1505
- # Step 3: Build sentence objects with metadata
1506
- sentences = build_sentence_objects(sentences_text, embeddings)
1507
-
1508
- # Calculate total entities across all sentences
1509
- all_entities = set()
1510
- for sent in sentences:
1511
- all_entities |= sent.protected_entities
1512
-
1513
- # Step 4: Run greedy max-independent-set selection
1514
- selected = greedy_max_independent_set(sentences, similarity_threshold, verbose)
1515
-
1516
- # Calculate preserved entities
1517
- preserved_entities = set()
1518
- for sent in selected:
1519
- preserved_entities |= sent.protected_entities
1520
-
1521
- # Step 5: Reconstruct text preserving original order
1522
- selected_by_position = sorted(selected, key=lambda s: s.position)
1523
- deduplicated_text = '\n\n'.join(s.text for s in selected_by_position)
1524
-
1525
- # Calculate stats
1526
- stats['deduplicated_chars'] = len(deduplicated_text)
1527
- stats['deduplicated_sentences'] = len(selected)
1528
- stats['prune_pct'] = (1 - len(selected) / len(sentences_text)) * 100 if len(sentences_text) > 0 else 0
1529
- stats['deduplicated_tokens'] = int(len(deduplicated_text) / 4)
1530
- stats['tokens_saved'] = stats['original_tokens'] - stats['deduplicated_tokens']
1531
- stats['entities_total'] = len(all_entities)
1532
- stats['entities_preserved'] = len(preserved_entities)
1533
- stats['entity_coverage_pct'] = (len(preserved_entities) / len(all_entities) * 100) if len(all_entities) > 0 else 100.0
1534
-
1535
- if verbose:
1536
- print(f"\n[PIPECLEANER] DEDUPLICATION COMPLETE")
1537
- print(f" Input: {len(text)} chars")
1538
- print(f" Output: {len(deduplicated_text)} chars")
1539
- print(f" Reduction: {(1 - len(deduplicated_text)/len(text))*100:.1f}%")
1540
- print(f" Sentences: {len(sentences_text)} → {len(selected)}")
1541
- print(f"{'='*70}\n")
1542
-
1543
- return deduplicated_text, stats, model
1544
-
1545
-
1546
- # ============================================================================
1547
- # CONVENIENCE FUNCTIONS
1548
- # ============================================================================
1549
-
1550
- def estimate_tokens(text: str) -> int:
1551
- """Rough estimate of token count (words / 0.75)."""
1552
- return int(len(text.split()) / 0.75)
1553
-
1554
-
1555
- def should_deduplicate(text: str, min_length: int = 500) -> bool:
1556
- """
1557
- Check if text is worth deduplicating.
1558
-
1559
- Args:
1560
- text: Input text
1561
- min_length: Minimum character length to bother deduplicating
1562
-
1563
- Returns:
1564
- True if text should be deduplicated
1565
- """
1566
- return len(text) >= min_length
1567
-
1568
-
1569
- def apply_pipecleaner_if_applicable(tool_name: str, output_str: str, selected_rules: list, cached_model=None) -> Tuple[str, any]:
1570
- """
1571
- High-level function to check for filter search rules and apply deduplication.
1572
-
1573
- This is called from capture.py's on_tool_end callback.
1574
-
1575
- Args:
1576
- tool_name: Name of the tool that just finished
1577
- output_str: Raw output from the tool
1578
- selected_rules: List of rules selected for this run
1579
- cached_model: Optional cached embedding model to reuse across searches
1580
-
1581
- Returns:
1582
- Tuple of (deduplicated_output, embedding_model) for caching
1583
- Returns (original_output, None) if no filter rule applies
1584
- """
1585
- try:
1586
- # Find applicable filter search rules for this tool
1587
- filter_rules = _find_filter_search_rules(tool_name, selected_rules)
1588
-
1589
- # If we found applicable filter rules, apply deduplication
1590
- if filter_rules:
1591
- print(f"\n{'='*70}")
1592
- print(f"[PIPECLEANER] 🧹 FILTER SEARCH RULE DETECTED")
1593
- print(f"{'='*70}")
1594
- print(f"[PIPECLEANER] Tool: {tool_name}")
1595
- print(f"[PIPECLEANER] Rules matched: {len(filter_rules)}")
1596
- for rule in filter_rules:
1597
- rule_id = getattr(rule, 'id', 'unknown')
1598
- advice = getattr(rule, 'advice', '') or getattr(rule, 'advice_text', '')
1599
- print(f"[PIPECLEANER] - Rule {rule_id}: {advice[:80]}...")
1600
- print(f"{'='*70}")
1601
-
1602
- # Apply deduplication with cached model
1603
- deduplicated, stats, model = deduplicate_search_results(
1604
- text=output_str,
1605
- similarity_threshold=0.60, # 0.60 catches cross-site paraphrases (0.55-0.68 typical)
1606
- verbose=True, # Show detailed deduplication stats
1607
- cached_model=cached_model # Reuse model if available
1608
- )
1609
-
1610
- # Print comprehensive stats after every search
1611
- print(f"\n{'='*70}")
1612
- print(f"[PIPECLEANER] 📊 DEDUPLICATION RESULTS")
1613
- print(f"{'='*70}")
1614
- print(f"[PIPECLEANER] 🔢 Sentences:")
1615
- print(f"[PIPECLEANER] Original: {stats['original_sentences']} sentences")
1616
- print(f"[PIPECLEANER] Deduplicated: {stats['deduplicated_sentences']} sentences")
1617
- print(f"[PIPECLEANER] Prune %: {stats['prune_pct']:.1f}% removed")
1618
- print(f"[PIPECLEANER]")
1619
- print(f"[PIPECLEANER] 🎯 Entity Coverage:")
1620
- print(f"[PIPECLEANER] Total entities: {stats['entities_total']}")
1621
- print(f"[PIPECLEANER] Entities preserved: {stats['entities_preserved']}")
1622
- print(f"[PIPECLEANER] Coverage: {stats['entity_coverage_pct']:.1f}%")
1623
- print(f"[PIPECLEANER]")
1624
- print(f"[PIPECLEANER] 💰 Token Savings (len/4):")
1625
- print(f"[PIPECLEANER] Original tokens: {stats['original_tokens']:,} tokens")
1626
- print(f"[PIPECLEANER] Deduplicated: {stats['deduplicated_tokens']:,} tokens")
1627
- print(f"[PIPECLEANER] Tokens saved: {stats['tokens_saved']:,} tokens ({(stats['tokens_saved']/stats['original_tokens']*100 if stats['original_tokens'] > 0 else 0):.1f}%)")
1628
- print(f"[PIPECLEANER]")
1629
- print(f"[PIPECLEANER] ✅ SUCCESS: Pruned {stats['prune_pct']:.1f}% redundancy, preserved {stats['entity_coverage_pct']:.1f}% entities")
1630
- print(f"{'='*70}\n")
1631
-
1632
- return deduplicated, model
1633
-
1634
- # No filter rules found, return original
1635
- return output_str, None
1636
-
1637
- except ImportError as e:
1638
- print(f"\n{'='*70}")
1639
- print(f"[PIPECLEANER] IMPORT ERROR - FAILING OPEN")
1640
- print(f"{'='*70}")
1641
- print(f"[PIPECLEANER] Error: {e}")
1642
- print(f"[PIPECLEANER] Install: pip install sentence-transformers")
1643
- print(f"{'='*70}\n")
1644
- return output_str, None
1645
- except Exception as e:
1646
- print(f"\n{'='*70}")
1647
- print(f"[PIPECLEANER] EXCEPTION - FAILING OPEN")
1648
- print(f"{'='*70}")
1649
- print(f"[PIPECLEANER] Error type: {type(e).__name__}")
1650
- print(f"[PIPECLEANER] Error message: {e}")
1651
- import traceback
1652
- print(f"[PIPECLEANER] Traceback:")
1653
- traceback.print_exc()
1654
- print(f"{'='*70}\n")
1655
- return output_str, None
1656
-
1657
-
1658
- def _find_filter_search_rules(tool_name: str, selected_rules: list) -> list:
1659
- """
1660
- Find llm_start scoped rules with "filter search" keywords that apply to this tool.
1661
-
1662
- This is called from on_llm_start when a Summary tool's LLM is about to be called.
1663
- Rule synthesis will generate rules scoped to llm_start when it detects search→summary patterns.
1664
-
1665
- Args:
1666
- tool_name: Name of the tool whose LLM is starting (e.g., 'Summary')
1667
- selected_rules: List of rules to search through
1668
-
1669
- Returns:
1670
- List of applicable filter search rules
1671
- """
1672
- filter_rules = []
1673
-
1674
- for rule_meta in selected_rules:
1675
- # Unwrap tuple if needed (rules come as (rule, metadata) from select_rules)
1676
- if isinstance(rule_meta, tuple) and len(rule_meta) == 2:
1677
- rule_obj, _metadata = rule_meta
1678
- else:
1679
- rule_obj = rule_meta
1680
-
1681
- # Check if this is an llm_start scoped rule
1682
- target_step_type = getattr(rule_obj, 'target_step_type', None)
1683
-
1684
- # Must be scoped to llm_start (where we intercept Summary LLM calls)
1685
- if target_step_type != 'llm_start':
1686
- continue
1687
-
1688
- # Check if the rule contains "filter search" keywords
1689
- # Try both field names that might be used
1690
- advice = getattr(rule_obj, 'advice_text', None) or getattr(rule_obj, 'advice', None) or ''
1691
- advice_lower = advice.lower() if advice else ''
1692
-
1693
- if not advice_lower or 'filter' not in advice_lower or 'search' not in advice_lower:
1694
- continue
1695
-
1696
- # Check if the rule applies to this tool
1697
- applies = _rule_applies_to_tool(rule_obj, tool_name, advice_lower)
1698
-
1699
- if applies:
1700
- filter_rules.append(rule_obj)
1701
-
1702
- return filter_rules
1703
-
1704
-
1705
- def _rule_applies_to_tool(rule_obj, tool_name: str, advice_lower: str) -> bool:
1706
- """
1707
- Check if a rule applies to the given tool.
1708
-
1709
- Args:
1710
- rule_obj: Rule object or dict to check
1711
- tool_name: Name of the tool (case-insensitive)
1712
- advice_lower: Lowercased advice text for fallback matching
1713
-
1714
- Returns:
1715
- True if rule applies to this tool
1716
- """
1717
- # Wildcard matches everything (used for initial check)
1718
- if tool_name == "*":
1719
- return True
1720
-
1721
- tool_name_lower = tool_name.lower()
1722
-
1723
- # Extract references.tools from rule (handle both dict and object formats)
1724
- if isinstance(rule_obj, dict):
1725
- references = rule_obj.get('references', {})
1726
- tools = references.get('tools', []) if isinstance(references, dict) else []
1727
- else:
1728
- references = getattr(rule_obj, 'references', None)
1729
- if references:
1730
- # Try both object attribute and dict access for tools
1731
- if hasattr(references, 'tools'):
1732
- tools = references.tools
1733
- elif isinstance(references, dict):
1734
- tools = references.get('tools', [])
1735
- else:
1736
- tools = []
1737
- else:
1738
- tools = []
1739
-
1740
- if tools:
1741
- # Check if tool_name matches any tool in references.tools (case-insensitive exact match)
1742
- for ref_tool in tools:
1743
- ref_tool_lower = ref_tool.lower()
1744
- if tool_name_lower == ref_tool_lower:
1745
- return True
1746
- # No match found in references.tools
1747
- return False
1748
- else:
1749
- # Rule has no tools list - don't apply to anything (be conservative)
1750
- return False
1751
-
1752
-
1753
- async def run_pipecleaner_enforcement(
1754
- messages_or_prompts: tuple,
1755
- callback_handler: any,
1756
- patch_depth: any
1757
- ) -> bool:
1758
- """
1759
- Main pipecleaner enforcement logic - parallel to run_microturn_enforcement.
1760
-
1761
- This intercepts ToolMessage objects and applies deduplication.
1762
-
1763
- Args:
1764
- messages_or_prompts: Args tuple from _generate (first element is messages)
1765
- callback_handler: DaseinCallbackHandler with rules
1766
- patch_depth: Thread-local object with caching
1767
-
1768
- Returns:
1769
- True if enforcement was applied, False if skipped
1770
- """
1771
- try:
1772
- print(f"[PIPECLEANER] 🧹 run_pipecleaner_enforcement called")
1773
-
1774
- if not callback_handler or not hasattr(callback_handler, '_selected_rules'):
1775
- return False
1776
-
1777
- rules = callback_handler._selected_rules
1778
- print(f"[PIPECLEANER] Found {len(rules)} rules")
1779
-
1780
- filter_rules = _find_filter_search_rules("*", rules)
1781
- if not filter_rules:
1782
- return False
1783
-
1784
- print(f"[PIPECLEANER] 🎯 Found {len(filter_rules)} filter search rules!")
1785
-
1786
- # Extract messages from args
1787
- if not messages_or_prompts or len(messages_or_prompts) == 0:
1788
- return False
1789
-
1790
- messages = messages_or_prompts[0]
1791
- if not isinstance(messages, list):
1792
- return False
1793
-
1794
- # Find the most recent ToolMessage (tool result)
1795
- tool_message = None
1796
- for idx in range(len(messages) - 1, -1, -1):
1797
- msg = messages[idx]
1798
- msg_type = getattr(msg, 'type', None) or (msg.get('type') if isinstance(msg, dict) else None)
1799
- if msg_type == 'tool':
1800
- tool_message = msg
1801
- break
1802
-
1803
- if not tool_message:
1804
- return False
1805
-
1806
- # Extract tool name and content
1807
- tool_name = getattr(tool_message, 'name', None) or tool_message.get('name', 'unknown')
1808
- tool_content = str(getattr(tool_message, 'content', None) or tool_message.get('content', ''))
1809
-
1810
- print(f"[PIPECLEANER] Tool: {tool_name}, content: {len(tool_content)} chars")
1811
-
1812
- # Check if this tool matches our filter rules
1813
- matching_rules = _find_filter_search_rules(tool_name, rules)
1814
- if not matching_rules:
1815
- print(f"[PIPECLEANER] Tool '{tool_name}' doesn't match filter rules, skipping")
1816
- return False
1817
-
1818
- print(f"[PIPECLEANER] 🎯 Tool '{tool_name}' matches filter rules! Starting deduplication...")
1819
-
1820
- # Prevent infinite regression - check if we've already processed this exact message
1821
- if not hasattr(patch_depth, 'processed_tool_messages'):
1822
- patch_depth.processed_tool_messages = set()
1823
-
1824
- # Create signature from tool name + content hash
1825
- msg_signature = f"{tool_name}_{hash(tool_content[:200])}"
1826
- if msg_signature in patch_depth.processed_tool_messages:
1827
- print(f"[PIPECLEANER] Already processed this ToolMessage, skipping")
1828
- return False
1829
-
1830
- # Mark as processed
1831
- patch_depth.processed_tool_messages.add(msg_signature)
1832
-
1833
- # Apply deduplication
1834
- cached_model = getattr(callback_handler, '_pipecleaner_embedding_model', None)
1835
-
1836
- deduplicated, stats, model = deduplicate_search_results(
1837
- text=tool_content,
1838
- similarity_threshold=0.60, # Lowered to catch paraphrases
1839
- verbose=True,
1840
- cached_model=cached_model
1841
- )
1842
-
1843
- # Cache model
1844
- callback_handler._pipecleaner_embedding_model = model
1845
-
1846
- # Modify ToolMessage content IN PLACE
1847
- if hasattr(tool_message, 'content'):
1848
- tool_message.content = deduplicated
1849
- elif isinstance(tool_message, dict):
1850
- tool_message['content'] = deduplicated
1851
-
1852
- # Cache result for potential reuse
1853
- if not hasattr(patch_depth, 'tool_result_cache'):
1854
- patch_depth.tool_result_cache = {}
1855
-
1856
- result_key = f"{tool_name}_{hash(tool_content[:100])}"
1857
- patch_depth.tool_result_cache[result_key] = deduplicated
1858
-
1859
- print(f"[PIPECLEANER] Applied deduplication to {tool_name}")
1860
-
1861
- # Print stats
1862
- print(f"\n{'='*70}")
1863
- print(f"[PIPECLEANER] 📊 DEDUPLICATION RESULTS")
1864
- print(f"{'='*70}")
1865
- print(f"[PIPECLEANER] 🔢 Sentences:")
1866
- print(f"[PIPECLEANER] Original: {stats['original_sentences']} sentences")
1867
- print(f"[PIPECLEANER] Deduplicated: {stats['deduplicated_sentences']} sentences")
1868
- print(f"[PIPECLEANER] Prune %: {stats['prune_pct']:.1f}% removed")
1869
- print(f"[PIPECLEANER]")
1870
- print(f"[PIPECLEANER] 🎯 Entity Coverage:")
1871
- print(f"[PIPECLEANER] Total entities: {stats['entities_total']}")
1872
- print(f"[PIPECLEANER] Entities preserved: {stats['entities_preserved']}")
1873
- print(f"[PIPECLEANER] Coverage: {stats['entity_coverage_pct']:.1f}%")
1874
- print(f"[PIPECLEANER]")
1875
- print(f"[PIPECLEANER] 💰 Token Savings (len/4):")
1876
- print(f"[PIPECLEANER] Original tokens: {stats['original_tokens']:,} tokens")
1877
- print(f"[PIPECLEANER] Deduplicated: {stats['deduplicated_tokens']:,} tokens")
1878
- print(f"[PIPECLEANER] Tokens saved: {stats['tokens_saved']:,} tokens ({(stats['tokens_saved']/stats['original_tokens']*100 if stats['original_tokens'] > 0 else 0):.1f}%)")
1879
- print(f"[PIPECLEANER]")
1880
- print(f"[PIPECLEANER] ✅ SUCCESS: Pruned {stats['prune_pct']:.1f}% redundancy, preserved {stats['entity_coverage_pct']:.1f}% entities")
1881
- print(f"{'='*70}\n")
1882
-
1883
- return True
1884
-
1885
- except Exception as e:
1886
- print(f"[PIPECLEANER] ⚠️ Error during enforcement: {e}")
1887
- import traceback
1888
- traceback.print_exc()
1889
- return False
1890
-
1891
-
1892
- if __name__ == "__main__":
1893
- # Simple test
1894
- test_text = """
1895
- LangChain is a framework for developing applications powered by language models.
1896
- The LangChain framework enables developers to build LLM applications easily.
1897
- LangChain provides many useful features for LLM apps.
1898
- It supports multiple model providers including OpenAI and Anthropic.
1899
- The framework was created in 2022 by Harrison Chase.
1900
- LlamaIndex is another popular framework for LLM applications.
1901
- LlamaIndex focuses on data indexing and retrieval.
1902
- Both frameworks are open source and widely used.
1903
- """
1904
-
1905
- print("Testing pipecleaner deduplication...")
1906
- result, stats, model = deduplicate_search_results(test_text, verbose=True)
1907
-
1908
- print("\n" + "="*70)
1909
- print("STATS:")
1910
- print(f" Prune %: {stats['prune_pct']:.1f}%")
1911
- print(f" Entity Coverage: {stats['entity_coverage_pct']:.1f}%")
1912
- print(f" Tokens saved: {stats['tokens_saved']:,} ({(stats['tokens_saved']/stats['original_tokens']*100 if stats['original_tokens'] > 0 else 0):.1f}%)")
1913
-
1914
- print("\n" + "="*70)
1915
- print("ORIGINAL:")
1916
- print(test_text)
1917
- print("\n" + "="*70)
1918
- print("DEDUPLICATED:")
1919
- print(result)
1920
-
1
+ """
2
+ Pipecleaner: Run-scoped global corpus deduplication for multi-agent systems.
3
+
4
+ V2.0: Global ClusterBank with dynamic batching barrier (5-10s) for cross-prompt deduplication.
5
+ - Run-scoped corpus: All prompts in a run share a global ClusterBank
6
+ - SimHash near-dup matching: Hamming distance ≤6 for 64-bit fingerprints
7
+ - Dynamic barrier: 5s min, +2s per arrival (cap 10s), maximizes dedupe by collecting bursts
8
+ - Canonical ownership: First prompt to use a cluster owns it, others drop duplicates
9
+ - Entity coverage: 95% threshold RUN-LEVEL (cumulative across all batches, not per-batch)
10
+
11
+ Algorithm:
12
+ 1. Intercept prompt → split sentences → compute SimHash signatures
13
+ 2. Match against ClusterBank (Hamming ≤6) → assign cluster_id or create new
14
+ 3. Queue prompt into micro-batch, extend barrier (+2s per arrival, cap 10s)
15
+ 4. On timer: cross-prompt dedupe (keep only canonical owners)
16
+ 5. RUN-LEVEL entity coverage check (95% cumulative across entire run), re-add if needed
17
+ 6. Emit cleaned prompts (original sentence order preserved)
18
+
19
+ Expected savings: 50-90% char reduction with 95%+ entity coverage across entire run.
20
+ Later batches are MORE aggressive (earlier batches already covered entities).
21
+ """
22
+
23
+ import re
24
+ import hashlib
25
+ import threading
26
+ import time
27
+ from typing import List, Dict, Set, Tuple, Optional, Any
28
+ from dataclasses import dataclass, field
29
+ from collections import defaultdict
30
+ import numpy as np
31
+ import asyncio
32
+
33
+ # Type alias for return type
34
+ DeduplicationResult = Tuple[str, Dict]
35
+
36
+ # Lazy imports for performance (only load when needed)
37
+ _embedding_model = None
38
+ _spacy_nlp = None
39
+ _model_lock = threading.Lock() # Thread-safe singleton access
40
+
41
+
42
+ def _vprint(message: str, verbose: bool = False, force: bool = False):
43
+ """Helper function for verbose printing."""
44
+ if force or verbose:
45
+ print(message)
46
+
47
+
48
+ def _get_embedding_model():
49
+ """
50
+ Lazy load sentence transformer model (thread-safe singleton).
51
+ Forces CPU to avoid meta tensor issues on Win + Py3.13 + Torch.
52
+ """
53
+ global _embedding_model
54
+
55
+ # Double-checked locking pattern for performance
56
+ if _embedding_model is None:
57
+ with _model_lock:
58
+ # Check again inside lock (another thread might have loaded it)
59
+ if _embedding_model is None:
60
+ try:
61
+ from sentence_transformers import SentenceTransformer
62
+ print("[PIPECLEANER] Loading embedding model: all-MiniLM-L6-v2 (384-dim, ~80MB)...")
63
+ # Force CPU device to avoid meta tensor issues
64
+ _embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
65
+ print("[PIPECLEANER] ✅ Embedding model loaded successfully (CPU)")
66
+ except ImportError:
67
+ print("[PIPECLEANER] ⚠️ sentence-transformers not installed. Install: pip install sentence-transformers")
68
+ raise
69
+ except Exception as e:
70
+ print(f"[PIPECLEANER] ⚠️ Failed to load embedding model: {e}")
71
+ raise
72
+
73
+ return _embedding_model
74
+
75
+
76
+ def _get_spacy_model():
77
+ """Lazy load spaCy model for entity extraction."""
78
+ global _spacy_nlp
79
+ if _spacy_nlp is None:
80
+ try:
81
+ import spacy
82
+ print("[PIPECLEANER] Loading spaCy model: en_core_web_sm...")
83
+ _spacy_nlp = spacy.load("en_core_web_sm")
84
+ print("[PIPECLEANER] ✅ spaCy model loaded successfully")
85
+ except ImportError:
86
+ print("[PIPECLEANER] ⚠️ spaCy not installed. Using regex fallback for entities.")
87
+ _spacy_nlp = "fallback"
88
+ except OSError:
89
+ print("[PIPECLEANER] ⚠️ spaCy model not found. Using regex fallback for entities.")
90
+ _spacy_nlp = "fallback"
91
+ return _spacy_nlp
92
+
93
+
94
+ # ============================================================================
95
+ # Run-Scoped Global Corpus System V2.0
96
+ # ============================================================================
97
+
98
+ @dataclass
99
+ class SentenceCluster:
100
+ """Represents a cluster of similar sentences across the run."""
101
+ cluster_id: str
102
+ canonical_sentence: str
103
+ owner_prompt_id: str # First prompt to use this cluster
104
+ simhash: int # 64-bit SimHash fingerprint
105
+ salience: float
106
+ entities: Set[str]
107
+ first_seen_seq: int
108
+ length: int
109
+ embedding: Optional[np.ndarray] = None # Sentence embedding for cosine similarity
110
+
111
+ @dataclass
112
+ class PromptState:
113
+ """State for a single prompt in the batch."""
114
+ prompt_id: str
115
+ sentences: List[str]
116
+ cluster_ids: List[str] # parallel to sentences
117
+ original_order: List[int] # track reordering
118
+ entities: Set[str]
119
+ arrived_at: float
120
+
121
+ @dataclass
122
+ class RunCorpusTelemetry:
123
+ """Run-level statistics for the corpus."""
124
+ prompts_total: int = 0
125
+ sentences_total: int = 0
126
+ clusters_total: int = 0
127
+ cross_prompt_dups_removed: int = 0
128
+ chars_in: int = 0
129
+ chars_out: int = 0
130
+ tokens_saved: int = 0
131
+ entity_coverage_avg: float = 100.0
132
+ batches_processed: int = 0
133
+ avg_barrier_ms: float = 0.0
134
+ max_barrier_ms: float = 0.0
135
+ barrier_times: List[float] = field(default_factory=list)
136
+
137
+
138
+ def compute_simhash(text: str, hash_bits: int = 64) -> int:
139
+ """
140
+ Compute SimHash fingerprint for near-dup detection.
141
+
142
+ Args:
143
+ text: Input text
144
+ hash_bits: Hash size (64-bit default)
145
+
146
+ Returns:
147
+ Integer hash value
148
+ """
149
+ # Tokenize and compute feature hashes
150
+ tokens = re.findall(r'\b\w+\b', text.lower())
151
+ if not tokens:
152
+ return 0
153
+
154
+ # Initialize bit vector
155
+ v = [0] * hash_bits
156
+
157
+ for token in tokens:
158
+ # Hash each token
159
+ h = int(hashlib.md5(token.encode()).hexdigest(), 16)
160
+
161
+ # Update bit vector
162
+ for i in range(hash_bits):
163
+ if h & (1 << i):
164
+ v[i] += 1
165
+ else:
166
+ v[i] -= 1
167
+
168
+ # Generate final hash
169
+ fingerprint = 0
170
+ for i in range(hash_bits):
171
+ if v[i] > 0:
172
+ fingerprint |= (1 << i)
173
+
174
+ return fingerprint
175
+
176
+
177
+ def hamming_distance(hash1: int, hash2: int) -> int:
178
+ """Count differing bits between two hashes."""
179
+ return bin(hash1 ^ hash2).count('1')
180
+
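
The two helpers above are the ClusterBank's near-duplicate index: two sentences are candidate duplicates when their 64-bit fingerprints differ in at most `hamming_threshold` bits (6 by default). A minimal sketch of that check, assuming the released package is importable as `dasein.pipecleaner`; the sample sentences are made up and the expected True/False outcomes are hedged, not guaranteed:

# Near-duplicate check as used by the ClusterBank: Hamming distance <= 6 over 64-bit SimHash.
# Assumes `dasein.pipecleaner` is importable; the sentences are illustrative.
from dasein.pipecleaner import compute_simhash, hamming_distance

a = "LangChain is a framework for developing applications powered by language models."
b = "LangChain is a framework for building applications powered by language models."
c = "LlamaIndex focuses on data indexing and retrieval."

fp_a, fp_b, fp_c = (compute_simhash(s) for s in (a, b, c))

HAMMING_THRESHOLD = 6  # mirrors RunScopedCorpus(hamming_threshold=6)
print(hamming_distance(fp_a, fp_b) <= HAMMING_THRESHOLD)  # near-paraphrases: expected True
print(hamming_distance(fp_a, fp_c) <= HAMMING_THRESHOLD)  # unrelated sentences: expected False
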
181
+
182
+ class RunScopedCorpus:
183
+ """
184
+ Global corpus for a single run, with dynamic batching barrier.
185
+ All prompts in the run share this corpus for cross-prompt deduplication.
186
+
187
+ CONCURRENCY MODEL:
188
+ - All shared state (clusters, prompt_registry, run_entities, kept_entities, batch_queue)
189
+ is protected by `self.batch_lock` (threading.Lock)
190
+ - All reads iterate over snapshots (dict(...), list(...)) to avoid "dict changed size" errors
191
+ - All writes are atomic under lock (copy-on-write when possible)
192
+ - Re-entrancy guard in caller (DaseinCallbackHandler) prevents nested calls
193
+ - Background timer thread (_process_batch) acquires lock before any mutations
194
+ """
195
+
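
A stdlib-only sketch of the snapshot-under-lock pattern the docstring describes (the names below are illustrative, not part of the package): writers mutate under the lock, readers copy a snapshot under the lock and iterate outside it, which is what prevents "dict changed size during iteration" errors.

# Minimal sketch of the copy-on-read pattern described above (stdlib only; names are illustrative).
import threading

class TinyBank:
    def __init__(self):
        self.lock = threading.Lock()
        self.clusters = {}  # cluster_id -> canonical sentence

    def add(self, cluster_id, sentence):
        with self.lock:                          # writes are atomic under the lock
            self.clusters[cluster_id] = sentence

    def find_exact(self, sentence):
        with self.lock:                          # reads copy a snapshot under the lock...
            snapshot = dict(self.clusters)
        for cid, canonical in snapshot.items():  # ...and iterate outside it
            if canonical == sentence:
                return cid
        return None
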
196
+ def __init__(self, run_id: str, hamming_threshold: int = 6, entity_coverage_min: float = 0.95, verbose: bool = False):
197
+ self.run_id = run_id
198
+ self.hamming_threshold = hamming_threshold
199
+ self.entity_coverage_min = entity_coverage_min
200
+ self.verbose = verbose # Gate debug logging
201
+
202
+ # Core state
203
+ self.clusters: Dict[str, SentenceCluster] = {} # cluster_id → cluster
204
+ self.simhash_index: Dict[int, List[str]] = defaultdict(list) # simhash → [cluster_ids]
205
+ self.prompt_registry: Dict[str, PromptState] = {} # prompt_id → state
206
+ self.entity_index: Dict[str, Set[str]] = defaultdict(set) # entity → {cluster_ids}
207
+
208
+ # Run-level entity tracking for global coverage
209
+ self.run_entities: Set[str] = set() # All entities seen across entire run
210
+ self.kept_entities: Set[str] = set() # All entities kept across all batches
211
+
212
+ # Batching state
213
+ self.batch_queue: List[str] = [] # [prompt_ids] waiting for barrier
214
+ self.batch_lock = threading.Lock() # Protects batch_queue, batch_timer, etc.
215
+ self.processing_lock = threading.Lock() # CRITICAL: Ensures only ONE batch processes at a time
216
+ self.batch_timer: Optional[threading.Timer] = None
217
+ self.batch_start_time: Optional[float] = None
218
+ self.barrier_duration: float = 5.0 # Start at 5s (min wait)
219
+ self.barrier_increment: float = 2.0 # Add 2s per new arrival
220
+ self.barrier_cap: float = 10.0 # Max 10s
221
+ self.batch_ready = threading.Event() # Signal when batch is processed
222
+ self.prompt_events: Dict[str, asyncio.Event] = {} # Per-prompt events for ASYNC sequential release
223
+ self.prompt_loops: Dict[str, asyncio.AbstractEventLoop] = {} # Event loops for thread-safe signaling
224
+
225
+ # Sequence tracking
226
+ self.next_seq = 0
227
+ self.next_cluster_id = 0
228
+
229
+ # Telemetry
230
+ self.telemetry = RunCorpusTelemetry()
231
+
232
+ _vprint(f"[CORPUS] 🏗️ Created run-scoped corpus for run_id={run_id[:8]} (barrier: 5s min, +2s/arrival, 10s cap)", self.verbose)
233
+
234
+ def _generate_cluster_id(self) -> str:
235
+ """Generate unique cluster ID."""
236
+ cluster_id = f"c{self.next_cluster_id:06d}"
237
+ self.next_cluster_id += 1
238
+ return cluster_id
239
+
240
+ def find_matching_cluster(self, simhash: int, sentence: str, sentence_embedding=None) -> Optional[str]:
241
+ """
242
+ Find existing cluster that matches this sentence using cosine similarity.
243
+
244
+ Args:
245
+ simhash: SimHash of the sentence (for indexing, not matching)
246
+ sentence: Original sentence text
247
+ sentence_embedding: Pre-computed embedding for this sentence
248
+
249
+ Returns:
250
+ cluster_id if match found, None otherwise
251
+ """
252
+ if sentence_embedding is None:
253
+ return None
254
+
255
+ # Check all existing clusters for semantic similarity
256
+ # Use cosine similarity ≥ 0.60 (catches cross-site paraphrases)
257
+ best_match_id = None
258
+ best_similarity = 0.60 # Threshold for considering duplicate (lowered to catch paraphrases)
259
+
260
+ # Snapshot clusters to avoid "dict changed size" errors (thread-safe read)
261
+ with self.batch_lock:
262
+ clusters_snapshot = dict(self.clusters)
263
+
264
+ for cluster_id, cluster in clusters_snapshot.items():
265
+ if cluster.canonical_sentence == sentence:
266
+ # Exact match
267
+ return cluster_id
268
+
269
+ # Hybrid similarity: semantic + lexical fallback for short sentences
270
+ if hasattr(cluster, 'embedding') and cluster.embedding is not None:
271
+ # Semantic similarity
272
+ similarity = np.dot(sentence_embedding, cluster.embedding)
273
+
274
+ # Lexical fallback for short sentences (boilerplate detection)
275
+ max_len = max(len(sentence), len(cluster.canonical_sentence))
276
+ if max_len <= 120 and similarity < 0.60:
277
+ lexical_sim = compute_char_3gram_jaccard(sentence, cluster.canonical_sentence)
278
+ if lexical_sim >= 0.82:
279
+ # Boost similarity to indicate match via lexical path
280
+ similarity = max(similarity, 0.82)
281
+
282
+ if similarity > best_similarity:
283
+ best_similarity = similarity
284
+ best_match_id = cluster_id
285
+
286
+ return best_match_id
287
+
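
Since embeddings are produced with normalize_embeddings=True, the cosine similarity above reduces to a dot product; the char-3-gram Jaccard fallback (compute_char_3gram_jaccard, defined elsewhere in this module) rescues short boilerplate lines where embeddings are unreliable. A rough sketch of the combined test under those assumptions; the Jaccard helper below is an illustrative stand-in, not the module's exact implementation:

# Rough sketch of the hybrid duplicate test: cosine (= dot product on normalized vectors)
# plus a character-3-gram Jaccard fallback for short sentences. Thresholds mirror the code above;
# char_3gram_jaccard is an illustrative stand-in for the module's own helper.
import numpy as np

def char_3gram_jaccard(a: str, b: str) -> float:
    grams = lambda s: {s[i:i + 3] for i in range(len(s) - 2)} if len(s) >= 3 else {s}
    ga, gb = grams(a.lower()), grams(b.lower())
    return len(ga & gb) / len(ga | gb) if ga | gb else 0.0

def is_duplicate(sent_a: str, emb_a, sent_b: str, emb_b) -> bool:
    similarity = float(np.dot(emb_a, emb_b))          # cosine, since embeddings are L2-normalized
    if max(len(sent_a), len(sent_b)) <= 120 and similarity < 0.60:
        if char_3gram_jaccard(sent_a, sent_b) >= 0.82:
            return True                               # lexical match on short boilerplate
    return similarity >= 0.60                         # semantic match
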
288
+ def add_sentence_to_corpus(self, sentence: str, prompt_id: str, salience: float, entities: Set[str]) -> str:
289
+ """
290
+ Add sentence to corpus or match to existing cluster.
291
+
292
+ Args:
293
+ sentence: Sentence text
294
+ prompt_id: Owner prompt
295
+ salience: Importance score
296
+ entities: Extracted entities
297
+
298
+ Returns:
299
+ cluster_id (new or matched)
300
+ """
301
+ # Compute SimHash
302
+ simhash = compute_simhash(sentence)
303
+
304
+ # Try to match existing cluster
305
+ existing_cluster_id = self.find_matching_cluster(simhash, sentence)
306
+
307
+ if existing_cluster_id:
308
+ # Matched existing cluster
309
+ return existing_cluster_id
310
+
311
+ # Create new cluster
312
+ cluster_id = self._generate_cluster_id()
313
+ cluster = SentenceCluster(
314
+ cluster_id=cluster_id,
315
+ canonical_sentence=sentence,
316
+ owner_prompt_id=prompt_id,
317
+ simhash=simhash,
318
+ salience=salience,
319
+ entities=entities,
320
+ first_seen_seq=self.next_seq,
321
+ length=len(sentence)
322
+ )
323
+
324
+ self.clusters[cluster_id] = cluster
325
+ self.simhash_index[simhash].append(cluster_id)
326
+
327
+ # Update entity index
328
+ for entity in entities:
329
+ self.entity_index[entity].add(cluster_id)
330
+
331
+ self.next_seq += 1
332
+ self.telemetry.clusters_total += 1
333
+
334
+ return cluster_id
335
+
336
+ async def enqueue_prompt(self, prompt_id: str, prompt_text: str) -> str:
337
+ """
338
+ Enqueue prompt for batched processing with dynamic barrier (ASYNC - allows parallel arrivals).
339
+
340
+ Args:
341
+ prompt_id: Unique prompt identifier
342
+ prompt_text: Full prompt text
343
+
344
+ Returns:
345
+ Deduplicated prompt text (after barrier)
346
+ """
347
+ arrival_time = time.time()
348
+
349
+ # Split into sentences
350
+ sentences = split_into_sentences(prompt_text)
351
+
352
+ if not sentences:
353
+ return prompt_text
354
+
355
+ self.telemetry.prompts_total += 1
356
+ self.telemetry.sentences_total += len(sentences)
357
+ self.telemetry.chars_in += len(prompt_text)
358
+
359
+ # CRITICAL: DO NOT compute embeddings here! It blocks async arrivals.
360
+ # Store raw sentences and compute embeddings in batch during _process_batch
361
+ all_entities = set()
362
+
363
+ for sentence in sentences:
364
+ # Extract entities (fast, non-blocking)
365
+ entities, numbers = extract_entities_regex(sentence)
366
+ all_entities.update(entities)
367
+ all_entities.update(numbers)
368
+
369
+ # Create prompt state (thread-safe mutation)
370
+ # NOTE: cluster_ids will be computed during batch processing (after embeddings)
371
+ with self.batch_lock:
372
+ prompt_state = PromptState(
373
+ prompt_id=prompt_id,
374
+ sentences=sentences,
375
+ cluster_ids=[], # Will be filled during _process_batch
376
+ original_order=list(range(len(sentences))),
377
+ entities=all_entities,
378
+ arrived_at=arrival_time
379
+ )
380
+
381
+ self.prompt_registry[prompt_id] = prompt_state
382
+
383
+ # Add to batch queue and manage barrier
384
+ # Create per-prompt ASYNC event for sequential release
385
+ prompt_ready = asyncio.Event()
386
+ loop = asyncio.get_running_loop()
387
+ self.prompt_events[prompt_id] = prompt_ready
388
+ self.prompt_loops[prompt_id] = loop
389
+
390
+ with self.batch_lock:
391
+ self.batch_queue.append(prompt_id)
392
+
393
+ if self.batch_timer is None:
394
+ # First prompt in batch, start timer at 5s
395
+ self.batch_start_time = arrival_time
396
+ self.barrier_duration = 5.0
397
+ print(f"[CORPUS] ⏱️ Starting batch barrier: 5.0s (first prompt, min wait)")
398
+ self.batch_timer = threading.Timer(self.barrier_duration, self._process_batch)
399
+ self.batch_timer.start()
400
+ else:
401
+ # Extend barrier by +2s per arrival (capped at 10s)
402
+ elapsed = arrival_time - self.batch_start_time
403
+ new_duration = min(elapsed + self.barrier_increment, self.barrier_cap)
404
+
405
+ if new_duration > self.barrier_duration:
406
+ # Cancel old timer, start new one
407
+ self.batch_timer.cancel()
408
+ remaining = new_duration - elapsed
409
+ self.barrier_duration = new_duration
410
+ _vprint(f"[CORPUS] ⏱️ Extending barrier to {new_duration:.1f}s (+{remaining:.1f}s remaining, +{self.barrier_increment:.1f}s per arrival)", self.verbose)
411
+ self.batch_timer = threading.Timer(remaining, self._process_batch)
412
+ self.batch_timer.start()
413
+
414
+ # ASYNC wait for THIS prompt's individual event (allows other async tasks to proceed)
415
+ # Timeout must be generous to account for model loading on first batch
416
+ try:
417
+ await asyncio.wait_for(prompt_ready.wait(), timeout=30.0) # 30s max wait (model load + processing)
418
+ timed_out = False
419
+ except asyncio.TimeoutError:
420
+ timed_out = True
421
+
422
+ if timed_out:
423
+ # Fail open: return original text if batch processing hangs
424
+ print(f"[CORPUS] ⚠️ Timeout waiting for batch processing, returning original prompt")
425
+ self.telemetry.chars_out += len(prompt_text)
426
+ return prompt_text
427
+
428
+ # Retrieve deduplicated result
429
+ deduplicated_text = self._get_deduplicated_prompt(prompt_id)
430
+
431
+ if not deduplicated_text:
432
+ # Safety: if result is missing, return original
433
+ print(f"[CORPUS] ⚠️ Missing deduplicated result for prompt {prompt_id[:8]}, returning original")
434
+ self.telemetry.chars_out += len(prompt_text)
435
+ return prompt_text
436
+
437
+ self.telemetry.chars_out += len(deduplicated_text)
438
+
439
+ return deduplicated_text
440
+
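
The barrier arithmetic above is easier to follow with concrete numbers: the first prompt arms a 5 s timer, and every later arrival can only stretch the window to min(elapsed + 2, 10) seconds measured from the first arrival; it never shrinks it. A small worked example of just that formula (the arrival times are made up):

# Worked example of the dynamic barrier: 5 s minimum, +2 s per arrival, 10 s cap.
# `elapsed` is seconds since the first prompt of the batch arrived; values are illustrative.
def extended_barrier(current: float, elapsed: float, increment: float = 2.0, cap: float = 10.0) -> float:
    return max(current, min(elapsed + increment, cap))  # never shrinks, never exceeds the cap

barrier = 5.0                                     # t=0.0  first prompt arms the 5 s timer
barrier = extended_barrier(barrier, elapsed=1.0)  # t=1.0  -> 5.0 (1+2 < 5, no change)
barrier = extended_barrier(barrier, elapsed=4.0)  # t=4.0  -> 6.0
barrier = extended_barrier(barrier, elapsed=5.5)  # t=5.5  -> 7.5
barrier = extended_barrier(barrier, elapsed=7.0)  # t=7.0  -> 9.0
barrier = extended_barrier(barrier, elapsed=8.5)  # t=8.5  -> 10.0 (capped)
print(barrier)  # 10.0
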
441
+ def _process_batch(self):
442
+ """Process current batch: cross-prompt dedupe, entity coverage check, emit (synchronous)."""
443
+ # CRITICAL: Acquire processing lock to prevent multiple batches from processing simultaneously
444
+ with self.processing_lock:
445
+ with self.batch_lock:
446
+ if not self.batch_queue:
447
+ # No prompts to process, just return (shouldn't happen)
448
+ return
449
+
450
+ batch_prompts = self.batch_queue.copy()
451
+ self.batch_queue.clear()
452
+ self.batch_timer = None
453
+
454
+ batch_duration_ms = (time.time() - self.batch_start_time) * 1000
455
+ self.telemetry.barrier_times.append(batch_duration_ms)
456
+ self.telemetry.batches_processed += 1
457
+
458
+ # Always show batch summary (key metric)
459
+ print(f"\n[CORPUS] 🔄 Processing batch: {len(batch_prompts)} prompts, barrier={batch_duration_ms:.0f}ms")
460
+
461
+ # Step 0: Compute embeddings for NEW prompts in this batch (BATCHED operation!)
462
+ # This is done ONCE for the entire batch, allowing parallel arrivals
463
+ _vprint(f"[CORPUS] 🧮 Computing embeddings for {len(batch_prompts)} new prompts...", self.verbose)
464
+ model = _get_embedding_model()
465
+
466
+ for prompt_id in batch_prompts:
467
+ prompt_state = self.prompt_registry[prompt_id]
468
+
469
+ if not prompt_state.cluster_ids: # Only process if not yet clustered
470
+ # Compute embeddings for all sentences in this prompt (batch operation)
471
+ sentence_embeddings = model.encode(prompt_state.sentences, show_progress_bar=False, normalize_embeddings=True)
472
+
473
+ # Match/create clusters for each sentence
474
+ cluster_ids = []
475
+ for i, sentence in enumerate(prompt_state.sentences):
476
+ # Compute salience
477
+ salience = len(sentence) / 100.0
478
+ salience += len(re.findall(r'\b[A-Z][a-z]+', sentence)) * 0.1
479
+
480
+ # Extract entities
481
+ entities, numbers = extract_entities_regex(sentence)
482
+
483
+ # Match against existing clusters
484
+ cluster_id = self.find_matching_cluster(0, sentence, sentence_embeddings[i])
485
+
486
+ if cluster_id is None:
487
+ # Create new cluster
488
+ with self.batch_lock:
489
+ cluster_id = self._generate_cluster_id()
490
+ simhash = compute_simhash(sentence)
491
+
492
+ cluster = SentenceCluster(
493
+ cluster_id=cluster_id,
494
+ canonical_sentence=sentence,
495
+ owner_prompt_id=prompt_id,
496
+ simhash=simhash,
497
+ salience=salience,
498
+ entities=entities | numbers,
499
+ first_seen_seq=self.next_seq,
500
+ length=len(sentence),
501
+ embedding=sentence_embeddings[i]
502
+ )
503
+
504
+ self.clusters[cluster_id] = cluster
505
+ self.next_seq += 1
506
+ self.telemetry.clusters_total += 1
507
+
508
+ cluster_ids.append(cluster_id)
509
+
510
+ # Update prompt state with cluster_ids
511
+ prompt_state.cluster_ids = cluster_ids
512
+
513
+ _vprint(f"[CORPUS] Embeddings computed and clusters assigned", self.verbose)
514
+
515
+ # Step 1: Collect ALL sentences from THE ENTIRE RUN (not just current batch!)
516
+ # This is critical for true run-scoped deduplication
517
+ all_sentences = []
518
+         sentence_to_prompt = {}  # Map sentence_id → (prompt_id, index)
519
+ locked_sentences = set() # Sentences from previous batches (already emitted, can't remove)
520
+
521
+ # Iterate over ALL prompts in registry (including previous batches)
522
+ for prompt_id, prompt_state in self.prompt_registry.items():
523
+ is_previous_batch = prompt_id not in batch_prompts
524
+
525
+ for idx, (sentence_text, cluster_id) in enumerate(zip(prompt_state.sentences, prompt_state.cluster_ids)):
526
+ cluster = self.clusters.get(cluster_id)
527
+ if not cluster:
528
+ continue
529
+
530
+ # Create Sentence object for greedy algorithm
531
+ sent_id = f"{prompt_id}_{idx}"
532
+ sent_obj = Sentence(
533
+ id=sent_id,
534
+ text=sentence_text,
535
+ embedding=cluster.embedding,
536
+ entities=cluster.entities, # Keep ALL entities for accurate coverage tracking
537
+ numbers=set(), # Already in entities
538
+ salience=cluster.salience,
539
+ position=cluster.first_seen_seq
540
+ )
541
+ all_sentences.append(sent_obj)
542
+ sentence_to_prompt[sent_id] = (prompt_id, idx)
543
+
544
+ # Lock sentences from previous batches (already emitted to user)
545
+ if is_previous_batch:
546
+ locked_sentences.add(sent_id)
547
+
548
+ _vprint(f"[CORPUS] 🌐 Run-scoped MIS: {len(all_sentences)} total sentences ({len(locked_sentences)} locked from previous batches, {len(all_sentences)-len(locked_sentences)} new)", self.verbose)
549
+ _vprint(f"[CORPUS] 🧮 Running greedy max-independent-set on {len(all_sentences)} sentences", self.verbose)
550
+
551
+ # Step 2: Compute degree map (needed for isolates pass later)
552
+ degree_map = {}
553
+ for sent in all_sentences:
554
+ degree = 0
555
+ for other in all_sentences:
556
+ if sent.id != other.id:
557
+ if are_sentences_similar(sent, other, semantic_threshold=0.60):
558
+ degree += 1
559
+ degree_map[sent.id] = degree
560
+
561
+ # Sanity checks
562
+ isolates_before = [s for s in all_sentences if degree_map[s.id] == 0]
563
+ non_isolates = [s for s in all_sentences if degree_map[s.id] > 0]
564
+ pct_isolates = len(isolates_before) / len(all_sentences) * 100 if all_sentences else 0
565
+ avg_degree_non_iso = sum(degree_map[s.id] for s in non_isolates) / len(non_isolates) if non_isolates else 0
566
+ print(f"[CORPUS] 📊 Graph: isolates={pct_isolates:.1f}% (expect <20%), non-isolate avg degree={avg_degree_non_iso:.1f} (expect >3)")
567
+
568
+ # Step 3: Run greedy maximum-independent-set selection
569
+ # Start with LOCKED sentences (from previous batches, already emitted)
570
+ # Then run MIS only on NEW sentences (current batch)
571
+ selected_sentences = [s for s in all_sentences if s.id in locked_sentences]
572
+ selected_ids = locked_sentences.copy()
573
+
574
+ print(f"[CORPUS] 🔒 Pre-seeded MIS with {len(locked_sentences)} locked sentences from previous batches")
575
+
576
+ # Now run MIS on NEW sentences only (exclude locked)
577
+ new_sentences = [s for s in all_sentences if s.id not in locked_sentences]
578
+
579
+ if new_sentences:
580
+ # Run MIS on new sentences, considering locked ones as neighbors
581
+ new_selected = greedy_max_independent_set(
582
+ new_sentences,
583
+ similarity_threshold=0.60,
584
+ verbose=False, # Set to True for debugging
585
+ precomputed_degree_map=degree_map # Pass precomputed degrees
586
+ )
587
+
588
+ # Add newly selected sentences
589
+ selected_sentences.extend(new_selected)
590
+ selected_ids.update(s.id for s in new_selected)
591
+
592
+ _vprint(f"[CORPUS] MIS complete: {len(selected_ids)} total kept ({len(locked_sentences)} locked + {len(selected_ids)-len(locked_sentences)} new)", self.verbose)
593
+
594
+             # Step 4: Compute NODE COVERAGE (align universe for backfill)
595
+ # covered_nodes = S ∪ N(S) (selected + their neighbors)
596
+ covered_nodes = set(selected_ids)
597
+ sentence_map = {s.id: s for s in all_sentences}
598
+
599
+ for selected_id in selected_ids:
600
+ selected_sent = sentence_map[selected_id]
601
+ # Add all neighbors (similar nodes)
602
+ for other in all_sentences:
603
+ if other.id != selected_id:
604
+ if are_sentences_similar(selected_sent, other, semantic_threshold=0.60):
605
+ covered_nodes.add(other.id)
606
+
607
+ total_nodes = len(all_sentences)
608
+ node_coverage_before = len(covered_nodes) / total_nodes if total_nodes > 0 else 0.0
609
+
610
+ _vprint(f"[CORPUS] 📊 After MIS: nodes={len(selected_ids)}/{total_nodes} kept, coverage (S∪N(S))={len(covered_nodes)}/{total_nodes} ({node_coverage_before*100:.1f}%)", self.verbose)
611
+
612
+             # Step 5: Backfill = GREEDY SET COVER over NODES (no independence constraint!)
613
+ # Goal: Maximize node coverage (S ∪ N(S)) by re-adding removed nodes with highest gain
614
+             # gain(u) = |({u} ∪ N(u)) \ covered_nodes|
615
+ backfill_added = 0
616
+ isolates_added = 0
617
+ target_coverage = 0.90 # 90% node coverage target
618
+
619
+ if node_coverage_before < target_coverage:
620
+ uncovered_count = total_nodes - len(covered_nodes)
621
+ _vprint(f"[CORPUS] 🔧 Backfill: {uncovered_count} uncovered nodes, targeting {target_coverage*100:.0f}% coverage", self.verbose)
622
+
623
+ # Get ALL removed sentences (candidates for backfill)
624
+ removed_sentences = [sent for sent in all_sentences if sent.id not in selected_ids]
625
+
626
+ # Helper: compute node gain for a candidate
627
+ def compute_node_gain(sent):
628
+ """Compute how many uncovered nodes this sentence + its neighbors would cover."""
629
+ candidate_coverage = {sent.id}
630
+ # Add neighbors
631
+ for other in all_sentences:
632
+ if other.id != sent.id:
633
+ if are_sentences_similar(sent, other, semantic_threshold=0.60):
634
+ candidate_coverage.add(other.id)
635
+ # Gain = new nodes not already covered
636
+ return len(candidate_coverage - covered_nodes)
637
+
638
+ # Debug: Print top-5 candidates by gain (first iteration only)
639
+ if removed_sentences:
640
+ gains = [(sent, compute_node_gain(sent)) for sent in removed_sentences[:20]] # Sample first 20 for speed
641
+ gains.sort(key=lambda x: x[1], reverse=True)
642
+ _vprint(f"[CORPUS] Top-5 backfill candidates by gain:", self.verbose)
643
+ for sent, gain in gains[:5]:
644
+ _vprint(f" gain={gain}: '{sent.text[:60]}...'", self.verbose)
645
+
646
+ # GREEDY SET COVER: repeatedly pick sentence with max gain
647
+ iteration = 0
648
+ while node_coverage_before < target_coverage and removed_sentences and iteration < 100:
649
+ # Find best candidate
650
+ best_sent = None
651
+ best_gain = 0
652
+
653
+ for sent in removed_sentences:
654
+ gain = compute_node_gain(sent)
655
+ if gain > best_gain:
656
+ best_gain = gain
657
+ best_sent = sent
658
+
659
+ if best_gain == 0:
660
+ _vprint(f"[CORPUS] Backfill: all remaining candidates have gain=0, stopping", self.verbose)
661
+ break
662
+
663
+ # Add best sentence back
664
+ selected_ids.add(best_sent.id)
665
+ selected_sentences.append(best_sent)
666
+
667
+ # Update covered_nodes: add this node + its neighbors
668
+ covered_nodes.add(best_sent.id)
669
+ for other in all_sentences:
670
+ if other.id != best_sent.id:
671
+ if are_sentences_similar(best_sent, other, semantic_threshold=0.60):
672
+ covered_nodes.add(other.id)
673
+
674
+ removed_sentences.remove(best_sent)
675
+ backfill_added += 1
676
+
677
+ # Update coverage
678
+ node_coverage_before = len(covered_nodes) / total_nodes
679
+ iteration += 1
680
+
681
+ if backfill_added <= 5:
682
+ _vprint(f"[CORPUS] ✅ Backfill +{best_gain} nodes: '{best_sent.text[:60]}...' (coverage now {node_coverage_before*100:.1f}%)", self.verbose)
683
+
684
+             _vprint(f"[CORPUS] 📈 After backfill: +{backfill_added} sentences, node coverage {node_coverage_before*100:.1f}%", self.verbose)
685
+
686
+             # Step 6: ISOLATES PASS - add uncovered degree=0 nodes
687
+ # These are unique nodes with no similar neighbors
688
+ uncovered_isolates = [sent for sent in all_sentences
689
+ if sent.id not in covered_nodes and degree_map[sent.id] == 0]
690
+
691
+ if uncovered_isolates:
692
+ _vprint(f"[CORPUS] 🔧 Isolates pass: {len(uncovered_isolates)} uncovered isolates (degree=0)", self.verbose)
693
+
694
+ for sent in uncovered_isolates:
695
+ if node_coverage_before >= target_coverage:
696
+ break
697
+ selected_ids.add(sent.id)
698
+ covered_nodes.add(sent.id)
699
+ isolates_added += 1
700
+ node_coverage_before = len(covered_nodes) / total_nodes
701
+
702
+ if isolates_added <= 5:
703
+ _vprint(f"[CORPUS] Isolate: '{sent.text[:60]}...'", self.verbose)
704
+
705
+ if isolates_added > 0:
706
+ _vprint(f"[CORPUS] 📈 After isolates: +{isolates_added} sentences, node coverage {node_coverage_before*100:.1f}%", self.verbose)
707
+
708
+ # Final coverage stats (NODE universe)
709
+ final_selected = len(selected_ids)
710
+ final_covered_nodes = len(covered_nodes)
711
+ final_node_coverage = final_covered_nodes / total_nodes if total_nodes > 0 else 0.0
712
+
713
+ # Assert denominator is |V| (all nodes, no filtering)
714
+ assert total_nodes == len(all_sentences), f"Denominator mismatch: {total_nodes} != {len(all_sentences)}"
715
+
716
+ _vprint(f"[CORPUS] Final: kept={final_selected}/{total_nodes}, covered (S∪N(S))={final_covered_nodes}/{total_nodes} ({final_node_coverage*100:.1f}%)", self.verbose)
717
+ _vprint(f"[CORPUS] 📊 Backfill={backfill_added}, Isolates={isolates_added}", self.verbose)
718
+
719
+             # Step 7: Map results back to prompts
720
+ results = {}
721
+ for prompt_id in batch_prompts:
722
+ prompt_state = self.prompt_registry[prompt_id]
723
+ kept_sentences = []
724
+ removed_count = 0
725
+
726
+ for idx, sentence_text in enumerate(prompt_state.sentences):
727
+ sent_id = f"{prompt_id}_{idx}"
728
+ if sent_id in selected_ids:
729
+ kept_sentences.append(sentence_text)
730
+ else:
731
+ removed_count += 1
732
+
733
+ results[prompt_id] = {
734
+ 'kept': kept_sentences,
735
+ 'removed': removed_count,
736
+ 'original_count': len(prompt_state.sentences)
737
+ }
738
+
739
+             # Step 8: Store results and emit to prompts
740
+ for prompt_id in batch_prompts:
741
+ prompt_state = self.prompt_registry[prompt_id]
742
+ result = results[prompt_id]
743
+ prompt_state.sentences = result['kept']
744
+
745
+ reduction_pct = (result['removed'] / result['original_count'] * 100) if result['original_count'] > 0 else 0
746
+                 _vprint(f"[CORPUS]    Prompt {prompt_id[:8]}: {result['original_count']} → {len(result['kept'])} sentences ({reduction_pct:.1f}% removed)", self.verbose)
747
+
748
+ # Update telemetry
749
+ self.telemetry.entity_coverage_avg = final_node_coverage * 100 # Now tracking NODE coverage
750
+ # Always show final batch summary (key metric)
751
+ print(f"[CORPUS] ✅ Batch complete: Node coverage {final_node_coverage*100:.1f}%")
752
+
753
+             # Update barrier timing telemetry
754
+ if self.telemetry.barrier_times:
755
+ self.telemetry.avg_barrier_ms = sum(self.telemetry.barrier_times) / len(self.telemetry.barrier_times)
756
+ self.telemetry.max_barrier_ms = max(self.telemetry.barrier_times)
757
+
758
+ self.telemetry.tokens_saved = (self.telemetry.chars_in - self.telemetry.chars_out) // 4
759
+
760
+ # Release prompts SEQUENTIALLY to avoid race condition in on_llm_start
761
+ _vprint(f"[CORPUS] 🚦 Releasing {len(batch_prompts)} prompts sequentially...", self.verbose)
762
+ for i, prompt_id in enumerate(batch_prompts):
763
+ event = self.prompt_events.get(prompt_id)
764
+ if event:
765
+ # Signal the asyncio.Event from the original loop thread-safely
766
+ loop = self.prompt_loops.get(prompt_id)
767
+ if loop:
768
+ loop.call_soon_threadsafe(event.set)
769
+ else:
770
+ event.set()
771
+ # Longer delay to ensure threads hit on_llm_start one at a time
772
+ if i < len(batch_prompts) - 1: # Don't delay after the last one
773
+ time.sleep(0.5) # 500ms stagger to be safe
774
+
775
+ # Clean up events to prevent memory leak
776
+ for prompt_id in batch_prompts:
777
+ self.prompt_events.pop(prompt_id, None)
778
+ self.prompt_loops.pop(prompt_id, None)
779
+
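
The backfill inside `_process_batch` is a plain greedy set cover over graph nodes: repeatedly re-add the removed sentence whose closed neighborhood {u} ∪ N(u) covers the most still-uncovered nodes, until the coverage target is reached or no candidate adds anything. A compact standalone sketch of that loop on an abstract adjacency map, with illustrative names only, separate from the class above:

# Standalone sketch of the greedy set-cover backfill: pick the candidate whose closed
# neighborhood {u} ∪ N(u) adds the most uncovered nodes; stop at the target or when gain hits 0.
from typing import Dict, Set

def greedy_backfill(neighbors: Dict[str, Set[str]], covered: Set[str],
                    candidates: Set[str], target: float) -> Set[str]:
    total = len(neighbors)
    covered, candidates, added = set(covered), set(candidates), set()
    while candidates and len(covered) / total < target:
        gain, best = max(((len(({u} | neighbors[u]) - covered), u) for u in candidates),
                         key=lambda pair: pair[0])
        if gain == 0:
            break                                   # nothing left improves coverage
        covered |= {best} | neighbors[best]
        candidates.discard(best)
        added.add(best)
    return added

# Tiny example: both 'c' and 'd' must be re-added to reach 100% node coverage.
adj = {"a": {"b"}, "b": {"a", "c"}, "c": {"b"}, "d": set()}
print(greedy_backfill(adj, covered={"a", "b"}, candidates={"c", "d"}, target=1.0))  # {'c', 'd'} (set order may vary)
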
780
+ def _get_deduplicated_prompt(self, prompt_id: str) -> str:
781
+ """Get deduplicated prompt text."""
782
+ prompt_state = self.prompt_registry.get(prompt_id)
783
+ if not prompt_state:
784
+ return ""
785
+
786
+ return "\n".join(prompt_state.sentences)
787
+
788
+ def get_telemetry_summary(self) -> str:
789
+ """Generate human-readable telemetry summary."""
790
+ t = self.telemetry
791
+ reduction_pct = ((t.chars_in - t.chars_out) / t.chars_in * 100) if t.chars_in > 0 else 0
792
+
793
+ summary = f"""
794
+ {'='*70}
795
+ [CORPUS] 📊 RUN-SCOPED TELEMETRY (run_id={self.run_id[:8]})
796
+ {'='*70}
797
+ Prompts processed: {t.prompts_total}
798
+ Sentences total: {t.sentences_total}
799
+ Clusters created: {t.clusters_total}
800
+ Cross-prompt dups removed: {t.cross_prompt_dups_removed}
801
+ {'='*70}
802
+ Chars in: {t.chars_in:,}
803
+ Chars out: {t.chars_out:,}
804
+ Reduction: {reduction_pct:.1f}%
805
+ Tokens saved (est): {t.tokens_saved:,} tokens
806
+ {'='*70}
807
+ Node Coverage (S∪N(S)): {t.entity_coverage_avg:.1f}%
808
+ Batches processed: {t.batches_processed}
809
+ Avg barrier: {t.avg_barrier_ms:.0f}ms
810
+ Max barrier: {t.max_barrier_ms:.0f}ms
811
+ {'='*70}
812
+ """
813
+ return summary
814
+
815
+
816
+ # Global registry of run-scoped corpuses
817
+ _run_corpuses: Dict[str, RunScopedCorpus] = {}
818
+ _corpus_lock = threading.Lock()
819
+
820
+
821
+ def get_or_create_corpus(run_id: str, verbose: bool = False) -> RunScopedCorpus:
822
+ """Get or create run-scoped corpus (thread-safe)."""
823
+ with _corpus_lock:
824
+ if run_id not in _run_corpuses:
825
+ _run_corpuses[run_id] = RunScopedCorpus(run_id, verbose=verbose)
826
+ return _run_corpuses[run_id]
827
+
828
+
829
+ def cleanup_corpus(run_id: str):
830
+ """Cleanup corpus when run ends."""
831
+ with _corpus_lock:
832
+ if run_id in _run_corpuses:
833
+ corpus = _run_corpuses[run_id]
834
+ print(corpus.get_telemetry_summary())
835
+ del _run_corpuses[run_id]
836
+ print(f"[CORPUS] 🗑️ Cleaned up corpus for run_id={run_id[:8]}")
837
+
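
Taken together, the run-scoped surface is small: one corpus per run id, async enqueue_prompt calls that wait behind the shared barrier, and cleanup_corpus to print the telemetry summary and drop state. A hedged usage sketch: the run id and prompt texts are made up, and it assumes sentence-transformers is installed so the batch step can embed (the first call also pays the 5-10 s barrier plus model load).

# Hedged usage sketch of the run-scoped corpus lifecycle (run id and prompts are illustrative).
import asyncio
from dasein.pipecleaner import get_or_create_corpus, cleanup_corpus

async def main():
    corpus = get_or_create_corpus("demo-run-0001", verbose=True)
    prompts = {
        "p1": "LangChain is a framework for LLM apps. It supports OpenAI and Anthropic.",
        "p2": "LangChain is a framework for LLM applications. It was created by Harrison Chase.",
    }
    # Concurrent arrivals share one barrier/batch; each call returns its deduplicated prompt.
    cleaned = await asyncio.gather(
        *(corpus.enqueue_prompt(pid, text) for pid, text in prompts.items())
    )
    for pid, text in zip(prompts, cleaned):
        print(pid, "->", text)
    cleanup_corpus("demo-run-0001")  # prints the run-level telemetry summary

asyncio.run(main())
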
838
+
839
+ # ============================================================================
840
+ # Legacy Per-Prompt Deduplication (V1.0 - Fallback)
841
+ # ============================================================================
842
+
843
+ @dataclass
844
+ class Sentence:
845
+ """Represents a sentence with metadata for deduplication."""
846
+ id: str
847
+ text: str
848
+ embedding: Optional[np.ndarray] = None
849
+ entities: Set[str] = None
850
+ numbers: Set[str] = None
851
+ salience: float = 0.0
852
+ position: int = 0
853
+
854
+ def __post_init__(self):
855
+ if self.entities is None:
856
+ self.entities = set()
857
+ if self.numbers is None:
858
+ self.numbers = set()
859
+
860
+ @property
861
+ def protected_entities(self) -> Set[str]:
862
+ """All entities that must be preserved."""
863
+ return self.entities | self.numbers
864
+
865
+
866
+ def estimate_tokens(text: str) -> int:
867
+ """Estimate token count (roughly chars/4 for English)."""
868
+ return len(text) // 4
869
+
870
+
871
+ def adaptive_resize_sentences(sentences: List[str]) -> List[str]:
872
+ """
873
+ Adaptively resize sentences for optimal embedding similarity:
874
+ - Long (>120 tokens): Split on commas, semicolons, conjunctions
875
+ - Short (<40 tokens): Merge with next sentence
876
+ - Mid (40-120 tokens): Keep as-is
877
+
878
+ This improves cross-page similarity and reduces false uniqueness.
879
+ """
880
+ resized = []
881
+ i = 0
882
+
883
+ while i < len(sentences):
884
+ sent = sentences[i]
885
+ tokens = estimate_tokens(sent)
886
+
887
+ if tokens > 120:
888
+ # LONG: Split on commas, semicolons, and conjunctions
889
+ # Split points: , ; : and, but, or, however, therefore (preceded by space/comma)
890
+ split_pattern = r'(?:,\s+(?:and|but|or|however|therefore|while|although)\s+|[;:])\s+'
891
+ chunks = re.split(split_pattern, sent)
892
+
893
+ # Ensure each chunk is reasonable (not too tiny)
894
+ for chunk in chunks:
895
+ if chunk.strip() and estimate_tokens(chunk) >= 20:
896
+ resized.append(chunk.strip())
897
+ elif resized:
898
+ # Merge tiny chunk with previous
899
+ resized[-1] += " " + chunk.strip()
900
+ i += 1
901
+
902
+ elif tokens < 40 and i + 1 < len(sentences):
903
+ # SHORT: Merge with next sentence
904
+ next_sent = sentences[i + 1]
905
+ merged = sent + " " + next_sent
906
+ merged_tokens = estimate_tokens(merged)
907
+
908
+ # Only merge if result is ≤120 tokens (don't create overly long sentences)
909
+ if merged_tokens <= 120:
910
+ resized.append(merged)
911
+ i += 2 # Skip next sentence (already merged)
912
+ else:
913
+ # Next sentence would make it too long, keep short one as-is
914
+ resized.append(sent)
915
+ i += 1
916
+
917
+ else:
918
+ # MID-RANGE (40-120) or last sentence: Keep as-is
919
+ resized.append(sent)
920
+ i += 1
921
+
922
+ return resized
923
+
924
+
925
+ def split_into_sentences(text: str) -> List[str]:
926
+ """
927
+ Split text into sentences with special handling for markdown structures,
928
+ then adaptively resize for optimal embedding similarity.
929
+
930
+ Handles:
931
+ - Standard sentences ending with .!?
932
+ - Bullet points and numbered lists
933
+ - Code blocks (preserve as single units)
934
+ - Headers
935
+ - Adaptive resizing: long sentences split, short ones merged
936
+ """
937
+ sentences = []
938
+
939
+ # First, protect code blocks
940
+ code_block_pattern = r'```[\s\S]*?```'
941
+ code_blocks = {}
942
+ for i, match in enumerate(re.finditer(code_block_pattern, text)):
943
+ placeholder = f"__CODE_BLOCK_{i}__"
944
+ code_blocks[placeholder] = match.group()
945
+ text = text.replace(match.group(), placeholder)
946
+
947
+ # Split on sentence boundaries
948
+ # Handle: . ! ? followed by space/newline, or newlines with list markers
949
+ patterns = [
950
+ r'(?<=[.!?])\s+(?=[A-Z])', # Standard sentences
951
+ r'\n\s*[-*•]\s+', # Bullet points
952
+ r'\n\s*\d+\.\s+', # Numbered lists
953
+ r'\n#{1,6}\s+', # Markdown headers
954
+ r'\n\s*\n', # Paragraph breaks
955
+ ]
956
+
957
+ # Use non-capturing groups so delimiters are discarded by re.split
958
+ combined_pattern = '(?:' + '|'.join(patterns) + ')'
959
+ parts = re.split(combined_pattern, text)
960
+
961
+ # Collect non-empty segments as sentences
962
+ sentences = [p.strip() for p in parts if p and p.strip()]
963
+
964
+ # Restore code blocks
965
+ restored = []
966
+ for sent in sentences:
967
+ for placeholder, code in code_blocks.items():
968
+ sent = sent.replace(placeholder, code)
969
+ if sent.strip():
970
+ restored.append(sent.strip())
971
+
972
+ # ADAPTIVE RESIZING: Split long sentences, merge short ones
973
+ resized = adaptive_resize_sentences(restored)
974
+
975
+ return resized
976
+
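
A quick illustration of the splitter plus adaptive resizing on a small markdown-ish blob with bullets, a header, and a fenced code block (which the function preserves as a single unit). The exact segment count depends on the resizing heuristics above, so treat this as a sketch rather than a fixed expected output:

# Sketch: splitting a small markdown-ish blob; the fenced code block survives as one unit.
from dasein.pipecleaner import split_into_sentences

raw = (
    "# Results\n"
    "LangChain is a framework for LLM apps. It supports OpenAI and Anthropic.\n"
    "- Point one about retrieval\n"
    "- Point two about indexing\n"
    "```python\nprint('hello')\n```\n"
)
for i, sent in enumerate(split_into_sentences(raw)):
    print(i, repr(sent[:60]))
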
977
+
978
+ def extract_entities_regex(text: str) -> Tuple[Set[str], Set[str]]:
979
+ """
980
+ Fallback regex-based entity extraction.
981
+
982
+ Returns:
983
+ (entities, numbers) - Sets of extracted entities and numbers
984
+ """
985
+ entities = set()
986
+ numbers = set()
987
+
988
+ # Proper nouns: Capitalized words (basic heuristic) - at least 3 chars
989
+ proper_nouns = re.findall(r'\b[A-Z][a-z]{2,}(?:\s+[A-Z][a-z]+)*\b', text)
990
+ entities.update(proper_nouns)
991
+
992
+ # Technical terms: CamelCase, snake_case, package names
993
+ technical = re.findall(r'\b[A-Z][a-z]+[A-Z]\w+\b', text) # CamelCase
994
+ technical += re.findall(r'\b\w+_\w+\b', text) # snake_case
995
+ entities.update(technical)
996
+
997
+ # Numbers: MEANINGFUL numbers only (exclude single digits 0-9)
998
+ # Include: multi-digit numbers, floats, percentages, version numbers
999
+ nums = re.findall(r'\b\d{2,}(?:\.\d+)?%?\b', text) # 2+ digits
1000
+ nums += re.findall(r'\b\d+\.\d+\b', text) # Floats like 14.4, 2.0
1001
+ numbers.update(nums)
1002
+
1003
+ # Dates: YYYY-MM-DD, MM/DD/YYYY, etc.
1004
+ dates = re.findall(r'\b\d{4}[-/]\d{1,2}[-/]\d{1,4}\b', text) # Full dates
1005
+ dates += re.findall(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b', text)
1006
+ numbers.update(dates)
1007
+
1008
+ # Filter out common non-informative words and malformed entities
1009
+ stopwords = {
1010
+ # Common words
1011
+ 'The', 'This', 'That', 'These', 'Those', 'What', 'Where', 'When', 'Why', 'How', 'Who', 'Which',
1012
+ 'Welcome', 'Search', 'Summary', 'Source', 'Url', 'Http', 'Https', 'One', 'Two', 'Three', 'Four', 'Five',
1013
+ 'Key', 'Our', 'Its', 'It', 'For', 'With', 'And', 'But', 'Not', 'You', 'All', 'Can', 'Her', 'Was',
1014
+ 'She', 'Has', 'Had', 'His', 'Him', 'Are', 'Were', 'Been', 'Being', 'Have', 'Does', 'Did', 'Will',
1015
+ # Markup/formatting artifacts
1016
+ 'URL', 'Http', 'Https', 'PDF', 'CSV', 'JSON', 'XML', 'HTML',
1017
+ }
1018
+
1019
+ # Filter entities
1020
+ filtered_entities = set()
1021
+ for e in entities:
1022
+ # Skip short entities
1023
+ if len(e) < 3:
1024
+ continue
1025
+
1026
+ # Skip if contains newlines (malformed extraction)
1027
+ if '\n' in e:
1028
+ continue
1029
+
1030
+ # Skip stopwords (case-insensitive)
1031
+ if e in stopwords or e.lower() in {s.lower() for s in stopwords}:
1032
+ continue
1033
+
1034
+ # Skip if it's just a URL fragment
1035
+ if e.lower() in ['url', 'http', 'https', 'www']:
1036
+ continue
1037
+
1038
+ # Skip if ends with common suffixes that indicate malformed extraction
1039
+ if e.endswith('---') or e.endswith('...') or e.endswith('--'):
1040
+ continue
1041
+
1042
+ filtered_entities.add(e)
1043
+
1044
+ # Filter numbers - remove single digits 0-9 (often SOURCE numbers)
1045
+ filtered_numbers = {n for n in numbers if len(n) >= 2 or '.' in n or '%' in n}
1046
+
1047
+ return filtered_entities, filtered_numbers
1048
+
1049
+
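# Quick sketch of the regex fallback above (dasein.pipecleaner import path
# assumed). Output is heuristic: expect proper nouns such as 'Harrison Chase'
# in the entity set and multi-digit numbers such as '2022' in the number set.
from dasein.pipecleaner import extract_entities_regex

entities, numbers = extract_entities_regex(
    "LangChain was created by Harrison Chase in 2022."
)
print(sorted(entities), sorted(numbers))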
1050
+ def extract_entities_spacy(text: str, nlp) -> Tuple[Set[str], Set[str]]:
1051
+ """
1052
+ spaCy-based entity extraction (more accurate).
1053
+
1054
+ Returns:
1055
+ (entities, numbers) - Sets of extracted entities and numbers
1056
+ """
1057
+ entities = set()
1058
+ numbers = set()
1059
+
1060
+ doc = nlp(text)
1061
+
1062
+ # Named entities
1063
+ for ent in doc.ents:
1064
+ if ent.label_ in ['PERSON', 'ORG', 'GPE', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW']:
1065
+ entities.add(ent.text)
1066
+ elif ent.label_ in ['DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']:
1067
+ numbers.add(ent.text)
1068
+
1069
+ # Also grab technical terms (capitalized noun phrases)
1070
+ for chunk in doc.noun_chunks:
1071
+ if chunk.text[0].isupper():
1072
+ entities.add(chunk.text)
1073
+
1074
+ # Apply SAME filtering as regex version
1075
+ stopwords = {
1076
+ 'The', 'This', 'That', 'These', 'Those', 'What', 'Where', 'When', 'Why', 'How', 'Who', 'Which',
1077
+ 'Welcome', 'Search', 'Summary', 'Source', 'Url', 'Http', 'Https', 'One', 'Two', 'Three', 'Four', 'Five',
1078
+ 'Key', 'Our', 'Its', 'It', 'For', 'With', 'And', 'But', 'Not', 'You', 'All', 'Can', 'Her', 'Was',
1079
+ 'She', 'Has', 'Had', 'His', 'Him', 'Are', 'Were', 'Been', 'Being', 'Have', 'Does', 'Did', 'Will',
1080
+ 'URL', 'Http', 'Https', 'PDF', 'CSV', 'JSON', 'XML', 'HTML',
1081
+ }
1082
+
1083
+ # Filter entities
1084
+ filtered_entities = set()
1085
+ for e in entities:
1086
+ # Skip short entities
1087
+ if len(e) < 3:
1088
+ continue
1089
+
1090
+ # Skip if contains newlines (malformed)
1091
+ if '\n' in e:
1092
+ continue
1093
+
1094
+ # Skip stopwords (case-insensitive)
1095
+ if e in stopwords or e.lower() in {s.lower() for s in stopwords}:
1096
+ continue
1097
+
1098
+ # Skip URL fragments
1099
+ if e.lower() in ['url', 'http', 'https', 'www']:
1100
+ continue
1101
+
1102
+ # Skip malformed endings
1103
+ if e.endswith('---') or e.endswith('...') or e.endswith('--') or e.endswith('---\\nURL'):
1104
+ continue
1105
+
1106
+ filtered_entities.add(e)
1107
+
1108
+ # Filter numbers - remove single digits 0-9
1109
+ filtered_numbers = {n for n in numbers if len(str(n).strip()) >= 2 or '.' in str(n) or '%' in str(n)}
1110
+
1111
+ return filtered_entities, filtered_numbers
1112
+
1113
+
1114
+ def extract_entities(text: str) -> Tuple[Set[str], Set[str]]:
1115
+ """
1116
+ Extract entities and numbers from text.
1117
+
1118
+ Uses spaCy if available, falls back to regex.
1119
+
1120
+ Returns:
1121
+ (entities, numbers) - Sets of protected entities and numbers
1122
+ """
1123
+ nlp = _get_spacy_model()
1124
+
1125
+ if nlp == "fallback":
1126
+ return extract_entities_regex(text)
1127
+ else:
1128
+ return extract_entities_spacy(text, nlp)
1129
+
1130
+
1131
+ def compute_salience(sentence: str, position: int, total_sentences: int) -> float:
1132
+ """
1133
+ Compute salience score for a sentence.
1134
+
1135
+ Factors:
1136
+ - Position: Earlier sentences weighted higher (first paragraph effect)
1137
+ - Length: Moderate length preferred (too short = filler, too long = verbose)
1138
+ - Entity density: More entities = more information-dense
1139
+ - Numbers: Presence of numbers = factual content
1140
+
1141
+ Returns:
1142
+ Salience score (0.0 to 1.0, higher = more important)
1143
+ """
1144
+ score = 0.0
1145
+
1146
+ # Position-based (exponential decay)
1147
+ position_weight = np.exp(-position / (total_sentences * 0.3))
1148
+ score += position_weight * 0.3
1149
+
1150
+ # Length-based (optimal ~50-150 chars)
1151
+ length = len(sentence)
1152
+ if 50 <= length <= 150:
1153
+ length_weight = 1.0
1154
+ elif length < 50:
1155
+ length_weight = length / 50
1156
+ else:
1157
+ length_weight = 150 / length
1158
+ score += length_weight * 0.2
1159
+
1160
+ # Entity density (basic heuristic: count capitalized words)
1161
+ words = sentence.split()
1162
+ cap_words = sum(1 for w in words if w and w[0].isupper())
1163
+ entity_density = min(cap_words / max(len(words), 1), 1.0)
1164
+ score += entity_density * 0.3
1165
+
1166
+ # Number presence
1167
+ has_numbers = bool(re.search(r'\d', sentence))
1168
+ score += 0.2 if has_numbers else 0.0
1169
+
1170
+ return min(score, 1.0)
1171
+
1172
+
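# Worked sketch of the scoring above (dasein.pipecleaner import path assumed).
# At position 0 the position term is exp(0) * 0.3 = 0.3, a 50-150 char sentence
# earns the full 0.2 length term, 3 of 9 capitalized words add ~0.1, and the
# digits add 0.2, so this prints 0.8.
from dasein.pipecleaner import compute_salience

sent = "LangChain 0.2 was created by Harrison Chase in 2022."  # 52 chars
print(round(compute_salience(sent, position=0, total_sentences=10), 3))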
1173
+ def compute_char_3gram_jaccard(text1: str, text2: str) -> float:
1174
+ """
1175
+ Compute character 3-gram Jaccard similarity.
1176
+ Captures boilerplate and tight phrasing that embeddings might miss.
1177
+
1178
+ Returns:
1179
+ Jaccard similarity [0, 1]
1180
+ """
1181
+ def get_3grams(text):
1182
+ text = text.lower()
1183
+ return set(text[i:i+3] for i in range(len(text) - 2))
1184
+
1185
+ grams1 = get_3grams(text1)
1186
+ grams2 = get_3grams(text2)
1187
+
1188
+ if not grams1 or not grams2:
1189
+ return 0.0
1190
+
1191
+ intersection = len(grams1 & grams2)
1192
+ union = len(grams1 | grams2)
1193
+
1194
+ return intersection / union if union > 0 else 0.0
1195
+
1196
+
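# Worked example for the lexical signal above (dasein.pipecleaner import path
# assumed): "night" -> {nig, igh, ght}, "nights" -> {nig, igh, ght, hts},
# so Jaccard = intersection/union = 3/4 = 0.75.
from dasein.pipecleaner import compute_char_3gram_jaccard

assert abs(compute_char_3gram_jaccard("night", "nights") - 0.75) < 1e-9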
1197
+ def compute_similarity(emb1: np.ndarray, emb2: np.ndarray) -> float:
1198
+ """
1199
+ Compute cosine similarity between two embeddings.
1200
+ Assumes embeddings are L2-normalized (unit vectors), so cosine = dot product.
1201
+ """
1202
+ return np.dot(emb1, emb2)
1203
+
1204
+
1205
+ def are_sentences_similar(sent1: Sentence, sent2: Sentence, semantic_threshold: float = 0.60) -> bool:
1206
+ """
1207
+ Check if two sentences are similar using semantic + lexical signals.
1208
+
1209
+ - Semantic: cosine similarity on embeddings
1210
+ - Lexical fallback: 3-gram Jaccard for short sentences (≤120 chars)
1211
+
1212
+ Args:
1213
+ sent1, sent2: Sentence objects with embeddings
1214
+ semantic_threshold: Threshold for semantic similarity
1215
+
1216
+ Returns:
1217
+ True if similar, False otherwise
1218
+ """
1219
+ # Primary: semantic similarity
1220
+ semantic_sim = compute_similarity(sent1.embedding, sent2.embedding)
1221
+ if semantic_sim >= semantic_threshold:
1222
+ return True
1223
+
1224
+ # Fallback: lexical for short sentences (captures boilerplate)
1225
+ max_len = max(len(sent1.text), len(sent2.text))
1226
+ if max_len <= 120: # ~30 tokens
1227
+ lexical_sim = compute_char_3gram_jaccard(sent1.text, sent2.text)
1228
+ if lexical_sim >= 0.82: # High Jaccard = tight phrasing match
1229
+ return True
1230
+
1231
+ return False
1232
+
1233
+
1234
+ def build_sentence_objects(sentences_text: List[str], embeddings: np.ndarray) -> List[Sentence]:
1235
+ """
1236
+ Build Sentence objects with metadata.
1237
+
1238
+ Args:
1239
+ sentences_text: List of sentence strings
1240
+ embeddings: Numpy array of embeddings (N x 384)
1241
+
1242
+ Returns:
1243
+ List of Sentence objects with computed metadata
1244
+ """
1245
+ sentence_objects = []
1246
+ total = len(sentences_text)
1247
+
1248
+ for i, text in enumerate(sentences_text):
1249
+ # Generate ID
1250
+ sent_id = hashlib.md5(text.encode()).hexdigest()[:8]
1251
+
1252
+ # Extract entities
1253
+ entities, numbers = extract_entities(text)
1254
+
1255
+ # Compute salience
1256
+ salience = compute_salience(text, i, total)
1257
+
1258
+ sentence_objects.append(Sentence(
1259
+ id=sent_id,
1260
+ text=text,
1261
+ embedding=embeddings[i],
1262
+ entities=entities,
1263
+ numbers=numbers,
1264
+ salience=salience,
1265
+ position=i
1266
+ ))
1267
+
1268
+ return sentence_objects
1269
+
1270
+
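# Metadata-only sketch of the builder above: zero vectors stand in for real
# embeddings (real callers pass model.encode output), and entity extraction
# falls back to regex when spaCy is unavailable. Import path assumed as before.
import numpy as np
from dasein.pipecleaner import build_sentence_objects

texts = ["LangChain was created in 2022.", "It supports OpenAI and Anthropic."]
objs = build_sentence_objects(texts, np.zeros((len(texts), 384), dtype=np.float32))
print([(s.position, round(s.salience, 2)) for s in objs])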
1271
+ def greedy_max_independent_set(
1272
+ sentences: List[Sentence],
1273
+ similarity_threshold: float = 0.60,
1274
+ verbose: bool = True,
1275
+ precomputed_degree_map: Dict = None
1276
+ ) -> List[Sentence]:
1277
+ """
1278
+ Greedy maximum-independent-set selection with degree×length-aware ordering.
1279
+
1280
+ Algorithm:
1281
+ 1. Compute degree (# of similar neighbors) for each sentence
1282
+    2. Sort by (token_length × degree) ASCENDING → short, unique sentences are selected first
1283
+    3. Pick the lowest degree×length sentence (short + unique = highest keep value)
1284
+ 4. Remove all similar neighbors (similarity > threshold)
1285
+ 5. Check removed sentences for unique entities
1286
+    6. If a removed sentence still has 3+ unique entities (or a unique multi-word entity), re-add it (HARD GUARD)
1287
+ 7. Repeat until all sentences processed
1288
+
1289
+    This preserves coverage while ejecting long, redundant sentences → bigger trims without raising the similarity bar.
1290
+
1291
+ Args:
1292
+ sentences: List of Sentence objects
1293
+        similarity_threshold: Similarity threshold for edge creation (default 0.60 = 60% similar)
1294
+ verbose: Print debug info
1295
+
1296
+ Returns:
1297
+ List of selected Sentence objects (deduplicated)
1298
+ """
1299
+ if verbose:
1300
+ print(f"\n[PIPECLEANER] Starting degree×length-aware greedy max-independent-set")
1301
+ print(f"[PIPECLEANER] Input: {len(sentences)} sentences")
1302
+ print(f"[PIPECLEANER] Similarity threshold: {similarity_threshold}")
1303
+
1304
+ # Step 1: Use precomputed degree map (or compute if not provided)
1305
+ if precomputed_degree_map is None:
1306
+ # Compute degree (# of connections) for each sentence
1307
+ # Use hybrid similarity: semantic (0.60) OR lexical (0.82 Jaccard for short spans)
1308
+ degree_map = {}
1309
+ for sent in sentences:
1310
+ degree = 0
1311
+ for other in sentences:
1312
+ if sent.id != other.id:
1313
+ # Hybrid check: semantic OR lexical
1314
+ if are_sentences_similar(sent, other, semantic_threshold=similarity_threshold):
1315
+ degree += 1
1316
+ degree_map[sent.id] = degree
1317
+
1318
+ # Sanity checks (as requested)
1319
+ isolates = [s for s in sentences if degree_map[s.id] == 0]
1320
+ non_isolates = [s for s in sentences if degree_map[s.id] > 0]
1321
+ pct_isolates = len(isolates) / len(sentences) * 100 if sentences else 0
1322
+ avg_degree_non_iso = sum(degree_map[s.id] for s in non_isolates) / len(non_isolates) if non_isolates else 0
1323
+
1324
+ if verbose:
1325
+ avg_degree = sum(degree_map.values()) / len(degree_map) if degree_map else 0
1326
+ print(f"[PIPECLEANER] Degree stats: avg={avg_degree:.1f}, isolates={pct_isolates:.1f}%, non-isolate avg={avg_degree_non_iso:.1f}")
1327
+ print(f"[PIPECLEANER] Sanity: isolates {pct_isolates:.0f}% (expect <20%), non-isolate avg {avg_degree_non_iso:.1f} (expect >3)")
1328
+ else:
1329
+ # Use precomputed degree map (more efficient)
1330
+ degree_map = precomputed_degree_map
1331
+
1332
+ # Step 2: Sort by (token_length × degree) ASCENDING
1333
+ # LOW degree×length = short + unique → keep first (high value)
1334
+ # HIGH degree×length = long + redundant → eject (low value)
1335
+ def sort_key(s):
1336
+ token_len = estimate_tokens(s.text)
1337
+ degree = degree_map[s.id]
1338
+ return token_len * degree
1339
+
1340
+ # Sort ASCENDING - pick short unique sentences first
1341
+ sorted_sentences = sorted(sentences, key=sort_key, reverse=False)
1342
+
1343
+ if verbose:
1344
+ top_5 = sorted_sentences[:5]
1345
+ print(f"[PIPECLEANER] Top 5 to keep (low degree×length = short + unique):")
1346
+ for i, s in enumerate(top_5, 1):
1347
+ score = sort_key(s)
1348
+ print(f" {i}. {estimate_tokens(s.text)}tok × {degree_map[s.id]}deg = {score:.0f} | '{s.text[:60]}...'")
1349
+
1350
+
1351
+ selected = []
1352
+ remaining = sorted_sentences.copy()
1353
+ entity_coverage = set()
1354
+ iteration = 0
1355
+
1356
+ while remaining:
1357
+ iteration += 1
1358
+        # Pick lowest degree×length sentence (short + unique = highest keep value)
1359
+ best = remaining[0]
1360
+
1361
+ if verbose and iteration <= 5: # Print first 5 iterations
1362
+ score = sort_key(best)
1363
+ print(f"\n[PIPECLEANER] Iteration {iteration}:")
1364
+ print(f" Selected: '{best.text[:80]}...'")
1365
+ print(f" Degree×Length: {estimate_tokens(best.text)}tok × {degree_map[best.id]}deg = {score:.0f}")
1366
+ print(f" Entities: {best.protected_entities}")
1367
+
1368
+ # Add to selected
1369
+ selected.append(best)
1370
+ entity_coverage |= best.protected_entities
1371
+
1372
+ # Remove from remaining
1373
+ remaining.remove(best)
1374
+
1375
+ # Find similar neighbors to remove (using hybrid similarity)
1376
+ to_remove = []
1377
+ for candidate in remaining:
1378
+ if are_sentences_similar(best, candidate, semantic_threshold=similarity_threshold):
1379
+ # Get semantic sim for logging
1380
+ sem_sim = compute_similarity(best.embedding, candidate.embedding)
1381
+ to_remove.append((candidate, sem_sim))
1382
+
1383
+ if verbose and iteration <= 5 and to_remove:
1384
+ print(f" Removing {len(to_remove)} similar sentences (similarity >= {similarity_threshold})")
1385
+
1386
+ # Remove similar sentences
1387
+ for candidate, sim in to_remove:
1388
+ remaining.remove(candidate)
1389
+
1390
+ # HARD GUARD: Check removed sentences for unique entities
1391
+ # Only re-add if they have MULTIPLE (3+) meaningful unique entities
1392
+ # This prevents re-adding for trivial differences
1393
+ re_added = 0
1394
+ for candidate, sim in to_remove:
1395
+ unique_entities = candidate.protected_entities - entity_coverage
1396
+
1397
+ # Require at least 3 unique entities OR at least 1 unique multi-word entity
1398
+ multi_word_entities = {e for e in unique_entities if ' ' in e or len(e) > 10}
1399
+ should_readd = len(unique_entities) >= 3 or len(multi_word_entities) >= 1
1400
+
1401
+ if should_readd:
1402
+ if verbose and iteration <= 5:
1403
+ print(f" ⚠️ RE-ADDING sentence with {len(unique_entities)} unique entities: {unique_entities}")
1404
+ print(f" Text: '{candidate.text[:80]}...'")
1405
+ selected.append(candidate)
1406
+ entity_coverage |= candidate.protected_entities
1407
+ re_added += 1
1408
+
1409
+ if verbose and iteration <= 5 and re_added:
1410
+ print(f" Re-added {re_added} sentences to preserve entity coverage")
1411
+
1412
+ if verbose:
1413
+ print(f"\n[PIPECLEANER] Selection complete:")
1414
+ print(f" Input: {len(sentences)} sentences")
1415
+ print(f" Output: {len(selected)} sentences")
1416
+ print(f" Reduction: {(1 - len(selected)/len(sentences))*100:.1f}%")
1417
+ print(f" Entities preserved: {len(entity_coverage)}")
1418
+
1419
+ return selected
1420
+
1421
+
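# Standalone sketch of the ascending degree×length ordering used above, with
# hypothetical (text, degree) pairs and the same words/0.75 token estimate:
# short + unique sorts first (kept), long + redundant sorts last (first to be
# dropped once a similar sentence is already selected).
toy = [
    ("Unique fact: Harrison Chase created LangChain in 2022.", 0),
    ("This boilerplate paragraph appears on every single search result page. " * 3, 4),
]
ranked = sorted(toy, key=lambda t: int(len(t[0].split()) / 0.75) * t[1])
print([(degree, text[:30]) for text, degree in ranked])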
1422
+ def deduplicate_search_results(
1423
+ text: str,
1424
+ similarity_threshold: float = 0.60,
1425
+ verbose: bool = True,
1426
+ cached_model=None
1427
+ ) -> Tuple[str, Dict, Any]:
1428
+ """
1429
+ Main entry point: Deduplicate search results using graph-based approach.
1430
+
1431
+ Args:
1432
+ text: Raw search results text
1433
+ similarity_threshold: Cosine similarity threshold (0.60 catches cross-site paraphrases at 0.55-0.68)
1434
+ verbose: Print debug info
1435
+ cached_model: Optional cached embedding model to reuse
1436
+
1437
+ Returns:
1438
+ Tuple of (deduplicated_text, stats_dict, embedding_model)
1439
+ stats_dict contains: {
1440
+ 'original_chars': int,
1441
+ 'deduplicated_chars': int,
1442
+ 'original_sentences': int,
1443
+ 'deduplicated_sentences': int,
1444
+ 'prune_pct': float,
1445
+ 'original_tokens': int,
1446
+ 'deduplicated_tokens': int,
1447
+ 'tokens_saved': int,
1448
+ 'entity_coverage_pct': float,
1449
+ 'entities_total': int,
1450
+ 'entities_preserved': int
1451
+ }
1452
+ """
1453
+ if verbose:
1454
+ print(f"\n{'='*70}")
1455
+ print(f"[PIPECLEANER] DEDUPLICATION STARTED")
1456
+ print(f"{'='*70}")
1457
+ print(f"[PIPECLEANER] Input text: {len(text)} chars, ~{len(text.split())} words")
1458
+
1459
+ # Step 1: Split into sentences
1460
+ sentences_text = split_into_sentences(text)
1461
+
1462
+ if verbose:
1463
+ print(f"[PIPECLEANER] Split into {len(sentences_text)} sentences")
1464
+
1465
+ # Initialize stats
1466
+ stats = {
1467
+ 'original_chars': len(text),
1468
+ 'deduplicated_chars': len(text),
1469
+ 'original_sentences': len(sentences_text),
1470
+ 'deduplicated_sentences': len(sentences_text),
1471
+ 'prune_pct': 0.0,
1472
+ 'original_tokens': int(len(text) / 4),
1473
+ 'deduplicated_tokens': int(len(text) / 4),
1474
+ 'tokens_saved': 0,
1475
+ 'entity_coverage_pct': 100.0,
1476
+ 'entities_total': 0,
1477
+ 'entities_preserved': 0
1478
+ }
1479
+
1480
+ if len(sentences_text) == 0:
1481
+ if verbose:
1482
+ print(f"[PIPECLEANER] ⚠️ No sentences found, returning original text")
1483
+ return text, stats, cached_model
1484
+
1485
+ if len(sentences_text) == 1:
1486
+ if verbose:
1487
+ print(f"[PIPECLEANER] Only 1 sentence, skipping deduplication")
1488
+ return text, stats, cached_model
1489
+
1490
+ # Step 2: Compute embeddings
1491
+ # Always use the thread-safe singleton model
1492
+ model = _get_embedding_model()
1493
+
1494
+ if verbose:
1495
+ print(f"[PIPECLEANER] Computing embeddings...")
1496
+
1497
+ # L2 normalize embeddings so cosine similarity = dot product (faster)
1498
+ embeddings = model.encode(sentences_text, show_progress_bar=False, normalize_embeddings=True)
1499
+
1500
+ if verbose:
1501
+ print(f"[PIPECLEANER] Embeddings computed: shape {embeddings.shape}")
1502
+
1503
+ # Step 3: Build sentence objects with metadata
1504
+ sentences = build_sentence_objects(sentences_text, embeddings)
1505
+
1506
+ # Calculate total entities across all sentences
1507
+ all_entities = set()
1508
+ for sent in sentences:
1509
+ all_entities |= sent.protected_entities
1510
+
1511
+ # Step 4: Run greedy max-independent-set selection
1512
+ selected = greedy_max_independent_set(sentences, similarity_threshold, verbose)
1513
+
1514
+ # Calculate preserved entities
1515
+ preserved_entities = set()
1516
+ for sent in selected:
1517
+ preserved_entities |= sent.protected_entities
1518
+
1519
+ # Step 5: Reconstruct text preserving original order
1520
+ selected_by_position = sorted(selected, key=lambda s: s.position)
1521
+ deduplicated_text = '\n\n'.join(s.text for s in selected_by_position)
1522
+
1523
+ # Calculate stats
1524
+ stats['deduplicated_chars'] = len(deduplicated_text)
1525
+ stats['deduplicated_sentences'] = len(selected)
1526
+ stats['prune_pct'] = (1 - len(selected) / len(sentences_text)) * 100 if len(sentences_text) > 0 else 0
1527
+ stats['deduplicated_tokens'] = int(len(deduplicated_text) / 4)
1528
+ stats['tokens_saved'] = stats['original_tokens'] - stats['deduplicated_tokens']
1529
+ stats['entities_total'] = len(all_entities)
1530
+ stats['entities_preserved'] = len(preserved_entities)
1531
+ stats['entity_coverage_pct'] = (len(preserved_entities) / len(all_entities) * 100) if len(all_entities) > 0 else 100.0
1532
+
1533
+ if verbose:
1534
+ print(f"\n[PIPECLEANER] DEDUPLICATION COMPLETE")
1535
+ print(f" Input: {len(text)} chars")
1536
+ print(f" Output: {len(deduplicated_text)} chars")
1537
+ print(f" Reduction: {(1 - len(deduplicated_text)/len(text))*100:.1f}%")
1538
+ print(f" Sentences: {len(sentences_text)} → {len(selected)}")
1539
+ print(f"{'='*70}\n")
1540
+
1541
+ return deduplicated_text, stats, model
1542
+
1543
+
1544
+ # ============================================================================
1545
+ # CONVENIENCE FUNCTIONS
1546
+ # ============================================================================
1547
+
1548
+ def estimate_tokens(text: str) -> int:
1549
+ """Rough estimate of token count (words / 0.75)."""
1550
+ return int(len(text.split()) / 0.75)
1551
+
1552
+
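# Quick check of the heuristic above: 6 words / 0.75 gives 8 estimated tokens
# (dasein.pipecleaner import path assumed).
from dasein.pipecleaner import estimate_tokens

assert estimate_tokens("six words in this short sentence") == 8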
1553
+ def should_deduplicate(text: str, min_length: int = 500) -> bool:
1554
+ """
1555
+ Check if text is worth deduplicating.
1556
+
1557
+ Args:
1558
+ text: Input text
1559
+ min_length: Minimum character length to bother deduplicating
1560
+
1561
+ Returns:
1562
+ True if text should be deduplicated
1563
+ """
1564
+ return len(text) >= min_length
1565
+
1566
+
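# Gating sketch combining the two helpers above (clean_tool_output is a
# hypothetical wrapper name; dasein.pipecleaner import path assumed; the dedup
# path needs sentence-transformers installed since the model loads lazily):
from dasein.pipecleaner import deduplicate_search_results, should_deduplicate

def clean_tool_output(raw: str) -> str:
    if not should_deduplicate(raw):      # skip short outputs (<500 chars)
        return raw
    cleaned, stats, _model = deduplicate_search_results(raw, verbose=False)
    return cleaned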
1567
+ def apply_pipecleaner_if_applicable(tool_name: str, output_str: str, selected_rules: list, cached_model=None) -> Tuple[str, Any]:
1568
+ """
1569
+ High-level function to check for filter search rules and apply deduplication.
1570
+
1571
+ This is called from capture.py's on_tool_end callback.
1572
+
1573
+ Args:
1574
+ tool_name: Name of the tool that just finished
1575
+ output_str: Raw output from the tool
1576
+ selected_rules: List of rules selected for this run
1577
+ cached_model: Optional cached embedding model to reuse across searches
1578
+
1579
+ Returns:
1580
+ Tuple of (deduplicated_output, embedding_model) for caching
1581
+ Returns (original_output, None) if no filter rule applies
1582
+ """
1583
+ try:
1584
+ # Find applicable filter search rules for this tool
1585
+ filter_rules = _find_filter_search_rules(tool_name, selected_rules)
1586
+
1587
+ # If we found applicable filter rules, apply deduplication
1588
+ if filter_rules:
1589
+ print(f"\n{'='*70}")
1590
+ print(f"[PIPECLEANER] 🧹 FILTER SEARCH RULE DETECTED")
1591
+ print(f"{'='*70}")
1592
+ print(f"[PIPECLEANER] Tool: {tool_name}")
1593
+ print(f"[PIPECLEANER] Rules matched: {len(filter_rules)}")
1594
+ for rule in filter_rules:
1595
+ rule_id = getattr(rule, 'id', 'unknown')
1596
+ advice = getattr(rule, 'advice', '') or getattr(rule, 'advice_text', '')
1597
+ print(f"[PIPECLEANER] - Rule {rule_id}: {advice[:80]}...")
1598
+ print(f"{'='*70}")
1599
+
1600
+ # Apply deduplication with cached model
1601
+ deduplicated, stats, model = deduplicate_search_results(
1602
+ text=output_str,
1603
+ similarity_threshold=0.60, # 0.60 catches cross-site paraphrases (0.55-0.68 typical)
1604
+ verbose=True, # Show detailed deduplication stats
1605
+ cached_model=cached_model # Reuse model if available
1606
+ )
1607
+
1608
+ # Print comprehensive stats after every search
1609
+ print(f"\n{'='*70}")
1610
+ print(f"[PIPECLEANER] 📊 DEDUPLICATION RESULTS")
1611
+ print(f"{'='*70}")
1612
+ print(f"[PIPECLEANER] 🔢 Sentences:")
1613
+ print(f"[PIPECLEANER] Original: {stats['original_sentences']} sentences")
1614
+ print(f"[PIPECLEANER] Deduplicated: {stats['deduplicated_sentences']} sentences")
1615
+ print(f"[PIPECLEANER] Prune %: {stats['prune_pct']:.1f}% removed")
1616
+ print(f"[PIPECLEANER]")
1617
+ print(f"[PIPECLEANER] 🎯 Entity Coverage:")
1618
+ print(f"[PIPECLEANER] Total entities: {stats['entities_total']}")
1619
+ print(f"[PIPECLEANER] Entities preserved: {stats['entities_preserved']}")
1620
+ print(f"[PIPECLEANER] Coverage: {stats['entity_coverage_pct']:.1f}%")
1621
+ print(f"[PIPECLEANER]")
1622
+ print(f"[PIPECLEANER] 💰 Token Savings (len/4):")
1623
+ print(f"[PIPECLEANER] Original tokens: {stats['original_tokens']:,} tokens")
1624
+ print(f"[PIPECLEANER] Deduplicated: {stats['deduplicated_tokens']:,} tokens")
1625
+ print(f"[PIPECLEANER] Tokens saved: {stats['tokens_saved']:,} tokens ({(stats['tokens_saved']/stats['original_tokens']*100 if stats['original_tokens'] > 0 else 0):.1f}%)")
1626
+ print(f"[PIPECLEANER]")
1627
+ print(f"[PIPECLEANER] ✅ SUCCESS: Pruned {stats['prune_pct']:.1f}% redundancy, preserved {stats['entity_coverage_pct']:.1f}% entities")
1628
+ print(f"{'='*70}\n")
1629
+
1630
+ return deduplicated, model
1631
+
1632
+ # No filter rules found, return original
1633
+ return output_str, None
1634
+
1635
+ except ImportError as e:
1636
+ print(f"\n{'='*70}")
1637
+ print(f"[PIPECLEANER] IMPORT ERROR - FAILING OPEN")
1638
+ print(f"{'='*70}")
1639
+ print(f"[PIPECLEANER] Error: {e}")
1640
+ print(f"[PIPECLEANER] Install: pip install sentence-transformers")
1641
+ print(f"{'='*70}\n")
1642
+ return output_str, None
1643
+ except Exception as e:
1644
+ print(f"\n{'='*70}")
1645
+ print(f"[PIPECLEANER] EXCEPTION - FAILING OPEN")
1646
+ print(f"{'='*70}")
1647
+ print(f"[PIPECLEANER] Error type: {type(e).__name__}")
1648
+ print(f"[PIPECLEANER] Error message: {e}")
1649
+ import traceback
1650
+ print(f"[PIPECLEANER] Traceback:")
1651
+ traceback.print_exc()
1652
+ print(f"{'='*70}\n")
1653
+ return output_str, None
1654
+
1655
+
1656
+ def _find_filter_search_rules(tool_name: str, selected_rules: list) -> list:
1657
+ """
1658
+ Find llm_start scoped rules with "filter search" keywords that apply to this tool.
1659
+
1660
+ This is called from on_llm_start when a Summary tool's LLM is about to be called.
1661
+ Rule synthesis will generate rules scoped to llm_start when it detects search→summary patterns.
1662
+
1663
+ Args:
1664
+ tool_name: Name of the tool whose LLM is starting (e.g., 'Summary')
1665
+ selected_rules: List of rules to search through
1666
+
1667
+ Returns:
1668
+ List of applicable filter search rules
1669
+ """
1670
+ filter_rules = []
1671
+
1672
+ for rule_meta in selected_rules:
1673
+ # Unwrap tuple if needed (rules come as (rule, metadata) from select_rules)
1674
+ if isinstance(rule_meta, tuple) and len(rule_meta) == 2:
1675
+ rule_obj, _metadata = rule_meta
1676
+ else:
1677
+ rule_obj = rule_meta
1678
+
1679
+ # Check if this is an llm_start scoped rule
1680
+ target_step_type = getattr(rule_obj, 'target_step_type', None)
1681
+
1682
+ # Must be scoped to llm_start (where we intercept Summary LLM calls)
1683
+ if target_step_type != 'llm_start':
1684
+ continue
1685
+
1686
+ # Check if the rule contains "filter search" keywords
1687
+ # Try both field names that might be used
1688
+ advice = getattr(rule_obj, 'advice_text', None) or getattr(rule_obj, 'advice', None) or ''
1689
+ advice_lower = advice.lower() if advice else ''
1690
+
1691
+ if not advice_lower or 'filter' not in advice_lower or 'search' not in advice_lower:
1692
+ continue
1693
+
1694
+ # Check if the rule applies to this tool
1695
+ applies = _rule_applies_to_tool(rule_obj, tool_name, advice_lower)
1696
+
1697
+ if applies:
1698
+ filter_rules.append(rule_obj)
1699
+
1700
+ return filter_rules
1701
+
1702
+
1703
+ def _rule_applies_to_tool(rule_obj, tool_name: str, advice_lower: str) -> bool:
1704
+ """
1705
+ Check if a rule applies to the given tool.
1706
+
1707
+ Args:
1708
+ rule_obj: Rule object or dict to check
1709
+ tool_name: Name of the tool (case-insensitive)
1710
+ advice_lower: Lowercased advice text for fallback matching
1711
+
1712
+ Returns:
1713
+ True if rule applies to this tool
1714
+ """
1715
+ # Wildcard matches everything (used for initial check)
1716
+ if tool_name == "*":
1717
+ return True
1718
+
1719
+ tool_name_lower = tool_name.lower()
1720
+
1721
+ # Extract references.tools from rule (handle both dict and object formats)
1722
+ if isinstance(rule_obj, dict):
1723
+ references = rule_obj.get('references', {})
1724
+ tools = references.get('tools', []) if isinstance(references, dict) else []
1725
+ else:
1726
+ references = getattr(rule_obj, 'references', None)
1727
+ if references:
1728
+ # Try both object attribute and dict access for tools
1729
+ if hasattr(references, 'tools'):
1730
+ tools = references.tools
1731
+ elif isinstance(references, dict):
1732
+ tools = references.get('tools', [])
1733
+ else:
1734
+ tools = []
1735
+ else:
1736
+ tools = []
1737
+
1738
+ if tools:
1739
+ # Check if tool_name matches any tool in references.tools (case-insensitive exact match)
1740
+ for ref_tool in tools:
1741
+ ref_tool_lower = ref_tool.lower()
1742
+ if tool_name_lower == ref_tool_lower:
1743
+ return True
1744
+ # No match found in references.tools
1745
+ return False
1746
+ else:
1747
+ # Rule has no tools list - don't apply to anything (be conservative)
1748
+ return False
1749
+
1750
+
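# Dict-shaped sketch for the matcher above (values hypothetical; import path
# assumed as before). Matching is a case-insensitive exact check against
# references.tools, and a rule without a tools list matches nothing.
from dasein.pipecleaner import _rule_applies_to_tool

rule = {"references": {"tools": ["Summary"]}}
print(_rule_applies_to_tool(rule, "summary", "filter search results"))     # True
print(_rule_applies_to_tool(rule, "web_search", "filter search results"))  # False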
1751
+ async def run_pipecleaner_enforcement(
1752
+ messages_or_prompts: tuple,
1753
+    callback_handler: Any,
1754
+    patch_depth: Any
1755
+ ) -> bool:
1756
+ """
1757
+ Main pipecleaner enforcement logic - parallel to run_microturn_enforcement.
1758
+
1759
+ This intercepts ToolMessage objects and applies deduplication.
1760
+
1761
+ Args:
1762
+ messages_or_prompts: Args tuple from _generate (first element is messages)
1763
+ callback_handler: DaseinCallbackHandler with rules
1764
+ patch_depth: Thread-local object with caching
1765
+
1766
+ Returns:
1767
+ True if enforcement was applied, False if skipped
1768
+ """
1769
+ try:
1770
+ print(f"[PIPECLEANER] 🧹 run_pipecleaner_enforcement called")
1771
+
1772
+ if not callback_handler or not hasattr(callback_handler, '_selected_rules'):
1773
+ return False
1774
+
1775
+ rules = callback_handler._selected_rules
1776
+ print(f"[PIPECLEANER] Found {len(rules)} rules")
1777
+
1778
+ filter_rules = _find_filter_search_rules("*", rules)
1779
+ if not filter_rules:
1780
+ return False
1781
+
1782
+ print(f"[PIPECLEANER] 🎯 Found {len(filter_rules)} filter search rules!")
1783
+
1784
+ # Extract messages from args
1785
+ if not messages_or_prompts or len(messages_or_prompts) == 0:
1786
+ return False
1787
+
1788
+ messages = messages_or_prompts[0]
1789
+ if not isinstance(messages, list):
1790
+ return False
1791
+
1792
+ # Find the most recent ToolMessage (tool result)
1793
+ tool_message = None
1794
+ for idx in range(len(messages) - 1, -1, -1):
1795
+ msg = messages[idx]
1796
+ msg_type = getattr(msg, 'type', None) or (msg.get('type') if isinstance(msg, dict) else None)
1797
+ if msg_type == 'tool':
1798
+ tool_message = msg
1799
+ break
1800
+
1801
+ if not tool_message:
1802
+ return False
1803
+
1804
+ # Extract tool name and content
1805
+ tool_name = getattr(tool_message, 'name', None) or tool_message.get('name', 'unknown')
1806
+ tool_content = str(getattr(tool_message, 'content', None) or tool_message.get('content', ''))
1807
+
1808
+ print(f"[PIPECLEANER] Tool: {tool_name}, content: {len(tool_content)} chars")
1809
+
1810
+ # Check if this tool matches our filter rules
1811
+ matching_rules = _find_filter_search_rules(tool_name, rules)
1812
+ if not matching_rules:
1813
+ print(f"[PIPECLEANER] Tool '{tool_name}' doesn't match filter rules, skipping")
1814
+ return False
1815
+
1816
+ print(f"[PIPECLEANER] 🎯 Tool '{tool_name}' matches filter rules! Starting deduplication...")
1817
+
1818
+        # Prevent reprocessing loops - skip if we've already handled this exact message
1819
+ if not hasattr(patch_depth, 'processed_tool_messages'):
1820
+ patch_depth.processed_tool_messages = set()
1821
+
1822
+ # Create signature from tool name + content hash
1823
+ msg_signature = f"{tool_name}_{hash(tool_content[:200])}"
1824
+ if msg_signature in patch_depth.processed_tool_messages:
1825
+ print(f"[PIPECLEANER] Already processed this ToolMessage, skipping")
1826
+ return False
1827
+
1828
+ # Mark as processed
1829
+ patch_depth.processed_tool_messages.add(msg_signature)
1830
+
1831
+ # Apply deduplication
1832
+ cached_model = getattr(callback_handler, '_pipecleaner_embedding_model', None)
1833
+
1834
+ deduplicated, stats, model = deduplicate_search_results(
1835
+ text=tool_content,
1836
+ similarity_threshold=0.60, # Lowered to catch paraphrases
1837
+ verbose=True,
1838
+ cached_model=cached_model
1839
+ )
1840
+
1841
+ # Cache model
1842
+ callback_handler._pipecleaner_embedding_model = model
1843
+
1844
+ # Modify ToolMessage content IN PLACE
1845
+ if hasattr(tool_message, 'content'):
1846
+ tool_message.content = deduplicated
1847
+ elif isinstance(tool_message, dict):
1848
+ tool_message['content'] = deduplicated
1849
+
1850
+ # Cache result for potential reuse
1851
+ if not hasattr(patch_depth, 'tool_result_cache'):
1852
+ patch_depth.tool_result_cache = {}
1853
+
1854
+ result_key = f"{tool_name}_{hash(tool_content[:100])}"
1855
+ patch_depth.tool_result_cache[result_key] = deduplicated
1856
+
1857
+ print(f"[PIPECLEANER] Applied deduplication to {tool_name}")
1858
+
1859
+ # Print stats
1860
+ print(f"\n{'='*70}")
1861
+ print(f"[PIPECLEANER] 📊 DEDUPLICATION RESULTS")
1862
+ print(f"{'='*70}")
1863
+ print(f"[PIPECLEANER] 🔢 Sentences:")
1864
+ print(f"[PIPECLEANER] Original: {stats['original_sentences']} sentences")
1865
+ print(f"[PIPECLEANER] Deduplicated: {stats['deduplicated_sentences']} sentences")
1866
+ print(f"[PIPECLEANER] Prune %: {stats['prune_pct']:.1f}% removed")
1867
+ print(f"[PIPECLEANER]")
1868
+ print(f"[PIPECLEANER] 🎯 Entity Coverage:")
1869
+ print(f"[PIPECLEANER] Total entities: {stats['entities_total']}")
1870
+ print(f"[PIPECLEANER] Entities preserved: {stats['entities_preserved']}")
1871
+ print(f"[PIPECLEANER] Coverage: {stats['entity_coverage_pct']:.1f}%")
1872
+ print(f"[PIPECLEANER]")
1873
+ print(f"[PIPECLEANER] 💰 Token Savings (len/4):")
1874
+ print(f"[PIPECLEANER] Original tokens: {stats['original_tokens']:,} tokens")
1875
+ print(f"[PIPECLEANER] Deduplicated: {stats['deduplicated_tokens']:,} tokens")
1876
+ print(f"[PIPECLEANER] Tokens saved: {stats['tokens_saved']:,} tokens ({(stats['tokens_saved']/stats['original_tokens']*100 if stats['original_tokens'] > 0 else 0):.1f}%)")
1877
+ print(f"[PIPECLEANER]")
1878
+ print(f"[PIPECLEANER] ✅ SUCCESS: Pruned {stats['prune_pct']:.1f}% redundancy, preserved {stats['entity_coverage_pct']:.1f}% entities")
1879
+ print(f"{'='*70}\n")
1880
+
1881
+ return True
1882
+
1883
+ except Exception as e:
1884
+ print(f"[PIPECLEANER] ⚠️ Error during enforcement: {e}")
1885
+ import traceback
1886
+ traceback.print_exc()
1887
+ return False
1888
+
1889
+
1890
+ if __name__ == "__main__":
1891
+ # Simple test
1892
+ test_text = """
1893
+ LangChain is a framework for developing applications powered by language models.
1894
+ The LangChain framework enables developers to build LLM applications easily.
1895
+ LangChain provides many useful features for LLM apps.
1896
+ It supports multiple model providers including OpenAI and Anthropic.
1897
+ The framework was created in 2022 by Harrison Chase.
1898
+ LlamaIndex is another popular framework for LLM applications.
1899
+ LlamaIndex focuses on data indexing and retrieval.
1900
+ Both frameworks are open source and widely used.
1901
+ """
1902
+
1903
+ print("Testing pipecleaner deduplication...")
1904
+ result, stats, model = deduplicate_search_results(test_text, verbose=True)
1905
+
1906
+ print("\n" + "="*70)
1907
+ print("STATS:")
1908
+ print(f" Prune %: {stats['prune_pct']:.1f}%")
1909
+ print(f" Entity Coverage: {stats['entity_coverage_pct']:.1f}%")
1910
+ print(f" Tokens saved: {stats['tokens_saved']:,} ({(stats['tokens_saved']/stats['original_tokens']*100 if stats['original_tokens'] > 0 else 0):.1f}%)")
1911
+
1912
+ print("\n" + "="*70)
1913
+ print("ORIGINAL:")
1914
+ print(test_text)
1915
+ print("\n" + "="*70)
1916
+ print("DEDUPLICATED:")
1917
+ print(result)
1918
+