dasein-core 0.2.14__py3-none-any.whl → 0.2.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dasein/pipecleaner.py CHANGED
@@ -1,1917 +1,1920 @@
1
- """
2
- Pipecleaner: Run-scoped global corpus deduplication for multi-agent systems.
3
-
4
- V2.0: Global ClusterBank with dynamic batching barrier (5-10s) for cross-prompt deduplication.
5
- - Run-scoped corpus: All prompts in a run share a global ClusterBank
6
- - SimHash near-dup matching: Hamming distance ≤6 for 64-bit fingerprints
7
- - Dynamic barrier: 5s min, +2s per arrival (cap 10s), maximizes dedupe by collecting bursts
8
- - Canonical ownership: First prompt to use a cluster owns it, others drop duplicates
9
- - Entity coverage: 95% threshold RUN-LEVEL (cumulative across all batches, not per-batch)
10
-
11
- Algorithm:
12
- 1. Intercept prompt → split sentences → compute SimHash signatures
13
- 2. Match against ClusterBank (Hamming ≤6) → assign cluster_id or create new
14
- 3. Queue prompt into micro-batch, extend barrier (+2s per arrival, cap 10s)
15
- 4. On timer: cross-prompt dedupe (keep only canonical owners)
16
- 5. RUN-LEVEL entity coverage check (95% cumulative across entire run), re-add if needed
17
- 6. Emit cleaned prompts (original sentence order preserved)
18
-
19
- Expected savings: 50-90% char reduction with 95%+ entity coverage across entire run.
20
- Later batches are MORE aggressive (earlier batches already covered entities).
21
- """
22
-
23
- import re
24
- import hashlib
25
- import threading
26
- import time
27
- from typing import List, Dict, Set, Tuple, Optional, Any
28
- from dataclasses import dataclass, field
29
- from collections import defaultdict
30
- import numpy as np
31
- import asyncio
32
-
33
- # Type alias for return type
34
- DeduplicationResult = Tuple[str, Dict]
35
-
36
- # Lazy imports for performance (only load when needed)
37
- _embedding_model = None
38
- _spacy_nlp = None
39
- _model_lock = threading.Lock() # Thread-safe singleton access
40
-
41
-
42
- def _vprint(message: str, verbose: bool = False, force: bool = False):
43
- """Helper function for verbose printing."""
44
- if force or verbose:
45
- print(message)
46
-
47
-
48
- def _get_embedding_model():
49
- """
50
- Lazy load sentence transformer model (thread-safe singleton).
51
- Forces CPU to avoid meta tensor issues on Win + Py3.13 + Torch.
52
- """
53
- global _embedding_model
54
-
55
- # Double-checked locking pattern for performance
56
- if _embedding_model is None:
57
- with _model_lock:
58
- # Check again inside lock (another thread might have loaded it)
59
- if _embedding_model is None:
60
- try:
61
- from sentence_transformers import SentenceTransformer
62
- print("[PIPECLEANER] Loading embedding model: all-MiniLM-L6-v2 (384-dim, ~80MB)...")
63
- # Force CPU device to avoid meta tensor issues
64
- _embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
65
- print("[PIPECLEANER] ✅ Embedding model loaded successfully (CPU)")
66
- except ImportError:
67
- print("[PIPECLEANER] ⚠️ sentence-transformers not installed. Install: pip install sentence-transformers")
68
- raise
69
- except Exception as e:
70
- print(f"[PIPECLEANER] ⚠️ Failed to load embedding model: {e}")
71
- raise
72
-
73
- return _embedding_model
74
-
75
-
76
- def _get_spacy_model():
77
- """Lazy load spaCy model for entity extraction."""
78
- global _spacy_nlp
79
- if _spacy_nlp is None:
80
- try:
81
- import spacy
82
- print("[PIPECLEANER] Loading spaCy model: en_core_web_sm...")
83
- _spacy_nlp = spacy.load("en_core_web_sm")
84
- print("[PIPECLEANER] ✅ spaCy model loaded successfully")
85
- except ImportError:
86
- print("[PIPECLEANER] ⚠️ spaCy not installed. Using regex fallback for entities.")
87
- _spacy_nlp = "fallback"
88
- except OSError:
89
- print("[PIPECLEANER] ⚠️ spaCy model not found. Using regex fallback for entities.")
90
- _spacy_nlp = "fallback"
91
- return _spacy_nlp
92
-
93
-
94
- # ============================================================================
95
- # Run-Scoped Global Corpus System V2.0
96
- # ============================================================================
97
-
98
- @dataclass
99
- class SentenceCluster:
100
- """Represents a cluster of similar sentences across the run."""
101
- cluster_id: str
102
- canonical_sentence: str
103
- owner_prompt_id: str # First prompt to use this cluster
104
- simhash: int # 64-bit SimHash fingerprint
105
- salience: float
106
- entities: Set[str]
107
- first_seen_seq: int
108
- length: int
109
- embedding: Optional[np.ndarray] = None # Sentence embedding for cosine similarity
110
-
111
- @dataclass
112
- class PromptState:
113
- """State for a single prompt in the batch."""
114
- prompt_id: str
115
- sentences: List[str]
116
- cluster_ids: List[str] # parallel to sentences
117
- original_order: List[int] # track reordering
118
- entities: Set[str]
119
- arrived_at: float
120
-
121
- @dataclass
122
- class RunCorpusTelemetry:
123
- """Run-level statistics for the corpus."""
124
- prompts_total: int = 0
125
- sentences_total: int = 0
126
- clusters_total: int = 0
127
- cross_prompt_dups_removed: int = 0
128
- chars_in: int = 0
129
- chars_out: int = 0
130
- tokens_saved: int = 0
131
- entity_coverage_avg: float = 100.0
132
- batches_processed: int = 0
133
- avg_barrier_ms: float = 0.0
134
- max_barrier_ms: float = 0.0
135
- barrier_times: List[float] = field(default_factory=list)
136
-
137
-
138
- def compute_simhash(text: str, hash_bits: int = 64) -> int:
139
- """
140
- Compute SimHash fingerprint for near-dup detection.
141
-
142
- Args:
143
- text: Input text
144
- hash_bits: Hash size (64-bit default)
145
-
146
- Returns:
147
- Integer hash value
148
- """
149
- # Tokenize and compute feature hashes
150
- tokens = re.findall(r'\b\w+\b', text.lower())
151
- if not tokens:
152
- return 0
153
-
154
- # Initialize bit vector
155
- v = [0] * hash_bits
156
-
157
- for token in tokens:
158
- # Hash each token
159
- h = int(hashlib.md5(token.encode()).hexdigest(), 16)
160
-
161
- # Update bit vector
162
- for i in range(hash_bits):
163
- if h & (1 << i):
164
- v[i] += 1
165
- else:
166
- v[i] -= 1
167
-
168
- # Generate final hash
169
- fingerprint = 0
170
- for i in range(hash_bits):
171
- if v[i] > 0:
172
- fingerprint |= (1 << i)
173
-
174
- return fingerprint
175
-
176
-
177
- def hamming_distance(hash1: int, hash2: int) -> int:
178
- """Count differing bits between two hashes."""
179
- return bin(hash1 ^ hash2).count('1')
180
-
181
-
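A minimal illustrative sketch of the SimHash primitives defined above (not part of the package source; the import path dasein.pipecleaner is assumed from the file path, and no specific distance is asserted since it depends on the MD5-based token hashes):

# Illustrative only: two near-duplicate strings should land within a few bits of
# each other; the corpus treats Hamming distance <= 6 as the same cluster.
from dasein.pipecleaner import compute_simhash, hamming_distance

a = "The quick brown fox jumps over the lazy dog near the river bank."
b = "The quick brown fox jumped over a lazy dog near the river bank."

fp_a = compute_simhash(a)   # 64-bit fingerprint built from MD5-hashed tokens
fp_b = compute_simhash(b)

dist = hamming_distance(fp_a, fp_b)
print(f"Hamming distance: {dist}  (treated as duplicate if <= 6)")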
182
- class RunScopedCorpus:
183
- """
184
- Global corpus for a single run, with dynamic batching barrier.
185
- All prompts in the run share this corpus for cross-prompt deduplication.
186
-
187
- CONCURRENCY MODEL:
188
- - All shared state (clusters, prompt_registry, run_entities, kept_entities, batch_queue)
189
- is protected by `self.batch_lock` (threading.Lock)
190
- - All reads iterate over snapshots (dict(...), list(...)) to avoid "dict changed size" errors
191
- - All writes are atomic under lock (copy-on-write when possible)
192
- - Re-entrancy guard in caller (DaseinCallbackHandler) prevents nested calls
193
- - Background timer thread (_process_batch) acquires lock before any mutations
194
- """
195
-
196
- def __init__(self, run_id: str, hamming_threshold: int = 6, entity_coverage_min: float = 0.95, verbose: bool = False):
197
- self.run_id = run_id
198
- self.hamming_threshold = hamming_threshold
199
- self.entity_coverage_min = entity_coverage_min
200
- self.verbose = verbose # Gate debug logging
201
-
202
- # Core state
203
- self.clusters: Dict[str, SentenceCluster] = {} # cluster_id → cluster
204
- self.simhash_index: Dict[int, List[str]] = defaultdict(list) # simhash → [cluster_ids]
205
- self.prompt_registry: Dict[str, PromptState] = {} # prompt_id → state
206
- self.entity_index: Dict[str, Set[str]] = defaultdict(set) # entity → {cluster_ids}
207
-
208
- # Run-level entity tracking for global coverage
209
- self.run_entities: Set[str] = set() # All entities seen across entire run
210
- self.kept_entities: Set[str] = set() # All entities kept across all batches
211
-
212
- # Batching state
213
- self.batch_queue: List[str] = [] # [prompt_ids] waiting for barrier
214
- self.batch_lock = threading.Lock()
215
- self.batch_timer: Optional[threading.Timer] = None
216
- self.batch_start_time: Optional[float] = None
217
- self.barrier_duration: float = 5.0 # Start at 5s (min wait)
218
- self.barrier_increment: float = 2.0 # Add 2s per new arrival
219
- self.barrier_cap: float = 10.0 # Max 10s
220
- self.batch_ready = threading.Event() # Signal when batch is processed
221
- self.prompt_events: Dict[str, asyncio.Event] = {} # Per-prompt events for ASYNC sequential release
222
-
223
- # Sequence tracking
224
- self.next_seq = 0
225
- self.next_cluster_id = 0
226
-
227
- # Telemetry
228
- self.telemetry = RunCorpusTelemetry()
229
-
230
- _vprint(f"[CORPUS] 🏗️ Created run-scoped corpus for run_id={run_id[:8]} (barrier: 5s min, +2s/arrival, 10s cap)", self.verbose)
231
-
232
- def _generate_cluster_id(self) -> str:
233
- """Generate unique cluster ID."""
234
- cluster_id = f"c{self.next_cluster_id:06d}"
235
- self.next_cluster_id += 1
236
- return cluster_id
237
-
238
- def find_matching_cluster(self, simhash: int, sentence: str, sentence_embedding=None) -> Optional[str]:
239
- """
240
- Find existing cluster that matches this sentence using cosine similarity.
241
-
242
- Args:
243
- simhash: SimHash of the sentence (for indexing, not matching)
244
- sentence: Original sentence text
245
- sentence_embedding: Pre-computed embedding for this sentence
246
-
247
- Returns:
248
- cluster_id if match found, None otherwise
249
- """
250
- if sentence_embedding is None:
251
- return None
252
-
253
- # Check all existing clusters for semantic similarity
254
- # Use cosine similarity 0.60 (catches cross-site paraphrases)
255
- best_match_id = None
256
- best_similarity = 0.60 # Threshold for considering duplicate (lowered to catch paraphrases)
257
-
258
- # Snapshot clusters to avoid "dict changed size" errors (thread-safe read)
259
- with self.batch_lock:
260
- clusters_snapshot = dict(self.clusters)
261
-
262
- for cluster_id, cluster in clusters_snapshot.items():
263
- if cluster.canonical_sentence == sentence:
264
- # Exact match
265
- return cluster_id
266
-
267
- # Hybrid similarity: semantic + lexical fallback for short sentences
268
- if hasattr(cluster, 'embedding') and cluster.embedding is not None:
269
- # Semantic similarity
270
- similarity = np.dot(sentence_embedding, cluster.embedding)
271
-
272
- # Lexical fallback for short sentences (boilerplate detection)
273
- max_len = max(len(sentence), len(cluster.canonical_sentence))
274
- if max_len <= 120 and similarity < 0.60:
275
- lexical_sim = compute_char_3gram_jaccard(sentence, cluster.canonical_sentence)
276
- if lexical_sim >= 0.82:
277
- # Boost similarity to indicate match via lexical path
278
- similarity = max(similarity, 0.82)
279
-
280
- if similarity > best_similarity:
281
- best_similarity = similarity
282
- best_match_id = cluster_id
283
-
284
- return best_match_id
285
-
286
- def add_sentence_to_corpus(self, sentence: str, prompt_id: str, salience: float, entities: Set[str]) -> str:
287
- """
288
- Add sentence to corpus or match to existing cluster.
289
-
290
- Args:
291
- sentence: Sentence text
292
- prompt_id: Owner prompt
293
- salience: Importance score
294
- entities: Extracted entities
295
-
296
- Returns:
297
- cluster_id (new or matched)
298
- """
299
- # Compute SimHash
300
- simhash = compute_simhash(sentence)
301
-
302
- # Try to match existing cluster
303
- existing_cluster_id = self.find_matching_cluster(simhash, sentence)
304
-
305
- if existing_cluster_id:
306
- # Matched existing cluster
307
- return existing_cluster_id
308
-
309
- # Create new cluster
310
- cluster_id = self._generate_cluster_id()
311
- cluster = SentenceCluster(
312
- cluster_id=cluster_id,
313
- canonical_sentence=sentence,
314
- owner_prompt_id=prompt_id,
315
- simhash=simhash,
316
- salience=salience,
317
- entities=entities,
318
- first_seen_seq=self.next_seq,
319
- length=len(sentence)
320
- )
321
-
322
- self.clusters[cluster_id] = cluster
323
- self.simhash_index[simhash].append(cluster_id)
324
-
325
- # Update entity index
326
- for entity in entities:
327
- self.entity_index[entity].add(cluster_id)
328
-
329
- self.next_seq += 1
330
- self.telemetry.clusters_total += 1
331
-
332
- return cluster_id
333
-
334
- async def enqueue_prompt(self, prompt_id: str, prompt_text: str) -> str:
335
- """
336
- Enqueue prompt for batched processing with dynamic barrier (ASYNC - allows parallel arrivals).
337
-
338
- Args:
339
- prompt_id: Unique prompt identifier
340
- prompt_text: Full prompt text
341
-
342
- Returns:
343
- Deduplicated prompt text (after barrier)
344
- """
345
- arrival_time = time.time()
346
-
347
- # Split into sentences
348
- sentences = split_into_sentences(prompt_text)
349
-
350
- if not sentences:
351
- return prompt_text
352
-
353
- self.telemetry.prompts_total += 1
354
- self.telemetry.sentences_total += len(sentences)
355
- self.telemetry.chars_in += len(prompt_text)
356
-
357
- # ⚡ CRITICAL: DO NOT compute embeddings here! It blocks async arrivals.
358
- # Store raw sentences and compute embeddings in batch during _process_batch
359
- all_entities = set()
360
-
361
- for sentence in sentences:
362
- # Extract entities (fast, non-blocking)
363
- entities, numbers = extract_entities_regex(sentence)
364
- all_entities.update(entities)
365
- all_entities.update(numbers)
366
-
367
- # Create prompt state (thread-safe mutation)
368
- # NOTE: cluster_ids will be computed during batch processing (after embeddings)
369
- with self.batch_lock:
370
- prompt_state = PromptState(
371
- prompt_id=prompt_id,
372
- sentences=sentences,
373
- cluster_ids=[], # Will be filled during _process_batch
374
- original_order=list(range(len(sentences))),
375
- entities=all_entities,
376
- arrived_at=arrival_time
377
- )
378
-
379
- self.prompt_registry[prompt_id] = prompt_state
380
-
381
- # Add to batch queue and manage barrier
382
- # Create per-prompt ASYNC event for sequential release
383
- prompt_ready = asyncio.Event()
384
- self.prompt_events[prompt_id] = prompt_ready
385
-
386
- with self.batch_lock:
387
- self.batch_queue.append(prompt_id)
388
-
389
- if self.batch_timer is None:
390
- # First prompt in batch, start timer at 5s
391
- self.batch_start_time = arrival_time
392
- self.barrier_duration = 5.0
393
- print(f"[CORPUS] ⏱️ Starting batch barrier: 5.0s (first prompt, min wait)")
394
- self.batch_timer = threading.Timer(self.barrier_duration, self._process_batch)
395
- self.batch_timer.start()
396
- else:
397
- # Extend barrier by +2s per arrival (capped at 10s)
398
- elapsed = arrival_time - self.batch_start_time
399
- new_duration = min(elapsed + self.barrier_increment, self.barrier_cap)
400
-
401
- if new_duration > self.barrier_duration:
402
- # Cancel old timer, start new one
403
- self.batch_timer.cancel()
404
- remaining = new_duration - elapsed
405
- self.barrier_duration = new_duration
406
- _vprint(f"[CORPUS] ⏱️ Extending barrier to {new_duration:.1f}s (+{remaining:.1f}s remaining, +{self.barrier_increment:.1f}s per arrival)", self.verbose)
407
- self.batch_timer = threading.Timer(remaining, self._process_batch)
408
- self.batch_timer.start()
409
-
410
- # ASYNC wait for THIS prompt's individual event (allows other async tasks to proceed)
411
- # Timeout must be generous to account for model loading on first batch
412
- try:
413
- await asyncio.wait_for(prompt_ready.wait(), timeout=30.0) # 30s max wait (model load + processing)
414
- timed_out = False
415
- except asyncio.TimeoutError:
416
- timed_out = True
417
-
418
- if timed_out:
419
- # Fail open: return original text if batch processing hangs
420
- print(f"[CORPUS] ⚠️ Timeout waiting for batch processing, returning original prompt")
421
- self.telemetry.chars_out += len(prompt_text)
422
- return prompt_text
423
-
424
- # Retrieve deduplicated result
425
- deduplicated_text = self._get_deduplicated_prompt(prompt_id)
426
-
427
- if not deduplicated_text:
428
- # Safety: if result is missing, return original
429
- print(f"[CORPUS] ⚠️ Missing deduplicated result for prompt {prompt_id[:8]}, returning original")
430
- self.telemetry.chars_out += len(prompt_text)
431
- return prompt_text
432
-
433
- self.telemetry.chars_out += len(deduplicated_text)
434
-
435
- return deduplicated_text
436
-
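As a worked sketch of the barrier arithmetic used in enqueue_prompt above (hypothetical arrival times, not observed behavior):

# Barrier starts at 5.0s; each later arrival extends it to elapsed + 2.0s, capped at 10.0s.
BARRIER_MIN, INCREMENT, CAP = 5.0, 2.0, 10.0

def extended_barrier(current: float, elapsed: float) -> float:
    """Total barrier duration after an arrival `elapsed` seconds into the batch."""
    return max(current, min(elapsed + INCREMENT, CAP))

duration = BARRIER_MIN                      # first prompt at t=0.0 -> 5.0s timer
duration = extended_barrier(duration, 4.0)  # arrival at t=4.0 -> 6.0s
duration = extended_barrier(duration, 5.5)  # arrival at t=5.5 -> 7.5s
duration = extended_barrier(duration, 7.0)  # arrival at t=7.0 -> 9.0s
duration = extended_barrier(duration, 8.5)  # arrival at t=8.5 -> capped at 10.0s
print(duration)  # 10.0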
437
- def _process_batch(self):
438
- """Process current batch: cross-prompt dedupe, entity coverage check, emit (synchronous)."""
439
- with self.batch_lock:
440
- if not self.batch_queue:
441
- # No prompts to process, just return (shouldn't happen)
442
- return
443
-
444
- batch_prompts = self.batch_queue.copy()
445
- self.batch_queue.clear()
446
- self.batch_timer = None
447
-
448
- batch_duration_ms = (time.time() - self.batch_start_time) * 1000
449
- self.telemetry.barrier_times.append(batch_duration_ms)
450
- self.telemetry.batches_processed += 1
451
-
452
- # Always show batch summary (key metric)
453
- print(f"\n[CORPUS] 🔄 Processing batch: {len(batch_prompts)} prompts, barrier={batch_duration_ms:.0f}ms")
454
-
455
- # Step 0: Compute embeddings for NEW prompts in this batch (BATCHED operation!)
456
- # This is done ONCE for the entire batch, allowing parallel arrivals
457
- _vprint(f"[CORPUS] 🧮 Computing embeddings for {len(batch_prompts)} new prompts...", self.verbose)
458
- model = _get_embedding_model()
459
-
460
- for prompt_id in batch_prompts:
461
- prompt_state = self.prompt_registry[prompt_id]
462
-
463
- if not prompt_state.cluster_ids: # Only process if not yet clustered
464
- # Compute embeddings for all sentences in this prompt (batch operation)
465
- sentence_embeddings = model.encode(prompt_state.sentences, show_progress_bar=False, normalize_embeddings=True)
466
-
467
- # Match/create clusters for each sentence
468
- cluster_ids = []
469
- for i, sentence in enumerate(prompt_state.sentences):
470
- # Compute salience
471
- salience = len(sentence) / 100.0
472
- salience += len(re.findall(r'\b[A-Z][a-z]+', sentence)) * 0.1
473
-
474
- # Extract entities
475
- entities, numbers = extract_entities_regex(sentence)
476
-
477
- # Match against existing clusters
478
- cluster_id = self.find_matching_cluster(0, sentence, sentence_embeddings[i])
479
-
480
- if cluster_id is None:
481
- # Create new cluster
482
- with self.batch_lock:
483
- cluster_id = self._generate_cluster_id()
484
- simhash = compute_simhash(sentence)
485
-
486
- cluster = SentenceCluster(
487
- cluster_id=cluster_id,
488
- canonical_sentence=sentence,
489
- owner_prompt_id=prompt_id,
490
- simhash=simhash,
491
- salience=salience,
492
- entities=entities | numbers,
493
- first_seen_seq=self.next_seq,
494
- length=len(sentence),
495
- embedding=sentence_embeddings[i]
496
- )
497
-
498
- self.clusters[cluster_id] = cluster
499
- self.next_seq += 1
500
- self.telemetry.clusters_total += 1
501
-
502
- cluster_ids.append(cluster_id)
503
-
504
- # Update prompt state with cluster_ids
505
- prompt_state.cluster_ids = cluster_ids
506
-
507
- _vprint(f"[CORPUS] Embeddings computed and clusters assigned", self.verbose)
508
-
509
- # Step 1: Collect ALL sentences from THE ENTIRE RUN (not just current batch!)
510
- # This is critical for true run-scoped deduplication
511
- all_sentences = []
512
-        sentence_to_prompt = {}  # Map sentence_id → (prompt_id, index)
513
- locked_sentences = set() # Sentences from previous batches (already emitted, can't remove)
514
-
515
- # Iterate over ALL prompts in registry (including previous batches)
516
- for prompt_id, prompt_state in self.prompt_registry.items():
517
- is_previous_batch = prompt_id not in batch_prompts
518
-
519
- for idx, (sentence_text, cluster_id) in enumerate(zip(prompt_state.sentences, prompt_state.cluster_ids)):
520
- cluster = self.clusters.get(cluster_id)
521
- if not cluster:
522
- continue
523
-
524
- # Create Sentence object for greedy algorithm
525
- sent_id = f"{prompt_id}_{idx}"
526
- sent_obj = Sentence(
527
- id=sent_id,
528
- text=sentence_text,
529
- embedding=cluster.embedding,
530
- entities=cluster.entities, # Keep ALL entities for accurate coverage tracking
531
- numbers=set(), # Already in entities
532
- salience=cluster.salience,
533
- position=cluster.first_seen_seq
534
- )
535
- all_sentences.append(sent_obj)
536
- sentence_to_prompt[sent_id] = (prompt_id, idx)
537
-
538
- # Lock sentences from previous batches (already emitted to user)
539
- if is_previous_batch:
540
- locked_sentences.add(sent_id)
541
-
542
- _vprint(f"[CORPUS] 🌐 Run-scoped MIS: {len(all_sentences)} total sentences ({len(locked_sentences)} locked from previous batches, {len(all_sentences)-len(locked_sentences)} new)", self.verbose)
543
- _vprint(f"[CORPUS] 🧮 Running greedy max-independent-set on {len(all_sentences)} sentences", self.verbose)
544
-
545
- # Step 2: Compute degree map (needed for isolates pass later)
546
- degree_map = {}
547
- for sent in all_sentences:
548
- degree = 0
549
- for other in all_sentences:
550
- if sent.id != other.id:
551
- if are_sentences_similar(sent, other, semantic_threshold=0.60):
552
- degree += 1
553
- degree_map[sent.id] = degree
554
-
555
- # Sanity checks
556
- isolates_before = [s for s in all_sentences if degree_map[s.id] == 0]
557
- non_isolates = [s for s in all_sentences if degree_map[s.id] > 0]
558
- pct_isolates = len(isolates_before) / len(all_sentences) * 100 if all_sentences else 0
559
- avg_degree_non_iso = sum(degree_map[s.id] for s in non_isolates) / len(non_isolates) if non_isolates else 0
560
- print(f"[CORPUS] 📊 Graph: isolates={pct_isolates:.1f}% (expect <20%), non-isolate avg degree={avg_degree_non_iso:.1f} (expect >3)")
561
-
562
- # Step 3: Run greedy maximum-independent-set selection
563
- # Start with LOCKED sentences (from previous batches, already emitted)
564
- # Then run MIS only on NEW sentences (current batch)
565
- selected_sentences = [s for s in all_sentences if s.id in locked_sentences]
566
- selected_ids = locked_sentences.copy()
567
-
568
- print(f"[CORPUS] 🔒 Pre-seeded MIS with {len(locked_sentences)} locked sentences from previous batches")
569
-
570
- # Now run MIS on NEW sentences only (exclude locked)
571
- new_sentences = [s for s in all_sentences if s.id not in locked_sentences]
572
-
573
- if new_sentences:
574
- # Run MIS on new sentences, considering locked ones as neighbors
575
- new_selected = greedy_max_independent_set(
576
- new_sentences,
577
- similarity_threshold=0.60,
578
- verbose=False, # Set to True for debugging
579
- precomputed_degree_map=degree_map # Pass precomputed degrees
580
- )
581
-
582
- # Add newly selected sentences
583
- selected_sentences.extend(new_selected)
584
- selected_ids.update(s.id for s in new_selected)
585
-
586
- _vprint(f"[CORPUS] ✅ MIS complete: {len(selected_ids)} total kept ({len(locked_sentences)} locked + {len(selected_ids)-len(locked_sentences)} new)", self.verbose)
587
-
588
- # Step 3: Compute NODE COVERAGE (align universe for backfill)
589
- # covered_nodes = S ∪ N(S) (selected + their neighbors)
590
- covered_nodes = set(selected_ids)
591
- sentence_map = {s.id: s for s in all_sentences}
592
-
593
- for selected_id in selected_ids:
594
- selected_sent = sentence_map[selected_id]
595
- # Add all neighbors (similar nodes)
596
- for other in all_sentences:
597
- if other.id != selected_id:
598
- if are_sentences_similar(selected_sent, other, semantic_threshold=0.60):
599
- covered_nodes.add(other.id)
600
-
601
- total_nodes = len(all_sentences)
602
- node_coverage_before = len(covered_nodes) / total_nodes if total_nodes > 0 else 0.0
603
-
604
- _vprint(f"[CORPUS] 📊 After MIS: nodes={len(selected_ids)}/{total_nodes} kept, coverage (S∪N(S))={len(covered_nodes)}/{total_nodes} ({node_coverage_before*100:.1f}%)", self.verbose)
605
-
606
- # Step 4: Backfill = GREEDY SET COVER over NODES (no independence constraint!)
607
-        # Goal: Maximize node coverage (S ∪ N(S)) by re-adding removed nodes with highest gain
608
- # gain(u) = |({u} ∪ N(u)) \ covered_nodes|
609
- backfill_added = 0
610
- isolates_added = 0
611
- target_coverage = 0.90 # 90% node coverage target
612
-
613
- if node_coverage_before < target_coverage:
614
- uncovered_count = total_nodes - len(covered_nodes)
615
- _vprint(f"[CORPUS] 🔧 Backfill: {uncovered_count} uncovered nodes, targeting {target_coverage*100:.0f}% coverage", self.verbose)
616
-
617
- # Get ALL removed sentences (candidates for backfill)
618
- removed_sentences = [sent for sent in all_sentences if sent.id not in selected_ids]
619
-
620
- # Helper: compute node gain for a candidate
621
- def compute_node_gain(sent):
622
- """Compute how many uncovered nodes this sentence + its neighbors would cover."""
623
- candidate_coverage = {sent.id}
624
- # Add neighbors
625
- for other in all_sentences:
626
- if other.id != sent.id:
627
- if are_sentences_similar(sent, other, semantic_threshold=0.60):
628
- candidate_coverage.add(other.id)
629
- # Gain = new nodes not already covered
630
- return len(candidate_coverage - covered_nodes)
631
-
632
- # Debug: Print top-5 candidates by gain (first iteration only)
633
- if removed_sentences:
634
- gains = [(sent, compute_node_gain(sent)) for sent in removed_sentences[:20]] # Sample first 20 for speed
635
- gains.sort(key=lambda x: x[1], reverse=True)
636
- _vprint(f"[CORPUS] Top-5 backfill candidates by gain:", self.verbose)
637
- for sent, gain in gains[:5]:
638
- _vprint(f" gain={gain}: '{sent.text[:60]}...'", self.verbose)
639
-
640
- # GREEDY SET COVER: repeatedly pick sentence with max gain
641
- iteration = 0
642
- while node_coverage_before < target_coverage and removed_sentences and iteration < 100:
643
- # Find best candidate
644
- best_sent = None
645
- best_gain = 0
646
-
647
- for sent in removed_sentences:
648
- gain = compute_node_gain(sent)
649
- if gain > best_gain:
650
- best_gain = gain
651
- best_sent = sent
652
-
653
- if best_gain == 0:
654
- _vprint(f"[CORPUS] Backfill: all remaining candidates have gain=0, stopping", self.verbose)
655
- break
656
-
657
- # Add best sentence back
658
- selected_ids.add(best_sent.id)
659
- selected_sentences.append(best_sent)
660
-
661
- # Update covered_nodes: add this node + its neighbors
662
- covered_nodes.add(best_sent.id)
663
- for other in all_sentences:
664
- if other.id != best_sent.id:
665
- if are_sentences_similar(best_sent, other, semantic_threshold=0.60):
666
- covered_nodes.add(other.id)
667
-
668
- removed_sentences.remove(best_sent)
669
- backfill_added += 1
670
-
671
- # Update coverage
672
- node_coverage_before = len(covered_nodes) / total_nodes
673
- iteration += 1
674
-
675
- if backfill_added <= 5:
676
- _vprint(f"[CORPUS] ✅ Backfill +{best_gain} nodes: '{best_sent.text[:60]}...' (coverage now {node_coverage_before*100:.1f}%)", self.verbose)
677
-
678
-            _vprint(f"[CORPUS] 📈 After backfill: +{backfill_added} sentences, node coverage {node_coverage_before*100:.1f}%", self.verbose)
679
-
680
- # Step 5: ISOLATES PASS - add uncovered degree=0 nodes
681
- # These are unique nodes with no similar neighbors
682
- uncovered_isolates = [sent for sent in all_sentences
683
- if sent.id not in covered_nodes and degree_map[sent.id] == 0]
684
-
685
- if uncovered_isolates:
686
- _vprint(f"[CORPUS] 🔧 Isolates pass: {len(uncovered_isolates)} uncovered isolates (degree=0)", self.verbose)
687
-
688
- for sent in uncovered_isolates:
689
- if node_coverage_before >= target_coverage:
690
- break
691
- selected_ids.add(sent.id)
692
- covered_nodes.add(sent.id)
693
- isolates_added += 1
694
- node_coverage_before = len(covered_nodes) / total_nodes
695
-
696
- if isolates_added <= 5:
697
- _vprint(f"[CORPUS] ✅ Isolate: '{sent.text[:60]}...'", self.verbose)
698
-
699
- if isolates_added > 0:
700
- _vprint(f"[CORPUS] 📈 After isolates: +{isolates_added} sentences, node coverage {node_coverage_before*100:.1f}%", self.verbose)
701
-
702
- # Final coverage stats (NODE universe)
703
- final_selected = len(selected_ids)
704
- final_covered_nodes = len(covered_nodes)
705
- final_node_coverage = final_covered_nodes / total_nodes if total_nodes > 0 else 0.0
706
-
707
- # Assert denominator is |V| (all nodes, no filtering)
708
- assert total_nodes == len(all_sentences), f"Denominator mismatch: {total_nodes} != {len(all_sentences)}"
709
-
710
- _vprint(f"[CORPUS] Final: kept={final_selected}/{total_nodes}, covered (S∪N(S))={final_covered_nodes}/{total_nodes} ({final_node_coverage*100:.1f}%)", self.verbose)
711
- _vprint(f"[CORPUS] 📊 Backfill={backfill_added}, Isolates={isolates_added}", self.verbose)
712
-
713
- # Step 6: Map results back to prompts
714
- results = {}
715
- for prompt_id in batch_prompts:
716
- prompt_state = self.prompt_registry[prompt_id]
717
- kept_sentences = []
718
- removed_count = 0
719
-
720
- for idx, sentence_text in enumerate(prompt_state.sentences):
721
- sent_id = f"{prompt_id}_{idx}"
722
- if sent_id in selected_ids:
723
- kept_sentences.append(sentence_text)
724
- else:
725
- removed_count += 1
726
-
727
- results[prompt_id] = {
728
- 'kept': kept_sentences,
729
- 'removed': removed_count,
730
- 'original_count': len(prompt_state.sentences)
731
- }
732
-
733
- # Step 7: Store results and emit to prompts
734
- for prompt_id in batch_prompts:
735
- prompt_state = self.prompt_registry[prompt_id]
736
- result = results[prompt_id]
737
- prompt_state.sentences = result['kept']
738
-
739
- reduction_pct = (result['removed'] / result['original_count'] * 100) if result['original_count'] > 0 else 0
740
- _vprint(f"[CORPUS] Prompt {prompt_id[:8]}: {result['original_count']} → {len(result['kept'])} sentences ({reduction_pct:.1f}% removed)", self.verbose)
741
-
742
- # Update telemetry
743
- self.telemetry.entity_coverage_avg = final_node_coverage * 100 # Now tracking NODE coverage
744
- # Always show final batch summary (key metric)
745
- print(f"[CORPUS] Batch complete: Node coverage {final_node_coverage*100:.1f}%")
746
-
747
- # Update telemetry
748
- if self.telemetry.barrier_times:
749
- self.telemetry.avg_barrier_ms = sum(self.telemetry.barrier_times) / len(self.telemetry.barrier_times)
750
- self.telemetry.max_barrier_ms = max(self.telemetry.barrier_times)
751
-
752
- self.telemetry.tokens_saved = (self.telemetry.chars_in - self.telemetry.chars_out) // 4
753
-
754
- # Release prompts SEQUENTIALLY to avoid race condition in on_llm_start
755
- _vprint(f"[CORPUS] 🚦 Releasing {len(batch_prompts)} prompts sequentially...", self.verbose)
756
- for i, prompt_id in enumerate(batch_prompts):
757
- event = self.prompt_events.get(prompt_id)
758
- if event:
759
- event.set() # Wake up this specific thread
760
- # Longer delay to ensure threads hit on_llm_start one at a time
761
- if i < len(batch_prompts) - 1: # Don't delay after the last one
762
- time.sleep(0.5) # 500ms stagger to be safe
763
-
764
- # Clean up events to prevent memory leak
765
- for prompt_id in batch_prompts:
766
- self.prompt_events.pop(prompt_id, None)
767
-
768
- def _get_deduplicated_prompt(self, prompt_id: str) -> str:
769
- """Get deduplicated prompt text."""
770
- prompt_state = self.prompt_registry.get(prompt_id)
771
- if not prompt_state:
772
- return ""
773
-
774
- return "\n".join(prompt_state.sentences)
775
-
776
- def get_telemetry_summary(self) -> str:
777
- """Generate human-readable telemetry summary."""
778
- t = self.telemetry
779
- reduction_pct = ((t.chars_in - t.chars_out) / t.chars_in * 100) if t.chars_in > 0 else 0
780
-
781
- summary = f"""
782
- {'='*70}
783
- [CORPUS] 📊 RUN-SCOPED TELEMETRY (run_id={self.run_id[:8]})
784
- {'='*70}
785
- Prompts processed: {t.prompts_total}
786
- Sentences total: {t.sentences_total}
787
- Clusters created: {t.clusters_total}
788
- Cross-prompt dups removed: {t.cross_prompt_dups_removed}
789
- {'='*70}
790
- Chars in: {t.chars_in:,}
791
- Chars out: {t.chars_out:,}
792
- Reduction: {reduction_pct:.1f}%
793
- Tokens saved (est): {t.tokens_saved:,} tokens
794
- {'='*70}
795
- Node Coverage (S∪N(S)): {t.entity_coverage_avg:.1f}%
796
- Batches processed: {t.batches_processed}
797
- Avg barrier: {t.avg_barrier_ms:.0f}ms
798
- Max barrier: {t.max_barrier_ms:.0f}ms
799
- {'='*70}
800
- """
801
- return summary
802
-
803
-
804
- # Global registry of run-scoped corpuses
805
- _run_corpuses: Dict[str, RunScopedCorpus] = {}
806
- _corpus_lock = threading.Lock()
807
-
808
-
809
- def get_or_create_corpus(run_id: str, verbose: bool = False) -> RunScopedCorpus:
810
- """Get or create run-scoped corpus (thread-safe)."""
811
- with _corpus_lock:
812
- if run_id not in _run_corpuses:
813
- _run_corpuses[run_id] = RunScopedCorpus(run_id, verbose=verbose)
814
- return _run_corpuses[run_id]
815
-
816
-
817
- def cleanup_corpus(run_id: str):
818
- """Cleanup corpus when run ends."""
819
- with _corpus_lock:
820
- if run_id in _run_corpuses:
821
- corpus = _run_corpuses[run_id]
822
- print(corpus.get_telemetry_summary())
823
- del _run_corpuses[run_id]
824
- print(f"[CORPUS] 🗑️ Cleaned up corpus for run_id={run_id[:8]}")
825
-
826
-
827
- # ============================================================================
828
- # Legacy Per-Prompt Deduplication (V1.0 - Fallback)
829
- # ============================================================================
830
-
831
- @dataclass
832
- class Sentence:
833
- """Represents a sentence with metadata for deduplication."""
834
- id: str
835
- text: str
836
- embedding: Optional[np.ndarray] = None
837
- entities: Set[str] = None
838
- numbers: Set[str] = None
839
- salience: float = 0.0
840
- position: int = 0
841
-
842
- def __post_init__(self):
843
- if self.entities is None:
844
- self.entities = set()
845
- if self.numbers is None:
846
- self.numbers = set()
847
-
848
- @property
849
- def protected_entities(self) -> Set[str]:
850
- """All entities that must be preserved."""
851
- return self.entities | self.numbers
852
-
853
-
854
- def estimate_tokens(text: str) -> int:
855
- """Estimate token count (roughly chars/4 for English)."""
856
- return len(text) // 4
857
-
858
-
859
- def adaptive_resize_sentences(sentences: List[str]) -> List[str]:
860
- """
861
- Adaptively resize sentences for optimal embedding similarity:
862
- - Long (>120 tokens): Split on commas, semicolons, conjunctions
863
- - Short (<40 tokens): Merge with next sentence
864
- - Mid (40-120 tokens): Keep as-is
865
-
866
- This improves cross-page similarity and reduces false uniqueness.
867
- """
868
- resized = []
869
- i = 0
870
-
871
- while i < len(sentences):
872
- sent = sentences[i]
873
- tokens = estimate_tokens(sent)
874
-
875
- if tokens > 120:
876
- # LONG: Split on commas, semicolons, and conjunctions
877
- # Split points: , ; : and, but, or, however, therefore (preceded by space/comma)
878
- split_pattern = r'(?:,\s+(?:and|but|or|however|therefore|while|although)\s+|[;:])\s+'
879
- chunks = re.split(split_pattern, sent)
880
-
881
- # Ensure each chunk is reasonable (not too tiny)
882
- for chunk in chunks:
883
- if chunk.strip() and estimate_tokens(chunk) >= 20:
884
- resized.append(chunk.strip())
885
- elif resized:
886
- # Merge tiny chunk with previous
887
- resized[-1] += " " + chunk.strip()
888
- i += 1
889
-
890
- elif tokens < 40 and i + 1 < len(sentences):
891
- # SHORT: Merge with next sentence
892
- next_sent = sentences[i + 1]
893
- merged = sent + " " + next_sent
894
- merged_tokens = estimate_tokens(merged)
895
-
896
- # Only merge if result is ≤120 tokens (don't create overly long sentences)
897
- if merged_tokens <= 120:
898
- resized.append(merged)
899
- i += 2 # Skip next sentence (already merged)
900
- else:
901
- # Next sentence would make it too long, keep short one as-is
902
- resized.append(sent)
903
- i += 1
904
-
905
- else:
906
- # MID-RANGE (40-120) or last sentence: Keep as-is
907
- resized.append(sent)
908
- i += 1
909
-
910
- return resized
911
-
912
-
913
- def split_into_sentences(text: str) -> List[str]:
914
- """
915
- Split text into sentences with special handling for markdown structures,
916
- then adaptively resize for optimal embedding similarity.
917
-
918
- Handles:
919
- - Standard sentences ending with .!?
920
- - Bullet points and numbered lists
921
- - Code blocks (preserve as single units)
922
- - Headers
923
- - Adaptive resizing: long sentences split, short ones merged
924
- """
925
- sentences = []
926
-
927
- # First, protect code blocks
928
- code_block_pattern = r'```[\s\S]*?```'
929
- code_blocks = {}
930
- for i, match in enumerate(re.finditer(code_block_pattern, text)):
931
- placeholder = f"__CODE_BLOCK_{i}__"
932
- code_blocks[placeholder] = match.group()
933
- text = text.replace(match.group(), placeholder)
934
-
935
- # Split on sentence boundaries
936
- # Handle: . ! ? followed by space/newline, or newlines with list markers
937
- patterns = [
938
- r'(?<=[.!?])\s+(?=[A-Z])', # Standard sentences
939
- r'\n\s*[-*•]\s+', # Bullet points
940
- r'\n\s*\d+\.\s+', # Numbered lists
941
- r'\n#{1,6}\s+', # Markdown headers
942
- r'\n\s*\n', # Paragraph breaks
943
- ]
944
-
945
- combined_pattern = '|'.join(f'({p})' for p in patterns)
946
- parts = re.split(combined_pattern, text)
947
-
948
- # Reconstruct sentences (filter out delimiters)
949
- current = ""
950
- for part in parts:
951
- if part is None:
952
- continue
953
- if re.match(combined_pattern, part):
954
- if current.strip():
955
- sentences.append(current.strip())
956
- current = ""
957
- else:
958
- current += part
959
-
960
- if current.strip():
961
- sentences.append(current.strip())
962
-
963
- # Restore code blocks
964
- restored = []
965
- for sent in sentences:
966
- for placeholder, code in code_blocks.items():
967
- sent = sent.replace(placeholder, code)
968
- if sent.strip():
969
- restored.append(sent.strip())
970
-
971
- # ADAPTIVE RESIZING: Split long sentences, merge short ones
972
- resized = adaptive_resize_sentences(restored)
973
-
974
- return resized
975
-
976
-
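A small illustrative call to the splitter above (import path assumed from the file path; exact chunk boundaries depend on the regex patterns and adaptive resizing, so none are asserted):

# Illustrative only: markdown-ish input with a header, prose, and bullets.
from dasein.pipecleaner import split_into_sentences

raw = (
    "# Release notes\n"
    "Python 3.12 was released in October 2023. It focuses on clearer error messages.\n"
    "- Faster startup\n"
    "- Improved f-string parsing\n"
)

for chunk in split_into_sentences(raw):
    print(repr(chunk))
# Bullets and headers become separate chunks; adaptive_resize_sentences then merges
# very short chunks (<40 tokens) and splits very long ones (>120 tokens).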
977
- def extract_entities_regex(text: str) -> Tuple[Set[str], Set[str]]:
978
- """
979
- Fallback regex-based entity extraction.
980
-
981
- Returns:
982
- (entities, numbers) - Sets of extracted entities and numbers
983
- """
984
- entities = set()
985
- numbers = set()
986
-
987
- # Proper nouns: Capitalized words (basic heuristic) - at least 3 chars
988
- proper_nouns = re.findall(r'\b[A-Z][a-z]{2,}(?:\s+[A-Z][a-z]+)*\b', text)
989
- entities.update(proper_nouns)
990
-
991
- # Technical terms: CamelCase, snake_case, package names
992
- technical = re.findall(r'\b[A-Z][a-z]+[A-Z]\w+\b', text) # CamelCase
993
- technical += re.findall(r'\b\w+_\w+\b', text) # snake_case
994
- entities.update(technical)
995
-
996
- # Numbers: MEANINGFUL numbers only (exclude single digits 0-9)
997
- # Include: multi-digit numbers, floats, percentages, version numbers
998
- nums = re.findall(r'\b\d{2,}(?:\.\d+)?%?\b', text) # 2+ digits
999
- nums += re.findall(r'\b\d+\.\d+\b', text) # Floats like 14.4, 2.0
1000
- numbers.update(nums)
1001
-
1002
- # Dates: YYYY-MM-DD, MM/DD/YYYY, etc.
1003
- dates = re.findall(r'\b\d{4}[-/]\d{1,2}[-/]\d{1,4}\b', text) # Full dates
1004
- dates += re.findall(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b', text)
1005
- numbers.update(dates)
1006
-
1007
- # Filter out common non-informative words and malformed entities
1008
- stopwords = {
1009
- # Common words
1010
- 'The', 'This', 'That', 'These', 'Those', 'What', 'Where', 'When', 'Why', 'How', 'Who', 'Which',
1011
- 'Welcome', 'Search', 'Summary', 'Source', 'Url', 'Http', 'Https', 'One', 'Two', 'Three', 'Four', 'Five',
1012
- 'Key', 'Our', 'Its', 'It', 'For', 'With', 'And', 'But', 'Not', 'You', 'All', 'Can', 'Her', 'Was',
1013
- 'She', 'Has', 'Had', 'His', 'Him', 'Are', 'Were', 'Been', 'Being', 'Have', 'Does', 'Did', 'Will',
1014
- # Markup/formatting artifacts
1015
- 'URL', 'Http', 'Https', 'PDF', 'CSV', 'JSON', 'XML', 'HTML',
1016
- }
1017
-
1018
- # Filter entities
1019
- filtered_entities = set()
1020
- for e in entities:
1021
- # Skip short entities
1022
- if len(e) < 3:
1023
- continue
1024
-
1025
- # Skip if contains newlines (malformed extraction)
1026
- if '\n' in e:
1027
- continue
1028
-
1029
- # Skip stopwords (case-insensitive)
1030
- if e in stopwords or e.lower() in {s.lower() for s in stopwords}:
1031
- continue
1032
-
1033
- # Skip if it's just a URL fragment
1034
- if e.lower() in ['url', 'http', 'https', 'www']:
1035
- continue
1036
-
1037
- # Skip if ends with common suffixes that indicate malformed extraction
1038
- if e.endswith('---') or e.endswith('...') or e.endswith('--'):
1039
- continue
1040
-
1041
- filtered_entities.add(e)
1042
-
1043
- # Filter numbers - remove single digits 0-9 (often SOURCE numbers)
1044
- filtered_numbers = {n for n in numbers if len(n) >= 2 or '.' in n or '%' in n}
1045
-
1046
- return filtered_entities, filtered_numbers
1047
-
1048
-
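An illustrative call to the regex fallback above (the exact sets depend on the stopword and length filters, so they are printed rather than asserted):

# Illustrative only.
from dasein.pipecleaner import extract_entities_regex

entities, numbers = extract_entities_regex(
    "Guido van Rossum announced Python 3.12 on 2023-10-02, cutting startup time by 11%."
)
print(entities)  # proper nouns, CamelCase and snake_case terms (stopwords filtered out)
print(numbers)   # multi-digit numbers, floats, percentages, and dates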
1049
- def extract_entities_spacy(text: str, nlp) -> Tuple[Set[str], Set[str]]:
1050
- """
1051
- spaCy-based entity extraction (more accurate).
1052
-
1053
- Returns:
1054
- (entities, numbers) - Sets of extracted entities and numbers
1055
- """
1056
- entities = set()
1057
- numbers = set()
1058
-
1059
- doc = nlp(text)
1060
-
1061
- # Named entities
1062
- for ent in doc.ents:
1063
- if ent.label_ in ['PERSON', 'ORG', 'GPE', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW']:
1064
- entities.add(ent.text)
1065
- elif ent.label_ in ['DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']:
1066
- numbers.add(ent.text)
1067
-
1068
- # Also grab technical terms (capitalized noun phrases)
1069
- for chunk in doc.noun_chunks:
1070
- if chunk.text[0].isupper():
1071
- entities.add(chunk.text)
1072
-
1073
- # Apply SAME filtering as regex version
1074
- stopwords = {
1075
- 'The', 'This', 'That', 'These', 'Those', 'What', 'Where', 'When', 'Why', 'How', 'Who', 'Which',
1076
- 'Welcome', 'Search', 'Summary', 'Source', 'Url', 'Http', 'Https', 'One', 'Two', 'Three', 'Four', 'Five',
1077
- 'Key', 'Our', 'Its', 'It', 'For', 'With', 'And', 'But', 'Not', 'You', 'All', 'Can', 'Her', 'Was',
1078
- 'She', 'Has', 'Had', 'His', 'Him', 'Are', 'Were', 'Been', 'Being', 'Have', 'Does', 'Did', 'Will',
1079
- 'URL', 'Http', 'Https', 'PDF', 'CSV', 'JSON', 'XML', 'HTML',
1080
- }
1081
-
1082
- # Filter entities
1083
- filtered_entities = set()
1084
- for e in entities:
1085
- # Skip short entities
1086
- if len(e) < 3:
1087
- continue
1088
-
1089
- # Skip if contains newlines (malformed)
1090
- if '\n' in e:
1091
- continue
1092
-
1093
- # Skip stopwords (case-insensitive)
1094
- if e in stopwords or e.lower() in {s.lower() for s in stopwords}:
1095
- continue
1096
-
1097
- # Skip URL fragments
1098
- if e.lower() in ['url', 'http', 'https', 'www']:
1099
- continue
1100
-
1101
- # Skip malformed endings
1102
- if e.endswith('---') or e.endswith('...') or e.endswith('--') or e.endswith('---\\nURL'):
1103
- continue
1104
-
1105
- filtered_entities.add(e)
1106
-
1107
- # Filter numbers - remove single digits 0-9
1108
- filtered_numbers = {n for n in numbers if len(str(n).strip()) >= 2 or '.' in str(n) or '%' in str(n)}
1109
-
1110
- return filtered_entities, filtered_numbers
1111
-
1112
-
1113
- def extract_entities(text: str) -> Tuple[Set[str], Set[str]]:
1114
- """
1115
- Extract entities and numbers from text.
1116
-
1117
- Uses spaCy if available, falls back to regex.
1118
-
1119
- Returns:
1120
- (entities, numbers) - Sets of protected entities and numbers
1121
- """
1122
- nlp = _get_spacy_model()
1123
-
1124
- if nlp == "fallback":
1125
- return extract_entities_regex(text)
1126
- else:
1127
- return extract_entities_spacy(text, nlp)
1128
-
1129
-
1130
- def compute_salience(sentence: str, position: int, total_sentences: int) -> float:
1131
- """
1132
- Compute salience score for a sentence.
1133
-
1134
- Factors:
1135
- - Position: Earlier sentences weighted higher (first paragraph effect)
1136
- - Length: Moderate length preferred (too short = filler, too long = verbose)
1137
- - Entity density: More entities = more information-dense
1138
- - Numbers: Presence of numbers = factual content
1139
-
1140
- Returns:
1141
- Salience score (0.0 to 1.0, higher = more important)
1142
- """
1143
- score = 0.0
1144
-
1145
- # Position-based (exponential decay)
1146
- position_weight = np.exp(-position / (total_sentences * 0.3))
1147
- score += position_weight * 0.3
1148
-
1149
- # Length-based (optimal ~50-150 chars)
1150
- length = len(sentence)
1151
- if 50 <= length <= 150:
1152
- length_weight = 1.0
1153
- elif length < 50:
1154
- length_weight = length / 50
1155
- else:
1156
- length_weight = 150 / length
1157
- score += length_weight * 0.2
1158
-
1159
- # Entity density (basic heuristic: count capitalized words)
1160
- words = sentence.split()
1161
- cap_words = sum(1 for w in words if w and w[0].isupper())
1162
- entity_density = min(cap_words / max(len(words), 1), 1.0)
1163
- score += entity_density * 0.3
1164
-
1165
- # Number presence
1166
- has_numbers = bool(re.search(r'\d', sentence))
1167
- score += 0.2 if has_numbers else 0.0
1168
-
1169
- return min(score, 1.0)
1170
-
1171
-
1172
- def compute_char_3gram_jaccard(text1: str, text2: str) -> float:
1173
- """
1174
- Compute character 3-gram Jaccard similarity.
1175
- Captures boilerplate and tight phrasing that embeddings might miss.
1176
-
1177
- Returns:
1178
- Jaccard similarity [0, 1]
1179
- """
1180
- def get_3grams(text):
1181
- text = text.lower()
1182
- return set(text[i:i+3] for i in range(len(text) - 2))
1183
-
1184
- grams1 = get_3grams(text1)
1185
- grams2 = get_3grams(text2)
1186
-
1187
- if not grams1 or not grams2:
1188
- return 0.0
1189
-
1190
- intersection = len(grams1 & grams2)
1191
- union = len(grams1 | grams2)
1192
-
1193
- return intersection / union if union > 0 else 0.0
1194
-
1195
-
1196
- def compute_similarity(emb1: np.ndarray, emb2: np.ndarray) -> float:
1197
- """
1198
- Compute cosine similarity between two embeddings.
1199
- Assumes embeddings are L2-normalized (unit vectors), so cosine = dot product.
1200
- """
1201
- return np.dot(emb1, emb2)
1202
-
1203
-
1204
- def are_sentences_similar(sent1: Sentence, sent2: Sentence, semantic_threshold: float = 0.60) -> bool:
1205
- """
1206
- Check if two sentences are similar using semantic + lexical signals.
1207
-
1208
- - Semantic: cosine similarity on embeddings
1209
- - Lexical fallback: 3-gram Jaccard for short sentences (≤120 chars)
1210
-
1211
- Args:
1212
- sent1, sent2: Sentence objects with embeddings
1213
- semantic_threshold: Threshold for semantic similarity
1214
-
1215
- Returns:
1216
- True if similar, False otherwise
1217
- """
1218
- # Primary: semantic similarity
1219
- semantic_sim = compute_similarity(sent1.embedding, sent2.embedding)
1220
- if semantic_sim >= semantic_threshold:
1221
- return True
1222
-
1223
- # Fallback: lexical for short sentences (captures boilerplate)
1224
- max_len = max(len(sent1.text), len(sent2.text))
1225
- if max_len <= 120: # ~30 tokens
1226
- lexical_sim = compute_char_3gram_jaccard(sent1.text, sent2.text)
1227
- if lexical_sim >= 0.82: # High Jaccard = tight phrasing match
1228
- return True
1229
-
1230
- return False
1231
-
1232
-
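A short sketch of the lexical fallback used by are_sentences_similar for short boilerplate (no specific Jaccard value is asserted):

# Illustrative only: two cookie-banner style strings share most character 3-grams.
from dasein.pipecleaner import compute_char_3gram_jaccard

a = "Click here to accept all cookies and continue to the site."
b = "Click here to accept cookies and continue to this site."
print(compute_char_3gram_jaccard(a, b))
# are_sentences_similar() marks two sentences as duplicates when the embedding cosine
# is >= 0.60, or, for spans <= 120 chars, when this 3-gram Jaccard is >= 0.82.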
1233
- def build_sentence_objects(sentences_text: List[str], embeddings: np.ndarray) -> List[Sentence]:
1234
- """
1235
- Build Sentence objects with metadata.
1236
-
1237
- Args:
1238
- sentences_text: List of sentence strings
1239
- embeddings: Numpy array of embeddings (N x 384)
1240
-
1241
- Returns:
1242
- List of Sentence objects with computed metadata
1243
- """
1244
- sentence_objects = []
1245
- total = len(sentences_text)
1246
-
1247
- for i, text in enumerate(sentences_text):
1248
- # Generate ID
1249
- sent_id = hashlib.md5(text.encode()).hexdigest()[:8]
1250
-
1251
- # Extract entities
1252
- entities, numbers = extract_entities(text)
1253
-
1254
- # Compute salience
1255
- salience = compute_salience(text, i, total)
1256
-
1257
- sentence_objects.append(Sentence(
1258
- id=sent_id,
1259
- text=text,
1260
- embedding=embeddings[i],
1261
- entities=entities,
1262
- numbers=numbers,
1263
- salience=salience,
1264
- position=i
1265
- ))
1266
-
1267
- return sentence_objects
1268
-
1269
-
1270
- def greedy_max_independent_set(
1271
- sentences: List[Sentence],
1272
- similarity_threshold: float = 0.60,
1273
- verbose: bool = True,
1274
- precomputed_degree_map: Dict = None
1275
- ) -> List[Sentence]:
1276
- """
1277
- Greedy maximum-independent-set selection with degree×length-aware ordering.
1278
-
1279
- Algorithm:
1280
- 1. Compute degree (# of similar neighbors) for each sentence
1281
-    2. Sort by (token_length × degree) ASCENDING → short, unique sentences are kept first
1282
-    3. Pick the lowest degree×length sentence (short + unique → highest value to keep)
1283
- 4. Remove all similar neighbors (similarity > threshold)
1284
- 5. Check removed sentences for unique entities
1285
- 6. If removed sentence has unique entities, re-add it (HARD GUARD)
1286
- 7. Repeat until all sentences processed
1287
-
1288
-    This preserves coverage while ejecting long, low-value redundant sentences → bigger trims without raising the similarity bar.
1289
-
1290
- Args:
1291
- sentences: List of Sentence objects
1292
-        similarity_threshold: Similarity threshold for edge creation (default 0.60 = 60% similar)
1293
- verbose: Print debug info
1294
-
1295
- Returns:
1296
- List of selected Sentence objects (deduplicated)
1297
- """
1298
- if verbose:
1299
- print(f"\n[PIPECLEANER] Starting degree×length-aware greedy max-independent-set")
1300
- print(f"[PIPECLEANER] Input: {len(sentences)} sentences")
1301
- print(f"[PIPECLEANER] Similarity threshold: {similarity_threshold}")
1302
-
1303
- # Step 1: Use precomputed degree map (or compute if not provided)
1304
- if precomputed_degree_map is None:
1305
- # Compute degree (# of connections) for each sentence
1306
- # Use hybrid similarity: semantic (0.60) OR lexical (0.82 Jaccard for short spans)
1307
- degree_map = {}
1308
- for sent in sentences:
1309
- degree = 0
1310
- for other in sentences:
1311
- if sent.id != other.id:
1312
- # Hybrid check: semantic OR lexical
1313
- if are_sentences_similar(sent, other, semantic_threshold=similarity_threshold):
1314
- degree += 1
1315
- degree_map[sent.id] = degree
1316
-
1317
- # Sanity checks (as requested)
1318
- isolates = [s for s in sentences if degree_map[s.id] == 0]
1319
- non_isolates = [s for s in sentences if degree_map[s.id] > 0]
1320
- pct_isolates = len(isolates) / len(sentences) * 100 if sentences else 0
1321
- avg_degree_non_iso = sum(degree_map[s.id] for s in non_isolates) / len(non_isolates) if non_isolates else 0
1322
-
1323
- if verbose:
1324
- avg_degree = sum(degree_map.values()) / len(degree_map) if degree_map else 0
1325
- print(f"[PIPECLEANER] Degree stats: avg={avg_degree:.1f}, isolates={pct_isolates:.1f}%, non-isolate avg={avg_degree_non_iso:.1f}")
1326
- print(f"[PIPECLEANER] Sanity: isolates {pct_isolates:.0f}% (expect <20%), non-isolate avg {avg_degree_non_iso:.1f} (expect >3)")
1327
- else:
1328
- # Use precomputed degree map (more efficient)
1329
- degree_map = precomputed_degree_map
1330
-
1331
- # Step 2: Sort by (token_length × degree) ASCENDING
1332
- # LOW degree×length = short + unique → keep first (high value)
1333
- # HIGH degree×length = long + redundant → eject (low value)
1334
- def sort_key(s):
1335
- token_len = estimate_tokens(s.text)
1336
- degree = degree_map[s.id]
1337
- return token_len * degree
1338
-
1339
- # Sort ASCENDING - pick short unique sentences first
1340
- sorted_sentences = sorted(sentences, key=sort_key, reverse=False)
1341
-
1342
- if verbose:
1343
- top_5 = sorted_sentences[:5]
1344
- print(f"[PIPECLEANER] Top 5 to keep (low degree×length = short + unique):")
1345
- for i, s in enumerate(top_5, 1):
1346
- score = sort_key(s)
1347
- print(f" {i}. {estimate_tokens(s.text)}tok × {degree_map[s.id]}deg = {score:.0f} | '{s.text[:60]}...'")
1348
-
1349
-
1350
- selected = []
1351
- remaining = sorted_sentences.copy()
1352
- entity_coverage = set()
1353
- iteration = 0
1354
-
1355
- while remaining:
1356
- iteration += 1
1357
-        # Pick lowest degree×length sentence (short + unique → highest keep value)
1358
- best = remaining[0]
1359
-
1360
- if verbose and iteration <= 5: # Print first 5 iterations
1361
- score = sort_key(best)
1362
- print(f"\n[PIPECLEANER] Iteration {iteration}:")
1363
- print(f" Selected: '{best.text[:80]}...'")
1364
- print(f" Degree×Length: {estimate_tokens(best.text)}tok × {degree_map[best.id]}deg = {score:.0f}")
1365
- print(f" Entities: {best.protected_entities}")
1366
-
1367
- # Add to selected
1368
- selected.append(best)
1369
- entity_coverage |= best.protected_entities
1370
-
1371
- # Remove from remaining
1372
- remaining.remove(best)
1373
-
1374
- # Find similar neighbors to remove (using hybrid similarity)
1375
- to_remove = []
1376
- for candidate in remaining:
1377
- if are_sentences_similar(best, candidate, semantic_threshold=similarity_threshold):
1378
- # Get semantic sim for logging
1379
- sem_sim = compute_similarity(best.embedding, candidate.embedding)
1380
- to_remove.append((candidate, sem_sim))
1381
-
1382
- if verbose and iteration <= 5 and to_remove:
1383
- print(f" Removing {len(to_remove)} similar sentences (similarity >= {similarity_threshold})")
1384
-
1385
- # Remove similar sentences
1386
- for candidate, sim in to_remove:
1387
- remaining.remove(candidate)
1388
-
1389
- # HARD GUARD: Check removed sentences for unique entities
1390
- # Only re-add if they have MULTIPLE (3+) meaningful unique entities
1391
- # This prevents re-adding for trivial differences
1392
- re_added = 0
1393
- for candidate, sim in to_remove:
1394
- unique_entities = candidate.protected_entities - entity_coverage
1395
-
1396
- # Require at least 3 unique entities OR at least 1 unique multi-word entity
1397
- multi_word_entities = {e for e in unique_entities if ' ' in e or len(e) > 10}
1398
- should_readd = len(unique_entities) >= 3 or len(multi_word_entities) >= 1
1399
-
1400
- if should_readd:
1401
- if verbose and iteration <= 5:
1402
- print(f" ⚠️ RE-ADDING sentence with {len(unique_entities)} unique entities: {unique_entities}")
1403
- print(f" Text: '{candidate.text[:80]}...'")
1404
- selected.append(candidate)
1405
- entity_coverage |= candidate.protected_entities
1406
- re_added += 1
1407
-
1408
- if verbose and iteration <= 5 and re_added:
1409
- print(f" Re-added {re_added} sentences to preserve entity coverage")
1410
-
1411
- if verbose:
1412
- print(f"\n[PIPECLEANER] Selection complete:")
1413
- print(f" Input: {len(sentences)} sentences")
1414
- print(f" Output: {len(selected)} sentences")
1415
- print(f" Reduction: {(1 - len(selected)/len(sentences))*100:.1f}%")
1416
- print(f" Entities preserved: {len(entity_coverage)}")
1417
-
1418
- return selected
1419
-
1420
-
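A minimal sketch of the greedy selector above on hand-built Sentence objects (the unit-vector embeddings are synthetic stand-ins for MiniLM embeddings; import path assumed):

# Illustrative only.
import numpy as np
from dasein.pipecleaner import Sentence, greedy_max_independent_set

e1 = np.array([1.0, 0.0, 0.0])                      # two near-identical directions...
e2 = np.array([0.995, 0.0999, 0.0]); e2 /= np.linalg.norm(e2)
e3 = np.array([0.0, 1.0, 0.0])                      # ...and one unrelated direction

sents = [
    Sentence(id="s1", text="Python 3.12 improves error messages.", embedding=e1, position=0),
    Sentence(id="s2", text="Error messages are improved in Python 3.12.", embedding=e2, position=1),
    Sentence(id="s3", text="The conference takes place in Lisbon.", embedding=e3, position=2),
]

kept = greedy_max_independent_set(sents, similarity_threshold=0.60, verbose=False)
print([s.id for s in kept])  # one of the two near-duplicates should be dropped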
1421
- def deduplicate_search_results(
1422
- text: str,
1423
- similarity_threshold: float = 0.60,
1424
- verbose: bool = True,
1425
- cached_model=None
1426
- ) -> Tuple[str, Dict, Any]:
1427
- """
1428
- Main entry point: Deduplicate search results using graph-based approach.
1429
-
1430
- Args:
1431
- text: Raw search results text
1432
- similarity_threshold: Cosine similarity threshold (0.60 catches cross-site paraphrases at 0.55-0.68)
1433
- verbose: Print debug info
1434
- cached_model: Optional cached embedding model to reuse
1435
-
1436
- Returns:
1437
- Tuple of (deduplicated_text, stats_dict, embedding_model)
1438
- stats_dict contains: {
1439
- 'original_chars': int,
1440
- 'deduplicated_chars': int,
1441
- 'original_sentences': int,
1442
- 'deduplicated_sentences': int,
1443
- 'prune_pct': float,
1444
- 'original_tokens': int,
1445
- 'deduplicated_tokens': int,
1446
- 'tokens_saved': int,
1447
- 'entity_coverage_pct': float,
1448
- 'entities_total': int,
1449
- 'entities_preserved': int
1450
- }
1451
- """
1452
- if verbose:
1453
- print(f"\n{'='*70}")
1454
- print(f"[PIPECLEANER] DEDUPLICATION STARTED")
1455
- print(f"{'='*70}")
1456
- print(f"[PIPECLEANER] Input text: {len(text)} chars, ~{len(text.split())} words")
1457
-
1458
- # Step 1: Split into sentences
1459
- sentences_text = split_into_sentences(text)
1460
-
1461
- if verbose:
1462
- print(f"[PIPECLEANER] Split into {len(sentences_text)} sentences")
1463
-
1464
- # Initialize stats
1465
- stats = {
1466
- 'original_chars': len(text),
1467
- 'deduplicated_chars': len(text),
1468
- 'original_sentences': len(sentences_text),
1469
- 'deduplicated_sentences': len(sentences_text),
1470
- 'prune_pct': 0.0,
1471
- 'original_tokens': int(len(text) / 4),
1472
- 'deduplicated_tokens': int(len(text) / 4),
1473
- 'tokens_saved': 0,
1474
- 'entity_coverage_pct': 100.0,
1475
- 'entities_total': 0,
1476
- 'entities_preserved': 0
1477
- }
1478
-
1479
- if len(sentences_text) == 0:
1480
- if verbose:
1481
- print(f"[PIPECLEANER] ⚠️ No sentences found, returning original text")
1482
- return text, stats, cached_model
1483
-
1484
- if len(sentences_text) == 1:
1485
- if verbose:
1486
- print(f"[PIPECLEANER] Only 1 sentence, skipping deduplication")
1487
- return text, stats, cached_model
1488
-
1489
- # Step 2: Compute embeddings
1490
- # Always use the thread-safe singleton model
1491
- model = _get_embedding_model()
1492
-
1493
- if verbose:
1494
- print(f"[PIPECLEANER] Computing embeddings...")
1495
-
1496
- # L2 normalize embeddings so cosine similarity = dot product (faster)
1497
- embeddings = model.encode(sentences_text, show_progress_bar=False, normalize_embeddings=True)
1498
-
1499
- if verbose:
1500
- print(f"[PIPECLEANER] Embeddings computed: shape {embeddings.shape}")
1501
-
1502
- # Step 3: Build sentence objects with metadata
1503
- sentences = build_sentence_objects(sentences_text, embeddings)
1504
-
1505
- # Calculate total entities across all sentences
1506
- all_entities = set()
1507
- for sent in sentences:
1508
- all_entities |= sent.protected_entities
1509
-
1510
- # Step 4: Run greedy max-independent-set selection
1511
- selected = greedy_max_independent_set(sentences, similarity_threshold, verbose)
1512
-
1513
- # Calculate preserved entities
1514
- preserved_entities = set()
1515
- for sent in selected:
1516
- preserved_entities |= sent.protected_entities
1517
-
1518
- # Step 5: Reconstruct text preserving original order
1519
- selected_by_position = sorted(selected, key=lambda s: s.position)
1520
- deduplicated_text = '\n\n'.join(s.text for s in selected_by_position)
1521
-
1522
- # Calculate stats
1523
- stats['deduplicated_chars'] = len(deduplicated_text)
1524
- stats['deduplicated_sentences'] = len(selected)
1525
- stats['prune_pct'] = (1 - len(selected) / len(sentences_text)) * 100 if len(sentences_text) > 0 else 0
1526
- stats['deduplicated_tokens'] = int(len(deduplicated_text) / 4)
1527
- stats['tokens_saved'] = stats['original_tokens'] - stats['deduplicated_tokens']
1528
- stats['entities_total'] = len(all_entities)
1529
- stats['entities_preserved'] = len(preserved_entities)
1530
- stats['entity_coverage_pct'] = (len(preserved_entities) / len(all_entities) * 100) if len(all_entities) > 0 else 100.0
1531
-
1532
- if verbose:
1533
- print(f"\n[PIPECLEANER] DEDUPLICATION COMPLETE")
1534
- print(f" Input: {len(text)} chars")
1535
- print(f" Output: {len(deduplicated_text)} chars")
1536
- print(f" Reduction: {(1 - len(deduplicated_text)/len(text))*100:.1f}%")
1537
- print(f" Sentences: {len(sentences_text)} → {len(selected)}")
1538
- print(f"{'='*70}\n")
1539
-
1540
- return deduplicated_text, stats, model
1541
-
1542
-
1543
- # ============================================================================
1544
- # CONVENIENCE FUNCTIONS
1545
- # ============================================================================
1546
-
1547
- def estimate_tokens(text: str) -> int:
1548
- """Rough estimate of token count (words / 0.75)."""
1549
- return int(len(text.split()) / 0.75)
1550
-
1551
-
1552
- def should_deduplicate(text: str, min_length: int = 500) -> bool:
1553
- """
1554
- Check if text is worth deduplicating.
1555
-
1556
- Args:
1557
- text: Input text
1558
- min_length: Minimum character length to bother deduplicating
1559
-
1560
- Returns:
1561
- True if text should be deduplicated
1562
- """
1563
- return len(text) >= min_length
1564
-
1565
-
1566
- def apply_pipecleaner_if_applicable(tool_name: str, output_str: str, selected_rules: list, cached_model=None) -> Tuple[str, any]:
1567
- """
1568
- High-level function to check for filter search rules and apply deduplication.
1569
-
1570
- This is called from capture.py's on_tool_end callback.
1571
-
1572
- Args:
1573
- tool_name: Name of the tool that just finished
1574
- output_str: Raw output from the tool
1575
- selected_rules: List of rules selected for this run
1576
- cached_model: Optional cached embedding model to reuse across searches
1577
-
1578
- Returns:
1579
- Tuple of (deduplicated_output, embedding_model) for caching
1580
- Returns (original_output, None) if no filter rule applies
1581
- """
1582
- try:
1583
- # Find applicable filter search rules for this tool
1584
- filter_rules = _find_filter_search_rules(tool_name, selected_rules)
1585
-
1586
- # If we found applicable filter rules, apply deduplication
1587
- if filter_rules:
1588
- print(f"\n{'='*70}")
1589
- print(f"[PIPECLEANER] 🧹 FILTER SEARCH RULE DETECTED")
1590
- print(f"{'='*70}")
1591
- print(f"[PIPECLEANER] Tool: {tool_name}")
1592
- print(f"[PIPECLEANER] Rules matched: {len(filter_rules)}")
1593
- for rule in filter_rules:
1594
- rule_id = getattr(rule, 'id', 'unknown')
1595
- advice = getattr(rule, 'advice', '') or getattr(rule, 'advice_text', '')
1596
- print(f"[PIPECLEANER] - Rule {rule_id}: {advice[:80]}...")
1597
- print(f"{'='*70}")
1598
-
1599
- # Apply deduplication with cached model
1600
- deduplicated, stats, model = deduplicate_search_results(
1601
- text=output_str,
1602
- similarity_threshold=0.60, # 0.60 catches cross-site paraphrases (0.55-0.68 typical)
1603
- verbose=True, # Show detailed deduplication stats
1604
- cached_model=cached_model # Reuse model if available
1605
- )
1606
-
1607
- # Print comprehensive stats after every search
1608
- print(f"\n{'='*70}")
1609
- print(f"[PIPECLEANER] 📊 DEDUPLICATION RESULTS")
1610
- print(f"{'='*70}")
1611
- print(f"[PIPECLEANER] 🔢 Sentences:")
1612
- print(f"[PIPECLEANER] Original: {stats['original_sentences']} sentences")
1613
- print(f"[PIPECLEANER] Deduplicated: {stats['deduplicated_sentences']} sentences")
1614
- print(f"[PIPECLEANER] Prune %: {stats['prune_pct']:.1f}% removed")
1615
- print(f"[PIPECLEANER]")
1616
- print(f"[PIPECLEANER] 🎯 Entity Coverage:")
1617
- print(f"[PIPECLEANER] Total entities: {stats['entities_total']}")
1618
- print(f"[PIPECLEANER] Entities preserved: {stats['entities_preserved']}")
1619
- print(f"[PIPECLEANER] Coverage: {stats['entity_coverage_pct']:.1f}%")
1620
- print(f"[PIPECLEANER]")
1621
- print(f"[PIPECLEANER] 💰 Token Savings (len/4):")
1622
- print(f"[PIPECLEANER] Original tokens: {stats['original_tokens']:,} tokens")
1623
- print(f"[PIPECLEANER] Deduplicated: {stats['deduplicated_tokens']:,} tokens")
1624
- print(f"[PIPECLEANER] Tokens saved: {stats['tokens_saved']:,} tokens ({(stats['tokens_saved']/stats['original_tokens']*100 if stats['original_tokens'] > 0 else 0):.1f}%)")
1625
- print(f"[PIPECLEANER]")
1626
- print(f"[PIPECLEANER] ✅ SUCCESS: Pruned {stats['prune_pct']:.1f}% redundancy, preserved {stats['entity_coverage_pct']:.1f}% entities")
1627
- print(f"{'='*70}\n")
1628
-
1629
- return deduplicated, model
1630
-
1631
- # No filter rules found, return original
1632
- return output_str, None
1633
-
1634
- except ImportError as e:
1635
- print(f"\n{'='*70}")
1636
- print(f"[PIPECLEANER] ❌ IMPORT ERROR - FAILING OPEN")
1637
- print(f"{'='*70}")
1638
- print(f"[PIPECLEANER] Error: {e}")
1639
- print(f"[PIPECLEANER] Install: pip install sentence-transformers")
1640
- print(f"{'='*70}\n")
1641
- return output_str, None
1642
- except Exception as e:
1643
- print(f"\n{'='*70}")
1644
- print(f"[PIPECLEANER] EXCEPTION - FAILING OPEN")
1645
- print(f"{'='*70}")
1646
- print(f"[PIPECLEANER] Error type: {type(e).__name__}")
1647
- print(f"[PIPECLEANER] Error message: {e}")
1648
- import traceback
1649
- print(f"[PIPECLEANER] Traceback:")
1650
- traceback.print_exc()
1651
- print(f"{'='*70}\n")
1652
- return output_str, None
1653
-
1654
-
1655
- def _find_filter_search_rules(tool_name: str, selected_rules: list) -> list:
1656
- """
1657
- Find llm_start scoped rules with "filter search" keywords that apply to this tool.
1658
-
1659
- This is called from on_llm_start when a Summary tool's LLM is about to be called.
1660
- Rule synthesis will generate rules scoped to llm_start when it detects search→summary patterns.
1661
-
1662
- Args:
1663
- tool_name: Name of the tool whose LLM is starting (e.g., 'Summary')
1664
- selected_rules: List of rules to search through
1665
-
1666
- Returns:
1667
- List of applicable filter search rules
1668
- """
1669
- filter_rules = []
1670
-
1671
- for rule_meta in selected_rules:
1672
- # Unwrap tuple if needed (rules come as (rule, metadata) from select_rules)
1673
- if isinstance(rule_meta, tuple) and len(rule_meta) == 2:
1674
- rule_obj, _metadata = rule_meta
1675
- else:
1676
- rule_obj = rule_meta
1677
-
1678
- # Check if this is an llm_start scoped rule
1679
- target_step_type = getattr(rule_obj, 'target_step_type', None)
1680
-
1681
- # Must be scoped to llm_start (where we intercept Summary LLM calls)
1682
- if target_step_type != 'llm_start':
1683
- continue
1684
-
1685
- # Check if the rule contains "filter search" keywords
1686
- # Try both field names that might be used
1687
- advice = getattr(rule_obj, 'advice_text', None) or getattr(rule_obj, 'advice', None) or ''
1688
- advice_lower = advice.lower() if advice else ''
1689
-
1690
- if not advice_lower or 'filter' not in advice_lower or 'search' not in advice_lower:
1691
- continue
1692
-
1693
- # Check if the rule applies to this tool
1694
- applies = _rule_applies_to_tool(rule_obj, tool_name, advice_lower)
1695
-
1696
- if applies:
1697
- filter_rules.append(rule_obj)
1698
-
1699
- return filter_rules
1700
-
1701
-
1702
- def _rule_applies_to_tool(rule_obj, tool_name: str, advice_lower: str) -> bool:
1703
- """
1704
- Check if a rule applies to the given tool.
1705
-
1706
- Args:
1707
- rule_obj: Rule object or dict to check
1708
- tool_name: Name of the tool (case-insensitive)
1709
- advice_lower: Lowercased advice text for fallback matching
1710
-
1711
- Returns:
1712
- True if rule applies to this tool
1713
- """
1714
- # Wildcard matches everything (used for initial check)
1715
- if tool_name == "*":
1716
- return True
1717
-
1718
- tool_name_lower = tool_name.lower()
1719
-
1720
- # Extract references.tools from rule (handle both dict and object formats)
1721
- if isinstance(rule_obj, dict):
1722
- references = rule_obj.get('references', {})
1723
- tools = references.get('tools', []) if isinstance(references, dict) else []
1724
- else:
1725
- references = getattr(rule_obj, 'references', None)
1726
- if references:
1727
- # Try both object attribute and dict access for tools
1728
- if hasattr(references, 'tools'):
1729
- tools = references.tools
1730
- elif isinstance(references, dict):
1731
- tools = references.get('tools', [])
1732
- else:
1733
- tools = []
1734
- else:
1735
- tools = []
1736
-
1737
- if tools:
1738
- # Check if tool_name matches any tool in references.tools (case-insensitive exact match)
1739
- for ref_tool in tools:
1740
- ref_tool_lower = ref_tool.lower()
1741
- if tool_name_lower == ref_tool_lower:
1742
- return True
1743
- # No match found in references.tools
1744
- return False
1745
- else:
1746
- # Rule has no tools list - don't apply to anything (be conservative)
1747
- return False
1748
-
1749
-
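The rule-matching helper above keys off references.tools. A minimal standalone sketch of that check follows; the rule dict and tool name are illustrative placeholders, not a real synthesized rule.

# Illustrative rule shape consumed by _find_filter_search_rules / _rule_applies_to_tool.
# Values below are placeholders for demonstration only.
rule = {
    "target_step_type": "llm_start",
    "advice": "Filter search results before summarizing.",
    "references": {"tools": ["Summary"]},
}
tool_name = "summary"
tools = rule.get("references", {}).get("tools", [])
applies = any(tool_name.lower() == t.lower() for t in tools)
print(applies)  # True: case-insensitive exact match against references.tools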
1750
- async def run_pipecleaner_enforcement(
1751
- messages_or_prompts: tuple,
1752
- callback_handler: any,
1753
- patch_depth: any
1754
- ) -> bool:
1755
- """
1756
- Main pipecleaner enforcement logic - parallel to run_microturn_enforcement.
1757
-
1758
- This intercepts ToolMessage objects and applies deduplication.
1759
-
1760
- Args:
1761
- messages_or_prompts: Args tuple from _generate (first element is messages)
1762
- callback_handler: DaseinCallbackHandler with rules
1763
- patch_depth: Thread-local object with caching
1764
-
1765
- Returns:
1766
- True if enforcement was applied, False if skipped
1767
- """
1768
- try:
1769
- print(f"[PIPECLEANER] 🧹 run_pipecleaner_enforcement called")
1770
-
1771
- if not callback_handler or not hasattr(callback_handler, '_selected_rules'):
1772
- return False
1773
-
1774
- rules = callback_handler._selected_rules
1775
- print(f"[PIPECLEANER] Found {len(rules)} rules")
1776
-
1777
- filter_rules = _find_filter_search_rules("*", rules)
1778
- if not filter_rules:
1779
- return False
1780
-
1781
- print(f"[PIPECLEANER] 🎯 Found {len(filter_rules)} filter search rules!")
1782
-
1783
- # Extract messages from args
1784
- if not messages_or_prompts or len(messages_or_prompts) == 0:
1785
- return False
1786
-
1787
- messages = messages_or_prompts[0]
1788
- if not isinstance(messages, list):
1789
- return False
1790
-
1791
- # Find the most recent ToolMessage (tool result)
1792
- tool_message = None
1793
- for idx in range(len(messages) - 1, -1, -1):
1794
- msg = messages[idx]
1795
- msg_type = getattr(msg, 'type', None) or (msg.get('type') if isinstance(msg, dict) else None)
1796
- if msg_type == 'tool':
1797
- tool_message = msg
1798
- break
1799
-
1800
- if not tool_message:
1801
- return False
1802
-
1803
- # Extract tool name and content
1804
- tool_name = getattr(tool_message, 'name', None) or tool_message.get('name', 'unknown')
1805
- tool_content = str(getattr(tool_message, 'content', None) or tool_message.get('content', ''))
1806
-
1807
- print(f"[PIPECLEANER] Tool: {tool_name}, content: {len(tool_content)} chars")
1808
-
1809
- # Check if this tool matches our filter rules
1810
- matching_rules = _find_filter_search_rules(tool_name, rules)
1811
- if not matching_rules:
1812
- print(f"[PIPECLEANER] Tool '{tool_name}' doesn't match filter rules, skipping")
1813
- return False
1814
-
1815
- print(f"[PIPECLEANER] 🎯 Tool '{tool_name}' matches filter rules! Starting deduplication...")
1816
-
1817
- # Prevent infinite regression - check if we've already processed this exact message
1818
- if not hasattr(patch_depth, 'processed_tool_messages'):
1819
- patch_depth.processed_tool_messages = set()
1820
-
1821
- # Create signature from tool name + content hash
1822
- msg_signature = f"{tool_name}_{hash(tool_content[:200])}"
1823
- if msg_signature in patch_depth.processed_tool_messages:
1824
- print(f"[PIPECLEANER] Already processed this ToolMessage, skipping")
1825
- return False
1826
-
1827
- # Mark as processed
1828
- patch_depth.processed_tool_messages.add(msg_signature)
1829
-
1830
- # Apply deduplication
1831
- cached_model = getattr(callback_handler, '_pipecleaner_embedding_model', None)
1832
-
1833
- deduplicated, stats, model = deduplicate_search_results(
1834
- text=tool_content,
1835
- similarity_threshold=0.60, # Lowered to catch paraphrases
1836
- verbose=True,
1837
- cached_model=cached_model
1838
- )
1839
-
1840
- # Cache model
1841
- callback_handler._pipecleaner_embedding_model = model
1842
-
1843
- # Modify ToolMessage content IN PLACE
1844
- if hasattr(tool_message, 'content'):
1845
- tool_message.content = deduplicated
1846
- elif isinstance(tool_message, dict):
1847
- tool_message['content'] = deduplicated
1848
-
1849
- # Cache result for potential reuse
1850
- if not hasattr(patch_depth, 'tool_result_cache'):
1851
- patch_depth.tool_result_cache = {}
1852
-
1853
- result_key = f"{tool_name}_{hash(tool_content[:100])}"
1854
- patch_depth.tool_result_cache[result_key] = deduplicated
1855
-
1856
- print(f"[PIPECLEANER] ✅ Applied deduplication to {tool_name}")
1857
-
1858
- # Print stats
1859
- print(f"\n{'='*70}")
1860
- print(f"[PIPECLEANER] 📊 DEDUPLICATION RESULTS")
1861
- print(f"{'='*70}")
1862
- print(f"[PIPECLEANER] 🔢 Sentences:")
1863
- print(f"[PIPECLEANER] Original: {stats['original_sentences']} sentences")
1864
- print(f"[PIPECLEANER] Deduplicated: {stats['deduplicated_sentences']} sentences")
1865
- print(f"[PIPECLEANER] Prune %: {stats['prune_pct']:.1f}% removed")
1866
- print(f"[PIPECLEANER]")
1867
- print(f"[PIPECLEANER] 🎯 Entity Coverage:")
1868
- print(f"[PIPECLEANER] Total entities: {stats['entities_total']}")
1869
- print(f"[PIPECLEANER] Entities preserved: {stats['entities_preserved']}")
1870
- print(f"[PIPECLEANER] Coverage: {stats['entity_coverage_pct']:.1f}%")
1871
- print(f"[PIPECLEANER]")
1872
- print(f"[PIPECLEANER] 💰 Token Savings (len/4):")
1873
- print(f"[PIPECLEANER] Original tokens: {stats['original_tokens']:,} tokens")
1874
- print(f"[PIPECLEANER] Deduplicated: {stats['deduplicated_tokens']:,} tokens")
1875
- print(f"[PIPECLEANER] Tokens saved: {stats['tokens_saved']:,} tokens ({(stats['tokens_saved']/stats['original_tokens']*100 if stats['original_tokens'] > 0 else 0):.1f}%)")
1876
- print(f"[PIPECLEANER]")
1877
- print(f"[PIPECLEANER] ✅ SUCCESS: Pruned {stats['prune_pct']:.1f}% redundancy, preserved {stats['entity_coverage_pct']:.1f}% entities")
1878
- print(f"{'='*70}\n")
1879
-
1880
- return True
1881
-
1882
- except Exception as e:
1883
- print(f"[PIPECLEANER] ⚠️ Error during enforcement: {e}")
1884
- import traceback
1885
- traceback.print_exc()
1886
- return False
1887
-
1888
-
1889
- if __name__ == "__main__":
1890
- # Simple test
1891
- test_text = """
1892
- LangChain is a framework for developing applications powered by language models.
1893
- The LangChain framework enables developers to build LLM applications easily.
1894
- LangChain provides many useful features for LLM apps.
1895
- It supports multiple model providers including OpenAI and Anthropic.
1896
- The framework was created in 2022 by Harrison Chase.
1897
- LlamaIndex is another popular framework for LLM applications.
1898
- LlamaIndex focuses on data indexing and retrieval.
1899
- Both frameworks are open source and widely used.
1900
- """
1901
-
1902
- print("Testing pipecleaner deduplication...")
1903
- result, stats, model = deduplicate_search_results(test_text, verbose=True)
1904
-
1905
- print("\n" + "="*70)
1906
- print("STATS:")
1907
- print(f" Prune %: {stats['prune_pct']:.1f}%")
1908
- print(f" Entity Coverage: {stats['entity_coverage_pct']:.1f}%")
1909
- print(f" Tokens saved: {stats['tokens_saved']:,} ({(stats['tokens_saved']/stats['original_tokens']*100 if stats['original_tokens'] > 0 else 0):.1f}%)")
1910
-
1911
- print("\n" + "="*70)
1912
- print("ORIGINAL:")
1913
- print(test_text)
1914
- print("\n" + "="*70)
1915
- print("DEDUPLICATED:")
1916
- print(result)
1917
-
1
+ """
2
+ Pipecleaner: Run-scoped global corpus deduplication for multi-agent systems.
3
+
4
+ V2.0: Global ClusterBank with dynamic batching barrier (5-10s) for cross-prompt deduplication.
5
+ - Run-scoped corpus: All prompts in a run share a global ClusterBank
6
+ - SimHash near-dup matching: Hamming distance ≤6 for 64-bit fingerprints
7
+ - Dynamic barrier: 5s min, +2s per arrival (cap 10s), maximizes dedupe by collecting bursts
8
+ - Canonical ownership: First prompt to use a cluster owns it, others drop duplicates
9
+ - Entity coverage: 95% threshold RUN-LEVEL (cumulative across all batches, not per-batch)
10
+
11
+ Algorithm:
12
+ 1. Intercept prompt → split sentences → compute SimHash signatures
13
+ 2. Match against ClusterBank (Hamming ≤6) → assign cluster_id or create new
14
+ 3. Queue prompt into micro-batch, extend barrier (+2s per arrival, cap 10s)
15
+ 4. On timer: cross-prompt dedupe (keep only canonical owners)
16
+ 5. RUN-LEVEL entity coverage check (95% cumulative across entire run), re-add if needed
17
+ 6. Emit cleaned prompts (original sentence order preserved)
18
+
19
+ Expected savings: 50-90% char reduction with 95%+ entity coverage across entire run.
20
+ Later batches are MORE aggressive (earlier batches already covered entities).
21
+ """
22
+
23
+ import re
24
+ import hashlib
25
+ import threading
26
+ import time
27
+ from typing import List, Dict, Set, Tuple, Optional, Any
28
+ from dataclasses import dataclass, field
29
+ from collections import defaultdict
30
+ import numpy as np
31
+ import asyncio
32
+
33
+ # Type alias for return type
34
+ DeduplicationResult = Tuple[str, Dict]
35
+
36
+ # Lazy imports for performance (only load when needed)
37
+ _embedding_model = None
38
+ _spacy_nlp = None
39
+ _model_lock = threading.Lock() # Thread-safe singleton access
40
+
41
+
42
+ def _vprint(message: str, verbose: bool = False, force: bool = False):
43
+ """Helper function for verbose printing."""
44
+ if force or verbose:
45
+ print(message)
46
+
47
+
48
+ def _get_embedding_model():
49
+ """
50
+ Lazy load sentence transformer model (thread-safe singleton).
51
+ Forces CPU to avoid meta tensor issues on Win + Py3.13 + Torch.
52
+ """
53
+ global _embedding_model
54
+
55
+ # Double-checked locking pattern for performance
56
+ if _embedding_model is None:
57
+ with _model_lock:
58
+ # Check again inside lock (another thread might have loaded it)
59
+ if _embedding_model is None:
60
+ try:
61
+ from sentence_transformers import SentenceTransformer
62
+ print("[PIPECLEANER] Loading embedding model: all-MiniLM-L6-v2 (384-dim, ~80MB)...")
63
+ # Force CPU device to avoid meta tensor issues
64
+ _embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
65
+ print("[PIPECLEANER] ✅ Embedding model loaded successfully (CPU)")
66
+ except ImportError:
67
+ print("[PIPECLEANER] ⚠️ sentence-transformers not installed. Install: pip install sentence-transformers")
68
+ raise
69
+ except Exception as e:
70
+ print(f"[PIPECLEANER] ⚠️ Failed to load embedding model: {e}")
71
+ raise
72
+
73
+ return _embedding_model
74
+
75
+
76
+ def _get_spacy_model():
77
+ """Lazy load spaCy model for entity extraction."""
78
+ global _spacy_nlp
79
+ if _spacy_nlp is None:
80
+ try:
81
+ import spacy
82
+ print("[PIPECLEANER] Loading spaCy model: en_core_web_sm...")
83
+ _spacy_nlp = spacy.load("en_core_web_sm")
84
+ print("[PIPECLEANER] ✅ spaCy model loaded successfully")
85
+ except ImportError:
86
+ print("[PIPECLEANER] ⚠️ spaCy not installed. Using regex fallback for entities.")
87
+ _spacy_nlp = "fallback"
88
+ except OSError:
89
+ print("[PIPECLEANER] ⚠️ spaCy model not found. Using regex fallback for entities.")
90
+ _spacy_nlp = "fallback"
91
+ return _spacy_nlp
92
+
93
+
94
+ # ============================================================================
95
+ # Run-Scoped Global Corpus System V2.0
96
+ # ============================================================================
97
+
98
+ @dataclass
99
+ class SentenceCluster:
100
+ """Represents a cluster of similar sentences across the run."""
101
+ cluster_id: str
102
+ canonical_sentence: str
103
+ owner_prompt_id: str # First prompt to use this cluster
104
+ simhash: int # 64-bit SimHash fingerprint
105
+ salience: float
106
+ entities: Set[str]
107
+ first_seen_seq: int
108
+ length: int
109
+ embedding: Optional[np.ndarray] = None # Sentence embedding for cosine similarity
110
+
111
+ @dataclass
112
+ class PromptState:
113
+ """State for a single prompt in the batch."""
114
+ prompt_id: str
115
+ sentences: List[str]
116
+ cluster_ids: List[str] # parallel to sentences
117
+ original_order: List[int] # track reordering
118
+ entities: Set[str]
119
+ arrived_at: float
120
+
121
+ @dataclass
122
+ class RunCorpusTelemetry:
123
+ """Run-level statistics for the corpus."""
124
+ prompts_total: int = 0
125
+ sentences_total: int = 0
126
+ clusters_total: int = 0
127
+ cross_prompt_dups_removed: int = 0
128
+ chars_in: int = 0
129
+ chars_out: int = 0
130
+ tokens_saved: int = 0
131
+ entity_coverage_avg: float = 100.0
132
+ batches_processed: int = 0
133
+ avg_barrier_ms: float = 0.0
134
+ max_barrier_ms: float = 0.0
135
+ barrier_times: List[float] = field(default_factory=list)
136
+
137
+
138
+ def compute_simhash(text: str, hash_bits: int = 64) -> int:
139
+ """
140
+ Compute SimHash fingerprint for near-dup detection.
141
+
142
+ Args:
143
+ text: Input text
144
+ hash_bits: Hash size (64-bit default)
145
+
146
+ Returns:
147
+ Integer hash value
148
+ """
149
+ # Tokenize and compute feature hashes
150
+ tokens = re.findall(r'\b\w+\b', text.lower())
151
+ if not tokens:
152
+ return 0
153
+
154
+ # Initialize bit vector
155
+ v = [0] * hash_bits
156
+
157
+ for token in tokens:
158
+ # Hash each token
159
+ h = int(hashlib.md5(token.encode()).hexdigest(), 16)
160
+
161
+ # Update bit vector
162
+ for i in range(hash_bits):
163
+ if h & (1 << i):
164
+ v[i] += 1
165
+ else:
166
+ v[i] -= 1
167
+
168
+ # Generate final hash
169
+ fingerprint = 0
170
+ for i in range(hash_bits):
171
+ if v[i] > 0:
172
+ fingerprint |= (1 << i)
173
+
174
+ return fingerprint
175
+
176
+
177
+ def hamming_distance(hash1: int, hash2: int) -> int:
178
+ """Count differing bits between two hashes."""
179
+ return bin(hash1 ^ hash2).count('1')
180
+
181
+
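A minimal usage sketch of the two helpers above, assuming the class default Hamming threshold of 6; the sample sentences are illustrative only.

from dasein.pipecleaner import compute_simhash, hamming_distance

a = compute_simhash("LangChain is a framework for developing LLM applications.")
b = compute_simhash("LangChain is a framework for building LLM applications.")
distance = hamming_distance(a, b)
# Near-dup if the 64-bit fingerprints differ in at most 6 bits (the class default).
print(f"hamming={distance}, near_dup={distance <= 6}")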
182
+ class RunScopedCorpus:
183
+ """
184
+ Global corpus for a single run, with dynamic batching barrier.
185
+ All prompts in the run share this corpus for cross-prompt deduplication.
186
+
187
+ CONCURRENCY MODEL:
188
+ - All shared state (clusters, prompt_registry, run_entities, kept_entities, batch_queue)
189
+ is protected by `self.batch_lock` (threading.Lock)
190
+ - All reads iterate over snapshots (dict(...), list(...)) to avoid "dict changed size" errors
191
+ - All writes are atomic under lock (copy-on-write when possible)
192
+ - Re-entrancy guard in caller (DaseinCallbackHandler) prevents nested calls
193
+ - Background timer thread (_process_batch) acquires lock before any mutations
194
+ """
195
+
196
+ def __init__(self, run_id: str, hamming_threshold: int = 6, entity_coverage_min: float = 0.95, verbose: bool = False):
197
+ self.run_id = run_id
198
+ self.hamming_threshold = hamming_threshold
199
+ self.entity_coverage_min = entity_coverage_min
200
+ self.verbose = verbose # Gate debug logging
201
+
202
+ # Core state
203
+ self.clusters: Dict[str, SentenceCluster] = {} # cluster_id → cluster
204
+ self.simhash_index: Dict[int, List[str]] = defaultdict(list) # simhash → [cluster_ids]
205
+ self.prompt_registry: Dict[str, PromptState] = {} # prompt_id → state
206
+ self.entity_index: Dict[str, Set[str]] = defaultdict(set) # entity → {cluster_ids}
207
+
208
+ # Run-level entity tracking for global coverage
209
+ self.run_entities: Set[str] = set() # All entities seen across entire run
210
+ self.kept_entities: Set[str] = set() # All entities kept across all batches
211
+
212
+ # Batching state
213
+ self.batch_queue: List[str] = [] # [prompt_ids] waiting for barrier
214
+ self.batch_lock = threading.Lock() # Protects batch_queue, batch_timer, etc.
215
+ self.processing_lock = threading.Lock() # CRITICAL: Ensures only ONE batch processes at a time
216
+ self.batch_timer: Optional[threading.Timer] = None
217
+ self.batch_start_time: Optional[float] = None
218
+ self.barrier_duration: float = 5.0 # Start at 5s (min wait)
219
+ self.barrier_increment: float = 2.0 # Add 2s per new arrival
220
+ self.barrier_cap: float = 10.0 # Max 10s
221
+ self.batch_ready = threading.Event() # Signal when batch is processed
222
+ self.prompt_events: Dict[str, asyncio.Event] = {} # Per-prompt events for ASYNC sequential release
223
+
224
+ # Sequence tracking
225
+ self.next_seq = 0
226
+ self.next_cluster_id = 0
227
+
228
+ # Telemetry
229
+ self.telemetry = RunCorpusTelemetry()
230
+
231
+ _vprint(f"[CORPUS] 🏗️ Created run-scoped corpus for run_id={run_id[:8]} (barrier: 5s min, +2s/arrival, 10s cap)", self.verbose)
232
+
233
+ def _generate_cluster_id(self) -> str:
234
+ """Generate unique cluster ID."""
235
+ cluster_id = f"c{self.next_cluster_id:06d}"
236
+ self.next_cluster_id += 1
237
+ return cluster_id
238
+
239
+ def find_matching_cluster(self, simhash: int, sentence: str, sentence_embedding=None) -> Optional[str]:
240
+ """
241
+ Find existing cluster that matches this sentence using cosine similarity.
242
+
243
+ Args:
244
+ simhash: SimHash of the sentence (for indexing, not matching)
245
+ sentence: Original sentence text
246
+ sentence_embedding: Pre-computed embedding for this sentence
247
+
248
+ Returns:
249
+ cluster_id if match found, None otherwise
250
+ """
251
+ if sentence_embedding is None:
252
+ return None
253
+
254
+ # Check all existing clusters for semantic similarity
255
+ # Use cosine similarity ≥ 0.60 (catches cross-site paraphrases)
256
+ best_match_id = None
257
+ best_similarity = 0.60 # Threshold for considering duplicate (lowered to catch paraphrases)
258
+
259
+ # Snapshot clusters to avoid "dict changed size" errors (thread-safe read)
260
+ with self.batch_lock:
261
+ clusters_snapshot = dict(self.clusters)
262
+
263
+ for cluster_id, cluster in clusters_snapshot.items():
264
+ if cluster.canonical_sentence == sentence:
265
+ # Exact match
266
+ return cluster_id
267
+
268
+ # Hybrid similarity: semantic + lexical fallback for short sentences
269
+ if hasattr(cluster, 'embedding') and cluster.embedding is not None:
270
+ # Semantic similarity
271
+ similarity = np.dot(sentence_embedding, cluster.embedding)
272
+
273
+ # Lexical fallback for short sentences (boilerplate detection)
274
+ max_len = max(len(sentence), len(cluster.canonical_sentence))
275
+ if max_len <= 120 and similarity < 0.60:
276
+ lexical_sim = compute_char_3gram_jaccard(sentence, cluster.canonical_sentence)
277
+ if lexical_sim >= 0.82:
278
+ # Boost similarity to indicate match via lexical path
279
+ similarity = max(similarity, 0.82)
280
+
281
+ if similarity > best_similarity:
282
+ best_similarity = similarity
283
+ best_match_id = cluster_id
284
+
285
+ return best_match_id
286
+
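find_matching_cluster falls back to compute_char_3gram_jaccard for short sentences, but that helper is outside this hunk. The following is only an assumed sketch of a character 3-gram Jaccard similarity, not the packaged implementation.

def char_3gram_jaccard_sketch(a: str, b: str) -> float:
    # Assumed behavior: Jaccard overlap of character 3-gram sets.
    grams_a = {a[i:i + 3] for i in range(max(len(a) - 2, 0))}
    grams_b = {b[i:i + 3] for i in range(max(len(b) - 2, 0))}
    if not grams_a or not grams_b:
        return 0.0
    return len(grams_a & grams_b) / len(grams_a | grams_b)

# Short boilerplate often clears the 0.82 lexical bar even when the embedding
# similarity dips below the 0.60 semantic threshold.
print(char_3gram_jaccard_sketch("Accept all cookies to continue.",
                                "Accept all cookies to continue"))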
287
+ def add_sentence_to_corpus(self, sentence: str, prompt_id: str, salience: float, entities: Set[str]) -> str:
288
+ """
289
+ Add sentence to corpus or match to existing cluster.
290
+
291
+ Args:
292
+ sentence: Sentence text
293
+ prompt_id: Owner prompt
294
+ salience: Importance score
295
+ entities: Extracted entities
296
+
297
+ Returns:
298
+ cluster_id (new or matched)
299
+ """
300
+ # Compute SimHash
301
+ simhash = compute_simhash(sentence)
302
+
303
+ # Try to match existing cluster
304
+ existing_cluster_id = self.find_matching_cluster(simhash, sentence)
305
+
306
+ if existing_cluster_id:
307
+ # Matched existing cluster
308
+ return existing_cluster_id
309
+
310
+ # Create new cluster
311
+ cluster_id = self._generate_cluster_id()
312
+ cluster = SentenceCluster(
313
+ cluster_id=cluster_id,
314
+ canonical_sentence=sentence,
315
+ owner_prompt_id=prompt_id,
316
+ simhash=simhash,
317
+ salience=salience,
318
+ entities=entities,
319
+ first_seen_seq=self.next_seq,
320
+ length=len(sentence)
321
+ )
322
+
323
+ self.clusters[cluster_id] = cluster
324
+ self.simhash_index[simhash].append(cluster_id)
325
+
326
+ # Update entity index
327
+ for entity in entities:
328
+ self.entity_index[entity].add(cluster_id)
329
+
330
+ self.next_seq += 1
331
+ self.telemetry.clusters_total += 1
332
+
333
+ return cluster_id
334
+
335
+ async def enqueue_prompt(self, prompt_id: str, prompt_text: str) -> str:
336
+ """
337
+ Enqueue prompt for batched processing with dynamic barrier (ASYNC - allows parallel arrivals).
338
+
339
+ Args:
340
+ prompt_id: Unique prompt identifier
341
+ prompt_text: Full prompt text
342
+
343
+ Returns:
344
+ Deduplicated prompt text (after barrier)
345
+ """
346
+ arrival_time = time.time()
347
+
348
+ # Split into sentences
349
+ sentences = split_into_sentences(prompt_text)
350
+
351
+ if not sentences:
352
+ return prompt_text
353
+
354
+ self.telemetry.prompts_total += 1
355
+ self.telemetry.sentences_total += len(sentences)
356
+ self.telemetry.chars_in += len(prompt_text)
357
+
358
+ # CRITICAL: DO NOT compute embeddings here! It blocks async arrivals.
359
+ # Store raw sentences and compute embeddings in batch during _process_batch
360
+ all_entities = set()
361
+
362
+ for sentence in sentences:
363
+ # Extract entities (fast, non-blocking)
364
+ entities, numbers = extract_entities_regex(sentence)
365
+ all_entities.update(entities)
366
+ all_entities.update(numbers)
367
+
368
+ # Create prompt state (thread-safe mutation)
369
+ # NOTE: cluster_ids will be computed during batch processing (after embeddings)
370
+ with self.batch_lock:
371
+ prompt_state = PromptState(
372
+ prompt_id=prompt_id,
373
+ sentences=sentences,
374
+ cluster_ids=[], # Will be filled during _process_batch
375
+ original_order=list(range(len(sentences))),
376
+ entities=all_entities,
377
+ arrived_at=arrival_time
378
+ )
379
+
380
+ self.prompt_registry[prompt_id] = prompt_state
381
+
382
+ # Add to batch queue and manage barrier
383
+ # Create per-prompt ASYNC event for sequential release
384
+ prompt_ready = asyncio.Event()
385
+ self.prompt_events[prompt_id] = prompt_ready
386
+
387
+ with self.batch_lock:
388
+ self.batch_queue.append(prompt_id)
389
+
390
+ if self.batch_timer is None:
391
+ # First prompt in batch, start timer at 5s
392
+ self.batch_start_time = arrival_time
393
+ self.barrier_duration = 5.0
394
+ print(f"[CORPUS] ⏱️ Starting batch barrier: 5.0s (first prompt, min wait)")
395
+ self.batch_timer = threading.Timer(self.barrier_duration, self._process_batch)
396
+ self.batch_timer.start()
397
+ else:
398
+ # Extend barrier by +2s per arrival (capped at 10s)
399
+ elapsed = arrival_time - self.batch_start_time
400
+ new_duration = min(elapsed + self.barrier_increment, self.barrier_cap)
401
+
402
+ if new_duration > self.barrier_duration:
403
+ # Cancel old timer, start new one
404
+ self.batch_timer.cancel()
405
+ remaining = new_duration - elapsed
406
+ self.barrier_duration = new_duration
407
+ _vprint(f"[CORPUS] ⏱️ Extending barrier to {new_duration:.1f}s (+{remaining:.1f}s remaining, +{self.barrier_increment:.1f}s per arrival)", self.verbose)
408
+ self.batch_timer = threading.Timer(remaining, self._process_batch)
409
+ self.batch_timer.start()
410
+
411
+ # ASYNC wait for THIS prompt's individual event (allows other async tasks to proceed)
412
+ # Timeout must be generous to account for model loading on first batch
413
+ try:
414
+ await asyncio.wait_for(prompt_ready.wait(), timeout=30.0) # 30s max wait (model load + processing)
415
+ timed_out = False
416
+ except asyncio.TimeoutError:
417
+ timed_out = True
418
+
419
+ if timed_out:
420
+ # Fail open: return original text if batch processing hangs
421
+ print(f"[CORPUS] ⚠️ Timeout waiting for batch processing, returning original prompt")
422
+ self.telemetry.chars_out += len(prompt_text)
423
+ return prompt_text
424
+
425
+ # Retrieve deduplicated result
426
+ deduplicated_text = self._get_deduplicated_prompt(prompt_id)
427
+
428
+ if not deduplicated_text:
429
+ # Safety: if result is missing, return original
430
+ print(f"[CORPUS] ⚠️ Missing deduplicated result for prompt {prompt_id[:8]}, returning original")
431
+ self.telemetry.chars_out += len(prompt_text)
432
+ return prompt_text
433
+
434
+ self.telemetry.chars_out += len(deduplicated_text)
435
+
436
+ return deduplicated_text
437
+
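The barrier arithmetic in enqueue_prompt can be followed in isolation. A small sketch with illustrative arrival offsets, mirroring the 5 s minimum, +2 s extension, and 10 s cap above:

# Barrier starts at 5.0s; each later arrival extends it to min(elapsed + 2.0, 10.0).
barrier_duration = 5.0
arrival_offsets = [1.2, 3.5, 6.0]  # seconds after the first prompt (illustrative)
for elapsed in arrival_offsets:
    new_duration = min(elapsed + 2.0, 10.0)
    if new_duration > barrier_duration:
        barrier_duration = new_duration
print(f"final barrier: {barrier_duration:.1f}s")  # 8.0s for these offsets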
438
+ def _process_batch(self):
439
+ """Process current batch: cross-prompt dedupe, entity coverage check, emit (synchronous)."""
440
+ # CRITICAL: Acquire processing lock to prevent multiple batches from processing simultaneously
441
+ with self.processing_lock:
442
+ with self.batch_lock:
443
+ if not self.batch_queue:
444
+ # No prompts to process, just return (shouldn't happen)
445
+ return
446
+
447
+ batch_prompts = self.batch_queue.copy()
448
+ self.batch_queue.clear()
449
+ self.batch_timer = None
450
+
451
+ batch_duration_ms = (time.time() - self.batch_start_time) * 1000
452
+ self.telemetry.barrier_times.append(batch_duration_ms)
453
+ self.telemetry.batches_processed += 1
454
+
455
+ # Always show batch summary (key metric)
456
+ print(f"\n[CORPUS] 🔄 Processing batch: {len(batch_prompts)} prompts, barrier={batch_duration_ms:.0f}ms")
457
+
458
+ # Step 0: Compute embeddings for NEW prompts in this batch (BATCHED operation!)
459
+ # This is done ONCE for the entire batch, allowing parallel arrivals
460
+ _vprint(f"[CORPUS] 🧮 Computing embeddings for {len(batch_prompts)} new prompts...", self.verbose)
461
+ model = _get_embedding_model()
462
+
463
+ for prompt_id in batch_prompts:
464
+ prompt_state = self.prompt_registry[prompt_id]
465
+
466
+ if not prompt_state.cluster_ids: # Only process if not yet clustered
467
+ # Compute embeddings for all sentences in this prompt (batch operation)
468
+ sentence_embeddings = model.encode(prompt_state.sentences, show_progress_bar=False, normalize_embeddings=True)
469
+
470
+ # Match/create clusters for each sentence
471
+ cluster_ids = []
472
+ for i, sentence in enumerate(prompt_state.sentences):
473
+ # Compute salience
474
+ salience = len(sentence) / 100.0
475
+ salience += len(re.findall(r'\b[A-Z][a-z]+', sentence)) * 0.1
476
+
477
+ # Extract entities
478
+ entities, numbers = extract_entities_regex(sentence)
479
+
480
+ # Match against existing clusters
481
+ cluster_id = self.find_matching_cluster(0, sentence, sentence_embeddings[i])
482
+
483
+ if cluster_id is None:
484
+ # Create new cluster
485
+ with self.batch_lock:
486
+ cluster_id = self._generate_cluster_id()
487
+ simhash = compute_simhash(sentence)
488
+
489
+ cluster = SentenceCluster(
490
+ cluster_id=cluster_id,
491
+ canonical_sentence=sentence,
492
+ owner_prompt_id=prompt_id,
493
+ simhash=simhash,
494
+ salience=salience,
495
+ entities=entities | numbers,
496
+ first_seen_seq=self.next_seq,
497
+ length=len(sentence),
498
+ embedding=sentence_embeddings[i]
499
+ )
500
+
501
+ self.clusters[cluster_id] = cluster
502
+ self.next_seq += 1
503
+ self.telemetry.clusters_total += 1
504
+
505
+ cluster_ids.append(cluster_id)
506
+
507
+ # Update prompt state with cluster_ids
508
+ prompt_state.cluster_ids = cluster_ids
509
+
510
+ _vprint(f"[CORPUS] Embeddings computed and clusters assigned", self.verbose)
511
+
512
+ # Step 1: Collect ALL sentences from THE ENTIRE RUN (not just current batch!)
513
+ # This is critical for true run-scoped deduplication
514
+ all_sentences = []
515
+ sentence_to_prompt = {} # Map sentence_id → (prompt_id, index)
516
+ locked_sentences = set() # Sentences from previous batches (already emitted, can't remove)
517
+
518
+ # Iterate over ALL prompts in registry (including previous batches)
519
+ for prompt_id, prompt_state in self.prompt_registry.items():
520
+ is_previous_batch = prompt_id not in batch_prompts
521
+
522
+ for idx, (sentence_text, cluster_id) in enumerate(zip(prompt_state.sentences, prompt_state.cluster_ids)):
523
+ cluster = self.clusters.get(cluster_id)
524
+ if not cluster:
525
+ continue
526
+
527
+ # Create Sentence object for greedy algorithm
528
+ sent_id = f"{prompt_id}_{idx}"
529
+ sent_obj = Sentence(
530
+ id=sent_id,
531
+ text=sentence_text,
532
+ embedding=cluster.embedding,
533
+ entities=cluster.entities, # Keep ALL entities for accurate coverage tracking
534
+ numbers=set(), # Already in entities
535
+ salience=cluster.salience,
536
+ position=cluster.first_seen_seq
537
+ )
538
+ all_sentences.append(sent_obj)
539
+ sentence_to_prompt[sent_id] = (prompt_id, idx)
540
+
541
+ # Lock sentences from previous batches (already emitted to user)
542
+ if is_previous_batch:
543
+ locked_sentences.add(sent_id)
544
+
545
+ _vprint(f"[CORPUS] 🌐 Run-scoped MIS: {len(all_sentences)} total sentences ({len(locked_sentences)} locked from previous batches, {len(all_sentences)-len(locked_sentences)} new)", self.verbose)
546
+ _vprint(f"[CORPUS] 🧮 Running greedy max-independent-set on {len(all_sentences)} sentences", self.verbose)
547
+
548
+ # Step 2: Compute degree map (needed for isolates pass later)
549
+ degree_map = {}
550
+ for sent in all_sentences:
551
+ degree = 0
552
+ for other in all_sentences:
553
+ if sent.id != other.id:
554
+ if are_sentences_similar(sent, other, semantic_threshold=0.60):
555
+ degree += 1
556
+ degree_map[sent.id] = degree
557
+
558
+ # Sanity checks
559
+ isolates_before = [s for s in all_sentences if degree_map[s.id] == 0]
560
+ non_isolates = [s for s in all_sentences if degree_map[s.id] > 0]
561
+ pct_isolates = len(isolates_before) / len(all_sentences) * 100 if all_sentences else 0
562
+ avg_degree_non_iso = sum(degree_map[s.id] for s in non_isolates) / len(non_isolates) if non_isolates else 0
563
+ print(f"[CORPUS] 📊 Graph: isolates={pct_isolates:.1f}% (expect <20%), non-isolate avg degree={avg_degree_non_iso:.1f} (expect >3)")
564
+
565
+ # Step 3: Run greedy maximum-independent-set selection
566
+ # Start with LOCKED sentences (from previous batches, already emitted)
567
+ # Then run MIS only on NEW sentences (current batch)
568
+ selected_sentences = [s for s in all_sentences if s.id in locked_sentences]
569
+ selected_ids = locked_sentences.copy()
570
+
571
+ print(f"[CORPUS] 🔒 Pre-seeded MIS with {len(locked_sentences)} locked sentences from previous batches")
572
+
573
+ # Now run MIS on NEW sentences only (exclude locked)
574
+ new_sentences = [s for s in all_sentences if s.id not in locked_sentences]
575
+
576
+ if new_sentences:
577
+ # Run MIS on new sentences, considering locked ones as neighbors
578
+ new_selected = greedy_max_independent_set(
579
+ new_sentences,
580
+ similarity_threshold=0.60,
581
+ verbose=False, # Set to True for debugging
582
+ precomputed_degree_map=degree_map # Pass precomputed degrees
583
+ )
584
+
585
+ # Add newly selected sentences
586
+ selected_sentences.extend(new_selected)
587
+ selected_ids.update(s.id for s in new_selected)
588
+
589
+ _vprint(f"[CORPUS] MIS complete: {len(selected_ids)} total kept ({len(locked_sentences)} locked + {len(selected_ids)-len(locked_sentences)} new)", self.verbose)
590
+
591
+ # Step 4: Compute NODE COVERAGE (align universe for backfill)
592
+ # covered_nodes = S ∪ N(S) (selected + their neighbors)
593
+ covered_nodes = set(selected_ids)
594
+ sentence_map = {s.id: s for s in all_sentences}
595
+
596
+ for selected_id in selected_ids:
597
+ selected_sent = sentence_map[selected_id]
598
+ # Add all neighbors (similar nodes)
599
+ for other in all_sentences:
600
+ if other.id != selected_id:
601
+ if are_sentences_similar(selected_sent, other, semantic_threshold=0.60):
602
+ covered_nodes.add(other.id)
603
+
604
+ total_nodes = len(all_sentences)
605
+ node_coverage_before = len(covered_nodes) / total_nodes if total_nodes > 0 else 0.0
606
+
607
+ _vprint(f"[CORPUS] 📊 After MIS: nodes={len(selected_ids)}/{total_nodes} kept, coverage (S∪N(S))={len(covered_nodes)}/{total_nodes} ({node_coverage_before*100:.1f}%)", self.verbose)
608
+
609
+ # Step 5: Backfill = GREEDY SET COVER over NODES (no independence constraint!)
610
+ # Goal: Maximize node coverage (S ∪ N(S)) by re-adding removed nodes with highest gain
611
+ # gain(u) = |({u} ∪ N(u)) \ covered_nodes|
612
+ backfill_added = 0
613
+ isolates_added = 0
614
+ target_coverage = 0.90 # 90% node coverage target
615
+
616
+ if node_coverage_before < target_coverage:
617
+ uncovered_count = total_nodes - len(covered_nodes)
618
+ _vprint(f"[CORPUS] 🔧 Backfill: {uncovered_count} uncovered nodes, targeting {target_coverage*100:.0f}% coverage", self.verbose)
619
+
620
+ # Get ALL removed sentences (candidates for backfill)
621
+ removed_sentences = [sent for sent in all_sentences if sent.id not in selected_ids]
622
+
623
+ # Helper: compute node gain for a candidate
624
+ def compute_node_gain(sent):
625
+ """Compute how many uncovered nodes this sentence + its neighbors would cover."""
626
+ candidate_coverage = {sent.id}
627
+ # Add neighbors
628
+ for other in all_sentences:
629
+ if other.id != sent.id:
630
+ if are_sentences_similar(sent, other, semantic_threshold=0.60):
631
+ candidate_coverage.add(other.id)
632
+ # Gain = new nodes not already covered
633
+ return len(candidate_coverage - covered_nodes)
634
+
635
+ # Debug: Print top-5 candidates by gain (first iteration only)
636
+ if removed_sentences:
637
+ gains = [(sent, compute_node_gain(sent)) for sent in removed_sentences[:20]] # Sample first 20 for speed
638
+ gains.sort(key=lambda x: x[1], reverse=True)
639
+ _vprint(f"[CORPUS] Top-5 backfill candidates by gain:", self.verbose)
640
+ for sent, gain in gains[:5]:
641
+ _vprint(f" gain={gain}: '{sent.text[:60]}...'", self.verbose)
642
+
643
+ # GREEDY SET COVER: repeatedly pick sentence with max gain
644
+ iteration = 0
645
+ while node_coverage_before < target_coverage and removed_sentences and iteration < 100:
646
+ # Find best candidate
647
+ best_sent = None
648
+ best_gain = 0
649
+
650
+ for sent in removed_sentences:
651
+ gain = compute_node_gain(sent)
652
+ if gain > best_gain:
653
+ best_gain = gain
654
+ best_sent = sent
655
+
656
+ if best_gain == 0:
657
+ _vprint(f"[CORPUS] Backfill: all remaining candidates have gain=0, stopping", self.verbose)
658
+ break
659
+
660
+ # Add best sentence back
661
+ selected_ids.add(best_sent.id)
662
+ selected_sentences.append(best_sent)
663
+
664
+ # Update covered_nodes: add this node + its neighbors
665
+ covered_nodes.add(best_sent.id)
666
+ for other in all_sentences:
667
+ if other.id != best_sent.id:
668
+ if are_sentences_similar(best_sent, other, semantic_threshold=0.60):
669
+ covered_nodes.add(other.id)
670
+
671
+ removed_sentences.remove(best_sent)
672
+ backfill_added += 1
673
+
674
+ # Update coverage
675
+ node_coverage_before = len(covered_nodes) / total_nodes
676
+ iteration += 1
677
+
678
+ if backfill_added <= 5:
679
+ _vprint(f"[CORPUS] ✅ Backfill +{best_gain} nodes: '{best_sent.text[:60]}...' (coverage now {node_coverage_before*100:.1f}%)", self.verbose)
680
+
681
+ _vprint(f"[CORPUS] 📈 After backfill: +{backfill_added} sentences, node coverage {node_coverage_before*100:.1f}%)", self.verbose)
682
+
683
+ # Step 6: ISOLATES PASS - add uncovered degree=0 nodes
684
+ # These are unique nodes with no similar neighbors
685
+ uncovered_isolates = [sent for sent in all_sentences
686
+ if sent.id not in covered_nodes and degree_map[sent.id] == 0]
687
+
688
+ if uncovered_isolates:
689
+ _vprint(f"[CORPUS] 🔧 Isolates pass: {len(uncovered_isolates)} uncovered isolates (degree=0)", self.verbose)
690
+
691
+ for sent in uncovered_isolates:
692
+ if node_coverage_before >= target_coverage:
693
+ break
694
+ selected_ids.add(sent.id)
695
+ covered_nodes.add(sent.id)
696
+ isolates_added += 1
697
+ node_coverage_before = len(covered_nodes) / total_nodes
698
+
699
+ if isolates_added <= 5:
700
+ _vprint(f"[CORPUS] Isolate: '{sent.text[:60]}...'", self.verbose)
701
+
702
+ if isolates_added > 0:
703
+ _vprint(f"[CORPUS] 📈 After isolates: +{isolates_added} sentences, node coverage {node_coverage_before*100:.1f}%", self.verbose)
704
+
705
+ # Final coverage stats (NODE universe)
706
+ final_selected = len(selected_ids)
707
+ final_covered_nodes = len(covered_nodes)
708
+ final_node_coverage = final_covered_nodes / total_nodes if total_nodes > 0 else 0.0
709
+
710
+ # Assert denominator is |V| (all nodes, no filtering)
711
+ assert total_nodes == len(all_sentences), f"Denominator mismatch: {total_nodes} != {len(all_sentences)}"
712
+
713
+ _vprint(f"[CORPUS] Final: kept={final_selected}/{total_nodes}, covered (S∪N(S))={final_covered_nodes}/{total_nodes} ({final_node_coverage*100:.1f}%)", self.verbose)
714
+ _vprint(f"[CORPUS] 📊 Backfill={backfill_added}, Isolates={isolates_added}", self.verbose)
715
+
716
+ # Step 7: Map results back to prompts
717
+ results = {}
718
+ for prompt_id in batch_prompts:
719
+ prompt_state = self.prompt_registry[prompt_id]
720
+ kept_sentences = []
721
+ removed_count = 0
722
+
723
+ for idx, sentence_text in enumerate(prompt_state.sentences):
724
+ sent_id = f"{prompt_id}_{idx}"
725
+ if sent_id in selected_ids:
726
+ kept_sentences.append(sentence_text)
727
+ else:
728
+ removed_count += 1
729
+
730
+ results[prompt_id] = {
731
+ 'kept': kept_sentences,
732
+ 'removed': removed_count,
733
+ 'original_count': len(prompt_state.sentences)
734
+ }
735
+
736
+ # Step 8: Store results and emit to prompts
737
+ for prompt_id in batch_prompts:
738
+ prompt_state = self.prompt_registry[prompt_id]
739
+ result = results[prompt_id]
740
+ prompt_state.sentences = result['kept']
741
+
742
+ reduction_pct = (result['removed'] / result['original_count'] * 100) if result['original_count'] > 0 else 0
743
+ _vprint(f"[CORPUS] Prompt {prompt_id[:8]}: {result['original_count']} {len(result['kept'])} sentences ({reduction_pct:.1f}% removed)", self.verbose)
744
+
745
+ # Update telemetry
746
+ self.telemetry.entity_coverage_avg = final_node_coverage * 100 # Now tracking NODE coverage
747
+ # Always show final batch summary (key metric)
748
+ print(f"[CORPUS] ✅ Batch complete: Node coverage {final_node_coverage*100:.1f}%")
749
+
750
+ # Update telemetry
751
+ if self.telemetry.barrier_times:
752
+ self.telemetry.avg_barrier_ms = sum(self.telemetry.barrier_times) / len(self.telemetry.barrier_times)
753
+ self.telemetry.max_barrier_ms = max(self.telemetry.barrier_times)
754
+
755
+ self.telemetry.tokens_saved = (self.telemetry.chars_in - self.telemetry.chars_out) // 4
756
+
757
+ # Release prompts SEQUENTIALLY to avoid race condition in on_llm_start
758
+ _vprint(f"[CORPUS] 🚦 Releasing {len(batch_prompts)} prompts sequentially...", self.verbose)
759
+ for i, prompt_id in enumerate(batch_prompts):
760
+ event = self.prompt_events.get(prompt_id)
761
+ if event:
762
+ event.set() # Wake up this prompt's waiter
763
+ # Longer delay to ensure threads hit on_llm_start one at a time
764
+ if i < len(batch_prompts) - 1: # Don't delay after the last one
765
+ time.sleep(0.5) # 500ms stagger to be safe
766
+
767
+ # Clean up events to prevent memory leak
768
+ for prompt_id in batch_prompts:
769
+ self.prompt_events.pop(prompt_id, None)
770
+
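The backfill step in _process_batch is a greedy set cover over nodes. A standalone sketch on a toy neighbor map (illustrative IDs, not production data) that repeatedly re-adds whichever removed node covers the most still-uncovered nodes:

neighbors = {
    "a": {"b", "c"},
    "b": {"a"},
    "c": {"a"},
    "d": {"e"},
    "e": {"d"},
    "f": set(),
}
covered = {"a", "b", "c"}      # e.g. already covered by the kept set
candidates = ["d", "e", "f"]   # removed nodes eligible for backfill

def gain(u: str) -> int:
    # gain(u) = |({u} ∪ N(u)) \ covered|
    return len(({u} | neighbors[u]) - covered)

while candidates:
    best = max(candidates, key=gain)
    if gain(best) == 0:
        break
    covered |= {best} | neighbors[best]
    candidates.remove(best)

print(sorted(covered))  # all six nodes covered after re-adding one of d/e plus f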
771
+ def _get_deduplicated_prompt(self, prompt_id: str) -> str:
772
+ """Get deduplicated prompt text."""
773
+ prompt_state = self.prompt_registry.get(prompt_id)
774
+ if not prompt_state:
775
+ return ""
776
+
777
+ return "\n".join(prompt_state.sentences)
778
+
779
+ def get_telemetry_summary(self) -> str:
780
+ """Generate human-readable telemetry summary."""
781
+ t = self.telemetry
782
+ reduction_pct = ((t.chars_in - t.chars_out) / t.chars_in * 100) if t.chars_in > 0 else 0
783
+
784
+ summary = f"""
785
+ {'='*70}
786
+ [CORPUS] 📊 RUN-SCOPED TELEMETRY (run_id={self.run_id[:8]})
787
+ {'='*70}
788
+ Prompts processed: {t.prompts_total}
789
+ Sentences total: {t.sentences_total}
790
+ Clusters created: {t.clusters_total}
791
+ Cross-prompt dups removed: {t.cross_prompt_dups_removed}
792
+ {'='*70}
793
+ Chars in: {t.chars_in:,}
794
+ Chars out: {t.chars_out:,}
795
+ Reduction: {reduction_pct:.1f}%
796
+ Tokens saved (est): {t.tokens_saved:,} tokens
797
+ {'='*70}
798
+ Node Coverage (S∪N(S)): {t.entity_coverage_avg:.1f}%
799
+ Batches processed: {t.batches_processed}
800
+ Avg barrier: {t.avg_barrier_ms:.0f}ms
801
+ Max barrier: {t.max_barrier_ms:.0f}ms
802
+ {'='*70}
803
+ """
804
+ return summary
805
+
806
+
807
+ # Global registry of run-scoped corpuses
808
+ _run_corpuses: Dict[str, RunScopedCorpus] = {}
809
+ _corpus_lock = threading.Lock()
810
+
811
+
812
+ def get_or_create_corpus(run_id: str, verbose: bool = False) -> RunScopedCorpus:
813
+ """Get or create run-scoped corpus (thread-safe)."""
814
+ with _corpus_lock:
815
+ if run_id not in _run_corpuses:
816
+ _run_corpuses[run_id] = RunScopedCorpus(run_id, verbose=verbose)
817
+ return _run_corpuses[run_id]
818
+
819
+
820
+ def cleanup_corpus(run_id: str):
821
+ """Cleanup corpus when run ends."""
822
+ with _corpus_lock:
823
+ if run_id in _run_corpuses:
824
+ corpus = _run_corpuses[run_id]
825
+ print(corpus.get_telemetry_summary())
826
+ del _run_corpuses[run_id]
827
+ print(f"[CORPUS] 🗑️ Cleaned up corpus for run_id={run_id[:8]}")
828
+
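A minimal lifecycle sketch for the two registry helpers above; the run_id is a placeholder, and no prompts are enqueued, so no embedding model is loaded.

from dasein.pipecleaner import get_or_create_corpus, cleanup_corpus

corpus_a = get_or_create_corpus("run-1234")
corpus_b = get_or_create_corpus("run-1234")
assert corpus_a is corpus_b  # run-scoped: one shared corpus per run_id
print(corpus_a.get_telemetry_summary())  # all-zero telemetry before any prompts
cleanup_corpus("run-1234")   # prints the summary again and drops the corpus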
829
+
830
+ # ============================================================================
831
+ # Legacy Per-Prompt Deduplication (V1.0 - Fallback)
832
+ # ============================================================================
833
+
834
+ @dataclass
835
+ class Sentence:
836
+ """Represents a sentence with metadata for deduplication."""
837
+ id: str
838
+ text: str
839
+ embedding: Optional[np.ndarray] = None
840
+ entities: Set[str] = None
841
+ numbers: Set[str] = None
842
+ salience: float = 0.0
843
+ position: int = 0
844
+
845
+ def __post_init__(self):
846
+ if self.entities is None:
847
+ self.entities = set()
848
+ if self.numbers is None:
849
+ self.numbers = set()
850
+
851
+ @property
852
+ def protected_entities(self) -> Set[str]:
853
+ """All entities that must be preserved."""
854
+ return self.entities | self.numbers
855
+
856
+
857
+ def estimate_tokens(text: str) -> int:
858
+ """Estimate token count (roughly chars/4 for English)."""
859
+ return len(text) // 4
860
+
861
+
862
+ def adaptive_resize_sentences(sentences: List[str]) -> List[str]:
863
+ """
864
+ Adaptively resize sentences for optimal embedding similarity:
865
+ - Long (>120 tokens): Split on commas, semicolons, conjunctions
866
+ - Short (<40 tokens): Merge with next sentence
867
+ - Mid (40-120 tokens): Keep as-is
868
+
869
+ This improves cross-page similarity and reduces false uniqueness.
870
+ """
871
+ resized = []
872
+ i = 0
873
+
874
+ while i < len(sentences):
875
+ sent = sentences[i]
876
+ tokens = estimate_tokens(sent)
877
+
878
+ if tokens > 120:
879
+ # LONG: Split on commas, semicolons, and conjunctions
880
+ # Split points: , ; : and, but, or, however, therefore (preceded by space/comma)
881
+ split_pattern = r'(?:,\s+(?:and|but|or|however|therefore|while|although)\s+|[;:])\s+'
882
+ chunks = re.split(split_pattern, sent)
883
+
884
+ # Ensure each chunk is reasonable (not too tiny)
885
+ for chunk in chunks:
886
+ if chunk.strip() and estimate_tokens(chunk) >= 20:
887
+ resized.append(chunk.strip())
888
+ elif resized:
889
+ # Merge tiny chunk with previous
890
+ resized[-1] += " " + chunk.strip()
891
+ i += 1
892
+
893
+ elif tokens < 40 and i + 1 < len(sentences):
894
+ # SHORT: Merge with next sentence
895
+ next_sent = sentences[i + 1]
896
+ merged = sent + " " + next_sent
897
+ merged_tokens = estimate_tokens(merged)
898
+
899
+ # Only merge if result is ≤120 tokens (don't create overly long sentences)
900
+ if merged_tokens <= 120:
901
+ resized.append(merged)
902
+ i += 2 # Skip next sentence (already merged)
903
+ else:
904
+ # Next sentence would make it too long, keep short one as-is
905
+ resized.append(sent)
906
+ i += 1
907
+
908
+ else:
909
+ # MID-RANGE (40-120) or last sentence: Keep as-is
910
+ resized.append(sent)
911
+ i += 1
912
+
913
+ return resized
914
+
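A short usage sketch of adaptive_resize_sentences with illustrative inputs: fragments under roughly 40 estimated tokens are merged with their successor, while a trailing short fragment is kept as-is.

from dasein.pipecleaner import adaptive_resize_sentences

parts = [
    "LangChain is a framework.",           # short: merged with the next part
    "It supports OpenAI and Anthropic.",
    "The framework was created in 2022.",  # last short part stays as-is
]
print(adaptive_resize_sentences(parts))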
915
+
916
+ def split_into_sentences(text: str) -> List[str]:
917
+ """
918
+ Split text into sentences with special handling for markdown structures,
919
+ then adaptively resize for optimal embedding similarity.
920
+
921
+ Handles:
922
+ - Standard sentences ending with .!?
923
+ - Bullet points and numbered lists
924
+ - Code blocks (preserve as single units)
925
+ - Headers
926
+ - Adaptive resizing: long sentences split, short ones merged
927
+ """
928
+ sentences = []
929
+
930
+ # First, protect code blocks
931
+ code_block_pattern = r'```[\s\S]*?```'
932
+ code_blocks = {}
933
+ for i, match in enumerate(re.finditer(code_block_pattern, text)):
934
+ placeholder = f"__CODE_BLOCK_{i}__"
935
+ code_blocks[placeholder] = match.group()
936
+ text = text.replace(match.group(), placeholder)
937
+
938
+ # Split on sentence boundaries
939
+ # Handle: . ! ? followed by space/newline, or newlines with list markers
940
+ patterns = [
941
+ r'(?<=[.!?])\s+(?=[A-Z])', # Standard sentences
942
+ r'\n\s*[-*•]\s+', # Bullet points
943
+ r'\n\s*\d+\.\s+', # Numbered lists
944
+ r'\n#{1,6}\s+', # Markdown headers
945
+ r'\n\s*\n', # Paragraph breaks
946
+ ]
947
+
948
+ combined_pattern = '|'.join(f'({p})' for p in patterns)
949
+ parts = re.split(combined_pattern, text)
950
+
951
+ # Reconstruct sentences (filter out delimiters)
952
+ current = ""
953
+ for part in parts:
954
+ if part is None:
955
+ continue
956
+ if re.match(combined_pattern, part):
957
+ if current.strip():
958
+ sentences.append(current.strip())
959
+ current = ""
960
+ else:
961
+ current += part
962
+
963
+ if current.strip():
964
+ sentences.append(current.strip())
965
+
966
+ # Restore code blocks
967
+ restored = []
968
+ for sent in sentences:
969
+ for placeholder, code in code_blocks.items():
970
+ sent = sent.replace(placeholder, code)
971
+ if sent.strip():
972
+ restored.append(sent.strip())
973
+
974
+ # ADAPTIVE RESIZING: Split long sentences, merge short ones
975
+ resized = adaptive_resize_sentences(restored)
976
+
977
+ return resized
978
+
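A usage sketch for split_into_sentences; the sample text is illustrative. Bullet items, headers, and paragraph breaks act as boundaries, and the resulting segments then pass through adaptive_resize_sentences, so adjacent short pieces may come back merged.

from dasein.pipecleaner import split_into_sentences

sample = (
    "## Results\n"
    "LangChain supports OpenAI and Anthropic. It was created in 2022 by Harrison Chase.\n"
    "- LlamaIndex focuses on data indexing and retrieval.\n"
    "- Both frameworks are open source and widely used.\n"
)
for segment in split_into_sentences(sample):
    print(repr(segment))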
979
+
980
+ def extract_entities_regex(text: str) -> Tuple[Set[str], Set[str]]:
981
+ """
982
+ Fallback regex-based entity extraction.
983
+
984
+ Returns:
985
+ (entities, numbers) - Sets of extracted entities and numbers
986
+ """
987
+ entities = set()
988
+ numbers = set()
989
+
990
+ # Proper nouns: Capitalized words (basic heuristic) - at least 3 chars
991
+ proper_nouns = re.findall(r'\b[A-Z][a-z]{2,}(?:\s+[A-Z][a-z]+)*\b', text)
992
+ entities.update(proper_nouns)
993
+
994
+ # Technical terms: CamelCase, snake_case, package names
995
+ technical = re.findall(r'\b[A-Z][a-z]+[A-Z]\w+\b', text) # CamelCase
996
+ technical += re.findall(r'\b\w+_\w+\b', text) # snake_case
997
+ entities.update(technical)
998
+
999
+ # Numbers: MEANINGFUL numbers only (exclude single digits 0-9)
1000
+ # Include: multi-digit numbers, floats, percentages, version numbers
1001
+ nums = re.findall(r'\b\d{2,}(?:\.\d+)?%?\b', text) # 2+ digits
1002
+ nums += re.findall(r'\b\d+\.\d+\b', text) # Floats like 14.4, 2.0
1003
+ numbers.update(nums)
1004
+
1005
+ # Dates: YYYY-MM-DD, MM/DD/YYYY, etc.
1006
+ dates = re.findall(r'\b\d{4}[-/]\d{1,2}[-/]\d{1,4}\b', text) # Full dates
1007
+ dates += re.findall(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b', text)
1008
+ numbers.update(dates)
1009
+
1010
+ # Filter out common non-informative words and malformed entities
1011
+ stopwords = {
1012
+ # Common words
1013
+ 'The', 'This', 'That', 'These', 'Those', 'What', 'Where', 'When', 'Why', 'How', 'Who', 'Which',
1014
+ 'Welcome', 'Search', 'Summary', 'Source', 'Url', 'Http', 'Https', 'One', 'Two', 'Three', 'Four', 'Five',
1015
+ 'Key', 'Our', 'Its', 'It', 'For', 'With', 'And', 'But', 'Not', 'You', 'All', 'Can', 'Her', 'Was',
1016
+ 'She', 'Has', 'Had', 'His', 'Him', 'Are', 'Were', 'Been', 'Being', 'Have', 'Does', 'Did', 'Will',
1017
+ # Markup/formatting artifacts
1018
+ 'URL', 'Http', 'Https', 'PDF', 'CSV', 'JSON', 'XML', 'HTML',
1019
+ }
1020
+
1021
+ # Filter entities
1022
+ filtered_entities = set()
1023
+ for e in entities:
1024
+ # Skip short entities
1025
+ if len(e) < 3:
1026
+ continue
1027
+
1028
+ # Skip if contains newlines (malformed extraction)
1029
+ if '\n' in e:
1030
+ continue
1031
+
1032
+ # Skip stopwords (case-insensitive)
1033
+ if e in stopwords or e.lower() in {s.lower() for s in stopwords}:
1034
+ continue
1035
+
1036
+ # Skip if it's just a URL fragment
1037
+ if e.lower() in ['url', 'http', 'https', 'www']:
1038
+ continue
1039
+
1040
+ # Skip if ends with common suffixes that indicate malformed extraction
1041
+ if e.endswith('---') or e.endswith('...') or e.endswith('--'):
1042
+ continue
1043
+
1044
+ filtered_entities.add(e)
1045
+
1046
+ # Filter numbers - remove single digits 0-9 (often SOURCE numbers)
1047
+ filtered_numbers = {n for n in numbers if len(n) >= 2 or '.' in n or '%' in n}
1048
+
1049
+ return filtered_entities, filtered_numbers
1050
+
1051
+
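+ # Illustrative sketch (assumption: demo only). Shows which tokens survive the regex
+ # extraction and the stopword / single-digit filters above; the exact matches depend
+ # on the heuristics and are indicative, not exhaustive.
+ def _demo_extract_entities_regex():
+     text = "Harrison Chase released LangChain 0.2.14 on 2022-10-25 with snake_case helpers."
+     entities, numbers = extract_entities_regex(text)
+     print("entities:", sorted(entities))   # e.g. 'Harrison Chase', 'LangChain', 'snake_case'
+     print("numbers:", sorted(numbers))     # e.g. multi-digit / float / date-style matches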
1052
+ def extract_entities_spacy(text: str, nlp) -> Tuple[Set[str], Set[str]]:
1053
+ """
1054
+ spaCy-based entity extraction (more accurate).
1055
+
1056
+ Returns:
1057
+ (entities, numbers) - Sets of extracted entities and numbers
1058
+ """
1059
+ entities = set()
1060
+ numbers = set()
1061
+
1062
+ doc = nlp(text)
1063
+
1064
+ # Named entities
1065
+ for ent in doc.ents:
1066
+ if ent.label_ in ['PERSON', 'ORG', 'GPE', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW']:
1067
+ entities.add(ent.text)
1068
+ elif ent.label_ in ['DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']:
1069
+ numbers.add(ent.text)
1070
+
1071
+ # Also grab technical terms (capitalized noun phrases)
1072
+ for chunk in doc.noun_chunks:
1073
+ if chunk.text[0].isupper():
1074
+ entities.add(chunk.text)
1075
+
1076
+ # Apply SAME filtering as regex version
1077
+ stopwords = {
1078
+ 'The', 'This', 'That', 'These', 'Those', 'What', 'Where', 'When', 'Why', 'How', 'Who', 'Which',
1079
+ 'Welcome', 'Search', 'Summary', 'Source', 'Url', 'Http', 'Https', 'One', 'Two', 'Three', 'Four', 'Five',
1080
+ 'Key', 'Our', 'Its', 'It', 'For', 'With', 'And', 'But', 'Not', 'You', 'All', 'Can', 'Her', 'Was',
1081
+ 'She', 'Has', 'Had', 'His', 'Him', 'Are', 'Were', 'Been', 'Being', 'Have', 'Does', 'Did', 'Will',
1082
+ 'URL', 'Http', 'Https', 'PDF', 'CSV', 'JSON', 'XML', 'HTML',
1083
+ }
1084
+
1085
+ # Filter entities
1086
+ filtered_entities = set()
1087
+ for e in entities:
1088
+ # Skip short entities
1089
+ if len(e) < 3:
1090
+ continue
1091
+
1092
+ # Skip if contains newlines (malformed)
1093
+ if '\n' in e:
1094
+ continue
1095
+
1096
+ # Skip stopwords (case-insensitive)
1097
+ if e in stopwords or e.lower() in {s.lower() for s in stopwords}:
1098
+ continue
1099
+
1100
+ # Skip URL fragments
1101
+ if e.lower() in ['url', 'http', 'https', 'www']:
1102
+ continue
1103
+
1104
+ # Skip malformed endings
1105
+ if e.endswith('---') or e.endswith('...') or e.endswith('--'):
1106
+ continue
1107
+
1108
+ filtered_entities.add(e)
1109
+
1110
+ # Filter numbers - remove single digits 0-9
1111
+ filtered_numbers = {n for n in numbers if len(str(n).strip()) >= 2 or '.' in str(n) or '%' in str(n)}
1112
+
1113
+ return filtered_entities, filtered_numbers
1114
+
1115
+
1116
+ def extract_entities(text: str) -> Tuple[Set[str], Set[str]]:
1117
+ """
1118
+ Extract entities and numbers from text.
1119
+
1120
+ Uses spaCy if available, falls back to regex.
1121
+
1122
+ Returns:
1123
+ (entities, numbers) - Sets of protected entities and numbers
1124
+ """
1125
+ nlp = _get_spacy_model()
1126
+
1127
+ if nlp == "fallback":
1128
+ return extract_entities_regex(text)
1129
+ else:
1130
+ return extract_entities_spacy(text, nlp)
1131
+
1132
+
1133
+ def compute_salience(sentence: str, position: int, total_sentences: int) -> float:
1134
+ """
1135
+ Compute salience score for a sentence.
1136
+
1137
+ Factors:
1138
+ - Position: Earlier sentences weighted higher (first paragraph effect)
1139
+ - Length: Moderate length preferred (too short = filler, too long = verbose)
1140
+ - Entity density: More entities = more information-dense
1141
+ - Numbers: Presence of numbers = factual content
1142
+
1143
+ Returns:
1144
+ Salience score (0.0 to 1.0, higher = more important)
1145
+ """
1146
+ score = 0.0
1147
+
1148
+ # Position-based (exponential decay)
1149
+ position_weight = np.exp(-position / (total_sentences * 0.3))
1150
+ score += position_weight * 0.3
1151
+
1152
+ # Length-based (optimal ~50-150 chars)
1153
+ length = len(sentence)
1154
+ if 50 <= length <= 150:
1155
+ length_weight = 1.0
1156
+ elif length < 50:
1157
+ length_weight = length / 50
1158
+ else:
1159
+ length_weight = 150 / length
1160
+ score += length_weight * 0.2
1161
+
1162
+ # Entity density (basic heuristic: count capitalized words)
1163
+ words = sentence.split()
1164
+ cap_words = sum(1 for w in words if w and w[0].isupper())
1165
+ entity_density = min(cap_words / max(len(words), 1), 1.0)
1166
+ score += entity_density * 0.3
1167
+
1168
+ # Number presence
1169
+ has_numbers = bool(re.search(r'\d', sentence))
1170
+ score += 0.2 if has_numbers else 0.0
1171
+
1172
+ return min(score, 1.0)
1173
+
1174
+
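+ # Illustrative sketch (assumption: demo only). Walks the four weighted components of
+ # compute_salience for a single sentence: position (0.3), length (0.2),
+ # capitalized-word density (0.3), and number presence (0.2).
+ def _demo_compute_salience():
+     sent = "LangChain was created in 2022 by Harrison Chase."
+     score = compute_salience(sent, position=0, total_sentences=10)
+     # position 0 → exp(0) = 1.0 → +0.30; 48 chars → 48/50 → +0.19;
+     # 3 capitalized of 8 words → +0.11; contains digits → +0.20; total ≈ 0.80
+     print(f"salience ≈ {score:.2f}")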
1175
+ def compute_char_3gram_jaccard(text1: str, text2: str) -> float:
1176
+ """
1177
+ Compute character 3-gram Jaccard similarity.
1178
+ Captures boilerplate and tight phrasing that embeddings might miss.
1179
+
1180
+ Returns:
1181
+ Jaccard similarity [0, 1]
1182
+ """
1183
+ def get_3grams(text):
1184
+ text = text.lower()
1185
+ return set(text[i:i+3] for i in range(len(text) - 2))
1186
+
1187
+ grams1 = get_3grams(text1)
1188
+ grams2 = get_3grams(text2)
1189
+
1190
+ if not grams1 or not grams2:
1191
+ return 0.0
1192
+
1193
+ intersection = len(grams1 & grams2)
1194
+ union = len(grams1 | grams2)
1195
+
1196
+ return intersection / union if union > 0 else 0.0
1197
+
1198
+
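+ # Illustrative sketch (assumption: demo only). Hand-checks the character 3-gram
+ # Jaccard on two near-identical short strings.
+ def _demo_char_3gram_jaccard():
+     a, b = "open source", "open-source"
+     # "open source"  → {'ope','pen','en ','n s',' so','sou','our','urc','rce'} (9 grams)
+     # "open-source"  → {'ope','pen','en-','n-s','-so','sou','our','urc','rce'} (9 grams)
+     # intersection = 6, union = 12 → Jaccard = 0.5
+     print(compute_char_3gram_jaccard(a, b))   # 0.5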
1199
+ def compute_similarity(emb1: np.ndarray, emb2: np.ndarray) -> float:
1200
+ """
1201
+ Compute cosine similarity between two embeddings.
1202
+ Assumes embeddings are L2-normalized (unit vectors), so cosine = dot product.
1203
+ """
1204
+ return np.dot(emb1, emb2)
1205
+
1206
+
1207
+ def are_sentences_similar(sent1: Sentence, sent2: Sentence, semantic_threshold: float = 0.60) -> bool:
1208
+ """
1209
+ Check if two sentences are similar using semantic + lexical signals.
1210
+
1211
+ - Semantic: cosine similarity on embeddings
1212
+ - Lexical fallback: 3-gram Jaccard for short sentences (≤120 chars)
1213
+
1214
+ Args:
1215
+ sent1, sent2: Sentence objects with embeddings
1216
+ semantic_threshold: Threshold for semantic similarity
1217
+
1218
+ Returns:
1219
+ True if similar, False otherwise
1220
+ """
1221
+ # Primary: semantic similarity
1222
+ semantic_sim = compute_similarity(sent1.embedding, sent2.embedding)
1223
+ if semantic_sim >= semantic_threshold:
1224
+ return True
1225
+
1226
+ # Fallback: lexical for short sentences (captures boilerplate)
1227
+ max_len = max(len(sent1.text), len(sent2.text))
1228
+ if max_len <= 120: # ~30 tokens
1229
+ lexical_sim = compute_char_3gram_jaccard(sent1.text, sent2.text)
1230
+ if lexical_sim >= 0.82: # High Jaccard = tight phrasing match
1231
+ return True
1232
+
1233
+ return False
1234
+
1235
+
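+ # Illustrative sketch (assumption: SimpleNamespace stand-ins are used instead of real
+ # Sentence objects; only the .text and .embedding attributes accessed above are
+ # needed). Unit-length vectors make cosine similarity equal the dot product.
+ def _demo_are_sentences_similar():
+     from types import SimpleNamespace
+     v = np.zeros(384, dtype=np.float32); v[0] = 1.0
+     w = np.zeros(384, dtype=np.float32); w[0] = 0.8; w[1] = 0.6   # cos(v, w) = 0.8
+     s1 = SimpleNamespace(text="LangChain is an LLM framework.", embedding=v)
+     s2 = SimpleNamespace(text="LangChain is a framework for LLMs.", embedding=w)
+     print(are_sentences_similar(s1, s2))   # True: 0.8 >= 0.60 semantic threshold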
1236
+ def build_sentence_objects(sentences_text: List[str], embeddings: np.ndarray) -> List[Sentence]:
1237
+ """
1238
+ Build Sentence objects with metadata.
1239
+
1240
+ Args:
1241
+ sentences_text: List of sentence strings
1242
+ embeddings: Numpy array of embeddings (N x 384)
1243
+
1244
+ Returns:
1245
+ List of Sentence objects with computed metadata
1246
+ """
1247
+ sentence_objects = []
1248
+ total = len(sentences_text)
1249
+
1250
+ for i, text in enumerate(sentences_text):
1251
+ # Generate ID
1252
+ sent_id = hashlib.md5(text.encode()).hexdigest()[:8]
1253
+
1254
+ # Extract entities
1255
+ entities, numbers = extract_entities(text)
1256
+
1257
+ # Compute salience
1258
+ salience = compute_salience(text, i, total)
1259
+
1260
+ sentence_objects.append(Sentence(
1261
+ id=sent_id,
1262
+ text=text,
1263
+ embedding=embeddings[i],
1264
+ entities=entities,
1265
+ numbers=numbers,
1266
+ salience=salience,
1267
+ position=i
1268
+ ))
1269
+
1270
+ return sentence_objects
1271
+
1272
+
1273
+ def greedy_max_independent_set(
1274
+ sentences: List[Sentence],
1275
+ similarity_threshold: float = 0.60,
1276
+ verbose: bool = True,
1277
+ precomputed_degree_map: Dict = None
1278
+ ) -> List[Sentence]:
1279
+ """
1280
+ Greedy maximum-independent-set selection with degree×length-aware ordering.
1281
+
1282
+ Algorithm:
1283
+ 1. Compute degree (# of similar neighbors) for each sentence
1284
+ 2. Sort by (token_length × degree) ASCENDING → short, unique sentences come first
1285
+ 3. Pick the lowest degree×length sentence (shortest + most unique, cheapest to keep)
1286
+ 4. Remove all similar neighbors (similarity > threshold)
1287
+ 5. Check removed sentences for unique entities
1288
+ 6. If removed sentence has unique entities, re-add it (HARD GUARD)
1289
+ 7. Repeat until all sentences processed
1290
+
1291
+ This preserves entity coverage while ejecting long, redundant sentences → bigger trims without raising the similarity bar.
1292
+
1293
+ Args:
1294
+ sentences: List of Sentence objects
1295
+ similarity_threshold: Similarity threshold for edge creation (default 0.60 = 60% similar)
1296
+ verbose: Print debug info
+ precomputed_degree_map: Optional precomputed degree map keyed by sentence id (skips the O(n²) pass)
1297
+
1298
+ Returns:
1299
+ List of selected Sentence objects (deduplicated)
1300
+ """
1301
+ if verbose:
1302
+ print(f"\n[PIPECLEANER] Starting degree×length-aware greedy max-independent-set")
1303
+ print(f"[PIPECLEANER] Input: {len(sentences)} sentences")
1304
+ print(f"[PIPECLEANER] Similarity threshold: {similarity_threshold}")
1305
+
1306
+ # Step 1: Use precomputed degree map (or compute if not provided)
1307
+ if precomputed_degree_map is None:
1308
+ # Compute degree (# of connections) for each sentence
1309
+ # Use hybrid similarity: semantic (0.60) OR lexical (0.82 Jaccard for short spans)
1310
+ degree_map = {}
1311
+ for sent in sentences:
1312
+ degree = 0
1313
+ for other in sentences:
1314
+ if sent.id != other.id:
1315
+ # Hybrid check: semantic OR lexical
1316
+ if are_sentences_similar(sent, other, semantic_threshold=similarity_threshold):
1317
+ degree += 1
1318
+ degree_map[sent.id] = degree
1319
+
1320
+ # Sanity checks (as requested)
1321
+ isolates = [s for s in sentences if degree_map[s.id] == 0]
1322
+ non_isolates = [s for s in sentences if degree_map[s.id] > 0]
1323
+ pct_isolates = len(isolates) / len(sentences) * 100 if sentences else 0
1324
+ avg_degree_non_iso = sum(degree_map[s.id] for s in non_isolates) / len(non_isolates) if non_isolates else 0
1325
+
1326
+ if verbose:
1327
+ avg_degree = sum(degree_map.values()) / len(degree_map) if degree_map else 0
1328
+ print(f"[PIPECLEANER] Degree stats: avg={avg_degree:.1f}, isolates={pct_isolates:.1f}%, non-isolate avg={avg_degree_non_iso:.1f}")
1329
+ print(f"[PIPECLEANER] Sanity: isolates {pct_isolates:.0f}% (expect <20%), non-isolate avg {avg_degree_non_iso:.1f} (expect >3)")
1330
+ else:
1331
+ # Use precomputed degree map (more efficient)
1332
+ degree_map = precomputed_degree_map
1333
+
1334
+ # Step 2: Sort by (token_length × degree) ASCENDING
1335
+ # LOW degree×length = short + unique → keep first (high value)
1336
+ # HIGH degree×length = long + redundant → eject (low value)
1337
+ def sort_key(s):
1338
+ token_len = estimate_tokens(s.text)
1339
+ degree = degree_map[s.id]
1340
+ return token_len * degree
1341
+
1342
+ # Sort ASCENDING - pick short unique sentences first
1343
+ sorted_sentences = sorted(sentences, key=sort_key, reverse=False)
1344
+
1345
+ if verbose:
1346
+ top_5 = sorted_sentences[:5]
1347
+ print(f"[PIPECLEANER] Top 5 to keep (low degree×length = short + unique):")
1348
+ for i, s in enumerate(top_5, 1):
1349
+ score = sort_key(s)
1350
+ print(f" {i}. {estimate_tokens(s.text)}tok × {degree_map[s.id]}deg = {score:.0f} | '{s.text[:60]}...'")
1351
+
1352
+
1353
+ selected = []
1354
+ remaining = sorted_sentences.copy()
1355
+ entity_coverage = set()
1356
+ iteration = 0
1357
+
1358
+ while remaining:
1359
+ iteration += 1
1360
+ # Pick lowest degree×length sentence (shortest + most unique)
1361
+ best = remaining[0]
1362
+
1363
+ if verbose and iteration <= 5: # Print first 5 iterations
1364
+ score = sort_key(best)
1365
+ print(f"\n[PIPECLEANER] Iteration {iteration}:")
1366
+ print(f" Selected: '{best.text[:80]}...'")
1367
+ print(f" Degree×Length: {estimate_tokens(best.text)}tok × {degree_map[best.id]}deg = {score:.0f}")
1368
+ print(f" Entities: {best.protected_entities}")
1369
+
1370
+ # Add to selected
1371
+ selected.append(best)
1372
+ entity_coverage |= best.protected_entities
1373
+
1374
+ # Remove from remaining
1375
+ remaining.remove(best)
1376
+
1377
+ # Find similar neighbors to remove (using hybrid similarity)
1378
+ to_remove = []
1379
+ for candidate in remaining:
1380
+ if are_sentences_similar(best, candidate, semantic_threshold=similarity_threshold):
1381
+ # Get semantic sim for logging
1382
+ sem_sim = compute_similarity(best.embedding, candidate.embedding)
1383
+ to_remove.append((candidate, sem_sim))
1384
+
1385
+ if verbose and iteration <= 5 and to_remove:
1386
+ print(f" Removing {len(to_remove)} similar sentences (similarity >= {similarity_threshold})")
1387
+
1388
+ # Remove similar sentences
1389
+ for candidate, sim in to_remove:
1390
+ remaining.remove(candidate)
1391
+
1392
+ # HARD GUARD: Check removed sentences for unique entities
1393
+ # Only re-add if they have MULTIPLE (3+) meaningful unique entities
1394
+ # This prevents re-adding for trivial differences
1395
+ re_added = 0
1396
+ for candidate, sim in to_remove:
1397
+ unique_entities = candidate.protected_entities - entity_coverage
1398
+
1399
+ # Require at least 3 unique entities OR at least 1 unique multi-word entity
1400
+ multi_word_entities = {e for e in unique_entities if ' ' in e or len(e) > 10}
1401
+ should_readd = len(unique_entities) >= 3 or len(multi_word_entities) >= 1
1402
+
1403
+ if should_readd:
1404
+ if verbose and iteration <= 5:
1405
+ print(f" ⚠️ RE-ADDING sentence with {len(unique_entities)} unique entities: {unique_entities}")
1406
+ print(f" Text: '{candidate.text[:80]}...'")
1407
+ selected.append(candidate)
1408
+ entity_coverage |= candidate.protected_entities
1409
+ re_added += 1
1410
+
1411
+ if verbose and iteration <= 5 and re_added:
1412
+ print(f" Re-added {re_added} sentences to preserve entity coverage")
1413
+
1414
+ if verbose:
1415
+ print(f"\n[PIPECLEANER] Selection complete:")
1416
+ print(f" Input: {len(sentences)} sentences")
1417
+ print(f" Output: {len(selected)} sentences")
1418
+ print(f" Reduction: {(1 - len(selected)/len(sentences))*100:.1f}%")
1419
+ print(f" Entities preserved: {len(entity_coverage)}")
1420
+
1421
+ return selected
1422
+
1423
+
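+ # Illustrative sketch (assumption: demo only, toy numbers). Shows the degree×length
+ # ordering used above on (text, tokens, degree) tuples: short, unique sentences sort
+ # first and are kept; long, highly-connected sentences sort last and tend to be pruned.
+ def _demo_degree_length_ordering():
+     toy = [("short unique fact", 10, 0),
+            ("medium sentence with one near-duplicate", 40, 1),
+            ("long boilerplate repeated across several sources", 120, 5)]
+     ranked = sorted(toy, key=lambda t: t[1] * t[2])   # ascending token_len × degree
+     for text, tokens, degree in ranked:
+         print(f"{tokens * degree:>4} | {text}")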
1424
+ def deduplicate_search_results(
1425
+ text: str,
1426
+ similarity_threshold: float = 0.60,
1427
+ verbose: bool = True,
1428
+ cached_model=None
1429
+ ) -> Tuple[str, Dict, Any]:
1430
+ """
1431
+ Main entry point: Deduplicate search results using graph-based approach.
1432
+
1433
+ Args:
1434
+ text: Raw search results text
1435
+ similarity_threshold: Cosine similarity threshold (0.60 catches cross-site paraphrases at 0.55-0.68)
1436
+ verbose: Print debug info
1437
+ cached_model: Optional cached embedding model to reuse
1438
+
1439
+ Returns:
1440
+ Tuple of (deduplicated_text, stats_dict, embedding_model)
1441
+ stats_dict contains: {
1442
+ 'original_chars': int,
1443
+ 'deduplicated_chars': int,
1444
+ 'original_sentences': int,
1445
+ 'deduplicated_sentences': int,
1446
+ 'prune_pct': float,
1447
+ 'original_tokens': int,
1448
+ 'deduplicated_tokens': int,
1449
+ 'tokens_saved': int,
1450
+ 'entity_coverage_pct': float,
1451
+ 'entities_total': int,
1452
+ 'entities_preserved': int
1453
+ }
1454
+ """
1455
+ if verbose:
1456
+ print(f"\n{'='*70}")
1457
+ print(f"[PIPECLEANER] DEDUPLICATION STARTED")
1458
+ print(f"{'='*70}")
1459
+ print(f"[PIPECLEANER] Input text: {len(text)} chars, ~{len(text.split())} words")
1460
+
1461
+ # Step 1: Split into sentences
1462
+ sentences_text = split_into_sentences(text)
1463
+
1464
+ if verbose:
1465
+ print(f"[PIPECLEANER] Split into {len(sentences_text)} sentences")
1466
+
1467
+ # Initialize stats
1468
+ stats = {
1469
+ 'original_chars': len(text),
1470
+ 'deduplicated_chars': len(text),
1471
+ 'original_sentences': len(sentences_text),
1472
+ 'deduplicated_sentences': len(sentences_text),
1473
+ 'prune_pct': 0.0,
1474
+ 'original_tokens': int(len(text) / 4),
1475
+ 'deduplicated_tokens': int(len(text) / 4),
1476
+ 'tokens_saved': 0,
1477
+ 'entity_coverage_pct': 100.0,
1478
+ 'entities_total': 0,
1479
+ 'entities_preserved': 0
1480
+ }
1481
+
1482
+ if len(sentences_text) == 0:
1483
+ if verbose:
1484
+ print(f"[PIPECLEANER] ⚠️ No sentences found, returning original text")
1485
+ return text, stats, cached_model
1486
+
1487
+ if len(sentences_text) == 1:
1488
+ if verbose:
1489
+ print(f"[PIPECLEANER] Only 1 sentence, skipping deduplication")
1490
+ return text, stats, cached_model
1491
+
1492
+ # Step 2: Compute embeddings
1493
+ # Always use the thread-safe singleton model
1494
+ model = _get_embedding_model()
1495
+
1496
+ if verbose:
1497
+ print(f"[PIPECLEANER] Computing embeddings...")
1498
+
1499
+ # L2 normalize embeddings so cosine similarity = dot product (faster)
1500
+ embeddings = model.encode(sentences_text, show_progress_bar=False, normalize_embeddings=True)
1501
+
1502
+ if verbose:
1503
+ print(f"[PIPECLEANER] Embeddings computed: shape {embeddings.shape}")
1504
+
1505
+ # Step 3: Build sentence objects with metadata
1506
+ sentences = build_sentence_objects(sentences_text, embeddings)
1507
+
1508
+ # Calculate total entities across all sentences
1509
+ all_entities = set()
1510
+ for sent in sentences:
1511
+ all_entities |= sent.protected_entities
1512
+
1513
+ # Step 4: Run greedy max-independent-set selection
1514
+ selected = greedy_max_independent_set(sentences, similarity_threshold, verbose)
1515
+
1516
+ # Calculate preserved entities
1517
+ preserved_entities = set()
1518
+ for sent in selected:
1519
+ preserved_entities |= sent.protected_entities
1520
+
1521
+ # Step 5: Reconstruct text preserving original order
1522
+ selected_by_position = sorted(selected, key=lambda s: s.position)
1523
+ deduplicated_text = '\n\n'.join(s.text for s in selected_by_position)
1524
+
1525
+ # Calculate stats
1526
+ stats['deduplicated_chars'] = len(deduplicated_text)
1527
+ stats['deduplicated_sentences'] = len(selected)
1528
+ stats['prune_pct'] = (1 - len(selected) / len(sentences_text)) * 100 if len(sentences_text) > 0 else 0
1529
+ stats['deduplicated_tokens'] = int(len(deduplicated_text) / 4)
1530
+ stats['tokens_saved'] = stats['original_tokens'] - stats['deduplicated_tokens']
1531
+ stats['entities_total'] = len(all_entities)
1532
+ stats['entities_preserved'] = len(preserved_entities)
1533
+ stats['entity_coverage_pct'] = (len(preserved_entities) / len(all_entities) * 100) if len(all_entities) > 0 else 100.0
1534
+
1535
+ if verbose:
1536
+ print(f"\n[PIPECLEANER] DEDUPLICATION COMPLETE")
1537
+ print(f" Input: {len(text)} chars")
1538
+ print(f" Output: {len(deduplicated_text)} chars")
1539
+ print(f" Reduction: {(1 - len(deduplicated_text)/len(text))*100:.1f}%")
1540
+ print(f" Sentences: {len(sentences_text)} {len(selected)}")
1541
+ print(f"{'='*70}\n")
1542
+
1543
+ return deduplicated_text, stats, model
1544
+
1545
+
1546
+ # ============================================================================
1547
+ # CONVENIENCE FUNCTIONS
1548
+ # ============================================================================
1549
+
1550
+ def estimate_tokens(text: str) -> int:
1551
+ """Rough estimate of token count (words / 0.75)."""
1552
+ return int(len(text.split()) / 0.75)
1553
+
1554
+
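+ # Illustrative sketch (assumption: demo only). Compares the words/0.75 heuristic above
+ # with the chars/4 heuristic used in the deduplication stats; both are rough
+ # approximations, not tokenizer-accurate counts.
+ def _demo_token_estimates():
+     text = "LangChain is a framework for developing applications powered by language models."
+     print(estimate_tokens(text))   # 11 words / 0.75 → 14
+     print(int(len(text) / 4))      # 80 chars / 4 → 20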
1555
+ def should_deduplicate(text: str, min_length: int = 500) -> bool:
1556
+ """
1557
+ Check if text is worth deduplicating.
1558
+
1559
+ Args:
1560
+ text: Input text
1561
+ min_length: Minimum character length to bother deduplicating
1562
+
1563
+ Returns:
1564
+ True if text should be deduplicated
1565
+ """
1566
+ return len(text) >= min_length
1567
+
1568
+
1569
+ def apply_pipecleaner_if_applicable(tool_name: str, output_str: str, selected_rules: list, cached_model=None) -> Tuple[str, Any]:
1570
+ """
1571
+ High-level function to check for filter search rules and apply deduplication.
1572
+
1573
+ This is called from capture.py's on_tool_end callback.
1574
+
1575
+ Args:
1576
+ tool_name: Name of the tool that just finished
1577
+ output_str: Raw output from the tool
1578
+ selected_rules: List of rules selected for this run
1579
+ cached_model: Optional cached embedding model to reuse across searches
1580
+
1581
+ Returns:
1582
+ Tuple of (deduplicated_output, embedding_model) for caching
1583
+ Returns (original_output, None) if no filter rule applies
1584
+ """
1585
+ try:
1586
+ # Find applicable filter search rules for this tool
1587
+ filter_rules = _find_filter_search_rules(tool_name, selected_rules)
1588
+
1589
+ # If we found applicable filter rules, apply deduplication
1590
+ if filter_rules:
1591
+ print(f"\n{'='*70}")
1592
+ print(f"[PIPECLEANER] 🧹 FILTER SEARCH RULE DETECTED")
1593
+ print(f"{'='*70}")
1594
+ print(f"[PIPECLEANER] Tool: {tool_name}")
1595
+ print(f"[PIPECLEANER] Rules matched: {len(filter_rules)}")
1596
+ for rule in filter_rules:
1597
+ rule_id = getattr(rule, 'id', 'unknown')
1598
+ advice = getattr(rule, 'advice', '') or getattr(rule, 'advice_text', '')
1599
+ print(f"[PIPECLEANER] - Rule {rule_id}: {advice[:80]}...")
1600
+ print(f"{'='*70}")
1601
+
1602
+ # Apply deduplication with cached model
1603
+ deduplicated, stats, model = deduplicate_search_results(
1604
+ text=output_str,
1605
+ similarity_threshold=0.60, # 0.60 catches cross-site paraphrases (0.55-0.68 typical)
1606
+ verbose=True, # Show detailed deduplication stats
1607
+ cached_model=cached_model # Reuse model if available
1608
+ )
1609
+
1610
+ # Print comprehensive stats after every search
1611
+ print(f"\n{'='*70}")
1612
+ print(f"[PIPECLEANER] 📊 DEDUPLICATION RESULTS")
1613
+ print(f"{'='*70}")
1614
+ print(f"[PIPECLEANER] 🔢 Sentences:")
1615
+ print(f"[PIPECLEANER] Original: {stats['original_sentences']} sentences")
1616
+ print(f"[PIPECLEANER] Deduplicated: {stats['deduplicated_sentences']} sentences")
1617
+ print(f"[PIPECLEANER] Prune %: {stats['prune_pct']:.1f}% removed")
1618
+ print(f"[PIPECLEANER]")
1619
+ print(f"[PIPECLEANER] 🎯 Entity Coverage:")
1620
+ print(f"[PIPECLEANER] Total entities: {stats['entities_total']}")
1621
+ print(f"[PIPECLEANER] Entities preserved: {stats['entities_preserved']}")
1622
+ print(f"[PIPECLEANER] Coverage: {stats['entity_coverage_pct']:.1f}%")
1623
+ print(f"[PIPECLEANER]")
1624
+ print(f"[PIPECLEANER] 💰 Token Savings (len/4):")
1625
+ print(f"[PIPECLEANER] Original tokens: {stats['original_tokens']:,} tokens")
1626
+ print(f"[PIPECLEANER] Deduplicated: {stats['deduplicated_tokens']:,} tokens")
1627
+ print(f"[PIPECLEANER] Tokens saved: {stats['tokens_saved']:,} tokens ({(stats['tokens_saved']/stats['original_tokens']*100 if stats['original_tokens'] > 0 else 0):.1f}%)")
1628
+ print(f"[PIPECLEANER]")
1629
+ print(f"[PIPECLEANER] ✅ SUCCESS: Pruned {stats['prune_pct']:.1f}% redundancy, preserved {stats['entity_coverage_pct']:.1f}% entities")
1630
+ print(f"{'='*70}\n")
1631
+
1632
+ return deduplicated, model
1633
+
1634
+ # No filter rules found, return original
1635
+ return output_str, None
1636
+
1637
+ except ImportError as e:
1638
+ print(f"\n{'='*70}")
1639
+ print(f"[PIPECLEANER] IMPORT ERROR - FAILING OPEN")
1640
+ print(f"{'='*70}")
1641
+ print(f"[PIPECLEANER] Error: {e}")
1642
+ print(f"[PIPECLEANER] Install: pip install sentence-transformers")
1643
+ print(f"{'='*70}\n")
1644
+ return output_str, None
1645
+ except Exception as e:
1646
+ print(f"\n{'='*70}")
1647
+ print(f"[PIPECLEANER] EXCEPTION - FAILING OPEN")
1648
+ print(f"{'='*70}")
1649
+ print(f"[PIPECLEANER] Error type: {type(e).__name__}")
1650
+ print(f"[PIPECLEANER] Error message: {e}")
1651
+ import traceback
1652
+ print(f"[PIPECLEANER] Traceback:")
1653
+ traceback.print_exc()
1654
+ print(f"{'='*70}\n")
1655
+ return output_str, None
1656
+
1657
+
1658
+ def _find_filter_search_rules(tool_name: str, selected_rules: list) -> list:
1659
+ """
1660
+ Find llm_start scoped rules with "filter search" keywords that apply to this tool.
1661
+
1662
+ This is called from on_llm_start when a Summary tool's LLM is about to be called.
1663
+ Rule synthesis will generate rules scoped to llm_start when it detects search→summary patterns.
1664
+
1665
+ Args:
1666
+ tool_name: Name of the tool whose LLM is starting (e.g., 'Summary')
1667
+ selected_rules: List of rules to search through
1668
+
1669
+ Returns:
1670
+ List of applicable filter search rules
1671
+ """
1672
+ filter_rules = []
1673
+
1674
+ for rule_meta in selected_rules:
1675
+ # Unwrap tuple if needed (rules come as (rule, metadata) from select_rules)
1676
+ if isinstance(rule_meta, tuple) and len(rule_meta) == 2:
1677
+ rule_obj, _metadata = rule_meta
1678
+ else:
1679
+ rule_obj = rule_meta
1680
+
1681
+ # Check if this is an llm_start scoped rule
1682
+ target_step_type = getattr(rule_obj, 'target_step_type', None)
1683
+
1684
+ # Must be scoped to llm_start (where we intercept Summary LLM calls)
1685
+ if target_step_type != 'llm_start':
1686
+ continue
1687
+
1688
+ # Check if the rule contains "filter search" keywords
1689
+ # Try both field names that might be used
1690
+ advice = getattr(rule_obj, 'advice_text', None) or getattr(rule_obj, 'advice', None) or ''
1691
+ advice_lower = advice.lower() if advice else ''
1692
+
1693
+ if not advice_lower or 'filter' not in advice_lower or 'search' not in advice_lower:
1694
+ continue
1695
+
1696
+ # Check if the rule applies to this tool
1697
+ applies = _rule_applies_to_tool(rule_obj, tool_name, advice_lower)
1698
+
1699
+ if applies:
1700
+ filter_rules.append(rule_obj)
1701
+
1702
+ return filter_rules
1703
+
1704
+
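+ # Illustrative sketch (assumption: the rule shape below — target_step_type, advice,
+ # references.tools — is a hypothetical stand-in built with SimpleNamespace, covering
+ # only the attributes this module reads; real rule objects may carry more fields).
+ def _demo_find_filter_search_rules():
+     from types import SimpleNamespace
+     rule = SimpleNamespace(
+         target_step_type='llm_start',
+         advice='Filter search results before the Summary LLM call',
+         references=SimpleNamespace(tools=['Summary']),
+     )
+     print(_find_filter_search_rules('Summary', [rule]))   # [rule] → the rule applies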
1705
+ def _rule_applies_to_tool(rule_obj, tool_name: str, advice_lower: str) -> bool:
1706
+ """
1707
+ Check if a rule applies to the given tool.
1708
+
1709
+ Args:
1710
+ rule_obj: Rule object or dict to check
1711
+ tool_name: Name of the tool (case-insensitive)
1712
+ advice_lower: Lowercased advice text for fallback matching
1713
+
1714
+ Returns:
1715
+ True if rule applies to this tool
1716
+ """
1717
+ # Wildcard matches everything (used for initial check)
1718
+ if tool_name == "*":
1719
+ return True
1720
+
1721
+ tool_name_lower = tool_name.lower()
1722
+
1723
+ # Extract references.tools from rule (handle both dict and object formats)
1724
+ if isinstance(rule_obj, dict):
1725
+ references = rule_obj.get('references', {})
1726
+ tools = references.get('tools', []) if isinstance(references, dict) else []
1727
+ else:
1728
+ references = getattr(rule_obj, 'references', None)
1729
+ if references:
1730
+ # Try both object attribute and dict access for tools
1731
+ if hasattr(references, 'tools'):
1732
+ tools = references.tools
1733
+ elif isinstance(references, dict):
1734
+ tools = references.get('tools', [])
1735
+ else:
1736
+ tools = []
1737
+ else:
1738
+ tools = []
1739
+
1740
+ if tools:
1741
+ # Check if tool_name matches any tool in references.tools (case-insensitive exact match)
1742
+ for ref_tool in tools:
1743
+ ref_tool_lower = ref_tool.lower()
1744
+ if tool_name_lower == ref_tool_lower:
1745
+ return True
1746
+ # No match found in references.tools
1747
+ return False
1748
+ else:
1749
+ # Rule has no tools list - don't apply to anything (be conservative)
1750
+ return False
1751
+
1752
+
1753
+ async def run_pipecleaner_enforcement(
1754
+ messages_or_prompts: tuple,
1755
+ callback_handler: Any,
1756
+ patch_depth: Any
1757
+ ) -> bool:
1758
+ """
1759
+ Main pipecleaner enforcement logic - parallel to run_microturn_enforcement.
1760
+
1761
+ This intercepts ToolMessage objects and applies deduplication.
1762
+
1763
+ Args:
1764
+ messages_or_prompts: Args tuple from _generate (first element is messages)
1765
+ callback_handler: DaseinCallbackHandler with rules
1766
+ patch_depth: Thread-local object with caching
1767
+
1768
+ Returns:
1769
+ True if enforcement was applied, False if skipped
1770
+ """
1771
+ try:
1772
+ print(f"[PIPECLEANER] 🧹 run_pipecleaner_enforcement called")
1773
+
1774
+ if not callback_handler or not hasattr(callback_handler, '_selected_rules'):
1775
+ return False
1776
+
1777
+ rules = callback_handler._selected_rules
1778
+ print(f"[PIPECLEANER] Found {len(rules)} rules")
1779
+
1780
+ filter_rules = _find_filter_search_rules("*", rules)
1781
+ if not filter_rules:
1782
+ return False
1783
+
1784
+ print(f"[PIPECLEANER] 🎯 Found {len(filter_rules)} filter search rules!")
1785
+
1786
+ # Extract messages from args
1787
+ if not messages_or_prompts or len(messages_or_prompts) == 0:
1788
+ return False
1789
+
1790
+ messages = messages_or_prompts[0]
1791
+ if not isinstance(messages, list):
1792
+ return False
1793
+
1794
+ # Find the most recent ToolMessage (tool result)
1795
+ tool_message = None
1796
+ for idx in range(len(messages) - 1, -1, -1):
1797
+ msg = messages[idx]
1798
+ msg_type = getattr(msg, 'type', None) or (msg.get('type') if isinstance(msg, dict) else None)
1799
+ if msg_type == 'tool':
1800
+ tool_message = msg
1801
+ break
1802
+
1803
+ if not tool_message:
1804
+ return False
1805
+
1806
+ # Extract tool name and content
1807
+ tool_name = getattr(tool_message, 'name', None) or tool_message.get('name', 'unknown')
1808
+ tool_content = str(getattr(tool_message, 'content', None) or tool_message.get('content', ''))
1809
+
1810
+ print(f"[PIPECLEANER] Tool: {tool_name}, content: {len(tool_content)} chars")
1811
+
1812
+ # Check if this tool matches our filter rules
1813
+ matching_rules = _find_filter_search_rules(tool_name, rules)
1814
+ if not matching_rules:
1815
+ print(f"[PIPECLEANER] Tool '{tool_name}' doesn't match filter rules, skipping")
1816
+ return False
1817
+
1818
+ print(f"[PIPECLEANER] 🎯 Tool '{tool_name}' matches filter rules! Starting deduplication...")
1819
+
1820
+ # Prevent infinite regression - check if we've already processed this exact message
1821
+ if not hasattr(patch_depth, 'processed_tool_messages'):
1822
+ patch_depth.processed_tool_messages = set()
1823
+
1824
+ # Create signature from tool name + content hash
1825
+ msg_signature = f"{tool_name}_{hash(tool_content[:200])}"
1826
+ if msg_signature in patch_depth.processed_tool_messages:
1827
+ print(f"[PIPECLEANER] Already processed this ToolMessage, skipping")
1828
+ return False
1829
+
1830
+ # Mark as processed
1831
+ patch_depth.processed_tool_messages.add(msg_signature)
1832
+
1833
+ # Apply deduplication
1834
+ cached_model = getattr(callback_handler, '_pipecleaner_embedding_model', None)
1835
+
1836
+ deduplicated, stats, model = deduplicate_search_results(
1837
+ text=tool_content,
1838
+ similarity_threshold=0.60, # Lowered to catch paraphrases
1839
+ verbose=True,
1840
+ cached_model=cached_model
1841
+ )
1842
+
1843
+ # Cache model
1844
+ callback_handler._pipecleaner_embedding_model = model
1845
+
1846
+ # Modify ToolMessage content IN PLACE
1847
+ if hasattr(tool_message, 'content'):
1848
+ tool_message.content = deduplicated
1849
+ elif isinstance(tool_message, dict):
1850
+ tool_message['content'] = deduplicated
1851
+
1852
+ # Cache result for potential reuse
1853
+ if not hasattr(patch_depth, 'tool_result_cache'):
1854
+ patch_depth.tool_result_cache = {}
1855
+
1856
+ result_key = f"{tool_name}_{hash(tool_content[:100])}"
1857
+ patch_depth.tool_result_cache[result_key] = deduplicated
1858
+
1859
+ print(f"[PIPECLEANER] ✅ Applied deduplication to {tool_name}")
1860
+
1861
+ # Print stats
1862
+ print(f"\n{'='*70}")
1863
+ print(f"[PIPECLEANER] 📊 DEDUPLICATION RESULTS")
1864
+ print(f"{'='*70}")
1865
+ print(f"[PIPECLEANER] 🔢 Sentences:")
1866
+ print(f"[PIPECLEANER] Original: {stats['original_sentences']} sentences")
1867
+ print(f"[PIPECLEANER] Deduplicated: {stats['deduplicated_sentences']} sentences")
1868
+ print(f"[PIPECLEANER] Prune %: {stats['prune_pct']:.1f}% removed")
1869
+ print(f"[PIPECLEANER]")
1870
+ print(f"[PIPECLEANER] 🎯 Entity Coverage:")
1871
+ print(f"[PIPECLEANER] Total entities: {stats['entities_total']}")
1872
+ print(f"[PIPECLEANER] Entities preserved: {stats['entities_preserved']}")
1873
+ print(f"[PIPECLEANER] Coverage: {stats['entity_coverage_pct']:.1f}%")
1874
+ print(f"[PIPECLEANER]")
1875
+ print(f"[PIPECLEANER] 💰 Token Savings (len/4):")
1876
+ print(f"[PIPECLEANER] Original tokens: {stats['original_tokens']:,} tokens")
1877
+ print(f"[PIPECLEANER] Deduplicated: {stats['deduplicated_tokens']:,} tokens")
1878
+ print(f"[PIPECLEANER] Tokens saved: {stats['tokens_saved']:,} tokens ({(stats['tokens_saved']/stats['original_tokens']*100 if stats['original_tokens'] > 0 else 0):.1f}%)")
1879
+ print(f"[PIPECLEANER]")
1880
+ print(f"[PIPECLEANER] ✅ SUCCESS: Pruned {stats['prune_pct']:.1f}% redundancy, preserved {stats['entity_coverage_pct']:.1f}% entities")
1881
+ print(f"{'='*70}\n")
1882
+
1883
+ return True
1884
+
1885
+ except Exception as e:
1886
+ print(f"[PIPECLEANER] ⚠️ Error during enforcement: {e}")
1887
+ import traceback
1888
+ traceback.print_exc()
1889
+ return False
1890
+
1891
+
1892
+ if __name__ == "__main__":
1893
+ # Simple test
1894
+ test_text = """
1895
+ LangChain is a framework for developing applications powered by language models.
1896
+ The LangChain framework enables developers to build LLM applications easily.
1897
+ LangChain provides many useful features for LLM apps.
1898
+ It supports multiple model providers including OpenAI and Anthropic.
1899
+ The framework was created in 2022 by Harrison Chase.
1900
+ LlamaIndex is another popular framework for LLM applications.
1901
+ LlamaIndex focuses on data indexing and retrieval.
1902
+ Both frameworks are open source and widely used.
1903
+ """
1904
+
1905
+ print("Testing pipecleaner deduplication...")
1906
+ result, stats, model = deduplicate_search_results(test_text, verbose=True)
1907
+
1908
+ print("\n" + "="*70)
1909
+ print("STATS:")
1910
+ print(f" Prune %: {stats['prune_pct']:.1f}%")
1911
+ print(f" Entity Coverage: {stats['entity_coverage_pct']:.1f}%")
1912
+ print(f" Tokens saved: {stats['tokens_saved']:,} ({(stats['tokens_saved']/stats['original_tokens']*100 if stats['original_tokens'] > 0 else 0):.1f}%)")
1913
+
1914
+ print("\n" + "="*70)
1915
+ print("ORIGINAL:")
1916
+ print(test_text)
1917
+ print("\n" + "="*70)
1918
+ print("DEDUPLICATED:")
1919
+ print(result)
1920
+