dasein_core-0.2.15-py3-none-any.whl → dasein_core-0.2.17-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dasein/api.py +4 -8
- dasein/capture.py +30 -3
- dasein/pipecleaner.py +1918 -1920
- {dasein_core-0.2.15.dist-info → dasein_core-0.2.17.dist-info}/METADATA +1 -1
- {dasein_core-0.2.15.dist-info → dasein_core-0.2.17.dist-info}/RECORD +8 -8
- {dasein_core-0.2.15.dist-info → dasein_core-0.2.17.dist-info}/WHEEL +0 -0
- {dasein_core-0.2.15.dist-info → dasein_core-0.2.17.dist-info}/licenses/LICENSE +0 -0
- {dasein_core-0.2.15.dist-info → dasein_core-0.2.17.dist-info}/top_level.txt +0 -0
dasein/pipecleaner.py
CHANGED
@@ -1,1920 +1,1918 @@
- [removed side (v0.2.15, 1,920 lines) omitted: this half of the diff did not survive extraction, and its readable portion duplicates the v0.2.17 file below]
"""
Pipecleaner: Run-scoped global corpus deduplication for multi-agent systems.

V2.0: Global ClusterBank with dynamic batching barrier (5-10s) for cross-prompt deduplication.
- Run-scoped corpus: All prompts in a run share a global ClusterBank
- SimHash near-dup matching: Hamming distance ≤6 for 64-bit fingerprints
- Dynamic barrier: 5s min, +2s per arrival (cap 10s), maximizes dedupe by collecting bursts
- Canonical ownership: First prompt to use a cluster owns it, others drop duplicates
- Entity coverage: 95% threshold RUN-LEVEL (cumulative across all batches, not per-batch)

Algorithm:
1. Intercept prompt → split sentences → compute SimHash signatures
2. Match against ClusterBank (Hamming ≤6) → assign cluster_id or create new
3. Queue prompt into micro-batch, extend barrier (+2s per arrival, cap 10s)
4. On timer: cross-prompt dedupe (keep only canonical owners)
5. RUN-LEVEL entity coverage check (95% cumulative across entire run), re-add if needed
6. Emit cleaned prompts (original sentence order preserved)

Expected savings: 50-90% char reduction with 95%+ entity coverage across entire run.
Later batches are MORE aggressive (earlier batches already covered entities).
"""

import re
import hashlib
import threading
import time
from typing import List, Dict, Set, Tuple, Optional, Any
from dataclasses import dataclass, field
from collections import defaultdict
import numpy as np
import asyncio

# Type alias for return type
DeduplicationResult = Tuple[str, Dict]

# Lazy imports for performance (only load when needed)
_embedding_model = None
_spacy_nlp = None
_model_lock = threading.Lock()  # Thread-safe singleton access


def _vprint(message: str, verbose: bool = False, force: bool = False):
    """Helper function for verbose printing."""
    if force or verbose:
        print(message)


def _get_embedding_model():
    """
    Lazy load sentence transformer model (thread-safe singleton).
    Forces CPU to avoid meta tensor issues on Win + Py3.13 + Torch.
    """
    global _embedding_model

    # Double-checked locking pattern for performance
    if _embedding_model is None:
        with _model_lock:
            # Check again inside lock (another thread might have loaded it)
            if _embedding_model is None:
                try:
                    from sentence_transformers import SentenceTransformer
                    print("[PIPECLEANER] Loading embedding model: all-MiniLM-L6-v2 (384-dim, ~80MB)...")
                    # Force CPU device to avoid meta tensor issues
                    _embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
                    print("[PIPECLEANER] ✅ Embedding model loaded successfully (CPU)")
                except ImportError:
                    print("[PIPECLEANER] ⚠️ sentence-transformers not installed. Install: pip install sentence-transformers")
                    raise
                except Exception as e:
                    print(f"[PIPECLEANER] ⚠️ Failed to load embedding model: {e}")
                    raise

    return _embedding_model


def _get_spacy_model():
    """Lazy load spaCy model for entity extraction."""
    global _spacy_nlp
    if _spacy_nlp is None:
        try:
            import spacy
            print("[PIPECLEANER] Loading spaCy model: en_core_web_sm...")
            _spacy_nlp = spacy.load("en_core_web_sm")
            print("[PIPECLEANER] ✅ spaCy model loaded successfully")
        except ImportError:
            print("[PIPECLEANER] ⚠️ spaCy not installed. Using regex fallback for entities.")
            _spacy_nlp = "fallback"
        except OSError:
            print("[PIPECLEANER] ⚠️ spaCy model not found. Using regex fallback for entities.")
            _spacy_nlp = "fallback"
    return _spacy_nlp


# ============================================================================
# Run-Scoped Global Corpus System V2.0
# ============================================================================

@dataclass
class SentenceCluster:
    """Represents a cluster of similar sentences across the run."""
    cluster_id: str
    canonical_sentence: str
    owner_prompt_id: str  # First prompt to use this cluster
    simhash: int  # 64-bit SimHash fingerprint
    salience: float
    entities: Set[str]
    first_seen_seq: int
    length: int
    embedding: Optional[np.ndarray] = None  # Sentence embedding for cosine similarity

@dataclass
class PromptState:
    """State for a single prompt in the batch."""
    prompt_id: str
    sentences: List[str]
    cluster_ids: List[str]  # parallel to sentences
    original_order: List[int]  # track reordering
    entities: Set[str]
    arrived_at: float

@dataclass
class RunCorpusTelemetry:
    """Run-level statistics for the corpus."""
    prompts_total: int = 0
    sentences_total: int = 0
    clusters_total: int = 0
    cross_prompt_dups_removed: int = 0
    chars_in: int = 0
    chars_out: int = 0
    tokens_saved: int = 0
    entity_coverage_avg: float = 100.0
    batches_processed: int = 0
    avg_barrier_ms: float = 0.0
    max_barrier_ms: float = 0.0
    barrier_times: List[float] = field(default_factory=list)


def compute_simhash(text: str, hash_bits: int = 64) -> int:
    """
    Compute SimHash fingerprint for near-dup detection.

    Args:
        text: Input text
        hash_bits: Hash size (64-bit default)

    Returns:
        Integer hash value
    """
    # Tokenize and compute feature hashes
    tokens = re.findall(r'\b\w+\b', text.lower())
    if not tokens:
        return 0

    # Initialize bit vector
    v = [0] * hash_bits

    for token in tokens:
        # Hash each token
        h = int(hashlib.md5(token.encode()).hexdigest(), 16)

        # Update bit vector
        for i in range(hash_bits):
            if h & (1 << i):
                v[i] += 1
            else:
                v[i] -= 1

    # Generate final hash
    fingerprint = 0
    for i in range(hash_bits):
        if v[i] > 0:
            fingerprint |= (1 << i)

    return fingerprint


def hamming_distance(hash1: int, hash2: int) -> int:
    """Count differing bits between two hashes."""
    return bin(hash1 ^ hash2).count('1')

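# --- Illustrative example (sketch, not part of the published file) ------------
# How compute_simhash() and hamming_distance() combine for near-dup detection.
# Because the tokenizer strips punctuation, trivially different strings hash
# identically, near-duplicates land within a few bits of each other, and
# unrelated text differs in roughly half of the 64 bits:
#
#     a = compute_simhash("Acme Corp reported revenue of $5M in Q3 2024.")
#     b = compute_simhash("Acme Corp reported revenue of $5M in Q3 2024!")
#     hamming_distance(a, b)   # 0 -- identical token stream
#
#     c = compute_simhash("A completely unrelated sentence about gardening.")
#     hamming_distance(a, c)   # typically ~32, far above hamming_threshold=6
# ------------------------------------------------------------------------------
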
class RunScopedCorpus:
    """
    Global corpus for a single run, with dynamic batching barrier.
    All prompts in the run share this corpus for cross-prompt deduplication.

    CONCURRENCY MODEL:
    - All shared state (clusters, prompt_registry, run_entities, kept_entities, batch_queue)
      is protected by `self.batch_lock` (threading.Lock)
    - All reads iterate over snapshots (dict(...), list(...)) to avoid "dict changed size" errors
    - All writes are atomic under lock (copy-on-write when possible)
    - Re-entrancy guard in caller (DaseinCallbackHandler) prevents nested calls
    - Background timer thread (_process_batch) acquires lock before any mutations
    """

    def __init__(self, run_id: str, hamming_threshold: int = 6, entity_coverage_min: float = 0.95, verbose: bool = False):
        self.run_id = run_id
        self.hamming_threshold = hamming_threshold
        self.entity_coverage_min = entity_coverage_min
        self.verbose = verbose  # Gate debug logging

        # Core state
        self.clusters: Dict[str, SentenceCluster] = {}  # cluster_id → cluster
        self.simhash_index: Dict[int, List[str]] = defaultdict(list)  # simhash → [cluster_ids]
        self.prompt_registry: Dict[str, PromptState] = {}  # prompt_id → state
        self.entity_index: Dict[str, Set[str]] = defaultdict(set)  # entity → {cluster_ids}

        # Run-level entity tracking for global coverage
        self.run_entities: Set[str] = set()  # All entities seen across entire run
        self.kept_entities: Set[str] = set()  # All entities kept across all batches

        # Batching state
        self.batch_queue: List[str] = []  # [prompt_ids] waiting for barrier
        self.batch_lock = threading.Lock()  # Protects batch_queue, batch_timer, etc.
        self.processing_lock = threading.Lock()  # CRITICAL: Ensures only ONE batch processes at a time
        self.batch_timer: Optional[threading.Timer] = None
        self.batch_start_time: Optional[float] = None
        self.barrier_duration: float = 5.0  # Start at 5s (min wait)
        self.barrier_increment: float = 2.0  # Add 2s per new arrival
        self.barrier_cap: float = 10.0  # Max 10s
        self.batch_ready = threading.Event()  # Signal when batch is processed
        self.prompt_events: Dict[str, asyncio.Event] = {}  # Per-prompt events for ASYNC sequential release
        self.prompt_loops: Dict[str, asyncio.AbstractEventLoop] = {}  # Event loops for thread-safe signaling

        # Sequence tracking
        self.next_seq = 0
        self.next_cluster_id = 0

        # Telemetry
        self.telemetry = RunCorpusTelemetry()

        _vprint(f"[CORPUS] 🏗️ Created run-scoped corpus for run_id={run_id[:8]} (barrier: 5s min, +2s/arrival, 10s cap)", self.verbose)

    def _generate_cluster_id(self) -> str:
        """Generate unique cluster ID."""
        cluster_id = f"c{self.next_cluster_id:06d}"
        self.next_cluster_id += 1
        return cluster_id

    def find_matching_cluster(self, simhash: int, sentence: str, sentence_embedding=None) -> Optional[str]:
        """
        Find existing cluster that matches this sentence using cosine similarity.

        Args:
            simhash: SimHash of the sentence (for indexing, not matching)
            sentence: Original sentence text
            sentence_embedding: Pre-computed embedding for this sentence

        Returns:
            cluster_id if match found, None otherwise
        """
        if sentence_embedding is None:
            return None

        # Check all existing clusters for semantic similarity
        # Use cosine similarity ≥ 0.60 (catches cross-site paraphrases)
        best_match_id = None
        best_similarity = 0.60  # Threshold for considering duplicate (lowered to catch paraphrases)

        # Snapshot clusters to avoid "dict changed size" errors (thread-safe read)
        with self.batch_lock:
            clusters_snapshot = dict(self.clusters)

        for cluster_id, cluster in clusters_snapshot.items():
            if cluster.canonical_sentence == sentence:
                # Exact match
                return cluster_id

            # Hybrid similarity: semantic + lexical fallback for short sentences
            if hasattr(cluster, 'embedding') and cluster.embedding is not None:
                # Semantic similarity
                similarity = np.dot(sentence_embedding, cluster.embedding)

                # Lexical fallback for short sentences (boilerplate detection)
                max_len = max(len(sentence), len(cluster.canonical_sentence))
                if max_len <= 120 and similarity < 0.60:
                    lexical_sim = compute_char_3gram_jaccard(sentence, cluster.canonical_sentence)
                    if lexical_sim >= 0.82:
                        # Boost similarity to indicate match via lexical path
                        similarity = max(similarity, 0.82)

                if similarity > best_similarity:
                    best_similarity = similarity
                    best_match_id = cluster_id

        return best_match_id

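    # --- Illustrative sketch (assumption, not part of the published file) -----
    # compute_char_3gram_jaccard, used by the lexical fallback above, is
    # referenced but not shown in this excerpt. A minimal implementation
    # consistent with its call site (character 3-gram Jaccard similarity over
    # two strings) might look like:
    #
    #     def compute_char_3gram_jaccard(a: str, b: str) -> float:
    #         def grams(s: str) -> set:
    #             s = s.lower()
    #             return {s[i:i + 3] for i in range(len(s) - 2)}
    #         ga, gb = grams(a), grams(b)
    #         return len(ga & gb) / len(ga | gb) if (ga | gb) else 0.0
    # ---------------------------------------------------------------------------
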
    def add_sentence_to_corpus(self, sentence: str, prompt_id: str, salience: float, entities: Set[str]) -> str:
        """
        Add sentence to corpus or match to existing cluster.

        Args:
            sentence: Sentence text
            prompt_id: Owner prompt
            salience: Importance score
            entities: Extracted entities

        Returns:
            cluster_id (new or matched)
        """
        # Compute SimHash
        simhash = compute_simhash(sentence)

        # Try to match existing cluster
        existing_cluster_id = self.find_matching_cluster(simhash, sentence)

        if existing_cluster_id:
            # Matched existing cluster
            return existing_cluster_id

        # Create new cluster
        cluster_id = self._generate_cluster_id()
        cluster = SentenceCluster(
            cluster_id=cluster_id,
            canonical_sentence=sentence,
            owner_prompt_id=prompt_id,
            simhash=simhash,
            salience=salience,
            entities=entities,
            first_seen_seq=self.next_seq,
            length=len(sentence)
        )

        self.clusters[cluster_id] = cluster
        self.simhash_index[simhash].append(cluster_id)

        # Update entity index
        for entity in entities:
            self.entity_index[entity].add(cluster_id)

        self.next_seq += 1
        self.telemetry.clusters_total += 1

        return cluster_id

    async def enqueue_prompt(self, prompt_id: str, prompt_text: str) -> str:
        """
        Enqueue prompt for batched processing with dynamic barrier (ASYNC - allows parallel arrivals).

        Args:
            prompt_id: Unique prompt identifier
            prompt_text: Full prompt text

        Returns:
            Deduplicated prompt text (after barrier)
        """
        arrival_time = time.time()

        # Split into sentences
        sentences = split_into_sentences(prompt_text)

        if not sentences:
            return prompt_text

        self.telemetry.prompts_total += 1
        self.telemetry.sentences_total += len(sentences)
        self.telemetry.chars_in += len(prompt_text)

        # ⚡ CRITICAL: DO NOT compute embeddings here! It blocks async arrivals.
        # Store raw sentences and compute embeddings in batch during _process_batch
        all_entities = set()

        for sentence in sentences:
            # Extract entities (fast, non-blocking)
            entities, numbers = extract_entities_regex(sentence)
            all_entities.update(entities)
            all_entities.update(numbers)

        # Create prompt state (thread-safe mutation)
        # NOTE: cluster_ids will be computed during batch processing (after embeddings)
        with self.batch_lock:
            prompt_state = PromptState(
                prompt_id=prompt_id,
                sentences=sentences,
                cluster_ids=[],  # Will be filled during _process_batch
                original_order=list(range(len(sentences))),
                entities=all_entities,
                arrived_at=arrival_time
            )

            self.prompt_registry[prompt_id] = prompt_state

        # Add to batch queue and manage barrier
        # Create per-prompt ASYNC event for sequential release
        prompt_ready = asyncio.Event()
        loop = asyncio.get_running_loop()
        self.prompt_events[prompt_id] = prompt_ready
        self.prompt_loops[prompt_id] = loop

        with self.batch_lock:
            self.batch_queue.append(prompt_id)

            if self.batch_timer is None:
                # First prompt in batch, start timer at 5s
                self.batch_start_time = arrival_time
                self.barrier_duration = 5.0
                print(f"[CORPUS] ⏱️ Starting batch barrier: 5.0s (first prompt, min wait)")
                self.batch_timer = threading.Timer(self.barrier_duration, self._process_batch)
                self.batch_timer.start()
            else:
                # Extend barrier by +2s per arrival (capped at 10s)
                elapsed = arrival_time - self.batch_start_time
                new_duration = min(elapsed + self.barrier_increment, self.barrier_cap)

                if new_duration > self.barrier_duration:
                    # Cancel old timer, start new one
                    self.batch_timer.cancel()
                    remaining = new_duration - elapsed
                    self.barrier_duration = new_duration
                    _vprint(f"[CORPUS] ⏱️ Extending barrier to {new_duration:.1f}s (+{remaining:.1f}s remaining, +{self.barrier_increment:.1f}s per arrival)", self.verbose)
                    self.batch_timer = threading.Timer(remaining, self._process_batch)
                    self.batch_timer.start()

        # ASYNC wait for THIS prompt's individual event (allows other async tasks to proceed)
        # Timeout must be generous to account for model loading on first batch
        try:
            await asyncio.wait_for(prompt_ready.wait(), timeout=30.0)  # 30s max wait (model load + processing)
            timed_out = False
        except asyncio.TimeoutError:
            timed_out = True

        if timed_out:
            # Fail open: return original text if batch processing hangs
            print(f"[CORPUS] ⚠️ Timeout waiting for batch processing, returning original prompt")
            self.telemetry.chars_out += len(prompt_text)
            return prompt_text

        # Retrieve deduplicated result
        deduplicated_text = self._get_deduplicated_prompt(prompt_id)

        if not deduplicated_text:
            # Safety: if result is missing, return original
            print(f"[CORPUS] ⚠️ Missing deduplicated result for prompt {prompt_id[:8]}, returning original")
            self.telemetry.chars_out += len(prompt_text)
            return prompt_text

        self.telemetry.chars_out += len(deduplicated_text)

        return deduplicated_text

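    # --- Worked example (illustrative, not part of the published file) --------
    # Behavior of the dynamic barrier in enqueue_prompt() above. The rule is
    # new_duration = min(elapsed + 2.0, 10.0), applied only when it exceeds
    # the current barrier_duration:
    #
    #     t=0.0s  A arrives -> timer starts, fires at t=5.0 (minimum wait)
    #     t=1.0s  B arrives -> min(1.0+2.0, 10.0) = 3.0 <= 5.0, no change
    #     t=4.0s  C arrives -> min(4.0+2.0, 10.0) = 6.0 > 5.0, fires at t=6.0
    #     t=5.5s  D arrives -> min(5.5+2.0, 10.0) = 7.5 > 6.0, fires at t=7.5
    #     t=7.0s  E arrives -> min(7.0+2.0, 10.0) = 9.0 > 7.5, fires at t=9.0
    #     t=8.5s  F arrives -> min(8.5+2.0, 10.0) = 10.0, capped, fires at t=10.0
    #
    # Quiet runs pay only the 5s minimum; bursts keep extending the barrier,
    # never past the 10s cap, so more prompts share one deduplication pass.
    # ---------------------------------------------------------------------------
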
    def _process_batch(self):
        """Process current batch: cross-prompt dedupe, entity coverage check, emit (synchronous)."""
        # CRITICAL: Acquire processing lock to prevent multiple batches from processing simultaneously
        with self.processing_lock:
            with self.batch_lock:
                if not self.batch_queue:
                    # No prompts to process, just return (shouldn't happen)
                    return

                batch_prompts = self.batch_queue.copy()
                self.batch_queue.clear()
                self.batch_timer = None

            batch_duration_ms = (time.time() - self.batch_start_time) * 1000
            self.telemetry.barrier_times.append(batch_duration_ms)
            self.telemetry.batches_processed += 1

            # Always show batch summary (key metric)
            print(f"\n[CORPUS] 🔄 Processing batch: {len(batch_prompts)} prompts, barrier={batch_duration_ms:.0f}ms")

            # Step 0: Compute embeddings for NEW prompts in this batch (BATCHED operation!)
            # This is done ONCE for the entire batch, allowing parallel arrivals
            _vprint(f"[CORPUS] 🧮 Computing embeddings for {len(batch_prompts)} new prompts...", self.verbose)
            model = _get_embedding_model()

            for prompt_id in batch_prompts:
                prompt_state = self.prompt_registry[prompt_id]

                if not prompt_state.cluster_ids:  # Only process if not yet clustered
                    # Compute embeddings for all sentences in this prompt (batch operation)
                    sentence_embeddings = model.encode(prompt_state.sentences, show_progress_bar=False, normalize_embeddings=True)

                    # Match/create clusters for each sentence
                    cluster_ids = []
                    for i, sentence in enumerate(prompt_state.sentences):
                        # Compute salience
                        salience = len(sentence) / 100.0
                        salience += len(re.findall(r'\b[A-Z][a-z]+', sentence)) * 0.1

                        # Extract entities
                        entities, numbers = extract_entities_regex(sentence)

                        # Match against existing clusters
                        cluster_id = self.find_matching_cluster(0, sentence, sentence_embeddings[i])

                        if cluster_id is None:
                            # Create new cluster
                            with self.batch_lock:
                                cluster_id = self._generate_cluster_id()
                                simhash = compute_simhash(sentence)

                                cluster = SentenceCluster(
                                    cluster_id=cluster_id,
                                    canonical_sentence=sentence,
                                    owner_prompt_id=prompt_id,
                                    simhash=simhash,
                                    salience=salience,
                                    entities=entities | numbers,
                                    first_seen_seq=self.next_seq,
                                    length=len(sentence),
                                    embedding=sentence_embeddings[i]
                                )

                                self.clusters[cluster_id] = cluster
                                self.next_seq += 1
                                self.telemetry.clusters_total += 1

                        cluster_ids.append(cluster_id)

                    # Update prompt state with cluster_ids
                    prompt_state.cluster_ids = cluster_ids

            _vprint(f"[CORPUS] ✅ Embeddings computed and clusters assigned", self.verbose)

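            # --- Note (illustrative, not part of the published file) ----------
            # encode(..., normalize_embeddings=True) above returns unit-length
            # vectors, so the plain np.dot() in find_matching_cluster() equals
            # cosine similarity. Equivalent check with made-up unit vectors:
            #
            #     a = np.array([0.6, 0.8]); b = np.array([1.0, 0.0])
            #     float(np.dot(a, b))   # 0.6 == cos(angle between a and b)
            # -------------------------------------------------------------------
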
            # Step 1: Collect ALL sentences from THE ENTIRE RUN (not just current batch!)
            # This is critical for true run-scoped deduplication
            all_sentences = []
            sentence_to_prompt = {}  # Map sentence_id → (prompt_id, index)
            locked_sentences = set()  # Sentences from previous batches (already emitted, can't remove)

            # Iterate over ALL prompts in registry (including previous batches)
            for prompt_id, prompt_state in self.prompt_registry.items():
                is_previous_batch = prompt_id not in batch_prompts

                for idx, (sentence_text, cluster_id) in enumerate(zip(prompt_state.sentences, prompt_state.cluster_ids)):
                    cluster = self.clusters.get(cluster_id)
                    if not cluster:
                        continue

                    # Create Sentence object for greedy algorithm
                    sent_id = f"{prompt_id}_{idx}"
                    sent_obj = Sentence(
                        id=sent_id,
                        text=sentence_text,
                        embedding=cluster.embedding,
                        entities=cluster.entities,  # Keep ALL entities for accurate coverage tracking
                        numbers=set(),  # Already in entities
                        salience=cluster.salience,
                        position=cluster.first_seen_seq
                    )
                    all_sentences.append(sent_obj)
                    sentence_to_prompt[sent_id] = (prompt_id, idx)

                    # Lock sentences from previous batches (already emitted to user)
                    if is_previous_batch:
                        locked_sentences.add(sent_id)

            _vprint(f"[CORPUS] 🌐 Run-scoped MIS: {len(all_sentences)} total sentences ({len(locked_sentences)} locked from previous batches, {len(all_sentences)-len(locked_sentences)} new)", self.verbose)
            _vprint(f"[CORPUS] 🧮 Running greedy max-independent-set on {len(all_sentences)} sentences", self.verbose)

            # Step 2: Compute degree map (needed for isolates pass later)
            degree_map = {}
            for sent in all_sentences:
                degree = 0
                for other in all_sentences:
                    if sent.id != other.id:
                        if are_sentences_similar(sent, other, semantic_threshold=0.60):
                            degree += 1
                degree_map[sent.id] = degree

            # Sanity checks
            isolates_before = [s for s in all_sentences if degree_map[s.id] == 0]
            non_isolates = [s for s in all_sentences if degree_map[s.id] > 0]
            pct_isolates = len(isolates_before) / len(all_sentences) * 100 if all_sentences else 0
            avg_degree_non_iso = sum(degree_map[s.id] for s in non_isolates) / len(non_isolates) if non_isolates else 0
            print(f"[CORPUS] 📊 Graph: isolates={pct_isolates:.1f}% (expect <20%), non-isolate avg degree={avg_degree_non_iso:.1f} (expect >3)")

            # Step 3: Run greedy maximum-independent-set selection
            # Start with LOCKED sentences (from previous batches, already emitted)
            # Then run MIS only on NEW sentences (current batch)
            selected_sentences = [s for s in all_sentences if s.id in locked_sentences]
            selected_ids = locked_sentences.copy()

            print(f"[CORPUS] 🔒 Pre-seeded MIS with {len(locked_sentences)} locked sentences from previous batches")

            # Now run MIS on NEW sentences only (exclude locked)
            new_sentences = [s for s in all_sentences if s.id not in locked_sentences]

            if new_sentences:
                # Run MIS on new sentences, considering locked ones as neighbors
                new_selected = greedy_max_independent_set(
                    new_sentences,
                    similarity_threshold=0.60,
                    verbose=False,  # Set to True for debugging
                    precomputed_degree_map=degree_map  # Pass precomputed degrees
                )

                # Add newly selected sentences
                selected_sentences.extend(new_selected)
                selected_ids.update(s.id for s in new_selected)

            _vprint(f"[CORPUS] ✅ MIS complete: {len(selected_ids)} total kept ({len(locked_sentences)} locked + {len(selected_ids)-len(locked_sentences)} new)", self.verbose)

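            # --- Note (illustrative, not part of the published file) ----------
            # greedy_max_independent_set (defined elsewhere in this module, not
            # shown in this excerpt) is the classic greedy heuristic: keep the
            # best-scoring remaining sentence, drop all of its similar
            # neighbors, repeat. A minimal sketch of that loop, assuming
            # salience-ordered selection (the real helper may differ):
            #
            #     kept = []
            #     for s in sorted(new_sentences, key=lambda s: s.salience,
            #                     reverse=True):
            #         if all(not are_sentences_similar(s, k, semantic_threshold=0.60)
            #                for k in kept):
            #             kept.append(s)
            #
            # Pre-seeding with the locked sentences guarantees text already
            # emitted in earlier batches is never retracted.
            # -------------------------------------------------------------------
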
            # Step 3: Compute NODE COVERAGE (align universe for backfill)
            # covered_nodes = S ∪ N(S) (selected + their neighbors)
            covered_nodes = set(selected_ids)
            sentence_map = {s.id: s for s in all_sentences}

            for selected_id in selected_ids:
                selected_sent = sentence_map[selected_id]
                # Add all neighbors (similar nodes)
                for other in all_sentences:
                    if other.id != selected_id:
                        if are_sentences_similar(selected_sent, other, semantic_threshold=0.60):
                            covered_nodes.add(other.id)

            total_nodes = len(all_sentences)
            node_coverage_before = len(covered_nodes) / total_nodes if total_nodes > 0 else 0.0

            _vprint(f"[CORPUS] 📊 After MIS: nodes={len(selected_ids)}/{total_nodes} kept, coverage (S∪N(S))={len(covered_nodes)}/{total_nodes} ({node_coverage_before*100:.1f}%)", self.verbose)

            # Step 4: Backfill = GREEDY SET COVER over NODES (no independence constraint!)
            # Goal: Maximize node coverage (S ∪ N(S)) by re-adding removed nodes with highest gain
            # gain(u) = |({u} ∪ N(u)) \ covered_nodes|
            backfill_added = 0
            isolates_added = 0
            target_coverage = 0.90  # 90% node coverage target

            if node_coverage_before < target_coverage:
                uncovered_count = total_nodes - len(covered_nodes)
                _vprint(f"[CORPUS] 🔧 Backfill: {uncovered_count} uncovered nodes, targeting {target_coverage*100:.0f}% coverage", self.verbose)

                # Get ALL removed sentences (candidates for backfill)
                removed_sentences = [sent for sent in all_sentences if sent.id not in selected_ids]

                # Helper: compute node gain for a candidate
                def compute_node_gain(sent):
                    """Compute how many uncovered nodes this sentence + its neighbors would cover."""
                    candidate_coverage = {sent.id}
                    # Add neighbors
                    for other in all_sentences:
                        if other.id != sent.id:
                            if are_sentences_similar(sent, other, semantic_threshold=0.60):
                                candidate_coverage.add(other.id)
                    # Gain = new nodes not already covered
                    return len(candidate_coverage - covered_nodes)

                # Debug: Print top-5 candidates by gain (first iteration only)
                if removed_sentences:
                    gains = [(sent, compute_node_gain(sent)) for sent in removed_sentences[:20]]  # Sample first 20 for speed
                    gains.sort(key=lambda x: x[1], reverse=True)
                    _vprint(f"[CORPUS] Top-5 backfill candidates by gain:", self.verbose)
                    for sent, gain in gains[:5]:
                        _vprint(f"  gain={gain}: '{sent.text[:60]}...'", self.verbose)

                # GREEDY SET COVER: repeatedly pick sentence with max gain
                iteration = 0
                while node_coverage_before < target_coverage and removed_sentences and iteration < 100:
                    # Find best candidate
                    best_sent = None
                    best_gain = 0

                    for sent in removed_sentences:
                        gain = compute_node_gain(sent)
                        if gain > best_gain:
                            best_gain = gain
                            best_sent = sent

                    if best_gain == 0:
                        _vprint(f"[CORPUS] Backfill: all remaining candidates have gain=0, stopping", self.verbose)
                        break

                    # Add best sentence back
                    selected_ids.add(best_sent.id)
                    selected_sentences.append(best_sent)

                    # Update covered_nodes: add this node + its neighbors
                    covered_nodes.add(best_sent.id)
                    for other in all_sentences:
                        if other.id != best_sent.id:
                            if are_sentences_similar(best_sent, other, semantic_threshold=0.60):
                                covered_nodes.add(other.id)

                    removed_sentences.remove(best_sent)
                    backfill_added += 1

                    # Update coverage
                    node_coverage_before = len(covered_nodes) / total_nodes
                    iteration += 1

                    if backfill_added <= 5:
                        _vprint(f"[CORPUS] ✅ Backfill +{best_gain} nodes: '{best_sent.text[:60]}...' (coverage now {node_coverage_before*100:.1f}%)", self.verbose)

                _vprint(f"[CORPUS] 📈 After backfill: +{backfill_added} sentences, node coverage {node_coverage_before*100:.1f}%)", self.verbose)

686
|
+
# Step 5: ISOLATES PASS - add uncovered degree=0 nodes
|
687
|
+
# These are unique nodes with no similar neighbors
|
688
|
+
uncovered_isolates = [sent for sent in all_sentences
|
689
|
+
if sent.id not in covered_nodes and degree_map[sent.id] == 0]
|
690
|
+
|
691
|
+
if uncovered_isolates:
|
692
|
+
_vprint(f"[CORPUS] 🔧 Isolates pass: {len(uncovered_isolates)} uncovered isolates (degree=0)", self.verbose)
|
693
|
+
|
694
|
+
for sent in uncovered_isolates:
|
695
|
+
if node_coverage_before >= target_coverage:
|
696
|
+
break
|
697
|
+
selected_ids.add(sent.id)
|
698
|
+
covered_nodes.add(sent.id)
|
699
|
+
isolates_added += 1
|
700
|
+
node_coverage_before = len(covered_nodes) / total_nodes
|
701
|
+
|
702
|
+
if isolates_added <= 5:
|
703
|
+
_vprint(f"[CORPUS] ✅ Isolate: '{sent.text[:60]}...'", self.verbose)
|
704
|
+
|
705
|
+
if isolates_added > 0:
|
706
|
+
_vprint(f"[CORPUS] 📈 After isolates: +{isolates_added} sentences, node coverage {node_coverage_before*100:.1f}%", self.verbose)
|
707
|
+
|
708
|
+
# Final coverage stats (NODE universe)
|
709
|
+
final_selected = len(selected_ids)
|
710
|
+
final_covered_nodes = len(covered_nodes)
|
711
|
+
final_node_coverage = final_covered_nodes / total_nodes if total_nodes > 0 else 0.0
|
712
|
+
|
713
|
+
# Assert denominator is |V| (all nodes, no filtering)
|
714
|
+
assert total_nodes == len(all_sentences), f"Denominator mismatch: {total_nodes} != {len(all_sentences)}"
|
715
|
+
|
716
|
+
_vprint(f"[CORPUS] ✅ Final: kept={final_selected}/{total_nodes}, covered (S∪N(S))={final_covered_nodes}/{total_nodes} ({final_node_coverage*100:.1f}%)", self.verbose)
|
717
|
+
_vprint(f"[CORPUS] 📊 Backfill={backfill_added}, Isolates={isolates_added}", self.verbose)
|
718
|
+
|
719
|
+
# Step 6: Map results back to prompts
|
720
|
+
results = {}
|
721
|
+
for prompt_id in batch_prompts:
|
722
|
+
prompt_state = self.prompt_registry[prompt_id]
|
723
|
+
kept_sentences = []
|
724
|
+
removed_count = 0
|
725
|
+
|
726
|
+
for idx, sentence_text in enumerate(prompt_state.sentences):
|
727
|
+
sent_id = f"{prompt_id}_{idx}"
|
728
|
+
if sent_id in selected_ids:
|
729
|
+
kept_sentences.append(sentence_text)
|
730
|
+
else:
|
731
|
+
removed_count += 1
|
732
|
+
|
733
|
+
results[prompt_id] = {
|
734
|
+
'kept': kept_sentences,
|
735
|
+
'removed': removed_count,
|
736
|
+
'original_count': len(prompt_state.sentences)
|
737
|
+
}
|
738
|
+
|
739
|
+
# Step 7: Store results and emit to prompts
|
740
|
+
for prompt_id in batch_prompts:
|
741
|
+
prompt_state = self.prompt_registry[prompt_id]
|
742
|
+
result = results[prompt_id]
|
743
|
+
prompt_state.sentences = result['kept']
|
744
|
+
|
745
|
+
reduction_pct = (result['removed'] / result['original_count'] * 100) if result['original_count'] > 0 else 0
|
746
|
+
_vprint(f"[CORPUS] Prompt {prompt_id[:8]}: {result['original_count']} → {len(result['kept'])} sentences ({reduction_pct:.1f}% removed)", self.verbose)
|
747
|
+
|
748
|
+
# Update telemetry
|
749
|
+
self.telemetry.entity_coverage_avg = final_node_coverage * 100 # Now tracking NODE coverage
|
750
|
+
# Always show final batch summary (key metric)
|
751
|
+
print(f"[CORPUS] ✅ Batch complete: Node coverage {final_node_coverage*100:.1f}%")
|
752
|
+
|
753
|
+
# Update telemetry
|
754
|
+
if self.telemetry.barrier_times:
|
755
|
+
self.telemetry.avg_barrier_ms = sum(self.telemetry.barrier_times) / len(self.telemetry.barrier_times)
|
756
|
+
self.telemetry.max_barrier_ms = max(self.telemetry.barrier_times)
|
757
|
+
|
758
|
+
self.telemetry.tokens_saved = (self.telemetry.chars_in - self.telemetry.chars_out) // 4
|
759
|
+
|
760
|
+
# Release prompts SEQUENTIALLY to avoid race condition in on_llm_start
|
761
|
+
_vprint(f"[CORPUS] 🚦 Releasing {len(batch_prompts)} prompts sequentially...", self.verbose)
|
762
|
+
for i, prompt_id in enumerate(batch_prompts):
|
763
|
+
event = self.prompt_events.get(prompt_id)
|
764
|
+
if event:
|
765
|
+
# Signal the asyncio.Event from the original loop thread-safely
|
766
|
+
loop = self.prompt_loops.get(prompt_id)
|
767
|
+
if loop:
|
768
|
+
loop.call_soon_threadsafe(event.set)
|
769
|
+
else:
|
770
|
+
event.set()
|
771
|
+
# Longer delay to ensure threads hit on_llm_start one at a time
|
772
|
+
if i < len(batch_prompts) - 1: # Don't delay after the last one
|
773
|
+
time.sleep(0.5) # 500ms stagger to be safe
|
774
|
+
|
775
|
+
# Clean up events to prevent memory leak
|
776
|
+
for prompt_id in batch_prompts:
|
777
|
+
self.prompt_events.pop(prompt_id, None)
|
778
|
+
self.prompt_loops.pop(prompt_id, None)
|
779
|
+
|
780
|
+
def _get_deduplicated_prompt(self, prompt_id: str) -> str:
|
781
|
+
"""Get deduplicated prompt text."""
|
782
|
+
prompt_state = self.prompt_registry.get(prompt_id)
|
783
|
+
if not prompt_state:
|
784
|
+
return ""
|
785
|
+
|
786
|
+
return "\n".join(prompt_state.sentences)
|
787
|
+
|
788
|
+
def get_telemetry_summary(self) -> str:
|
789
|
+
"""Generate human-readable telemetry summary."""
|
790
|
+
t = self.telemetry
|
791
|
+
reduction_pct = ((t.chars_in - t.chars_out) / t.chars_in * 100) if t.chars_in > 0 else 0
|
792
|
+
|
793
|
+
summary = f"""
|
794
|
+
{'='*70}
|
795
|
+
[CORPUS] 📊 RUN-SCOPED TELEMETRY (run_id={self.run_id[:8]})
|
796
|
+
{'='*70}
|
797
|
+
Prompts processed: {t.prompts_total}
|
798
|
+
Sentences total: {t.sentences_total}
|
799
|
+
Clusters created: {t.clusters_total}
|
800
|
+
Cross-prompt dups removed: {t.cross_prompt_dups_removed}
|
801
|
+
{'='*70}
|
802
|
+
Chars in: {t.chars_in:,}
|
803
|
+
Chars out: {t.chars_out:,}
|
804
|
+
Reduction: {reduction_pct:.1f}%
|
805
|
+
Tokens saved (est): {t.tokens_saved:,} tokens
|
806
|
+
{'='*70}
|
807
|
+
Node Coverage (S∪N(S)): {t.entity_coverage_avg:.1f}%
|
808
|
+
Batches processed: {t.batches_processed}
|
809
|
+
Avg barrier: {t.avg_barrier_ms:.0f}ms
|
810
|
+
Max barrier: {t.max_barrier_ms:.0f}ms
|
811
|
+
{'='*70}
|
812
|
+
"""
|
813
|
+
return summary
|
814
|
+
|
815
|
+
|
816
|
+
# Global registry of run-scoped corpuses
|
817
|
+
_run_corpuses: Dict[str, RunScopedCorpus] = {}
|
818
|
+
_corpus_lock = threading.Lock()
|
819
|
+
|
820
|
+
|
821
|
+
def get_or_create_corpus(run_id: str, verbose: bool = False) -> RunScopedCorpus:
|
822
|
+
"""Get or create run-scoped corpus (thread-safe)."""
|
823
|
+
with _corpus_lock:
|
824
|
+
if run_id not in _run_corpuses:
|
825
|
+
_run_corpuses[run_id] = RunScopedCorpus(run_id, verbose=verbose)
|
826
|
+
return _run_corpuses[run_id]
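
# Usage sketch (hypothetical run_id; comments only, nothing runs at import):
#   corpus = get_or_create_corpus("run-1234", verbose=True)
#   assert get_or_create_corpus("run-1234") is corpus  # one instance per run_id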


def cleanup_corpus(run_id: str):
    """Cleanup corpus when run ends."""
    with _corpus_lock:
        if run_id in _run_corpuses:
            corpus = _run_corpuses[run_id]
            print(corpus.get_telemetry_summary())
            del _run_corpuses[run_id]
            print(f"[CORPUS] 🗑️ Cleaned up corpus for run_id={run_id[:8]}")


# ============================================================================
# Legacy Per-Prompt Deduplication (V1.0 - Fallback)
# ============================================================================

@dataclass
class Sentence:
    """Represents a sentence with metadata for deduplication."""
    id: str
    text: str
    embedding: Optional[np.ndarray] = None
    entities: Optional[Set[str]] = None
    numbers: Optional[Set[str]] = None
    salience: float = 0.0
    position: int = 0

    def __post_init__(self):
        if self.entities is None:
            self.entities = set()
        if self.numbers is None:
            self.numbers = set()

    @property
    def protected_entities(self) -> Set[str]:
        """All entities that must be preserved."""
        return self.entities | self.numbers


def estimate_tokens(text: str) -> int:
    """Estimate token count (roughly chars/4 for English)."""
    return len(text) // 4
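
# Worked example (sketch): "Hello world, this is text" is 25 chars,
#   so estimate_tokens(...) -> 25 // 4 = 6. A crude heuristic, not a tokenizer.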


def adaptive_resize_sentences(sentences: List[str]) -> List[str]:
    """
    Adaptively resize sentences for optimal embedding similarity:
    - Long (>120 tokens): Split on commas, semicolons, conjunctions
    - Short (<40 tokens): Merge with next sentence
    - Mid (40-120 tokens): Keep as-is

    This improves cross-page similarity and reduces false uniqueness.
    """
    resized = []
    i = 0

    while i < len(sentences):
        sent = sentences[i]
        tokens = estimate_tokens(sent)

        if tokens > 120:
            # LONG: Split on commas, semicolons, and conjunctions
            # Split points: , ; : and, but, or, however, therefore (preceded by space/comma)
            split_pattern = r'(?:,\s+(?:and|but|or|however|therefore|while|although)\s+|[;:])\s+'
            chunks = re.split(split_pattern, sent)

            # Ensure each chunk is reasonable (not too tiny)
            for chunk in chunks:
                if chunk.strip() and estimate_tokens(chunk) >= 20:
                    resized.append(chunk.strip())
                elif resized:
                    # Merge tiny chunk with previous
                    resized[-1] += " " + chunk.strip()
            i += 1

        elif tokens < 40 and i + 1 < len(sentences):
            # SHORT: Merge with next sentence
            next_sent = sentences[i + 1]
            merged = sent + " " + next_sent
            merged_tokens = estimate_tokens(merged)

            # Only merge if result is ≤120 tokens (don't create overly long sentences)
            if merged_tokens <= 120:
                resized.append(merged)
                i += 2  # Skip next sentence (already merged)
            else:
                # Next sentence would make it too long, keep short one as-is
                resized.append(sent)
                i += 1

        else:
            # MID-RANGE (40-120) or last sentence: Keep as-is
            resized.append(sent)
            i += 1

    return resized
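
# Behavior sketch (assumed inputs, chars/4 token estimate):
#   adaptive_resize_sentences(["Too short.", "Also short."])
#   -> ["Too short. Also short."]  # both under 40 tokens, so they merge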


def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences with special handling for markdown structures,
    then adaptively resize for optimal embedding similarity.

    Handles:
    - Standard sentences ending with .!?
    - Bullet points and numbered lists
    - Code blocks (preserve as single units)
    - Headers
    - Adaptive resizing: long sentences split, short ones merged
    """
    sentences = []

    # First, protect code blocks
    code_block_pattern = r'```[\s\S]*?```'
    code_blocks = {}
    for i, match in enumerate(re.finditer(code_block_pattern, text)):
        placeholder = f"__CODE_BLOCK_{i}__"
        code_blocks[placeholder] = match.group()
        text = text.replace(match.group(), placeholder)

    # Split on sentence boundaries
    # Handle: . ! ? followed by space/newline, or newlines with list markers
    patterns = [
        r'(?<=[.!?])\s+(?=[A-Z])',  # Standard sentences
        r'\n\s*[-*•]\s+',  # Bullet points
        r'\n\s*\d+\.\s+',  # Numbered lists
        r'\n#{1,6}\s+',  # Markdown headers
        r'\n\s*\n',  # Paragraph breaks
    ]

    # Use non-capturing groups so delimiters are discarded by re.split
    combined_pattern = '(?:' + '|'.join(patterns) + ')'
    parts = re.split(combined_pattern, text)

    # Collect non-empty segments as sentences
    sentences = [p.strip() for p in parts if p and p.strip()]

    # Restore code blocks
    restored = []
    for sent in sentences:
        for placeholder, code in code_blocks.items():
            sent = sent.replace(placeholder, code)
        if sent.strip():
            restored.append(sent.strip())

    # ADAPTIVE RESIZING: Split long sentences, merge short ones
    resized = adaptive_resize_sentences(restored)

    return resized
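
# Sketch of the full pipeline on an assumed input:
#   split_into_sentences("First point. Second point.\n- bullet item")
#   -> regex pass yields ["First point.", "Second point.", "bullet item"],
#      then adaptive resizing merges the short fragments:
#      ["First point. Second point.", "bullet item"]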


def extract_entities_regex(text: str) -> Tuple[Set[str], Set[str]]:
    """
    Fallback regex-based entity extraction.

    Returns:
        (entities, numbers) - Sets of extracted entities and numbers
    """
    entities = set()
    numbers = set()

    # Proper nouns: Capitalized words (basic heuristic) - at least 3 chars
    proper_nouns = re.findall(r'\b[A-Z][a-z]{2,}(?:\s+[A-Z][a-z]+)*\b', text)
    entities.update(proper_nouns)

    # Technical terms: CamelCase, snake_case, package names
    technical = re.findall(r'\b[A-Z][a-z]+[A-Z]\w+\b', text)  # CamelCase
    technical += re.findall(r'\b\w+_\w+\b', text)  # snake_case
    entities.update(technical)

    # Numbers: MEANINGFUL numbers only (exclude single digits 0-9)
    # Include: multi-digit numbers, floats, percentages, version numbers
    nums = re.findall(r'\b\d{2,}(?:\.\d+)?%?\b', text)  # 2+ digits
    nums += re.findall(r'\b\d+\.\d+\b', text)  # Floats like 14.4, 2.0
    numbers.update(nums)

    # Dates: YYYY-MM-DD, MM/DD/YYYY, etc.
    dates = re.findall(r'\b\d{4}[-/]\d{1,2}[-/]\d{1,4}\b', text)  # Full dates
    dates += re.findall(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b', text)
    numbers.update(dates)

    # Filter out common non-informative words and malformed entities
    stopwords = {
        # Common words
        'The', 'This', 'That', 'These', 'Those', 'What', 'Where', 'When', 'Why', 'How', 'Who', 'Which',
        'Welcome', 'Search', 'Summary', 'Source', 'Url', 'Http', 'Https', 'One', 'Two', 'Three', 'Four', 'Five',
        'Key', 'Our', 'Its', 'It', 'For', 'With', 'And', 'But', 'Not', 'You', 'All', 'Can', 'Her', 'Was',
        'She', 'Has', 'Had', 'His', 'Him', 'Are', 'Were', 'Been', 'Being', 'Have', 'Does', 'Did', 'Will',
        # Markup/formatting artifacts
        'URL', 'Http', 'Https', 'PDF', 'CSV', 'JSON', 'XML', 'HTML',
    }

    # Filter entities
    filtered_entities = set()
    for e in entities:
        # Skip short entities
        if len(e) < 3:
            continue

        # Skip if contains newlines (malformed extraction)
        if '\n' in e:
            continue

        # Skip stopwords (case-insensitive)
        if e in stopwords or e.lower() in {s.lower() for s in stopwords}:
            continue

        # Skip if it's just a URL fragment
        if e.lower() in ['url', 'http', 'https', 'www']:
            continue

        # Skip if ends with common suffixes that indicate malformed extraction
        if e.endswith('---') or e.endswith('...') or e.endswith('--'):
            continue

        filtered_entities.add(e)

    # Filter numbers - remove single digits 0-9 (often SOURCE numbers)
    filtered_numbers = {n for n in numbers if len(n) >= 2 or '.' in n or '%' in n}

    return filtered_entities, filtered_numbers
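
# Sketch (assumed input): extract_entities_regex("LangChain raised 14.4 in 2022")
#   -> entities include 'LangChain' (CamelCase match),
#      numbers include '14.4' and '2022'; single digits would be dropped.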


def extract_entities_spacy(text: str, nlp) -> Tuple[Set[str], Set[str]]:
    """
    spaCy-based entity extraction (more accurate).

    Returns:
        (entities, numbers) - Sets of extracted entities and numbers
    """
    entities = set()
    numbers = set()

    doc = nlp(text)

    # Named entities
    for ent in doc.ents:
        if ent.label_ in ['PERSON', 'ORG', 'GPE', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW']:
            entities.add(ent.text)
        elif ent.label_ in ['DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']:
            numbers.add(ent.text)

    # Also grab technical terms (capitalized noun phrases)
    for chunk in doc.noun_chunks:
        if chunk.text[0].isupper():
            entities.add(chunk.text)

    # Apply SAME filtering as regex version
    stopwords = {
        'The', 'This', 'That', 'These', 'Those', 'What', 'Where', 'When', 'Why', 'How', 'Who', 'Which',
        'Welcome', 'Search', 'Summary', 'Source', 'Url', 'Http', 'Https', 'One', 'Two', 'Three', 'Four', 'Five',
        'Key', 'Our', 'Its', 'It', 'For', 'With', 'And', 'But', 'Not', 'You', 'All', 'Can', 'Her', 'Was',
        'She', 'Has', 'Had', 'His', 'Him', 'Are', 'Were', 'Been', 'Being', 'Have', 'Does', 'Did', 'Will',
        'URL', 'Http', 'Https', 'PDF', 'CSV', 'JSON', 'XML', 'HTML',
    }

    # Filter entities
    filtered_entities = set()
    for e in entities:
        # Skip short entities
        if len(e) < 3:
            continue

        # Skip if contains newlines (malformed)
        if '\n' in e:
            continue

        # Skip stopwords (case-insensitive)
        if e in stopwords or e.lower() in {s.lower() for s in stopwords}:
            continue

        # Skip URL fragments
        if e.lower() in ['url', 'http', 'https', 'www']:
            continue

        # Skip malformed endings
        if e.endswith('---') or e.endswith('...') or e.endswith('--') or e.endswith('---\\nURL'):
            continue

        filtered_entities.add(e)

    # Filter numbers - remove single digits 0-9
    filtered_numbers = {n for n in numbers if len(str(n).strip()) >= 2 or '.' in str(n) or '%' in str(n)}

    return filtered_entities, filtered_numbers


def extract_entities(text: str) -> Tuple[Set[str], Set[str]]:
    """
    Extract entities and numbers from text.

    Uses spaCy if available, falls back to regex.

    Returns:
        (entities, numbers) - Sets of protected entities and numbers
    """
    nlp = _get_spacy_model()

    if nlp == "fallback":
        return extract_entities_regex(text)
    else:
        return extract_entities_spacy(text, nlp)


def compute_salience(sentence: str, position: int, total_sentences: int) -> float:
    """
    Compute salience score for a sentence.

    Factors:
    - Position: Earlier sentences weighted higher (first paragraph effect)
    - Length: Moderate length preferred (too short = filler, too long = verbose)
    - Entity density: More entities = more information-dense
    - Numbers: Presence of numbers = factual content

    Returns:
        Salience score (0.0 to 1.0, higher = more important)
    """
    score = 0.0

    # Position-based (exponential decay)
    position_weight = np.exp(-position / (total_sentences * 0.3))
    score += position_weight * 0.3

    # Length-based (optimal ~50-150 chars)
    length = len(sentence)
    if 50 <= length <= 150:
        length_weight = 1.0
    elif length < 50:
        length_weight = length / 50
    else:
        length_weight = 150 / length
    score += length_weight * 0.2

    # Entity density (basic heuristic: count capitalized words)
    words = sentence.split()
    cap_words = sum(1 for w in words if w and w[0].isupper())
    entity_density = min(cap_words / max(len(words), 1), 1.0)
    score += entity_density * 0.3

    # Number presence
    has_numbers = bool(re.search(r'\d', sentence))
    score += 0.2 if has_numbers else 0.0

    return min(score, 1.0)


def compute_char_3gram_jaccard(text1: str, text2: str) -> float:
    """
    Compute character 3-gram Jaccard similarity.
    Captures boilerplate and tight phrasing that embeddings might miss.

    Returns:
        Jaccard similarity [0, 1]
    """
    def get_3grams(text):
        text = text.lower()
        return set(text[i:i+3] for i in range(len(text) - 2))

    grams1 = get_3grams(text1)
    grams2 = get_3grams(text2)

    if not grams1 or not grams2:
        return 0.0

    intersection = len(grams1 & grams2)
    union = len(grams1 | grams2)

    return intersection / union if union > 0 else 0.0
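
# Worked example (sketch): "the cat" vs "the car" share 3-grams
#   {'the', 'he ', 'e c', ' ca'} out of a union of 6 grams (adds 'cat', 'car'),
#   so compute_char_3gram_jaccard(...) -> 4/6 ≈ 0.67.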


def compute_similarity(emb1: np.ndarray, emb2: np.ndarray) -> float:
    """
    Compute cosine similarity between two embeddings.
    Assumes embeddings are L2-normalized (unit vectors), so cosine = dot product.
    """
    return np.dot(emb1, emb2)
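
# Sketch: for unit vectors the dot product IS the cosine, e.g.
#   a = np.array([1.0, 0.0]); b = np.array([0.6, 0.8])  # both L2-normalized
#   compute_similarity(a, b) -> 0.6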


def are_sentences_similar(sent1: Sentence, sent2: Sentence, semantic_threshold: float = 0.60) -> bool:
    """
    Check if two sentences are similar using semantic + lexical signals.

    - Semantic: cosine similarity on embeddings
    - Lexical fallback: 3-gram Jaccard for short sentences (≤120 chars)

    Args:
        sent1, sent2: Sentence objects with embeddings
        semantic_threshold: Threshold for semantic similarity

    Returns:
        True if similar, False otherwise
    """
    # Primary: semantic similarity
    semantic_sim = compute_similarity(sent1.embedding, sent2.embedding)
    if semantic_sim >= semantic_threshold:
        return True

    # Fallback: lexical for short sentences (captures boilerplate)
    max_len = max(len(sent1.text), len(sent2.text))
    if max_len <= 120:  # ~30 tokens
        lexical_sim = compute_char_3gram_jaccard(sent1.text, sent2.text)
        if lexical_sim >= 0.82:  # High Jaccard = tight phrasing match
            return True

    return False
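
# Decision sketch: an edge exists when EITHER signal fires (assumed scores):
#   semantic 0.65 >= 0.60                          -> similar (paraphrase)
#   semantic 0.40, Jaccard 0.85, texts <=120 chars -> similar (boilerplate)
#   semantic 0.40, Jaccard 0.50                    -> not similar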


def build_sentence_objects(sentences_text: List[str], embeddings: np.ndarray) -> List[Sentence]:
    """
    Build Sentence objects with metadata.

    Args:
        sentences_text: List of sentence strings
        embeddings: Numpy array of embeddings (N x 384)

    Returns:
        List of Sentence objects with computed metadata
    """
    sentence_objects = []
    total = len(sentences_text)

    for i, text in enumerate(sentences_text):
        # Generate ID
        sent_id = hashlib.md5(text.encode()).hexdigest()[:8]

        # Extract entities
        entities, numbers = extract_entities(text)

        # Compute salience
        salience = compute_salience(text, i, total)

        sentence_objects.append(Sentence(
            id=sent_id,
            text=text,
            embedding=embeddings[i],
            entities=entities,
            numbers=numbers,
            salience=salience,
            position=i
        ))

    return sentence_objects


def greedy_max_independent_set(
    sentences: List[Sentence],
    similarity_threshold: float = 0.60,
    verbose: bool = True,
    precomputed_degree_map: Dict = None
) -> List[Sentence]:
    """
    Greedy maximum-independent-set selection with degree×length-aware ordering.

    Algorithm:
    1. Compute degree (# of similar neighbors) for each sentence
    2. Sort by (token_length × degree) ASCENDING → short, unique sentences are kept first
    3. Pick the lowest degree×length sentence (least redundant, cheapest to keep)
    4. Remove all similar neighbors (similarity > threshold)
    5. Check removed sentences for unique entities
    6. If a removed sentence has unique entities, re-add it (HARD GUARD)
    7. Repeat until all sentences processed

    This preserves coverage while ejecting long, low-value duplicates → bigger trims without raising the similarity bar.

    Args:
        sentences: List of Sentence objects
        similarity_threshold: Similarity threshold for edge creation (0.60 = 60% similar)
        verbose: Print debug info
        precomputed_degree_map: Optional precomputed degree map (skips the O(n²) pass)

    Returns:
        List of selected Sentence objects (deduplicated)
    """
    if verbose:
        print(f"\n[PIPECLEANER] Starting degree×length-aware greedy max-independent-set")
        print(f"[PIPECLEANER] Input: {len(sentences)} sentences")
        print(f"[PIPECLEANER] Similarity threshold: {similarity_threshold}")

    # Step 1: Use precomputed degree map (or compute if not provided)
    if precomputed_degree_map is None:
        # Compute degree (# of connections) for each sentence
        # Use hybrid similarity: semantic (0.60) OR lexical (0.82 Jaccard for short spans)
        degree_map = {}
        for sent in sentences:
            degree = 0
            for other in sentences:
                if sent.id != other.id:
                    # Hybrid check: semantic OR lexical
                    if are_sentences_similar(sent, other, semantic_threshold=similarity_threshold):
                        degree += 1
            degree_map[sent.id] = degree

        # Sanity checks (as requested)
        isolates = [s for s in sentences if degree_map[s.id] == 0]
        non_isolates = [s for s in sentences if degree_map[s.id] > 0]
        pct_isolates = len(isolates) / len(sentences) * 100 if sentences else 0
        avg_degree_non_iso = sum(degree_map[s.id] for s in non_isolates) / len(non_isolates) if non_isolates else 0

        if verbose:
            avg_degree = sum(degree_map.values()) / len(degree_map) if degree_map else 0
            print(f"[PIPECLEANER] Degree stats: avg={avg_degree:.1f}, isolates={pct_isolates:.1f}%, non-isolate avg={avg_degree_non_iso:.1f}")
            print(f"[PIPECLEANER] Sanity: isolates {pct_isolates:.0f}% (expect <20%), non-isolate avg {avg_degree_non_iso:.1f} (expect >3)")
    else:
        # Use precomputed degree map (more efficient)
        degree_map = precomputed_degree_map

    # Step 2: Sort by (token_length × degree) ASCENDING
    # LOW degree×length = short + unique → keep first (high value)
    # HIGH degree×length = long + redundant → eject (low value)
    def sort_key(s):
        token_len = estimate_tokens(s.text)
        degree = degree_map[s.id]
        return token_len * degree

    # Sort ASCENDING - pick short unique sentences first
    sorted_sentences = sorted(sentences, key=sort_key, reverse=False)

    if verbose:
        top_5 = sorted_sentences[:5]
        print(f"[PIPECLEANER] Top 5 to keep (low degree×length = short + unique):")
        for i, s in enumerate(top_5, 1):
            score = sort_key(s)
            print(f" {i}. {estimate_tokens(s.text)}tok × {degree_map[s.id]}deg = {score:.0f} | '{s.text[:60]}...'")

    selected = []
    remaining = sorted_sentences.copy()
    entity_coverage = set()
    iteration = 0

    while remaining:
        iteration += 1
        # Pick the lowest degree×length sentence (short + unique, cheapest to keep)
        best = remaining[0]

        if verbose and iteration <= 5:  # Print first 5 iterations
            score = sort_key(best)
            print(f"\n[PIPECLEANER] Iteration {iteration}:")
            print(f" Selected: '{best.text[:80]}...'")
            print(f" Degree×Length: {estimate_tokens(best.text)}tok × {degree_map[best.id]}deg = {score:.0f}")
            print(f" Entities: {best.protected_entities}")

        # Add to selected
        selected.append(best)
        entity_coverage |= best.protected_entities

        # Remove from remaining
        remaining.remove(best)

        # Find similar neighbors to remove (using hybrid similarity)
        to_remove = []
        for candidate in remaining:
            if are_sentences_similar(best, candidate, semantic_threshold=similarity_threshold):
                # Get semantic sim for logging
                sem_sim = compute_similarity(best.embedding, candidate.embedding)
                to_remove.append((candidate, sem_sim))

        if verbose and iteration <= 5 and to_remove:
            print(f" Removing {len(to_remove)} similar sentences (similarity >= {similarity_threshold})")

        # Remove similar sentences
        for candidate, sim in to_remove:
            remaining.remove(candidate)

        # HARD GUARD: Check removed sentences for unique entities
        # Only re-add if they have MULTIPLE (3+) meaningful unique entities
        # This prevents re-adding for trivial differences
        re_added = 0
        for candidate, sim in to_remove:
            unique_entities = candidate.protected_entities - entity_coverage

            # Require at least 3 unique entities OR at least 1 unique multi-word entity
            multi_word_entities = {e for e in unique_entities if ' ' in e or len(e) > 10}
            should_readd = len(unique_entities) >= 3 or len(multi_word_entities) >= 1

            if should_readd:
                if verbose and iteration <= 5:
                    print(f" ⚠️ RE-ADDING sentence with {len(unique_entities)} unique entities: {unique_entities}")
                    print(f" Text: '{candidate.text[:80]}...'")
                selected.append(candidate)
                entity_coverage |= candidate.protected_entities
                re_added += 1

        if verbose and iteration <= 5 and re_added:
            print(f" Re-added {re_added} sentences to preserve entity coverage")

    if verbose:
        print(f"\n[PIPECLEANER] Selection complete:")
        print(f" Input: {len(sentences)} sentences")
        print(f" Output: {len(selected)} sentences")
        print(f" Reduction: {(1 - len(selected)/len(sentences))*100:.1f}%")
        print(f" Entities preserved: {len(entity_coverage)}")

    return selected
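
# Minimal usage sketch (assumes `embs` is an already-computed, L2-normalized
# embedding matrix; hypothetical inputs, nothing here executes):
#   objs = build_sentence_objects(["LangChain is a framework.",
#                                  "The LangChain framework helps devs."], embs)
#   kept = greedy_max_independent_set(objs, similarity_threshold=0.60, verbose=False)
#   # near-duplicates collapse to one representative unless a dropped sentence
#   # carried enough unique entities for the hard guard to re-add it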


def deduplicate_search_results(
    text: str,
    similarity_threshold: float = 0.60,
    verbose: bool = True,
    cached_model=None
) -> Tuple[str, Dict, Any]:
    """
    Main entry point: Deduplicate search results using graph-based approach.

    Args:
        text: Raw search results text
        similarity_threshold: Cosine similarity threshold (0.60 catches cross-site paraphrases at 0.55-0.68)
        verbose: Print debug info
        cached_model: Optional cached embedding model to reuse

    Returns:
        Tuple of (deduplicated_text, stats_dict, embedding_model)
        stats_dict contains: {
            'original_chars': int,
            'deduplicated_chars': int,
            'original_sentences': int,
            'deduplicated_sentences': int,
            'prune_pct': float,
            'original_tokens': int,
            'deduplicated_tokens': int,
            'tokens_saved': int,
            'entity_coverage_pct': float,
            'entities_total': int,
            'entities_preserved': int
        }
    """
    if verbose:
        print(f"\n{'='*70}")
        print(f"[PIPECLEANER] DEDUPLICATION STARTED")
        print(f"{'='*70}")
        print(f"[PIPECLEANER] Input text: {len(text)} chars, ~{len(text.split())} words")

    # Step 1: Split into sentences
    sentences_text = split_into_sentences(text)

    if verbose:
        print(f"[PIPECLEANER] Split into {len(sentences_text)} sentences")

    # Initialize stats
    stats = {
        'original_chars': len(text),
        'deduplicated_chars': len(text),
        'original_sentences': len(sentences_text),
        'deduplicated_sentences': len(sentences_text),
        'prune_pct': 0.0,
        'original_tokens': int(len(text) / 4),
        'deduplicated_tokens': int(len(text) / 4),
        'tokens_saved': 0,
        'entity_coverage_pct': 100.0,
        'entities_total': 0,
        'entities_preserved': 0
    }

    if len(sentences_text) == 0:
        if verbose:
            print(f"[PIPECLEANER] ⚠️ No sentences found, returning original text")
        return text, stats, cached_model

    if len(sentences_text) == 1:
        if verbose:
            print(f"[PIPECLEANER] Only 1 sentence, skipping deduplication")
        return text, stats, cached_model

    # Step 2: Compute embeddings
    # Always use the thread-safe singleton model
    model = _get_embedding_model()

    if verbose:
        print(f"[PIPECLEANER] Computing embeddings...")

    # L2 normalize embeddings so cosine similarity = dot product (faster)
    embeddings = model.encode(sentences_text, show_progress_bar=False, normalize_embeddings=True)

    if verbose:
        print(f"[PIPECLEANER] Embeddings computed: shape {embeddings.shape}")

    # Step 3: Build sentence objects with metadata
    sentences = build_sentence_objects(sentences_text, embeddings)

    # Calculate total entities across all sentences
    all_entities = set()
    for sent in sentences:
        all_entities |= sent.protected_entities

    # Step 4: Run greedy max-independent-set selection
    selected = greedy_max_independent_set(sentences, similarity_threshold, verbose)

    # Calculate preserved entities
    preserved_entities = set()
    for sent in selected:
        preserved_entities |= sent.protected_entities

    # Step 5: Reconstruct text preserving original order
    selected_by_position = sorted(selected, key=lambda s: s.position)
    deduplicated_text = '\n\n'.join(s.text for s in selected_by_position)

    # Calculate stats
    stats['deduplicated_chars'] = len(deduplicated_text)
    stats['deduplicated_sentences'] = len(selected)
    stats['prune_pct'] = (1 - len(selected) / len(sentences_text)) * 100 if len(sentences_text) > 0 else 0
    stats['deduplicated_tokens'] = int(len(deduplicated_text) / 4)
    stats['tokens_saved'] = stats['original_tokens'] - stats['deduplicated_tokens']
    stats['entities_total'] = len(all_entities)
    stats['entities_preserved'] = len(preserved_entities)
    stats['entity_coverage_pct'] = (len(preserved_entities) / len(all_entities) * 100) if len(all_entities) > 0 else 100.0

    if verbose:
        print(f"\n[PIPECLEANER] DEDUPLICATION COMPLETE")
        print(f" Input: {len(text)} chars")
        print(f" Output: {len(deduplicated_text)} chars")
        print(f" Reduction: {(1 - len(deduplicated_text)/len(text))*100:.1f}%")
        print(f" Sentences: {len(sentences_text)} → {len(selected)}")
        print(f"{'='*70}\n")

    return deduplicated_text, stats, model
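
# Usage sketch (first call loads the sentence-transformer singleton):
#   cleaned, stats, model = deduplicate_search_results(raw_results, verbose=False)
#   print(stats['prune_pct'], stats['entity_coverage_pct'])
#   # callers in this module thread the returned `model` back in via cached_model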


# ============================================================================
# CONVENIENCE FUNCTIONS
# ============================================================================

def should_deduplicate(text: str, min_length: int = 500) -> bool:
    """
    Check if text is worth deduplicating.

    Args:
        text: Input text
        min_length: Minimum character length to bother deduplicating

    Returns:
        True if text should be deduplicated
    """
    return len(text) >= min_length
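
# Gate sketch: short tool outputs skip the pipeline entirely, e.g.
#   should_deduplicate("ok")        -> False (2 chars < 500)
#   should_deduplicate("x" * 1000)  -> True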


def apply_pipecleaner_if_applicable(tool_name: str, output_str: str, selected_rules: list, cached_model=None) -> Tuple[str, Any]:
    """
    High-level function to check for filter search rules and apply deduplication.

    This is called from capture.py's on_tool_end callback.

    Args:
        tool_name: Name of the tool that just finished
        output_str: Raw output from the tool
        selected_rules: List of rules selected for this run
        cached_model: Optional cached embedding model to reuse across searches

    Returns:
        Tuple of (deduplicated_output, embedding_model) for caching
        Returns (original_output, None) if no filter rule applies
    """
    try:
        # Find applicable filter search rules for this tool
        filter_rules = _find_filter_search_rules(tool_name, selected_rules)

        # If we found applicable filter rules, apply deduplication
        if filter_rules:
            print(f"\n{'='*70}")
            print(f"[PIPECLEANER] 🧹 FILTER SEARCH RULE DETECTED")
            print(f"{'='*70}")
            print(f"[PIPECLEANER] Tool: {tool_name}")
            print(f"[PIPECLEANER] Rules matched: {len(filter_rules)}")
            for rule in filter_rules:
                rule_id = getattr(rule, 'id', 'unknown')
                advice = getattr(rule, 'advice', '') or getattr(rule, 'advice_text', '')
                print(f"[PIPECLEANER] - Rule {rule_id}: {advice[:80]}...")
            print(f"{'='*70}")

            # Apply deduplication with cached model
            deduplicated, stats, model = deduplicate_search_results(
                text=output_str,
                similarity_threshold=0.60,  # 0.60 catches cross-site paraphrases (0.55-0.68 typical)
                verbose=True,  # Show detailed deduplication stats
                cached_model=cached_model  # Reuse model if available
            )

            # Print comprehensive stats after every search
            print(f"\n{'='*70}")
            print(f"[PIPECLEANER] 📊 DEDUPLICATION RESULTS")
            print(f"{'='*70}")
            print(f"[PIPECLEANER] 🔢 Sentences:")
            print(f"[PIPECLEANER] Original: {stats['original_sentences']} sentences")
            print(f"[PIPECLEANER] Deduplicated: {stats['deduplicated_sentences']} sentences")
            print(f"[PIPECLEANER] Prune %: {stats['prune_pct']:.1f}% removed")
            print(f"[PIPECLEANER]")
            print(f"[PIPECLEANER] 🎯 Entity Coverage:")
            print(f"[PIPECLEANER] Total entities: {stats['entities_total']}")
            print(f"[PIPECLEANER] Entities preserved: {stats['entities_preserved']}")
            print(f"[PIPECLEANER] Coverage: {stats['entity_coverage_pct']:.1f}%")
            print(f"[PIPECLEANER]")
            print(f"[PIPECLEANER] 💰 Token Savings (len/4):")
            print(f"[PIPECLEANER] Original tokens: {stats['original_tokens']:,} tokens")
            print(f"[PIPECLEANER] Deduplicated: {stats['deduplicated_tokens']:,} tokens")
            print(f"[PIPECLEANER] Tokens saved: {stats['tokens_saved']:,} tokens ({(stats['tokens_saved']/stats['original_tokens']*100 if stats['original_tokens'] > 0 else 0):.1f}%)")
            print(f"[PIPECLEANER]")
            print(f"[PIPECLEANER] ✅ SUCCESS: Pruned {stats['prune_pct']:.1f}% redundancy, preserved {stats['entity_coverage_pct']:.1f}% entities")
            print(f"{'='*70}\n")

            return deduplicated, model

        # No filter rules found, return original
        return output_str, None

    except ImportError as e:
        print(f"\n{'='*70}")
        print(f"[PIPECLEANER] ❌ IMPORT ERROR - FAILING OPEN")
        print(f"{'='*70}")
        print(f"[PIPECLEANER] Error: {e}")
        print(f"[PIPECLEANER] Install: pip install sentence-transformers")
        print(f"{'='*70}\n")
        return output_str, None
    except Exception as e:
        print(f"\n{'='*70}")
        print(f"[PIPECLEANER] ❌ EXCEPTION - FAILING OPEN")
        print(f"{'='*70}")
        print(f"[PIPECLEANER] Error type: {type(e).__name__}")
        print(f"[PIPECLEANER] Error message: {e}")
        import traceback
        print(f"[PIPECLEANER] Traceback:")
        traceback.print_exc()
        print(f"{'='*70}\n")
        return output_str, None


def _find_filter_search_rules(tool_name: str, selected_rules: list) -> list:
    """
    Find llm_start scoped rules with "filter search" keywords that apply to this tool.

    This is called from on_llm_start when a Summary tool's LLM is about to be called.
    Rule synthesis will generate rules scoped to llm_start when it detects search→summary patterns.

    Args:
        tool_name: Name of the tool whose LLM is starting (e.g., 'Summary')
        selected_rules: List of rules to search through

    Returns:
        List of applicable filter search rules
    """
    filter_rules = []

    for rule_meta in selected_rules:
        # Unwrap tuple if needed (rules come as (rule, metadata) from select_rules)
        if isinstance(rule_meta, tuple) and len(rule_meta) == 2:
            rule_obj, _metadata = rule_meta
        else:
            rule_obj = rule_meta

        # Check if this is an llm_start scoped rule
        target_step_type = getattr(rule_obj, 'target_step_type', None)

        # Must be scoped to llm_start (where we intercept Summary LLM calls)
        if target_step_type != 'llm_start':
            continue

        # Check if the rule contains "filter search" keywords
        # Try both field names that might be used
        advice = getattr(rule_obj, 'advice_text', None) or getattr(rule_obj, 'advice', None) or ''
        advice_lower = advice.lower() if advice else ''

        if not advice_lower or 'filter' not in advice_lower or 'search' not in advice_lower:
            continue

        # Check if the rule applies to this tool
        applies = _rule_applies_to_tool(rule_obj, tool_name, advice_lower)

        if applies:
            filter_rules.append(rule_obj)

    return filter_rules


def _rule_applies_to_tool(rule_obj, tool_name: str, advice_lower: str) -> bool:
    """
    Check if a rule applies to the given tool.

    Args:
        rule_obj: Rule object or dict to check
        tool_name: Name of the tool (case-insensitive)
        advice_lower: Lowercased advice text for fallback matching

    Returns:
        True if rule applies to this tool
    """
    # Wildcard matches everything (used for initial check)
    if tool_name == "*":
        return True

    tool_name_lower = tool_name.lower()

    # Extract references.tools from rule (handle both dict and object formats)
    if isinstance(rule_obj, dict):
        references = rule_obj.get('references', {})
        tools = references.get('tools', []) if isinstance(references, dict) else []
    else:
        references = getattr(rule_obj, 'references', None)
        if references:
            # Try both object attribute and dict access for tools
            if hasattr(references, 'tools'):
                tools = references.tools
            elif isinstance(references, dict):
                tools = references.get('tools', [])
            else:
                tools = []
        else:
            tools = []

    if tools:
        # Check if tool_name matches any tool in references.tools (case-insensitive exact match)
        for ref_tool in tools:
            ref_tool_lower = ref_tool.lower()
            if tool_name_lower == ref_tool_lower:
                return True
        # No match found in references.tools
        return False
    else:
        # Rule has no tools list - don't apply to anything (be conservative)
        return False
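
# Matching sketch (hypothetical rule shape): a rule whose references.tools is
#   ['DuckDuckGoSearch'] applies only to that tool; tool_name "*" short-circuits
#   to True; a rule with no tools list never matches (conservative default).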


async def run_pipecleaner_enforcement(
    messages_or_prompts: tuple,
    callback_handler: Any,
    patch_depth: Any
) -> bool:
    """
    Main pipecleaner enforcement logic - parallel to run_microturn_enforcement.

    This intercepts ToolMessage objects and applies deduplication.

    Args:
        messages_or_prompts: Args tuple from _generate (first element is messages)
        callback_handler: DaseinCallbackHandler with rules
        patch_depth: Thread-local object with caching

    Returns:
        True if enforcement was applied, False if skipped
    """
    try:
        print(f"[PIPECLEANER] 🧹 run_pipecleaner_enforcement called")

        if not callback_handler or not hasattr(callback_handler, '_selected_rules'):
            return False

        rules = callback_handler._selected_rules
        print(f"[PIPECLEANER] Found {len(rules)} rules")

        filter_rules = _find_filter_search_rules("*", rules)
        if not filter_rules:
            return False

        print(f"[PIPECLEANER] 🎯 Found {len(filter_rules)} filter search rules!")

        # Extract messages from args
        if not messages_or_prompts or len(messages_or_prompts) == 0:
            return False

        messages = messages_or_prompts[0]
        if not isinstance(messages, list):
            return False

        # Find the most recent ToolMessage (tool result)
        tool_message = None
        for idx in range(len(messages) - 1, -1, -1):
            msg = messages[idx]
            msg_type = getattr(msg, 'type', None) or (msg.get('type') if isinstance(msg, dict) else None)
            if msg_type == 'tool':
                tool_message = msg
                break

        if not tool_message:
            return False

        # Extract tool name and content (handle both object and dict messages)
        if isinstance(tool_message, dict):
            tool_name = tool_message.get('name', 'unknown')
            tool_content = str(tool_message.get('content', ''))
        else:
            tool_name = getattr(tool_message, 'name', None) or 'unknown'
            tool_content = str(getattr(tool_message, 'content', ''))

        print(f"[PIPECLEANER] Tool: {tool_name}, content: {len(tool_content)} chars")

        # Check if this tool matches our filter rules
        matching_rules = _find_filter_search_rules(tool_name, rules)
        if not matching_rules:
            print(f"[PIPECLEANER] Tool '{tool_name}' doesn't match filter rules, skipping")
            return False

        print(f"[PIPECLEANER] 🎯 Tool '{tool_name}' matches filter rules! Starting deduplication...")

        # Prevent infinite regression - check if we've already processed this exact message
        if not hasattr(patch_depth, 'processed_tool_messages'):
            patch_depth.processed_tool_messages = set()

        # Create signature from tool name + content hash
        msg_signature = f"{tool_name}_{hash(tool_content[:200])}"
        if msg_signature in patch_depth.processed_tool_messages:
            print(f"[PIPECLEANER] Already processed this ToolMessage, skipping")
            return False

        # Mark as processed
        patch_depth.processed_tool_messages.add(msg_signature)

        # Apply deduplication
        cached_model = getattr(callback_handler, '_pipecleaner_embedding_model', None)

        deduplicated, stats, model = deduplicate_search_results(
            text=tool_content,
            similarity_threshold=0.60,  # Lowered to catch paraphrases
            verbose=True,
            cached_model=cached_model
        )

        # Cache model
        callback_handler._pipecleaner_embedding_model = model

        # Modify ToolMessage content IN PLACE
        if hasattr(tool_message, 'content'):
            tool_message.content = deduplicated
        elif isinstance(tool_message, dict):
            tool_message['content'] = deduplicated

        # Cache result for potential reuse
        if not hasattr(patch_depth, 'tool_result_cache'):
            patch_depth.tool_result_cache = {}

        result_key = f"{tool_name}_{hash(tool_content[:100])}"
        patch_depth.tool_result_cache[result_key] = deduplicated

        print(f"[PIPECLEANER] ✅ Applied deduplication to {tool_name}")

        # Print stats
        print(f"\n{'='*70}")
        print(f"[PIPECLEANER] 📊 DEDUPLICATION RESULTS")
        print(f"{'='*70}")
        print(f"[PIPECLEANER] 🔢 Sentences:")
        print(f"[PIPECLEANER] Original: {stats['original_sentences']} sentences")
        print(f"[PIPECLEANER] Deduplicated: {stats['deduplicated_sentences']} sentences")
        print(f"[PIPECLEANER] Prune %: {stats['prune_pct']:.1f}% removed")
        print(f"[PIPECLEANER]")
        print(f"[PIPECLEANER] 🎯 Entity Coverage:")
        print(f"[PIPECLEANER] Total entities: {stats['entities_total']}")
        print(f"[PIPECLEANER] Entities preserved: {stats['entities_preserved']}")
        print(f"[PIPECLEANER] Coverage: {stats['entity_coverage_pct']:.1f}%")
        print(f"[PIPECLEANER]")
        print(f"[PIPECLEANER] 💰 Token Savings (len/4):")
        print(f"[PIPECLEANER] Original tokens: {stats['original_tokens']:,} tokens")
        print(f"[PIPECLEANER] Deduplicated: {stats['deduplicated_tokens']:,} tokens")
        print(f"[PIPECLEANER] Tokens saved: {stats['tokens_saved']:,} tokens ({(stats['tokens_saved']/stats['original_tokens']*100 if stats['original_tokens'] > 0 else 0):.1f}%)")
        print(f"[PIPECLEANER]")
        print(f"[PIPECLEANER] ✅ SUCCESS: Pruned {stats['prune_pct']:.1f}% redundancy, preserved {stats['entity_coverage_pct']:.1f}% entities")
        print(f"{'='*70}\n")

        return True

    except Exception as e:
        print(f"[PIPECLEANER] ⚠️ Error during enforcement: {e}")
        import traceback
        traceback.print_exc()
        return False


if __name__ == "__main__":
    # Simple test
    test_text = """
    LangChain is a framework for developing applications powered by language models.
    The LangChain framework enables developers to build LLM applications easily.
    LangChain provides many useful features for LLM apps.
    It supports multiple model providers including OpenAI and Anthropic.
    The framework was created in 2022 by Harrison Chase.
    LlamaIndex is another popular framework for LLM applications.
    LlamaIndex focuses on data indexing and retrieval.
    Both frameworks are open source and widely used.
    """

    print("Testing pipecleaner deduplication...")
    result, stats, model = deduplicate_search_results(test_text, verbose=True)

    print("\n" + "="*70)
    print("STATS:")
    print(f"  Prune %: {stats['prune_pct']:.1f}%")
    print(f"  Entity Coverage: {stats['entity_coverage_pct']:.1f}%")
    print(f"  Tokens saved: {stats['tokens_saved']:,} ({(stats['tokens_saved']/stats['original_tokens']*100 if stats['original_tokens'] > 0 else 0):.1f}%)")

    print("\n" + "="*70)
    print("ORIGINAL:")
    print(test_text)
    print("\n" + "="*70)
    print("DEDUPLICATED:")
    print(result)