autochunks-0.0.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. autochunk/__init__.py +9 -0
  2. autochunk/__main__.py +5 -0
  3. autochunk/adapters/__init__.py +3 -0
  4. autochunk/adapters/haystack.py +68 -0
  5. autochunk/adapters/langchain.py +81 -0
  6. autochunk/adapters/llamaindex.py +94 -0
  7. autochunk/autochunker.py +606 -0
  8. autochunk/chunkers/__init__.py +100 -0
  9. autochunk/chunkers/agentic.py +184 -0
  10. autochunk/chunkers/base.py +16 -0
  11. autochunk/chunkers/contextual_retrieval.py +151 -0
  12. autochunk/chunkers/fixed_length.py +110 -0
  13. autochunk/chunkers/html_section.py +225 -0
  14. autochunk/chunkers/hybrid_semantic_stat.py +199 -0
  15. autochunk/chunkers/layout_aware.py +192 -0
  16. autochunk/chunkers/parent_child.py +172 -0
  17. autochunk/chunkers/proposition.py +175 -0
  18. autochunk/chunkers/python_ast.py +248 -0
  19. autochunk/chunkers/recursive_character.py +215 -0
  20. autochunk/chunkers/semantic_local.py +140 -0
  21. autochunk/chunkers/sentence_aware.py +102 -0
  22. autochunk/cli.py +135 -0
  23. autochunk/config.py +76 -0
  24. autochunk/embedding/__init__.py +22 -0
  25. autochunk/embedding/adapter.py +14 -0
  26. autochunk/embedding/base.py +33 -0
  27. autochunk/embedding/hashing.py +42 -0
  28. autochunk/embedding/local.py +154 -0
  29. autochunk/embedding/ollama.py +66 -0
  30. autochunk/embedding/openai.py +62 -0
  31. autochunk/embedding/tokenizer.py +9 -0
  32. autochunk/enrichment/__init__.py +0 -0
  33. autochunk/enrichment/contextual.py +29 -0
  34. autochunk/eval/__init__.py +0 -0
  35. autochunk/eval/harness.py +177 -0
  36. autochunk/eval/metrics.py +27 -0
  37. autochunk/eval/ragas_eval.py +234 -0
  38. autochunk/eval/synthetic.py +104 -0
  39. autochunk/quality/__init__.py +31 -0
  40. autochunk/quality/deduplicator.py +326 -0
  41. autochunk/quality/overlap_optimizer.py +402 -0
  42. autochunk/quality/post_processor.py +245 -0
  43. autochunk/quality/scorer.py +459 -0
  44. autochunk/retrieval/__init__.py +0 -0
  45. autochunk/retrieval/in_memory.py +47 -0
  46. autochunk/retrieval/parent_child.py +4 -0
  47. autochunk/storage/__init__.py +0 -0
  48. autochunk/storage/cache.py +34 -0
  49. autochunk/storage/plan.py +40 -0
  50. autochunk/utils/__init__.py +0 -0
  51. autochunk/utils/hashing.py +8 -0
  52. autochunk/utils/io.py +176 -0
  53. autochunk/utils/logger.py +64 -0
  54. autochunk/utils/telemetry.py +44 -0
  55. autochunk/utils/text.py +199 -0
  56. autochunks-0.0.8.dist-info/METADATA +133 -0
  57. autochunks-0.0.8.dist-info/RECORD +61 -0
  58. autochunks-0.0.8.dist-info/WHEEL +5 -0
  59. autochunks-0.0.8.dist-info/entry_points.txt +2 -0
  60. autochunks-0.0.8.dist-info/licenses/LICENSE +15 -0
  61. autochunks-0.0.8.dist-info/top_level.txt +1 -0
autochunk/autochunker.py
@@ -0,0 +1,606 @@
+
+from __future__ import annotations
+import time, json, os, random, math
+import concurrent.futures
+from opentelemetry import context
+from typing import List, Dict, Any, Tuple, Optional, Callable
+from .config import AutoChunkConfig
+from .storage.cache import Cache
+from .storage.plan import Plan
+from .utils.hashing import content_hash
+from .utils.io import load_documents
+from .utils.logger import logger, current_job_id
+from .utils.telemetry import init_telemetry, get_tracer
+from .chunkers.fixed_length import FixedLengthChunker
+from .chunkers.sentence_aware import SentenceAwareChunker
+from .chunkers.recursive_character import RecursiveCharacterChunker
+from .chunkers.semantic_local import SemanticLocalChunker
+from .chunkers.parent_child import ParentChildChunker
+from .chunkers.layout_aware import LayoutAwareChunker
+from .chunkers.hybrid_semantic_stat import HybridSemanticStatChunker
+from .chunkers.html_section import HTMLSectionChunker
+from .chunkers.bridges.langchain.recursive import LangChainRecursiveBridge
+from .chunkers.bridges.langchain.character import LangChainCharacterBridge
+from .chunkers.bridges.langchain.markdown import LangChainMarkdownBridge
+from .chunkers.bridges.langchain.token import LangChainTokenBridge
+from .chunkers.bridges.langchain.python import LangChainPythonBridge
+from .chunkers.bridges.langchain.html import LangChainHTMLBridge
+from .chunkers.bridges.langchain.json import LangChainJSONBridge
+from .embedding.hashing import HashingEmbedding
+from .eval.harness import EvalHarness
+from .eval.ragas_eval import RagasEvaluator
+# Quality Layer - Post-Processing Pipeline
+from .quality.post_processor import apply_post_processing, NATIVE_CHUNKERS
+
+tracer = get_tracer(__name__)
+
+GENERATOR_REGISTRY = {
+    "fixed_length": FixedLengthChunker(),
+    "sentence_aware": SentenceAwareChunker(),
+    "recursive_character": RecursiveCharacterChunker(),
+    "semantic_local": SemanticLocalChunker(),
+    "parent_child": ParentChildChunker(),
+    "layout_aware": LayoutAwareChunker(),
+    "hybrid_semantic_stat": HybridSemanticStatChunker(),
+    "html_section": HTMLSectionChunker(),
+    # Framework Bridges
+    "langchain_recursive": LangChainRecursiveBridge(),
+    "langchain_character": LangChainCharacterBridge(),
+    "langchain_markdown": LangChainMarkdownBridge(),
+    "langchain_token": LangChainTokenBridge(),
+    "langchain_python": LangChainPythonBridge(),
+    "langchain_html": LangChainHTMLBridge(),
+    "langchain_json": LangChainJSONBridge(),
+}
+
+class AutoChunker:
+    def __init__(self, eval_config=None, retrieval_strategy=None, proxy_config=None, ragas_config=None, mode="light",
+                 cache_dir=".ac_cache", metadata_enrichment=None,
+                 embedding_provider=None, embedding_model_or_path=None, embedding_api_key=None, network_config=None,
+                 telemetry_enabled: bool = False,
+                 # Post-Processing Options (AutoChunks native chunkers only)
+                 enable_dedup: bool = False,
+                 enable_overlap_opt: bool = False,
+                 dedup_threshold: float = 0.98,
+                 overlap_tokens: int = 50):
+        self.cfg = AutoChunkConfig()
+        if eval_config: self.cfg.eval_config = eval_config
+        if retrieval_strategy: self.cfg.retrieval_strategy = retrieval_strategy
+        if proxy_config: self.cfg.proxy_config = proxy_config
+        if ragas_config: self.cfg.ragas_config = ragas_config
+        if embedding_provider: self.cfg.embedding_provider = embedding_provider
+        if embedding_model_or_path: self.cfg.embedding_model_or_path = embedding_model_or_path
+        if embedding_api_key: self.cfg.embedding_api_key = embedding_api_key
+        if network_config: self.cfg.network = network_config
+        self.cfg.telemetry_enabled = telemetry_enabled
+        self.cfg.mode = mode
+        self.cfg.cache_dir = cache_dir
+        if metadata_enrichment: self.cfg.metadata_enrichment = metadata_enrichment
+
+        # Post-processing config
+        self.enable_dedup = enable_dedup
+        self.enable_overlap_opt = enable_overlap_opt
+        self.dedup_threshold = dedup_threshold
+        self.overlap_tokens = overlap_tokens
+
+        init_telemetry(enabled=self.cfg.telemetry_enabled)
+
+    def optimize(self, documents: List[Dict]|str, embedding_fn=None, retriever="in_memory",
+                 framework="langchain", golden_qa=None, candidate_names: Optional[List[str]] = None,
+                 sweep_params: Optional[Dict[str, List]] = None,
+                 on_progress: Optional[Callable[[str, int], None]] = None,
+                 on_result: Optional[Callable[[Dict], None]] = None):
+
+        if on_progress: on_progress(f"Scanning documents in {documents if isinstance(documents, str) else 'memory'}...", step=1)
+
+        # Load docs if path provided
+        if isinstance(documents, str):
+            # Check if any selected candidate needs high-fidelity (Markdown) extraction
+            high_fidelity = any(c in (candidate_names or []) for c in ["layout_aware", "hybrid_semantic_stat"])
+            if not candidate_names: high_fidelity = True # Default to best quality if none specified
+
+            docs = load_documents(
+                documents,
+                on_progress=lambda m: on_progress(m, 1) if on_progress else None,
+                high_fidelity=high_fidelity
+            )
+        else:
+            docs = documents
+
+        # 1. Compute Document Hashes
+        if on_progress: on_progress("Computing document fingerprints (hashing)...", step=1)
+        for d in docs:
+            if "hash" not in d:
+                d["hash"] = content_hash(d["text"])
+        logger.info(f"Hashed {len(docs)} documents.")
+
+        corpus_hash = content_hash("".join(sorted(d["hash"] for d in docs)))
+        logger.info(f"Corpus Fingerprint: {corpus_hash[:12]}...")
+
+        with tracer.start_as_current_span("optimize") as span:
+            span.set_attribute("corpus_hash", corpus_hash)
+            span.set_attribute("mode", self.cfg.mode)
+
+            # Embedding setup (Pluggable Architecture)
+            if embedding_fn is None:
+                from .embedding import get_encoder
+                if on_progress: on_progress(f"Initializing {self.cfg.embedding_provider} encoder...", step=1)
+                encoder = get_encoder(
+                    provider=self.cfg.embedding_provider,
+                    model_name=self.cfg.embedding_model_or_path,
+                    api_key=self.cfg.embedding_api_key,
+                    cache_folder=self.cfg.network.local_models_path,
+                    trusted_orgs=self.cfg.network.trusted_orgs
+                )
+                logger.info(f"Initialized {self.cfg.embedding_provider} encoder: {encoder.model_name}")
+                if on_progress: on_progress(f"Encoder Ready: {self.cfg.embedding_provider}", step=1)
+                embedding_fn = encoder.embed_batch
+                span.set_attribute("embedding.provider", self.cfg.embedding_provider)
+                span.set_attribute("embedding.model", self.cfg.embedding_model_or_path)
+
+            harness = EvalHarness(embedding_fn, k=self.cfg.eval_config.k)
+
+            # 2. Representative Proxy Sampling (Optimized for Scale)
+            docs_sorted = sorted(docs, key=lambda x: x["id"])
+            proxy_docs = docs_sorted
+            if self.cfg.proxy_config.enabled or len(docs_sorted) > 10:
+                n_samples = max(2, int(len(docs_sorted) * (self.cfg.proxy_config.proxy_percent / 100.0)))
+                if self.cfg.mode == "light":
+                    n_samples = min(n_samples, 5)
+
+                random.seed(42) # Deterministic sampling
+                proxy_docs = random.sample(docs_sorted, min(len(docs_sorted), n_samples))
+                logger.info(f"Proxy Strategy: Optimizing on {len(proxy_docs)} representative docs (Total: {len(docs_sorted)})")
+
+            with tracer.start_as_current_span("build_synthetic_qa"):
+                cache = Cache(self.cfg.cache_dir)
+                qa = []
+
+                if golden_qa:
+                    qa = golden_qa
+                    logger.info(f"Using provided golden QA ({len(qa)} pairs)")
+                else:
+                    docs_needing_qa = []
+                    for d in proxy_docs:
+                        doc_qa_key = f"qa_doc_{d['hash']}"
+                        cached_doc_qa = cache.get_json(doc_qa_key)
+                        if cached_doc_qa:
+                            # Re-bind doc_id to the current session's path to ensure cache portability
+                            for q in cached_doc_qa:
+                                q["doc_id"] = d["id"]
+                            qa.extend(cached_doc_qa)
+                        else:
+                            docs_needing_qa.append(d)
+
+                    if docs_needing_qa:
+                        if on_progress: on_progress(f"Generating synthetic QA for {len(docs_needing_qa)} files...", step=2)
+                        for i, d in enumerate(docs_needing_qa):
+                            if on_progress: on_progress(f"Analyzing document {i+1}/{len(docs_needing_qa)} [{os.path.basename(d['id'])}]...", step=2)
+                            doc_qa = harness.build_synthetic_qa([d], lambda m: on_progress(m, 2) if on_progress else None)
+                            doc_qa_key = f"qa_doc_{d['hash']}"
+                            cache.set_json(doc_qa_key, doc_qa)
+                            qa.extend(doc_qa)
+
+                        logger.info(f"Generated QA for {len(docs_needing_qa)} documents. Total QA pool: {len(qa)}")
+                        if on_progress: on_progress(f"QA Generation complete ({len(qa)} pairs total)", step=2)
+                    else:
+                        logger.info(f"Cache Hit: Reusing {len(qa)} QA pairs for sampled docs")
+                        if on_progress: on_progress(f"Reusing {len(qa)} cached QA pairs", step=2)
+
+            span.set_attribute("num_qa_pairs", len(qa))
+
+            # Candidate grid
+            all_candidates = [
+                ("fixed_length", {"base_token_size": 512, "overlap": 64}),
+                ("recursive_character", {"base_token_size": 512}),
+                ("sentence_aware", {"base_token_size": 512}),
+                ("semantic_local", {"threshold_percentile": 0.8}),
+                ("hybrid_semantic_stat", {"alpha": 0.7, "beta": 0.3}),
+                ("parent_child", {"parent_size": 1000, "child_size": 200}),
+            ]
+
+            # Layout-aware candidates for complex formats
+            has_rich_docs = any(d.get("ext") in [".pdf", ".md", ".html", ".htm"] for d in proxy_docs)
+            if has_rich_docs:
+                all_candidates.append(("layout_aware", {"base_token_size": 512}))
+
+            # HTML Section Chunker (Native)
+            has_html = any(d.get("ext") in [".html", ".htm"] for d in proxy_docs)
+            if has_html:
+                all_candidates.append(("html_section", {"base_token_size": 512}))
+
+            # Add Framework Bridges to the candidate pool
+            all_candidates.extend([
+                ("langchain_recursive", {"base_token_size": 512, "overlap": 64}),
+                ("langchain_character", {"base_token_size": 512, "overlap": 64}),
+                ("langchain_markdown", {"base_token_size": 512, "overlap": 64}),
+                ("langchain_token", {"base_token_size": 512, "overlap": 64}),
+                ("langchain_python", {"base_token_size": 512, "overlap": 64}),
+                ("langchain_html", {"base_token_size": 512, "overlap": 64}),
+                ("langchain_json", {"base_token_size": 512, "overlap": 64}),
+            ])
+
+            if candidate_names:
+                base_candidates = [c for c in all_candidates if c[0] in candidate_names]
+            else:
+                base_candidates = all_candidates
+
+            if not base_candidates:
+                base_candidates = all_candidates[:3]
+
+            # --- Hyperparameter Sweep Expansion ---
+            candidates = []
+            if sweep_params and (sweep_params.get("chunk_sizes") or sweep_params.get("overlap_ratios")):
+                logger.info(f"Applying Hyperparameter Sweep: {sweep_params}")
+
+                sizes = sweep_params.get("chunk_sizes", [512])
+                ratios = sweep_params.get("overlap_ratios", [0.125])
+
+                for name, default_params in base_candidates:
+                    # Check if this candidate supports sizing
+                    is_sizable = "base_token_size" in default_params or name.startswith("langchain_") or name in ["fixed_length", "recursive_character", "sentence_aware", "layout_aware", "html_section"]
+
+                    if is_sizable and name != "semantic_local":
+                        for s in sizes:
+                            for r in ratios:
+                                p = default_params.copy()
+                                p["base_token_size"] = s
+                                # Calculate overlap
+                                overlap = int(s * r)
+
+                                # Apply to params if appropriate for the chunker
+                                supported_overlap = [
+                                    "fixed_length", "recursive_character", "sentence_aware",
+                                    "layout_aware", "html_section", "langchain_recursive",
+                                    "langchain_token", "langchain_character", "langchain_markdown",
+                                    "langchain_python", "langchain_html", "langchain_json"
+                                ]
+                                if "overlap" in p or name in supported_overlap:
+                                    p["overlap"] = overlap
+                                elif name == "parent_child":
+                                    # parent_child has fixed parent/child sizes; skip the sweep for now
+                                    continue
+
+                                # Use a more descriptive name for the variants
+                                variant_name = f"{name} ({s}|{int(r*100)}%)" if len(sizes) > 1 or len(ratios) > 1 else name
+                                # We pass (base_name, display_name, params)
+                                candidates.append((name, variant_name, p))
+                    else:
+                        candidates.append((name, name, default_params))
+            else:
+                # base_candidates are (name, params) tuples; expand them to (base_name, display_name, params)
+                candidates = [(c[0], c[0], c[1]) for c in base_candidates]
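+
+            # Illustrative expansion of the sweep branch above (assumed inputs, not part of the
+            # original source): with sweep_params = {"chunk_sizes": [256, 512], "overlap_ratios": [0.1]},
+            # a sizable candidate such as "fixed_length" expands into two variants:
+            #   ("fixed_length", "fixed_length (256|10%)", {"base_token_size": 256, "overlap": 25})
+            #   ("fixed_length", "fixed_length (512|10%)", {"base_token_size": 512, "overlap": 51})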
+
+            if on_progress: on_progress(f"Starting parallel evaluation of {len(candidates)} candidates...", step=3)
+            best = None
+            best_metrics = None
+            best_score = -1.0
+            reports = []
+
+            logger.info(f"Starting Parallel Optimization across {len(candidates)} candidates...")
+
+            parent_ctx = context.get_current()
+            job_id_ctx = current_job_id.get()
+            # Safety: cap workers at 4 to prevent GIL deadlock; reduce further for local models to prevent resource contention
+            safe_workers = min(4, self.cfg.parallel.embedding_concurrency or 4)
+            if self.cfg.embedding_provider == "local":
+                safe_workers = min(safe_workers, 2)
+                logger.info(f"Using reduced concurrency ({safe_workers} workers) for local embeddings to optimize resource usage.")
+
+            with concurrent.futures.ThreadPoolExecutor(max_workers=safe_workers) as executor:
+                futures = {
+                    executor.submit(self._eval_candidate, cand, proxy_docs, qa, embedding_fn, retriever, parent_ctx, job_id_ctx): cand
+                    for cand in candidates
+                }
+
+                for future in concurrent.futures.as_completed(futures):
+                    # cand_info is (base_name, display_name, params)
+                    cand_info = futures[future]
+                    try:
+                        logger.info(f"Checking result for candidate: {cand_info[1]}...")
+                        # Safety: 5-minute timeout per candidate
+                        name, params, metrics, chunks = future.result(timeout=300)
+                        logger.info(f"Retrieved result for {name}. Processing...")
+                        # Calculate the multi-objective score
+                        current_score = self._calculate_score(metrics, self.cfg.eval_config.objective)
+                        metrics["objective_score"] = current_score
+
+                        # Create result object
+                        result_entry = {
+                            "name": name,
+                            "params": params,
+                            "metrics": metrics,
+                            "score": current_score,
+                            "is_partial": True
+                        }
+
+                        # Streaming Callback
+                        if on_result:
+                            on_result(result_entry)
+
+                        # Include a small sample of chunks for the Visual Fidelity Inspector
+                        from .utils.text import count_tokens
+                        chunk_samples = []
+                        for c in chunks[:3]:
+                            chunk_samples.append({
+                                "text": c["text"],
+                                "meta": c["meta"],
+                                "tokens": count_tokens(c["text"])
+                            })
+
+                        report_entry = {
+                            "name": name,
+                            "params": params,
+                            "metrics": metrics,
+                            "chunk_samples": chunk_samples
+                        }
+
+                        logger.info(f"Evaluated {name}: Score {current_score:.4f}")
+
+                        # Fix for 0.0000 logs: use the same key mapping as _calculate_score
+                        # (the config defines e.g. 'mrr@10', but the harness returns 'mrr@k')
+                        primary_display_key = self.cfg.eval_config.metrics[0]
+                        key_map = {"ndcg@10": "ndcg@k", "mrr@10": "mrr@k", "recall@50": "recall@k"}
+                        target_key = key_map.get(primary_display_key, primary_display_key)
+
+                        metric_val = metrics.get(target_key, 0)
+
+                        # Diagnostic logging if the metric is unexpectedly zero
+                        if metric_val == 0 and current_score > 0:
+                            logger.debug(f"Metric lookup for {primary_display_key} -> {target_key} returned 0. Available keys: {list(metrics.keys())}")
+
+                        if on_progress: on_progress(f"Evaluated {name}: {metric_val:.4f}", step=3)
+
+                        if (best is None) or (current_score > best_score):
+                            best = (name, params)  # Chunks are not stored in 'best'
+                            best_metrics = metrics
+                            best_score = current_score
+                            logger.success(f"New Leader: {name} (Score: {best_score:.4f}, {list(metrics.keys())[0]}: {list(metrics.values())[0]:.4f})")
+
+                        reports.append(report_entry)
+
+                    except Exception as e:
+                        logger.error(f"Candidate evaluation failed: {e}")
+                        import traceback
+                        traceback.print_exc()
+                        if on_progress: on_progress(f"Candidate failed", step=3)
+
+            if not best:
+                raise RuntimeError("Optimization failed: No valid candidates were successfully evaluated.")
+
+            name, params = best  # Chunks are not part of 'best'
+            # The final Plan only records the winning generator and its params; if the full
+            # chunk set is needed later, re-run the winner via apply_with_generator.
+            plan = Plan(
+                id=content_hash(corpus_hash + name + json.dumps(params)),
+                corpus_hash=corpus_hash,
+                generator_pipeline={"name": name, "params": params},
+                metrics=best_metrics,
+                embedding={"name": self.cfg.embedding_provider, "model": self.cfg.embedding_model_or_path}
+            )
+            report = {"candidates": reports, "selected": {"name": name, "params": params, "metrics": best_metrics}}
+            return plan, report
+
+    def apply_with_generator(self, documents: str|List[Dict], gen_name: str, params: Dict) -> List[Dict]:
+        if isinstance(documents, str):
+            docs = load_documents(documents)
+        else:
+            docs = documents
+        gen = GENERATOR_REGISTRY[gen_name]
+        all_chunks = []
+        p = params.copy()
+        if gen_name in ["semantic_local", "hybrid_semantic_stat"] and "embedding_fn" not in p:
+            from .embedding import get_encoder
+            encoder = get_encoder(provider=self.cfg.embedding_provider, model_name=self.cfg.embedding_model_or_path, cache_folder=self.cfg.network.local_models_path, trusted_orgs=self.cfg.network.trusted_orgs)
+            p["embedding_fn"] = encoder.embed_batch
+
+        # Determine if this is a bridge chunker
+        is_bridge = gen_name.startswith("langchain_")
+
+        for d in docs:
+            p["local_models_path"] = self.cfg.network.local_models_path
+            doc_meta = d.get("metadata", {})
+
+            # Use raw_text for bridges, processed text for native chunkers
+            if is_bridge:
+                doc_text = d.get("raw_text", d["text"])
+            else:
+                doc_text = d["text"]
+
+            try:
+                for ch in gen.chunk(d["id"], doc_text, **p):
+                    # Combine doc metadata with chunk-specific metadata
+                    combined_meta = {**doc_meta, **ch.meta}
+                    all_chunks.append({
+                        "id": ch.id,
+                        "doc_id": d["id"],
+                        "text": ch.text,
+                        "meta": combined_meta
+                    })
+            except Exception as e:
+                logger.warning(f"Chunker {gen_name} failed on doc {d['id']}: {e}")
+        return all_chunks
+
+    def _calculate_score(self, metrics: Dict[str, Any], objective: str) -> float:
+        """
+        Weighted Scorer based on the target objective.
+        Weights:
+        - Quality: nDCG@k (Retrieval Precision)
+        - Coverage: Percentage of queries with a perfect match
+        - Efficiency: Penalty for excessive chunk counts
+        """
+        # Determine the anchor quality metric based on user selection
+        primary_key = self.cfg.eval_config.metrics[0]
+        key_map = {"ndcg@10": "ndcg@k", "mrr@10": "mrr@k", "recall@50": "recall@k"}
+        target_key = key_map.get(primary_key, "ndcg@k")
+
+        q = metrics.get(target_key, 0)
+        c = metrics.get("coverage", 0)
+        m = metrics.get("mrr@k", 0)
+        count = metrics.get("count", 1)
+
+        # Logarithmic penalty for chunk count
+        cost_penalty = 0.05 * math.log10(max(1, count))
+
+        if objective == "quality":
+            return q * 0.9 + m * 0.1
+        elif objective == "cost":
+            # Cost Optimized: Heavy penalty on count, but quality still matters
+            # Using inverse log scale so massive counts drop score towards 0
+            efficiency_score = 1.0 / (1.0 + 0.2 * math.log10(max(1, count)))
+            return q * 0.4 + efficiency_score * 0.6
+        elif objective == "latency":
+            # Latency Focus: MRR is king (finds answer fast at rank 1)
+            return m * 0.8 + q * 0.2
+        else:  # "balanced"
+            # Balanced: Quality (nDCG) + Coverage (Reliability) - mild cost penalty
+            # Adjusted to be less punishing for large-but-good indices
+            base_score = (q * 0.6) + (m * 0.2) + (c * 0.2)
+
+            # Capped cost penalty: Max deduction is 15% for huge indices
+            # log10(1000) = 3 -> 0.09 penalty
+            # log10(10000) = 4 -> 0.12 penalty
+            normalized_cost_penalty = 0.03 * math.log10(max(1, count))
+
+            return max(0.0, base_score - normalized_cost_penalty)
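+
+        # Worked example of the "balanced" objective (illustrative numbers, not from the package):
+        # with ndcg@k = 0.72, mrr@k = 0.60, coverage = 0.85 and count = 800 chunks,
+        #   base_score = 0.72*0.6 + 0.60*0.2 + 0.85*0.2 = 0.722
+        #   penalty    = 0.03 * log10(800)             ≈ 0.087
+        #   score      ≈ 0.635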
+
+    def _eval_candidate(self, cand: Tuple[str, str, Dict], docs: List[Dict], qa: List[Dict], embedding_fn: Any, retriever: str, otel_context=None, job_id_ctx=None) -> Tuple[str, Dict, Dict, List]:
+        if otel_context:
+            context.attach(otel_context)
+        if job_id_ctx:
+            current_job_id.set(job_id_ctx)
+        base_name, display_name, params = cand
+        with tracer.start_as_current_span(f"candidate.{display_name}") as cspan:
+            cspan.set_attribute("params", json.dumps(params))
+            gen = GENERATOR_REGISTRY[base_name]
+            p = params.copy()
+            p["local_models_path"] = self.cfg.network.local_models_path
+            if base_name in ["semantic_local", "hybrid_semantic_stat"]:
+                p["embedding_fn"] = embedding_fn
+            harness = EvalHarness(embedding_fn, k=self.cfg.eval_config.k)
+
+            # ═══════════════════════════════════════════════════════════════════
+            # FAIR EVALUATION: Use different text versions for different chunkers
+            # - Native AutoChunks: gets processed/optimized text (our value-add)
+            # - LangChain Bridges: gets raw text (fair comparison, no preprocessing)
+            # ═══════════════════════════════════════════════════════════════════
+            is_native = base_name in NATIVE_CHUNKERS
+            is_bridge = base_name.startswith("langchain_")
+
+            if is_bridge:
+                logger.info(f"[{display_name}] Starting chunking (RAW text mode)...")
+                # DIAGNOSTIC: Log exact parameters passed to bridge
+                logger.debug(f"[{display_name}] Parameters: size={params.get('base_token_size')}, overlap={params.get('overlap')}")
+            else:
+                logger.info(f"[{display_name}] Starting chunking (Processed text mode)...")
+
+            start_time = time.time()
+            chunks = []
+            for i, d in enumerate(docs):
+                try:
+                    # Use raw_text for bridges, processed text for native chunkers
+                    if is_bridge:
+                        # Bridges get raw text - no preprocessing advantage
+                        doc_text = d.get("raw_text", d["text"])
+                    else:
+                        # Native chunkers get processed text (our value-add)
+                        doc_text = d["text"]
+
+                    logger.debug(f"[{display_name}] Chunking document {i+1}/{len(docs)}...")
+                    for ch in gen.chunk(d["id"], doc_text, **p):
+                        chunks.append({"id": ch.id, "doc_id": d["id"], "text": ch.text, "meta": ch.meta})
+                except Exception as e:
+                    logger.warning(f"[{display_name}] Chunker failed on doc {d['id']}: {e}")
+
+            chunking_time = time.time() - start_time
+            logger.info(f"[{display_name}] Chunking complete: {len(chunks)} chunks in {chunking_time:.2f}s")
+
+            if not chunks:
+                logger.error(f"[{display_name}] returned zero chunks.")
+                raise ValueError(f"Chunker {display_name} returned zero chunks.")
+
+            # ═══════════════════════════════════════════════════════════════════
+            # POST-PROCESSING & QUALITY SCORING
+            # ═══════════════════════════════════════════════════════════════════
+            quality_metrics = {}
+
+            try:
+                logger.info(f"[{display_name}] Starting post-processing...")
+                pp_start = time.time()
+                # Always call post-processing. The processor internally checks 'chunker_name'
+                # and skips modifications for non-native chunkers, but still returns quality scores.
+                processed_chunks, quality_metrics = apply_post_processing(
+                    chunks=chunks,
+                    chunker_name=base_name,  # Use base_name for internal logic
+                    embedding_fn=embedding_fn,
+                    enable_dedup=self.enable_dedup,
+                    enable_overlap=self.enable_overlap_opt,
+                    dedup_threshold=self.dedup_threshold,
+                    overlap_tokens=self.overlap_tokens
+                )
+
+                pp_time = time.time() - pp_start
+                if quality_metrics.get("dedup_removed", 0) > 0:
+                    logger.info(f"[{display_name}] Post-processing: Removed {quality_metrics['dedup_removed']} duplicate chunks in {pp_time:.2f}s")
+                else:
+                    logger.info(f"[{display_name}] Post-processing complete in {pp_time:.2f}s")
+
+                chunks = processed_chunks
+            except Exception as e:
+                logger.warning(f"[{display_name}] Post-processing/Scoring failed: {e}")
+
+            # Evaluate with standard metrics (nDCG, MRR, etc.)
+            logger.info(f"[{display_name}] Starting evaluation against {len(qa)} queries...")
+            eval_start = time.time()
+            metrics = harness.evaluate(chunks, qa)
+            eval_time = time.time() - eval_start
+            logger.info(f"[{display_name}] Evaluation complete in {eval_time:.2f}s")
+
+            # Add explicit Chunk Count (Critical for UI)
+            metrics["count"] = len(chunks)
+
+            # Add quality metrics (fair: calculated for both native and bridge)
+            if quality_metrics:
+                metrics["avg_quality_score"] = quality_metrics.get("avg_quality_score", 0)
+                metrics["post_processed"] = quality_metrics.get("post_processing_applied", False)
+                metrics["dedup_removed"] = quality_metrics.get("dedup_removed", 0)
+                if "quality_dimensions" in quality_metrics:
+                    metrics["quality_coherence"] = quality_metrics["quality_dimensions"].get("coherence", 0)
+                    metrics["quality_completeness"] = quality_metrics["quality_dimensions"].get("completeness", 0)
+                    metrics["quality_density"] = quality_metrics["quality_dimensions"].get("density", 0)
+
+            for k, v in metrics.items():
+                if isinstance(v, (int, float, str, bool)):
+                    cspan.set_attribute(f"metrics.{k}", v)
+
+            # RAGAS Evaluation (Optional, Plug-and-Play)
+            if self.cfg.ragas_config.enabled:
+                logger.info(f"[{display_name}] Starting RAGAS evaluation...")
+                try:
+                    ragas_eval = RagasEvaluator(self.cfg.ragas_config)
+                    # Note: RagasEvaluator originally expected 'retrieved_ids' in each QA item.
+                    # Chunking and retrieval are decoupled here, so rather than changing the
+                    # harness `evaluate` signature, RagasEvaluator is given the embedding_fn
+                    # and performs retrieval internally before computing its metrics.
+                    ragas_metrics = ragas_eval.run(chunks, qa, embedding_fn=embedding_fn)
+                    if ragas_metrics:
+                        metrics.update(ragas_metrics)
+                        logger.success(f"[{display_name}] RAGAS Metrics: {ragas_metrics}")
+                        for k, v in ragas_metrics.items():
+                            cspan.set_attribute(f"metrics.ragas.{k}", v)
+
+                except Exception as e:
+                    logger.warning(f"[{display_name}] RAGAS Evaluation failed: {e}")
+
+            total_time = time.time() - start_time
+            logger.success(f"[{display_name}] Candidate evaluation finished in {total_time:.2f}s")
+            return display_name, params, metrics, chunks
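
Taken together, the signatures above suggest the following end-to-end usage. This is a minimal sketch based only on the code in this diff, not documented API: the import path, the "./docs" corpus path, and the chosen candidate names are illustrative, and it assumes the package's default embedding provider is available in the environment.

from autochunk.autochunker import AutoChunker

chunker = AutoChunker(mode="light", enable_dedup=True)

# Search for the best chunking strategy over a folder of documents.
plan, report = chunker.optimize(
    "./docs",  # or a pre-loaded list of {"id": ..., "text": ...} dicts
    candidate_names=["fixed_length", "recursive_character", "semantic_local"],
    sweep_params={"chunk_sizes": [256, 512], "overlap_ratios": [0.1]},
    on_progress=lambda msg, step: print(f"[step {step}] {msg}"),
)

# Apply the winning generator to produce the final chunk set.
selected = report["selected"]
chunks = chunker.apply_with_generator("./docs", selected["name"], selected["params"])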