autochunks-0.0.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autochunk/__init__.py +9 -0
- autochunk/__main__.py +5 -0
- autochunk/adapters/__init__.py +3 -0
- autochunk/adapters/haystack.py +68 -0
- autochunk/adapters/langchain.py +81 -0
- autochunk/adapters/llamaindex.py +94 -0
- autochunk/autochunker.py +606 -0
- autochunk/chunkers/__init__.py +100 -0
- autochunk/chunkers/agentic.py +184 -0
- autochunk/chunkers/base.py +16 -0
- autochunk/chunkers/contextual_retrieval.py +151 -0
- autochunk/chunkers/fixed_length.py +110 -0
- autochunk/chunkers/html_section.py +225 -0
- autochunk/chunkers/hybrid_semantic_stat.py +199 -0
- autochunk/chunkers/layout_aware.py +192 -0
- autochunk/chunkers/parent_child.py +172 -0
- autochunk/chunkers/proposition.py +175 -0
- autochunk/chunkers/python_ast.py +248 -0
- autochunk/chunkers/recursive_character.py +215 -0
- autochunk/chunkers/semantic_local.py +140 -0
- autochunk/chunkers/sentence_aware.py +102 -0
- autochunk/cli.py +135 -0
- autochunk/config.py +76 -0
- autochunk/embedding/__init__.py +22 -0
- autochunk/embedding/adapter.py +14 -0
- autochunk/embedding/base.py +33 -0
- autochunk/embedding/hashing.py +42 -0
- autochunk/embedding/local.py +154 -0
- autochunk/embedding/ollama.py +66 -0
- autochunk/embedding/openai.py +62 -0
- autochunk/embedding/tokenizer.py +9 -0
- autochunk/enrichment/__init__.py +0 -0
- autochunk/enrichment/contextual.py +29 -0
- autochunk/eval/__init__.py +0 -0
- autochunk/eval/harness.py +177 -0
- autochunk/eval/metrics.py +27 -0
- autochunk/eval/ragas_eval.py +234 -0
- autochunk/eval/synthetic.py +104 -0
- autochunk/quality/__init__.py +31 -0
- autochunk/quality/deduplicator.py +326 -0
- autochunk/quality/overlap_optimizer.py +402 -0
- autochunk/quality/post_processor.py +245 -0
- autochunk/quality/scorer.py +459 -0
- autochunk/retrieval/__init__.py +0 -0
- autochunk/retrieval/in_memory.py +47 -0
- autochunk/retrieval/parent_child.py +4 -0
- autochunk/storage/__init__.py +0 -0
- autochunk/storage/cache.py +34 -0
- autochunk/storage/plan.py +40 -0
- autochunk/utils/__init__.py +0 -0
- autochunk/utils/hashing.py +8 -0
- autochunk/utils/io.py +176 -0
- autochunk/utils/logger.py +64 -0
- autochunk/utils/telemetry.py +44 -0
- autochunk/utils/text.py +199 -0
- autochunks-0.0.8.dist-info/METADATA +133 -0
- autochunks-0.0.8.dist-info/RECORD +61 -0
- autochunks-0.0.8.dist-info/WHEEL +5 -0
- autochunks-0.0.8.dist-info/entry_points.txt +2 -0
- autochunks-0.0.8.dist-info/licenses/LICENSE +15 -0
- autochunks-0.0.8.dist-info/top_level.txt +1 -0
autochunk/autochunker.py
ADDED
@@ -0,0 +1,606 @@

from __future__ import annotations
import time, json, os, random, math
import concurrent.futures
from opentelemetry import context
from typing import List, Dict, Any, Tuple, Optional, Callable
from .config import AutoChunkConfig
from .storage.cache import Cache
from .storage.plan import Plan
from .utils.hashing import content_hash
from .utils.io import load_documents
from .utils.logger import logger, current_job_id
from .utils.telemetry import init_telemetry, get_tracer
from .chunkers.fixed_length import FixedLengthChunker
from .chunkers.sentence_aware import SentenceAwareChunker
from .chunkers.recursive_character import RecursiveCharacterChunker
from .chunkers.semantic_local import SemanticLocalChunker
from .chunkers.parent_child import ParentChildChunker
from .chunkers.layout_aware import LayoutAwareChunker
from .chunkers.hybrid_semantic_stat import HybridSemanticStatChunker
from .chunkers.html_section import HTMLSectionChunker
from .chunkers.bridges.langchain.recursive import LangChainRecursiveBridge
from .chunkers.bridges.langchain.character import LangChainCharacterBridge
from .chunkers.bridges.langchain.markdown import LangChainMarkdownBridge
from .chunkers.bridges.langchain.token import LangChainTokenBridge
from .chunkers.bridges.langchain.python import LangChainPythonBridge
from .chunkers.bridges.langchain.html import LangChainHTMLBridge
from .chunkers.bridges.langchain.json import LangChainJSONBridge
from .embedding.hashing import HashingEmbedding
from .eval.harness import EvalHarness
from .eval.ragas_eval import RagasEvaluator
# Quality Layer - Post-Processing Pipeline
from .quality.post_processor import apply_post_processing, NATIVE_CHUNKERS

tracer = get_tracer(__name__)

GENERATOR_REGISTRY = {
    "fixed_length": FixedLengthChunker(),
    "sentence_aware": SentenceAwareChunker(),
    "recursive_character": RecursiveCharacterChunker(),
    "semantic_local": SemanticLocalChunker(),
    "parent_child": ParentChildChunker(),
    "layout_aware": LayoutAwareChunker(),
    "hybrid_semantic_stat": HybridSemanticStatChunker(),
    "html_section": HTMLSectionChunker(),
    # Framework Bridges
    "langchain_recursive": LangChainRecursiveBridge(),
    "langchain_character": LangChainCharacterBridge(),
    "langchain_markdown": LangChainMarkdownBridge(),
    "langchain_token": LangChainTokenBridge(),
    "langchain_python": LangChainPythonBridge(),
    "langchain_html": LangChainHTMLBridge(),
    "langchain_json": LangChainJSONBridge(),
}

class AutoChunker:
    def __init__(self, eval_config=None, retrieval_strategy=None, proxy_config=None, ragas_config=None, mode="light",
                 cache_dir=".ac_cache", metadata_enrichment=None,
                 embedding_provider=None, embedding_model_or_path=None, embedding_api_key=None, network_config=None,
                 telemetry_enabled: bool = False,
                 # Post-Processing Options (AutoChunks native chunkers only)
                 enable_dedup: bool = False,
                 enable_overlap_opt: bool = False,
                 dedup_threshold: float = 0.98,
                 overlap_tokens: int = 50):
        self.cfg = AutoChunkConfig()
        if eval_config: self.cfg.eval_config = eval_config
        if retrieval_strategy: self.cfg.retrieval_strategy = retrieval_strategy
        if proxy_config: self.cfg.proxy_config = proxy_config
        if ragas_config: self.cfg.ragas_config = ragas_config
        if embedding_provider: self.cfg.embedding_provider = embedding_provider
        if embedding_model_or_path: self.cfg.embedding_model_or_path = embedding_model_or_path
        if embedding_api_key: self.cfg.embedding_api_key = embedding_api_key
        if network_config: self.cfg.network = network_config
        self.cfg.telemetry_enabled = telemetry_enabled
        self.cfg.mode = mode
        self.cfg.cache_dir = cache_dir
        if metadata_enrichment: self.cfg.metadata_enrichment = metadata_enrichment

        # Post-processing config
        self.enable_dedup = enable_dedup
        self.enable_overlap_opt = enable_overlap_opt
        self.dedup_threshold = dedup_threshold
        self.overlap_tokens = overlap_tokens

        init_telemetry(enabled=self.cfg.telemetry_enabled)
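For orientation, here is a minimal, hypothetical usage sketch of the constructor above together with the `optimize()` method defined below. The corpus path, candidate names, and sweep values are illustrative; everything else falls back to `AutoChunkConfig` defaults.

```python
# Hypothetical usage sketch; "./docs" and the candidate/sweep values are illustrative.
from autochunk.autochunker import AutoChunker

chunker = AutoChunker(
    mode="light",            # proxy-sample aggressively for speed
    cache_dir=".ac_cache",   # reuse cached synthetic QA across runs
    enable_dedup=True,       # post-processing applies to native chunkers only
    dedup_threshold=0.98,
)

plan, report = chunker.optimize(
    "./docs",                # a directory path, or a List[Dict] of pre-loaded documents
    candidate_names=["fixed_length", "recursive_character"],
    sweep_params={"chunk_sizes": [256, 512], "overlap_ratios": [0.1]},
    on_progress=lambda msg, step: print(f"[step {step}] {msg}"),
)
print(report["selected"]["name"], report["selected"]["metrics"])
```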
    def optimize(self, documents: List[Dict]|str, embedding_fn=None, retriever="in_memory",
                 framework="langchain", golden_qa=None, candidate_names: Optional[List[str]] = None,
                 sweep_params: Optional[Dict[str, List]] = None,
                 on_progress: Optional[Callable[[str, int], None]] = None,
                 on_result: Optional[Callable[[Dict], None]] = None):

        if on_progress: on_progress(f"Scanning documents in {documents if isinstance(documents, str) else 'memory'}...", step=1)

        # Load docs if path provided
        if isinstance(documents, str):
            # Check if any selected candidate needs high-fidelity (Markdown) extraction
            high_fidelity = any(c in (candidate_names or []) for c in ["layout_aware", "hybrid_semantic_stat"])
            if not candidate_names: high_fidelity = True  # Default to best quality if none specified

            docs = load_documents(
                documents,
                on_progress=lambda m: on_progress(m, 1) if on_progress else None,
                high_fidelity=high_fidelity
            )
        else:
            docs = documents

        # 1. Compute Document Hashes
        if on_progress: on_progress("Computing document fingerprints (hashing)...", step=1)
        for d in docs:
            if "hash" not in d:
                d["hash"] = content_hash(d["text"])
        logger.info(f"Hashed {len(docs)} documents.")

        corpus_hash = content_hash("".join(sorted(d["hash"] for d in docs)))
        logger.info(f"Corpus Fingerprint: {corpus_hash[:12]}...")

        with tracer.start_as_current_span("optimize") as span:
            span.set_attribute("corpus_hash", corpus_hash)
            span.set_attribute("mode", self.cfg.mode)

            # Embedding setup (Pluggable Architecture)
            if embedding_fn is None:
                from .embedding import get_encoder
                if on_progress: on_progress(f"Initializing {self.cfg.embedding_provider} encoder...", step=1)
                encoder = get_encoder(
                    provider=self.cfg.embedding_provider,
                    model_name=self.cfg.embedding_model_or_path,
                    api_key=self.cfg.embedding_api_key,
                    cache_folder=self.cfg.network.local_models_path,
                    trusted_orgs=self.cfg.network.trusted_orgs
                )
                logger.info(f"Initialized {self.cfg.embedding_provider} encoder: {encoder.model_name}")
                if on_progress: on_progress(f"Encoder Ready: {self.cfg.embedding_provider}", step=1)
                embedding_fn = encoder.embed_batch
                span.set_attribute("embedding.provider", self.cfg.embedding_provider)
                span.set_attribute("embedding.model", self.cfg.embedding_model_or_path)

            harness = EvalHarness(embedding_fn, k=self.cfg.eval_config.k)

            # 2. Representative Proxy Sampling (Optimized for Scale)
            docs_sorted = sorted(docs, key=lambda x: x["id"])
            proxy_docs = docs_sorted
            if self.cfg.proxy_config.enabled or len(docs_sorted) > 10:
                n_samples = max(2, int(len(docs_sorted) * (self.cfg.proxy_config.proxy_percent / 100.0)))
                if self.cfg.mode == "light":
                    n_samples = min(n_samples, 5)

                random.seed(42)  # Deterministic sampling
                proxy_docs = random.sample(docs_sorted, min(len(docs_sorted), n_samples))
                logger.info(f"Proxy Strategy: Optimizing on {len(proxy_docs)} representative docs (Total: {len(docs_sorted)})")

            with tracer.start_as_current_span("build_synthetic_qa"):
                cache = Cache(self.cfg.cache_dir)
                qa = []

                if golden_qa:
                    qa = golden_qa
                    logger.info(f"Using provided golden QA ({len(qa)} pairs)")
                else:
                    docs_needing_qa = []
                    for d in proxy_docs:
                        doc_qa_key = f"qa_doc_{d['hash']}"
                        cached_doc_qa = cache.get_json(doc_qa_key)
                        if cached_doc_qa:
                            # Re-bind doc_id to the current session's path to ensure cache portability
                            for q in cached_doc_qa:
                                q["doc_id"] = d["id"]
                            qa.extend(cached_doc_qa)
                        else:
                            docs_needing_qa.append(d)

                    if docs_needing_qa:
                        if on_progress: on_progress(f"Generating synthetic QA for {len(docs_needing_qa)} files...", step=2)
                        for i, d in enumerate(docs_needing_qa):
                            if on_progress: on_progress(f"Analyzing document {i+1}/{len(docs_needing_qa)} [{os.path.basename(d['id'])}]...", step=2)
                            doc_qa = harness.build_synthetic_qa([d], lambda m: on_progress(m, 2) if on_progress else None)
                            doc_qa_key = f"qa_doc_{d['hash']}"
                            cache.set_json(doc_qa_key, doc_qa)
                            qa.extend(doc_qa)

                        logger.info(f"Generated QA for {len(docs_needing_qa)} documents. Total QA pool: {len(qa)}")
                        if on_progress: on_progress(f"QA Generation complete ({len(qa)} pairs total)", step=2)
                    else:
                        logger.info(f"Cache Hit: Reusing {len(qa)} QA pairs for sampled docs")
                        if on_progress: on_progress(f"Reusing {len(qa)} cached QA pairs", step=2)

            span.set_attribute("num_qa_pairs", len(qa))

            # Candidate grid
            all_candidates = [
                ("fixed_length", {"base_token_size": 512, "overlap": 64}),
                ("recursive_character", {"base_token_size": 512}),
                ("sentence_aware", {"base_token_size": 512}),
                ("semantic_local", {"threshold_percentile": 0.8}),
                ("hybrid_semantic_stat", {"alpha": 0.7, "beta": 0.3}),
                ("parent_child", {"parent_size": 1000, "child_size": 200}),
            ]

            # Layout-aware candidates for complex formats
            has_rich_docs = any(d.get("ext") in [".pdf", ".md", ".html", ".htm"] for d in proxy_docs)
            if has_rich_docs:
                all_candidates.append(("layout_aware", {"base_token_size": 512}))

            # HTML Section Chunker (Native)
            has_html = any(d.get("ext") in [".html", ".htm"] for d in proxy_docs)
            if has_html:
                all_candidates.append(("html_section", {"base_token_size": 512}))

            # Framework bridge candidates
            all_candidates.extend([
                ("langchain_recursive", {"base_token_size": 512, "overlap": 64}),
                ("langchain_character", {"base_token_size": 512, "overlap": 64}),
                ("langchain_markdown", {"base_token_size": 512, "overlap": 64}),
                ("langchain_token", {"base_token_size": 512, "overlap": 64}),
                ("langchain_python", {"base_token_size": 512, "overlap": 64}),
                ("langchain_html", {"base_token_size": 512, "overlap": 64}),
                ("langchain_json", {"base_token_size": 512, "overlap": 64}),
            ])

            if candidate_names:
                base_candidates = [c for c in all_candidates if c[0] in candidate_names]
            else:
                base_candidates = all_candidates

            if not base_candidates:
                base_candidates = all_candidates[:3]

            # --- Hyperparameter Sweep Expansion ---
            candidates = []
            if sweep_params and (sweep_params.get("chunk_sizes") or sweep_params.get("overlap_ratios")):
                logger.info(f"Applying Hyperparameter Sweep: {sweep_params}")

                sizes = sweep_params.get("chunk_sizes", [512])
                ratios = sweep_params.get("overlap_ratios", [0.125])

                for name, default_params in base_candidates:
                    # Check if this candidate supports sizing
                    is_sizable = "base_token_size" in default_params or name.startswith("langchain_") or name in ["fixed_length", "recursive_character", "sentence_aware", "layout_aware", "html_section"]

                    if is_sizable and name != "semantic_local":
                        for s in sizes:
                            for r in ratios:
                                p = default_params.copy()
                                p["base_token_size"] = s
                                # Calculate overlap
                                overlap = int(s * r)

                                # Apply to params if appropriate for the chunker
                                supported_overlap = [
                                    "fixed_length", "recursive_character", "sentence_aware",
                                    "layout_aware", "html_section", "langchain_recursive",
                                    "langchain_token", "langchain_character", "langchain_markdown",
                                    "langchain_python", "langchain_html", "langchain_json"
                                ]
                                if "overlap" in p or name in supported_overlap:
                                    p["overlap"] = overlap
                                elif name == "parent_child":
                                    # parent_child has fixed parent/child sizes; skipping sweep for now
                                    continue

                                # Use a more descriptive name for the variants
                                variant_name = f"{name} ({s}|{int(r*100)}%)" if len(sizes) > 1 or len(ratios) > 1 else name
                                # We pass (base_name, display_name, params)
                                candidates.append((name, variant_name, p))
                    else:
                        candidates.append((name, name, default_params))
            else:
                # base_candidates are (name, params) tuples; normalize to (base_name, display_name, params)
                candidates = [(c[0], c[0], c[1]) for c in base_candidates]
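To make the sweep expansion concrete, the standalone sketch below reproduces the size × overlap-ratio cross-product for a single sizable candidate. The candidate name and sweep values are illustrative, not defaults of the package.

```python
# Illustrative reproduction of the sweep expansion for one candidate.
sizes = [256, 512]
ratios = [0.1, 0.25]
variants = []
for s in sizes:
    for r in ratios:
        params = {"base_token_size": s, "overlap": int(s * r)}
        display = f"fixed_length ({s}|{int(r * 100)}%)"
        variants.append(("fixed_length", display, params))

# -> ('fixed_length', 'fixed_length (256|10%)', {'base_token_size': 256, 'overlap': 25})
#    ('fixed_length', 'fixed_length (512|25%)', {'base_token_size': 512, 'overlap': 128}) ...
```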
            if on_progress: on_progress(f"Starting parallel evaluation of {len(candidates)} candidates...", step=3)
            best = None
            best_metrics = None
            best_score = -1.0
            reports = []

            logger.info(f"Starting Parallel Optimization across {len(candidates)} candidates...")

            parent_ctx = context.get_current()
            job_id_ctx = current_job_id.get()
            # Safety: Cap workers to 4 to prevent GIL deadlock; reduce further for local models to prevent resource contention
            safe_workers = min(4, self.cfg.parallel.embedding_concurrency or 4)
            if self.cfg.embedding_provider == "local":
                safe_workers = min(safe_workers, 2)
                logger.info(f"Using reduced concurrency ({safe_workers} workers) for local embeddings to optimize resource usage.")

            with concurrent.futures.ThreadPoolExecutor(max_workers=safe_workers) as executor:
                futures = {
                    executor.submit(self._eval_candidate, cand, proxy_docs, qa, embedding_fn, retriever, parent_ctx, job_id_ctx): cand
                    for cand in candidates
                }

                for future in concurrent.futures.as_completed(futures):
                    # cand_info is (base_name, display_name, params)
                    cand_info = futures[future]
                    try:
                        logger.info(f"Checking result for candidate: {cand_info[1]}...")
                        # Safety: Add 5-minute timeout per candidate
                        name, params, metrics, chunks = future.result(timeout=300)
                        logger.info(f"Retrieved result for {name}. Processing...")
                        # Calculate the multi-objective score
                        current_score = self._calculate_score(metrics, self.cfg.eval_config.objective)
                        metrics["objective_score"] = current_score

                        # Create result object
                        result_entry = {
                            "name": name,
                            "params": params,
                            "metrics": metrics,
                            "score": current_score,
                            "is_partial": True
                        }

                        # Streaming Callback
                        if on_result:
                            on_result(result_entry)

                        # Include a small sample of chunks for Visual Fidelity Inspector
                        from .utils.text import count_tokens
                        chunk_samples = []
                        for c in chunks[:3]:
                            chunk_samples.append({
                                "text": c["text"],
                                "meta": c["meta"],
                                "tokens": count_tokens(c["text"])
                            })

                        report_entry = {
                            "name": name,
                            "params": params,
                            "metrics": metrics,
                            "chunk_samples": chunk_samples
                        }

                        logger.info(f"Evaluated {name}: Score {current_score:.4f}")

                        # Fix for 0.0000 logs: Ensure we use the same key mapping as _calculate_score
                        # Config defines e.g. 'mrr@10', but harness returns 'mrr@k'
                        primary_display_key = self.cfg.eval_config.metrics[0]
                        key_map = {"ndcg@10": "ndcg@k", "mrr@10": "mrr@k", "recall@50": "recall@k"}
                        target_key = key_map.get(primary_display_key, primary_display_key)

                        metric_val = metrics.get(target_key, 0)

                        # Diagnostic logging if metric is unexpectedly zero
                        if metric_val == 0 and current_score > 0:
                            logger.debug(f"Metric lookup for {primary_display_key} -> {target_key} returned 0. Available keys: {list(metrics.keys())}")

                        if on_progress: on_progress(f"Evaluated {name}: {metric_val:.4f}", step=3)

                        if (best is None) or (current_score > best_score):
                            best = (name, params)  # Chunks are not stored in 'best' anymore
                            best_metrics = metrics
                            best_score = current_score
                            logger.success(f"New Leader: {name} (Score: {best_score:.4f}, {list(metrics.keys())[0]}: {list(metrics.values())[0]:.4f})")

                        reports.append(report_entry)

                    except Exception as e:
                        logger.error(f"Candidate evaluation failed: {e}")
                        import traceback
                        traceback.print_exc()
                        if on_progress: on_progress("Candidate failed", step=3)

            if not best:
                raise RuntimeError("Optimization failed: No valid candidates were successfully evaluated.")

            name, params = best  # Chunks are not part of 'best' anymore
            # Re-run the best candidate to get its chunks for the final plan, if needed.
            # Or, if chunks are only for reporting, we can use the samples from reports.
            # For now, assuming chunks are not needed for the final plan object directly.
            plan = Plan(
                id=content_hash(corpus_hash + name + json.dumps(params)),
                corpus_hash=corpus_hash,
                generator_pipeline={"name": name, "params": params},
                metrics=best_metrics,
                embedding={"name": self.cfg.embedding_provider, "model": self.cfg.embedding_model_or_path}
            )
            report = {"candidates": reports, "selected": {"name": name, "params": params, "metrics": best_metrics}}
            return plan, report
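The parallel evaluation above captures the current OpenTelemetry context (`context.get_current()`) and job id in the parent thread, and `_eval_candidate` (defined further below) re-attaches them so candidate spans nest under the parent `optimize` span. A minimal standalone sketch of that propagation pattern; the span names and the squaring "work" are placeholders:

```python
# Minimal sketch of propagating OpenTelemetry context into worker threads.
import concurrent.futures
from opentelemetry import context, trace

tracer = trace.get_tracer(__name__)

def worker(parent_ctx, n):
    token = context.attach(parent_ctx)      # re-attach the parent's context in this thread
    try:
        with tracer.start_as_current_span(f"candidate.{n}"):
            return n * n                     # placeholder for real evaluation work
    finally:
        context.detach(token)

with tracer.start_as_current_span("optimize"):
    parent_ctx = context.get_current()
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
        results = list(pool.map(lambda n: worker(parent_ctx, n), range(3)))
```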
    def apply_with_generator(self, documents: str|List[Dict], gen_name: str, params: Dict) -> List[Dict]:
        if isinstance(documents, str):
            docs = load_documents(documents)
        else:
            docs = documents
        gen = GENERATOR_REGISTRY[gen_name]
        all_chunks = []
        p = params.copy()
        if gen_name in ["semantic_local", "hybrid_semantic_stat"] and "embedding_fn" not in p:
            from .embedding import get_encoder
            encoder = get_encoder(provider=self.cfg.embedding_provider, model_name=self.cfg.embedding_model_or_path, cache_folder=self.cfg.network.local_models_path, trusted_orgs=self.cfg.network.trusted_orgs)
            p["embedding_fn"] = encoder.embed_batch

        # Determine if this is a bridge chunker
        is_bridge = gen_name.startswith("langchain_")

        for d in docs:
            p["local_models_path"] = self.cfg.network.local_models_path
            doc_meta = d.get("metadata", {})

            # Use raw_text for bridges, processed text for native chunkers
            if is_bridge:
                doc_text = d.get("raw_text", d["text"])
            else:
                doc_text = d["text"]

            try:
                for ch in gen.chunk(d["id"], doc_text, **p):
                    # Combine doc metadata with chunk-specific metadata
                    combined_meta = {**doc_meta, **ch.meta}
                    all_chunks.append({
                        "id": ch.id,
                        "doc_id": d["id"],
                        "text": ch.text,
                        "meta": combined_meta
                    })
            except Exception as e:
                logger.warning(f"Chunker {gen_name} failed on doc {d['id']}: {e}")
        return all_chunks
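Once `optimize()` has selected a plan, the same generator can be re-applied to the full corpus with `apply_with_generator()` above. A hypothetical continuation of the earlier sketch; the `./docs` path is illustrative, and attribute access on `plan` assumes `Plan` exposes its constructor fields as attributes:

```python
# Hypothetical continuation: apply the selected generator to the full corpus.
plan, report = chunker.optimize("./docs")

# Assumes Plan exposes generator_pipeline as an attribute, matching its constructor kwargs.
chunks = chunker.apply_with_generator(
    "./docs",
    gen_name=plan.generator_pipeline["name"],
    params=plan.generator_pipeline["params"],
)
print(f"{len(chunks)} chunks; first chunk metadata: {chunks[0]['meta']}")
```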
    def _calculate_score(self, metrics: Dict[str, Any], objective: str) -> float:
        """
        Weighted Scorer based on the target objective.
        Weights:
        - Quality: nDCG@k (Retrieval Precision)
        - Coverage: Percentage of queries with a perfect match
        - Efficiency: Penalty for excessive chunk counts
        """
        # Determine the anchor quality metric based on user selection
        primary_key = self.cfg.eval_config.metrics[0]
        key_map = {"ndcg@10": "ndcg@k", "mrr@10": "mrr@k", "recall@50": "recall@k"}
        target_key = key_map.get(primary_key, "ndcg@k")

        q = metrics.get(target_key, 0)
        c = metrics.get("coverage", 0)
        m = metrics.get("mrr@k", 0)
        count = metrics.get("count", 1)

        # Logarithmic penalty for chunk count
        cost_penalty = 0.05 * math.log10(max(1, count))

        if objective == "quality":
            return q * 0.9 + m * 0.1
        elif objective == "cost":
            # Cost Optimized: Heavy penalty on count, but quality still matters
            # Using inverse log scale so massive counts drop score towards 0
            efficiency_score = 1.0 / (1.0 + 0.2 * math.log10(max(1, count)))
            return q * 0.4 + efficiency_score * 0.6
        elif objective == "latency":
            # Latency Focus: MRR is king (finds answer fast at rank 1)
            return m * 0.8 + q * 0.2
        else:  # "balanced"
            # Balanced: Quality (nDCG) + Coverage (Reliability) - mild cost penalty
            # Adjusted to be less punishing for large-but-good indices
            base_score = (q * 0.6) + (m * 0.2) + (c * 0.2)

            # Capped cost penalty: Max deduction is 15% for huge indices
            # log10(1000) = 3 -> 0.09 penalty
            # log10(10000) = 4 -> 0.12 penalty
            normalized_cost_penalty = 0.03 * math.log10(max(1, count))

            return max(0.0, base_score - normalized_cost_penalty)
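As a worked example of the "balanced" branch above, take the hypothetical metrics nDCG@k = 0.70, MRR@k = 0.60, coverage = 0.80 and 1,000 chunks:

```python
import math

# Hypothetical metric values for one candidate.
q, m, c, count = 0.70, 0.60, 0.80, 1000

base_score = (q * 0.6) + (m * 0.2) + (c * 0.2)   # 0.42 + 0.12 + 0.16 = 0.70
penalty = 0.03 * math.log10(max(1, count))        # 0.03 * 3 = 0.09
balanced = max(0.0, base_score - penalty)         # 0.61
```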
    def _eval_candidate(self, cand: Tuple[str, str, Dict], docs: List[Dict], qa: List[Dict], embedding_fn: Any, retriever: str, otel_context=None, job_id_ctx=None) -> Tuple[str, Dict, Dict, List]:
        if otel_context:
            context.attach(otel_context)
        if job_id_ctx:
            current_job_id.set(job_id_ctx)
        base_name, display_name, params = cand
        with tracer.start_as_current_span(f"candidate.{display_name}") as cspan:
            cspan.set_attribute("params", json.dumps(params))
            gen = GENERATOR_REGISTRY[base_name]
            p = params.copy()
            p["local_models_path"] = self.cfg.network.local_models_path
            if base_name in ["semantic_local", "hybrid_semantic_stat"]:
                p["embedding_fn"] = embedding_fn
            harness = EvalHarness(embedding_fn, k=self.cfg.eval_config.k)

            # ═══════════════════════════════════════════════════════════════════
            # FAIR EVALUATION: Use different text versions for different chunkers
            # - Native AutoChunks: gets processed/optimized text (our value-add)
            # - LangChain Bridges: gets raw text (fair comparison, no preprocessing)
            # ═══════════════════════════════════════════════════════════════════
            is_native = base_name in NATIVE_CHUNKERS
            is_bridge = base_name.startswith("langchain_")

            if is_bridge:
                logger.info(f"[{display_name}] Starting chunking (RAW text mode)...")
                # DIAGNOSTIC: Log exact parameters passed to bridge
                logger.debug(f"[{display_name}] Parameters: size={params.get('base_token_size')}, overlap={params.get('overlap')}")
            else:
                logger.info(f"[{display_name}] Starting chunking (Processed text mode)...")

            start_time = time.time()
            chunks = []
            for i, d in enumerate(docs):
                try:
                    # Use raw_text for bridges, processed text for native chunkers
                    if is_bridge:
                        # Bridges get raw text - no preprocessing advantage
                        doc_text = d.get("raw_text", d["text"])
                    else:
                        # Native chunkers get processed text (our value-add)
                        doc_text = d["text"]

                    logger.debug(f"[{display_name}] Chunking document {i+1}/{len(docs)}...")
                    for ch in gen.chunk(d["id"], doc_text, **p):
                        chunks.append({"id": ch.id, "doc_id": d["id"], "text": ch.text, "meta": ch.meta})
                except Exception as e:
                    logger.warning(f"[{display_name}] Chunker failed on doc {d['id']}: {e}")

            chunking_time = time.time() - start_time
            logger.info(f"[{display_name}] Chunking complete: {len(chunks)} chunks in {chunking_time:.2f}s")

            if not chunks:
                logger.error(f"[{display_name}] returned zero chunks.")
                raise ValueError(f"Chunker {display_name} returned zero chunks.")

            # ═══════════════════════════════════════════════════════════════════
            # POST-PROCESSING & QUALITY SCORING
            # ═══════════════════════════════════════════════════════════════════
            quality_metrics = {}

            try:
                logger.info(f"[{display_name}] Starting post-processing...")
                pp_start = time.time()
                # Always call post-processing. The processor internally checks 'chunker_name'
                # and skips modifications for non-native chunkers, but still returns quality scores.
                processed_chunks, quality_metrics = apply_post_processing(
                    chunks=chunks,
                    chunker_name=base_name,  # Use base_name for internal logic
                    embedding_fn=embedding_fn,
                    enable_dedup=self.enable_dedup,
                    enable_overlap=self.enable_overlap_opt,
                    dedup_threshold=self.dedup_threshold,
                    overlap_tokens=self.overlap_tokens
                )

                pp_time = time.time() - pp_start
                if quality_metrics.get("dedup_removed", 0) > 0:
                    logger.info(f"[{display_name}] Post-processing: Removed {quality_metrics['dedup_removed']} duplicate chunks in {pp_time:.2f}s")
                else:
                    logger.info(f"[{display_name}] Post-processing complete in {pp_time:.2f}s")

                chunks = processed_chunks
            except Exception as e:
                logger.warning(f"[{display_name}] Post-processing/Scoring failed: {e}")

            # Evaluate with standard metrics (nDCG, MRR, etc.)
            logger.info(f"[{display_name}] Starting evaluation against {len(qa)} queries...")
            eval_start = time.time()
            metrics = harness.evaluate(chunks, qa)
            eval_time = time.time() - eval_start
            logger.info(f"[{display_name}] Evaluation complete in {eval_time:.2f}s")

            # Add explicit Chunk Count (Critical for UI)
            metrics["count"] = len(chunks)

            # Add quality metrics (fair: calculated for both native and bridge)
            if quality_metrics:
                metrics["avg_quality_score"] = quality_metrics.get("avg_quality_score", 0)
                metrics["post_processed"] = quality_metrics.get("post_processing_applied", False)
                metrics["dedup_removed"] = quality_metrics.get("dedup_removed", 0)
                if "quality_dimensions" in quality_metrics:
                    metrics["quality_coherence"] = quality_metrics["quality_dimensions"].get("coherence", 0)
                    metrics["quality_completeness"] = quality_metrics["quality_dimensions"].get("completeness", 0)
                    metrics["quality_density"] = quality_metrics["quality_dimensions"].get("density", 0)

            for k, v in metrics.items():
                if isinstance(v, (int, float, str, bool)):
                    cspan.set_attribute(f"metrics.{k}", v)

            # RAGAS Evaluation (Optional, Plug-and-Play)
            if self.cfg.ragas_config.enabled:
                logger.info(f"[{display_name}] Starting RAGAS evaluation...")
                try:
                    ragas_eval = RagasEvaluator(self.cfg.ragas_config)
                    # Note: RagasEvaluator historically expected 'retrieved_ids' on each QA item.
                    # To keep candidate evaluation decoupled from the harness, retrieval is
                    # delegated to RagasEvaluator itself by passing embedding_fn to run().
                    ragas_metrics = ragas_eval.run(chunks, qa, embedding_fn=embedding_fn)
                    if ragas_metrics:
                        metrics.update(ragas_metrics)
                        logger.success(f"[{display_name}] RAGAS Metrics: {ragas_metrics}")
                        for k, v in ragas_metrics.items():
                            cspan.set_attribute(f"metrics.ragas.{k}", v)

                except Exception as e:
                    logger.warning(f"[{display_name}] RAGAS Evaluation failed: {e}")

            total_time = time.time() - start_time
            logger.success(f"[{display_name}] Candidate evaluation finished in {total_time:.2f}s")
            return display_name, params, metrics, chunks
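Every generator registered in `GENERATOR_REGISTRY` is invoked through the same implicit contract used by `_eval_candidate` and `apply_with_generator` above: a `chunk(doc_id, text, **params)` method yielding objects with `id`, `text`, and `meta` attributes. The sketch below shows a toy chunker satisfying that contract; the `Chunk` dataclass and class name are assumptions for illustration, not the package's actual base classes.

```python
from dataclasses import dataclass, field
from typing import Dict, Iterator

@dataclass
class Chunk:  # assumed shape: AutoChunker only reads id, text, and meta
    id: str
    text: str
    meta: Dict = field(default_factory=dict)

class WhitespaceChunker:
    """Toy chunker exposing the chunk(doc_id, text, **params) contract."""

    def chunk(self, doc_id: str, text: str, base_token_size: int = 512, **params) -> Iterator[Chunk]:
        # **params absorbs extra keys such as 'overlap' or 'local_models_path'.
        words = text.split()
        for i in range(0, len(words), base_token_size):
            piece = " ".join(words[i:i + base_token_size])
            yield Chunk(id=f"{doc_id}:{i}", text=piece, meta={"start_word": i})
```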