autochunks-0.0.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autochunk/__init__.py +9 -0
- autochunk/__main__.py +5 -0
- autochunk/adapters/__init__.py +3 -0
- autochunk/adapters/haystack.py +68 -0
- autochunk/adapters/langchain.py +81 -0
- autochunk/adapters/llamaindex.py +94 -0
- autochunk/autochunker.py +606 -0
- autochunk/chunkers/__init__.py +100 -0
- autochunk/chunkers/agentic.py +184 -0
- autochunk/chunkers/base.py +16 -0
- autochunk/chunkers/contextual_retrieval.py +151 -0
- autochunk/chunkers/fixed_length.py +110 -0
- autochunk/chunkers/html_section.py +225 -0
- autochunk/chunkers/hybrid_semantic_stat.py +199 -0
- autochunk/chunkers/layout_aware.py +192 -0
- autochunk/chunkers/parent_child.py +172 -0
- autochunk/chunkers/proposition.py +175 -0
- autochunk/chunkers/python_ast.py +248 -0
- autochunk/chunkers/recursive_character.py +215 -0
- autochunk/chunkers/semantic_local.py +140 -0
- autochunk/chunkers/sentence_aware.py +102 -0
- autochunk/cli.py +135 -0
- autochunk/config.py +76 -0
- autochunk/embedding/__init__.py +22 -0
- autochunk/embedding/adapter.py +14 -0
- autochunk/embedding/base.py +33 -0
- autochunk/embedding/hashing.py +42 -0
- autochunk/embedding/local.py +154 -0
- autochunk/embedding/ollama.py +66 -0
- autochunk/embedding/openai.py +62 -0
- autochunk/embedding/tokenizer.py +9 -0
- autochunk/enrichment/__init__.py +0 -0
- autochunk/enrichment/contextual.py +29 -0
- autochunk/eval/__init__.py +0 -0
- autochunk/eval/harness.py +177 -0
- autochunk/eval/metrics.py +27 -0
- autochunk/eval/ragas_eval.py +234 -0
- autochunk/eval/synthetic.py +104 -0
- autochunk/quality/__init__.py +31 -0
- autochunk/quality/deduplicator.py +326 -0
- autochunk/quality/overlap_optimizer.py +402 -0
- autochunk/quality/post_processor.py +245 -0
- autochunk/quality/scorer.py +459 -0
- autochunk/retrieval/__init__.py +0 -0
- autochunk/retrieval/in_memory.py +47 -0
- autochunk/retrieval/parent_child.py +4 -0
- autochunk/storage/__init__.py +0 -0
- autochunk/storage/cache.py +34 -0
- autochunk/storage/plan.py +40 -0
- autochunk/utils/__init__.py +0 -0
- autochunk/utils/hashing.py +8 -0
- autochunk/utils/io.py +176 -0
- autochunk/utils/logger.py +64 -0
- autochunk/utils/telemetry.py +44 -0
- autochunk/utils/text.py +199 -0
- autochunks-0.0.8.dist-info/METADATA +133 -0
- autochunks-0.0.8.dist-info/RECORD +61 -0
- autochunks-0.0.8.dist-info/WHEEL +5 -0
- autochunks-0.0.8.dist-info/entry_points.txt +2 -0
- autochunks-0.0.8.dist-info/licenses/LICENSE +15 -0
- autochunks-0.0.8.dist-info/top_level.txt +1 -0
@@ -0,0 +1,234 @@ autochunk/eval/ragas_eval.py (new file)

```python
from typing import List, Dict, Any, Optional
from ..config import RagasConfig
from ..utils.logger import logger
import os

class RagasEvaluator:
    """
    Pluggable RAGAS evaluator that supports multiple LLM backends:
    - OpenAI (default, requires OPENAI_API_KEY)
    - Ollama (local, requires a running Ollama server)
    - HuggingFace (local, GPU recommended)
    """

    def __init__(self, config: RagasConfig):
        self.config = config

    def _get_llm(self):
        """
        Returns the appropriate LLM wrapper based on config.
        Priority: config.llm_provider > OPENAI_API_KEY detection > fallback error
        """
        provider = getattr(self.config, 'llm_provider', 'auto')
        model_name = getattr(self.config, 'llm_model', None)

        # Auto-detect: check for available providers
        if provider == 'auto':
            if os.environ.get("OPENAI_API_KEY"):
                provider = 'openai'
            else:
                # Try Ollama as fallback (common for local setups)
                try:
                    import requests
                    resp = requests.get("http://localhost:11434/api/tags", timeout=2)
                    if resp.status_code == 200:
                        provider = 'ollama'
                        logger.info("RagasEvaluator: Auto-detected Ollama running locally")
                except Exception:
                    pass

        if provider == 'openai':
            from langchain_openai import ChatOpenAI
            from ragas.llms import LangchainLLMWrapper
            api_key = getattr(self.config, 'api_key', None) or os.environ.get("OPENAI_API_KEY")
            llm = ChatOpenAI(model=model_name or "gpt-4o-mini", temperature=0, api_key=api_key)
            return LangchainLLMWrapper(llm)

        elif provider == 'ollama':
            try:
                from langchain_ollama import ChatOllama
                from ragas.llms import LangchainLLMWrapper
                llm = ChatOllama(model=model_name or "llama3.2", temperature=0)
                # Set dummy key to prevent RAGAS from complaining
                os.environ.setdefault("OPENAI_API_KEY", "sk-not-used-for-ollama")
                return LangchainLLMWrapper(llm)
            except ImportError:
                logger.warning("RagasEvaluator: langchain-ollama not installed. Install with: pip install langchain-ollama")
                return None

        elif provider == 'huggingface':
            try:
                from langchain_community.llms import HuggingFacePipeline
                from ragas.llms import LangchainLLMWrapper
                from transformers import pipeline

                pipe = pipeline("text-generation", model=model_name or "microsoft/Phi-3-mini-4k-instruct", max_new_tokens=512)
                llm = HuggingFacePipeline(pipeline=pipe)
                os.environ.setdefault("OPENAI_API_KEY", "sk-not-used-for-hf")
                return LangchainLLMWrapper(llm)
            except ImportError:
                logger.warning("RagasEvaluator: transformers/langchain-community not installed")
                return None

        return None

    def run(self, chunks: List[Dict], qa: List[Dict], embedding_fn=None, k: int = 5) -> Dict[str, Any]:
        """
        Runs RAGAS evaluation independently.
        Returns a dictionary of RAGAS-specific metrics (e.g., context_precision, context_recall).
        """
        if not self.config.enabled:
            return {}

        try:
            # Import lazily so core AutoChunks doesn't crash if RAGAS isn't installed
            from ragas import evaluate
            from ragas.metrics import context_precision, context_recall
            from datasets import Dataset

            # Use AutoChunks' internal retrieval logic to build the 'contexts' column
            from ..retrieval.in_memory import InMemoryIndex
        except ImportError as e:
            logger.warning(f"RagasEvaluator: Missing dependencies ({e}). Install with 'pip install ragas datasets'")
            return {"error": "Missing RAGAS dependencies"}

        logger.info("RagasEvaluator: Starting RAGAS evaluation...")

        # 0. Get the LLM
        llm = self._get_llm()
        if llm is None:
            logger.warning("RagasEvaluator: No LLM available. Set OPENAI_API_KEY or start Ollama locally.")
            return {"error": "No LLM provider available. Set OPENAI_API_KEY or run Ollama locally."}

        # 1. Perform retrieval
        logger.info("RagasEvaluator: Performing independent retrieval step...")

        if embedding_fn is None:
            logger.warning("RagasEvaluator: No embedding_fn provided. Cannot perform retrieval.")
            return {}

        # Truncate texts to avoid embedding-model max-length errors.
        # Most models have a 512-token limit (~2000 chars is safe).
        MAX_CHARS = 1800
        def truncate(text: str) -> str:
            return text[:MAX_CHARS] if len(text) > MAX_CHARS else text

        chunk_texts = [truncate(c["text"]) for c in chunks]

        try:
            chunk_vectors = embedding_fn(chunk_texts)
        except RuntimeError as e:
            if "expanded size" in str(e) or "512" in str(e):
                logger.error("RagasEvaluator: Embedding model max length exceeded. Try smaller chunks.")
                return {"error": "Embedding model max length exceeded"}
            raise

        index = InMemoryIndex(dim=len(chunk_vectors[0]))
        index.add(chunk_vectors, chunks)

        ragas_data = {
            "question": [],
            "ground_truth": [],
            "contexts": []
        }

        valid_items_count = 0
        limit = self.config.sample_size if self.config.sample_size > 0 else len(qa)

        qa_subset = qa[:limit]
        query_texts = [truncate(q["query"]) for q in qa_subset]

        try:
            query_vectors = embedding_fn(query_texts)
        except RuntimeError as e:
            if "expanded size" in str(e):
                logger.error("RagasEvaluator: Query embedding failed - text too long")
                return {"error": "Query text too long for embedding model"}
            raise

        batch_hits = index.search(query_vectors, top_k=k)

        for item, hits in zip(qa_subset, batch_hits):
            contexts = [index.meta[idx]["text"] for idx, _ in hits]

            ragas_data["question"].append(item["query"])
            ragas_data["ground_truth"].append(item["answer_span"])
            ragas_data["contexts"].append(contexts)
            valid_items_count += 1

        if valid_items_count == 0:
            logger.warning("RagasEvaluator: No valid QA items found. Skipping.")
            return {}

        dataset = Dataset.from_dict(ragas_data)

        # 2. Run evaluation with the configured LLM
        logger.info(f"RagasEvaluator: Running evaluation with {valid_items_count} samples...")

        # Determine a safe limit (characters):
        # 1 token ~= 4 chars, with a 5% buffer.
        try:
            model_limit = 512  # Fallback

            # Check for the hashing embedder
            is_hashing = getattr(embedding_fn, "name", "").startswith("hashing") or "HashingEmbedding" in str(type(embedding_fn))

            if is_hashing:
                # RAGAS might still need *some* limit for internal processing, but hashing isn't constrained
                model_limit = 250_000
                SAFE_CHAR_LIMIT = 1_000_000
            else:
                if hasattr(embedding_fn, "max_seq_length"):
                    model_limit = embedding_fn.max_seq_length
                elif hasattr(embedding_fn, "__self__") and hasattr(embedding_fn.__self__, "max_seq_length"):
                    # Handle bound methods like encoder.embed_batch
                    model_limit = embedding_fn.__self__.max_seq_length

                SAFE_CHAR_LIMIT = int(model_limit * 4 * 0.95)  # e.g. 512 -> ~1945 chars

            # Wrap the embedding function to enforce truncation inside RAGAS
            from langchain_core.embeddings import Embeddings

            class SafeEmbeddingWrapper(Embeddings):
                def __init__(self, original_fn, limit):
                    self.fn = original_fn
                    self.limit = limit
                    self._has_warned = False

                def _truncate(self, text: str) -> str:
                    if len(text) > self.limit:
                        if not self._has_warned:
                            logger.warning(f"RagasEvaluator: Truncating text > {self.limit} chars to fit embedding model ({model_limit} tokens). "
                                           "Consider using a larger chunker or limited context model.")
                            self._has_warned = True
                        return text[:self.limit]
                    return text

                def embed_documents(self, texts: List[str]) -> List[List[float]]:
                    safe_texts = [self._truncate(t) for t in texts]
                    return self.fn(safe_texts)

                def embed_query(self, text: str) -> List[float]:
                    return self.fn([self._truncate(text)])[0]

            safe_embeddings = SafeEmbeddingWrapper(embedding_fn, SAFE_CHAR_LIMIT)

            metrics_to_run = [context_precision, context_recall]
            results = evaluate(
                dataset=dataset,
                metrics=metrics_to_run,
                llm=llm,
                embeddings=safe_embeddings  # Pass the safe wrapper
            )

            # Aggregate results
            final_metrics = {}
            for m in metrics_to_run:
                if m.name in results:
                    final_metrics[f"ragas.{m.name}"] = results[m.name]

            logger.info(f"RagasEvaluator: Evaluation complete. Metrics: {final_metrics}")
            return final_metrics
        except Exception as e:
            logger.error(f"RagasEvaluator: Evaluation failed: {e}")
            return {"error": str(e)}
```
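For orientation, a minimal usage sketch of the evaluator above. `RagasConfig` is defined in autochunk/config.py, which is not shown in this diff, so the keyword arguments below are assumptions; the toy embedding function merely stands in for a real encoder that maps a list of texts to fixed-length vectors.

```python
from autochunk.config import RagasConfig            # constructor kwargs below are assumed
from autochunk.eval.ragas_eval import RagasEvaluator

def toy_embedding_fn(texts):
    # Stand-in embedder: any callable mapping List[str] -> List[List[float]]
    # with a consistent dimension works; real usage would wrap a sentence encoder.
    return [[float(len(t) % 7), float(len(t) % 11), 1.0] for t in texts]

# Hypothetical configuration; check RagasConfig in autochunk/config.py for the real fields.
config = RagasConfig(enabled=True, llm_provider="ollama", llm_model="llama3.2", sample_size=10)
evaluator = RagasEvaluator(config)

chunks = [{"text": "Paris is the capital of France."},
          {"text": "The Eiffel Tower is located in Paris."}]
qa = [{"query": "Which city is the capital of France?",
       "answer_span": "Paris is the capital of France."}]

metrics = evaluator.run(chunks, qa, embedding_fn=toy_embedding_fn, k=2)
print(metrics)  # e.g. {"ragas.context_precision": ..., "ragas.context_recall": ...}
```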
@@ -0,0 +1,104 @@ autochunk/eval/synthetic.py (new file)

```python
import random
import nltk
from typing import List, Dict, Optional, Callable
from nltk.corpus import wordnet
from ..utils.logger import logger

class SyntheticQAGenerator:
    def __init__(self):
        self._initialized = False

    def _ensure_nltk(self, on_progress: Optional[Callable[[str], None]] = None):
        if self._initialized:
            return

        logger.info("Verifying NLTK linguistic resources...")
        if on_progress: on_progress("Verifying NLTK linguistic resources...")

        try:
            logger.debug("Checking WordNet...")
            nltk.data.find('corpora/wordnet')

            logger.debug("Checking Averaged Perceptron Tagger...")
            nltk.data.find('taggers/averaged_perceptron_tagger')

            logger.debug("Checking Punkt...")
            nltk.data.find('tokenizers/punkt')

            logger.info("NLTK resources verified.")
            if on_progress: on_progress("NLTK resources verified.")

        except LookupError:
            if on_progress: on_progress("Downloading NLTK linguistic data (this may take a minute)...")
            logger.info("NLTK resources missing. Starting download...")
            nltk.download('wordnet')
            nltk.download('omw-1.4')
            nltk.download('averaged_perceptron_tagger')
            nltk.download('punkt')
        self._initialized = True

    def generate_hard_query(self, sentence: str, on_progress: Optional[Callable[[str], None]] = None) -> str:
        self._ensure_nltk(on_progress)

        tokens = nltk.word_tokenize(sentence)
        pos_tags = nltk.pos_tag(tokens)

        # Increase hardness: drop more common words, focus on entities
        mode = random.choices(
            ["paraphrase", "keywords", "original"],
            weights=[0.7, 0.2, 0.1],
            k=1
        )[0]

        if mode == "original":
            return sentence

        if mode == "keywords":
            # Keep meaningful words so that even hashing-based retrieval works
            keywords = [w for w, t in pos_tags if t.startswith(('NN', 'VB', 'JJ')) and len(w) > 2]
            if len(keywords) > 2:
                random.shuffle(keywords)
                return " ".join(keywords[:5])
            return sentence

        new_words = []
        for word, tag in pos_tags:
            if tag.startswith(('NN', 'VB', 'JJ')) and len(word) > 3:
                syns = wordnet.synsets(word)
                if syns:
                    # Collect lemmas and filter heavily
                    lemmas = {l.name().replace('_', ' ') for s in syns for l in s.lemmas()}
                    lemmas = {l for l in lemmas if l.lower() != word.lower() and "_" not in l}
                    if lemmas:
                        new_words.append(random.choice(list(lemmas)))
                        continue
            new_words.append(word)

        query = " ".join(new_words)
        return query

    def generate_boundary_qa(self, doc_id: str, sentences: list[str], on_progress: Optional[Callable[[str], None]] = None) -> list[dict]:
        """
        Creates QA pairs where the answer span crosses sentence boundaries.
        This tests whether the chunker keeps related sentences together.
        """
        self._ensure_nltk(on_progress)
        qa = []
        for i in range(len(sentences) - 1):
            s1 = sentences[i]
            s2 = sentences[i+1]
            # Combine sentences into one answer span
            combined = s1 + " " + s2

            # Query is a paraphrase of the junction
            query_base = s1[-30:] + " " + s2[:30]
            query = self.generate_hard_query(query_base)

            qa.append({
                "id": f"{doc_id}#bqa#{i}",
                "doc_id": doc_id,
                "query": query,
                "answer_span": combined,
            })
        return qa
```
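A short sketch of driving the generator above; the import path follows autochunk/eval/synthetic.py from the file list. The first call downloads WordNet, the POS tagger, and Punkt data if they are missing.

```python
from autochunk.eval.synthetic import SyntheticQAGenerator

sentences = [
    "The cache layer stores chunk plans on disk.",
    "Plans are keyed by a hash of the source document.",
    "Stale entries are evicted when the document changes.",
]

gen = SyntheticQAGenerator()
# Builds one QA pair per adjacent sentence pair; each answer span straddles the boundary.
qa_pairs = gen.generate_boundary_qa("doc-1", sentences)
for item in qa_pairs:
    print(item["id"], "->", item["query"])
```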
@@ -0,0 +1,31 @@ autochunk/quality/__init__.py (new file)

```python
"""
AutoChunks Quality Layer

World-class quality assurance tools for chunk evaluation and optimization.
"""

from .scorer import ChunkQualityScorer, ChunkQualityReport
from .deduplicator import ChunkDeduplicator, DeduplicationResult
from .overlap_optimizer import OverlapOptimizer, OverlapOptimizationResult
from .post_processor import ChunkPostProcessor, apply_post_processing, NATIVE_CHUNKERS, BRIDGE_CHUNKERS

__all__ = [
    # Scorer
    'ChunkQualityScorer',
    'ChunkQualityReport',

    # Deduplicator
    'ChunkDeduplicator',
    'DeduplicationResult',

    # Overlap Optimizer
    'OverlapOptimizer',
    'OverlapOptimizationResult',

    # Post-Processor Pipeline
    'ChunkPostProcessor',
    'apply_post_processing',
    'NATIVE_CHUNKERS',
    'BRIDGE_CHUNKERS'
]
```
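For reference, the names re-exported above can be imported directly from the subpackage; the constructors and signatures behind them live in scorer.py, deduplicator.py, overlap_optimizer.py, and post_processor.py, which are not part of this hunk.

```python
# Public surface of the quality layer, as declared in __all__ above.
from autochunk.quality import (
    ChunkQualityScorer, ChunkQualityReport,
    ChunkDeduplicator, DeduplicationResult,
    OverlapOptimizer, OverlapOptimizationResult,
    ChunkPostProcessor, apply_post_processing,
    NATIVE_CHUNKERS, BRIDGE_CHUNKERS,
)
```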