autochunks-0.0.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. autochunk/__init__.py +9 -0
  2. autochunk/__main__.py +5 -0
  3. autochunk/adapters/__init__.py +3 -0
  4. autochunk/adapters/haystack.py +68 -0
  5. autochunk/adapters/langchain.py +81 -0
  6. autochunk/adapters/llamaindex.py +94 -0
  7. autochunk/autochunker.py +606 -0
  8. autochunk/chunkers/__init__.py +100 -0
  9. autochunk/chunkers/agentic.py +184 -0
  10. autochunk/chunkers/base.py +16 -0
  11. autochunk/chunkers/contextual_retrieval.py +151 -0
  12. autochunk/chunkers/fixed_length.py +110 -0
  13. autochunk/chunkers/html_section.py +225 -0
  14. autochunk/chunkers/hybrid_semantic_stat.py +199 -0
  15. autochunk/chunkers/layout_aware.py +192 -0
  16. autochunk/chunkers/parent_child.py +172 -0
  17. autochunk/chunkers/proposition.py +175 -0
  18. autochunk/chunkers/python_ast.py +248 -0
  19. autochunk/chunkers/recursive_character.py +215 -0
  20. autochunk/chunkers/semantic_local.py +140 -0
  21. autochunk/chunkers/sentence_aware.py +102 -0
  22. autochunk/cli.py +135 -0
  23. autochunk/config.py +76 -0
  24. autochunk/embedding/__init__.py +22 -0
  25. autochunk/embedding/adapter.py +14 -0
  26. autochunk/embedding/base.py +33 -0
  27. autochunk/embedding/hashing.py +42 -0
  28. autochunk/embedding/local.py +154 -0
  29. autochunk/embedding/ollama.py +66 -0
  30. autochunk/embedding/openai.py +62 -0
  31. autochunk/embedding/tokenizer.py +9 -0
  32. autochunk/enrichment/__init__.py +0 -0
  33. autochunk/enrichment/contextual.py +29 -0
  34. autochunk/eval/__init__.py +0 -0
  35. autochunk/eval/harness.py +177 -0
  36. autochunk/eval/metrics.py +27 -0
  37. autochunk/eval/ragas_eval.py +234 -0
  38. autochunk/eval/synthetic.py +104 -0
  39. autochunk/quality/__init__.py +31 -0
  40. autochunk/quality/deduplicator.py +326 -0
  41. autochunk/quality/overlap_optimizer.py +402 -0
  42. autochunk/quality/post_processor.py +245 -0
  43. autochunk/quality/scorer.py +459 -0
  44. autochunk/retrieval/__init__.py +0 -0
  45. autochunk/retrieval/in_memory.py +47 -0
  46. autochunk/retrieval/parent_child.py +4 -0
  47. autochunk/storage/__init__.py +0 -0
  48. autochunk/storage/cache.py +34 -0
  49. autochunk/storage/plan.py +40 -0
  50. autochunk/utils/__init__.py +0 -0
  51. autochunk/utils/hashing.py +8 -0
  52. autochunk/utils/io.py +176 -0
  53. autochunk/utils/logger.py +64 -0
  54. autochunk/utils/telemetry.py +44 -0
  55. autochunk/utils/text.py +199 -0
  56. autochunks-0.0.8.dist-info/METADATA +133 -0
  57. autochunks-0.0.8.dist-info/RECORD +61 -0
  58. autochunks-0.0.8.dist-info/WHEEL +5 -0
  59. autochunks-0.0.8.dist-info/entry_points.txt +2 -0
  60. autochunks-0.0.8.dist-info/licenses/LICENSE +15 -0
  61. autochunks-0.0.8.dist-info/top_level.txt +1 -0
autochunk/eval/ragas_eval.py
@@ -0,0 +1,234 @@
+ from typing import List, Dict, Any, Optional
+ from ..config import RagasConfig
+ from ..utils.logger import logger
+ import os
+
+ class RagasEvaluator:
+     """
+     Pluggable RAGAS evaluator that supports multiple LLM backends:
+     - OpenAI (default, requires OPENAI_API_KEY)
+     - Ollama (local, requires a running Ollama server)
+     - HuggingFace (local, GPU recommended)
+     """
+
+     def __init__(self, config: RagasConfig):
+         self.config = config
+
+     def _get_llm(self):
+         """
+         Returns the appropriate LLM wrapper based on config.
+         Priority: config.llm_provider > OPENAI_API_KEY detection > fallback error.
+         """
+         provider = getattr(self.config, 'llm_provider', 'auto')
+         model_name = getattr(self.config, 'llm_model', None)
+
+         # Auto-detect: check for available providers
+         if provider == 'auto':
+             if os.environ.get("OPENAI_API_KEY"):
+                 provider = 'openai'
+             else:
+                 # Try Ollama as a fallback (common for local setups)
+                 try:
+                     import requests
+                     resp = requests.get("http://localhost:11434/api/tags", timeout=2)
+                     if resp.status_code == 200:
+                         provider = 'ollama'
+                         logger.info("RagasEvaluator: Auto-detected Ollama running locally")
+                 except Exception:
+                     pass
+
+         if provider == 'openai':
+             from langchain_openai import ChatOpenAI
+             from ragas.llms import LangchainLLMWrapper
+             api_key = getattr(self.config, 'api_key', None) or os.environ.get("OPENAI_API_KEY")
+             llm = ChatOpenAI(model=model_name or "gpt-4o-mini", temperature=0, api_key=api_key)
+             return LangchainLLMWrapper(llm)
+
+         elif provider == 'ollama':
+             try:
+                 from langchain_ollama import ChatOllama
+                 from ragas.llms import LangchainLLMWrapper
+                 llm = ChatOllama(model=model_name or "llama3.2", temperature=0)
+                 # Set a dummy key so RAGAS does not complain about a missing OpenAI key
+                 os.environ.setdefault("OPENAI_API_KEY", "sk-not-used-for-ollama")
+                 return LangchainLLMWrapper(llm)
+             except ImportError:
+                 logger.warning("RagasEvaluator: langchain-ollama not installed. Install with: pip install langchain-ollama")
+                 return None
+
+         elif provider == 'huggingface':
+             try:
+                 from langchain_community.llms import HuggingFacePipeline
+                 from ragas.llms import LangchainLLMWrapper
+                 from transformers import pipeline
+
+                 pipe = pipeline("text-generation", model=model_name or "microsoft/Phi-3-mini-4k-instruct", max_new_tokens=512)
+                 llm = HuggingFacePipeline(pipeline=pipe)
+                 os.environ.setdefault("OPENAI_API_KEY", "sk-not-used-for-hf")
+                 return LangchainLLMWrapper(llm)
+             except ImportError:
+                 logger.warning("RagasEvaluator: transformers/langchain-community not installed")
+                 return None
+
+         return None
+
+     def run(self, chunks: List[Dict], qa: List[Dict], embedding_fn=None, k: int = 5) -> Dict[str, Any]:
+         """
+         Runs RAGAS evaluation independently.
+         Returns a dictionary of RAGAS-specific metrics (e.g., context_precision, context_recall).
+         """
+         if not self.config.enabled:
+             return {}
+
+         try:
+             # Import lazily so core AutoChunks doesn't crash if RAGAS isn't installed
+             from ragas import evaluate
+             from ragas.metrics import context_precision, context_recall
+             from datasets import Dataset
+
+             # Use AutoChunks' internal retrieval logic to build the 'contexts' column
+             from ..retrieval.in_memory import InMemoryIndex
+         except ImportError as e:
+             logger.warning(f"RagasEvaluator: Missing dependencies ({e}). Install with 'pip install ragas datasets'")
+             return {"error": "Missing RAGAS dependencies"}
+
+         logger.info("RagasEvaluator: Starting RAGAS evaluation...")
+
+         # 0. Get the LLM
+         llm = self._get_llm()
+         if llm is None:
+             logger.warning("RagasEvaluator: No LLM available. Set OPENAI_API_KEY or start Ollama locally.")
+             return {"error": "No LLM provider available. Set OPENAI_API_KEY or run Ollama locally."}
+
+         # 1. Perform retrieval
+         logger.info("RagasEvaluator: Performing independent retrieval step...")
+
+         if embedding_fn is None:
+             logger.warning("RagasEvaluator: No embedding_fn provided. Cannot perform retrieval.")
+             return {}
+
+         # Truncate texts to avoid embedding-model max-length errors.
+         # Most models have a 512-token limit (~2000 chars is safe).
+         MAX_CHARS = 1800
+         def truncate(text: str) -> str:
+             return text[:MAX_CHARS] if len(text) > MAX_CHARS else text
+
+         chunk_texts = [truncate(c["text"]) for c in chunks]
+
+         try:
+             chunk_vectors = embedding_fn(chunk_texts)
+         except RuntimeError as e:
+             if "expanded size" in str(e) or "512" in str(e):
+                 logger.error("RagasEvaluator: Embedding model max length exceeded. Try smaller chunks.")
+                 return {"error": "Embedding model max length exceeded"}
+             raise
+
+         index = InMemoryIndex(dim=len(chunk_vectors[0]))
+         index.add(chunk_vectors, chunks)
+
+         ragas_data = {
+             "question": [],
+             "ground_truth": [],
+             "contexts": []
+         }
+
+         valid_items_count = 0
+         limit = self.config.sample_size if self.config.sample_size > 0 else len(qa)
+
+         qa_subset = qa[:limit]
+         query_texts = [truncate(q["query"]) for q in qa_subset]
+
+         try:
+             query_vectors = embedding_fn(query_texts)
+         except RuntimeError as e:
+             if "expanded size" in str(e):
+                 logger.error("RagasEvaluator: Query embedding failed - text too long")
+                 return {"error": "Query text too long for embedding model"}
+             raise
+
+         batch_hits = index.search(query_vectors, top_k=k)
+
+         for item, hits in zip(qa_subset, batch_hits):
+             contexts = [index.meta[idx]["text"] for idx, _ in hits]
+
+             ragas_data["question"].append(item["query"])
+             ragas_data["ground_truth"].append(item["answer_span"])
+             ragas_data["contexts"].append(contexts)
+             valid_items_count += 1
+
+         if valid_items_count == 0:
+             logger.warning("RagasEvaluator: No valid QA items found. Skipping.")
+             return {}
+
+         dataset = Dataset.from_dict(ragas_data)
+
+         # 2. Run evaluation with the configured LLM
+         logger.info(f"RagasEvaluator: Running evaluation with {valid_items_count} samples...")
+
+         # Determine a safe limit in characters:
+         # 1 token ~= 4 chars, minus a 5% safety buffer.
+         try:
+             model_limit = 512  # Fallback
+
+             # Check for hashing embeddings
+             is_hashing = getattr(embedding_fn, "name", "").startswith("hashing") or "HashingEmbedding" in str(type(embedding_fn))
+
+             if is_hashing:
+                 # RAGAS may still need *some* limit for internal processing, but hashing isn't length-constrained
+                 model_limit = 250_000
+                 SAFE_CHAR_LIMIT = 1_000_000
+             else:
+                 if hasattr(embedding_fn, "max_seq_length"):
+                     model_limit = embedding_fn.max_seq_length
+                 elif hasattr(embedding_fn, "__self__") and hasattr(embedding_fn.__self__, "max_seq_length"):
+                     # Handle bound methods like encoder.embed_batch
+                     model_limit = embedding_fn.__self__.max_seq_length
+
+                 SAFE_CHAR_LIMIT = int(model_limit * 4 * 0.95)  # e.g. 512 -> ~1945 chars
+
+             # Wrap the embedding function to enforce truncation inside RAGAS
+             from langchain_core.embeddings import Embeddings
+
+             class SafeEmbeddingWrapper(Embeddings):
+                 def __init__(self, original_fn, limit):
+                     self.fn = original_fn
+                     self.limit = limit
+                     self._has_warned = False
+
+                 def _truncate(self, text: str) -> str:
+                     if len(text) > self.limit:
+                         if not self._has_warned:
+                             logger.warning(f"RagasEvaluator: Truncating text > {self.limit} chars to fit the embedding model ({model_limit} tokens). "
+                                            "Consider smaller chunks or an embedding model with a longer context.")
+                             self._has_warned = True
+                         return text[:self.limit]
+                     return text
+
+                 def embed_documents(self, texts: List[str]) -> List[List[float]]:
+                     safe_texts = [self._truncate(t) for t in texts]
+                     return self.fn(safe_texts)
+
+                 def embed_query(self, text: str) -> List[float]:
+                     return self.fn([self._truncate(text)])[0]
+
+             safe_embeddings = SafeEmbeddingWrapper(embedding_fn, SAFE_CHAR_LIMIT)
+
+             metrics_to_run = [context_precision, context_recall]
+             results = evaluate(
+                 dataset=dataset,
+                 metrics=metrics_to_run,
+                 llm=llm,
+                 embeddings=safe_embeddings  # Pass the safe wrapper
+             )
+
+             # Aggregate results
+             final_metrics = {}
+             for m in metrics_to_run:
+                 if m.name in results:
+                     final_metrics[f"ragas.{m.name}"] = results[m.name]
+
+             logger.info(f"RagasEvaluator: Evaluation complete. Metrics: {final_metrics}")
+             return final_metrics
+         except Exception as e:
+             logger.error(f"RagasEvaluator: Evaluation failed: {e}")
+             return {"error": str(e)}
autochunk/eval/synthetic.py
@@ -0,0 +1,104 @@
+
+ import random
+ import nltk
+ from typing import List, Dict, Optional, Callable
+ from nltk.corpus import wordnet
+ from ..utils.logger import logger
+
+ class SyntheticQAGenerator:
+     def __init__(self):
+         self._initialized = False
+
+     def _ensure_nltk(self, on_progress: Optional[Callable[[str], None]] = None):
+         if self._initialized:
+             return
+
+         logger.info("Verifying NLTK linguistic resources...")
+         if on_progress: on_progress("Verifying NLTK linguistic resources...")
+
+         try:
+             logger.debug("Checking WordNet...")
+             nltk.data.find('corpora/wordnet')
+
+             logger.debug("Checking Averaged Perceptron Tagger...")
+             nltk.data.find('taggers/averaged_perceptron_tagger')
+
+             logger.debug("Checking Punkt...")
+             nltk.data.find('tokenizers/punkt')
+
+             logger.info("NLTK resources verified.")
+             if on_progress: on_progress("NLTK resources verified.")
+
+         except LookupError:
+             if on_progress: on_progress("Downloading NLTK linguistic data (this may take a minute)...")
+             logger.info("NLTK resources missing. Starting download...")
+             nltk.download('wordnet')
+             nltk.download('omw-1.4')
+             nltk.download('averaged_perceptron_tagger')
+             nltk.download('punkt')
+         self._initialized = True
+
+     def generate_hard_query(self, sentence: str, on_progress: Optional[Callable[[str], None]] = None) -> str:
+         self._ensure_nltk(on_progress)
+
+         tokens = nltk.word_tokenize(sentence)
+         pos_tags = nltk.pos_tag(tokens)
+
+         # Increase hardness: drop common words, focus on content words and entities
+         mode = random.choices(
+             ["paraphrase", "keywords", "original"],
+             weights=[0.7, 0.2, 0.1],
+             k=1
+         )[0]
+
+         if mode == "original":
+             return sentence
+
+         if mode == "keywords":
+             # Keep meaningful words so that even hashing-based retrieval can match
+             keywords = [w for w, t in pos_tags if t.startswith(('NN', 'VB', 'JJ')) and len(w) > 2]
+             if len(keywords) > 2:
+                 random.shuffle(keywords)
+                 return " ".join(keywords[:5])
+             return sentence
+
+         new_words = []
+         for word, tag in pos_tags:
+             if tag.startswith(('NN', 'VB', 'JJ')) and len(word) > 3:
+                 syns = wordnet.synsets(word)
+                 if syns:
+                     # Collect lemmas and filter heavily: drop the original word and multi-word lemmas
+                     lemmas = {l.name().replace('_', ' ') for s in syns for l in s.lemmas()}
+                     lemmas = {l for l in lemmas if l.lower() != word.lower() and " " not in l}
+                     if lemmas:
+                         new_words.append(random.choice(list(lemmas)))
+                         continue
+             new_words.append(word)
+
+         query = " ".join(new_words)
+         return query
+
+     def generate_boundary_qa(self, doc_id: str, sentences: list[str], on_progress: Optional[Callable[[str], None]] = None) -> list[dict]:
+         """
+         Creates QA pairs where the answer span crosses sentence boundaries.
+         This tests whether the chunker keeps related sentences together.
+         """
+         self._ensure_nltk(on_progress)
+         qa = []
+         for i in range(len(sentences) - 1):
+             s1 = sentences[i]
+             s2 = sentences[i + 1]
+             # Combine the two sentences into one answer span
+             combined = s1 + " " + s2
+
+             # The query is a paraphrase of the junction between the two sentences
+             query_base = s1[-30:] + " " + s2[:30]
+             query = self.generate_hard_query(query_base)
+
+             qa.append({
+                 "id": f"{doc_id}#bqa#{i}",
+                 "doc_id": doc_id,
+                 "query": query,
+                 "answer_span": combined,
+             })
+         return qa
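A companion sketch (again illustrative, not part of the wheel) showing how generate_boundary_qa produces the qa list that RagasEvaluator.run consumes. The document id and sentences are made up; the dict keys (id, doc_id, query, answer_span) mirror the code above.

from autochunk.eval.synthetic import SyntheticQAGenerator

gen = SyntheticQAGenerator()

sentences = [
    "The cache stores chunk plans keyed by document hash.",
    "When the hash changes, the plan is recomputed from scratch.",
    "Telemetry records how long each chunker took.",
]

# Each QA item's answer span crosses a sentence boundary, so a chunker that
# splits exactly at that boundary will tend to fail retrieval for the item.
# The first call triggers the NLTK resource check/download in _ensure_nltk().
qa = gen.generate_boundary_qa("doc-001", sentences)

for item in qa:
    print(item["id"], "->", item["query"][:60])
# qa can then be passed as the `qa` argument of RagasEvaluator.run (see the sketch above).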
autochunk/quality/__init__.py
@@ -0,0 +1,31 @@
+ """
+ AutoChunks Quality Layer
+
+ World-class quality assurance tools for chunk evaluation and optimization.
+ """
+
+ from .scorer import ChunkQualityScorer, ChunkQualityReport
+ from .deduplicator import ChunkDeduplicator, DeduplicationResult
+ from .overlap_optimizer import OverlapOptimizer, OverlapOptimizationResult
+ from .post_processor import ChunkPostProcessor, apply_post_processing, NATIVE_CHUNKERS, BRIDGE_CHUNKERS
+
+ __all__ = [
+     # Scorer
+     'ChunkQualityScorer',
+     'ChunkQualityReport',
+
+     # Deduplicator
+     'ChunkDeduplicator',
+     'DeduplicationResult',
+
+     # Overlap Optimizer
+     'OverlapOptimizer',
+     'OverlapOptimizationResult',
+
+     # Post-Processor Pipeline
+     'ChunkPostProcessor',
+     'apply_post_processing',
+     'NATIVE_CHUNKERS',
+     'BRIDGE_CHUNKERS'
+ ]
+