autochunks-0.0.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autochunk/__init__.py +9 -0
- autochunk/__main__.py +5 -0
- autochunk/adapters/__init__.py +3 -0
- autochunk/adapters/haystack.py +68 -0
- autochunk/adapters/langchain.py +81 -0
- autochunk/adapters/llamaindex.py +94 -0
- autochunk/autochunker.py +606 -0
- autochunk/chunkers/__init__.py +100 -0
- autochunk/chunkers/agentic.py +184 -0
- autochunk/chunkers/base.py +16 -0
- autochunk/chunkers/contextual_retrieval.py +151 -0
- autochunk/chunkers/fixed_length.py +110 -0
- autochunk/chunkers/html_section.py +225 -0
- autochunk/chunkers/hybrid_semantic_stat.py +199 -0
- autochunk/chunkers/layout_aware.py +192 -0
- autochunk/chunkers/parent_child.py +172 -0
- autochunk/chunkers/proposition.py +175 -0
- autochunk/chunkers/python_ast.py +248 -0
- autochunk/chunkers/recursive_character.py +215 -0
- autochunk/chunkers/semantic_local.py +140 -0
- autochunk/chunkers/sentence_aware.py +102 -0
- autochunk/cli.py +135 -0
- autochunk/config.py +76 -0
- autochunk/embedding/__init__.py +22 -0
- autochunk/embedding/adapter.py +14 -0
- autochunk/embedding/base.py +33 -0
- autochunk/embedding/hashing.py +42 -0
- autochunk/embedding/local.py +154 -0
- autochunk/embedding/ollama.py +66 -0
- autochunk/embedding/openai.py +62 -0
- autochunk/embedding/tokenizer.py +9 -0
- autochunk/enrichment/__init__.py +0 -0
- autochunk/enrichment/contextual.py +29 -0
- autochunk/eval/__init__.py +0 -0
- autochunk/eval/harness.py +177 -0
- autochunk/eval/metrics.py +27 -0
- autochunk/eval/ragas_eval.py +234 -0
- autochunk/eval/synthetic.py +104 -0
- autochunk/quality/__init__.py +31 -0
- autochunk/quality/deduplicator.py +326 -0
- autochunk/quality/overlap_optimizer.py +402 -0
- autochunk/quality/post_processor.py +245 -0
- autochunk/quality/scorer.py +459 -0
- autochunk/retrieval/__init__.py +0 -0
- autochunk/retrieval/in_memory.py +47 -0
- autochunk/retrieval/parent_child.py +4 -0
- autochunk/storage/__init__.py +0 -0
- autochunk/storage/cache.py +34 -0
- autochunk/storage/plan.py +40 -0
- autochunk/utils/__init__.py +0 -0
- autochunk/utils/hashing.py +8 -0
- autochunk/utils/io.py +176 -0
- autochunk/utils/logger.py +64 -0
- autochunk/utils/telemetry.py +44 -0
- autochunk/utils/text.py +199 -0
- autochunks-0.0.8.dist-info/METADATA +133 -0
- autochunks-0.0.8.dist-info/RECORD +61 -0
- autochunks-0.0.8.dist-info/WHEEL +5 -0
- autochunks-0.0.8.dist-info/entry_points.txt +2 -0
- autochunks-0.0.8.dist-info/licenses/LICENSE +15 -0
- autochunks-0.0.8.dist-info/top_level.txt +1 -0
@@ -0,0 +1,100 @@ autochunk/chunkers/__init__.py

```python
"""
AutoChunks - World-Class Text Chunking Library

The definitive Swiss Army Knife of text splitting with extreme precision.
"""

from .base import BaseChunker, Chunk

# ============================================================================
# BASIC SPLITTERS
# ============================================================================
from .fixed_length import FixedLengthChunker
from .recursive_character import RecursiveCharacterChunker
from .sentence_aware import SentenceAwareChunker

# ============================================================================
# SEMANTIC SPLITTERS
# ============================================================================
from .semantic_local import SemanticLocalChunker
from .hybrid_semantic_stat import HybridSemanticStatChunker
from .proposition import PropositionChunker
from .agentic import AgenticChunker

# ============================================================================
# STRUCTURE-AWARE SPLITTERS
# ============================================================================
from .layout_aware import LayoutAwareChunker
from .parent_child import ParentChildChunker
from .contextual_retrieval import ContextualRetrievalChunker
from .html_section import HTMLSectionChunker

# ============================================================================
# CODE SPLITTERS
# ============================================================================
from .python_ast import PythonASTChunker

# ============================================================================
# CHUNKER REGISTRY
# ============================================================================
CHUNKER_REGISTRY = {
    # Basic
    'fixed_length': FixedLengthChunker,
    'recursive_character': RecursiveCharacterChunker,
    'sentence_aware': SentenceAwareChunker,

    # Semantic
    'semantic_local': SemanticLocalChunker,
    'hybrid_semantic_stat': HybridSemanticStatChunker,
    'proposition': PropositionChunker,
    'agentic': AgenticChunker,

    # Structure-Aware
    'layout_aware': LayoutAwareChunker,
    'parent_child': ParentChildChunker,
    'contextual_retrieval': ContextualRetrievalChunker,
    'html_section': HTMLSectionChunker,

    # Code
    'python_ast': PythonASTChunker,
}

def get_chunker(name: str) -> BaseChunker:
    """Get a chunker instance by name."""
    if name not in CHUNKER_REGISTRY:
        raise ValueError(f"Unknown chunker: {name}. Available: {list(CHUNKER_REGISTRY.keys())}")
    return CHUNKER_REGISTRY[name]()

def list_chunkers() -> list:
    """List all available chunker names."""
    return list(CHUNKER_REGISTRY.keys())

__all__ = [
    # Base
    'BaseChunker',
    'Chunk',

    # Basic
    'FixedLengthChunker',
    'RecursiveCharacterChunker',
    'SentenceAwareChunker',

    # Semantic
    'SemanticLocalChunker',
    'HybridSemanticStatChunker',
    'PropositionChunker',
    'AgenticChunker',

    # Structure-Aware
    'LayoutAwareChunker',
    'ParentChildChunker',
    'ContextualRetrievalChunker',
    'HTMLSectionChunker',

    # Code
    'PythonASTChunker',

    # Utilities
    'CHUNKER_REGISTRY',
    'get_chunker',
    'list_chunkers',
]
```
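A quick look-up-and-run sketch against the registry above; the document id and sample text are placeholders:

```python
from autochunk.chunkers import get_chunker, list_chunkers

print(list_chunkers())
# ['fixed_length', 'recursive_character', 'sentence_aware', ..., 'python_ast']

# Resolve a registry key to a class and instantiate it
chunker = get_chunker("fixed_length")
chunks = chunker.chunk("doc-001", "Some long document text ...",
                       base_token_size=256, overlap=32)
for c in chunks:
    print(c.id, c.meta["token_count"])
```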
@@ -0,0 +1,184 @@ autochunk/chunkers/agentic.py

```python
from __future__ import annotations
from typing import List, Callable, Optional
from .base import BaseChunker, Chunk
from ..utils.text import count_tokens, split_sentences

class AgenticChunker(BaseChunker):
    """
    LLM-Powered Agentic Chunker for Intelligent Boundary Detection.

    Uses a language model to decide optimal chunk boundaries based on
    semantic coherence, topic shifts, and content structure.

    BEST-OF-BREED FEATURES:
    1. LLM-Decided Boundaries: Model determines where to split based on meaning.
    2. Configurable Prompts: Custom instructions for domain-specific chunking.
    3. Fallback Safety: Reverts to sentence-aware chunking if no LLM is available.
    4. Batch Processing: Efficient API usage with batched boundary decisions.

    Reference: Greg Kamradt's "Agentic Chunker" concept.
    """
    name = "agentic"

    DEFAULT_SYSTEM_PROMPT = """You are a text segmentation expert. Your task is to identify natural boundaries in text where the topic, theme, or focus shifts significantly.

For each boundary you identify, respond with the sentence number (1-indexed) where a NEW section should begin.

Guidelines:
- A new section should start when there's a clear topic shift
- Keep related information together
- Aim for chunks of roughly 3-10 sentences
- Don't split in the middle of a logical argument or explanation
- Consider paragraph breaks as potential (but not mandatory) boundaries"""

    DEFAULT_USER_TEMPLATE = """Analyze the following text and identify where natural section boundaries should occur.

TEXT:
{text}

SENTENCES (numbered):
{numbered_sentences}

Respond with a JSON array of sentence numbers where new sections should BEGIN.
Example: [1, 5, 12, 18] means sections start at sentences 1, 5, 12, and 18.

Only output the JSON array, nothing else."""

    def __init__(self,
                 llm_fn: Optional[Callable[[str, str], str]] = None,
                 system_prompt: Optional[str] = None,
                 user_template: Optional[str] = None,
                 max_sentences_per_call: int = 50):
        """
        Initialize the agentic chunker.

        Args:
            llm_fn: Function that takes (system_prompt, user_message) and returns the
                LLM response. If None, falls back to sentence-aware chunking.
            system_prompt: Custom system prompt for the LLM.
            user_template: Custom user message template (must include {text} and {numbered_sentences}).
            max_sentences_per_call: Max sentences to process in one LLM call.
        """
        self.llm_fn = llm_fn
        self.system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT
        self.user_template = user_template or self.DEFAULT_USER_TEMPLATE
        self.max_sentences_per_call = max_sentences_per_call

    def chunk(self,
              doc_id: str,
              text: str,
              base_token_size: int = 512,
              **params) -> List[Chunk]:
        """
        Use the LLM to determine optimal chunk boundaries.

        Args:
            doc_id: Document identifier
            text: Input text
            base_token_size: Target chunk size (used as guidance for the LLM)

        Returns:
            List of Chunk objects
        """
        sentences = split_sentences(text)

        if len(sentences) <= 1:
            return [Chunk(id=f"{doc_id}#ag#0", doc_id=doc_id, text=text,
                          meta={"chunk_index": 0, "strategy": "agentic"})]

        # Fallback if no LLM function provided
        if self.llm_fn is None:
            from .sentence_aware import SentenceAwareChunker
            return SentenceAwareChunker().chunk(doc_id, text, base_token_size=base_token_size)

        # Get boundary decisions from the LLM
        boundaries = self._get_boundaries(sentences)

        # Build chunks from boundaries
        chunks = []
        current_start = 0

        for boundary in sorted(set(boundaries)):
            if boundary > current_start and boundary <= len(sentences):
                chunk_sentences = sentences[current_start:boundary]
                chunk_text = " ".join(chunk_sentences)

                chunks.append(Chunk(
                    id=f"{doc_id}#ag#{len(chunks)}",
                    doc_id=doc_id,
                    text=chunk_text,
                    meta={
                        "chunk_index": len(chunks),
                        "strategy": "agentic",
                        "sentence_range": [current_start, boundary],
                        "token_count": count_tokens(chunk_text)
                    }
                ))
                current_start = boundary

        # Add the final chunk
        if current_start < len(sentences):
            chunk_sentences = sentences[current_start:]
            chunk_text = " ".join(chunk_sentences)
            chunks.append(Chunk(
                id=f"{doc_id}#ag#{len(chunks)}",
                doc_id=doc_id,
                text=chunk_text,
                meta={
                    "chunk_index": len(chunks),
                    "strategy": "agentic",
                    "sentence_range": [current_start, len(sentences)],
                    "token_count": count_tokens(chunk_text)
                }
            ))

        return chunks if chunks else [Chunk(id=f"{doc_id}#ag#0", doc_id=doc_id, text=text,
                                            meta={"chunk_index": 0, "strategy": "agentic"})]

    def _get_boundaries(self, sentences: List[str]) -> List[int]:
        """Get boundary positions from the LLM."""
        import json

        # Always include position 0 as the first boundary
        all_boundaries = [0]

        # Process in batches if needed
        for batch_start in range(0, len(sentences), self.max_sentences_per_call):
            batch_end = min(batch_start + self.max_sentences_per_call, len(sentences))
            batch_sentences = sentences[batch_start:batch_end]

            # Create the numbered list
            numbered = "\n".join(f"{i + 1}. {s}" for i, s in enumerate(batch_sentences))
            batch_text = " ".join(batch_sentences)

            user_message = self.user_template.format(
                text=batch_text,
                numbered_sentences=numbered
            )

            try:
                response = self.llm_fn(self.system_prompt, user_message)

                # Parse the JSON response, tolerating fenced code blocks
                response = response.strip()
                if response.startswith("```"):
                    response = response.split("```")[1]
                    if response.startswith("json"):
                        response = response[4:]

                boundaries = json.loads(response)

                # Adjust for batch offset and add to the global list
                for b in boundaries:
                    if isinstance(b, int) and b > 0:
                        all_boundaries.append(batch_start + b - 1)  # convert 1-indexed to 0-indexed

            except Exception:
                # On LLM or parse failure, fall back to a heuristic for this batch:
                # split roughly every 5 sentences
                for i in range(5, len(batch_sentences), 5):
                    all_boundaries.append(batch_start + i)

        return sorted(set(all_boundaries))
```
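The `llm_fn` contract is just `(system_prompt, user_message) -> str`, and the reply is expected to be a JSON array of 1-indexed sentence numbers. A minimal sketch with a deterministic stub in place of a real model (the stub and sample text are illustrative only, and assume `split_sentences` yields four sentences here):

```python
import json

from autochunk.chunkers import AgenticChunker

def stub_llm(system_prompt: str, user_message: str) -> str:
    # A real llm_fn would send both strings to a chat model and return its text
    # reply; this stub always proposes new sections at sentences 1 and 3.
    return json.dumps([1, 3])

chunker = AgenticChunker(llm_fn=stub_llm)
chunks = chunker.chunk(
    "doc-001",
    "Cats sleep a lot. They groom often. Meanwhile, dogs need walks. Walks keep them fit.",
)
# Boundary 3 (1-indexed) becomes index 2, so the text splits into
# chunk 0 = sentences 1-2 (cats) and chunk 1 = sentences 3-4 (dogs).
```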
@@ -0,0 +1,16 @@ autochunk/chunkers/base.py

```python
from __future__ import annotations
from dataclasses import dataclass
from typing import List, Dict, Any

@dataclass
class Chunk:
    id: str
    doc_id: str
    text: str
    meta: Dict[str, Any]

class BaseChunker:
    name = "base"

    def chunk(self, doc_id: str, text: str, **params) -> List[Chunk]:
        raise NotImplementedError
```
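Every chunker in the registry implements exactly this contract: a class-level `name` plus a `chunk` method that returns `Chunk` objects. A minimal sketch of a hypothetical custom strategy (not shipped with the package), mirroring the package's `#strategy#index` id convention:

```python
from typing import List

from autochunk.chunkers.base import BaseChunker, Chunk

class ParagraphChunker(BaseChunker):
    """Hypothetical example: one chunk per blank-line-separated paragraph."""
    name = "paragraph"

    def chunk(self, doc_id: str, text: str, **params) -> List[Chunk]:
        paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
        return [
            Chunk(id=f"{doc_id}#para#{i}", doc_id=doc_id, text=p,
                  meta={"chunk_index": i, "strategy": self.name})
            for i, p in enumerate(paragraphs)
        ]
```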
@@ -0,0 +1,151 @@ autochunk/chunkers/contextual_retrieval.py

```python
from __future__ import annotations
from typing import List, Callable, Optional
from .base import BaseChunker, Chunk
from ..utils.text import count_tokens

class ContextualRetrievalChunker(BaseChunker):
    """
    Contextual Retrieval Chunker (Anthropic's Approach).

    Each chunk is prepended with LLM-generated context that situates it
    within the broader document. This dramatically improves retrieval accuracy.

    BEST-OF-BREED FEATURES:
    1. Context Prepending: Each chunk starts with situating context.
    2. Document-Aware: Context is generated with full document visibility.
    3. Retrieval-Optimized: Context is designed to improve semantic search.
    4. Caching: Efficient context reuse for repeated chunks.

    Reference: Anthropic's "Contextual Retrieval" blog post.
    """
    name = "contextual_retrieval"

    DEFAULT_CONTEXT_PROMPT = """<document>
{document}
</document>

Here is the chunk we want to situate within the whole document:
<chunk>
{chunk}
</chunk>

Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk. Answer only with the succinct context and nothing else."""

    def __init__(self,
                 llm_fn: Optional[Callable[[str, str], str]] = None,
                 base_chunker: Optional[BaseChunker] = None,
                 context_template: Optional[str] = None,
                 max_document_tokens: int = 8000,
                 context_prefix: str = "Context: "):
        """
        Initialize the contextual retrieval chunker.

        Args:
            llm_fn: Function that takes (system_prompt, user_message) and returns the LLM response.
            base_chunker: Chunker to use for initial splitting. Defaults to RecursiveCharacterChunker.
            context_template: Custom template for context generation.
            max_document_tokens: Max tokens of the document to include in context generation.
            context_prefix: Prefix before the generated context.
        """
        self.llm_fn = llm_fn
        self.base_chunker = base_chunker
        self.context_template = context_template or self.DEFAULT_CONTEXT_PROMPT
        self.max_document_tokens = max_document_tokens
        self.context_prefix = context_prefix

    def chunk(self,
              doc_id: str,
              text: str,
              base_token_size: int = 512,
              generate_context: bool = True,
              **params) -> List[Chunk]:
        """
        Create chunks with contextual headers.

        Args:
            doc_id: Document identifier
            text: Input text
            base_token_size: Target chunk size
            generate_context: If False, skip context generation (for testing)

        Returns:
            List of Chunk objects with context prepended
        """
        # Get the base chunker
        if self.base_chunker is None:
            from .recursive_character import RecursiveCharacterChunker
            base_chunker = RecursiveCharacterChunker()
        else:
            base_chunker = self.base_chunker

        # Create initial chunks
        base_chunks = base_chunker.chunk(doc_id, text, base_token_size=base_token_size, **params)

        if not generate_context or self.llm_fn is None:
            # Return base chunks unchanged, tagged so the fallback is visible downstream
            for chunk in base_chunks:
                chunk.meta["strategy"] = "contextual_retrieval_base"
            return base_chunks

        # Truncate the document for context generation
        from ..utils.text import get_tokens, decode_tokens
        doc_tokens = get_tokens(text)
        if len(doc_tokens) > self.max_document_tokens:
            truncated_doc = decode_tokens(doc_tokens[:self.max_document_tokens]) + "\n... [truncated]"
        else:
            truncated_doc = text

        # Generate context for each chunk
        contextualized_chunks = []

        for chunk in base_chunks:
            context = self._generate_context(truncated_doc, chunk.text)

            # Prepend context to the chunk text
            if context:
                contextualized_text = f"{self.context_prefix}{context}\n\n{chunk.text}"
            else:
                contextualized_text = chunk.text

            contextualized_chunks.append(Chunk(
                id=f"{doc_id}#cr#{len(contextualized_chunks)}",
                doc_id=doc_id,
                text=contextualized_text,
                meta={
                    "chunk_index": len(contextualized_chunks),
                    "strategy": "contextual_retrieval",
                    "original_text": chunk.text,
                    "generated_context": context,
                    "token_count": count_tokens(contextualized_text),
                    "original_token_count": count_tokens(chunk.text)
                }
            ))

        return contextualized_chunks

    def _generate_context(self, document: str, chunk: str) -> str:
        """Generate situating context for a chunk."""
        try:
            # Fill the context template
            prompt = self.context_template.format(
                document=document,
                chunk=chunk
            )

            # Call the LLM with an empty system prompt (everything is in the user message)
            response = self.llm_fn("", prompt)

            # Clean the response
            context = response.strip()

            # Cap the context length at 100 tokens
            if count_tokens(context) > 100:
                from ..utils.text import get_tokens, decode_tokens
                tokens = get_tokens(context)[:100]
                context = decode_tokens(tokens)

            return context

        except Exception:
            # On any LLM failure, fall back to no context
            return ""
```
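Wiring a real model in only requires an `llm_fn` with the same `(system_prompt, user_message) -> str` shape. A sketch assuming the `openai` v1 client is installed; the model name and `long_document_text` are placeholders, not part of the package:

```python
from openai import OpenAI
from autochunk.chunkers import ContextualRetrievalChunker

client = OpenAI()

def openai_llm(system_prompt: str, user_message: str) -> str:
    # Forward the two-string contract to a chat-completions call
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": user_message})
    resp = client.chat.completions.create(model="gpt-4o-mini", messages=messages)
    return resp.choices[0].message.content

chunker = ContextualRetrievalChunker(llm_fn=openai_llm)
chunks = chunker.chunk("doc-001", long_document_text)  # long_document_text: your input
# Each chunk.text now starts with "Context: <situating sentence>\n\n<original text>";
# the raw text survives in chunk.meta["original_text"] for display.
```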
@@ -0,0 +1,110 @@ autochunk/chunkers/fixed_length.py

```python
from __future__ import annotations
from typing import List, Callable, Optional
from .base import BaseChunker, Chunk

class FixedLengthChunker(BaseChunker):
    """
    Fixed-Length Chunker with Sliding Window (Overlap).

    BEST-OF-BREED FEATURES:
    1. Pluggable Length Function: Supports tiktoken, char, word, or custom functions.
    2. Start Index Tracking: Records character offset for citation purposes.
    3. Accurate Token Counting: Uses tiktoken by default for GPT-model accuracy.
    """
    name = "fixed_length"

    def __init__(self,
                 length_function: Optional[Callable[[str], int]] = None,
                 tokenizer: str = "auto"):
        """
        Initialize the chunker.

        Args:
            length_function: Custom function to measure text length.
                If None, uses token counting.
            tokenizer: Backend for tokenization ("auto", "tiktoken", "whitespace", "character")
        """
        self.tokenizer = tokenizer
        self._length_function = length_function

    def _get_length(self, text: str) -> int:
        """Get length using the configured method."""
        if self._length_function:
            return self._length_function(text)
        from ..utils.text import count_tokens
        return count_tokens(text, tokenizer=self.tokenizer)

    def chunk(self,
              doc_id: str,
              text: str,
              base_token_size: int = 512,
              overlap: int = 64,
              add_start_index: bool = False,
              **params) -> List[Chunk]:
        """
        Split text into fixed-size chunks with overlap.

        Args:
            doc_id: Document identifier
            text: Input text
            base_token_size: Target chunk size in tokens
            overlap: Number of tokens to overlap between chunks
            add_start_index: If True, record the character start position in metadata

        Returns:
            List of Chunk objects
        """
        from ..utils.text import get_tokens, decode_tokens

        if not text:
            return []

        # Get tokens for splitting
        all_tokens = get_tokens(text, tokenizer=self.tokenizer)
        if not all_tokens:
            return []

        chunks = []
        idx = 0
        token_pos = 0
        char_pos = 0  # Track character position for start_index

        while token_pos < len(all_tokens):
            # Take a window of tokens
            window_tokens = all_tokens[token_pos : token_pos + base_token_size]
            chunk_text = decode_tokens(window_tokens)

            # Build metadata
            meta = {
                "chunk_index": idx,
                "strategy": "fixed_length",
                "token_count": len(window_tokens)
            }

            if add_start_index:
                meta["start_index"] = char_pos

            chunks.append(Chunk(
                id=f"{doc_id}#fl#{idx}",
                doc_id=doc_id,
                text=chunk_text,
                meta=meta
            ))

            idx += 1

            # Step size = window - overlap (always move forward)
            step = max(1, base_token_size - overlap)

            # Update the character position (for start_index tracking)
            stepped_tokens = all_tokens[token_pos : token_pos + step]
            char_pos += len(decode_tokens(stepped_tokens))

            token_pos += step

            # Boundary safety
            if token_pos >= len(all_tokens):
                break

        return chunks
```
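Since the stride is `base_token_size - overlap`, consecutive windows share `overlap` tokens and the last window carries the remainder. A worked sketch assuming the `whitespace` backend counts space-separated words as tokens:

```python
from autochunk.chunkers import FixedLengthChunker

chunker = FixedLengthChunker(tokenizer="whitespace")
text = " ".join(f"w{i}" for i in range(1000))  # 1,000 whitespace tokens

chunks = chunker.chunk("doc-001", text,
                       base_token_size=512, overlap=64, add_start_index=True)

# step = 512 - 64 = 448, so windows start at tokens 0, 448, and 896:
# chunk 0 = tokens 0-511, chunk 1 = tokens 448-959, chunk 2 = tokens 896-999
print([c.meta["token_count"] for c in chunks])  # [512, 512, 104]
```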