autochunks-0.0.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. autochunk/__init__.py +9 -0
  2. autochunk/__main__.py +5 -0
  3. autochunk/adapters/__init__.py +3 -0
  4. autochunk/adapters/haystack.py +68 -0
  5. autochunk/adapters/langchain.py +81 -0
  6. autochunk/adapters/llamaindex.py +94 -0
  7. autochunk/autochunker.py +606 -0
  8. autochunk/chunkers/__init__.py +101 -0
  9. autochunk/chunkers/agentic.py +184 -0
  10. autochunk/chunkers/base.py +16 -0
  11. autochunk/chunkers/contextual_retrieval.py +151 -0
  12. autochunk/chunkers/fixed_length.py +110 -0
  13. autochunk/chunkers/html_section.py +225 -0
  14. autochunk/chunkers/hybrid_semantic_stat.py +199 -0
  15. autochunk/chunkers/layout_aware.py +192 -0
  16. autochunk/chunkers/parent_child.py +172 -0
  17. autochunk/chunkers/proposition.py +175 -0
  18. autochunk/chunkers/python_ast.py +248 -0
  19. autochunk/chunkers/recursive_character.py +215 -0
  20. autochunk/chunkers/semantic_local.py +140 -0
  21. autochunk/chunkers/sentence_aware.py +102 -0
  22. autochunk/cli.py +135 -0
  23. autochunk/config.py +76 -0
  24. autochunk/embedding/__init__.py +22 -0
  25. autochunk/embedding/adapter.py +14 -0
  26. autochunk/embedding/base.py +33 -0
  27. autochunk/embedding/hashing.py +42 -0
  28. autochunk/embedding/local.py +154 -0
  29. autochunk/embedding/ollama.py +66 -0
  30. autochunk/embedding/openai.py +62 -0
  31. autochunk/embedding/tokenizer.py +9 -0
  32. autochunk/enrichment/__init__.py +0 -0
  33. autochunk/enrichment/contextual.py +29 -0
  34. autochunk/eval/__init__.py +0 -0
  35. autochunk/eval/harness.py +177 -0
  36. autochunk/eval/metrics.py +27 -0
  37. autochunk/eval/ragas_eval.py +234 -0
  38. autochunk/eval/synthetic.py +104 -0
  39. autochunk/quality/__init__.py +31 -0
  40. autochunk/quality/deduplicator.py +326 -0
  41. autochunk/quality/overlap_optimizer.py +402 -0
  42. autochunk/quality/post_processor.py +245 -0
  43. autochunk/quality/scorer.py +459 -0
  44. autochunk/retrieval/__init__.py +0 -0
  45. autochunk/retrieval/in_memory.py +47 -0
  46. autochunk/retrieval/parent_child.py +4 -0
  47. autochunk/storage/__init__.py +0 -0
  48. autochunk/storage/cache.py +34 -0
  49. autochunk/storage/plan.py +40 -0
  50. autochunk/utils/__init__.py +0 -0
  51. autochunk/utils/hashing.py +8 -0
  52. autochunk/utils/io.py +176 -0
  53. autochunk/utils/logger.py +64 -0
  54. autochunk/utils/telemetry.py +44 -0
  55. autochunk/utils/text.py +199 -0
  56. autochunks-0.0.8.dist-info/METADATA +133 -0
  57. autochunks-0.0.8.dist-info/RECORD +61 -0
  58. autochunks-0.0.8.dist-info/WHEEL +5 -0
  59. autochunks-0.0.8.dist-info/entry_points.txt +2 -0
  60. autochunks-0.0.8.dist-info/licenses/LICENSE +15 -0
  61. autochunks-0.0.8.dist-info/top_level.txt +1 -0
autochunk/chunkers/__init__.py
@@ -0,0 +1,101 @@
+ """
+ AutoChunks - World-Class Text Chunking Library
+
+ The definitive Swiss Army Knife of text splitting with extreme precision.
+ """
+
+ from .base import BaseChunker, Chunk
+
+ # ============================================================================
+ # BASIC SPLITTERS
+ # ============================================================================
+ from .fixed_length import FixedLengthChunker
+ from .recursive_character import RecursiveCharacterChunker
+ from .sentence_aware import SentenceAwareChunker
+
+ # ============================================================================
+ # SEMANTIC SPLITTERS
+ # ============================================================================
+ from .semantic_local import SemanticLocalChunker
+ from .hybrid_semantic_stat import HybridSemanticStatChunker
+ from .proposition import PropositionChunker
+ from .agentic import AgenticChunker
+
+ # ============================================================================
+ # STRUCTURE-AWARE SPLITTERS
+ # ============================================================================
+ from .layout_aware import LayoutAwareChunker
+ from .parent_child import ParentChildChunker
+ from .contextual_retrieval import ContextualRetrievalChunker
+ from .html_section import HTMLSectionChunker
+
+ # ============================================================================
+ # CODE SPLITTERS
+ # ============================================================================
+ from .python_ast import PythonASTChunker
+
+ # ============================================================================
+ # CHUNKER REGISTRY
+ # ============================================================================
+ CHUNKER_REGISTRY = {
+     # Basic
+     'fixed_length': FixedLengthChunker,
+     'recursive_character': RecursiveCharacterChunker,
+     'sentence_aware': SentenceAwareChunker,
+
+     # Semantic
+     'semantic_local': SemanticLocalChunker,
+     'hybrid_semantic_stat': HybridSemanticStatChunker,
+     'proposition': PropositionChunker,
+     'agentic': AgenticChunker,
+
+     # Structure-Aware
+     'layout_aware': LayoutAwareChunker,
+     'parent_child': ParentChildChunker,
+     'contextual_retrieval': ContextualRetrievalChunker,
+     'html_section': HTMLSectionChunker,
+
+     # Code
+     'python_ast': PythonASTChunker,
+ }
+
+ def get_chunker(name: str) -> BaseChunker:
+     """Get a chunker instance by name."""
+     if name not in CHUNKER_REGISTRY:
+         raise ValueError(f"Unknown chunker: {name}. Available: {list(CHUNKER_REGISTRY.keys())}")
+     return CHUNKER_REGISTRY[name]()
+
+ def list_chunkers() -> list:
+     """List all available chunker names."""
+     return list(CHUNKER_REGISTRY.keys())
+
+ __all__ = [
+     # Base
+     'BaseChunker',
+     'Chunk',
+
+     # Basic
+     'FixedLengthChunker',
+     'RecursiveCharacterChunker',
+     'SentenceAwareChunker',
+
+     # Semantic
+     'SemanticLocalChunker',
+     'HybridSemanticStatChunker',
+     'PropositionChunker',
+     'AgenticChunker',
+
+     # Structure-Aware
+     'LayoutAwareChunker',
+     'ParentChildChunker',
+     'ContextualRetrievalChunker',
+     'HTMLSectionChunker',
+
+     # Code
+     'PythonASTChunker',
+
+     # Utilities
+     'CHUNKER_REGISTRY',
+     'get_chunker',
+     'list_chunkers',
+ ]
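
The registry above gives a string-keyed entry point to every strategy. A minimal usage sketch, assuming the wheel is installed and `text` holds a document string (the printed list is illustrative):

    # Look up a chunker by its registry name and run it.
    from autochunk.chunkers import get_chunker, list_chunkers

    print(list_chunkers())
    # e.g. ['fixed_length', 'recursive_character', 'sentence_aware', ...]

    chunker = get_chunker("fixed_length")  # returns an instance, not a class
    chunks = chunker.chunk("doc-1", text, base_token_size=512)
    for c in chunks:
        print(c.id, c.meta["chunk_index"], len(c.text))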
autochunk/chunkers/agentic.py
@@ -0,0 +1,184 @@
+
+ from __future__ import annotations
+ from typing import List, Callable, Optional
+ from .base import BaseChunker, Chunk
+ from ..utils.text import count_tokens, split_sentences
+
+ class AgenticChunker(BaseChunker):
+     """
+     LLM-Powered Agentic Chunker for Intelligent Boundary Detection.
+
+     Uses a language model to decide optimal chunk boundaries based on
+     semantic coherence, topic shifts, and content structure.
+
+     BEST-OF-BREED FEATURES:
+     1. LLM-Decided Boundaries: Model determines where to split based on meaning.
+     2. Configurable Prompts: Custom instructions for domain-specific chunking.
+     3. Fallback Safety: Reverts to sentence-aware chunking if no LLM is available.
+     4. Batch Processing: Efficient API usage with batched boundary decisions.
+
+     Reference: Greg Kamradt's "Agentic Chunker" concept.
+     """
+     name = "agentic"
+
+     DEFAULT_SYSTEM_PROMPT = """You are a text segmentation expert. Your task is to identify natural boundaries in text where the topic, theme, or focus shifts significantly.
+
+ For each boundary you identify, respond with the sentence number (1-indexed) where a NEW section should begin.
+
+ Guidelines:
+ - A new section should start when there's a clear topic shift
+ - Keep related information together
+ - Aim for chunks of roughly 3-10 sentences
+ - Don't split in the middle of a logical argument or explanation
+ - Consider paragraph breaks as potential (but not mandatory) boundaries"""
+
+     DEFAULT_USER_TEMPLATE = """Analyze the following text and identify where natural section boundaries should occur.
+
+ TEXT:
+ {text}
+
+ SENTENCES (numbered):
+ {numbered_sentences}
+
+ Respond with a JSON array of sentence numbers where new sections should BEGIN.
+ Example: [1, 5, 12, 18] means sections start at sentences 1, 5, 12, and 18.
+
+ Only output the JSON array, nothing else."""
+
+     def __init__(self,
+                  llm_fn: Optional[Callable[[str, str], str]] = None,
+                  system_prompt: Optional[str] = None,
+                  user_template: Optional[str] = None,
+                  max_sentences_per_call: int = 50):
+         """
+         Initialize the agentic chunker.
+
+         Args:
+             llm_fn: Function that takes (system_prompt, user_message) and returns the LLM response.
+                 If None, falls back to sentence-aware chunking.
+             system_prompt: Custom system prompt for the LLM
+             user_template: Custom user message template (must include {text} and {numbered_sentences})
+             max_sentences_per_call: Max sentences to process in one LLM call
+         """
+         self.llm_fn = llm_fn
+         self.system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT
+         self.user_template = user_template or self.DEFAULT_USER_TEMPLATE
+         self.max_sentences_per_call = max_sentences_per_call
+
+     def chunk(self,
+               doc_id: str,
+               text: str,
+               base_token_size: int = 512,
+               **params) -> List[Chunk]:
+         """
+         Use the LLM to determine optimal chunk boundaries.
+
+         Args:
+             doc_id: Document identifier
+             text: Input text
+             base_token_size: Target chunk size (used by the sentence-aware fallback)
+
+         Returns:
+             List of Chunk objects
+         """
+         sentences = split_sentences(text)
+
+         if len(sentences) <= 1:
+             return [Chunk(id=f"{doc_id}#ag#0", doc_id=doc_id, text=text,
+                           meta={"chunk_index": 0, "strategy": "agentic"})]
+
+         # Fallback if no LLM function provided
+         if self.llm_fn is None:
+             from .sentence_aware import SentenceAwareChunker
+             return SentenceAwareChunker().chunk(doc_id, text, base_token_size=base_token_size)
+
+         # Get boundary decisions from the LLM
+         boundaries = self._get_boundaries(sentences)
+
+         # Build chunks from boundaries
+         chunks = []
+         current_start = 0
+
+         for boundary in sorted(set(boundaries)):
+             if boundary > current_start and boundary <= len(sentences):
+                 chunk_sentences = sentences[current_start:boundary]
+                 chunk_text = " ".join(chunk_sentences)
+
+                 chunks.append(Chunk(
+                     id=f"{doc_id}#ag#{len(chunks)}",
+                     doc_id=doc_id,
+                     text=chunk_text,
+                     meta={
+                         "chunk_index": len(chunks),
+                         "strategy": "agentic",
+                         "sentence_range": [current_start, boundary],
+                         "token_count": count_tokens(chunk_text)
+                     }
+                 ))
+                 current_start = boundary
+
+         # Add the final chunk
+         if current_start < len(sentences):
+             chunk_sentences = sentences[current_start:]
+             chunk_text = " ".join(chunk_sentences)
+             chunks.append(Chunk(
+                 id=f"{doc_id}#ag#{len(chunks)}",
+                 doc_id=doc_id,
+                 text=chunk_text,
+                 meta={
+                     "chunk_index": len(chunks),
+                     "strategy": "agentic",
+                     "sentence_range": [current_start, len(sentences)],
+                     "token_count": count_tokens(chunk_text)
+                 }
+             ))
+
+         return chunks if chunks else [Chunk(id=f"{doc_id}#ag#0", doc_id=doc_id, text=text,
+                                             meta={"chunk_index": 0, "strategy": "agentic"})]
+
+     def _get_boundaries(self, sentences: List[str]) -> List[int]:
+         """Get boundary positions from the LLM."""
+         import json
+
+         # Always include position 0 as the first boundary
+         all_boundaries = [0]
+
+         # Process in batches if needed
+         for batch_start in range(0, len(sentences), self.max_sentences_per_call):
+             batch_end = min(batch_start + self.max_sentences_per_call, len(sentences))
+             batch_sentences = sentences[batch_start:batch_end]
+
+             # Create a numbered list
+             numbered = "\n".join([f"{i+1}. {s}" for i, s in enumerate(batch_sentences)])
+             batch_text = " ".join(batch_sentences)
+
+             user_message = self.user_template.format(
+                 text=batch_text,
+                 numbered_sentences=numbered
+             )
+
+             try:
+                 response = self.llm_fn(self.system_prompt, user_message)
+
+                 # Parse the JSON response
+                 # (handles responses wrapped in markdown code fences)
+                 response = response.strip()
+                 if response.startswith("```"):
+                     response = response.split("```")[1]
+                     if response.startswith("json"):
+                         response = response[4:]
+
+                 boundaries = json.loads(response)
+
+                 # Adjust for batch offset and add to all boundaries
+                 for b in boundaries:
+                     if isinstance(b, int) and b > 0:
+                         all_boundaries.append(batch_start + b - 1)  # Convert 1-indexed to 0-indexed
+
+             except Exception:
+                 # On parse failure, fall back to a fixed heuristic:
+                 # split roughly every 5 sentences in this batch
+                 for i in range(5, len(batch_sentences), 5):
+                     all_boundaries.append(batch_start + i)
+
+         return sorted(set(all_boundaries))
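
The only contract `llm_fn` must satisfy is `(system_prompt, user_message) -> str`, with the response being a JSON array of 1-indexed sentence numbers. A sketch with a deterministic stub standing in for a real model call (any chat-completion client could be wrapped the same way; `text` is assumed to be a multi-sentence document string):

    import json
    from autochunk.chunkers.agentic import AgenticChunker

    def stub_llm(system_prompt: str, user_message: str) -> str:
        # A real implementation would send both prompts to a chat model and
        # return its raw text. This stub proposes a boundary at every fourth
        # sentence, in the JSON-array format the chunker parses.
        n_lines = user_message.count("\n")  # rough upper bound on sentence count
        return json.dumps(list(range(1, max(n_lines, 2), 4)))

    chunker = AgenticChunker(llm_fn=stub_llm, max_sentences_per_call=50)
    chunks = chunker.chunk("doc-1", text)
    print([c.meta["sentence_range"] for c in chunks])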
autochunk/chunkers/base.py
@@ -0,0 +1,16 @@
+
+ from __future__ import annotations
+ from dataclasses import dataclass
+ from typing import List, Dict, Any
+
+ @dataclass
+ class Chunk:
+     id: str
+     doc_id: str
+     text: str
+     meta: Dict[str, Any]
+
+ class BaseChunker:
+     name = "base"
+     def chunk(self, doc_id: str, text: str, **params) -> List[Chunk]:
+         raise NotImplementedError
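
BaseChunker is deliberately thin, so adding a custom strategy only requires implementing `chunk()`. A hypothetical paragraph-based splitter as a sketch:

    from typing import List
    from autochunk.chunkers.base import BaseChunker, Chunk

    class ParagraphChunker(BaseChunker):
        """Hypothetical example: one chunk per blank-line-separated paragraph."""
        name = "paragraph"

        def chunk(self, doc_id: str, text: str, **params) -> List[Chunk]:
            paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
            return [
                Chunk(id=f"{doc_id}#para#{i}", doc_id=doc_id, text=p,
                      meta={"chunk_index": i, "strategy": "paragraph"})
                for i, p in enumerate(paragraphs)
            ]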
autochunk/chunkers/contextual_retrieval.py
@@ -0,0 +1,151 @@
+
+ from __future__ import annotations
+ from typing import List, Callable, Optional
+ from .base import BaseChunker, Chunk
+ from ..utils.text import count_tokens
+
+ class ContextualRetrievalChunker(BaseChunker):
+     """
+     Contextual Retrieval Chunker (Anthropic's Approach).
+
+     Each chunk is prepended with LLM-generated context that situates it
+     within the broader document. This dramatically improves retrieval accuracy.
+
+     BEST-OF-BREED FEATURES:
+     1. Context Prepending: Each chunk starts with situating context.
+     2. Document-Aware: Context is generated with full document visibility.
+     3. Retrieval-Optimized: Context is designed to improve semantic search.
+     4. Caching: Efficient context reuse for repeated chunks.
+
+     Reference: Anthropic's "Contextual Retrieval" blog post.
+     """
+     name = "contextual_retrieval"
+
+     DEFAULT_CONTEXT_PROMPT = """<document>
+ {document}
+ </document>
+
+ Here is the chunk we want to situate within the whole document:
+ <chunk>
+ {chunk}
+ </chunk>
+
+ Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk. Answer only with the succinct context and nothing else."""
+
+     def __init__(self,
+                  llm_fn: Optional[Callable[[str, str], str]] = None,
+                  base_chunker: Optional[BaseChunker] = None,
+                  context_template: Optional[str] = None,
+                  max_document_tokens: int = 8000,
+                  context_prefix: str = "Context: "):
+         """
+         Initialize the contextual retrieval chunker.
+
+         Args:
+             llm_fn: Function that takes (system_prompt, user_message) and returns the LLM response.
+             base_chunker: Chunker to use for initial splitting. Defaults to RecursiveCharacter.
+             context_template: Custom template for context generation.
+             max_document_tokens: Max tokens of the document to include in context generation.
+             context_prefix: Prefix before the generated context.
+         """
+         self.llm_fn = llm_fn
+         self.base_chunker = base_chunker
+         self.context_template = context_template or self.DEFAULT_CONTEXT_PROMPT
+         self.max_document_tokens = max_document_tokens
+         self.context_prefix = context_prefix
+
+     def chunk(self,
+               doc_id: str,
+               text: str,
+               base_token_size: int = 512,
+               generate_context: bool = True,
+               **params) -> List[Chunk]:
+         """
+         Create chunks with contextual headers.
+
+         Args:
+             doc_id: Document identifier
+             text: Input text
+             base_token_size: Target chunk size
+             generate_context: If False, skip context generation (for testing)
+
+         Returns:
+             List of Chunk objects with context prepended
+         """
+         # Get the base chunker
+         if self.base_chunker is None:
+             from .recursive_character import RecursiveCharacterChunker
+             base_chunker = RecursiveCharacterChunker()
+         else:
+             base_chunker = self.base_chunker
+
+         # Create the initial chunks
+         base_chunks = base_chunker.chunk(doc_id, text, base_token_size=base_token_size, **params)
+
+         if not generate_context or self.llm_fn is None:
+             # Tag the base chunks and return them without generated context
+             for chunk in base_chunks:
+                 chunk.meta["strategy"] = "contextual_retrieval_base"
+             return base_chunks
+
+         # Truncate the document for context generation
+         from ..utils.text import get_tokens, decode_tokens
+         doc_tokens = get_tokens(text)
+         if len(doc_tokens) > self.max_document_tokens:
+             truncated_doc = decode_tokens(doc_tokens[:self.max_document_tokens]) + "\n... [truncated]"
+         else:
+             truncated_doc = text
+
+         # Generate context for each chunk
+         contextualized_chunks = []
+
+         for chunk in base_chunks:
+             context = self._generate_context(truncated_doc, chunk.text)
+
+             # Prepend context to the chunk text
+             if context:
+                 contextualized_text = f"{self.context_prefix}{context}\n\n{chunk.text}"
+             else:
+                 contextualized_text = chunk.text
+
+             contextualized_chunks.append(Chunk(
+                 id=f"{doc_id}#cr#{len(contextualized_chunks)}",
+                 doc_id=doc_id,
+                 text=contextualized_text,
+                 meta={
+                     "chunk_index": len(contextualized_chunks),
+                     "strategy": "contextual_retrieval",
+                     "original_text": chunk.text,
+                     "generated_context": context,
+                     "token_count": count_tokens(contextualized_text),
+                     "original_token_count": count_tokens(chunk.text)
+                 }
+             ))
+
+         return contextualized_chunks
+
+     def _generate_context(self, document: str, chunk: str) -> str:
+         """Generate situating context for a chunk."""
+         try:
+             # Fill in the context template
+             prompt = self.context_template.format(
+                 document=document,
+                 chunk=chunk
+             )
+
+             # Call the LLM with an empty system prompt (everything is in the user message)
+             response = self.llm_fn("", prompt)
+
+             # Clean the response
+             context = response.strip()
+
+             # Limit the context length
+             if count_tokens(context) > 100:
+                 from ..utils.text import get_tokens, decode_tokens
+                 tokens = get_tokens(context)[:100]
+                 context = decode_tokens(tokens)
+
+             return context
+
+         except Exception:  # On LLM failure, fall back to the bare chunk
+             return ""
autochunk/chunkers/fixed_length.py
@@ -0,0 +1,110 @@
+
+ from __future__ import annotations
+ from typing import List, Callable, Optional
+ from .base import BaseChunker, Chunk
+
+ class FixedLengthChunker(BaseChunker):
+     """
+     Fixed-Length Chunker with Sliding Window (Overlap).
+
+     BEST-OF-BREED FEATURES:
+     1. Pluggable Length Function: Supports tiktoken, char, word, or custom functions.
+     2. Start Index Tracking: Records character offset for citation purposes.
+     3. Accurate Token Counting: Uses tiktoken by default for GPT-model accuracy.
+     """
+     name = "fixed_length"
+
+     def __init__(self,
+                  length_function: Optional[Callable[[str], int]] = None,
+                  tokenizer: str = "auto"):
+         """
+         Initialize the chunker.
+
+         Args:
+             length_function: Custom function to measure text length.
+                 If None, uses token counting.
+             tokenizer: Backend for tokenization ("auto", "tiktoken", "whitespace", "character")
+         """
+         self.tokenizer = tokenizer
+         self._length_function = length_function
+
+     def _get_length(self, text: str) -> int:
+         """Get length using the configured method."""
+         if self._length_function:
+             return self._length_function(text)
+         from ..utils.text import count_tokens
+         return count_tokens(text, tokenizer=self.tokenizer)
+
+     def chunk(self,
+               doc_id: str,
+               text: str,
+               base_token_size: int = 512,
+               overlap: int = 64,
+               add_start_index: bool = False,
+               **params) -> List[Chunk]:
+         """
+         Split text into fixed-size chunks with overlap.
+
+         Args:
+             doc_id: Document identifier
+             text: Input text
+             base_token_size: Target chunk size in tokens
+             overlap: Number of tokens to overlap between chunks
+             add_start_index: If True, record the character start position in metadata
+
+         Returns:
+             List of Chunk objects
+         """
+         from ..utils.text import get_tokens, decode_tokens
+
+         if not text:
+             return []
+
+         # Get tokens for splitting
+         all_tokens = get_tokens(text, tokenizer=self.tokenizer)
+         if not all_tokens:
+             return []
+
+         chunks = []
+         idx = 0
+         token_pos = 0
+         char_pos = 0  # Track the character position for start_index
+
+         while token_pos < len(all_tokens):
+             # Take a window of tokens
+             window_tokens = all_tokens[token_pos : token_pos + base_token_size]
+             chunk_text = decode_tokens(window_tokens)
+
+             # Build metadata
+             meta = {
+                 "chunk_index": idx,
+                 "strategy": "fixed_length",
+                 "token_count": len(window_tokens)
+             }
+
+             if add_start_index:
+                 meta["start_index"] = char_pos
+
+             chunks.append(Chunk(
+                 id=f"{doc_id}#fl#{idx}",
+                 doc_id=doc_id,
+                 text=chunk_text,
+                 meta=meta
+             ))
+
+             idx += 1
+
+             # Step size = window - overlap (move forward)
+             step = max(1, base_token_size - overlap)
+
+             # Update the character position (for start_index tracking)
+             stepped_tokens = all_tokens[token_pos : token_pos + step]
+             char_pos += len(decode_tokens(stepped_tokens))
+
+             token_pos += step
+
+             # Boundary safety
+             if token_pos >= len(all_tokens):
+                 break
+
+         return chunks
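
Since the window advances by `base_token_size - overlap` tokens per step, consecutive chunks share exactly `overlap` tokens. A short sketch exercising that and the `start_index` metadata (exact token counts depend on the tokenizer backend; `text` is assumed to be a long document string):

    from autochunk.chunkers.fixed_length import FixedLengthChunker

    chunker = FixedLengthChunker(tokenizer="auto")
    chunks = chunker.chunk(
        "doc-1",
        text,
        base_token_size=256,
        overlap=32,           # each step advances 256 - 32 = 224 tokens
        add_start_index=True,
    )
    for c in chunks[:3]:
        # start_index is the character offset of the chunk in the source,
        # which makes it possible to map citations back to the document.
        print(c.meta["chunk_index"], c.meta["token_count"], c.meta["start_index"])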