adaptive-oci-chunking 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ from adaptive_chunking.models import Chunk, ChunkingResult, Document
2
+ from adaptive_chunking.pipeline import AdaptiveChunker
3
+
4
+ __all__ = ["AdaptiveChunker", "Chunk", "ChunkingResult", "Document"]
5
+
@@ -0,0 +1,40 @@
1
+ from __future__ import annotations
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+ from adaptive_chunking.pipeline import AdaptiveChunker
6
+
7
+ try:
8
+ from fastapi import FastAPI
9
+ except ImportError as exc: # pragma: no cover
10
+ raise RuntimeError("Install API support with `pip install -e .[api]`.") from exc
11
+
12
+
13
class ChunkRequest(BaseModel):
    # Request body accepted by the POST /chunk endpoint.

    # Document text to chunk; validation rejects an empty string.
    text: str = Field(min_length=1)

    # Caller-supplied identifier, forwarded to the chunker as document_id.
    document_id: str = "document"
16
+
17
+
18
class ChunkResponse(BaseModel):
    # Response body returned by the POST /chunk endpoint.

    # Identifier reported by the chunking result.
    document_id: str

    # Name of the strategy reported by the chunking result.
    strategy_name: str

    # Score reported by the chunking result.
    score: float

    # One plain dict per produced chunk (the chunk object's attributes).
    chunks: list[dict]

    # One plain dict per metric (the metric object's attributes).
    metrics: list[dict]
24
+
25
+
26
+ app = FastAPI(title="Adaptive OCI Chunking")
27
+ chunker = AdaptiveChunker()
28
+
29
+
30
@app.post("/chunk", response_model=ChunkResponse)
def chunk(request: ChunkRequest) -> dict:
    # Run the shared module-level chunker on the submitted text and flatten
    # the result into plain dicts matching ChunkResponse.
    outcome = chunker.chunk(request.text, document_id=request.document_id)

    # Chunk and metric objects are exposed through their attribute dicts.
    chunk_payload = [item.__dict__ for item in outcome.chunks]
    metric_payload = [item.__dict__ for item in outcome.metrics]

    response: dict = {
        "document_id": outcome.document_id,
        "strategy_name": outcome.strategy_name,
        "score": outcome.score,
        "chunks": chunk_payload,
        "metrics": metric_payload,
    }
    return response
40
+
@@ -0,0 +1,342 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from abc import ABC, abstractmethod
5
+
6
+ from adaptive_chunking.models import Chunk
7
+ from adaptive_chunking.text import cosine_bow, normalize_space, sentences
8
+
9
+
10
class BaseChunker(ABC):
    """Abstract base for chunking strategies.

    Concrete strategies set the ``name`` class attribute and implement
    :meth:`split`.
    """

    name: str

    @abstractmethod
    def split(self, text: str) -> list[Chunk]:
        """Split *text* into chunks; subclasses must override this."""
        raise NotImplementedError

    def _build_chunks(self, spans: list[tuple[int, int]], text: str) -> list[Chunk]:
        """Convert ``(start, end)`` character spans of *text* into chunks.

        Each span's slice is passed through ``normalize_space``; spans whose
        result is empty are dropped. ``index`` counts only the chunks that
        are kept, while ``start_char``/``end_char`` are the span offsets.
        """
        built: list[Chunk] = []
        for begin, stop in spans:
            body = normalize_space(text[begin:stop])
            if not body:
                continue
            built.append(
                Chunk(text=body, index=len(built), start_char=begin, end_char=stop)
            )
        return built
31
+
32
+
33
class FixedWindowChunker(BaseChunker):
    """Strategy that slides a fixed-size character window over the text.

    Consecutive windows start ``chunk_size - overlap`` characters apart.
    """

    name = "fixed-window"

    def __init__(self, chunk_size: int = 1200, overlap: int = 120) -> None:
        """Store the window configuration.

        Raises:
            ValueError: if ``chunk_size`` is not positive, or ``overlap`` is
                negative or not smaller than ``chunk_size``.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if not 0 <= overlap < chunk_size:
            raise ValueError("overlap must be non-negative and smaller than chunk_size")
        self.chunk_size = chunk_size
        self.overlap = overlap

    def split(self, text: str) -> list[Chunk]:
        """Return chunks for successive windows, stopping at the end of *text*."""
        total = len(text)
        stride = self.chunk_size - self.overlap
        windows: list[tuple[int, int]] = []
        offset = 0
        while offset < total:
            stop = min(offset + self.chunk_size, total)
            windows.append((offset, stop))
            if stop == total:
                # The window reached the end; further windows would only
                # repeat the overlapping tail.
                break
            offset += stride
        return self._build_chunks(windows, text)
53
+
54
+
55
class SingleChunker(BaseChunker):
    """Strategy that keeps the whole document as one span."""

    name = "single"

    def split(self, text: str) -> list[Chunk]:
        """Return the chunk list built from a single span covering *text*."""
        whole_document = (0, len(text))
        return self._build_chunks([whole_document], text)
60
+
61
+
62
class DelimiterChunker(BaseChunker):
    """Strategy that splits on a literal delimiter string.

    Spans longer than ``max_size`` characters are re-split by a
    ``RecursiveChunker`` fallback, which is also used for the whole text
    when the delimiter is empty or does not occur.
    """

    name = "delimiter"

    def __init__(
        self,
        delimiter: str = "\n---\n",
        keep_delimiter: bool = False,
        max_size: int = 1800,
    ) -> None:
        """Store the configuration and build the fallback chunker.

        Args:
            delimiter: Literal string that separates spans.
            keep_delimiter: When true, each span includes its trailing
                delimiter.
            max_size: Longest span, in characters, kept without re-splitting.
        """
        self.delimiter = delimiter
        self.keep_delimiter = keep_delimiter
        self.max_size = max_size
        self.fallback = RecursiveChunker(chunk_size=max_size)

    def split(self, text: str) -> list[Chunk]:
        """Split *text* at every delimiter occurrence."""
        if not self.delimiter or self.delimiter not in text:
            return self.fallback.split(text)
        delimiter_length = len(self.delimiter)
        spans: list[tuple[int, int]] = []
        cursor = 0
        while cursor < len(text):
            split_at = text.find(self.delimiter, cursor)
            if split_at < 0:
                # No further delimiter: the remainder is the last span.
                spans.append((cursor, len(text)))
                break
            end = split_at + delimiter_length if self.keep_delimiter else split_at
            spans.append((cursor, end))
            cursor = split_at + delimiter_length
        return self._split_oversized_spans(spans, text)

    def _split_oversized_spans(self, spans: list[tuple[int, int]], text: str) -> list[Chunk]:
        """Build chunks from *spans*, re-splitting any span over ``max_size``.

        Every returned chunk is numbered by its position in the final list.
        ``_build_chunks`` numbers from zero on each call, so its output is
        re-wrapped here; otherwise every normal-sized chunk would carry
        ``index == 0``.
        """
        chunks: list[Chunk] = []
        for start, end in spans:
            if end - start <= self.max_size:
                pieces = self._build_chunks([(start, end)], text)
                offset = 0  # offsets are already relative to the full text
            else:
                pieces = self.fallback.split(text[start:end])
                offset = start  # fallback offsets are relative to the slice
            for piece in pieces:
                chunks.append(
                    Chunk(
                        piece.text,
                        len(chunks),
                        offset + piece.start_char,
                        offset + piece.end_char,
                        piece.metadata,
                    )
                )
        return chunks
102
+
103
+
104
class PageChunker(DelimiterChunker):
    """Delimiter strategy preconfigured for page breaks.

    Uses the form-feed character as the default delimiter and never keeps
    the delimiter in the resulting spans.
    """

    name = "page"

    def __init__(self, page_delimiter: str = "\f", max_size: int = 2200) -> None:
        """Configure the parent chunker with the page delimiter and size cap."""
        super().__init__(
            delimiter=page_delimiter,
            keep_delimiter=False,
            max_size=max_size,
        )
109
+
110
+
111
class RecursiveChunker(BaseChunker):
    """Strategy that cuts at the best available separator per window.

    Separators are tried in priority order (paragraph break, line break,
    sentence end, space by default). A window is hard-cut at ``chunk_size``
    characters only when none of them occurs inside it.
    """

    name = "recursive"

    def __init__(self, chunk_size: int = 1200, separators: tuple[str, ...] | None = None) -> None:
        """Store the configuration.

        Args:
            chunk_size: Maximum span length in characters.
            separators: Separators in priority order; a falsy value selects
                the default hierarchy.

        Raises:
            ValueError: if ``chunk_size`` is not positive (the splitting loop
                could not make progress with such a value).
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        self.chunk_size = chunk_size
        self.separators = separators or ("\n\n", "\n", ". ", " ")

    def split(self, text: str) -> list[Chunk]:
        """Split *text* into chunks of at most ``chunk_size`` characters."""
        spans = self._split_span(text, 0, len(text), 0)
        return self._build_chunks(spans, text)

    def _split_span(
        self,
        text: str,
        start: int,
        end: int,
        separator_index: int,
    ) -> list[tuple[int, int]]:
        """Partition ``text[start:end]`` into spans no longer than ``chunk_size``.

        Only separators from ``separator_index`` onwards are considered. Each
        span ends just after the last occurrence of the highest-priority
        separator found inside the current window. A remainder that already
        fits is kept whole.
        """
        if end - start <= self.chunk_size:
            return [(start, end)]

        candidates = self.separators[separator_index:]
        pieces: list[tuple[int, int]] = []
        cursor = start
        while end - cursor > self.chunk_size:
            limit = cursor + self.chunk_size
            split_end = limit  # hard cut when no separator fits the window
            for separator in candidates:
                split_at = text.rfind(separator, cursor, limit)
                # A hit exactly at the cursor would give a separator-only
                # span, so it does not count as a usable split point.
                if split_at > cursor:
                    split_end = split_at + len(separator)
                    break
            pieces.append((cursor, split_end))
            cursor = split_end
        # Every cut lands strictly before ``end``, so a non-empty tail that
        # fits in one span always remains.
        pieces.append((cursor, end))
        return pieces
151
+
152
+
153
class SplitThenMergeChunker(BaseChunker):
    """Strategy that splits into paragraphs, then merges neighbours.

    A paragraph joins the current group when the combined span stays within
    ``max_size`` or the group is still shorter than ``min_size``.
    """

    name = "split-then-merge"

    def __init__(self, min_size: int = 600, max_size: int = 1400) -> None:
        """Store the size bounds.

        Raises:
            ValueError: unless ``0 < min_size < max_size``.
        """
        if min_size <= 0 or max_size <= min_size:
            raise ValueError("expected 0 < min_size < max_size")
        self.min_size = min_size
        self.max_size = max_size

    def split(self, text: str) -> list[Chunk]:
        """Return chunks built from the merged paragraph groups of *text*."""
        paragraphs = _paragraph_spans(text)
        if not paragraphs:
            return []

        (group_start, group_end), *remaining = paragraphs
        groups: list[tuple[int, int]] = []
        for para_start, para_end in remaining:
            fits = para_end - group_start <= self.max_size
            undersized = group_end - group_start < self.min_size
            if fits or undersized:
                group_end = para_end
                continue
            groups.append((group_start, group_end))
            group_start, group_end = para_start, para_end
        groups.append((group_start, group_end))
        return self._build_chunks(groups, text)
177
+
178
+
179
class SectionAwareChunker(BaseChunker):
    """Strategy that cuts at heading-like lines and merges small sections.

    Section boundaries are the start offsets of lines matching
    ``heading_pattern``. Merged sections longer than ``max_size`` are
    re-split by a ``SplitThenMergeChunker`` fallback.
    """

    name = "section-aware"

    def __init__(self, min_size: int = 500, max_size: int = 1800) -> None:
        """Store the size bounds and build the fallback and heading pattern.

        The fallback is a ``SplitThenMergeChunker`` built with the same
        bounds, so its validation of them applies here as well.
        """
        self.min_size = min_size
        self.max_size = max_size
        self.fallback = SplitThenMergeChunker(min_size=min_size, max_size=max_size)
        self.heading_pattern = re.compile(
            r"(?m)^(?:#{1,6}\s+.+|\d+(?:\.\d+)*\s+[A-Z].+|[A-Z][A-Za-z0-9 ,:;&()/-]{3,80})$"
        )

    def split(self, text: str) -> list[Chunk]:
        """Split *text* into section-based chunks.

        Chunks are numbered by their position in the returned list.
        """
        boundaries = sorted(
            {0, len(text), *(match.start() for match in self.heading_pattern.finditer(text))}
        )
        # Whitespace-only sections are ignored before merging.
        sections = [
            (start, end)
            for start, end in zip(boundaries, boundaries[1:])
            if text[start:end].strip()
        ]

        merged: list[tuple[int, int]] = []
        for start, end in sections:
            if not merged:
                merged.append((start, end))
                continue
            current_start, current_end = merged[-1]
            fits = end - current_start <= self.max_size
            undersized = current_end - current_start < self.min_size
            if fits or undersized:
                merged[-1] = (current_start, end)
            else:
                merged.append((start, end))

        chunks: list[Chunk] = []
        for start, end in merged:
            if end - start <= self.max_size:
                pieces = self._build_chunks([(start, end)], text)
                offset = 0  # offsets are already relative to the full text
            else:
                pieces = self.fallback.split(text[start:end])
                offset = start  # fallback offsets are relative to the slice
            # Re-wrap every piece so its index is its position in the final
            # list; ``_build_chunks`` numbers from zero on each call.
            for piece in pieces:
                chunks.append(
                    Chunk(
                        piece.text,
                        len(chunks),
                        offset + piece.start_char,
                        offset + piece.end_char,
                        piece.metadata,
                    )
                )
        return chunks
228
+
229
+
230
class SemanticChunker(BaseChunker):
    """Strategy that groups sentences, breaking on size or low similarity.

    Similarity between consecutive sentences is measured with ``cosine_bow``.
    """

    name = "semantic"

    def __init__(
        self,
        max_size: int = 1400,
        min_size: int = 350,
        similarity_threshold: float = 0.10,
    ) -> None:
        """Store the size bounds and the similarity threshold."""
        self.max_size = max_size
        self.min_size = min_size
        self.similarity_threshold = similarity_threshold

    def split(self, text: str) -> list[Chunk]:
        """Return chunks built from groups of consecutive sentences.

        A group is closed before the next sentence only once it has reached
        ``min_size`` and either adding the sentence would exceed
        ``max_size`` or the sentence's similarity to the previous one is
        below ``similarity_threshold``.
        """
        spans = _sentence_spans(text)
        if not spans:
            return []

        groups: list[tuple[int, int]] = []
        group_start, group_end = spans[0]
        last_sentence = text[group_start:group_end]
        for sent_start, sent_end in spans[1:]:
            sentence = text[sent_start:sent_end]
            similarity = cosine_bow(last_sentence, sentence)
            big_enough = group_end - group_start >= self.min_size
            overflow = sent_end - group_start > self.max_size
            off_topic = similarity < self.similarity_threshold
            if big_enough and (overflow or off_topic):
                groups.append((group_start, group_end))
                group_start, group_end = sent_start, sent_end
            else:
                group_end = sent_end
            last_sentence = sentence
        groups.append((group_start, group_end))
        return self._build_chunks(groups, text)
266
+
267
+
268
class RegexSectionChunker(BaseChunker):
    """Strategy that cuts at lines matching a heading regular expression.

    Sections longer than ``max_size`` are re-split by a ``RecursiveChunker``
    fallback, which also handles text containing no heading at all.
    """

    name = "regex-section"

    def __init__(self, max_size: int = 1800, heading_pattern: str | None = None) -> None:
        """Compile the heading pattern and build the fallback chunker.

        A falsy ``heading_pattern`` selects the built-in pattern.
        """
        self.max_size = max_size
        self.heading_pattern = re.compile(
            heading_pattern or r"(?m)^(?:#{1,6}\s+.+|\d+(?:\.\d+)*\s+[A-Z].+)$"
        )
        self.fallback = RecursiveChunker(chunk_size=max_size)

    def split(self, text: str) -> list[Chunk]:
        """Split *text* into heading-delimited chunks, numbered in order."""
        heading_offsets = [match.start() for match in self.heading_pattern.finditer(text)]
        if not heading_offsets:
            return self.fallback.split(text)

        boundaries = sorted({0, *heading_offsets, len(text)})
        collected: list[Chunk] = []
        for begin, stop in zip(boundaries, boundaries[1:]):
            if stop - begin <= self.max_size:
                collected.extend(self._build_chunks([(begin, stop)], text))
                continue
            # Fallback offsets are relative to the section, so shift them.
            for piece in self.fallback.split(text[begin:stop]):
                collected.append(
                    Chunk(
                        text=piece.text,
                        index=len(collected),
                        start_char=begin + piece.start_char,
                        end_char=begin + piece.end_char,
                    )
                )

        # Renumber so each index is the chunk's position in the final list.
        renumbered: list[Chunk] = []
        for position, item in enumerate(collected):
            renumbered.append(
                Chunk(item.text, position, item.start_char, item.end_char, item.metadata)
            )
        return renumbered
303
+
304
+
305
def default_chunkers() -> list[BaseChunker]:
    """Return one default-configured instance of every built-in strategy."""
    strategy_types = (
        SingleChunker,
        FixedWindowChunker,
        RecursiveChunker,
        SplitThenMergeChunker,
        SectionAwareChunker,
        DelimiterChunker,
        PageChunker,
        SemanticChunker,
        RegexSectionChunker,
    )
    return [strategy_type() for strategy_type in strategy_types]
317
+
318
+
319
def _paragraph_spans(text: str) -> list[tuple[int, int]]:
    """Return ``(start, end)`` spans of the non-blank paragraphs in *text*.

    Paragraphs are separated by a blank line (two newlines with only
    whitespace between them). Each span includes its trailing separator,
    and spans containing only whitespace are omitted. The result is empty
    for empty or whitespace-only text.
    """
    # Candidate paragraph ends: the end of every separator, then end of text.
    boundaries = [match.end() for match in re.finditer(r"\n\s*\n", text)]
    boundaries.append(len(text))

    spans: list[tuple[int, int]] = []
    cursor = 0
    for end in boundaries:
        if text[cursor:end].strip():
            spans.append((cursor, end))
        cursor = end
    # No extra whole-text fallback is needed: any non-whitespace character
    # lies in one of the regions above, so that region is always kept.
    return spans
330
+
331
+
332
def _sentence_spans(text: str) -> list[tuple[int, int]]:
    """Locate each sentence from ``sentences(text)`` inside *text*.

    Sentences are searched for in order, each from the end of the previous
    hit. A sentence that cannot be found verbatim is skipped.
    """
    located: list[tuple[int, int]] = []
    search_from = 0
    for sentence in sentences(text):
        begin = text.find(sentence, search_from)
        if begin >= 0:
            stop = begin + len(sentence)
            located.append((begin, stop))
            search_from = stop
    return located
@@ -0,0 +1,62 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+ from typing import Annotated
6
+
7
+ import typer
8
+ from rich.console import Console
9
+ from rich.table import Table
10
+
11
+ from adaptive_chunking.io import load_text_file
12
+ from adaptive_chunking.pipeline import AdaptiveChunker
13
+
14
+ app = typer.Typer(help="Adaptive document chunking for RAG.")
15
+ console = Console()
16
+
17
+
18
@app.command()
def chunk(
    path: Annotated[
        Path,
        typer.Argument(exists=True, readable=True, help="Text or Markdown file to chunk."),
    ],
    document_id: Annotated[
        str | None,
        typer.Option(help="Stable document identifier."),
    ] = None,
    json_output: Annotated[
        bool,
        typer.Option("--json", help="Print machine-readable JSON."),
    ] = False,
) -> None:
    # Chunk the file at `path` and print the selected strategy's result,
    # either as JSON (--json) or as a human-readable report.
    # (Kept as comments: a docstring here would become the CLI help text.)
    text = load_text_file(path)
    # The file stem is the document id when none is given.
    result = AdaptiveChunker().chunk(text, document_id=document_id or path.stem)

    if json_output:
        payload = {
            "document_id": result.document_id,
            "strategy_name": result.strategy_name,
            "score": result.score,
            "chunks": [item.__dict__ for item in result.chunks],
            "metrics": [item.__dict__ for item in result.metrics],
        }
        # Bypass the Rich console for machine-readable output: Rich parses
        # "[...]" sequences as markup and wraps long lines at the console
        # width, either of which can corrupt the JSON document.
        typer.echo(json.dumps(payload, indent=2))
        return

    console.print(f"[bold]Strategy:[/bold] {result.strategy_name}")
    console.print(f"[bold]Score:[/bold] {result.score:.3f}")
    table = Table("Metric", "Value", "Weight")
    for metric in result.metrics:
        table.add_row(metric.name, f"{metric.value:.3f}", f"{metric.weight:.2f}")
    console.print(table)
    for chunk_item in result.chunks:
        console.rule(f"Chunk {chunk_item.index}")
        # Chunk text is arbitrary document content; print it verbatim so
        # bracketed text is not interpreted as Rich markup.
        console.print(chunk_item.text, markup=False)
59
+
60
+
61
+ if __name__ == "__main__":
62
+ app()
@@ -0,0 +1,24 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ SUPPORTED_TEXT_SUFFIXES = {".txt", ".md", ".markdown", ".rst"}
6
+
7
+
8
def load_text_file(path: str | Path) -> str:
    """Read a supported text file as UTF-8 and return its contents.

    Raises:
        ValueError: if the file's suffix, compared case-insensitively, is
            not in ``SUPPORTED_TEXT_SUFFIXES``.
    """
    source = Path(path)
    suffix = source.suffix
    if suffix.lower() in SUPPORTED_TEXT_SUFFIXES:
        return source.read_text(encoding="utf-8")
    raise ValueError(f"unsupported file type: {suffix}")
13
+
14
+
15
def discover_text_files(path: str | Path) -> list[Path]:
    """List the text files found at *path*.

    A path that is a regular file is returned as a one-element list without
    checking its suffix. Otherwise the tree under *path* is searched
    recursively for files whose suffix is in ``SUPPORTED_TEXT_SUFFIXES``,
    and the matches are returned sorted.
    """
    root = Path(path)
    if root.is_file():
        return [root]

    matches = [
        candidate
        for candidate in root.rglob("*")
        if candidate.is_file() and candidate.suffix.lower() in SUPPORTED_TEXT_SUFFIXES
    ]
    matches.sort()
    return matches
24
+
@@ -0,0 +1,33 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from adaptive_chunking.pipeline import AdaptiveChunker
6
+
7
+ try:
8
+ from langchain_text_splitters import TextSplitter
9
+ except ImportError: # pragma: no cover
10
+ TextSplitter = None # type: ignore[assignment]
11
+
12
+ _BaseTextSplitter = TextSplitter if TextSplitter is not None else object
13
+
14
+
15
class LangChainAdaptiveTextSplitter(_BaseTextSplitter):  # type: ignore[misc,valid-type]
    """LangChain ``TextSplitter`` that delegates to ``AdaptiveChunker``.

    When ``langchain_text_splitters`` is not installed the class still
    imports, but instantiating it raises ``RuntimeError``.
    """

    def __init__(
        self,
        chunker: AdaptiveChunker | None = None,
        keep_separator: bool = False,
        **kwargs: Any,
    ) -> None:
        """Initialise the LangChain base class and store the chunker.

        Args:
            chunker: Chunker to delegate to; a new ``AdaptiveChunker`` is
                created when this is falsy.
            keep_separator: Forwarded to the base class initialiser.
            **kwargs: Forwarded to the base class initialiser.

        Raises:
            RuntimeError: if LangChain support is not installed.
        """
        langchain_missing = TextSplitter is None
        if langchain_missing:  # pragma: no cover
            raise RuntimeError(
                "Install LangChain support with `pip install -e .[langchain]`."
            )
        super().__init__(keep_separator=keep_separator, **kwargs)
        self.chunker = chunker or AdaptiveChunker()

    def split_text(self, text: str) -> list[str]:
        """Return the text of every chunk produced for *text*."""
        outcome = self.chunker.chunk(text)
        return [piece.text for piece in outcome.chunks]
@@ -0,0 +1,74 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from adaptive_chunking.models import Chunk, ChunkingResult
6
+ from adaptive_chunking.pipeline import AdaptiveChunker
7
+
8
+
9
def chunks_to_llama_nodes(
    chunks: list[Chunk],
    *,
    document_id: str = "document",
    extra_metadata: dict[str, Any] | None = None,
) -> list[Any]:
    """Convert chunks into LlamaIndex ``TextNode`` objects.

    Each node id is ``"<document_id>:<chunk index>"``. Node metadata merges
    ``extra_metadata``, then the chunk's own metadata, then the positional
    keys ``document_id``, ``chunk_index``, ``start_char`` and ``end_char``;
    later sources win on key clashes.

    Raises:
        RuntimeError: if ``llama_index`` is not installed.
    """
    try:
        from llama_index.core.schema import TextNode
    except ImportError as exc:  # pragma: no cover
        raise RuntimeError(
            "Install LlamaIndex support with `pip install -e .[llama-index]`."
        ) from exc

    shared_metadata = extra_metadata or {}
    nodes: list[Any] = []
    for piece in chunks:
        node_metadata = {
            **shared_metadata,
            **piece.metadata,
            "document_id": document_id,
            "chunk_index": piece.index,
            "start_char": piece.start_char,
            "end_char": piece.end_char,
        }
        nodes.append(
            TextNode(
                text=piece.text,
                id_=f"{document_id}:{piece.index}",
                metadata=node_metadata,
            )
        )
    return nodes
38
+
39
+
40
def result_to_llama_nodes(result: ChunkingResult) -> list[Any]:
    """Convert a chunking result into LlamaIndex nodes.

    The result's strategy name and score are attached to every node as the
    ``strategy_name`` and ``adaptive_score`` metadata keys.
    """
    strategy_metadata = {
        "strategy_name": result.strategy_name,
        "adaptive_score": result.score,
    }
    return chunks_to_llama_nodes(
        result.chunks,
        document_id=result.document_id,
        extra_metadata=strategy_metadata,
    )
46
+
47
+
48
class LlamaIndexAdaptiveParser:
    """Adapter exposing ``get_nodes_from_documents`` on top of AdaptiveChunker."""

    def __init__(self, chunker: AdaptiveChunker | None = None) -> None:
        """Store *chunker*, creating a new ``AdaptiveChunker`` when it is falsy."""
        self.chunker = chunker or AdaptiveChunker()

    def get_nodes_from_documents(self, documents: list[Any], **_: Any) -> list[Any]:
        """Chunk every document and return all resulting nodes in order.

        Text comes from the document's ``text`` attribute, or from
        ``get_content()`` when that is missing or empty. The document id is
        the ``document_id`` metadata entry, else the ``id_`` attribute, else
        the document's position in *documents*. Extra keyword arguments are
        ignored.
        """
        collected: list[Any] = []
        for position, doc in enumerate(documents):
            content = getattr(doc, "text", None)
            if not content:
                content = getattr(doc, "get_content", lambda: "")()

            doc_metadata = dict(getattr(doc, "metadata", {}) or {})

            raw_id = doc_metadata.get("document_id")
            if not raw_id:
                raw_id = getattr(doc, "id_", position)
            doc_id = str(raw_id)

            outcome = self.chunker.chunk(content, document_id=doc_id)
            node_metadata = {
                **doc_metadata,
                "strategy_name": outcome.strategy_name,
                "adaptive_score": outcome.score,
            }
            collected.extend(
                chunks_to_llama_nodes(
                    outcome.chunks,
                    document_id=doc_id,
                    extra_metadata=node_metadata,
                )
            )
        return collected