adaptive-oci-chunking 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- adaptive_chunking/__init__.py +5 -0
- adaptive_chunking/api.py +40 -0
- adaptive_chunking/chunkers.py +342 -0
- adaptive_chunking/cli.py +62 -0
- adaptive_chunking/io.py +24 -0
- adaptive_chunking/langchain.py +33 -0
- adaptive_chunking/llama_index.py +74 -0
- adaptive_chunking/metrics.py +287 -0
- adaptive_chunking/models.py +61 -0
- adaptive_chunking/oci.py +73 -0
- adaptive_chunking/pipeline.py +25 -0
- adaptive_chunking/selector.py +30 -0
- adaptive_chunking/text.py +71 -0
- adaptive_oci_chunking-0.1.0.dist-info/METADATA +343 -0
- adaptive_oci_chunking-0.1.0.dist-info/RECORD +18 -0
- adaptive_oci_chunking-0.1.0.dist-info/WHEEL +4 -0
- adaptive_oci_chunking-0.1.0.dist-info/entry_points.txt +2 -0
- adaptive_oci_chunking-0.1.0.dist-info/licenses/LICENSE +21 -0
adaptive_chunking/api.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
from adaptive_chunking.pipeline import AdaptiveChunker
|
|
6
|
+
|
|
7
|
+
try:
|
|
8
|
+
from fastapi import FastAPI
|
|
9
|
+
except ImportError as exc: # pragma: no cover
|
|
10
|
+
raise RuntimeError("Install API support with `pip install -e .[api]`.") from exc
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ChunkRequest(BaseModel):
    # Request body for the POST /chunk endpoint.
    # (Kept as comments rather than a docstring so the generated schema is unchanged.)

    # Document text to chunk; Field(min_length=1) rejects an empty string.
    text: str = Field(min_length=1)
    # Identifier forwarded to the chunker as `document_id`; defaults to "document".
    document_id: str = "document"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ChunkResponse(BaseModel):
    # Response body for the POST /chunk endpoint; the endpoint fills each field
    # from the corresponding attribute of the chunking result.
    # (Kept as comments rather than a docstring so the generated schema is unchanged.)

    # Identifier of the chunked document.
    document_id: str
    # Name of the strategy reported by the chunking result.
    strategy_name: str
    # Score reported by the chunking result.
    score: float
    # One dict per chunk, built from each chunk object's attribute dict.
    chunks: list[dict]
    # One dict per metric, built from each metric object's attribute dict.
    metrics: list[dict]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# FastAPI application object that the /chunk route below is registered on.
app = FastAPI(title="Adaptive OCI Chunking")
# One AdaptiveChunker created at import time and shared by every request.
chunker = AdaptiveChunker()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@app.post("/chunk", response_model=ChunkResponse)
def chunk(request: ChunkRequest) -> dict:
    # Chunk the submitted text with the module-level chunker and return a plain
    # dict shaped like ChunkResponse.
    # NOTE: documented with comments, not a docstring, so the endpoint's
    # generated API description stays exactly as it was.
    outcome = chunker.chunk(request.text, document_id=request.document_id)

    # vars(obj) is obj.__dict__: each chunk/metric is exposed as its attribute dict.
    chunk_rows = [vars(item) for item in outcome.chunks]
    metric_rows = [vars(item) for item in outcome.metrics]

    return {
        "document_id": outcome.document_id,
        "strategy_name": outcome.strategy_name,
        "score": outcome.score,
        "chunks": chunk_rows,
        "metrics": metric_rows,
    }
|
|
40
|
+
|
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from abc import ABC, abstractmethod
|
|
5
|
+
|
|
6
|
+
from adaptive_chunking.models import Chunk
|
|
7
|
+
from adaptive_chunking.text import cosine_bow, normalize_space, sentences
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BaseChunker(ABC):
    """Abstract base for chunking strategies.

    Subclasses set ``name`` and implement ``split``; ``_build_chunks`` is the
    shared helper that turns character spans into ``Chunk`` objects.
    """

    name: str

    @abstractmethod
    def split(self, text: str) -> list[Chunk]:
        """Split ``text`` into chunks; must be overridden by subclasses."""
        raise NotImplementedError

    def _build_chunks(self, spans: list[tuple[int, int]], text: str) -> list[Chunk]:
        """Materialise ``(start, end)`` spans of ``text`` as ``Chunk`` objects.

        Each slice is passed through ``normalize_space``; spans whose result is
        falsy are dropped. ``index`` counts only the chunks kept by this call
        (starting at 0), while ``start_char``/``end_char`` keep the raw span
        offsets.
        """
        built: list[Chunk] = []
        for begin, stop in spans:
            cleaned = normalize_space(text[begin:stop])
            if not cleaned:
                continue
            piece = Chunk(
                text=cleaned,
                index=len(built),
                start_char=begin,
                end_char=stop,
            )
            built.append(piece)
        return built
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class FixedWindowChunker(BaseChunker):
    """Cut text into fixed-size character windows that overlap by ``overlap``."""

    name = "fixed-window"

    def __init__(self, chunk_size: int = 1200, overlap: int = 120) -> None:
        """Store the window configuration.

        Raises:
            ValueError: if ``chunk_size`` is not positive, or ``overlap`` is
                negative or not smaller than ``chunk_size``.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if overlap >= chunk_size or overlap < 0:
            raise ValueError("overlap must be non-negative and smaller than chunk_size")
        self.chunk_size = chunk_size
        self.overlap = overlap

    def split(self, text: str) -> list[Chunk]:
        """Return chunks for windows starting every ``chunk_size - overlap`` characters."""
        total = len(text)
        stride = self.chunk_size - self.overlap
        windows: list[tuple[int, int]] = []
        for begin in range(0, total, stride):
            stop = min(begin + self.chunk_size, total)
            windows.append((begin, stop))
            # Once a window reaches the end of the text, later starts would only
            # produce windows fully contained in this one.
            if stop == total:
                break
        return self._build_chunks(windows, text)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class SingleChunker(BaseChunker):
    """Strategy that keeps the whole document as a single span."""

    name = "single"

    def split(self, text: str) -> list[Chunk]:
        """Return the chunks built from one span covering all of ``text``."""
        whole_document = (0, len(text))
        return self._build_chunks([whole_document], text)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class DelimiterChunker(BaseChunker):
    """Split text on a literal delimiter, sub-splitting pieces longer than ``max_size``.

    When the delimiter is empty or does not occur in the text, the whole text
    is handed to a ``RecursiveChunker`` configured with ``chunk_size=max_size``.
    """

    name = "delimiter"

    def __init__(
        self,
        delimiter: str = "\n---\n",
        keep_delimiter: bool = False,
        max_size: int = 1800,
    ) -> None:
        """Store the configuration and build the fallback splitter.

        Args:
            delimiter: Literal string that separates pieces.
            keep_delimiter: When true, each piece's span includes the delimiter
                that terminates it.
            max_size: Longest span (in characters) kept as a single chunk.
        """
        self.delimiter = delimiter
        self.keep_delimiter = keep_delimiter
        self.max_size = max_size
        self.fallback = RecursiveChunker(chunk_size=max_size)

    def split(self, text: str) -> list[Chunk]:
        """Return the chunks of ``text``, numbered consecutively from 0."""
        if not self.delimiter or self.delimiter not in text:
            return self.fallback.split(text)
        spans: list[tuple[int, int]] = []
        cursor = 0
        while cursor < len(text):
            split_at = text.find(self.delimiter, cursor)
            if split_at < 0:
                # No further delimiter: the remainder is the last piece.
                spans.append((cursor, len(text)))
                break
            end = split_at + len(self.delimiter) if self.keep_delimiter else split_at
            spans.append((cursor, end))
            # Always resume after the delimiter, whether or not it was kept.
            cursor = split_at + len(self.delimiter)
        return self._split_oversized_spans(spans, text)

    def _split_oversized_spans(self, spans: list[tuple[int, int]], text: str) -> list[Chunk]:
        """Turn spans into chunks, delegating spans longer than ``max_size`` to the fallback.

        Every chunk is rebuilt with ``index=len(chunks)``. ``_build_chunks``
        numbers from 0 on each call, so extending the result list with its
        output directly would give every non-oversized chunk index 0.
        """
        chunks: list[Chunk] = []
        for start, end in spans:
            if end - start <= self.max_size:
                pieces = self._build_chunks([(start, end)], text)
                offset = 0  # offsets are already relative to the full text
            else:
                pieces = self.fallback.split(text[start:end])
                offset = start  # fallback offsets are relative to the slice
            for piece in pieces:
                chunks.append(
                    Chunk(
                        text=piece.text,
                        index=len(chunks),
                        start_char=offset + piece.start_char,
                        end_char=offset + piece.end_char,
                    )
                )
        return chunks
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class PageChunker(DelimiterChunker):
    """Delimiter chunker preconfigured to split on a page-break character."""

    name = "page"

    def __init__(self, page_delimiter: str = "\f", max_size: int = 2200) -> None:
        """Configure the parent with ``page_delimiter`` and never keep the delimiter."""
        super().__init__(
            max_size=max_size,
            keep_delimiter=False,
            delimiter=page_delimiter,
        )
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class RecursiveChunker(BaseChunker):
    """Greedy splitter that cuts each window at the best available separator.

    The text is walked left to right in windows of at most ``chunk_size``
    characters. Each window is cut just after the last occurrence of the
    highest-priority separator found inside it. Lower-priority separators are
    tried, in order, when a higher one is absent. The window is hard-cut at
    ``chunk_size`` only when no separator occurs in it.
    """

    name = "recursive"

    def __init__(self, chunk_size: int = 1200, separators: tuple[str, ...] | None = None) -> None:
        """Store the configuration.

        Args:
            chunk_size: Maximum span length in characters.
            separators: Separators in priority order; a falsy value selects
                ``("\\n\\n", "\\n", ". ", " ")``.

        Raises:
            ValueError: if ``chunk_size`` is not positive. A non-positive size
                can never make progress and previously made the splitting loop
                spin forever.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        self.chunk_size = chunk_size
        self.separators = separators or ("\n\n", "\n", ". ", " ")

    def split(self, text: str) -> list[Chunk]:
        """Return the chunks built from the spans computed for all of ``text``."""
        spans = self._split_span(text, 0, len(text), 0)
        return self._build_chunks(spans, text)

    def _split_span(
        self,
        text: str,
        start: int,
        end: int,
        separator_index: int,
    ) -> list[tuple[int, int]]:
        """Partition ``text[start:end]`` into contiguous spans of at most ``chunk_size``.

        ``separator_index`` is the index of the first (highest-priority)
        separator that may be used for cutting.
        """
        if end - start <= self.chunk_size:
            return [(start, end)]

        pieces: list[tuple[int, int]] = []
        cursor = start
        # Only cut while the remainder is too long for one chunk. A tail that
        # already fits is emitted whole instead of being split at its last separator.
        while end - cursor > self.chunk_size:
            cut = self._find_cut(text, cursor, cursor + self.chunk_size, separator_index)
            pieces.append((cursor, cut))
            cursor = cut
        pieces.append((cursor, end))
        return pieces

    def _find_cut(self, text: str, cursor: int, limit: int, separator_index: int) -> int:
        """Return the offset at which to cut the window ``text[cursor:limit]``.

        Separators are tried in priority order, so a missing paragraph break
        falls back to a line break, then a sentence end, then a space. The
        window is hard-cut at ``limit`` only when none of them occurs.
        """
        for separator in self.separators[separator_index:]:
            split_at = text.rfind(separator, cursor, limit)
            # A hit exactly at `cursor` would yield a piece holding only the
            # separator, so it is treated as "not found".
            if split_at > cursor:
                return split_at + len(separator)
        return limit
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
class SplitThenMergeChunker(BaseChunker):
    """Split text into paragraph spans, then merge neighbours into larger chunks."""

    name = "split-then-merge"

    def __init__(self, min_size: int = 600, max_size: int = 1400) -> None:
        """Store the size bounds.

        Raises:
            ValueError: unless ``0 < min_size < max_size``.
        """
        if max_size <= min_size or min_size <= 0:
            raise ValueError("expected 0 < min_size < max_size")
        self.min_size = min_size
        self.max_size = max_size

    def split(self, text: str) -> list[Chunk]:
        """Merge consecutive paragraph spans and return the resulting chunks.

        The next paragraph joins the current run when the combined span stays
        within ``max_size`` or when the current run is still shorter than
        ``min_size``. The second rule means a run may end up longer than
        ``max_size``.
        """
        paragraphs = _paragraph_spans(text)
        if not paragraphs:
            return []

        (run_start, run_end), *remaining = paragraphs
        groups: list[tuple[int, int]] = []
        for para_start, para_end in remaining:
            fits = para_end - run_start <= self.max_size
            run_too_small = run_end - run_start < self.min_size
            if fits or run_too_small:
                run_end = para_end
                continue
            groups.append((run_start, run_end))
            run_start, run_end = para_start, para_end
        groups.append((run_start, run_end))
        return self._build_chunks(groups, text)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
class SectionAwareChunker(BaseChunker):
    """Split text at heading-like lines, merge small sections, sub-split large ones."""

    name = "section-aware"

    def __init__(self, min_size: int = 500, max_size: int = 1800) -> None:
        """Store the size bounds, the fallback splitter and the heading regex.

        The bounds are forwarded to ``SplitThenMergeChunker``, whose constructor
        raises ``ValueError`` unless ``0 < min_size < max_size``.
        """
        self.min_size = min_size
        self.max_size = max_size
        self.fallback = SplitThenMergeChunker(min_size=min_size, max_size=max_size)
        # A heading is a whole line that is either a Markdown "#" heading, a
        # numbered heading starting with a capital, or a short capitalised line.
        self.heading_pattern = re.compile(
            r"(?m)^(?:#{1,6}\s+.+|\d+(?:\.\d+)*\s+[A-Z].+|[A-Z][A-Za-z0-9 ,:;&()/-]{3,80})$"
        )

    def split(self, text: str) -> list[Chunk]:
        """Return section-based chunks of ``text``, numbered consecutively from 0."""
        starts = sorted(
            {0, len(text), *(match.start() for match in self.heading_pattern.finditer(text))}
        )
        spans = list(zip(starts, starts[1:]))

        # Merge pass: a section joins the current run when the combined span
        # fits in max_size or the run is still shorter than min_size.
        merged: list[tuple[int, int]] = []
        current: tuple[int, int] | None = None
        for start, end in spans:
            if not text[start:end].strip():
                continue  # skip whitespace-only sections
            if current is None:
                current = (start, end)
                continue
            current_start, current_end = current
            if end - current_start <= self.max_size or current_end - current_start < self.min_size:
                current = (current_start, end)
            else:
                merged.append(current)
                current = (start, end)
        if current is not None:
            merged.append(current)

        # Build pass: every chunk is rebuilt with index=len(chunks).
        # _build_chunks numbers from 0 on each call, so extending with its
        # output directly would give every in-size chunk index 0.
        chunks: list[Chunk] = []
        for start, end in merged:
            if end - start <= self.max_size:
                pieces = self._build_chunks([(start, end)], text)
                offset = 0  # offsets are already relative to the full text
            else:
                pieces = self.fallback.split(text[start:end])
                offset = start  # fallback offsets are relative to the slice
            for piece in pieces:
                chunks.append(
                    Chunk(
                        text=piece.text,
                        index=len(chunks),
                        start_char=offset + piece.start_char,
                        end_char=offset + piece.end_char,
                    )
                )
        return chunks
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
class SemanticChunker(BaseChunker):
    """Group consecutive sentences, breaking on size overflow or low similarity."""

    name = "semantic"

    def __init__(
        self,
        max_size: int = 1400,
        min_size: int = 350,
        similarity_threshold: float = 0.10,
    ) -> None:
        """Store the size bounds and the similarity cut-off used by ``split``."""
        self.max_size = max_size
        self.min_size = min_size
        self.similarity_threshold = similarity_threshold

    def split(self, text: str) -> list[Chunk]:
        """Return chunks made of runs of adjacent sentences.

        A run is closed only once it has reached ``min_size``, and then only if
        adding the next sentence would exceed ``max_size`` or the
        ``cosine_bow`` score between the previous and next sentence is below
        ``similarity_threshold``.
        """
        spans = _sentence_spans(text)
        if not spans:
            return []

        groups: list[tuple[int, int]] = []
        group_start, group_end = spans[0]
        last_sentence = text[group_start:group_end]
        for sent_start, sent_end in spans[1:]:
            sentence = text[sent_start:sent_end]
            similarity = cosine_bow(last_sentence, sentence)
            # Similarity is always measured between adjacent sentences.
            last_sentence = sentence

            if group_end - group_start < self.min_size:
                # Run still too small to close: absorb unconditionally.
                group_end = sent_end
                continue

            overflow = sent_end - group_start > self.max_size
            topic_shift = similarity < self.similarity_threshold
            if overflow or topic_shift:
                groups.append((group_start, group_end))
                group_start, group_end = sent_start, sent_end
            else:
                group_end = sent_end
        groups.append((group_start, group_end))
        return self._build_chunks(groups, text)
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
class RegexSectionChunker(BaseChunker):
    """Split text at lines matching a heading regex, sub-splitting long sections."""

    name = "regex-section"

    def __init__(self, max_size: int = 1800, heading_pattern: str | None = None) -> None:
        """Compile the heading regex and build the fallback splitter.

        A falsy ``heading_pattern`` selects the built-in pattern, which matches
        Markdown ``#`` headings and numbered headings starting with a capital.
        """
        self.max_size = max_size
        pattern_source = heading_pattern or r"(?m)^(?:#{1,6}\s+.+|\d+(?:\.\d+)*\s+[A-Z].+)$"
        self.heading_pattern = re.compile(pattern_source)
        self.fallback = RecursiveChunker(chunk_size=max_size)

    def split(self, text: str) -> list[Chunk]:
        """Return section chunks, or the fallback's chunks when no heading matches."""
        heading_offsets = [found.start() for found in self.heading_pattern.finditer(text)]
        if not heading_offsets:
            return self.fallback.split(text)

        boundaries = sorted({0, len(text), *heading_offsets})
        collected: list[Chunk] = []
        for begin, stop in zip(boundaries, boundaries[1:]):
            if stop - begin <= self.max_size:
                collected.extend(self._build_chunks([(begin, stop)], text))
                continue
            # Fallback offsets are relative to the section, so shift them back.
            for piece in self.fallback.split(text[begin:stop]):
                collected.append(
                    Chunk(
                        text=piece.text,
                        index=len(collected),
                        start_char=begin + piece.start_char,
                        end_char=begin + piece.end_char,
                    )
                )

        # Renumber so indices run 0..n-1 across all sections.
        return [
            Chunk(item.text, position, item.start_char, item.end_char, item.metadata)
            for position, item in enumerate(collected)
        ]
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
def default_chunkers() -> list[BaseChunker]:
    """Return one default-configured instance of every chunker in this module.

    The order is fixed: single, fixed-window, recursive, split-then-merge,
    section-aware, delimiter, page, semantic, regex-section.
    """
    factories = (
        SingleChunker,
        FixedWindowChunker,
        RecursiveChunker,
        SplitThenMergeChunker,
        SectionAwareChunker,
        DelimiterChunker,
        PageChunker,
        SemanticChunker,
        RegexSectionChunker,
    )
    return [factory() for factory in factories]
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def _paragraph_spans(text: str) -> list[tuple[int, int]]:
|
|
320
|
+
spans: list[tuple[int, int]] = []
|
|
321
|
+
cursor = 0
|
|
322
|
+
for match in re.finditer(r"\n\s*\n", text):
|
|
323
|
+
end = match.end()
|
|
324
|
+
if text[cursor:end].strip():
|
|
325
|
+
spans.append((cursor, end))
|
|
326
|
+
cursor = end
|
|
327
|
+
if text[cursor:].strip():
|
|
328
|
+
spans.append((cursor, len(text)))
|
|
329
|
+
return spans or ([(0, len(text))] if text.strip() else [])
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
def _sentence_spans(text: str) -> list[tuple[int, int]]:
    """Locate each string yielded by ``sentences(text)`` inside ``text``.

    The search for each sentence starts where the previous located sentence
    ended. A sentence that cannot be found verbatim from there on is skipped.
    """
    located: list[tuple[int, int]] = []
    search_from = 0
    for sentence in sentences(text):
        found_at = text.find(sentence, search_from)
        if found_at >= 0:
            search_from = found_at + len(sentence)
            located.append((found_at, search_from))
    return located
|
adaptive_chunking/cli.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Annotated
|
|
6
|
+
|
|
7
|
+
import typer
|
|
8
|
+
from rich.console import Console
|
|
9
|
+
from rich.table import Table
|
|
10
|
+
|
|
11
|
+
from adaptive_chunking.io import load_text_file
|
|
12
|
+
from adaptive_chunking.pipeline import AdaptiveChunker
|
|
13
|
+
|
|
14
|
+
# Typer application that the `chunk` command below is registered on.
app = typer.Typer(help="Adaptive document chunking for RAG.")
# Rich console used by the command below for printing.
console = Console()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@app.command()
def chunk(
    path: Annotated[
        Path,
        typer.Argument(exists=True, readable=True, help="Text or Markdown file to chunk."),
    ],
    document_id: Annotated[
        str | None,
        typer.Option(help="Stable document identifier."),
    ] = None,
    json_output: Annotated[
        bool,
        typer.Option("--json", help="Print machine-readable JSON."),
    ] = False,
) -> None:
    # Chunk the file at `path` and print the result, either as JSON (--json) or
    # as a summary, a metric table, and the chunk texts.
    # NOTE: documented with comments, not a docstring, so the command's
    # generated --help text stays exactly as it was.
    text = load_text_file(path)
    # Fall back to the file name without its suffix when no id is given.
    result = AdaptiveChunker().chunk(text, document_id=document_id or path.stem)
    if json_output:
        payload = {
            "document_id": result.document_id,
            "strategy_name": result.strategy_name,
            "score": result.score,
            "chunks": [vars(item) for item in result.chunks],
            "metrics": [vars(item) for item in result.metrics],
        }
        # Write the JSON verbatim. Rich's Console.print would interpret
        # "[...]" markup and ":emoji:" codes inside the document text and wrap
        # long lines to the terminal width, which corrupts the output.
        typer.echo(json.dumps(payload, indent=2))
        return

    console.print(f"[bold]Strategy:[/bold] {result.strategy_name}")
    console.print(f"[bold]Score:[/bold] {result.score:.3f}")
    table = Table("Metric", "Value", "Weight")
    for metric in result.metrics:
        table.add_row(metric.name, f"{metric.value:.3f}", f"{metric.weight:.2f}")
    console.print(table)
    for chunk_item in result.chunks:
        console.rule(f"Chunk {chunk_item.index}")
        # Chunk text is arbitrary document content, so print it literally:
        # bracketed text must not be parsed as Rich markup, nor ":name:" as emoji.
        console.print(chunk_item.text, markup=False, emoji=False)
|
|
60
|
+
|
|
61
|
+
# Run the Typer application when this module is executed directly.
if __name__ == "__main__":
    app()
|
adaptive_chunking/io.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
# File suffixes (compared lower-cased) that load_text_file accepts and that
# discover_text_files uses to filter directory contents.
SUPPORTED_TEXT_SUFFIXES = {".txt", ".md", ".markdown", ".rst"}
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def load_text_file(path: str | Path) -> str:
    """Read a supported text file and return its contents decoded as UTF-8.

    Raises:
        ValueError: if the file suffix (case-insensitive) is not in
            ``SUPPORTED_TEXT_SUFFIXES``.
    """
    candidate = Path(path)
    extension = candidate.suffix
    if extension.lower() in SUPPORTED_TEXT_SUFFIXES:
        return candidate.read_text(encoding="utf-8")
    raise ValueError(f"unsupported file type: {extension}")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def discover_text_files(path: str | Path) -> list[Path]:
    """Return the text files reachable from ``path``.

    A path that is itself a file is returned as a one-element list without
    checking its suffix. Otherwise the tree under ``path`` is searched
    recursively and the files with a supported suffix are returned sorted.
    """
    base = Path(path)
    if base.is_file():
        return [base]
    matches = [
        entry
        for entry in base.rglob("*")
        if entry.is_file() and entry.suffix.lower() in SUPPORTED_TEXT_SUFFIXES
    ]
    matches.sort()
    return matches
|
|
24
|
+
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from adaptive_chunking.pipeline import AdaptiveChunker
|
|
6
|
+
|
|
7
|
+
try:
|
|
8
|
+
from langchain_text_splitters import TextSplitter
|
|
9
|
+
except ImportError: # pragma: no cover
|
|
10
|
+
TextSplitter = None # type: ignore[assignment]
|
|
11
|
+
|
|
12
|
+
# Use `object` as the base class when langchain_text_splitters could not be
# imported, so the class statement below can still be evaluated.
_BaseTextSplitter = TextSplitter if TextSplitter is not None else object
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class LangChainAdaptiveTextSplitter(_BaseTextSplitter):  # type: ignore[misc,valid-type]
    """LangChain TextSplitter backed by AdaptiveChunker."""

    def __init__(
        self,
        chunker: AdaptiveChunker | None = None,
        keep_separator: bool = False,
        **kwargs: Any,
    ) -> None:
        """Initialise the LangChain base class and attach the chunker.

        Raises:
            RuntimeError: if ``langchain_text_splitters`` could not be imported.
        """
        if TextSplitter is None:  # pragma: no cover
            raise RuntimeError(
                "Install LangChain support with `pip install -e .[langchain]`."
            )
        super().__init__(keep_separator=keep_separator, **kwargs)
        # A falsy `chunker` is replaced by a default-configured AdaptiveChunker.
        self.chunker = chunker if chunker else AdaptiveChunker()

    def split_text(self, text: str) -> list[str]:
        """Chunk ``text`` and return only the text of each resulting chunk."""
        chunked = self.chunker.chunk(text)
        return [piece.text for piece in chunked.chunks]
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from adaptive_chunking.models import Chunk, ChunkingResult
|
|
6
|
+
from adaptive_chunking.pipeline import AdaptiveChunker
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def chunks_to_llama_nodes(
    chunks: list[Chunk],
    *,
    document_id: str = "document",
    extra_metadata: dict[str, Any] | None = None,
) -> list[Any]:
    """Convert chunks into LlamaIndex ``TextNode`` objects.

    Each node id is ``"<document_id>:<chunk index>"``. Node metadata is
    ``extra_metadata`` overlaid with the chunk's own metadata, then with
    ``document_id``, ``chunk_index``, ``start_char`` and ``end_char``, which
    always win on key clashes.

    Raises:
        RuntimeError: if ``llama_index`` cannot be imported.
    """
    try:
        from llama_index.core.schema import TextNode
    except ImportError as exc:  # pragma: no cover
        raise RuntimeError(
            "Install LlamaIndex support with `pip install -e .[llama-index]`."
        ) from exc

    shared = extra_metadata or {}
    nodes: list[Any] = []
    for chunk in chunks:
        node_metadata = {**shared, **chunk.metadata}
        node_metadata.update(
            document_id=document_id,
            chunk_index=chunk.index,
            start_char=chunk.start_char,
            end_char=chunk.end_char,
        )
        nodes.append(
            TextNode(
                text=chunk.text,
                id_=f"{document_id}:{chunk.index}",
                metadata=node_metadata,
            )
        )
    return nodes
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def result_to_llama_nodes(result: ChunkingResult) -> list[Any]:
    """Convert a chunking result into LlamaIndex nodes.

    The result's strategy name and score are attached to every node as
    ``strategy_name`` and ``adaptive_score`` metadata.
    """
    provenance = {
        "strategy_name": result.strategy_name,
        "adaptive_score": result.score,
    }
    return chunks_to_llama_nodes(
        result.chunks,
        document_id=result.document_id,
        extra_metadata=provenance,
    )
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class LlamaIndexAdaptiveParser:
    """Small adapter with the same practical behavior as a LlamaIndex node parser."""

    def __init__(self, chunker: AdaptiveChunker | None = None) -> None:
        """Attach ``chunker``, or a default-configured AdaptiveChunker when it is falsy."""
        self.chunker = chunker if chunker else AdaptiveChunker()

    def get_nodes_from_documents(self, documents: list[Any], **_: Any) -> list[Any]:
        """Chunk every document and return all resulting nodes in document order.

        Text comes from the document's ``text`` attribute, or from
        ``get_content()`` when that is missing or empty. The document id is the
        ``document_id`` metadata entry when truthy, else the ``id_`` attribute,
        else the document's position in ``documents``. Extra keyword arguments
        are accepted and ignored.
        """
        collected: list[Any] = []
        for position, doc in enumerate(documents):
            body = getattr(doc, "text", None)
            if not body:
                body = getattr(doc, "get_content", lambda: "")()

            doc_metadata = dict(getattr(doc, "metadata", {}) or {})
            chosen_id = doc_metadata.get("document_id") or getattr(doc, "id_", position)
            document_id = str(chosen_id)

            outcome = self.chunker.chunk(body, document_id=document_id)
            node_extras = {
                **doc_metadata,
                "strategy_name": outcome.strategy_name,
                "adaptive_score": outcome.score,
            }
            collected.extend(
                chunks_to_llama_nodes(
                    outcome.chunks,
                    document_id=document_id,
                    extra_metadata=node_extras,
                )
            )
        return collected
|