biblicus-0.15.1-py3-none-any.whl → biblicus-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +21 -1
- biblicus/analysis/markov.py +35 -3
- biblicus/backends/__init__.py +6 -2
- biblicus/backends/embedding_index_common.py +334 -0
- biblicus/backends/embedding_index_file.py +272 -0
- biblicus/backends/embedding_index_inmemory.py +270 -0
- biblicus/backends/hybrid.py +8 -5
- biblicus/backends/scan.py +1 -0
- biblicus/backends/sqlite_full_text_search.py +1 -1
- biblicus/backends/{vector.py → tf_vector.py} +28 -35
- biblicus/chunking.py +396 -0
- biblicus/cli.py +75 -25
- biblicus/context.py +27 -12
- biblicus/context_engine/__init__.py +53 -0
- biblicus/context_engine/assembler.py +1060 -0
- biblicus/context_engine/compaction.py +110 -0
- biblicus/context_engine/models.py +423 -0
- biblicus/context_engine/retrieval.py +129 -0
- biblicus/corpus.py +117 -16
- biblicus/embedding_providers.py +122 -0
- biblicus/errors.py +24 -0
- biblicus/frontmatter.py +2 -0
- biblicus/knowledge_base.py +1 -1
- biblicus/models.py +15 -3
- biblicus/retrieval.py +7 -2
- biblicus/sources.py +46 -11
- biblicus/text/link.py +6 -0
- biblicus/text/prompts.py +2 -0
- {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/METADATA +4 -3
- {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/RECORD +34 -24
- {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/WHEEL +0 -0
- {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/top_level.txt +0 -0
biblicus/backends/{vector.py → tf_vector.py}

@@ -8,7 +8,7 @@ import math
 import re
 from typing import Dict, Iterable, List, Optional, Tuple

-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict

 from ..corpus import Corpus
 from ..frontmatter import parse_front_matter
@@ -24,23 +24,23 @@ from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifes
 from ..time import utc_now_iso


-class
+class TfVectorRecipeConfig(BaseModel):
     """
-    Configuration for the vector retrieval backend.
+    Configuration for the term-frequency vector retrieval backend.

-    :ivar snippet_characters: Maximum characters to include in evidence snippets.
-    :vartype snippet_characters: int
     :ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
     :vartype extraction_run: str or None
+    :ivar snippet_characters: Optional maximum character count for returned evidence text.
+    :vartype snippet_characters: int or None
     """

     model_config = ConfigDict(extra="forbid")

-    snippet_characters: int = Field(default=400, ge=1)
     extraction_run: Optional[str] = None
+    snippet_characters: Optional[int] = None


-class
+class TfVectorBackend:
     """
     Deterministic vector backend using term-frequency cosine similarity.

@@ -48,7 +48,7 @@ class VectorBackend:
     :vartype backend_id: str
     """

-    backend_id = "vector"
+    backend_id = "tf-vector"

     def build_run(
         self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
@@ -65,7 +65,7 @@ class VectorBackend:
         :return: Run manifest describing the build.
         :rtype: RetrievalRun
         """
-        recipe_config =
+        recipe_config = TfVectorRecipeConfig.model_validate(config)
         catalog = corpus.load_catalog()
         recipe = create_recipe_manifest(
             backend_id=self.backend_id,
@@ -102,7 +102,7 @@ class VectorBackend:
         :return: Retrieval results containing evidence.
         :rtype: RetrievalResult
         """
-        recipe_config =
+        recipe_config = TfVectorRecipeConfig.model_validate(run.recipe.config)
         query_tokens = _tokenize_text(query_text)
         if not query_tokens:
             return RetrievalResult(
@@ -125,8 +125,8 @@ class VectorBackend:
             query_tokens=query_tokens,
             query_vector=query_vector,
             query_norm=query_norm,
-            snippet_characters=recipe_config.snippet_characters,
             extraction_reference=extraction_reference,
+            snippet_characters=recipe_config.snippet_characters,
         )
         sorted_candidates = sorted(
             scored_candidates,
@@ -157,7 +157,7 @@ class VectorBackend:


 def _resolve_extraction_reference(
-    corpus: Corpus, recipe_config:
+    corpus: Corpus, recipe_config: TfVectorRecipeConfig
 ) -> Optional[ExtractionRunReference]:
     """
     Resolve an extraction run reference from a recipe config.
@@ -165,7 +165,7 @@ def _resolve_extraction_reference(
     :param corpus: Corpus associated with the recipe.
     :type corpus: Corpus
     :param recipe_config: Parsed vector recipe configuration.
-    :type recipe_config:
+    :type recipe_config: TfVectorRecipeConfig
     :return: Parsed extraction reference or None.
     :rtype: ExtractionRunReference or None
     :raises FileNotFoundError: If an extraction run is referenced but not present.
@@ -183,7 +183,7 @@ def _resolve_extraction_reference(


 def _count_text_items(
-    corpus: Corpus, items: Iterable[object], recipe_config:
+    corpus: Corpus, items: Iterable[object], recipe_config: TfVectorRecipeConfig
 ) -> int:
     """
     Count catalog items that represent text content.
@@ -193,7 +193,7 @@ def _count_text_items(
     :param items: Catalog items to inspect.
     :type items: Iterable[object]
     :param recipe_config: Parsed vector recipe configuration.
-    :type recipe_config:
+    :type recipe_config: TfVectorRecipeConfig
     :return: Number of text items.
     :rtype: int
     """
@@ -359,21 +359,13 @@ def _find_first_match(text: str, tokens: List[str]) -> Optional[Tuple[int, int]]
     return best_start, best_end


-def _build_snippet(text: str, span: Optional[Tuple[int, int]], *, max_chars: int) -> str:
-
-
-
-    :param text: Source text to slice.
-    :type text: str
-    :param span: Match span to center on.
-    :type span: tuple[int, int] or None
-    :param max_chars: Maximum snippet length.
-    :type max_chars: int
-    :return: Snippet text.
-    :rtype: str
-    """
+def _build_snippet(text: str, span: Optional[Tuple[int, int]], *, max_chars: Optional[int]) -> str:
+    if max_chars is None:
+        return text
     if not text:
         return ""
+    if max_chars <= 0:
+        return ""
     if span is None:
         return text[:max_chars]
     span_start, span_end = span
@@ -390,8 +382,8 @@ def _score_items(
     query_tokens: List[str],
     query_vector: Dict[str, float],
     query_norm: float,
-    snippet_characters: int,
     extraction_reference: Optional[ExtractionRunReference],
+    snippet_characters: Optional[int],
 ) -> List[Evidence]:
     """
     Score catalog items and return evidence candidates.
@@ -406,10 +398,10 @@ def _score_items(
     :type query_vector: dict[str, float]
     :param query_norm: Query vector norm.
     :type query_norm: float
-    :param snippet_characters: Snippet length budget.
-    :type snippet_characters: int
     :param extraction_reference: Optional extraction run reference.
     :type extraction_reference: ExtractionRunReference or None
+    :param snippet_characters: Optional maximum character count for returned evidence text.
+    :type snippet_characters: int or None
     :return: Evidence candidates with provisional ranks.
     :rtype: list[Evidence]
     """
@@ -437,9 +429,9 @@ def _score_items(
         if similarity <= 0:
             continue
         span = _find_first_match(item_text, query_tokens)
-        snippet = _build_snippet(item_text, span, max_chars=snippet_characters)
         span_start = span[0] if span else None
         span_end = span[1] if span else None
+        evidence_text = _build_snippet(item_text, span, max_chars=snippet_characters)
         evidence_items.append(
             Evidence(
                 item_id=str(getattr(catalog_item, "id")),
@@ -447,14 +439,15 @@ def _score_items(
                 media_type=str(media_type),
                 score=float(similarity),
                 rank=1,
-                text=
+                text=evidence_text,
                 content_ref=None,
                 span_start=span_start,
                 span_end=span_end,
-                stage="vector",
+                stage="tf-vector",
                 recipe_id="",
                 run_id="",
-
+                metadata=getattr(catalog_item, "metadata", {}) or {},
+                hash=hash_text(evidence_text or ""),
             )
         )
     return evidence_items
biblicus/chunking.py
ADDED

@@ -0,0 +1,396 @@
+"""
+Chunking primitives for text retrieval backends.
+
+Chunking converts a document-sized text string into smaller spans that can be embedded or indexed.
+The chunk span offsets are expressed as character positions into the original text string.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import List, Optional
+
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+
+
+class TokenSpan(BaseModel):
+    """
+    A token with its character span in a source string.
+
+    :ivar token: Token text.
+    :vartype token: str
+    :ivar span_start: Inclusive start character offset.
+    :vartype span_start: int
+    :ivar span_end: Exclusive end character offset.
+    :vartype span_end: int
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    token: str = Field(min_length=1)
+    span_start: int = Field(ge=0)
+    span_end: int = Field(ge=0)
+
+    @model_validator(mode="after")
+    def _validate_span(self) -> "TokenSpan":
+        if self.span_end <= self.span_start:
+            raise ValueError("token span_end must be greater than span_start")
+        return self
+
+
+class Tokenizer(ABC):
+    """
+    Interface for producing token spans from text.
+
+    Tokenizers are used by token-window chunking strategies to convert token indices into
+    stable character spans.
+
+    :ivar tokenizer_id: Tokenizer identifier.
+    :vartype tokenizer_id: str
+    """
+
+    tokenizer_id: str
+
+    @abstractmethod
+    def tokenize(self, text: str) -> List[TokenSpan]:
+        """
+        Tokenize a string and return spans for each token.
+
+        :param text: Input text.
+        :type text: str
+        :return: Token spans.
+        :rtype: list[TokenSpan]
+        """
+        raise NotImplementedError
+
+
+class WhitespaceTokenizer(Tokenizer):
+    """
+    Tokenizer that treats runs of non-whitespace characters as tokens.
+    """
+
+    tokenizer_id = "whitespace"
+
+    def tokenize(self, text: str) -> List[TokenSpan]:
+        """
+        Tokenize a string by whitespace boundaries.
+
+        :param text: Input text.
+        :type text: str
+        :return: Token spans for each non-whitespace token.
+        :rtype: list[TokenSpan]
+        """
+        import re
+
+        spans: List[TokenSpan] = []
+        for match in re.finditer(r"\S+", text):
+            spans.append(
+                TokenSpan(
+                    token=match.group(0),
+                    span_start=int(match.start()),
+                    span_end=int(match.end()),
+                )
+            )
+        return spans
+
+
+class TextChunk(BaseModel):
+    """
+    A chunk extracted from a larger text string.
+
+    :ivar chunk_id: Stable chunk identifier within a build.
+    :vartype chunk_id: int
+    :ivar item_id: Source item identifier.
+    :vartype item_id: str
+    :ivar span_start: Inclusive start character offset.
+    :vartype span_start: int
+    :ivar span_end: Exclusive end character offset.
+    :vartype span_end: int
+    :ivar text: Chunk text.
+    :vartype text: str
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    chunk_id: int = Field(ge=0)
+    item_id: str = Field(min_length=1)
+    span_start: int = Field(ge=0)
+    span_end: int = Field(ge=0)
+    text: str
+
+    @model_validator(mode="after")
+    def _validate_span(self) -> "TextChunk":
+        if self.span_end <= self.span_start:
+            raise ValueError("chunk span_end must be greater than span_start")
+        if not isinstance(self.text, str) or not self.text:
+            raise ValueError("chunk text must be non-empty")
+        return self
+
+
+class Chunker(ABC):
+    """
+    Interface for converting text into chunks.
+
+    :ivar chunker_id: Chunker identifier.
+    :vartype chunker_id: str
+    """
+
+    chunker_id: str
+
+    @abstractmethod
+    def chunk_text(self, *, item_id: str, text: str, starting_chunk_id: int) -> List[TextChunk]:
+        """
+        Split a text string into chunks.
+
+        :param item_id: Item identifier that produced the text.
+        :type item_id: str
+        :param text: Full text to chunk.
+        :type text: str
+        :param starting_chunk_id: Starting chunk identifier.
+        :type starting_chunk_id: int
+        :return: Chunk list.
+        :rtype: list[TextChunk]
+        """
+        raise NotImplementedError
+
+
+class FixedCharWindowChunker(Chunker):
+    """
+    Chunker that produces overlapping fixed-size character windows.
+    """
+
+    chunker_id = "fixed-char-window"
+
+    def __init__(self, *, window_characters: int, overlap_characters: int) -> None:
+        self._window_characters = int(window_characters)
+        self._overlap_characters = int(overlap_characters)
+        if self._window_characters <= 0:
+            raise ValueError("window_characters must be greater than 0")
+        if self._overlap_characters < 0:
+            raise ValueError("overlap_characters must be greater than or equal to 0")
+        if self._overlap_characters >= self._window_characters:
+            raise ValueError("overlap_characters must be less than window_characters")
+
+    def chunk_text(self, *, item_id: str, text: str, starting_chunk_id: int) -> List[TextChunk]:
+        """
+        Chunk a text string into fixed-size character windows.
+
+        :param item_id: Item identifier.
+        :type item_id: str
+        :param text: Input text.
+        :type text: str
+        :param starting_chunk_id: Starting chunk identifier.
+        :type starting_chunk_id: int
+        :return: Chunk list.
+        :rtype: list[TextChunk]
+        """
+        chunks: List[TextChunk] = []
+        chunk_id = int(starting_chunk_id)
+        position = 0
+        stride = self._window_characters - self._overlap_characters
+        while position < len(text):
+            span_start = position
+            span_end = min(position + self._window_characters, len(text))
+            chunk_text = text[span_start:span_end]
+            if chunk_text.strip():
+                chunks.append(
+                    TextChunk(
+                        chunk_id=chunk_id,
+                        item_id=item_id,
+                        span_start=span_start,
+                        span_end=span_end,
+                        text=chunk_text,
+                    )
+                )
+                chunk_id += 1
+            if span_end >= len(text):
+                position = len(text)
+            else:
+                position += stride
+        return chunks
+
+
+class ParagraphChunker(Chunker):
+    """
+    Chunker that produces paragraph spans separated by blank lines.
+    """
+
+    chunker_id = "paragraph"
+
+    def chunk_text(self, *, item_id: str, text: str, starting_chunk_id: int) -> List[TextChunk]:
+        """
+        Chunk a text string by paragraph boundaries.
+
+        :param item_id: Item identifier.
+        :type item_id: str
+        :param text: Input text.
+        :type text: str
+        :param starting_chunk_id: Starting chunk identifier.
+        :type starting_chunk_id: int
+        :return: Chunk list.
+        :rtype: list[TextChunk]
+        """
+        import re
+
+        chunks: List[TextChunk] = []
+        chunk_id = int(starting_chunk_id)
+        for match in re.finditer(r"(?:[^\n]|\n(?!\n))+", text):
+            span_start = int(match.start())
+            span_end = int(match.end())
+            chunk_text = text[span_start:span_end]
+            if not chunk_text.strip():
+                continue
+            chunks.append(
+                TextChunk(
+                    chunk_id=chunk_id,
+                    item_id=item_id,
+                    span_start=span_start,
+                    span_end=span_end,
+                    text=chunk_text,
+                )
+            )
+            chunk_id += 1
+        return chunks
+
+
+class FixedTokenWindowChunker(Chunker):
+    """
+    Chunker that produces overlapping fixed-size token windows.
+    """
+
+    chunker_id = "fixed-token-window"
+
+    def __init__(self, *, window_tokens: int, overlap_tokens: int, tokenizer: Tokenizer) -> None:
+        self._window_tokens = int(window_tokens)
+        self._overlap_tokens = int(overlap_tokens)
+        self._tokenizer = tokenizer
+        if self._window_tokens <= 0:
+            raise ValueError("window_tokens must be greater than 0")
+        if self._overlap_tokens < 0:
+            raise ValueError("overlap_tokens must be greater than or equal to 0")
+        if self._overlap_tokens >= self._window_tokens:
+            raise ValueError("overlap_tokens must be less than window_tokens")
+
+    def chunk_text(self, *, item_id: str, text: str, starting_chunk_id: int) -> List[TextChunk]:
+        """
+        Chunk a text string into fixed-size token windows.
+
+        :param item_id: Item identifier.
+        :type item_id: str
+        :param text: Input text.
+        :type text: str
+        :param starting_chunk_id: Starting chunk identifier.
+        :type starting_chunk_id: int
+        :return: Chunk list.
+        :rtype: list[TextChunk]
+        """
+        token_spans = self._tokenizer.tokenize(text)
+        if not token_spans:
+            return []
+        chunks: List[TextChunk] = []
+        chunk_id = int(starting_chunk_id)
+        stride = self._window_tokens - self._overlap_tokens
+        token_index = 0
+        while token_index < len(token_spans):
+            window_end = min(token_index + self._window_tokens, len(token_spans))
+            span_start = token_spans[token_index].span_start
+            span_end = token_spans[window_end - 1].span_end
+            chunk_text = text[span_start:span_end]
+            chunks.append(
+                TextChunk(
+                    chunk_id=chunk_id,
+                    item_id=item_id,
+                    span_start=span_start,
+                    span_end=span_end,
+                    text=chunk_text,
+                )
+            )
+            chunk_id += 1
+            if window_end >= len(token_spans):
+                token_index = len(token_spans)
+            else:
+                token_index += stride
+        return chunks
+
+
+class TokenizerConfig(BaseModel):
+    """
+    Configuration for tokenizer selection.
+
+    :ivar tokenizer_id: Tokenizer identifier.
+    :vartype tokenizer_id: str
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    tokenizer_id: str = Field(min_length=1)
+
+    def build_tokenizer(self) -> Tokenizer:
+        """
+        Build a tokenizer instance from this configuration.
+
+        :return: Tokenizer instance.
+        :rtype: Tokenizer
+        :raises ValueError: If the tokenizer identifier is unknown.
+        """
+        if self.tokenizer_id == WhitespaceTokenizer.tokenizer_id:
+            return WhitespaceTokenizer()
+        raise ValueError(f"Unknown tokenizer_id: {self.tokenizer_id!r}")
+
+
+class ChunkerConfig(BaseModel):
+    """
+    Configuration for chunker selection.
+
+    :ivar chunker_id: Chunker identifier.
+    :vartype chunker_id: str
+    :ivar window_characters: Window size for fixed character chunking.
+    :vartype window_characters: int or None
+    :ivar overlap_characters: Overlap size for fixed character chunking.
+    :vartype overlap_characters: int or None
+    :ivar window_tokens: Window size for fixed token chunking.
+    :vartype window_tokens: int or None
+    :ivar overlap_tokens: Overlap size for fixed token chunking.
+    :vartype overlap_tokens: int or None
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    chunker_id: str = Field(min_length=1)
+    window_characters: Optional[int] = Field(default=None, ge=1)
+    overlap_characters: Optional[int] = Field(default=None, ge=0)
+    window_tokens: Optional[int] = Field(default=None, ge=1)
+    overlap_tokens: Optional[int] = Field(default=None, ge=0)
+
+    def build_chunker(self, *, tokenizer: Optional[Tokenizer]) -> Chunker:
+        """
+        Build a chunker instance from this configuration.
+
+        :param tokenizer: Tokenizer used by token-window chunking strategies.
+        :type tokenizer: Tokenizer or None
+        :return: Chunker instance.
+        :rtype: Chunker
+        :raises ValueError: If required configuration is missing or unknown.
+        """
+        if self.chunker_id == FixedCharWindowChunker.chunker_id:
+            if self.window_characters is None or self.overlap_characters is None:
+                raise ValueError(
+                    "fixed-char-window requires window_characters and overlap_characters"
+                )
+            return FixedCharWindowChunker(
+                window_characters=self.window_characters,
+                overlap_characters=self.overlap_characters,
+            )
+        if self.chunker_id == ParagraphChunker.chunker_id:
+            return ParagraphChunker()
+        if self.chunker_id == FixedTokenWindowChunker.chunker_id:
+            if self.window_tokens is None or self.overlap_tokens is None:
+                raise ValueError("fixed-token-window requires window_tokens and overlap_tokens")
+            if tokenizer is None:
+                raise ValueError("tokenizer configuration is required for fixed-token-window")
+            return FixedTokenWindowChunker(
+                window_tokens=self.window_tokens,
+                overlap_tokens=self.overlap_tokens,
+                tokenizer=tokenizer,
+            )
+        raise ValueError(f"Unknown chunker_id: {self.chunker_id!r}")
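
A minimal usage sketch of the new chunking module, based on the code above. The import path biblicus.chunking follows from the file list; the sample text, item identifier, and window sizes are illustrative.

from biblicus.chunking import ChunkerConfig, TokenizerConfig

text = "First paragraph about corpora.\n\nSecond paragraph about retrieval and chunk spans."

# Paragraph chunking: one chunk per blank-line-separated paragraph.
paragraph_chunker = ChunkerConfig(chunker_id="paragraph").build_chunker(tokenizer=None)
for chunk in paragraph_chunker.chunk_text(item_id="doc-1", text=text, starting_chunk_id=0):
    print(chunk.chunk_id, chunk.span_start, chunk.span_end)

# Token-window chunking: 8-token windows overlapping by 2 tokens, with the
# whitespace tokenizer mapping token indices back to character spans.
tokenizer = TokenizerConfig(tokenizer_id="whitespace").build_tokenizer()
window_chunker = ChunkerConfig(
    chunker_id="fixed-token-window", window_tokens=8, overlap_tokens=2
).build_chunker(tokenizer=tokenizer)
chunks = window_chunker.chunk_text(item_id="doc-1", text=text, starting_chunk_id=0)

# Spans index the original string, so each chunk's text can be recovered by slicing.
assert all(text[c.span_start:c.span_end] == c.text for c in chunks)
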