biblicus 0.15.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. biblicus/__init__.py +21 -1
  2. biblicus/analysis/markov.py +35 -3
  3. biblicus/backends/__init__.py +6 -2
  4. biblicus/backends/embedding_index_common.py +334 -0
  5. biblicus/backends/embedding_index_file.py +272 -0
  6. biblicus/backends/embedding_index_inmemory.py +270 -0
  7. biblicus/backends/hybrid.py +8 -5
  8. biblicus/backends/scan.py +1 -0
  9. biblicus/backends/sqlite_full_text_search.py +1 -1
  10. biblicus/backends/{vector.py → tf_vector.py} +28 -35
  11. biblicus/chunking.py +396 -0
  12. biblicus/cli.py +75 -25
  13. biblicus/context.py +27 -12
  14. biblicus/context_engine/__init__.py +53 -0
  15. biblicus/context_engine/assembler.py +1060 -0
  16. biblicus/context_engine/compaction.py +110 -0
  17. biblicus/context_engine/models.py +423 -0
  18. biblicus/context_engine/retrieval.py +129 -0
  19. biblicus/corpus.py +117 -16
  20. biblicus/embedding_providers.py +122 -0
  21. biblicus/errors.py +24 -0
  22. biblicus/frontmatter.py +2 -0
  23. biblicus/knowledge_base.py +1 -1
  24. biblicus/models.py +15 -3
  25. biblicus/retrieval.py +7 -2
  26. biblicus/sources.py +46 -11
  27. biblicus/text/link.py +6 -0
  28. biblicus/text/prompts.py +2 -0
  29. {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/METADATA +4 -3
  30. {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/RECORD +34 -24
  31. {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/WHEEL +0 -0
  32. {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/entry_points.txt +0 -0
  33. {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/licenses/LICENSE +0 -0
  34. {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/top_level.txt +0 -0
biblicus/backends/{vector.py → tf_vector.py} RENAMED
@@ -8,7 +8,7 @@ import math
  import re
  from typing import Dict, Iterable, List, Optional, Tuple

- from pydantic import BaseModel, ConfigDict, Field
+ from pydantic import BaseModel, ConfigDict

  from ..corpus import Corpus
  from ..frontmatter import parse_front_matter
@@ -24,23 +24,23 @@ from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifes
  from ..time import utc_now_iso


- class VectorRecipeConfig(BaseModel):
+ class TfVectorRecipeConfig(BaseModel):
      """
-     Configuration for the vector retrieval backend.
+     Configuration for the term-frequency vector retrieval backend.

-     :ivar snippet_characters: Maximum characters to include in evidence snippets.
-     :vartype snippet_characters: int
      :ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
      :vartype extraction_run: str or None
+     :ivar snippet_characters: Optional maximum character count for returned evidence text.
+     :vartype snippet_characters: int or None
      """

      model_config = ConfigDict(extra="forbid")

-     snippet_characters: int = Field(default=400, ge=1)
      extraction_run: Optional[str] = None
+     snippet_characters: Optional[int] = None


- class VectorBackend:
+ class TfVectorBackend:
      """
      Deterministic vector backend using term-frequency cosine similarity.

@@ -48,7 +48,7 @@ class VectorBackend:
      :vartype backend_id: str
      """

-     backend_id = "vector"
+     backend_id = "tf-vector"

      def build_run(
          self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
@@ -65,7 +65,7 @@ class VectorBackend:
          :return: Run manifest describing the build.
          :rtype: RetrievalRun
          """
-         recipe_config = VectorRecipeConfig.model_validate(config)
+         recipe_config = TfVectorRecipeConfig.model_validate(config)
          catalog = corpus.load_catalog()
          recipe = create_recipe_manifest(
              backend_id=self.backend_id,
@@ -102,7 +102,7 @@ class VectorBackend:
          :return: Retrieval results containing evidence.
          :rtype: RetrievalResult
          """
-         recipe_config = VectorRecipeConfig.model_validate(run.recipe.config)
+         recipe_config = TfVectorRecipeConfig.model_validate(run.recipe.config)
          query_tokens = _tokenize_text(query_text)
          if not query_tokens:
              return RetrievalResult(
@@ -125,8 +125,8 @@
              query_tokens=query_tokens,
              query_vector=query_vector,
              query_norm=query_norm,
-             snippet_characters=recipe_config.snippet_characters,
              extraction_reference=extraction_reference,
+             snippet_characters=recipe_config.snippet_characters,
          )
          sorted_candidates = sorted(
              scored_candidates,
@@ -157,7 +157,7 @@


  def _resolve_extraction_reference(
-     corpus: Corpus, recipe_config: VectorRecipeConfig
+     corpus: Corpus, recipe_config: TfVectorRecipeConfig
  ) -> Optional[ExtractionRunReference]:
      """
      Resolve an extraction run reference from a recipe config.
@@ -165,7 +165,7 @@ def _resolve_extraction_reference(
      :param corpus: Corpus associated with the recipe.
      :type corpus: Corpus
      :param recipe_config: Parsed vector recipe configuration.
-     :type recipe_config: VectorRecipeConfig
+     :type recipe_config: TfVectorRecipeConfig
      :return: Parsed extraction reference or None.
      :rtype: ExtractionRunReference or None
      :raises FileNotFoundError: If an extraction run is referenced but not present.
@@ -183,7 +183,7 @@ def _resolve_extraction_reference(


  def _count_text_items(
-     corpus: Corpus, items: Iterable[object], recipe_config: VectorRecipeConfig
+     corpus: Corpus, items: Iterable[object], recipe_config: TfVectorRecipeConfig
  ) -> int:
      """
      Count catalog items that represent text content.
@@ -193,7 +193,7 @@ def _count_text_items(
      :param items: Catalog items to inspect.
      :type items: Iterable[object]
      :param recipe_config: Parsed vector recipe configuration.
-     :type recipe_config: VectorRecipeConfig
+     :type recipe_config: TfVectorRecipeConfig
      :return: Number of text items.
      :rtype: int
      """
@@ -359,21 +359,13 @@ def _find_first_match(text: str, tokens: List[str]) -> Optional[Tuple[int, int]]
      return best_start, best_end


- def _build_snippet(text: str, span: Optional[Tuple[int, int]], *, max_chars: int) -> str:
-     """
-     Build a snippet around a match span, constrained by a character budget.
-
-     :param text: Source text to slice.
-     :type text: str
-     :param span: Match span to center on.
-     :type span: tuple[int, int] or None
-     :param max_chars: Maximum snippet length.
-     :type max_chars: int
-     :return: Snippet text.
-     :rtype: str
-     """
+ def _build_snippet(text: str, span: Optional[Tuple[int, int]], *, max_chars: Optional[int]) -> str:
+     if max_chars is None:
+         return text
      if not text:
          return ""
+     if max_chars <= 0:
+         return ""
      if span is None:
          return text[:max_chars]
      span_start, span_end = span
@@ -390,8 +382,8 @@ def _score_items(
      query_tokens: List[str],
      query_vector: Dict[str, float],
      query_norm: float,
-     snippet_characters: int,
      extraction_reference: Optional[ExtractionRunReference],
+     snippet_characters: Optional[int],
  ) -> List[Evidence]:
      """
      Score catalog items and return evidence candidates.
@@ -406,10 +398,10 @@
      :type query_vector: dict[str, float]
      :param query_norm: Query vector norm.
      :type query_norm: float
-     :param snippet_characters: Snippet length budget.
-     :type snippet_characters: int
      :param extraction_reference: Optional extraction run reference.
      :type extraction_reference: ExtractionRunReference or None
+     :param snippet_characters: Optional maximum character count for returned evidence text.
+     :type snippet_characters: int or None
      :return: Evidence candidates with provisional ranks.
      :rtype: list[Evidence]
      """
@@ -437,9 +429,9 @@
          if similarity <= 0:
              continue
          span = _find_first_match(item_text, query_tokens)
-         snippet = _build_snippet(item_text, span, max_chars=snippet_characters)
          span_start = span[0] if span else None
          span_end = span[1] if span else None
+         evidence_text = _build_snippet(item_text, span, max_chars=snippet_characters)
          evidence_items.append(
              Evidence(
                  item_id=str(getattr(catalog_item, "id")),
@@ -447,14 +439,15 @@
                  media_type=str(media_type),
                  score=float(similarity),
                  rank=1,
-                 text=snippet,
+                 text=evidence_text,
                  content_ref=None,
                  span_start=span_start,
                  span_end=span_end,
-                 stage="vector",
+                 stage="tf-vector",
                  recipe_id="",
                  run_id="",
-                 hash=hash_text(snippet),
+                 metadata=getattr(catalog_item, "metadata", {}) or {},
+                 hash=hash_text(evidence_text or ""),
              )
          )
      return evidence_items
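
After the rename, the recipe configuration treats the snippet budget as optional. The following is a minimal sketch, assuming the module path biblicus.backends.tf_vector follows the file rename shown above; the values are illustrative, not taken from the package documentation.

    # Sketch only: exercises the fields visible in the diff above.
    from biblicus.backends.tf_vector import TfVectorRecipeConfig

    # snippet_characters is now optional; leaving it unset returns untruncated
    # evidence text (the old default of 400 characters is gone).
    default_config = TfVectorRecipeConfig.model_validate({})
    assert default_config.snippet_characters is None
    assert default_config.extraction_run is None

    # An explicit character budget still truncates evidence snippets.
    bounded_config = TfVectorRecipeConfig.model_validate({"snippet_characters": 200})
    assert bounded_config.snippet_characters == 200

Per the hunks above, recipes that previously referenced backend_id "vector" now resolve against "tf-vector", and evidence rows are tagged with stage "tf-vector" and carry the source item's metadata.
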
biblicus/chunking.py ADDED
@@ -0,0 +1,396 @@
+ """
+ Chunking primitives for text retrieval backends.
+
+ Chunking converts a document-sized text string into smaller spans that can be embedded or indexed.
+ The chunk span offsets are expressed as character positions into the original text string.
+ """
+
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+ from typing import List, Optional
+
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
+
+
+ class TokenSpan(BaseModel):
+     """
+     A token with its character span in a source string.
+
+     :ivar token: Token text.
+     :vartype token: str
+     :ivar span_start: Inclusive start character offset.
+     :vartype span_start: int
+     :ivar span_end: Exclusive end character offset.
+     :vartype span_end: int
+     """
+
+     model_config = ConfigDict(extra="forbid")
+
+     token: str = Field(min_length=1)
+     span_start: int = Field(ge=0)
+     span_end: int = Field(ge=0)
+
+     @model_validator(mode="after")
+     def _validate_span(self) -> "TokenSpan":
+         if self.span_end <= self.span_start:
+             raise ValueError("token span_end must be greater than span_start")
+         return self
+
+
+ class Tokenizer(ABC):
+     """
+     Interface for producing token spans from text.
+
+     Tokenizers are used by token-window chunking strategies to convert token indices into
+     stable character spans.
+
+     :ivar tokenizer_id: Tokenizer identifier.
+     :vartype tokenizer_id: str
+     """
+
+     tokenizer_id: str
+
+     @abstractmethod
+     def tokenize(self, text: str) -> List[TokenSpan]:
+         """
+         Tokenize a string and return spans for each token.
+
+         :param text: Input text.
+         :type text: str
+         :return: Token spans.
+         :rtype: list[TokenSpan]
+         """
+         raise NotImplementedError
+
+
+ class WhitespaceTokenizer(Tokenizer):
+     """
+     Tokenizer that treats runs of non-whitespace characters as tokens.
+     """
+
+     tokenizer_id = "whitespace"
+
+     def tokenize(self, text: str) -> List[TokenSpan]:
+         """
+         Tokenize a string by whitespace boundaries.
+
+         :param text: Input text.
+         :type text: str
+         :return: Token spans for each non-whitespace token.
+         :rtype: list[TokenSpan]
+         """
+         import re
+
+         spans: List[TokenSpan] = []
+         for match in re.finditer(r"\S+", text):
+             spans.append(
+                 TokenSpan(
+                     token=match.group(0),
+                     span_start=int(match.start()),
+                     span_end=int(match.end()),
+                 )
+             )
+         return spans
+
+
+ class TextChunk(BaseModel):
+     """
+     A chunk extracted from a larger text string.
+
+     :ivar chunk_id: Stable chunk identifier within a build.
+     :vartype chunk_id: int
+     :ivar item_id: Source item identifier.
+     :vartype item_id: str
+     :ivar span_start: Inclusive start character offset.
+     :vartype span_start: int
+     :ivar span_end: Exclusive end character offset.
+     :vartype span_end: int
+     :ivar text: Chunk text.
+     :vartype text: str
+     """
+
+     model_config = ConfigDict(extra="forbid")
+
+     chunk_id: int = Field(ge=0)
+     item_id: str = Field(min_length=1)
+     span_start: int = Field(ge=0)
+     span_end: int = Field(ge=0)
+     text: str
+
+     @model_validator(mode="after")
+     def _validate_span(self) -> "TextChunk":
+         if self.span_end <= self.span_start:
+             raise ValueError("chunk span_end must be greater than span_start")
+         if not isinstance(self.text, str) or not self.text:
+             raise ValueError("chunk text must be non-empty")
+         return self
+
+
+ class Chunker(ABC):
+     """
+     Interface for converting text into chunks.
+
+     :ivar chunker_id: Chunker identifier.
+     :vartype chunker_id: str
+     """
+
+     chunker_id: str
+
+     @abstractmethod
+     def chunk_text(self, *, item_id: str, text: str, starting_chunk_id: int) -> List[TextChunk]:
+         """
+         Split a text string into chunks.
+
+         :param item_id: Item identifier that produced the text.
+         :type item_id: str
+         :param text: Full text to chunk.
+         :type text: str
+         :param starting_chunk_id: Starting chunk identifier.
+         :type starting_chunk_id: int
+         :return: Chunk list.
+         :rtype: list[TextChunk]
+         """
+         raise NotImplementedError
+
+
+ class FixedCharWindowChunker(Chunker):
+     """
+     Chunker that produces overlapping fixed-size character windows.
+     """
+
+     chunker_id = "fixed-char-window"
+
+     def __init__(self, *, window_characters: int, overlap_characters: int) -> None:
+         self._window_characters = int(window_characters)
+         self._overlap_characters = int(overlap_characters)
+         if self._window_characters <= 0:
+             raise ValueError("window_characters must be greater than 0")
+         if self._overlap_characters < 0:
+             raise ValueError("overlap_characters must be greater than or equal to 0")
+         if self._overlap_characters >= self._window_characters:
+             raise ValueError("overlap_characters must be less than window_characters")
+
+     def chunk_text(self, *, item_id: str, text: str, starting_chunk_id: int) -> List[TextChunk]:
+         """
+         Chunk a text string into fixed-size character windows.
+
+         :param item_id: Item identifier.
+         :type item_id: str
+         :param text: Input text.
+         :type text: str
+         :param starting_chunk_id: Starting chunk identifier.
+         :type starting_chunk_id: int
+         :return: Chunk list.
+         :rtype: list[TextChunk]
+         """
+         chunks: List[TextChunk] = []
+         chunk_id = int(starting_chunk_id)
+         position = 0
+         stride = self._window_characters - self._overlap_characters
+         while position < len(text):
+             span_start = position
+             span_end = min(position + self._window_characters, len(text))
+             chunk_text = text[span_start:span_end]
+             if chunk_text.strip():
+                 chunks.append(
+                     TextChunk(
+                         chunk_id=chunk_id,
+                         item_id=item_id,
+                         span_start=span_start,
+                         span_end=span_end,
+                         text=chunk_text,
+                     )
+                 )
+                 chunk_id += 1
+             if span_end >= len(text):
+                 position = len(text)
+             else:
+                 position += stride
+         return chunks
+
+
+ class ParagraphChunker(Chunker):
+     """
+     Chunker that produces paragraph spans separated by blank lines.
+     """
+
+     chunker_id = "paragraph"
+
+     def chunk_text(self, *, item_id: str, text: str, starting_chunk_id: int) -> List[TextChunk]:
+         """
+         Chunk a text string by paragraph boundaries.
+
+         :param item_id: Item identifier.
+         :type item_id: str
+         :param text: Input text.
+         :type text: str
+         :param starting_chunk_id: Starting chunk identifier.
+         :type starting_chunk_id: int
+         :return: Chunk list.
+         :rtype: list[TextChunk]
+         """
+         import re
+
+         chunks: List[TextChunk] = []
+         chunk_id = int(starting_chunk_id)
+         for match in re.finditer(r"(?:[^\n]|\n(?!\n))+", text):
+             span_start = int(match.start())
+             span_end = int(match.end())
+             chunk_text = text[span_start:span_end]
+             if not chunk_text.strip():
+                 continue
+             chunks.append(
+                 TextChunk(
+                     chunk_id=chunk_id,
+                     item_id=item_id,
+                     span_start=span_start,
+                     span_end=span_end,
+                     text=chunk_text,
+                 )
+             )
+             chunk_id += 1
+         return chunks
+
+
+ class FixedTokenWindowChunker(Chunker):
+     """
+     Chunker that produces overlapping fixed-size token windows.
+     """
+
+     chunker_id = "fixed-token-window"
+
+     def __init__(self, *, window_tokens: int, overlap_tokens: int, tokenizer: Tokenizer) -> None:
+         self._window_tokens = int(window_tokens)
+         self._overlap_tokens = int(overlap_tokens)
+         self._tokenizer = tokenizer
+         if self._window_tokens <= 0:
+             raise ValueError("window_tokens must be greater than 0")
+         if self._overlap_tokens < 0:
+             raise ValueError("overlap_tokens must be greater than or equal to 0")
+         if self._overlap_tokens >= self._window_tokens:
+             raise ValueError("overlap_tokens must be less than window_tokens")
+
+     def chunk_text(self, *, item_id: str, text: str, starting_chunk_id: int) -> List[TextChunk]:
+         """
+         Chunk a text string into fixed-size token windows.
+
+         :param item_id: Item identifier.
+         :type item_id: str
+         :param text: Input text.
+         :type text: str
+         :param starting_chunk_id: Starting chunk identifier.
+         :type starting_chunk_id: int
+         :return: Chunk list.
+         :rtype: list[TextChunk]
+         """
+         token_spans = self._tokenizer.tokenize(text)
+         if not token_spans:
+             return []
+         chunks: List[TextChunk] = []
+         chunk_id = int(starting_chunk_id)
+         stride = self._window_tokens - self._overlap_tokens
+         token_index = 0
+         while token_index < len(token_spans):
+             window_end = min(token_index + self._window_tokens, len(token_spans))
+             span_start = token_spans[token_index].span_start
+             span_end = token_spans[window_end - 1].span_end
+             chunk_text = text[span_start:span_end]
+             chunks.append(
+                 TextChunk(
+                     chunk_id=chunk_id,
+                     item_id=item_id,
+                     span_start=span_start,
+                     span_end=span_end,
+                     text=chunk_text,
+                 )
+             )
+             chunk_id += 1
+             if window_end >= len(token_spans):
+                 token_index = len(token_spans)
+             else:
+                 token_index += stride
+         return chunks
+
+
+ class TokenizerConfig(BaseModel):
+     """
+     Configuration for tokenizer selection.
+
+     :ivar tokenizer_id: Tokenizer identifier.
+     :vartype tokenizer_id: str
+     """
+
+     model_config = ConfigDict(extra="forbid")
+
+     tokenizer_id: str = Field(min_length=1)
+
+     def build_tokenizer(self) -> Tokenizer:
+         """
+         Build a tokenizer instance from this configuration.
+
+         :return: Tokenizer instance.
+         :rtype: Tokenizer
+         :raises ValueError: If the tokenizer identifier is unknown.
+         """
+         if self.tokenizer_id == WhitespaceTokenizer.tokenizer_id:
+             return WhitespaceTokenizer()
+         raise ValueError(f"Unknown tokenizer_id: {self.tokenizer_id!r}")
+
+
+ class ChunkerConfig(BaseModel):
+     """
+     Configuration for chunker selection.
+
+     :ivar chunker_id: Chunker identifier.
+     :vartype chunker_id: str
+     :ivar window_characters: Window size for fixed character chunking.
+     :vartype window_characters: int or None
+     :ivar overlap_characters: Overlap size for fixed character chunking.
+     :vartype overlap_characters: int or None
+     :ivar window_tokens: Window size for fixed token chunking.
+     :vartype window_tokens: int or None
+     :ivar overlap_tokens: Overlap size for fixed token chunking.
+     :vartype overlap_tokens: int or None
+     """
+
+     model_config = ConfigDict(extra="forbid")
+
+     chunker_id: str = Field(min_length=1)
+     window_characters: Optional[int] = Field(default=None, ge=1)
+     overlap_characters: Optional[int] = Field(default=None, ge=0)
+     window_tokens: Optional[int] = Field(default=None, ge=1)
+     overlap_tokens: Optional[int] = Field(default=None, ge=0)
+
+     def build_chunker(self, *, tokenizer: Optional[Tokenizer]) -> Chunker:
+         """
+         Build a chunker instance from this configuration.
+
+         :param tokenizer: Tokenizer used by token-window chunking strategies.
+         :type tokenizer: Tokenizer or None
+         :return: Chunker instance.
+         :rtype: Chunker
+         :raises ValueError: If required configuration is missing or unknown.
+         """
+         if self.chunker_id == FixedCharWindowChunker.chunker_id:
+             if self.window_characters is None or self.overlap_characters is None:
+                 raise ValueError(
+                     "fixed-char-window requires window_characters and overlap_characters"
+                 )
+             return FixedCharWindowChunker(
+                 window_characters=self.window_characters,
+                 overlap_characters=self.overlap_characters,
+             )
+         if self.chunker_id == ParagraphChunker.chunker_id:
+             return ParagraphChunker()
+         if self.chunker_id == FixedTokenWindowChunker.chunker_id:
+             if self.window_tokens is None or self.overlap_tokens is None:
+                 raise ValueError("fixed-token-window requires window_tokens and overlap_tokens")
+             if tokenizer is None:
+                 raise ValueError("tokenizer configuration is required for fixed-token-window")
+             return FixedTokenWindowChunker(
+                 window_tokens=self.window_tokens,
+                 overlap_tokens=self.overlap_tokens,
+                 tokenizer=tokenizer,
+             )
+         raise ValueError(f"Unknown chunker_id: {self.chunker_id!r}")