biblicus 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. biblicus/__init__.py +30 -0
  2. biblicus/__main__.py +8 -0
  3. biblicus/_vendor/dotyaml/__init__.py +14 -0
  4. biblicus/_vendor/dotyaml/interpolation.py +63 -0
  5. biblicus/_vendor/dotyaml/loader.py +181 -0
  6. biblicus/_vendor/dotyaml/transformer.py +135 -0
  7. biblicus/backends/__init__.py +42 -0
  8. biblicus/backends/base.py +65 -0
  9. biblicus/backends/scan.py +375 -0
  10. biblicus/backends/sqlite_full_text_search.py +487 -0
  11. biblicus/cli.py +804 -0
  12. biblicus/constants.py +12 -0
  13. biblicus/context.py +183 -0
  14. biblicus/corpus.py +1531 -0
  15. biblicus/crawl.py +186 -0
  16. biblicus/errors.py +15 -0
  17. biblicus/evaluation.py +257 -0
  18. biblicus/evidence_processing.py +201 -0
  19. biblicus/extraction.py +531 -0
  20. biblicus/extractors/__init__.py +44 -0
  21. biblicus/extractors/base.py +68 -0
  22. biblicus/extractors/metadata_text.py +106 -0
  23. biblicus/extractors/openai_stt.py +180 -0
  24. biblicus/extractors/pass_through_text.py +84 -0
  25. biblicus/extractors/pdf_text.py +100 -0
  26. biblicus/extractors/pipeline.py +105 -0
  27. biblicus/extractors/rapidocr_text.py +129 -0
  28. biblicus/extractors/select_longest_text.py +105 -0
  29. biblicus/extractors/select_text.py +100 -0
  30. biblicus/extractors/unstructured_text.py +100 -0
  31. biblicus/frontmatter.py +89 -0
  32. biblicus/hook_logging.py +180 -0
  33. biblicus/hook_manager.py +203 -0
  34. biblicus/hooks.py +261 -0
  35. biblicus/ignore.py +64 -0
  36. biblicus/knowledge_base.py +191 -0
  37. biblicus/models.py +445 -0
  38. biblicus/retrieval.py +133 -0
  39. biblicus/sources.py +212 -0
  40. biblicus/time.py +17 -0
  41. biblicus/uris.py +63 -0
  42. biblicus/user_config.py +138 -0
  43. biblicus-0.6.0.dist-info/METADATA +533 -0
  44. biblicus-0.6.0.dist-info/RECORD +48 -0
  45. biblicus-0.6.0.dist-info/WHEEL +5 -0
  46. biblicus-0.6.0.dist-info/entry_points.txt +2 -0
  47. biblicus-0.6.0.dist-info/licenses/LICENSE +21 -0
  48. biblicus-0.6.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,191 @@
1
+ """
2
+ High-level knowledge base workflow for turnkey usage.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass
8
+ from pathlib import Path
9
+ from tempfile import TemporaryDirectory
10
+ from typing import List, Optional, Sequence
11
+
12
+ from pydantic import BaseModel, ConfigDict, Field
13
+
14
+ from .backends import get_backend
15
+ from .context import (
16
+ ContextPack,
17
+ ContextPackPolicy,
18
+ TokenBudget,
19
+ build_context_pack,
20
+ fit_context_pack_to_token_budget,
21
+ )
22
+ from .corpus import Corpus
23
+ from .models import QueryBudget, RetrievalResult, RetrievalRun
24
+
25
+
26
class KnowledgeBaseDefaults(BaseModel):
    """
    Fallback settings for a knowledge base built without explicit overrides.

    Fields not declared here are rejected (``extra="forbid"``).

    :ivar backend_id: Retrieval backend identifier; ``"scan"`` unless overridden.
    :vartype backend_id: str
    :ivar recipe_name: Display name of the retrieval recipe; ``"Knowledge base"`` unless overridden.
    :vartype recipe_name: str
    :ivar query_budget: Budget applied to queries by default: at most 5 items and
        2000 characters in total, with no per-source cap.
    :vartype query_budget: QueryBudget
    :ivar tags: Tags attached to items while the folder is imported; empty by default.
    :vartype tags: list[str]
    """

    model_config = ConfigDict(extra="forbid")

    backend_id: str = Field(default="scan", min_length=1)
    recipe_name: str = Field(default="Knowledge base", min_length=1)
    # A factory is used so every instance receives its own QueryBudget object.
    query_budget: QueryBudget = Field(
        default_factory=lambda: QueryBudget(
            max_total_items=5, max_total_characters=2000, max_items_per_source=None
        )
    )
    tags: List[str] = Field(default_factory=list)
52
+
53
+
54
@dataclass
class KnowledgeBase:
    """
    High-level knowledge base wrapper for turnkey workflows.

    When :meth:`from_folder` is called without ``corpus_root``, the corpus is
    placed inside a temporary directory held in ``_temp_dir``. Call
    :meth:`close`, or use the instance as a context manager, to delete that
    directory deterministically instead of relying on garbage collection.

    :ivar corpus: Corpus instance that stores the ingested items.
    :vartype corpus: Corpus
    :ivar backend_id: Backend identifier used for retrieval.
    :vartype backend_id: str
    :ivar run: Retrieval run manifest associated with the knowledge base.
    :vartype run: RetrievalRun
    :ivar defaults: Default configuration used for this knowledge base.
    :vartype defaults: KnowledgeBaseDefaults
    :ivar _temp_dir: Temporary directory that owns the corpus, or None when the
        caller supplied ``corpus_root`` (or after :meth:`close`).
    :vartype _temp_dir: TemporaryDirectory or None
    """

    corpus: Corpus
    backend_id: str
    run: RetrievalRun
    defaults: KnowledgeBaseDefaults
    _temp_dir: Optional[TemporaryDirectory]

    @classmethod
    def from_folder(
        cls,
        folder: str | Path,
        *,
        backend_id: Optional[str] = None,
        recipe_name: Optional[str] = None,
        query_budget: Optional[QueryBudget] = None,
        tags: Optional[Sequence[str]] = None,
        corpus_root: Optional[str | Path] = None,
    ) -> "KnowledgeBase":
        """
        Build a knowledge base from a folder of files.

        The folder is imported into a corpus, then the selected backend builds
        a retrieval run for it. If any of those steps fails, a temporary corpus
        directory created by this call is removed before the error propagates.

        :param folder: Folder containing source files.
        :type folder: str or Path
        :param backend_id: Optional backend identifier override.
        :type backend_id: str or None
        :param recipe_name: Optional recipe name override.
        :type recipe_name: str or None
        :param query_budget: Optional query budget override.
        :type query_budget: QueryBudget or None
        :param tags: Optional tags to apply during import.
        :type tags: Sequence[str] or None
        :param corpus_root: Optional corpus root override. When omitted, a
            temporary directory is created and owned by the returned instance.
        :type corpus_root: str or Path or None
        :return: Knowledge base instance.
        :rtype: KnowledgeBase
        :raises FileNotFoundError: If the folder does not exist.
        :raises NotADirectoryError: If the folder is not a directory.
        """
        source_root = Path(folder).resolve()
        if not source_root.exists():
            raise FileNotFoundError(f"Knowledge base folder does not exist: {source_root}")
        if not source_root.is_dir():
            raise NotADirectoryError(f"Knowledge base folder is not a directory: {source_root}")

        defaults = KnowledgeBaseDefaults()
        resolved_backend_id = backend_id or defaults.backend_id
        resolved_recipe_name = recipe_name or defaults.recipe_name
        resolved_query_budget = query_budget or defaults.query_budget
        # Copy caller-supplied tags so later mutation of the argument has no effect here.
        resolved_tags = list(tags) if tags is not None else defaults.tags

        temp_dir: Optional[TemporaryDirectory] = None
        if corpus_root is None:
            temp_dir = TemporaryDirectory(prefix="biblicus-knowledge-base-")
            corpus_root_path = Path(temp_dir.name) / "corpus"
        else:
            corpus_root_path = Path(corpus_root).resolve()

        try:
            corpus = Corpus.init(corpus_root_path)
            corpus.import_tree(source_root, tags=resolved_tags)

            backend = get_backend(resolved_backend_id)
            run = backend.build_run(corpus, recipe_name=resolved_recipe_name, config={})

            resolved_defaults = KnowledgeBaseDefaults(
                backend_id=resolved_backend_id,
                recipe_name=resolved_recipe_name,
                query_budget=resolved_query_budget,
                tags=resolved_tags,
            )
        except BaseException:
            # Nothing will own the temporary directory if construction fails,
            # so remove it now rather than leaving it for the finalizer.
            if temp_dir is not None:
                temp_dir.cleanup()
            raise

        return cls(
            corpus=corpus,
            backend_id=resolved_backend_id,
            run=run,
            defaults=resolved_defaults,
            _temp_dir=temp_dir,
        )

    def close(self) -> None:
        """
        Delete the temporary corpus directory owned by this instance, if any.

        Safe to call more than once. Instances created with an explicit
        ``corpus_root`` own no temporary directory, so this is a no-op for them.

        :return: None.
        :rtype: None
        """
        owned = self._temp_dir
        if owned is None:
            return
        self._temp_dir = None
        owned.cleanup()

    def __enter__(self) -> "KnowledgeBase":
        """
        Enter a ``with`` block and return this knowledge base.

        :return: This instance.
        :rtype: KnowledgeBase
        """
        return self

    def __exit__(self, exc_type: object, exc_value: object, traceback: object) -> None:
        """
        Leave a ``with`` block, releasing the temporary directory via :meth:`close`.

        Exceptions raised inside the block are not suppressed.

        :param exc_type: Exception type raised in the block, if any.
        :type exc_type: object
        :param exc_value: Exception instance raised in the block, if any.
        :type exc_value: object
        :param traceback: Traceback for the exception, if any.
        :type traceback: object
        :return: None.
        :rtype: None
        """
        self.close()

    def query(self, query_text: str, *, budget: Optional[QueryBudget] = None) -> RetrievalResult:
        """
        Query the knowledge base for evidence.

        :param query_text: Query text to execute.
        :type query_text: str
        :param budget: Optional budget override; the stored default budget is used when omitted.
        :type budget: QueryBudget or None
        :return: Retrieval result containing evidence.
        :rtype: RetrievalResult
        """
        backend = get_backend(self.backend_id)
        resolved_budget = budget or self.defaults.query_budget
        return backend.query(
            self.corpus,
            run=self.run,
            query_text=query_text,
            budget=resolved_budget,
        )

    def context_pack(
        self,
        result: RetrievalResult,
        *,
        join_with: str = "\n\n",
        max_tokens: Optional[int] = None,
    ) -> ContextPack:
        """
        Build a context pack from a retrieval result.

        :param result: Retrieval result to convert into context.
        :type result: RetrievalResult
        :param join_with: Join string for evidence blocks.
        :type join_with: str
        :param max_tokens: Optional token budget for the context pack; when None
            the pack is returned without being fitted to a budget.
        :type max_tokens: int or None
        :return: Context pack text and metadata.
        :rtype: ContextPack
        """
        policy = ContextPackPolicy(join_with=join_with)
        context_pack = build_context_pack(result, policy=policy)
        if max_tokens is None:
            return context_pack
        return fit_context_pack_to_token_budget(
            context_pack,
            policy=policy,
            token_budget=TokenBudget(max_tokens=max_tokens),
        )
biblicus/models.py ADDED
@@ -0,0 +1,445 @@
1
+ """
2
+ Pydantic models for Biblicus domain concepts.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Any, Dict, List, Optional
8
+
9
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
10
+
11
+ from .constants import SCHEMA_VERSION
12
+ from .hooks import HookSpec
13
+
14
+
15
class CorpusConfig(BaseModel):
    """
    On-disk configuration record for a local Biblicus corpus.

    Validation fails unless ``schema_version`` equals ``SCHEMA_VERSION``.

    :ivar schema_version: Corpus config schema version.
    :vartype schema_version: int
    :ivar created_at: ISO 8601 timestamp recording when the corpus was created.
    :vartype created_at: str
    :ivar corpus_uri: Canonical URI of the corpus root.
    :vartype corpus_uri: str
    :ivar raw_dir: Path of the raw items folder, relative to the corpus; ``"raw"`` by default.
    :vartype raw_dir: str
    :ivar notes: Free-form operator notes, if any.
    :vartype notes: dict[str, Any] or None
    :ivar hooks: Hook specifications for corpus lifecycle events, if any.
    :vartype hooks: list[HookSpec] or None
    """

    model_config = ConfigDict(extra="forbid")

    schema_version: int = Field(ge=1)
    created_at: str
    corpus_uri: str
    raw_dir: str = "raw"
    notes: Optional[Dict[str, Any]] = None
    hooks: Optional[List[HookSpec]] = None

    @model_validator(mode="after")
    def _enforce_schema_version(self) -> "CorpusConfig":
        # Accept only the single schema version this code understands.
        if self.schema_version == SCHEMA_VERSION:
            return self
        raise ValueError(f"Unsupported corpus config schema version: {self.schema_version}")
47
+
48
+
49
class IngestResult(BaseModel):
    """
    Compact record describing one ingestion event.

    :ivar item_id: UUID given to the ingested item.
    :vartype item_id: str
    :ivar relpath: Location of the raw item file as a relative path.
    :vartype relpath: str
    :ivar sha256: SHA-256 digest of the bytes that were stored.
    :vartype sha256: str
    """

    model_config = ConfigDict(extra="forbid")

    item_id: str
    relpath: str
    sha256: str
66
+
67
+
68
class CatalogItem(BaseModel):
    """
    One catalog entry, derived from a raw item in the corpus.

    :ivar id: UUID of the item.
    :vartype id: str
    :ivar relpath: Location of the raw item file as a relative path.
    :vartype relpath: str
    :ivar sha256: SHA-256 digest of the stored bytes.
    :vartype sha256: str
    :ivar bytes: Raw item size in bytes (non-negative).
    :vartype bytes: int
    :ivar media_type: IANA media type of the item.
    :vartype media_type: str
    :ivar title: Human-readable title taken from metadata, if present.
    :vartype title: str or None
    :ivar tags: Tags that were extracted from or supplied for the item.
    :vartype tags: list[str]
    :ivar metadata: Front matter or sidecar metadata after merging.
    :vartype metadata: dict[str, Any]
    :ivar created_at: ISO 8601 timestamp of the item's first indexing.
    :vartype created_at: str
    :ivar source_uri: Source URI recorded at ingestion time, if any.
    :vartype source_uri: str or None
    """

    model_config = ConfigDict(extra="forbid")

    id: str
    relpath: str
    sha256: str
    bytes: int = Field(ge=0)
    media_type: str
    title: Optional[str] = None
    tags: List[str] = Field(default_factory=list)
    metadata: Dict[str, Any] = Field(default_factory=dict)
    created_at: str
    source_uri: Optional[str] = None
106
+
107
+
108
class CorpusCatalog(BaseModel):
    """
    Point-in-time snapshot of the catalog derived from a corpus.

    Validation fails unless ``schema_version`` equals ``SCHEMA_VERSION``.

    :ivar schema_version: Catalog schema version.
    :vartype schema_version: int
    :ivar generated_at: ISO 8601 timestamp recording when the catalog was generated.
    :vartype generated_at: str
    :ivar corpus_uri: Canonical URI of the corpus root.
    :vartype corpus_uri: str
    :ivar raw_dir: Path of the raw items folder, relative to the corpus; ``"raw"`` by default.
    :vartype raw_dir: str
    :ivar latest_run_id: Identifier of the most recent retrieval run, if there is one.
    :vartype latest_run_id: str or None
    :ivar items: Catalog entries keyed by item ID.
    :vartype items: dict[str, CatalogItem]
    :ivar order: Item IDs in display order (most recent first).
    :vartype order: list[str]
    """

    model_config = ConfigDict(extra="forbid")

    schema_version: int = Field(ge=1)
    generated_at: str
    corpus_uri: str
    raw_dir: str = "raw"
    latest_run_id: Optional[str] = None
    items: Dict[str, CatalogItem] = Field(default_factory=dict)
    order: List[str] = Field(default_factory=list)

    @model_validator(mode="after")
    def _enforce_schema_version(self) -> "CorpusCatalog":
        # Accept only the single schema version this code understands.
        if self.schema_version == SCHEMA_VERSION:
            return self
        raise ValueError(f"Unsupported catalog schema version: {self.schema_version}")
143
+
144
+
145
class ExtractionRunReference(BaseModel):
    """
    Pointer to a single extraction run.

    :ivar extractor_id: Identifier of the extractor plugin (non-empty).
    :vartype extractor_id: str
    :ivar run_id: Identifier of the extraction run (non-empty).
    :vartype run_id: str
    """

    model_config = ConfigDict(extra="forbid")

    extractor_id: str = Field(min_length=1)
    run_id: str = Field(min_length=1)

    def as_string(self) -> str:
        """
        Render the reference as one colon-separated string.

        :return: The reference formatted as extractor_id:run_id.
        :rtype: str
        """
        return "{}:{}".format(self.extractor_id, self.run_id)
168
+
169
+
170
def parse_extraction_run_reference(value: str) -> ExtractionRunReference:
    """
    Turn an ``extractor_id:run_id`` string into an extraction run reference.

    The string is split on its first colon only, and surrounding whitespace is
    stripped from both parts.

    :param value: Raw reference string.
    :type value: str
    :return: Parsed extraction run reference.
    :rtype: ExtractionRunReference
    :raises ValueError: If there is no colon or either part is empty after stripping.
    """
    if ":" not in value:
        raise ValueError("Extraction run reference must be extractor_id:run_id")
    extractor_part, run_part = (piece.strip() for piece in value.split(":", 1))
    if not (extractor_part and run_part):
        raise ValueError(
            "Extraction run reference must be extractor_id:run_id with non-empty parts"
        )
    return ExtractionRunReference(extractor_id=extractor_part, run_id=run_part)
190
+
191
+
192
class ExtractionRunListEntry(BaseModel):
    """
    Listing entry that summarizes one extraction run held in a corpus.

    All string fields must be non-empty.

    :ivar extractor_id: Identifier of the extractor plugin.
    :vartype extractor_id: str
    :ivar run_id: Identifier of the extraction run.
    :vartype run_id: str
    :ivar recipe_id: Deterministic identifier of the recipe.
    :vartype recipe_id: str
    :ivar recipe_name: Display name of the recipe.
    :vartype recipe_name: str
    :ivar catalog_generated_at: Timestamp of the catalog the run was based on.
    :vartype catalog_generated_at: str
    :ivar created_at: ISO 8601 timestamp recording when the run was created.
    :vartype created_at: str
    :ivar stats: Statistics for the run; empty by default.
    :vartype stats: dict[str, object]
    """

    model_config = ConfigDict(extra="forbid")

    extractor_id: str = Field(min_length=1)
    run_id: str = Field(min_length=1)
    recipe_id: str = Field(min_length=1)
    recipe_name: str = Field(min_length=1)
    catalog_generated_at: str = Field(min_length=1)
    created_at: str = Field(min_length=1)
    stats: Dict[str, object] = Field(default_factory=dict)
221
+
222
+
223
class QueryBudget(BaseModel):
    """
    Limits that bound how much evidence a retrieval query may select.

    Every limit, when set, must be at least 1.

    :ivar max_total_items: Upper bound on the number of evidence items returned.
    :vartype max_total_items: int
    :ivar max_total_characters: Upper bound on total characters across evidence text, if set.
    :vartype max_total_characters: int or None
    :ivar max_items_per_source: Per-source-URI cap on items, if set.
    :vartype max_items_per_source: int or None
    """

    model_config = ConfigDict(extra="forbid")

    max_total_items: int = Field(ge=1)
    max_total_characters: Optional[int] = Field(default=None, ge=1)
    max_items_per_source: Optional[int] = Field(default=None, ge=1)
240
+
241
+
242
class Evidence(BaseModel):
    """
    A structured piece of retrieval evidence produced by a backend.

    At least one of ``text`` or ``content_ref`` must be a non-blank string;
    validation fails otherwise.

    :ivar item_id: Identifier of the item the evidence came from.
    :vartype item_id: str
    :ivar source_uri: Source URI taken from ingestion metadata, if any.
    :vartype source_uri: str or None
    :ivar media_type: Media type of the evidence item.
    :vartype media_type: str
    :ivar score: Retrieval score, where higher is better.
    :vartype score: float
    :ivar rank: One-based position in the final evidence list.
    :vartype rank: int
    :ivar text: Text payload of the evidence, if any.
    :vartype text: str or None
    :ivar content_ref: Reference to non-text content, if any.
    :vartype content_ref: str or None
    :ivar span_start: Start offset within the source text, if known.
    :vartype span_start: int or None
    :ivar span_end: End offset within the source text, if known.
    :vartype span_end: int or None
    :ivar stage: Label of the retrieval stage (for example, scan, full-text search, rerank).
    :vartype stage: str
    :ivar recipe_id: Identifier of the recipe that created the run.
    :vartype recipe_id: str
    :ivar run_id: Identifier of the retrieval run.
    :vartype run_id: str
    :ivar hash: Content hash kept for provenance, if any.
    :vartype hash: str or None
    """

    model_config = ConfigDict(extra="forbid")

    item_id: str
    source_uri: Optional[str] = None
    media_type: str
    score: float
    rank: int = Field(ge=1)
    text: Optional[str] = None
    content_ref: Optional[str] = None
    span_start: Optional[int] = None
    span_end: Optional[int] = None
    stage: str
    recipe_id: str
    run_id: str
    hash: Optional[str] = None

    @model_validator(mode="after")
    def _require_text_or_reference(self) -> "Evidence":
        # Either payload is enough, as long as it is a string with non-whitespace content.
        for payload in (self.text, self.content_ref):
            if isinstance(payload, str) and payload.strip():
                return self
        raise ValueError("Evidence must include either text or content_ref")
297
+
298
+
299
class RecipeManifest(BaseModel):
    """
    Configuration record that makes a retrieval backend setup reproducible.

    :ivar recipe_id: Deterministic identifier of the recipe.
    :vartype recipe_id: str
    :ivar backend_id: Identifier of the backend the recipe targets.
    :vartype backend_id: str
    :ivar name: Display name of the recipe.
    :vartype name: str
    :ivar created_at: ISO 8601 timestamp recording when the recipe was created.
    :vartype created_at: str
    :ivar config: Configuration values specific to the backend; empty by default.
    :vartype config: dict[str, Any]
    :ivar description: Human-readable description, if any.
    :vartype description: str or None
    """

    model_config = ConfigDict(extra="forbid")

    recipe_id: str
    backend_id: str
    name: str
    created_at: str
    config: Dict[str, Any] = Field(default_factory=dict)
    description: Optional[str] = None
325
+
326
+
327
class RetrievalRun(BaseModel):
    """
    Record of a retrieval materialization or on-demand run, intended to be immutable.

    :ivar run_id: Unique identifier of the run.
    :vartype run_id: str
    :ivar recipe: Manifest of the recipe behind this run.
    :vartype recipe: RecipeManifest
    :ivar corpus_uri: Canonical URI of the corpus root.
    :vartype corpus_uri: str
    :ivar catalog_generated_at: Timestamp of the catalog the run was based on.
    :vartype catalog_generated_at: str
    :ivar created_at: ISO 8601 timestamp recording when the run was created.
    :vartype created_at: str
    :ivar artifact_paths: Relative paths of materialized artifacts; empty by default.
    :vartype artifact_paths: list[str]
    :ivar stats: Run statistics specific to the backend; empty by default.
    :vartype stats: dict[str, Any]
    """

    model_config = ConfigDict(extra="forbid")

    run_id: str
    recipe: RecipeManifest
    corpus_uri: str
    catalog_generated_at: str
    created_at: str
    artifact_paths: List[str] = Field(default_factory=list)
    stats: Dict[str, Any] = Field(default_factory=dict)
356
+
357
+
358
class RetrievalResult(BaseModel):
    """
    Bundle a backend hands back in response to a query.

    :ivar query_text: Text of the query sent to the backend.
    :vartype query_text: str
    :ivar budget: Evidence selection budget that was applied to the results.
    :vartype budget: QueryBudget
    :ivar run_id: Identifier of the retrieval run.
    :vartype run_id: str
    :ivar recipe_id: Identifier of the recipe used for the query.
    :vartype recipe_id: str
    :ivar backend_id: Identifier of the backend used for the query.
    :vartype backend_id: str
    :ivar generated_at: ISO 8601 timestamp of the query result.
    :vartype generated_at: str
    :ivar evidence: Evidence objects chosen within the budget; empty by default.
    :vartype evidence: list[Evidence]
    :ivar stats: Query statistics specific to the backend; empty by default.
    :vartype stats: dict[str, Any]
    """

    model_config = ConfigDict(extra="forbid")

    query_text: str
    budget: QueryBudget
    run_id: str
    recipe_id: str
    backend_id: str
    generated_at: str
    evidence: List[Evidence] = Field(default_factory=list)
    stats: Dict[str, Any] = Field(default_factory=dict)
390
+
391
+
392
class ExtractedText(BaseModel):
    """
    Text payload emitted by an extractor plugin.

    :ivar text: The extracted text.
    :vartype text: str
    :ivar producer_extractor_id: Identifier of the extractor that produced the text (non-empty).
    :vartype producer_extractor_id: str
    :ivar source_step_index: Pipeline step index the text originated from, if any (at least 1).
    :vartype source_step_index: int or None
    """

    model_config = ConfigDict(extra="forbid")

    text: str
    producer_extractor_id: str = Field(min_length=1)
    source_step_index: Optional[int] = Field(default=None, ge=1)
409
+
410
+
411
class ExtractionStepOutput(BaseModel):
    """
    In-memory output of one pipeline step for one item.

    :ivar step_index: Pipeline step index, counted from one.
    :vartype step_index: int
    :ivar extractor_id: Identifier of the extractor that ran in this step.
    :vartype extractor_id: str
    :ivar status: Status of the step: extracted, skipped, or errored.
    :vartype status: str
    :ivar text: Extracted text, when the step produced any.
    :vartype text: str or None
    :ivar text_characters: Number of characters in the extracted text; 0 by default.
    :vartype text_characters: int
    :ivar producer_extractor_id: Identifier of the extractor that produced the text content.
    :vartype producer_extractor_id: str or None
    :ivar source_step_index: Index of the step that supplied the text, for selection-style extractors.
    :vartype source_step_index: int or None
    :ivar error_type: Name of the error type when the step errored.
    :vartype error_type: str or None
    :ivar error_message: Error message when the step errored.
    :vartype error_message: str or None
    """

    model_config = ConfigDict(extra="forbid")

    step_index: int = Field(ge=1)
    extractor_id: str
    status: str
    text: Optional[str] = None
    text_characters: int = Field(default=0, ge=0)
    producer_extractor_id: Optional[str] = None
    source_step_index: Optional[int] = Field(default=None, ge=1)
    error_type: Optional[str] = None
    error_message: Optional[str] = None