biblicus 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. biblicus/__init__.py +30 -0
  2. biblicus/__main__.py +8 -0
  3. biblicus/_vendor/dotyaml/__init__.py +14 -0
  4. biblicus/_vendor/dotyaml/interpolation.py +63 -0
  5. biblicus/_vendor/dotyaml/loader.py +181 -0
  6. biblicus/_vendor/dotyaml/transformer.py +135 -0
  7. biblicus/backends/__init__.py +42 -0
  8. biblicus/backends/base.py +65 -0
  9. biblicus/backends/scan.py +375 -0
  10. biblicus/backends/sqlite_full_text_search.py +487 -0
  11. biblicus/cli.py +804 -0
  12. biblicus/constants.py +12 -0
  13. biblicus/context.py +183 -0
  14. biblicus/corpus.py +1531 -0
  15. biblicus/crawl.py +186 -0
  16. biblicus/errors.py +15 -0
  17. biblicus/evaluation.py +257 -0
  18. biblicus/evidence_processing.py +201 -0
  19. biblicus/extraction.py +531 -0
  20. biblicus/extractors/__init__.py +44 -0
  21. biblicus/extractors/base.py +68 -0
  22. biblicus/extractors/metadata_text.py +106 -0
  23. biblicus/extractors/openai_stt.py +180 -0
  24. biblicus/extractors/pass_through_text.py +84 -0
  25. biblicus/extractors/pdf_text.py +100 -0
  26. biblicus/extractors/pipeline.py +105 -0
  27. biblicus/extractors/rapidocr_text.py +129 -0
  28. biblicus/extractors/select_longest_text.py +105 -0
  29. biblicus/extractors/select_text.py +100 -0
  30. biblicus/extractors/unstructured_text.py +100 -0
  31. biblicus/frontmatter.py +89 -0
  32. biblicus/hook_logging.py +180 -0
  33. biblicus/hook_manager.py +203 -0
  34. biblicus/hooks.py +261 -0
  35. biblicus/ignore.py +64 -0
  36. biblicus/knowledge_base.py +191 -0
  37. biblicus/models.py +445 -0
  38. biblicus/retrieval.py +133 -0
  39. biblicus/sources.py +212 -0
  40. biblicus/time.py +17 -0
  41. biblicus/uris.py +63 -0
  42. biblicus/user_config.py +138 -0
  43. biblicus-0.6.0.dist-info/METADATA +533 -0
  44. biblicus-0.6.0.dist-info/RECORD +48 -0
  45. biblicus-0.6.0.dist-info/WHEEL +5 -0
  46. biblicus-0.6.0.dist-info/entry_points.txt +2 -0
  47. biblicus-0.6.0.dist-info/licenses/LICENSE +21 -0
  48. biblicus-0.6.0.dist-info/top_level.txt +1 -0
biblicus/crawl.py ADDED
@@ -0,0 +1,186 @@
1
+ """
2
+ Website crawl utilities for Biblicus corpora.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from collections import deque
8
+ from html.parser import HTMLParser
9
+ from typing import Deque, List, Optional, Set
10
+ from urllib.parse import urldefrag, urljoin
11
+
12
+ from pydantic import BaseModel, ConfigDict, Field
13
+
14
+ from .ignore import load_corpus_ignore_spec
15
+ from .sources import load_source
16
+
17
+
18
class CrawlRequest(BaseModel):
    """
    Parameters for crawling a website prefix into a corpus.

    Unknown fields are rejected because the model is configured with ``extra="forbid"``.

    :ivar root_url: Uniform resource locator the crawl starts from (non-empty).
    :vartype root_url: str
    :ivar allowed_prefix: Prefix a uniform resource locator must start with to be eligible (non-empty).
    :vartype allowed_prefix: str
    :ivar max_items: Upper bound on the number of items stored by the crawl; at least 1, default 50.
    :vartype max_items: int
    :ivar tags: Tags applied to every stored item; defaults to an empty list.
    :vartype tags: list[str]
    """

    model_config = ConfigDict(extra="forbid")

    root_url: str = Field(min_length=1)
    allowed_prefix: str = Field(min_length=1)
    max_items: int = Field(50, ge=1)
    tags: List[str] = Field(default_factory=list)
38
+
39
+
40
class CrawlResult(BaseModel):
    """
    Counters summarizing one crawl execution.

    Every counter defaults to zero and must be non-negative. Unknown fields are rejected.

    :ivar crawl_id: Crawl identifier used in the corpus raw import namespace.
    :vartype crawl_id: str
    :ivar discovered_items: Number of distinct uniform resource locators discovered.
    :vartype discovered_items: int
    :ivar fetched_items: Number of eligible items that were fetched.
    :vartype fetched_items: int
    :ivar stored_items: Number of items stored into the corpus.
    :vartype stored_items: int
    :ivar skipped_outside_prefix_items: Number of discovered items that fell outside the allowed prefix.
    :vartype skipped_outside_prefix_items: int
    :ivar skipped_ignored_items: Number of eligible items skipped because of corpus ignore rules.
    :vartype skipped_ignored_items: int
    :ivar errored_items: Number of eligible items that failed to fetch or store.
    :vartype errored_items: int
    """

    model_config = ConfigDict(extra="forbid")

    crawl_id: str
    discovered_items: int = Field(0, ge=0)
    fetched_items: int = Field(0, ge=0)
    stored_items: int = Field(0, ge=0)
    skipped_outside_prefix_items: int = Field(0, ge=0)
    skipped_ignored_items: int = Field(0, ge=0)
    errored_items: int = Field(0, ge=0)
69
+
70
+
71
class _LinkExtractor(HTMLParser):
    """
    HTML parser that records the ``href`` and ``src`` attribute values it encounters.

    :ivar links: Non-blank attribute values in document order, with surrounding whitespace removed.
    :vartype links: list[str]
    """

    def __init__(self) -> None:
        super().__init__()
        self.links: List[str] = []

    def handle_starttag(self, tag: str, attrs):  # type: ignore[no-untyped-def]
        """
        Collect link-bearing attributes from a start tag.

        :param tag: Element name; unused because every element is treated alike.
        :type tag: str
        :param attrs: ``(name, value)`` pairs supplied by :class:`html.parser.HTMLParser`.
        :type attrs: list[tuple[str, str or None]]
        """
        del tag
        for name, raw_value in attrs:
            if name not in {"href", "src"}:
                continue
            # Attributes written without a value are reported as None.
            if not isinstance(raw_value, str):
                continue
            cleaned = raw_value.strip()
            if cleaned:
                self.links.append(cleaned)
81
+
82
+
83
def _normalize_crawl_url(candidate: str, *, base_url: str) -> Optional[str]:
    """
    Resolve a raw link against the page it was found on and drop its fragment.

    :param candidate: Raw ``href`` or ``src`` value taken from a page.
    :type candidate: str
    :param base_url: Uniform resource locator of the page that contained the link.
    :type base_url: str
    :return: Absolute uniform resource locator without a fragment, or None for ``mailto:`` and
        ``javascript:`` links (matched case-insensitively).
    :rtype: str or None
    """
    absolute, _fragment = urldefrag(urljoin(base_url, candidate))
    absolute = absolute.strip()
    # Schemes are case-insensitive, and urljoin hands back a link with a foreign scheme exactly as
    # written (for example "MAILTO:someone@example.com"), so the check runs on a lowercased copy.
    if absolute.lower().startswith(("mailto:", "javascript:")):
        return None
    return absolute
90
+
91
+
92
def _crawl_relative_path(url: str, *, allowed_prefix: str) -> str:
    """
    Derive the storage path of a crawled uniform resource locator relative to the allowed prefix.

    The first ``len(allowed_prefix)`` characters are cut off without checking that they match the
    prefix. Directory-like locations (empty remainder or trailing slash) map to an ``index.html``
    file.

    :param url: Uniform resource locator being stored.
    :type url: str
    :param allowed_prefix: Prefix whose length is removed from the front of ``url``.
    :type allowed_prefix: str
    :return: Relative path without a leading slash.
    :rtype: str
    """
    remainder = url[len(allowed_prefix) :].lstrip("/")
    if not remainder:
        return "index.html"
    if remainder.endswith("/"):
        return remainder.rstrip("/") + "/index.html"
    return remainder
97
+
98
+
99
def _should_parse_links(media_type: str) -> bool:
    """
    Tell whether a payload should be scanned for links.

    :param media_type: Media type reported for the payload, possibly with parameters appended.
    :type media_type: str
    :return: True when the media type begins with ``text/html``.
    :rtype: bool
    """
    html_media_type = "text/html"
    return media_type.startswith(html_media_type)
101
+
102
+
103
def _discover_links(html_text: str, *, base_url: str) -> List[str]:
    """
    Extract and normalize every link found in a hypertext markup language document.

    :param html_text: Document text to scan.
    :type html_text: str
    :param base_url: Uniform resource locator of the document, used to resolve relative links.
    :type base_url: str
    :return: Normalized links in document order; links rejected by normalization are omitted and
        duplicates are kept.
    :rtype: list[str]
    """
    extractor = _LinkExtractor()
    extractor.feed(html_text)
    normalized_links = (
        _normalize_crawl_url(raw_link, base_url=base_url) for raw_link in extractor.links
    )
    return [link for link in normalized_links if link is not None]
112
+
113
+
114
def crawl_into_corpus(*, corpus, request: CrawlRequest) -> CrawlResult:  # type: ignore[no-untyped-def]
    """
    Crawl a website prefix breadth-first and store each fetched payload in a corpus.

    The crawl starts at ``request.root_url`` and stops when the queue is empty or
    ``request.max_items`` payloads have been stored. Each distinct uniform resource locator is
    handled once. Failures while fetching or storing are counted and do not abort the crawl.

    :param corpus: Target corpus to receive crawled items.
    :type corpus: biblicus.corpus.Corpus
    :param request: Crawl request describing limits and allowed prefix.
    :type request: CrawlRequest
    :return: Crawl result summary.
    :rtype: CrawlResult
    """
    ignore_spec = load_corpus_ignore_spec(corpus.root)
    prefix = request.allowed_prefix
    crawl_id = corpus.create_crawl_id()

    pending: Deque[str] = deque([request.root_url])
    # Every dequeued locator lands here exactly once, so this set also provides the discovered
    # count reported in the result.
    visited: Set[str] = set()
    stored = 0
    fetched = 0
    outside_prefix = 0
    ignored = 0
    errored = 0

    while pending and stored < request.max_items:
        url = pending.popleft()
        if url in visited:
            continue
        visited.add(url)

        if not url.startswith(prefix):
            outside_prefix += 1
            continue

        relative_path = _crawl_relative_path(url, allowed_prefix=prefix)
        if ignore_spec.matches(relative_path):
            ignored += 1
            continue

        try:
            payload = load_source(url)
            fetched += 1
            corpus.ingest_crawled_payload(
                crawl_id=crawl_id,
                relative_path=relative_path,
                data=payload.data,
                filename=payload.filename,
                media_type=payload.media_type,
                source_uri=payload.source_uri,
                tags=request.tags,
            )
            stored += 1
        except Exception:
            # Best effort: one failing page is recorded and must not stop the crawl.
            errored += 1
            continue

        if not _should_parse_links(payload.media_type):
            continue
        html_text = payload.data.decode("utf-8", errors="replace")
        pending.extend(_discover_links(html_text, base_url=url))

    return CrawlResult(
        crawl_id=crawl_id,
        discovered_items=len(visited),
        fetched_items=fetched,
        stored_items=stored,
        skipped_outside_prefix_items=outside_prefix,
        skipped_ignored_items=ignored,
        errored_items=errored,
    )
biblicus/errors.py ADDED
@@ -0,0 +1,15 @@
1
+ """
2
+ Error types for Biblicus.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+
8
class ExtractionRunFatalError(RuntimeError):
    """
    Error signalling that an extraction run must be aborted as a whole.

    Raise this for configuration or environment problems, as opposed to a failure that affects
    only a single item. Example: a selection extractor that relies on referenced extraction run
    manifests treats a missing manifest as fatal.
    """
biblicus/evaluation.py ADDED
@@ -0,0 +1,257 @@
1
+ """
2
+ Evaluation utilities for Biblicus retrieval runs.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import json
8
+ import time
9
+ from pathlib import Path
10
+ from typing import Dict, List, Optional
11
+
12
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
13
+
14
+ from .backends import get_backend
15
+ from .constants import DATASET_SCHEMA_VERSION
16
+ from .corpus import Corpus
17
+ from .models import QueryBudget, RetrievalResult, RetrievalRun
18
+ from .time import utc_now_iso
19
+
20
+
21
class EvaluationQuery(BaseModel):
    """
    One query of a retrieval evaluation dataset, together with its expected answer.

    At least one of ``expected_item_id`` and ``expected_source_uri`` must be set to a non-empty
    value. Unknown fields are rejected.

    :ivar query_id: Unique identifier for the query.
    :vartype query_id: str
    :ivar query_text: Natural language query to execute.
    :vartype query_text: str
    :ivar expected_item_id: Optional identifier of the item the query should retrieve.
    :vartype expected_item_id: str or None
    :ivar expected_source_uri: Optional source uniform resource identifier the query should retrieve.
    :vartype expected_source_uri: str or None
    :ivar kind: Query kind (gold or synthetic); defaults to ``gold``.
    :vartype kind: str
    """

    model_config = ConfigDict(extra="forbid")

    query_id: str
    query_text: str
    expected_item_id: Optional[str] = None
    expected_source_uri: Optional[str] = None
    kind: str = Field("gold")

    @model_validator(mode="after")
    def _require_expectation(self) -> "EvaluationQuery":
        # A query without any expectation could never be scored as a hit.
        if self.expected_item_id or self.expected_source_uri:
            return self
        raise ValueError(
            "Evaluation queries must include expected_item_id or expected_source_uri"
        )
52
+
53
+
54
class EvaluationDataset(BaseModel):
    """
    Collection of evaluation queries with descriptive metadata.

    Validation fails unless ``schema_version`` equals ``DATASET_SCHEMA_VERSION``. Unknown fields
    are rejected.

    :ivar schema_version: Dataset schema version (at least 1).
    :vartype schema_version: int
    :ivar name: Dataset name.
    :vartype name: str
    :ivar description: Optional description.
    :vartype description: str or None
    :ivar queries: Evaluation queries; defaults to an empty list.
    :vartype queries: list[EvaluationQuery]
    """

    model_config = ConfigDict(extra="forbid")

    schema_version: int = Field(ge=1)
    name: str
    description: Optional[str] = None
    queries: List[EvaluationQuery] = Field(default_factory=list)

    @model_validator(mode="after")
    def _enforce_schema_version(self) -> "EvaluationDataset":
        if self.schema_version == DATASET_SCHEMA_VERSION:
            return self
        raise ValueError(f"Unsupported dataset schema version: {self.schema_version}")
80
+
81
+
82
class EvaluationResult(BaseModel):
    """
    Outcome of evaluating one retrieval run against one dataset.

    Unknown fields are rejected.

    :ivar dataset: Metadata describing the evaluated dataset.
    :vartype dataset: dict[str, object]
    :ivar backend_id: Identifier of the backend that served the queries.
    :vartype backend_id: str
    :ivar run_id: Identifier of the evaluated retrieval run.
    :vartype run_id: str
    :ivar evaluated_at: Evaluation timestamp in International Organization for Standardization 8601 format.
    :vartype evaluated_at: str
    :ivar metrics: Retrieval quality metrics keyed by metric name.
    :vartype metrics: dict[str, float]
    :ivar system: System metrics keyed by metric name.
    :vartype system: dict[str, float]
    """

    model_config = ConfigDict(extra="forbid")

    dataset: Dict[str, object]
    backend_id: str
    run_id: str
    evaluated_at: str
    metrics: Dict[str, float]
    system: Dict[str, float]
108
+
109
+
110
def load_dataset(path: Path) -> EvaluationDataset:
    """
    Read and validate an evaluation dataset stored as JavaScript Object Notation.

    :param path: Location of the dataset file, read as UTF-8 text.
    :type path: Path
    :return: Validated evaluation dataset.
    :rtype: EvaluationDataset
    """
    with path.open(encoding="utf-8") as handle:
        raw_dataset = json.load(handle)
    return EvaluationDataset.model_validate(raw_dataset)
121
+
122
+
123
def evaluate_run(
    *,
    corpus: Corpus,
    run: RetrievalRun,
    dataset: EvaluationDataset,
    budget: QueryBudget,
) -> EvaluationResult:
    """
    Run every dataset query through the run's backend and summarize quality and latency.

    Quality metrics: ``hit_rate`` (queries whose expectation appears in the evidence, divided by
    the query count), ``precision_at_max_total_items`` (hits divided by the query count times
    ``budget.max_total_items``) and ``mean_reciprocal_rank``. An empty dataset is divided by one
    instead of zero. System metrics: average and percentile 95 latency in milliseconds and the
    total size of the run artifacts in bytes.

    :param corpus: Corpus associated with the run.
    :type corpus: Corpus
    :param run: Retrieval run manifest.
    :type run: RetrievalRun
    :param dataset: Evaluation dataset.
    :type dataset: EvaluationDataset
    :param budget: Evidence selection budget.
    :type budget: QueryBudget
    :return: Evaluation result bundle.
    :rtype: EvaluationResult
    """
    backend = get_backend(run.recipe.backend_id)
    latency_seconds: List[float] = []
    reciprocal_ranks: List[float] = []
    hit_count = 0

    for query in dataset.queries:
        # Only the backend call is timed; rank matching is excluded from the latency sample.
        started = time.perf_counter()
        result = backend.query(corpus, run=run, query_text=query.query_text, budget=budget)
        latency_seconds.append(time.perf_counter() - started)

        rank = _expected_rank(result, query)
        if rank is None:
            reciprocal_ranks.append(0.0)
            continue
        hit_count += 1
        reciprocal_ranks.append(1.0 / rank)

    query_count = len(dataset.queries)
    # Guard the divisions below against an empty dataset.
    total_queries = max(query_count, 1)
    max_total_items = float(budget.max_total_items)

    metrics = {
        "hit_rate": hit_count / total_queries,
        "precision_at_max_total_items": hit_count / (total_queries * max_total_items),
        "mean_reciprocal_rank": sum(reciprocal_ranks) / total_queries,
    }
    system = {
        "average_latency_milliseconds": _average_latency_milliseconds(latency_seconds),
        "percentile_95_latency_milliseconds": _percentile_95_latency_milliseconds(latency_seconds),
        "index_bytes": float(_run_artifact_bytes(corpus, run)),
    }
    dataset_meta = {
        "name": dataset.name,
        "description": dataset.description,
        "queries": query_count,
    }
    return EvaluationResult(
        dataset=dataset_meta,
        backend_id=run.recipe.backend_id,
        run_id=run.run_id,
        evaluated_at=utc_now_iso(),
        metrics=metrics,
        system=system,
    )
190
+
191
+
192
def _expected_rank(result: RetrievalResult, query: EvaluationQuery) -> Optional[int]:
    """
    Find the rank of the first evidence entry that satisfies the query's expectation.

    An entry matches when its ``item_id`` equals a non-empty ``expected_item_id`` or its
    ``source_uri`` equals a non-empty ``expected_source_uri``. Entries are inspected in the order
    they appear in ``result.evidence``.

    :param result: Retrieval result for a query.
    :type result: RetrievalResult
    :param query: Evaluation query definition.
    :type query: EvaluationQuery
    :return: Rank of the first matching evidence item, or None.
    :rtype: int or None
    """
    wanted_item_id = query.expected_item_id
    wanted_source_uri = query.expected_source_uri

    def satisfies_expectation(candidate) -> bool:  # type: ignore[no-untyped-def]
        if wanted_item_id and candidate.item_id == wanted_item_id:
            return True
        return bool(wanted_source_uri) and candidate.source_uri == wanted_source_uri

    return next(
        (candidate.rank for candidate in result.evidence if satisfies_expectation(candidate)),
        None,
    )
209
+
210
+
211
def _average_latency_milliseconds(latencies: List[float]) -> float:
    """
    Return the arithmetic mean of the latency samples, converted to milliseconds.

    :param latencies: Latency samples in seconds.
    :type latencies: list[float]
    :return: Average latency in milliseconds, or 0.0 when there are no samples.
    :rtype: float
    """
    sample_count = len(latencies)
    if sample_count == 0:
        return 0.0
    mean_seconds = sum(latencies) / sample_count
    return mean_seconds * 1000.0
223
+
224
+
225
def _percentile_95_latency_milliseconds(latencies: List[float]) -> float:
    """
    Return the percentile 95 latency sample, converted to milliseconds.

    The samples are sorted and the one at index ``round(0.95 * (count - 1))`` is returned; no
    interpolation between samples takes place.

    :param latencies: Latency samples in seconds.
    :type latencies: list[float]
    :return: Percentile 95 latency in milliseconds, or 0.0 when there are no samples.
    :rtype: float
    """
    if len(latencies) == 0:
        return 0.0
    ordered = sorted(latencies)
    last_index = len(ordered) - 1
    position = int(round(0.95 * last_index))
    return ordered[position] * 1000.0
239
+
240
+
241
def _run_artifact_bytes(corpus: Corpus, run: RetrievalRun) -> int:
    """
    Sum the on-disk sizes of the artifacts that belong to a retrieval run.

    Each entry of ``run.artifact_paths`` is resolved against ``corpus.root``. Artifacts that do
    not exist contribute nothing to the total.

    :param corpus: Corpus that owns the artifacts.
    :type corpus: Corpus
    :param run: Retrieval run manifest.
    :type run: RetrievalRun
    :return: Total artifact bytes.
    :rtype: int
    """
    total_bytes = 0
    for artifact_relpath in run.artifact_paths:
        try:
            # A single stat call replaces the exists-then-stat pair, so an artifact that vanishes
            # between the two calls can no longer raise.
            total_bytes += (corpus.root / artifact_relpath).stat().st_size
        except (FileNotFoundError, NotADirectoryError):
            continue
    return total_bytes
biblicus/evidence_processing.py ADDED
@@ -0,0 +1,201 @@
1
+ """
2
+ Evidence processing stages for Biblicus.
3
+
4
+ Retrieval backends return ranked evidence. Additional stages can be applied without changing the
5
+ backend implementation:
6
+
7
+ - Rerank: reorder evidence.
8
+ - Filter: remove evidence.
9
+
10
+ These stages are explicit so they can be configured, tested, and evaluated independently from the
11
+ retrieval backend.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from abc import ABC, abstractmethod
17
+ from typing import Any, Dict, List
18
+
19
+ from pydantic import BaseModel, ConfigDict, Field
20
+
21
+ from .models import Evidence
22
+
23
+
24
class EvidenceReranker(ABC):
    """
    Interface implemented by every evidence reranker.

    Concrete subclasses provide :meth:`rerank` and a class-level ``reranker_id``.

    :param reranker_id: Stable identifier for this reranker implementation.
    :type reranker_id: str
    """

    reranker_id: str

    @abstractmethod
    def rerank(self, *, query_text: str, evidence: List[Evidence]) -> List[Evidence]:
        """
        Return the evidence in a new order for the given query.

        :param query_text: Query text the evidence was retrieved for.
        :type query_text: str
        :param evidence: Evidence objects to reorder.
        :type evidence: list[Evidence]
        :return: Reranked evidence list.
        :rtype: list[Evidence]
        """
46
+
47
+
48
class EvidenceFilter(ABC):
    """
    Interface implemented by every evidence filter.

    Concrete subclasses provide :meth:`filter` and a class-level ``filter_id``.

    :param filter_id: Stable identifier for this filter implementation.
    :type filter_id: str
    """

    filter_id: str

    @abstractmethod
    def filter(
        self, *, query_text: str, evidence: List[Evidence], config: Dict[str, Any]
    ) -> List[Evidence]:
        """
        Return the evidence that should be kept for the given query.

        :param query_text: Query text the evidence was retrieved for.
        :type query_text: str
        :param evidence: Evidence objects to filter.
        :type evidence: list[Evidence]
        :param config: Filter-specific configuration values.
        :type config: dict[str, Any]
        :return: Filtered evidence list.
        :rtype: list[Evidence]
        """
74
+
75
+
76
class EvidenceRerankLongestText(EvidenceReranker):
    """
    Reranker that places evidence with longer text first.

    The policy is deterministic. It is useful when a downstream context pack is limited by a
    character or token budget and longer evidence is preferred.

    :ivar reranker_id: Stable reranker identifier.
    :vartype reranker_id: str
    """

    reranker_id = "rerank-longest-text"

    def rerank(self, *, query_text: str, evidence: List[Evidence]) -> List[Evidence]:
        """
        Order evidence by descending length of its whitespace-stripped text.

        Missing text counts as length zero, and entries of equal length are ordered by ``item_id``.
        The query text does not influence the order.

        :param query_text: Query text associated with the evidence.
        :type query_text: str
        :param evidence: Evidence objects to rerank.
        :type evidence: list[Evidence]
        :return: Evidence list ordered by text length.
        :rtype: list[Evidence]
        """

        def longest_first(candidate):  # type: ignore[no-untyped-def]
            stripped_length = len((candidate.text or "").strip())
            # Negating the length turns the ascending sort into longest-first.
            return (-stripped_length, candidate.item_id)

        return sorted(evidence, key=longest_first)
104
+
105
+
106
class EvidenceFilterMinimumScoreConfig(BaseModel):
    """
    Settings accepted by the minimum score evidence filter.

    Unknown fields are rejected.

    :ivar minimum_score: Required, non-negative threshold; evidence scoring below it is removed.
    :vartype minimum_score: float
    """

    model_config = ConfigDict(extra="forbid")

    minimum_score: float = Field(..., ge=0.0)
117
+
118
+
119
class EvidenceFilterMinimumScore(EvidenceFilter):
    """
    Filter that drops evidence whose score is below a configured threshold.

    :ivar filter_id: Stable filter identifier.
    :vartype filter_id: str
    """

    filter_id = "filter-minimum-score"

    def filter(
        self, *, query_text: str, evidence: List[Evidence], config: Dict[str, Any]
    ) -> List[Evidence]:
        """
        Keep the evidence whose score reaches the configured minimum.

        ``config`` is validated against :class:`EvidenceFilterMinimumScoreConfig` before any
        evidence is inspected. The query text does not influence the result.

        :param query_text: Query text associated with the evidence.
        :type query_text: str
        :param evidence: Evidence objects to filter.
        :type evidence: list[Evidence]
        :param config: Filter configuration values.
        :type config: dict[str, Any]
        :return: Evidence list with low-score items removed, in the original order.
        :rtype: list[Evidence]
        """
        threshold = EvidenceFilterMinimumScoreConfig.model_validate(config).minimum_score
        kept: List[Evidence] = []
        for candidate in evidence:
            if float(candidate.score) >= threshold:
                kept.append(candidate)
        return kept
150
+
151
+
152
# Registry of reranker instances, keyed by each instance's ``reranker_id``.
_EVIDENCE_RERANKERS: Dict[str, EvidenceReranker] = {
    reranker.reranker_id: reranker for reranker in (EvidenceRerankLongestText(),)
}

# Registry of filter instances, keyed by each instance's ``filter_id``.
_EVIDENCE_FILTERS: Dict[str, EvidenceFilter] = {
    evidence_filter.filter_id: evidence_filter
    for evidence_filter in (EvidenceFilterMinimumScore(),)
}
159
+
160
+
161
def apply_evidence_reranker(
    *, reranker_id: str, query_text: str, evidence: List[Evidence]
) -> List[Evidence]:
    """
    Look up a registered reranker by identifier and run it on the evidence.

    :param reranker_id: Reranker identifier.
    :type reranker_id: str
    :param query_text: Query text associated with the evidence.
    :type query_text: str
    :param evidence: Evidence objects to rerank.
    :type evidence: list[Evidence]
    :return: Reranked evidence list.
    :rtype: list[Evidence]
    :raises KeyError: If the reranker identifier is unknown.
    """
    return _EVIDENCE_RERANKERS[reranker_id].rerank(query_text=query_text, evidence=evidence)
179
+
180
+
181
def apply_evidence_filter(
    *, filter_id: str, query_text: str, evidence: List[Evidence], config: Dict[str, Any]
) -> List[Evidence]:
    """
    Look up a registered filter by identifier and run it on the evidence.

    :param filter_id: Filter identifier.
    :type filter_id: str
    :param query_text: Query text associated with the evidence.
    :type query_text: str
    :param evidence: Evidence objects to filter.
    :type evidence: list[Evidence]
    :param config: Filter-specific configuration values.
    :type config: dict[str, Any]
    :return: Filtered evidence list.
    :rtype: list[Evidence]
    :raises KeyError: If the filter identifier is unknown.
    """
    return _EVIDENCE_FILTERS[filter_id].filter(
        query_text=query_text, evidence=evidence, config=config
    )
201
+