biblicus-0.2.0-py3-none-any.whl → biblicus-0.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. biblicus/__init__.py +2 -2
  2. biblicus/_vendor/dotyaml/__init__.py +14 -0
  3. biblicus/_vendor/dotyaml/interpolation.py +63 -0
  4. biblicus/_vendor/dotyaml/loader.py +181 -0
  5. biblicus/_vendor/dotyaml/transformer.py +135 -0
  6. biblicus/backends/__init__.py +0 -2
  7. biblicus/backends/base.py +3 -3
  8. biblicus/backends/scan.py +21 -15
  9. biblicus/backends/sqlite_full_text_search.py +14 -15
  10. biblicus/cli.py +177 -53
  11. biblicus/corpus.py +209 -59
  12. biblicus/crawl.py +186 -0
  13. biblicus/errors.py +15 -0
  14. biblicus/evaluation.py +4 -8
  15. biblicus/extraction.py +280 -79
  16. biblicus/extractors/__init__.py +14 -3
  17. biblicus/extractors/base.py +12 -5
  18. biblicus/extractors/metadata_text.py +13 -5
  19. biblicus/extractors/openai_stt.py +180 -0
  20. biblicus/extractors/pass_through_text.py +16 -6
  21. biblicus/extractors/pdf_text.py +100 -0
  22. biblicus/extractors/pipeline.py +105 -0
  23. biblicus/extractors/rapidocr_text.py +129 -0
  24. biblicus/extractors/select_longest_text.py +105 -0
  25. biblicus/extractors/select_text.py +100 -0
  26. biblicus/extractors/unstructured_text.py +100 -0
  27. biblicus/frontmatter.py +0 -3
  28. biblicus/hook_logging.py +0 -5
  29. biblicus/hook_manager.py +3 -5
  30. biblicus/hooks.py +3 -7
  31. biblicus/ignore.py +0 -3
  32. biblicus/models.py +118 -0
  33. biblicus/retrieval.py +0 -4
  34. biblicus/sources.py +44 -9
  35. biblicus/time.py +1 -2
  36. biblicus/uris.py +3 -4
  37. biblicus/user_config.py +138 -0
  38. {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/METADATA +96 -18
  39. biblicus-0.4.0.dist-info/RECORD +45 -0
  40. biblicus/extractors/cascade.py +0 -101
  41. biblicus-0.2.0.dist-info/RECORD +0 -32
  42. {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/WHEEL +0 -0
  43. {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/entry_points.txt +0 -0
  44. {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/licenses/LICENSE +0 -0
  45. {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/top_level.txt +0 -0
biblicus/crawl.py ADDED
@@ -0,0 +1,186 @@
+"""
+Website crawl utilities for Biblicus corpora.
+"""
+
+from __future__ import annotations
+
+from collections import deque
+from html.parser import HTMLParser
+from typing import Deque, List, Optional, Set
+from urllib.parse import urldefrag, urljoin
+
+from pydantic import BaseModel, ConfigDict, Field
+
+from .ignore import load_corpus_ignore_spec
+from .sources import load_source
+
+
+class CrawlRequest(BaseModel):
+    """
+    Request describing a website crawl into a corpus.
+
+    :ivar root_url: Initial uniform resource locator to fetch.
+    :vartype root_url: str
+    :ivar allowed_prefix: Uniform resource locator prefix that limits which links are eligible for crawl.
+    :vartype allowed_prefix: str
+    :ivar max_items: Maximum number of items to store during the crawl.
+    :vartype max_items: int
+    :ivar tags: Tags to apply to stored items.
+    :vartype tags: list[str]
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    root_url: str = Field(min_length=1)
+    allowed_prefix: str = Field(min_length=1)
+    max_items: int = Field(default=50, ge=1)
+    tags: List[str] = Field(default_factory=list)
+
+
+class CrawlResult(BaseModel):
+    """
+    Summary result for a crawl execution.
+
+    :ivar crawl_id: Crawl identifier used in the corpus raw import namespace.
+    :vartype crawl_id: str
+    :ivar discovered_items: Total number of distinct uniform resource locators discovered.
+    :vartype discovered_items: int
+    :ivar fetched_items: Number of eligible items fetched over hypertext transfer protocol.
+    :vartype fetched_items: int
+    :ivar stored_items: Number of items stored into the corpus.
+    :vartype stored_items: int
+    :ivar skipped_outside_prefix_items: Number of discovered items outside the allowed prefix.
+    :vartype skipped_outside_prefix_items: int
+    :ivar skipped_ignored_items: Number of eligible items skipped due to corpus ignore rules.
+    :vartype skipped_ignored_items: int
+    :ivar errored_items: Number of eligible items that failed to fetch or store.
+    :vartype errored_items: int
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    crawl_id: str
+    discovered_items: int = Field(default=0, ge=0)
+    fetched_items: int = Field(default=0, ge=0)
+    stored_items: int = Field(default=0, ge=0)
+    skipped_outside_prefix_items: int = Field(default=0, ge=0)
+    skipped_ignored_items: int = Field(default=0, ge=0)
+    errored_items: int = Field(default=0, ge=0)
+
+
+class _LinkExtractor(HTMLParser):
+    def __init__(self) -> None:
+        super().__init__()
+        self.links: List[str] = []
+
+    def handle_starttag(self, tag: str, attrs):  # type: ignore[no-untyped-def]
+        _ = tag
+        for key, value in attrs:
+            if key in {"href", "src"} and isinstance(value, str) and value.strip():
+                self.links.append(value.strip())
+
+
+def _normalize_crawl_url(candidate: str, *, base_url: str) -> Optional[str]:
+    joined = urljoin(base_url, candidate)
+    joined, _fragment = urldefrag(joined)
+    joined = joined.strip()
+    if joined.startswith(("mailto:", "javascript:")):
+        return None
+    return joined
+
+
+def _crawl_relative_path(url: str, *, allowed_prefix: str) -> str:
+    relative = url[len(allowed_prefix) :].lstrip("/")
+    if not relative or relative.endswith("/"):
+        relative = relative.rstrip("/") + "/index.html" if relative else "index.html"
+    return relative
+
+
+def _should_parse_links(media_type: str) -> bool:
+    return media_type.startswith("text/html")
+
+
+def _discover_links(html_text: str, *, base_url: str) -> List[str]:
+    parser = _LinkExtractor()
+    parser.feed(html_text)
+    discovered: List[str] = []
+    for raw in parser.links:
+        normalized = _normalize_crawl_url(raw, base_url=base_url)
+        if normalized is not None:
+            discovered.append(normalized)
+    return discovered
+
+
+def crawl_into_corpus(*, corpus, request: CrawlRequest) -> CrawlResult:  # type: ignore[no-untyped-def]
+    """
+    Crawl a website prefix into a corpus.
+
+    :param corpus: Target corpus to receive crawled items.
+    :type corpus: biblicus.corpus.Corpus
+    :param request: Crawl request describing limits and allowed prefix.
+    :type request: CrawlRequest
+    :return: Crawl result summary.
+    :rtype: CrawlResult
+    """
+    ignore_spec = load_corpus_ignore_spec(corpus.root)
+    allowed_prefix = request.allowed_prefix
+    root_url = request.root_url
+
+    crawl_id = corpus.create_crawl_id()
+
+    queue: Deque[str] = deque([root_url])
+    seen: Set[str] = set()
+    stored_count = 0
+    fetched_count = 0
+    skipped_outside_prefix_count = 0
+    skipped_ignored_count = 0
+    errored_count = 0
+    discovered_urls: Set[str] = set()
+
+    while queue and stored_count < request.max_items:
+        url = queue.popleft()
+        if url in seen:
+            continue
+        seen.add(url)
+        discovered_urls.add(url)
+
+        if not url.startswith(allowed_prefix):
+            skipped_outside_prefix_count += 1
+            continue
+
+        relative_path = _crawl_relative_path(url, allowed_prefix=allowed_prefix)
+        if ignore_spec.matches(relative_path):
+            skipped_ignored_count += 1
+            continue
+
+        try:
+            payload = load_source(url)
+            fetched_count += 1
+            corpus.ingest_crawled_payload(
+                crawl_id=crawl_id,
+                relative_path=relative_path,
+                data=payload.data,
+                filename=payload.filename,
+                media_type=payload.media_type,
+                source_uri=payload.source_uri,
+                tags=request.tags,
+            )
+            stored_count += 1
+        except Exception:
+            errored_count += 1
+            continue
+
+        if _should_parse_links(payload.media_type):
+            text = payload.data.decode("utf-8", errors="replace")
+            for discovered in _discover_links(text, base_url=url):
+                queue.append(discovered)
+
+    return CrawlResult(
+        crawl_id=crawl_id,
+        discovered_items=len(discovered_urls),
+        fetched_items=fetched_count,
+        stored_items=stored_count,
+        skipped_outside_prefix_items=skipped_outside_prefix_count,
+        skipped_ignored_items=skipped_ignored_count,
+        errored_items=errored_count,
+    )
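For orientation, a minimal usage sketch of the new crawl entry point. The CrawlRequest fields and the crawl_into_corpus signature are taken from the diff above; the diff does not show how a Corpus instance is constructed, so the Corpus(...) call below is a hypothetical placeholder.

from pathlib import Path

from biblicus.corpus import Corpus
from biblicus.crawl import CrawlRequest, crawl_into_corpus

# Hypothetical: this diff does not show the Corpus constructor.
corpus = Corpus(root=Path("./my-corpus"))

request = CrawlRequest(
    root_url="https://example.com/docs/",
    allowed_prefix="https://example.com/docs/",
    max_items=25,
    tags=["docs"],
)

result = crawl_into_corpus(corpus=corpus, request=request)
print(result.stored_items, result.skipped_ignored_items, result.errored_items)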
biblicus/errors.py ADDED
@@ -0,0 +1,15 @@
+"""
+Error types for Biblicus.
+"""
+
+from __future__ import annotations
+
+
+class ExtractionRunFatalError(RuntimeError):
+    """
+    Fatal extraction run error that should abort the entire run.
+
+    This exception is used for conditions that indicate a configuration or environment problem
+    rather than a per-item extraction failure. For example, a selection extractor that depends
+    on referenced extraction run manifests treats missing manifests as fatal.
+    """
biblicus/evaluation.py CHANGED
@@ -11,8 +11,8 @@ from typing import Dict, List, Optional
 
 from pydantic import BaseModel, ConfigDict, Field, model_validator
 
-from .constants import DATASET_SCHEMA_VERSION
 from .backends import get_backend
+from .constants import DATASET_SCHEMA_VERSION
 from .corpus import Corpus
 from .models import QueryBudget, RetrievalResult, RetrievalRun
 from .time import utc_now_iso
@@ -45,7 +45,9 @@ class EvaluationQuery(BaseModel):
     @model_validator(mode="after")
     def _require_expectation(self) -> "EvaluationQuery":
         if not self.expected_item_id and not self.expected_source_uri:
-            raise ValueError("Evaluation queries must include expected_item_id or expected_source_uri")
+            raise ValueError(
+                "Evaluation queries must include expected_item_id or expected_source_uri"
+            )
         return self
 
 
@@ -114,7 +116,6 @@ def load_dataset(path: Path) -> EvaluationDataset:
     :return: Parsed evaluation dataset.
     :rtype: EvaluationDataset
     """
-
     data = json.loads(path.read_text(encoding="utf-8"))
     return EvaluationDataset.model_validate(data)
 
@@ -140,7 +141,6 @@ def evaluate_run(
     :return: Evaluation result bundle.
     :rtype: EvaluationResult
     """
-
     backend = get_backend(run.recipe.backend_id)
     latency_seconds: List[float] = []
     hit_count = 0
@@ -200,7 +200,6 @@ def _expected_rank(result: RetrievalResult, query: EvaluationQuery) -> Optional[
     :return: Rank of the first matching evidence item, or None.
     :rtype: int or None
     """
-
     for evidence in result.evidence:
         if query.expected_item_id and evidence.item_id == query.expected_item_id:
             return evidence.rank
@@ -218,7 +217,6 @@ def _average_latency_milliseconds(latencies: List[float]) -> float:
     :return: Average latency in milliseconds.
     :rtype: float
     """
-
     if not latencies:
         return 0.0
     return sum(latencies) / len(latencies) * 1000.0
@@ -233,7 +231,6 @@ def _percentile_95_latency_milliseconds(latencies: List[float]) -> float:
     :return: Percentile 95 latency in milliseconds.
     :rtype: float
    """
-
     if not latencies:
         return 0.0
     sorted_latencies = sorted(latencies)
@@ -252,7 +249,6 @@ def _run_artifact_bytes(corpus: Corpus, run: RetrievalRun) -> int:
     :return: Total artifact bytes.
     :rtype: int
     """
-
     total_bytes = 0
     for artifact_relpath in run.artifact_paths:
         artifact_path = corpus.root / artifact_relpath
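The hunk above truncates _percentile_95_latency_milliseconds just after the sort. For reference, a nearest-rank computation consistent with the average helper shown earlier; the index rule is an assumption, not necessarily the package's own implementation.

import math
from typing import List

def percentile_95_latency_milliseconds(latencies: List[float]) -> float:
    # Sketch only: the nearest-rank rule below is an assumption; the
    # package's own body is cut off in the hunk above.
    if not latencies:
        return 0.0
    sorted_latencies = sorted(latencies)
    rank = math.ceil(0.95 * len(sorted_latencies))  # nearest-rank method
    return sorted_latencies[rank - 1] * 1000.0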