biblicus 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
biblicus/ignore.py ADDED
@@ -0,0 +1,67 @@
1
+ """
2
+ Corpus ignore rules for bulk import and crawling.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import fnmatch
8
+ from pathlib import Path
9
+ from typing import List
10
+
11
+ from pydantic import BaseModel, ConfigDict, Field
12
+
13
+
14
class CorpusIgnoreSpec(BaseModel):
    """
    Collection of glob patterns naming corpus paths that should be skipped.

    Each pattern is compared with ``fnmatch`` against a relative path written
    with forward slashes.

    :ivar patterns: Glob patterns to ignore; empty by default.
    :vartype patterns: list[str]
    """

    model_config = ConfigDict(extra="forbid")

    patterns: List[str] = Field(default_factory=list)

    def matches(self, relpath: str) -> bool:
        """
        Tell whether any ignore pattern matches the given relative path.

        Backslashes are converted to forward slashes and leading slashes are
        removed before the path is compared with the patterns.

        :param relpath: Relative path to test.
        :type relpath: str
        :return: True if at least one pattern matches, otherwise False.
        :rtype: bool
        """

        candidate = relpath.replace("\\", "/").lstrip("/")
        for glob in self.patterns:
            if fnmatch.fnmatch(candidate, glob):
                return True
        return False
40
+
41
+
42
def load_corpus_ignore_spec(corpus_root: Path) -> CorpusIgnoreSpec:
    """
    Build the ignore specification for a corpus from its `.biblicusignore` file.

    The file is looked up directly under the corpus root. When it is not a
    regular file, a specification with no patterns is returned. Otherwise the
    file is read as UTF-8; every line is stripped of surrounding whitespace,
    and blank lines and lines starting with ``#`` are dropped.

    :param corpus_root: Corpus root directory.
    :type corpus_root: Path
    :return: Parsed ignore specification.
    :rtype: CorpusIgnoreSpec
    """

    ignore_file = corpus_root / ".biblicusignore"
    if not ignore_file.is_file():
        return CorpusIgnoreSpec(patterns=[])

    stripped_lines = (
        entry.strip()
        for entry in ignore_file.read_text(encoding="utf-8").splitlines()
    )
    # Keep only real patterns: skip blank lines and `#` comment lines.
    kept: List[str] = [
        entry for entry in stripped_lines if entry and not entry.startswith("#")
    ]
    return CorpusIgnoreSpec(patterns=kept)
67
+
biblicus/models.py CHANGED
@@ -9,6 +9,7 @@ from typing import Any, Dict, List, Optional
9
9
  from pydantic import BaseModel, ConfigDict, Field, model_validator
10
10
 
11
11
  from .constants import SCHEMA_VERSION
12
+ from .hooks import HookSpec
12
13
 
13
14
 
14
15
  class CorpusConfig(BaseModel):
@@ -25,6 +26,8 @@ class CorpusConfig(BaseModel):
25
26
  :vartype raw_dir: str
26
27
  :ivar notes: Optional free-form notes for operators.
27
28
  :vartype notes: dict[str, Any] or None
29
+ :ivar hooks: Optional hook specifications for corpus lifecycle events.
30
+ :vartype hooks: list[HookSpec] or None
28
31
  """
29
32
 
30
33
  model_config = ConfigDict(extra="forbid")
@@ -34,6 +37,7 @@ class CorpusConfig(BaseModel):
34
37
  corpus_uri: str
35
38
  raw_dir: str = "raw"
36
39
  notes: Optional[Dict[str, Any]] = None
40
+ hooks: Optional[List[HookSpec]] = None
37
41
 
38
42
  @model_validator(mode="after")
39
43
  def _enforce_schema_version(self) -> "CorpusConfig":
@@ -305,3 +309,19 @@ class RetrievalResult(BaseModel):
305
309
  generated_at: str
306
310
  evidence: List[Evidence] = Field(default_factory=list)
307
311
  stats: Dict[str, Any] = Field(default_factory=dict)
312
+
313
+
314
class ExtractedText(BaseModel):
    """
    Text produced by an extractor plugin, tagged with the extractor that made it.

    Unknown fields are rejected.

    :ivar text: Extracted text content.
    :vartype text: str
    :ivar producer_extractor_id: Identifier of the extractor that produced the text; must be non-empty.
    :vartype producer_extractor_id: str
    """

    model_config = ConfigDict(extra="forbid")

    text: str
    producer_extractor_id: str = Field(..., min_length=1)
biblicus/sources.py CHANGED
@@ -53,6 +53,46 @@ def _media_type_from_filename(name: str) -> str:
53
53
  return media_type or "application/octet-stream"
54
54
 
55
55
 
56
+ def _sniff_media_type_from_bytes(data: bytes) -> Optional[str]:
57
+ """
58
+ Sniff a media type from leading bytes for a small set of common formats.
59
+
60
+ :param data: Raw bytes to inspect.
61
+ :type data: bytes
62
+ :return: Detected media type or None.
63
+ :rtype: str or None
64
+ """
65
+
66
+ prefix = data[:32]
67
+ if prefix.startswith(b"%PDF-"):
68
+ return "application/pdf"
69
+ if prefix.startswith(b"\x89PNG\r\n\x1a\n"):
70
+ return "image/png"
71
+ if prefix[:3] == b"\xff\xd8\xff":
72
+ return "image/jpeg"
73
+ if prefix.lstrip().lower().startswith(b"<!doctype html") or prefix.lstrip().lower().startswith(b"<html"):
74
+ return "text/html"
75
+ return None
76
+
77
+
78
+ def _ensure_extension_for_media_type(filename: str, media_type: str) -> str:
79
+ """
80
+ Ensure the filename has a usable extension for the media type.
81
+
82
+ :param filename: Filename candidate.
83
+ :type filename: str
84
+ :param media_type: Media type to target.
85
+ :type media_type: str
86
+ :return: Filename with extension.
87
+ :rtype: str
88
+ """
89
+
90
+ if Path(filename).suffix:
91
+ return filename
92
+ ext = mimetypes.guess_extension(media_type) or ""
93
+ return filename + ext if ext else filename
94
+
95
+
56
96
  @dataclass(frozen=True)
57
97
  class SourcePayload:
58
98
  """
@@ -115,6 +155,11 @@ def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> Sour
115
155
  content_type = response.headers.get("Content-Type", "").split(";", 1)[0].strip()
116
156
  filename = _filename_from_url_path(parsed.path)
117
157
  media_type = content_type or _media_type_from_filename(filename)
158
+ if media_type == "application/octet-stream":
159
+ sniffed = _sniff_media_type_from_bytes(response_bytes)
160
+ if sniffed:
161
+ media_type = sniffed
162
+ filename = _ensure_extension_for_media_type(filename, media_type)
118
163
  if Path(filename).suffix.lower() in {".md", ".markdown"}:
119
164
  media_type = "text/markdown"
120
165
  return SourcePayload(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: biblicus
3
- Version: 0.1.1
3
+ Version: 0.2.0
4
4
  Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
5
5
  License: MIT
6
6
  Requires-Python: >=3.9
@@ -20,6 +20,9 @@ Dynamic: license-file
20
20
 
21
21
  # Biblicus
22
22
 
23
+ ![Continuous integration][continuous-integration-badge]
24
+ ![Coverage][coverage-badge]
25
+
23
26
  Make your documents usable by your assistant, then decide later how you will search and retrieve them.
24
27
 
25
28
  If you are building an assistant in Python, you probably have material you want it to use: notes, documents, web pages, and reference files. A common approach is retrieval augmented generation, where a system retrieves relevant material and uses it as evidence when generating a response.
@@ -45,6 +48,84 @@ The framework is a small, explicit vocabulary that appears in code, specificatio
45
48
  - Recipe is a named configuration for a backend.
46
49
  - Pipeline stage is a distinct retrieval step such as retrieve, rerank, and filter.
47
50
 
51
+ ## Diagram
52
+
53
+ This diagram shows how a corpus becomes evidence for an assistant.
54
+ The legend shows what the border styles and fill styles mean.
55
+ The "Your code" region is where you decide how to turn evidence into context and how to call a model.
56
+
57
+ ```mermaid
58
+ %%{init: {"flowchart": {"useMaxWidth": true, "nodeSpacing": 18, "rankSpacing": 22}}}%%
59
+ flowchart LR
60
+ subgraph Legend[Legend]
61
+ direction LR
62
+ LegendArtifact[Stored artifact or evidence]
63
+ LegendStep[Step]
64
+ LegendArtifact --- LegendStep
65
+ end
66
+
67
+ subgraph Main[" "]
68
+ direction TB
69
+
70
+ subgraph StableCore[Stable core]
71
+ direction TB
72
+ Source[Source items] --> Ingest[Ingest]
73
+ Ingest --> Raw[Raw item files]
74
+ Raw --> Catalog[Catalog file]
75
+ end
76
+
77
+ subgraph PluggableRetrievalBackend[Pluggable retrieval backend]
78
+ direction LR
79
+
80
+ subgraph BackendIngestionIndexing[Ingestion and indexing]
81
+ direction TB
82
+ Catalog --> Build[Build run]
83
+ Build --> BackendIndex[Backend index]
84
+ BackendIndex --> Run[Run manifest]
85
+ end
86
+
87
+ subgraph BackendRetrievalGeneration[Retrieval and generation]
88
+ direction TB
89
+ Run --> Query[Query]
90
+ Query --> Evidence[Evidence]
91
+ end
92
+ end
93
+
94
+ Evidence --> Context
95
+
96
+ subgraph YourCode[Your code]
97
+ direction TB
98
+ Context[Assistant context] --> Model[Large language model call]
99
+ Model --> Answer[Answer]
100
+ end
101
+
102
+ style StableCore fill:#ffffff,stroke:#8e24aa,stroke-width:2px,color:#111111
103
+ style PluggableRetrievalBackend fill:#ffffff,stroke:#1e88e5,stroke-dasharray:6 3,stroke-width:2px,color:#111111
104
+ style YourCode fill:#ffffff,stroke:#d81b60,stroke-width:2px,color:#111111
105
+ style BackendIngestionIndexing fill:#ffffff,stroke:#cfd8dc,color:#111111
106
+ style BackendRetrievalGeneration fill:#ffffff,stroke:#cfd8dc,color:#111111
107
+
108
+ style Raw fill:#f3e5f5,stroke:#8e24aa,color:#111111
109
+ style Catalog fill:#f3e5f5,stroke:#8e24aa,color:#111111
110
+ style BackendIndex fill:#f3e5f5,stroke:#8e24aa,color:#111111
111
+ style Run fill:#f3e5f5,stroke:#8e24aa,color:#111111
112
+ style Evidence fill:#f3e5f5,stroke:#8e24aa,color:#111111
113
+ style Context fill:#f3e5f5,stroke:#8e24aa,color:#111111
114
+ style Answer fill:#f3e5f5,stroke:#8e24aa,color:#111111
115
+ style Source fill:#f3e5f5,stroke:#8e24aa,color:#111111
116
+
117
+ style Ingest fill:#eceff1,stroke:#90a4ae,color:#111111
118
+ style Build fill:#eceff1,stroke:#90a4ae,color:#111111
119
+ style Query fill:#eceff1,stroke:#90a4ae,color:#111111
120
+ style Model fill:#eceff1,stroke:#90a4ae,color:#111111
121
+ end
122
+
123
+ style Legend fill:#ffffff,stroke:#ffffff,color:#111111
124
+ style Main fill:#ffffff,stroke:#ffffff,color:#111111
125
+ style LegendArtifact fill:#f3e5f5,stroke:#8e24aa,color:#111111
126
+ style LegendStep fill:#eceff1,stroke:#90a4ae,color:#111111
127
+ ```
128
+
48
129
  ## Practical value
49
130
 
50
131
  - You can ingest raw material once, then try many retrieval approaches over time.
@@ -110,7 +191,11 @@ In an assistant system, retrieval usually produces context for a model call. Thi
110
191
  The documents below are written to be read in order.
111
192
 
112
193
  - [Architecture][architecture]
194
+ - [Corpus][corpus]
195
+ - [Text extraction][text-extraction]
113
196
  - [Backends][backends]
197
+ - [Next steps][next-steps]
198
+ - [Testing][testing]
114
199
 
115
200
  ## Metadata and catalog
116
201
 
@@ -143,12 +228,20 @@ Use `scripts/download_wikipedia.py` to download a small integration corpus from
143
228
 
144
229
  The dataset file `datasets/wikipedia_mini.json` provides a small evaluation set that matches the integration corpus.
145
230
 
231
+ Use `scripts/download_pdf_samples.py` to download a small Portable Document Format integration corpus when running tests or demos. The repository does not include that content.
232
+
146
233
  ## Tests and coverage
147
234
 
148
235
  ```
149
236
  python3 scripts/test.py
150
237
  ```
151
238
 
239
+ To include integration scenarios that download public test data at runtime, run this command.
240
+
241
+ ```
242
+ python3 scripts/test.py --integration
243
+ ```
244
+
152
245
  ## Releases
153
246
 
154
247
  Releases are automated from the main branch using semantic versioning and conventional commit messages.
@@ -171,4 +264,11 @@ License terms are in `LICENSE`.
171
264
 
172
265
  [retrieval augmented generation overview]: https://en.wikipedia.org/wiki/Retrieval-augmented_generation
173
266
  [architecture]: docs/ARCHITECTURE.md
267
+ [corpus]: docs/CORPUS.md
268
+ [text-extraction]: docs/EXTRACTION.md
174
269
  [backends]: docs/BACKENDS.md
270
+ [next-steps]: docs/NEXT_STEPS.md
271
+ [testing]: docs/TESTING.md
272
+
273
+ [continuous-integration-badge]: https://github.com/AnthusAI/Biblicus/actions/workflows/ci.yml/badge.svg?branch=main
274
+ [coverage-badge]: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/AnthusAI/Biblicus/main/coverage_badge.json
@@ -0,0 +1,32 @@
1
+ biblicus/__init__.py,sha256=3IXdbt-q80_BlKDwTsZw7MScRW4hBgQ-Vn6xHbgNwE8,432
2
+ biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
3
+ biblicus/cli.py,sha256=zDD6juQGTDmrRE2DHUku-G3wV3AtXjwYTNDFACwpdC0,19501
4
+ biblicus/constants.py,sha256=R6fZDoLVMCwgKvTaxEx7G0CstwHGaUTlW9MsmNLDZ44,269
5
+ biblicus/corpus.py,sha256=5naoFi0GSKBg4RFd6wOU-U30NMbG6bfs_RM90JcvDGA,47460
6
+ biblicus/evaluation.py,sha256=H_W35vF5_L4B2JCfLu19VRu402tZ2pFkN2BbBP69lVY,8119
7
+ biblicus/extraction.py,sha256=WX1LRsKrsyHI4Wido6gMwukzRGf5cfPWvRASgu_MRN4,10614
8
+ biblicus/frontmatter.py,sha256=8Tqlpd3bVzZrGRB9Rdj2IwHMSJLvd2ABxMNOi3L5br4,2466
9
+ biblicus/hook_logging.py,sha256=8Rl3BpkfTexSJ7rFi94kl6DMRDD-8eu2N7zv18wXyUM,5371
10
+ biblicus/hook_manager.py,sha256=ucDZoVM-9fg1gQAhUxi-PECaNlHoegAxb-kYCx-OMZs,6987
11
+ biblicus/hooks.py,sha256=OfG3VsCDWQVVZnOTQHnN9GQ0AIws9SK6-85WYTrKkzk,7847
12
+ biblicus/ignore.py,sha256=Di37CTlg6Mg3SKJc2qxZcZdYX00IcTORB2hb0g-Jins,1803
13
+ biblicus/models.py,sha256=6cgJX7Jmm5rBVrXWH46fQf3v__jSyDy73MnKaUQMSHQ,11099
14
+ biblicus/retrieval.py,sha256=T7HELWCNAxZ26yj7dPH8IBUaxV_gx8Ql9iwwGz0teyI,4184
15
+ biblicus/sources.py,sha256=C4P8oM6d50tLXr4z9Shsv4z-hDiQuylXfkT3Bx03dEM,5844
16
+ biblicus/time.py,sha256=rvp2fJXSLVmyA76GCfNKtZoifASodemJTOWN8smPt0s,486
17
+ biblicus/uris.py,sha256=sRDyGmoHr_H4XR4qv_lSbQJXylYD0fNEr02H5wjomnQ,1986
18
+ biblicus/backends/__init__.py,sha256=5OXKSzsn7THhwh9T5StOvEqojx_85XXuYSGdTpMK11U,1214
19
+ biblicus/backends/base.py,sha256=699TKygGgL72Ifkhz1V890nOK6BslwO0-OY7xeqZl-I,1764
20
+ biblicus/backends/scan.py,sha256=DZ-CgZ0jy6_928hu4dASJ8_JH7BTfF8gwVkVhd38W1U,12421
21
+ biblicus/backends/sqlite_full_text_search.py,sha256=FMpASLeK5diK-Uyhr4pqtpDpb_Qyk5_XRaXAKUHDzjs,16502
22
+ biblicus/extractors/__init__.py,sha256=_6Z_JkLoDYwmay76y1fy11lCSqDDizMDPX3Vke_l8x4,1008
23
+ biblicus/extractors/base.py,sha256=yvp709uUCnPEbK-bx6u5WKNPPH3SBWhbSaewoyUIgvA,1870
24
+ biblicus/extractors/cascade.py,sha256=ExojAYsARtF99zVg78wY_wifVfDaJFa6wiRIaT-cpRo,3209
25
+ biblicus/extractors/metadata_text.py,sha256=C0i8fcEC9aLmwhSdK9IlZVZ9ugOocIe0y522pSjvaCA,3203
26
+ biblicus/extractors/pass_through_text.py,sha256=ngDyI13RpCldP-OzV4q9lBTGPxDL6MDxp7OCo1rORyQ,2421
27
+ biblicus-0.2.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
28
+ biblicus-0.2.0.dist-info/METADATA,sha256=nTB344GRVrKuT6oPOrWBpFA_BiG3UAIgq3wCoHEVDgw,10307
29
+ biblicus-0.2.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
30
+ biblicus-0.2.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
31
+ biblicus-0.2.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
32
+ biblicus-0.2.0.dist-info/RECORD,,
@@ -1,22 +0,0 @@
1
- biblicus/__init__.py,sha256=o_1kQ7q9DCcjH7zm5MAvPx49hArnSvbr88kHKzBFMvM,432
2
- biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
3
- biblicus/cli.py,sha256=DwnvcDmjelzUq_9VMo_U_-FoBs3Si3QONVJdWGonXs4,15116
4
- biblicus/constants.py,sha256=t8p0yStpJAYPxsFlM0u5zJcQr_ARKEqEnIgNckjyF5Y,196
5
- biblicus/corpus.py,sha256=953gzT77HvYeTs2pcBXyixYRTxh65nm1JtlHVfKvCzg,30921
6
- biblicus/evaluation.py,sha256=H_W35vF5_L4B2JCfLu19VRu402tZ2pFkN2BbBP69lVY,8119
7
- biblicus/frontmatter.py,sha256=8Tqlpd3bVzZrGRB9Rdj2IwHMSJLvd2ABxMNOi3L5br4,2466
8
- biblicus/models.py,sha256=ZDb7-t9pycPpgZWVs5CcrpyeA_8OZLoQk-aflKjU7M4,10512
9
- biblicus/retrieval.py,sha256=T7HELWCNAxZ26yj7dPH8IBUaxV_gx8Ql9iwwGz0teyI,4184
10
- biblicus/sources.py,sha256=XFF75kqMyYdeYy6k8NtDnOmCxAmroW7DH6mdzWMPMuY,4358
11
- biblicus/time.py,sha256=rvp2fJXSLVmyA76GCfNKtZoifASodemJTOWN8smPt0s,486
12
- biblicus/uris.py,sha256=sRDyGmoHr_H4XR4qv_lSbQJXylYD0fNEr02H5wjomnQ,1986
13
- biblicus/backends/__init__.py,sha256=5OXKSzsn7THhwh9T5StOvEqojx_85XXuYSGdTpMK11U,1214
14
- biblicus/backends/base.py,sha256=699TKygGgL72Ifkhz1V890nOK6BslwO0-OY7xeqZl-I,1764
15
- biblicus/backends/scan.py,sha256=qvktqHIB0459sjzEO4EnS1PCXwwM19LjOx8oaDoU7DQ,9245
16
- biblicus/backends/sqlite_full_text_search.py,sha256=s_3gsEcdlxSFuluWcug4XEklwEoY42_Dgd7luY-BqqI,14152
17
- biblicus-0.1.1.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
18
- biblicus-0.1.1.dist-info/METADATA,sha256=lgvWJUgESiwWTCZ6_uUzgZeM3SkvnwjIzcsb8OE53BA,6635
19
- biblicus-0.1.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
20
- biblicus-0.1.1.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
21
- biblicus-0.1.1.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
22
- biblicus-0.1.1.dist-info/RECORD,,