biblicus 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +2 -2
- biblicus/_vendor/dotyaml/__init__.py +14 -0
- biblicus/_vendor/dotyaml/interpolation.py +63 -0
- biblicus/_vendor/dotyaml/loader.py +181 -0
- biblicus/_vendor/dotyaml/transformer.py +135 -0
- biblicus/backends/__init__.py +0 -2
- biblicus/backends/base.py +3 -3
- biblicus/backends/scan.py +96 -13
- biblicus/backends/sqlite_full_text_search.py +74 -14
- biblicus/cli.py +126 -19
- biblicus/constants.py +2 -0
- biblicus/corpus.py +455 -45
- biblicus/errors.py +15 -0
- biblicus/evaluation.py +4 -8
- biblicus/extraction.py +529 -0
- biblicus/extractors/__init__.py +44 -0
- biblicus/extractors/base.py +68 -0
- biblicus/extractors/metadata_text.py +106 -0
- biblicus/extractors/openai_stt.py +180 -0
- biblicus/extractors/pass_through_text.py +84 -0
- biblicus/extractors/pdf_text.py +100 -0
- biblicus/extractors/pipeline.py +105 -0
- biblicus/extractors/rapidocr_text.py +129 -0
- biblicus/extractors/select_longest_text.py +105 -0
- biblicus/extractors/select_text.py +100 -0
- biblicus/extractors/unstructured_text.py +100 -0
- biblicus/frontmatter.py +0 -3
- biblicus/hook_logging.py +180 -0
- biblicus/hook_manager.py +203 -0
- biblicus/hooks.py +261 -0
- biblicus/ignore.py +64 -0
- biblicus/models.py +107 -0
- biblicus/retrieval.py +0 -4
- biblicus/sources.py +85 -5
- biblicus/time.py +0 -1
- biblicus/uris.py +3 -4
- biblicus/user_config.py +138 -0
- biblicus-0.3.0.dist-info/METADATA +336 -0
- biblicus-0.3.0.dist-info/RECORD +44 -0
- biblicus-0.1.1.dist-info/METADATA +0 -174
- biblicus-0.1.1.dist-info/RECORD +0 -22
- {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/WHEEL +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/top_level.txt +0 -0
biblicus/sources.py
CHANGED
|
@@ -21,7 +21,6 @@ def _looks_like_uri(value: str) -> bool:
|
|
|
21
21
|
:return: True if the string has a valid uniform resource identifier scheme prefix.
|
|
22
22
|
:rtype: bool
|
|
23
23
|
"""
|
|
24
|
-
|
|
25
24
|
return "://" in value and value.split("://", 1)[0].isidentifier()
|
|
26
25
|
|
|
27
26
|
|
|
@@ -34,7 +33,6 @@ def _filename_from_url_path(path: str) -> str:
|
|
|
34
33
|
:return: Filename or a fallback name.
|
|
35
34
|
:rtype: str
|
|
36
35
|
"""
|
|
37
|
-
|
|
38
36
|
filename = Path(unquote(path)).name
|
|
39
37
|
return filename or "download"
|
|
40
38
|
|
|
@@ -48,11 +46,86 @@ def _media_type_from_filename(name: str) -> str:
|
|
|
48
46
|
:return: Guessed media type or application/octet-stream.
|
|
49
47
|
:rtype: str
|
|
50
48
|
"""
|
|
51
|
-
|
|
52
49
|
media_type, _ = mimetypes.guess_type(name)
|
|
53
50
|
return media_type or "application/octet-stream"
|
|
54
51
|
|
|
55
52
|
|
|
53
|
+
def _sniff_media_type_from_bytes(data: bytes) -> Optional[str]:
|
|
54
|
+
"""
|
|
55
|
+
Sniff a media type from leading bytes for a small set of common formats.
|
|
56
|
+
|
|
57
|
+
:param data: Raw bytes to inspect.
|
|
58
|
+
:type data: bytes
|
|
59
|
+
:return: Detected media type or None.
|
|
60
|
+
:rtype: str or None
|
|
61
|
+
"""
|
|
62
|
+
prefix = data[:32]
|
|
63
|
+
if prefix.startswith(b"%PDF-"):
|
|
64
|
+
return "application/pdf"
|
|
65
|
+
if prefix.startswith(b"\x89PNG\r\n\x1a\n"):
|
|
66
|
+
return "image/png"
|
|
67
|
+
if prefix[:3] == b"\xff\xd8\xff":
|
|
68
|
+
return "image/jpeg"
|
|
69
|
+
if prefix.startswith(b"RIFF") and prefix[8:12] == b"WAVE":
|
|
70
|
+
return "audio/x-wav"
|
|
71
|
+
if prefix.startswith(b"ID3") or (
|
|
72
|
+
len(prefix) >= 2 and prefix[0] == 0xFF and (prefix[1] & 0xE0) == 0xE0
|
|
73
|
+
):
|
|
74
|
+
return "audio/mpeg"
|
|
75
|
+
if prefix.startswith(b"OggS"):
|
|
76
|
+
return "audio/ogg"
|
|
77
|
+
if prefix.lstrip().lower().startswith(b"<!doctype html") or prefix.lstrip().lower().startswith(
|
|
78
|
+
b"<html"
|
|
79
|
+
):
|
|
80
|
+
return "text/html"
|
|
81
|
+
return None
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _normalize_media_type(*, filename: str, media_type: str) -> str:
|
|
85
|
+
"""
|
|
86
|
+
Normalize media types that are commonly mislabelled by upstream sources.
|
|
87
|
+
|
|
88
|
+
This function exists to keep the corpus usable for humans. When a source provides a filename
|
|
89
|
+
extension that users recognize (for example, ``.ogg``), Biblicus prefers a matching media type
|
|
90
|
+
so that downstream processing can make reasonable decisions.
|
|
91
|
+
|
|
92
|
+
:param filename: Filename associated with the payload.
|
|
93
|
+
:type filename: str
|
|
94
|
+
:param media_type: Media type reported or guessed for the payload.
|
|
95
|
+
:type media_type: str
|
|
96
|
+
:return: Normalized media type.
|
|
97
|
+
:rtype: str
|
|
98
|
+
"""
|
|
99
|
+
suffix = Path(filename).suffix.lower()
|
|
100
|
+
if media_type in {"application/ogg", "application/x-ogg"} and suffix in {
|
|
101
|
+
".ogg",
|
|
102
|
+
".oga",
|
|
103
|
+
".ogx",
|
|
104
|
+
}:
|
|
105
|
+
return "audio/ogg"
|
|
106
|
+
return media_type
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _ensure_extension_for_media_type(filename: str, media_type: str) -> str:
|
|
110
|
+
"""
|
|
111
|
+
Ensure the filename has a usable extension for the media type.
|
|
112
|
+
|
|
113
|
+
:param filename: Filename candidate.
|
|
114
|
+
:type filename: str
|
|
115
|
+
:param media_type: Media type to target.
|
|
116
|
+
:type media_type: str
|
|
117
|
+
:return: Filename with extension.
|
|
118
|
+
:rtype: str
|
|
119
|
+
"""
|
|
120
|
+
if Path(filename).suffix:
|
|
121
|
+
return filename
|
|
122
|
+
if media_type == "audio/ogg":
|
|
123
|
+
ext = ".ogg"
|
|
124
|
+
else:
|
|
125
|
+
ext = mimetypes.guess_extension(media_type) or ""
|
|
126
|
+
return filename + ext if ext else filename
|
|
127
|
+
|
|
128
|
+
|
|
56
129
|
@dataclass(frozen=True)
|
|
57
130
|
class SourcePayload:
|
|
58
131
|
"""
|
|
@@ -87,7 +160,6 @@ def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> Sour
|
|
|
87
160
|
:raises ValueError: If a file:// uniform resource identifier has a non-local host.
|
|
88
161
|
:raises NotImplementedError: If the uniform resource identifier scheme is unsupported.
|
|
89
162
|
"""
|
|
90
|
-
|
|
91
163
|
if isinstance(source, Path):
|
|
92
164
|
path = source.resolve()
|
|
93
165
|
media_type = _media_type_from_filename(path.name)
|
|
@@ -104,7 +176,9 @@ def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> Sour
|
|
|
104
176
|
parsed = urlparse(source)
|
|
105
177
|
if parsed.scheme == "file":
|
|
106
178
|
if parsed.netloc not in ("", "localhost"):
|
|
107
|
-
raise ValueError(
|
|
179
|
+
raise ValueError(
|
|
180
|
+
f"Unsupported file uniform resource identifier host: {parsed.netloc!r}"
|
|
181
|
+
)
|
|
108
182
|
path = Path(unquote(parsed.path)).resolve()
|
|
109
183
|
return load_source(path, source_uri=source_uri or source)
|
|
110
184
|
|
|
@@ -115,6 +189,12 @@ def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> Sour
|
|
|
115
189
|
content_type = response.headers.get("Content-Type", "").split(";", 1)[0].strip()
|
|
116
190
|
filename = _filename_from_url_path(parsed.path)
|
|
117
191
|
media_type = content_type or _media_type_from_filename(filename)
|
|
192
|
+
if media_type == "application/octet-stream":
|
|
193
|
+
sniffed = _sniff_media_type_from_bytes(response_bytes)
|
|
194
|
+
if sniffed:
|
|
195
|
+
media_type = sniffed
|
|
196
|
+
filename = _ensure_extension_for_media_type(filename, media_type)
|
|
197
|
+
media_type = _normalize_media_type(filename=filename, media_type=media_type)
|
|
118
198
|
if Path(filename).suffix.lower() in {".md", ".markdown"}:
|
|
119
199
|
media_type = "text/markdown"
|
|
120
200
|
return SourcePayload(
|
biblicus/time.py
CHANGED
biblicus/uris.py
CHANGED
|
@@ -18,7 +18,6 @@ def _looks_like_uri(value: str) -> bool:
|
|
|
18
18
|
:return: True if the string has a valid uniform resource identifier scheme prefix.
|
|
19
19
|
:rtype: bool
|
|
20
20
|
"""
|
|
21
|
-
|
|
22
21
|
return "://" in value and value.split("://", 1)[0].isidentifier()
|
|
23
22
|
|
|
24
23
|
|
|
@@ -33,7 +32,6 @@ def corpus_ref_to_path(ref: Union[str, Path]) -> Path:
|
|
|
33
32
|
:raises NotImplementedError: If a non-file uniform resource identifier scheme is used.
|
|
34
33
|
:raises ValueError: If a file:// uniform resource identifier has a non-local host.
|
|
35
34
|
"""
|
|
36
|
-
|
|
37
35
|
if isinstance(ref, Path):
|
|
38
36
|
return ref.resolve()
|
|
39
37
|
|
|
@@ -45,7 +43,9 @@ def corpus_ref_to_path(ref: Union[str, Path]) -> Path:
|
|
|
45
43
|
f"(got {parsed.scheme}://)"
|
|
46
44
|
)
|
|
47
45
|
if parsed.netloc not in ("", "localhost"):
|
|
48
|
-
raise ValueError(
|
|
46
|
+
raise ValueError(
|
|
47
|
+
f"Unsupported file uniform resource identifier host: {parsed.netloc!r}"
|
|
48
|
+
)
|
|
49
49
|
return Path(unquote(parsed.path)).resolve()
|
|
50
50
|
|
|
51
51
|
return Path(ref).resolve()
|
|
@@ -60,5 +60,4 @@ def normalize_corpus_uri(ref: Union[str, Path]) -> str:
|
|
|
60
60
|
:return: Canonical file:// uniform resource identifier.
|
|
61
61
|
:rtype: str
|
|
62
62
|
"""
|
|
63
|
-
|
|
64
63
|
return corpus_ref_to_path(ref).as_uri()
|
biblicus/user_config.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""
|
|
2
|
+
User configuration file loading for Biblicus.
|
|
3
|
+
|
|
4
|
+
User configuration is intended for small, local settings such as credentials for optional
|
|
5
|
+
integrations. It is separate from corpus configuration.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any, Dict, Optional
|
|
13
|
+
|
|
14
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
15
|
+
|
|
16
|
+
from ._vendor.dotyaml import ConfigLoader
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class OpenAiUserConfig(BaseModel):
|
|
20
|
+
"""
|
|
21
|
+
Configuration for OpenAI integrations.
|
|
22
|
+
|
|
23
|
+
:ivar api_key: OpenAI API key used for authenticated requests.
|
|
24
|
+
:vartype api_key: str
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
model_config = ConfigDict(extra="forbid")
|
|
28
|
+
|
|
29
|
+
api_key: str = Field(min_length=1)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class BiblicusUserConfig(BaseModel):
|
|
33
|
+
"""
|
|
34
|
+
Parsed user configuration for Biblicus.
|
|
35
|
+
|
|
36
|
+
:ivar openai: Optional OpenAI configuration.
|
|
37
|
+
:vartype openai: OpenAiUserConfig or None
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
model_config = ConfigDict(extra="forbid")
|
|
41
|
+
|
|
42
|
+
openai: Optional[OpenAiUserConfig] = None
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def default_user_config_paths(
|
|
46
|
+
*, cwd: Optional[Path] = None, home: Optional[Path] = None
|
|
47
|
+
) -> list[Path]:
|
|
48
|
+
"""
|
|
49
|
+
Compute the default user configuration file search paths.
|
|
50
|
+
|
|
51
|
+
The search order is:
|
|
52
|
+
|
|
53
|
+
1. Home configuration: ``~/.biblicus/config.yml``
|
|
54
|
+
2. Local configuration: ``./.biblicus/config.yml``
|
|
55
|
+
|
|
56
|
+
Local configuration overrides home configuration when both exist.
|
|
57
|
+
|
|
58
|
+
:param cwd: Optional working directory to use instead of the process current directory.
|
|
59
|
+
:type cwd: Path or None
|
|
60
|
+
:param home: Optional home directory to use instead of the current user's home directory.
|
|
61
|
+
:type home: Path or None
|
|
62
|
+
:return: Ordered list of configuration file paths.
|
|
63
|
+
:rtype: list[Path]
|
|
64
|
+
"""
|
|
65
|
+
resolved_home = (home or Path.home()).expanduser()
|
|
66
|
+
resolved_cwd = cwd or Path.cwd()
|
|
67
|
+
return [
|
|
68
|
+
resolved_home / ".biblicus" / "config.yml",
|
|
69
|
+
resolved_cwd / ".biblicus" / "config.yml",
|
|
70
|
+
]
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _deep_merge(base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]:
|
|
74
|
+
merged: Dict[str, Any] = {key: value for key, value in base.items()}
|
|
75
|
+
for key, value in override.items():
|
|
76
|
+
if key in merged and isinstance(merged[key], dict) and isinstance(value, dict):
|
|
77
|
+
merged[key] = _deep_merge(merged[key], value)
|
|
78
|
+
else:
|
|
79
|
+
merged[key] = value
|
|
80
|
+
return merged
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _load_dotyaml_data(path: Path) -> Dict[str, Any]:
|
|
84
|
+
"""
|
|
85
|
+
Load a dotyaml configuration file and return a nested mapping.
|
|
86
|
+
|
|
87
|
+
:param path: Configuration file path.
|
|
88
|
+
:type path: Path
|
|
89
|
+
:return: Parsed YAML data mapping.
|
|
90
|
+
:rtype: dict[str, Any]
|
|
91
|
+
"""
|
|
92
|
+
loader = ConfigLoader(prefix="", load_dotenv_first=False)
|
|
93
|
+
loaded = loader.load_from_yaml(path)
|
|
94
|
+
return loaded if isinstance(loaded, dict) else {}
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def load_user_config(*, paths: Optional[list[Path]] = None) -> BiblicusUserConfig:
|
|
98
|
+
"""
|
|
99
|
+
Load user configuration from known locations.
|
|
100
|
+
|
|
101
|
+
This function merges multiple configuration files in order. Later files override earlier files.
|
|
102
|
+
|
|
103
|
+
:param paths: Optional explicit search paths. When omitted, the default paths are used.
|
|
104
|
+
:type paths: list[Path] or None
|
|
105
|
+
:return: Parsed user configuration. When no files exist, the configuration is empty.
|
|
106
|
+
:rtype: BiblicusUserConfig
|
|
107
|
+
:raises ValueError: If an existing configuration file is not parseable.
|
|
108
|
+
"""
|
|
109
|
+
search_paths = paths or default_user_config_paths()
|
|
110
|
+
merged_data: Dict[str, Any] = {}
|
|
111
|
+
|
|
112
|
+
for path in search_paths:
|
|
113
|
+
if not path.is_file():
|
|
114
|
+
continue
|
|
115
|
+
loaded = _load_dotyaml_data(path)
|
|
116
|
+
merged_data = _deep_merge(merged_data, loaded)
|
|
117
|
+
|
|
118
|
+
return BiblicusUserConfig.model_validate(merged_data)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def resolve_openai_api_key(*, config: Optional[BiblicusUserConfig] = None) -> Optional[str]:
|
|
122
|
+
"""
|
|
123
|
+
Resolve an OpenAI API key from environment or user configuration.
|
|
124
|
+
|
|
125
|
+
Environment takes precedence over configuration.
|
|
126
|
+
|
|
127
|
+
:param config: Optional pre-loaded user configuration.
|
|
128
|
+
:type config: BiblicusUserConfig or None
|
|
129
|
+
:return: API key string, or None when no key is available.
|
|
130
|
+
:rtype: str or None
|
|
131
|
+
"""
|
|
132
|
+
env_key = os.environ.get("OPENAI_API_KEY")
|
|
133
|
+
if env_key:
|
|
134
|
+
return env_key
|
|
135
|
+
loaded = config or load_user_config()
|
|
136
|
+
if loaded.openai is None:
|
|
137
|
+
return None
|
|
138
|
+
return loaded.openai.api_key
|
|
@@ -0,0 +1,336 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: biblicus
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.9
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Dist: pydantic>=2.0
|
|
10
|
+
Requires-Dist: PyYAML>=6.0
|
|
11
|
+
Requires-Dist: pypdf>=4.0
|
|
12
|
+
Provides-Extra: dev
|
|
13
|
+
Requires-Dist: behave>=1.2.6; extra == "dev"
|
|
14
|
+
Requires-Dist: coverage[toml]>=7.0; extra == "dev"
|
|
15
|
+
Requires-Dist: sphinx>=7.0; extra == "dev"
|
|
16
|
+
Requires-Dist: myst-parser>=2.0; extra == "dev"
|
|
17
|
+
Requires-Dist: sphinx_rtd_theme>=2.0; extra == "dev"
|
|
18
|
+
Requires-Dist: ruff>=0.4.0; extra == "dev"
|
|
19
|
+
Requires-Dist: black>=24.0; extra == "dev"
|
|
20
|
+
Requires-Dist: python-semantic-release>=9.0.0; extra == "dev"
|
|
21
|
+
Provides-Extra: openai
|
|
22
|
+
Requires-Dist: openai>=1.0; extra == "openai"
|
|
23
|
+
Provides-Extra: unstructured
|
|
24
|
+
Requires-Dist: unstructured>=0.12.0; extra == "unstructured"
|
|
25
|
+
Requires-Dist: python-docx>=1.1.0; extra == "unstructured"
|
|
26
|
+
Provides-Extra: ocr
|
|
27
|
+
Requires-Dist: rapidocr-onnxruntime>=1.3.0; extra == "ocr"
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
# Biblicus
|
|
31
|
+
|
|
32
|
+
![Continuous integration][continuous-integration-badge]
|
|
33
|
+
![Coverage][coverage-badge]
|
|
34
|
+
![Documentation][documentation-badge]
|
|
35
|
+
|
|
36
|
+
Make your documents usable by your assistant, then decide later how you will search and retrieve them.
|
|
37
|
+
|
|
38
|
+
If you are building an assistant in Python, you probably have material you want it to use: notes, documents, web pages, and reference files. A common approach is retrieval augmented generation, where a system retrieves relevant material and uses it as evidence when generating a response.
|
|
39
|
+
|
|
40
|
+
The first practical problem is not retrieval. It is collection and care. You need a stable place to put raw items, you need a small amount of metadata so you can find them again, and you need a way to evolve your retrieval approach over time without rewriting ingestion.
|
|
41
|
+
|
|
42
|
+
This library gives you a corpus, which is a normal folder on disk. It stores each ingested item as a file, with optional metadata stored next to it. You can open and inspect the raw files directly. Any derived catalog or index can be rebuilt from the raw corpus.
|
|
43
|
+
|
|
44
|
+
It can be used alongside LangChain, Tactus, Pydantic AI, or the agent development kit. Use it from Python or from the command line interface.
|
|
45
|
+
|
|
46
|
+
See [retrieval augmented generation overview] for a short introduction to the idea.
|
|
47
|
+
|
|
48
|
+
## A beginner friendly mental model
|
|
49
|
+
|
|
50
|
+
Think in three stages.
|
|
51
|
+
|
|
52
|
+
- Ingest puts raw items into a corpus. This is file-first and human-inspectable.
|
|
53
|
+
- Extract turns items into usable text. This is where you would do text extraction from Portable Document Format files, optical character recognition for images, or speech to text for audio. If an item is already text, extraction can simply read it. Extraction outputs are derived artifacts, not edits to the raw files.
|
|
54
|
+
- Retrieve searches extracted text and returns evidence. Evidence is structured so you can turn it into context for your model call in whatever way your project prefers.
|
|
55
|
+
|
|
56
|
+
If you learn a few project words, the rest of the system becomes predictable.
|
|
57
|
+
|
|
58
|
+
- Corpus is the folder that holds raw items and their metadata.
|
|
59
|
+
- Item is the raw bytes plus optional metadata and source information.
|
|
60
|
+
- Catalog is the rebuildable index of the corpus.
|
|
61
|
+
- Extraction run is a recorded extraction build that produces text artifacts.
|
|
62
|
+
- Backend is a pluggable retrieval implementation.
|
|
63
|
+
- Run is a recorded retrieval build for a corpus.
|
|
64
|
+
- Evidence is what retrieval returns, with identifiers and source information.
|
|
65
|
+
|
|
66
|
+
## Diagram
|
|
67
|
+
|
|
68
|
+
This diagram shows how a corpus becomes evidence for an assistant.
|
|
69
|
+
Extraction is introduced here as a separate stage so you can swap extraction approaches without changing the raw corpus.
|
|
70
|
+
The legend shows what the block styles mean.
|
|
71
|
+
Your code is where you decide how to turn evidence into context and how to call a model.
|
|
72
|
+
|
|
73
|
+
```mermaid
|
|
74
|
+
%%{init: {"flowchart": {"useMaxWidth": true, "nodeSpacing": 18, "rankSpacing": 22}}}%%
|
|
75
|
+
flowchart LR
|
|
76
|
+
subgraph Legend[Legend]
|
|
77
|
+
direction LR
|
|
78
|
+
LegendArtifact[Stored artifact or evidence]
|
|
79
|
+
LegendStep[Step]
|
|
80
|
+
LegendStable[Stable region]
|
|
81
|
+
LegendPluggable[Pluggable region]
|
|
82
|
+
LegendArtifact --- LegendStep
|
|
83
|
+
LegendStable --- LegendPluggable
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
subgraph Main[" "]
|
|
87
|
+
direction TB
|
|
88
|
+
|
|
89
|
+
subgraph StableCore[Stable core]
|
|
90
|
+
direction TB
|
|
91
|
+
Source[Source items] --> Ingest[Ingest]
|
|
92
|
+
Ingest --> Raw[Raw item files]
|
|
93
|
+
Raw --> Catalog[Catalog file]
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
subgraph PluggableExtractionPipeline[Pluggable extraction pipeline]
|
|
97
|
+
direction TB
|
|
98
|
+
Catalog --> Extract[Extract pipeline]
|
|
99
|
+
Extract --> ExtractedText[Extracted text artifacts]
|
|
100
|
+
ExtractedText --> ExtractionRun[Extraction run manifest]
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
subgraph PluggableRetrievalBackend[Pluggable retrieval backend]
|
|
104
|
+
direction LR
|
|
105
|
+
|
|
106
|
+
subgraph BackendIngestionIndexing[Ingestion and indexing]
|
|
107
|
+
direction TB
|
|
108
|
+
ExtractionRun --> Build[Build run]
|
|
109
|
+
Build --> BackendIndex[Backend index]
|
|
110
|
+
BackendIndex --> Run[Run manifest]
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
subgraph BackendRetrievalGeneration[Retrieval and generation]
|
|
114
|
+
direction TB
|
|
115
|
+
Run --> Query[Query]
|
|
116
|
+
Query --> Evidence[Evidence]
|
|
117
|
+
end
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
Evidence --> Context
|
|
121
|
+
|
|
122
|
+
subgraph YourCode[Your code]
|
|
123
|
+
direction TB
|
|
124
|
+
Context[Assistant context] --> Model[Large language model call]
|
|
125
|
+
Model --> Answer[Answer]
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
style StableCore fill:#ffffff,stroke:#8e24aa,stroke-width:2px,color:#111111
|
|
129
|
+
style PluggableExtractionPipeline fill:#ffffff,stroke:#5e35b1,stroke-dasharray:6 3,stroke-width:2px,color:#111111
|
|
130
|
+
style PluggableRetrievalBackend fill:#ffffff,stroke:#1e88e5,stroke-dasharray:6 3,stroke-width:2px,color:#111111
|
|
131
|
+
style YourCode fill:#ffffff,stroke:#d81b60,stroke-width:2px,color:#111111
|
|
132
|
+
style BackendIngestionIndexing fill:#ffffff,stroke:#cfd8dc,color:#111111
|
|
133
|
+
style BackendRetrievalGeneration fill:#ffffff,stroke:#cfd8dc,color:#111111
|
|
134
|
+
|
|
135
|
+
style Raw fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
136
|
+
style Catalog fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
137
|
+
style ExtractedText fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
138
|
+
style ExtractionRun fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
139
|
+
style BackendIndex fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
140
|
+
style Run fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
141
|
+
style Evidence fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
142
|
+
style Context fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
143
|
+
style Answer fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
144
|
+
style Source fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
145
|
+
|
|
146
|
+
style Ingest fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
147
|
+
style Extract fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
148
|
+
style Build fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
149
|
+
style Query fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
150
|
+
style Model fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
style Legend fill:#ffffff,stroke:#ffffff,color:#111111
|
|
154
|
+
style Main fill:#ffffff,stroke:#ffffff,color:#111111
|
|
155
|
+
style LegendArtifact fill:#f3e5f5,stroke:#8e24aa,color:#111111
|
|
156
|
+
style LegendStep fill:#eceff1,stroke:#90a4ae,color:#111111
|
|
157
|
+
style LegendStable fill:#ffffff,stroke:#8e24aa,stroke-width:2px,color:#111111
|
|
158
|
+
style LegendPluggable fill:#ffffff,stroke:#1e88e5,stroke-dasharray:6 3,stroke-width:2px,color:#111111
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## Practical value
|
|
162
|
+
|
|
163
|
+
- You can ingest raw material once, then try many retrieval approaches over time.
|
|
164
|
+
- You can keep raw files readable and portable, without locking your data inside a database.
|
|
165
|
+
- You can evaluate retrieval runs against shared datasets and compare backends using the same corpus.
|
|
166
|
+
|
|
167
|
+
## Typical flow
|
|
168
|
+
|
|
169
|
+
- Initialize a corpus folder.
|
|
170
|
+
- Ingest items from file paths, web addresses, or text input.
|
|
171
|
+
- Run extraction when you want derived text artifacts from non-text sources.
|
|
172
|
+
- Reindex to refresh the catalog after edits.
|
|
173
|
+
- Build a retrieval run with a backend.
|
|
174
|
+
- Query the run to collect evidence and evaluate it with datasets.
|
|
175
|
+
|
|
176
|
+
## Install
|
|
177
|
+
|
|
178
|
+
This repository is a working Python package. Install it into a virtual environment from the repository root.
|
|
179
|
+
|
|
180
|
+
```
|
|
181
|
+
python3 -m pip install -e .
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
After the first release, you can install it from Python Package Index.
|
|
185
|
+
|
|
186
|
+
```
|
|
187
|
+
python3 -m pip install biblicus
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
### Optional extras
|
|
191
|
+
|
|
192
|
+
Some extractors are optional so the base install stays small.
|
|
193
|
+
|
|
194
|
+
- Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
|
|
195
|
+
- Speech to text transcription: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
|
|
196
|
+
- Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
|
|
197
|
+
|
|
198
|
+
## Quick start
|
|
199
|
+
|
|
200
|
+
```
|
|
201
|
+
mkdir -p notes
|
|
202
|
+
echo "A small file note" > notes/example.txt
|
|
203
|
+
|
|
204
|
+
biblicus init corpora/example
|
|
205
|
+
biblicus ingest --corpus corpora/example notes/example.txt
|
|
206
|
+
echo "A short note" | biblicus ingest --corpus corpora/example --stdin --title "First note"
|
|
207
|
+
biblicus list --corpus corpora/example
|
|
208
|
+
biblicus extract --corpus corpora/example --step pass-through-text --step metadata-text
|
|
209
|
+
biblicus build --corpus corpora/example --backend scan
|
|
210
|
+
biblicus query --corpus corpora/example --query "note"
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
## Python usage
|
|
214
|
+
|
|
215
|
+
From Python, the same flow is available through the Corpus class and backend interfaces. The public surface area is small on purpose.
|
|
216
|
+
|
|
217
|
+
- Create a corpus with `Corpus.init` or open one with `Corpus.open`.
|
|
218
|
+
- Ingest notes with `Corpus.ingest_note`.
|
|
219
|
+
- Ingest files or web addresses with `Corpus.ingest_source`.
|
|
220
|
+
- List items with `Corpus.list_items`.
|
|
221
|
+
- Build a retrieval run with `get_backend` and `backend.build_run`.
|
|
222
|
+
- Query a run with `backend.query`.
|
|
223
|
+
- Evaluate with `evaluate_run`.
|
|
224
|
+
|
|
225
|
+
## How it fits into an assistant
|
|
226
|
+
|
|
227
|
+
In an assistant system, retrieval usually produces context for a model call. This library treats evidence as the primary output so you can decide how to use it.
|
|
228
|
+
|
|
229
|
+
- Use a corpus as the source of truth for raw items.
|
|
230
|
+
- Use a backend run to build any derived artifacts needed for retrieval.
|
|
231
|
+
- Use queries to obtain evidence objects.
|
|
232
|
+
- Convert evidence into the format your framework expects, such as message content, tool output, or citations.
|
|
233
|
+
|
|
234
|
+
## Learn more
|
|
235
|
+
|
|
236
|
+
Full documentation is available on [ReadTheDocs](https://biblicus.readthedocs.io/).
|
|
237
|
+
|
|
238
|
+
The documents below are written to be read in order.
|
|
239
|
+
|
|
240
|
+
- [Architecture][architecture]
|
|
241
|
+
- [Roadmap][roadmap]
|
|
242
|
+
- [Feature index][feature-index]
|
|
243
|
+
- [Corpus][corpus]
|
|
244
|
+
- [Text extraction][text-extraction]
|
|
245
|
+
- [User configuration][user-configuration]
|
|
246
|
+
- [Backends][backends]
|
|
247
|
+
- [Demos][demos]
|
|
248
|
+
- [Testing][testing]
|
|
249
|
+
|
|
250
|
+
## Metadata and catalog
|
|
251
|
+
|
|
252
|
+
Raw items are stored as files in the corpus raw directory. Metadata can live in a Markdown front matter block or a sidecar file with the suffix `.biblicus.yml`. The catalog lives in `.biblicus/catalog.json` and can be rebuilt at any time with `biblicus reindex`.
|
|
253
|
+
|
|
254
|
+
## Corpus layout
|
|
255
|
+
|
|
256
|
+
```
|
|
257
|
+
corpus/
|
|
258
|
+
raw/
|
|
259
|
+
item.bin
|
|
260
|
+
item.bin.biblicus.yml
|
|
261
|
+
.biblicus/
|
|
262
|
+
config.json
|
|
263
|
+
catalog.json
|
|
264
|
+
runs/
|
|
265
|
+
run-id.json
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
## Retrieval backends
|
|
269
|
+
|
|
270
|
+
Two backends are included.
|
|
271
|
+
|
|
272
|
+
- `scan` is a minimal baseline that scans raw items directly.
|
|
273
|
+
- `sqlite-full-text-search` is a practical baseline that builds a full text search index in SQLite.
|
|
274
|
+
|
|
275
|
+
## Integration corpus and evaluation dataset
|
|
276
|
+
|
|
277
|
+
Use `scripts/download_wikipedia.py` to download a small integration corpus from Wikipedia when running tests or demos. The repository does not include that content.
|
|
278
|
+
|
|
279
|
+
The dataset file `datasets/wikipedia_mini.json` provides a small evaluation set that matches the integration corpus.
|
|
280
|
+
|
|
281
|
+
Use `scripts/download_pdf_samples.py` to download a small Portable Document Format integration corpus when running tests or demos. The repository does not include that content.
|
|
282
|
+
|
|
283
|
+
## Tests and coverage
|
|
284
|
+
|
|
285
|
+
```
|
|
286
|
+
python3 scripts/test.py
|
|
287
|
+
```
|
|
288
|
+
|
|
289
|
+
To include integration scenarios that download public test data at runtime, run this command.
|
|
290
|
+
|
|
291
|
+
```
|
|
292
|
+
python3 scripts/test.py --integration
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
## Releases
|
|
296
|
+
|
|
297
|
+
Releases are automated from the main branch using semantic versioning and conventional commit messages.
|
|
298
|
+
|
|
299
|
+
The release pipeline publishes a GitHub release and uploads the package to Python Package Index when continuous integration succeeds.
|
|
300
|
+
|
|
301
|
+
Publishing uses a Python Package Index token stored in the GitHub secret named PYPI_TOKEN.
|
|
302
|
+
|
|
303
|
+
## Documentation
|
|
304
|
+
|
|
305
|
+
Reference documentation is generated from Sphinx style docstrings.
|
|
306
|
+
|
|
307
|
+
Install development dependencies:
|
|
308
|
+
|
|
309
|
+
```
|
|
310
|
+
python3 -m pip install -e ".[dev]"
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
Build the documentation:
|
|
314
|
+
|
|
315
|
+
```
|
|
316
|
+
python3 -m sphinx -b html docs docs/_build
|
|
317
|
+
```
|
|
318
|
+
|
|
319
|
+
## License
|
|
320
|
+
|
|
321
|
+
License terms are in `LICENSE`.
|
|
322
|
+
|
|
323
|
+
[retrieval augmented generation overview]: https://en.wikipedia.org/wiki/Retrieval-augmented_generation
|
|
324
|
+
[architecture]: docs/ARCHITECTURE.md
|
|
325
|
+
[roadmap]: docs/ROADMAP.md
|
|
326
|
+
[feature-index]: docs/FEATURE_INDEX.md
|
|
327
|
+
[corpus]: docs/CORPUS.md
|
|
328
|
+
[text-extraction]: docs/EXTRACTION.md
|
|
329
|
+
[user-configuration]: docs/USER_CONFIGURATION.md
|
|
330
|
+
[backends]: docs/BACKENDS.md
|
|
331
|
+
[demos]: docs/DEMOS.md
|
|
332
|
+
[testing]: docs/TESTING.md
|
|
333
|
+
|
|
334
|
+
[continuous-integration-badge]: https://github.com/AnthusAI/Biblicus/actions/workflows/ci.yml/badge.svg?branch=main
|
|
335
|
+
[coverage-badge]: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/AnthusAI/Biblicus/main/coverage_badge.json
|
|
336
|
+
[documentation-badge]: https://readthedocs.org/projects/biblicus/badge/?version=latest
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
biblicus/__init__.py,sha256=1vPJokNgr7JcDO9eJ2SRR8VLkFG44ZaSACSaalogvYQ,432
|
|
2
|
+
biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
|
|
3
|
+
biblicus/cli.py,sha256=k09mMToSawDC7TbetwtK0RItTLO84EOJCZQKDRA-b9Y,19229
|
|
4
|
+
biblicus/constants.py,sha256=R6fZDoLVMCwgKvTaxEx7G0CstwHGaUTlW9MsmNLDZ44,269
|
|
5
|
+
biblicus/corpus.py,sha256=oBg5nbDoDBkkXaW180ixtvU9Yh0y9nOiZDEMKomtrVU,47688
|
|
6
|
+
biblicus/errors.py,sha256=uMajd5DvgnJ_-jq5sbeom1GV8DPUc-kojBaECFi6CsY,467
|
|
7
|
+
biblicus/evaluation.py,sha256=5xWpb-8f49Osh9aHzo1ab3AXOmls3Imc5rdnEC0pN-8,8143
|
|
8
|
+
biblicus/extraction.py,sha256=MYaHkhj0NWBKNcaohnLvNiHLwyps9JyZGaTxX5gHR-A,19281
|
|
9
|
+
biblicus/frontmatter.py,sha256=JOGjIDzbbOkebQw2RzA-3WDVMAMtJta2INjS4e7-LMg,2463
|
|
10
|
+
biblicus/hook_logging.py,sha256=IMvde-JhVWrx9tNz3eDJ1CY_rr5Sj7DZ2YNomYCZbz0,5366
|
|
11
|
+
biblicus/hook_manager.py,sha256=ZCAkE5wLvn4lnQz8jho_o0HGEC9KdQd9qitkAEUQRcw,6997
|
|
12
|
+
biblicus/hooks.py,sha256=OHQOmOi7rUcQqYWVeod4oPe8nVLepD7F_SlN7O_-BsE,7863
|
|
13
|
+
biblicus/ignore.py,sha256=fyjt34E6tWNNrm1FseOhgH2MgryyVBQVzxhKL5s4aio,1800
|
|
14
|
+
biblicus/models.py,sha256=fdpPRtWmtirjEKpOPL_6ZVRY0vpA2WRqMwNrOqPaauM,14204
|
|
15
|
+
biblicus/retrieval.py,sha256=A1SI4WK5cX-WbtN6FJ0QQxqlEOtQhddLrL0LZIuoTC4,4180
|
|
16
|
+
biblicus/sources.py,sha256=EFy8-rQNLsyzz-98mH-z8gEHMYbqigcNFKLaR92KfDE,7241
|
|
17
|
+
biblicus/time.py,sha256=NEHkJLJ3RH1PdJVAWMYbNCBnCb6UW9DVBLo7Qh1zO88,485
|
|
18
|
+
biblicus/uris.py,sha256=xXD77lqsT9NxbyzI1spX9Y5a3-U6sLYMnpeSAV7g-nM,2013
|
|
19
|
+
biblicus/user_config.py,sha256=DqO08yLn82DhTiFpmIyyLj_J0nMbrtE8xieTj2Cgd6A,4287
|
|
20
|
+
biblicus/_vendor/dotyaml/__init__.py,sha256=e4zbejeJRwlD4I0q3YvotMypO19lXqmT8iyU1q6SvhY,376
|
|
21
|
+
biblicus/_vendor/dotyaml/interpolation.py,sha256=PfUAEEOTFobv7Ox0E6nAxht6BqhHIDe4hP32fZn5TOs,1992
|
|
22
|
+
biblicus/_vendor/dotyaml/loader.py,sha256=KePkjyhKZSvQZphmlmlzTYZJBQsqL5qhtGV1y7G6wzM,5624
|
|
23
|
+
biblicus/_vendor/dotyaml/transformer.py,sha256=2AKPS8DMOPuYtzmM-dlwIqVbARfbBH5jYV1m5qpR49E,3725
|
|
24
|
+
biblicus/backends/__init__.py,sha256=wLXIumV51l6ZIKzjoKKeU7AgIxGOryG7T7ls3a_Fv98,1212
|
|
25
|
+
biblicus/backends/base.py,sha256=Erfj9dXg0nkRKnEcNjHR9_0Ddb2B1NvbmRksVm_g1dU,1776
|
|
26
|
+
biblicus/backends/scan.py,sha256=hdNnQWqi5IH6j95w30BZHxLJ0W9PTaOkqfWJuxCCEMI,12478
|
|
27
|
+
biblicus/backends/sqlite_full_text_search.py,sha256=KgmwOiKvkA0pv7vD0V7bcOdDx_nZIOfuIN6Z4Ij7I68,16516
|
|
28
|
+
biblicus/extractors/__init__.py,sha256=X3pu18QL85IBpYf56l6_5PUxFPhEN5qLTlOrxYpfGck,1776
|
|
29
|
+
biblicus/extractors/base.py,sha256=ka-nz_1zHPr4TS9sU4JfOoY-PJh7lbHPBOEBrbQFGSc,2171
|
|
30
|
+
biblicus/extractors/metadata_text.py,sha256=7FbEPp0K1mXc7FH1_c0KhPhPexF9U6eLd3TVY1vTp1s,3537
|
|
31
|
+
biblicus/extractors/openai_stt.py,sha256=fggErIu6YN6tXbleNTuROhfYi7zDgMd2vD_ecXZ7eXs,7162
|
|
32
|
+
biblicus/extractors/pass_through_text.py,sha256=DNxkCwpH2bbXjPGPEQwsx8kfqXi6rIxXNY_n3TU2-WI,2777
|
|
33
|
+
biblicus/extractors/pdf_text.py,sha256=YtUphgLVxyWJXew6ZsJ8wBRh67Y5ri4ZTRlMmq3g1Bk,3255
|
|
34
|
+
biblicus/extractors/pipeline.py,sha256=LY6eM3ypw50MDB2cPEQqZrjxkhVvIc6sv4UEhHdNDrE,3208
|
|
35
|
+
biblicus/extractors/rapidocr_text.py,sha256=OMAuZealLSSTFVVmBalT-AFJy2pEpHyyvpuWxlnY-GU,4531
|
|
36
|
+
biblicus/extractors/select_longest_text.py,sha256=wRveXAfYLdj7CpGuo4RoD7zE6SIfylRCbv40z2azO0k,3702
|
|
37
|
+
biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
|
|
38
|
+
biblicus/extractors/unstructured_text.py,sha256=l2S_wD_htu7ZHoJQNQtP-kGlEgOeKV_w2IzAC93lePE,3564
|
|
39
|
+
biblicus-0.3.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
|
|
40
|
+
biblicus-0.3.0.dist-info/METADATA,sha256=MHE8tAh9jGiMwk5X9jPSnhRFB6uAZa3T8jo_c1zrIZM,13202
|
|
41
|
+
biblicus-0.3.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
42
|
+
biblicus-0.3.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
|
|
43
|
+
biblicus-0.3.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
|
|
44
|
+
biblicus-0.3.0.dist-info/RECORD,,
|