biblicus 0.5.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +3 -1
- biblicus/extractors/__init__.py +2 -0
- biblicus/extractors/markitdown_text.py +128 -0
- biblicus/knowledge_base.py +191 -0
- {biblicus-0.5.0.dist-info → biblicus-0.7.0.dist-info}/METADATA +57 -4
- {biblicus-0.5.0.dist-info → biblicus-0.7.0.dist-info}/RECORD +10 -8
- {biblicus-0.5.0.dist-info → biblicus-0.7.0.dist-info}/WHEEL +0 -0
- {biblicus-0.5.0.dist-info → biblicus-0.7.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.5.0.dist-info → biblicus-0.7.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.5.0.dist-info → biblicus-0.7.0.dist-info}/top_level.txt +0 -0
biblicus/__init__.py
CHANGED
|
@@ -3,6 +3,7 @@ Biblicus public package interface.
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
from .corpus import Corpus
|
|
6
|
+
from .knowledge_base import KnowledgeBase
|
|
6
7
|
from .models import (
|
|
7
8
|
CorpusConfig,
|
|
8
9
|
Evidence,
|
|
@@ -19,10 +20,11 @@ __all__ = [
|
|
|
19
20
|
"CorpusConfig",
|
|
20
21
|
"Evidence",
|
|
21
22
|
"IngestResult",
|
|
23
|
+
"KnowledgeBase",
|
|
22
24
|
"QueryBudget",
|
|
23
25
|
"RecipeManifest",
|
|
24
26
|
"RetrievalResult",
|
|
25
27
|
"RetrievalRun",
|
|
26
28
|
]
|
|
27
29
|
|
|
28
|
-
__version__ = "0.5.0"
|
|
30
|
+
__version__ = "0.7.0"
|
biblicus/extractors/__init__.py
CHANGED
|
@@ -7,6 +7,7 @@ from __future__ import annotations
|
|
|
7
7
|
from typing import Dict
|
|
8
8
|
|
|
9
9
|
from .base import TextExtractor
|
|
10
|
+
from .markitdown_text import MarkItDownExtractor
|
|
10
11
|
from .metadata_text import MetadataTextExtractor
|
|
11
12
|
from .openai_stt import OpenAiSpeechToTextExtractor
|
|
12
13
|
from .pass_through_text import PassThroughTextExtractor
|
|
@@ -30,6 +31,7 @@ def get_extractor(extractor_id: str) -> TextExtractor:
|
|
|
30
31
|
"""
|
|
31
32
|
extractors: Dict[str, TextExtractor] = {
|
|
32
33
|
MetadataTextExtractor.extractor_id: MetadataTextExtractor(),
|
|
34
|
+
MarkItDownExtractor.extractor_id: MarkItDownExtractor(),
|
|
33
35
|
PassThroughTextExtractor.extractor_id: PassThroughTextExtractor(),
|
|
34
36
|
PipelineExtractor.extractor_id: PipelineExtractor(),
|
|
35
37
|
PortableDocumentFormatTextExtractor.extractor_id: PortableDocumentFormatTextExtractor(),
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""
|
|
2
|
+
MarkItDown-based text extraction plugin.
|
|
3
|
+
|
|
4
|
+
This extractor depends on an optional library so the core installation stays small.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import sys
|
|
10
|
+
from typing import Any, Dict, List, Optional
|
|
11
|
+
|
|
12
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
13
|
+
|
|
14
|
+
from ..corpus import Corpus
|
|
15
|
+
from ..errors import ExtractionRunFatalError
|
|
16
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
17
|
+
from .base import TextExtractor
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class MarkItDownExtractorConfig(BaseModel):
    """
    Settings accepted by the MarkItDown extractor.

    Unknown keys are rejected because the model is declared with ``extra="forbid"``.

    :ivar enable_plugins: Forwarded to ``MarkItDown(enable_plugins=...)``; off by default.
    :vartype enable_plugins: bool
    """

    # Reject unexpected keys instead of silently ignoring them.
    model_config = ConfigDict(extra="forbid")

    enable_plugins: bool = Field(default=False)
|
|
31
|
+
|
|
32
|
+
class MarkItDownExtractor(TextExtractor):
    """
    Extractor plugin backed by the optional `markitdown` library.

    This extractor converts non-text items into Markdown-like text. Items whose media type
    starts with ``text/`` are skipped so the pass-through extractor remains the canonical
    choice for text inputs and Markdown front matter handling.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "markitdown"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Validate extractor configuration and ensure the dependency is usable.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed config.
        :rtype: MarkItDownExtractorConfig
        :raises ExtractionRunFatalError: If the optional dependency is not installed, or if
            the interpreter is older than Python 3.10.
        """
        # Import lazily so the core package works without the optional dependency.
        try:
            import markitdown
            from markitdown import MarkItDown  # noqa: F401
        except ImportError as import_error:
            raise ExtractionRunFatalError(
                "MarkItDown extractor requires an optional dependency. "
                'Install it with pip install "biblicus[markitdown]".'
            ) from import_error
        # A module carrying a truthy ``__biblicus_fake__`` attribute bypasses the version
        # gate (presumably a stand-in used by tests -- confirm against the test suite).
        if sys.version_info < (3, 10) and not getattr(markitdown, "__biblicus_fake__", False):
            raise ExtractionRunFatalError(
                "MarkItDown requires Python 3.10 or higher. "
                "Upgrade your interpreter or use a compatible extractor."
            )
        return MarkItDownExtractorConfig.model_validate(config)

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Extract text for a non-text item using MarkItDown.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: MarkItDownExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
            Not used by this extractor.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or None when the item is already text.
        :rtype: ExtractedText or None
        """
        parsed_config = (
            config
            if isinstance(config, MarkItDownExtractorConfig)
            else MarkItDownExtractorConfig.model_validate(config)
        )
        _ = previous_extractions
        # ``text/markdown`` already starts with ``text/``, so one prefix test covers every
        # text media type.
        if item.media_type.startswith("text/"):
            return None

        # Imported here rather than at module level so importing this module never requires
        # the optional dependency.
        from markitdown import MarkItDown

        source_path = corpus.root / item.relpath
        converter = MarkItDown(enable_plugins=parsed_config.enable_plugins)
        conversion_result = converter.convert(str(source_path))
        extracted_text = _resolve_markitdown_text(conversion_result).strip()
        return ExtractedText(text=extracted_text, producer_extractor_id=self.extractor_id)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _resolve_markitdown_text(conversion_result: object) -> str:
|
|
113
|
+
"""
|
|
114
|
+
Resolve a text payload from a MarkItDown conversion result.
|
|
115
|
+
|
|
116
|
+
:param conversion_result: Result returned by the MarkItDown converter.
|
|
117
|
+
:type conversion_result: object
|
|
118
|
+
:return: Extracted text payload or an empty string.
|
|
119
|
+
:rtype: str
|
|
120
|
+
"""
|
|
121
|
+
if isinstance(conversion_result, str):
|
|
122
|
+
return conversion_result
|
|
123
|
+
if conversion_result is None:
|
|
124
|
+
return ""
|
|
125
|
+
text_content = getattr(conversion_result, "text_content", None)
|
|
126
|
+
if isinstance(text_content, str):
|
|
127
|
+
return text_content
|
|
128
|
+
return ""
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""
|
|
2
|
+
High-level knowledge base workflow for turnkey usage.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from tempfile import TemporaryDirectory
|
|
10
|
+
from typing import List, Optional, Sequence
|
|
11
|
+
|
|
12
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
13
|
+
|
|
14
|
+
from .backends import get_backend
|
|
15
|
+
from .context import (
|
|
16
|
+
ContextPack,
|
|
17
|
+
ContextPackPolicy,
|
|
18
|
+
TokenBudget,
|
|
19
|
+
build_context_pack,
|
|
20
|
+
fit_context_pack_to_token_budget,
|
|
21
|
+
)
|
|
22
|
+
from .corpus import Corpus
|
|
23
|
+
from .models import QueryBudget, RetrievalResult, RetrievalRun
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _default_query_budget() -> QueryBudget:
    """
    Build the query budget used when a knowledge base is created without one.

    :return: Budget of five items and 2000 characters with no per-source cap.
    :rtype: QueryBudget
    """
    return QueryBudget(
        max_total_items=5,
        max_total_characters=2000,
        max_items_per_source=None,
    )


class KnowledgeBaseDefaults(BaseModel):
    """
    Fallback settings for the knowledge base workflow.

    Unknown keys are rejected because the model is declared with ``extra="forbid"``.

    :ivar backend_id: Identifier of the retrieval backend; ``"scan"`` unless overridden.
    :vartype backend_id: str
    :ivar recipe_name: Human-readable retrieval recipe name.
    :vartype recipe_name: str
    :ivar query_budget: Query budget applied to retrieval when the caller gives none.
    :vartype query_budget: QueryBudget
    :ivar tags: Tags applied when importing the folder; empty by default.
    :vartype tags: list[str]
    """

    model_config = ConfigDict(extra="forbid")

    backend_id: str = Field(default="scan", min_length=1)
    recipe_name: str = Field(default="Knowledge base", min_length=1)
    # A factory gives every instance its own budget object.
    query_budget: QueryBudget = Field(default_factory=_default_query_budget)
    tags: List[str] = Field(default_factory=list)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass
class KnowledgeBase:
    """
    High-level knowledge base wrapper for turnkey workflows.

    :ivar corpus: Corpus instance that stores the ingested items.
    :vartype corpus: Corpus
    :ivar backend_id: Backend identifier used for retrieval.
    :vartype backend_id: str
    :ivar run: Retrieval run manifest associated with the knowledge base.
    :vartype run: RetrievalRun
    :ivar defaults: Default configuration used for this knowledge base.
    :vartype defaults: KnowledgeBaseDefaults
    """

    corpus: Corpus
    backend_id: str
    run: RetrievalRun
    defaults: KnowledgeBaseDefaults
    # Holds the temporary directory backing the corpus when no corpus root was supplied;
    # None when the caller chose the corpus location.
    _temp_dir: Optional[TemporaryDirectory]

    @classmethod
    def from_folder(
        cls,
        folder: str | Path,
        *,
        backend_id: Optional[str] = None,
        recipe_name: Optional[str] = None,
        query_budget: Optional[QueryBudget] = None,
        tags: Optional[Sequence[str]] = None,
        corpus_root: Optional[str | Path] = None,
    ) -> "KnowledgeBase":
        """
        Build a knowledge base from a folder of files.

        The folder is imported into a corpus and a retrieval run is built with the chosen
        backend. When ``corpus_root`` is omitted the corpus lives in a temporary directory
        that is kept on the returned instance.

        :param folder: Folder containing source files.
        :type folder: str or Path
        :param backend_id: Optional backend identifier override.
        :type backend_id: str or None
        :param recipe_name: Optional recipe name override.
        :type recipe_name: str or None
        :param query_budget: Optional query budget override.
        :type query_budget: QueryBudget or None
        :param tags: Optional tags to apply during import.
        :type tags: Sequence[str] or None
        :param corpus_root: Optional corpus root override.
        :type corpus_root: str or Path or None
        :return: Knowledge base instance.
        :rtype: KnowledgeBase
        :raises FileNotFoundError: If the folder does not exist.
        :raises NotADirectoryError: If the folder is not a directory.
        """
        source_root = Path(folder).resolve()
        if not source_root.exists():
            raise FileNotFoundError(f"Knowledge base folder does not exist: {source_root}")
        if not source_root.is_dir():
            raise NotADirectoryError(f"Knowledge base folder is not a directory: {source_root}")

        defaults = KnowledgeBaseDefaults()
        resolved_backend_id = backend_id or defaults.backend_id
        resolved_recipe_name = recipe_name or defaults.recipe_name
        resolved_query_budget = query_budget or defaults.query_budget
        resolved_tags = list(tags) if tags is not None else defaults.tags

        temp_dir: Optional[TemporaryDirectory] = None
        if corpus_root is None:
            temp_dir = TemporaryDirectory(prefix="biblicus-knowledge-base-")
            corpus_root_path = Path(temp_dir.name) / "corpus"
        else:
            corpus_root_path = Path(corpus_root).resolve()

        try:
            corpus = Corpus.init(corpus_root_path)
            corpus.import_tree(source_root, tags=resolved_tags)

            backend = get_backend(resolved_backend_id)
            run = backend.build_run(corpus, recipe_name=resolved_recipe_name, config={})
        except BaseException:
            # No instance will own the temporary directory if setup fails, so remove it
            # now instead of waiting for its finalizer; the original error is re-raised.
            if temp_dir is not None:
                temp_dir.cleanup()
            raise

        return cls(
            corpus=corpus,
            backend_id=resolved_backend_id,
            run=run,
            defaults=KnowledgeBaseDefaults(
                backend_id=resolved_backend_id,
                recipe_name=resolved_recipe_name,
                query_budget=resolved_query_budget,
                tags=resolved_tags,
            ),
            _temp_dir=temp_dir,
        )

    def query(self, query_text: str, *, budget: Optional[QueryBudget] = None) -> RetrievalResult:
        """
        Query the knowledge base for evidence.

        :param query_text: Query text to execute.
        :type query_text: str
        :param budget: Optional budget override; the stored default budget is used when
            omitted.
        :type budget: QueryBudget or None
        :return: Retrieval result containing evidence.
        :rtype: RetrievalResult
        """
        backend = get_backend(self.backend_id)
        resolved_budget = budget or self.defaults.query_budget
        return backend.query(
            self.corpus,
            run=self.run,
            query_text=query_text,
            budget=resolved_budget,
        )

    def context_pack(
        self,
        result: RetrievalResult,
        *,
        join_with: str = "\n\n",
        max_tokens: Optional[int] = None,
    ) -> ContextPack:
        """
        Build a context pack from a retrieval result.

        :param result: Retrieval result to convert into context.
        :type result: RetrievalResult
        :param join_with: Join string for evidence blocks.
        :type join_with: str
        :param max_tokens: Optional token budget for the context pack; when omitted the
            pack is returned without fitting.
        :type max_tokens: int or None
        :return: Context pack text and metadata.
        :rtype: ContextPack
        """
        policy = ContextPackPolicy(join_with=join_with)
        context_pack = build_context_pack(result, policy=policy)
        if max_tokens is None:
            return context_pack
        return fit_context_pack_to_token_budget(
            context_pack,
            policy=policy,
            token_budget=TokenBudget(max_tokens=max_tokens),
        )
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: biblicus
|
|
3
|
-
Version: 0.5.0
|
|
3
|
+
Version: 0.7.0
|
|
4
4
|
Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
|
|
5
5
|
License: MIT
|
|
6
6
|
Requires-Python: >=3.9
|
|
@@ -25,6 +25,8 @@ Requires-Dist: unstructured>=0.12.0; extra == "unstructured"
|
|
|
25
25
|
Requires-Dist: python-docx>=1.1.0; extra == "unstructured"
|
|
26
26
|
Provides-Extra: ocr
|
|
27
27
|
Requires-Dist: rapidocr-onnxruntime>=1.3.0; extra == "ocr"
|
|
28
|
+
Provides-Extra: markitdown
|
|
29
|
+
Requires-Dist: markitdown[all]>=0.1.0; python_version >= "3.10" and extra == "markitdown"
|
|
28
30
|
Dynamic: license-file
|
|
29
31
|
|
|
30
32
|
# Biblicus
|
|
@@ -45,6 +47,40 @@ It can be used alongside LangGraph, Tactus, Pydantic AI, any agent framework, or
|
|
|
45
47
|
|
|
46
48
|
See [retrieval augmented generation overview] for a short introduction to the idea.
|
|
47
49
|
|
|
50
|
+
## Start with a knowledge base
|
|
51
|
+
|
|
52
|
+
If you just want to hand a folder to your assistant and move on, use the high-level knowledge base interface. The folder can be nothing more than a handful of plain text files. You are not choosing a retrieval strategy yet. You are just collecting.
|
|
53
|
+
|
|
54
|
+
This example assumes a folder called `notes/` with a few `.txt` files. The knowledge base handles sensible defaults and still gives you a clear context pack for your model call.
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from biblicus.knowledge_base import KnowledgeBase
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
kb = KnowledgeBase.from_folder("notes")
|
|
61
|
+
result = kb.query("Primary button style preference")
|
|
62
|
+
context_pack = kb.context_pack(result, max_tokens=800)
|
|
63
|
+
|
|
64
|
+
print(context_pack.text)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
If you want to run a real, executable version of this story, use `scripts/readme_end_to_end_demo.py` from a fresh clone.
|
|
68
|
+
|
|
69
|
+
This simplified sequence diagram shows the same idea at a high level.
|
|
70
|
+
|
|
71
|
+
```mermaid
|
|
72
|
+
%%{init: {"theme": "base", "themeVariables": {"background": "#ffffff", "primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
73
|
+
sequenceDiagram
|
|
74
|
+
participant App as Your assistant code
|
|
75
|
+
participant KB as Knowledge base
|
|
76
|
+
participant LLM as Large language model
|
|
77
|
+
|
|
78
|
+
App->>KB: query
|
|
79
|
+
KB-->>App: evidence and context
|
|
80
|
+
App->>LLM: context plus prompt
|
|
81
|
+
LLM-->>App: response draft
|
|
82
|
+
```
|
|
83
|
+
|
|
48
84
|
## A simple mental model
|
|
49
85
|
|
|
50
86
|
Think in three stages.
|
|
@@ -72,7 +108,7 @@ In a coding assistant, retrieval is often triggered by what the user is doing ri
|
|
|
72
108
|
This diagram shows two sequential Biblicus calls. They are shown separately to make the boundaries explicit: retrieval returns evidence, and context pack building consumes evidence.
|
|
73
109
|
|
|
74
110
|
```mermaid
|
|
75
|
-
%%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
111
|
+
%%{init: {"theme": "base", "themeVariables": {"background": "#ffffff", "primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
76
112
|
sequenceDiagram
|
|
77
113
|
participant User
|
|
78
114
|
participant App as Your assistant code
|
|
@@ -126,6 +162,7 @@ Some extractors are optional so the base install stays small.
|
|
|
126
162
|
- Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
|
|
127
163
|
- Speech to text transcription: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
|
|
128
164
|
- Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
|
|
165
|
+
- MarkItDown document conversion (requires Python 3.10 or higher): `python3 -m pip install "biblicus[markitdown]"`
|
|
129
166
|
|
|
130
167
|
## Quick start
|
|
131
168
|
|
|
@@ -153,11 +190,11 @@ biblicus crawl --corpus corpora/example \\
|
|
|
153
190
|
--tag crawled
|
|
154
191
|
```
|
|
155
192
|
|
|
156
|
-
## End-to-end example:
|
|
193
|
+
## End-to-end example: lower-level control
|
|
157
194
|
|
|
158
195
|
The command-line interface returns JavaScript Object Notation by default. This makes it easy to use Biblicus in scripts and to treat retrieval as a deterministic, testable step.
|
|
159
196
|
|
|
160
|
-
|
|
197
|
+
This version shows the lower-level pieces explicitly. You are building the corpus, controlling each memory string, choosing the backend, and shaping the context pack yourself.
|
|
161
198
|
|
|
162
199
|
```python
|
|
163
200
|
from biblicus.backends import get_backend
|
|
@@ -383,6 +420,7 @@ The documents below follow the pipeline from raw items to model context:
|
|
|
383
420
|
|
|
384
421
|
- [Corpus][corpus]
|
|
385
422
|
- [Text extraction][text-extraction]
|
|
423
|
+
- [Knowledge base][knowledge-base]
|
|
386
424
|
- [Backends][backends]
|
|
387
425
|
- [Context packs][context-packs]
|
|
388
426
|
- [Testing and evaluation][testing]
|
|
@@ -432,6 +470,20 @@ Two backends are included.
|
|
|
432
470
|
- `scan` is a minimal baseline that scans raw items directly.
|
|
433
471
|
- `sqlite-full-text-search` is a practical baseline that builds a full text search index in Sqlite.
|
|
434
472
|
|
|
473
|
+
## Extraction backends
|
|
474
|
+
|
|
475
|
+
These extractors are built in. Optional ones require extra dependencies.
|
|
476
|
+
|
|
477
|
+
- `pass-through-text` reads text items and strips Markdown front matter.
|
|
478
|
+
- `metadata-text` turns catalog metadata into a small text artifact.
|
|
479
|
+
- `pdf-text` extracts text from Portable Document Format items with `pypdf`.
|
|
480
|
+
- `select-text` chooses one prior extraction result in a pipeline.
|
|
481
|
+
- `select-longest-text` chooses the longest prior extraction result.
|
|
482
|
+
- `ocr-rapidocr` does optical character recognition on images (optional).
|
|
483
|
+
- `stt-openai` performs speech to text on audio (optional).
|
|
484
|
+
- `unstructured` provides broad document parsing (optional).
|
|
485
|
+
- `markitdown` converts many formats into Markdown-like text (optional).
|
|
486
|
+
|
|
435
487
|
## Integration corpus and evaluation dataset
|
|
436
488
|
|
|
437
489
|
Use `scripts/download_wikipedia.py` to download a small integration corpus from Wikipedia when running tests or demos. The repository does not include that content.
|
|
@@ -485,6 +537,7 @@ License terms are in `LICENSE`.
|
|
|
485
537
|
[roadmap]: docs/ROADMAP.md
|
|
486
538
|
[feature-index]: docs/FEATURE_INDEX.md
|
|
487
539
|
[corpus]: docs/CORPUS.md
|
|
540
|
+
[knowledge-base]: docs/KNOWLEDGE_BASE.md
|
|
488
541
|
[text-extraction]: docs/EXTRACTION.md
|
|
489
542
|
[user-configuration]: docs/USER_CONFIGURATION.md
|
|
490
543
|
[backends]: docs/BACKENDS.md
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
biblicus/__init__.py,sha256=
|
|
1
|
+
biblicus/__init__.py,sha256=zpBSDOPXCoqBcc2QNjRWf_4dD4FKnBgUDl3j_ZG2_cA,495
|
|
2
2
|
biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
|
|
3
3
|
biblicus/cli.py,sha256=hBau464XNdSGdWeOCE2Q7dm0P8I4sR0W-NgVT0wPmh4,27724
|
|
4
4
|
biblicus/constants.py,sha256=R6fZDoLVMCwgKvTaxEx7G0CstwHGaUTlW9MsmNLDZ44,269
|
|
@@ -14,6 +14,7 @@ biblicus/hook_logging.py,sha256=IMvde-JhVWrx9tNz3eDJ1CY_rr5Sj7DZ2YNomYCZbz0,5366
|
|
|
14
14
|
biblicus/hook_manager.py,sha256=ZCAkE5wLvn4lnQz8jho_o0HGEC9KdQd9qitkAEUQRcw,6997
|
|
15
15
|
biblicus/hooks.py,sha256=OHQOmOi7rUcQqYWVeod4oPe8nVLepD7F_SlN7O_-BsE,7863
|
|
16
16
|
biblicus/ignore.py,sha256=fyjt34E6tWNNrm1FseOhgH2MgryyVBQVzxhKL5s4aio,1800
|
|
17
|
+
biblicus/knowledge_base.py,sha256=JmlJw8WD_fgstuq1PyWVzU9kzvVzyv7_xOvhS70xwUw,6654
|
|
17
18
|
biblicus/models.py,sha256=6SWQ2Czg9O3zjuam8a4m8V3LlEgcGLbEctYDB6F1rRs,15317
|
|
18
19
|
biblicus/retrieval.py,sha256=A1SI4WK5cX-WbtN6FJ0QQxqlEOtQhddLrL0LZIuoTC4,4180
|
|
19
20
|
biblicus/sources.py,sha256=EFy8-rQNLsyzz-98mH-z8gEHMYbqigcNFKLaR92KfDE,7241
|
|
@@ -28,8 +29,9 @@ biblicus/backends/__init__.py,sha256=wLXIumV51l6ZIKzjoKKeU7AgIxGOryG7T7ls3a_Fv98
|
|
|
28
29
|
biblicus/backends/base.py,sha256=Erfj9dXg0nkRKnEcNjHR9_0Ddb2B1NvbmRksVm_g1dU,1776
|
|
29
30
|
biblicus/backends/scan.py,sha256=hdNnQWqi5IH6j95w30BZHxLJ0W9PTaOkqfWJuxCCEMI,12478
|
|
30
31
|
biblicus/backends/sqlite_full_text_search.py,sha256=KgmwOiKvkA0pv7vD0V7bcOdDx_nZIOfuIN6Z4Ij7I68,16516
|
|
31
|
-
biblicus/extractors/__init__.py,sha256=
|
|
32
|
+
biblicus/extractors/__init__.py,sha256=ctf6TkGViOpxr1s1TGMs40emcXImQZ71p0uOEBvLy9s,1890
|
|
32
33
|
biblicus/extractors/base.py,sha256=ka-nz_1zHPr4TS9sU4JfOoY-PJh7lbHPBOEBrbQFGSc,2171
|
|
34
|
+
biblicus/extractors/markitdown_text.py,sha256=-7N8ebi3pYfNPnplccyy3qvsKi6uImC1xyo_dSDiD10,4546
|
|
33
35
|
biblicus/extractors/metadata_text.py,sha256=7FbEPp0K1mXc7FH1_c0KhPhPexF9U6eLd3TVY1vTp1s,3537
|
|
34
36
|
biblicus/extractors/openai_stt.py,sha256=fggErIu6YN6tXbleNTuROhfYi7zDgMd2vD_ecXZ7eXs,7162
|
|
35
37
|
biblicus/extractors/pass_through_text.py,sha256=DNxkCwpH2bbXjPGPEQwsx8kfqXi6rIxXNY_n3TU2-WI,2777
|
|
@@ -39,9 +41,9 @@ biblicus/extractors/rapidocr_text.py,sha256=OMAuZealLSSTFVVmBalT-AFJy2pEpHyyvpuW
|
|
|
39
41
|
biblicus/extractors/select_longest_text.py,sha256=wRveXAfYLdj7CpGuo4RoD7zE6SIfylRCbv40z2azO0k,3702
|
|
40
42
|
biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
|
|
41
43
|
biblicus/extractors/unstructured_text.py,sha256=l2S_wD_htu7ZHoJQNQtP-kGlEgOeKV_w2IzAC93lePE,3564
|
|
42
|
-
biblicus-0.
|
|
43
|
-
biblicus-0.
|
|
44
|
-
biblicus-0.
|
|
45
|
-
biblicus-0.
|
|
46
|
-
biblicus-0.
|
|
47
|
-
biblicus-0.
|
|
44
|
+
biblicus-0.7.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
|
|
45
|
+
biblicus-0.7.0.dist-info/METADATA,sha256=tt46S2yJOUMhhAQFvLayZmEPJ5q7hNSP4CnUGBS2eT0,22315
|
|
46
|
+
biblicus-0.7.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
47
|
+
biblicus-0.7.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
|
|
48
|
+
biblicus-0.7.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
|
|
49
|
+
biblicus-0.7.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|