biblicus 0.5.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
biblicus/__init__.py CHANGED
@@ -3,6 +3,7 @@ Biblicus public package interface.
3
3
  """
4
4
 
5
5
  from .corpus import Corpus
6
+ from .knowledge_base import KnowledgeBase
6
7
  from .models import (
7
8
  CorpusConfig,
8
9
  Evidence,
@@ -19,10 +20,11 @@ __all__ = [
19
20
  "CorpusConfig",
20
21
  "Evidence",
21
22
  "IngestResult",
23
+ "KnowledgeBase",
22
24
  "QueryBudget",
23
25
  "RecipeManifest",
24
26
  "RetrievalResult",
25
27
  "RetrievalRun",
26
28
  ]
27
29
 
28
- __version__ = "0.5.0"
30
+ __version__ = "0.7.0"
@@ -7,6 +7,7 @@ from __future__ import annotations
7
7
  from typing import Dict
8
8
 
9
9
  from .base import TextExtractor
10
+ from .markitdown_text import MarkItDownExtractor
10
11
  from .metadata_text import MetadataTextExtractor
11
12
  from .openai_stt import OpenAiSpeechToTextExtractor
12
13
  from .pass_through_text import PassThroughTextExtractor
@@ -30,6 +31,7 @@ def get_extractor(extractor_id: str) -> TextExtractor:
30
31
  """
31
32
  extractors: Dict[str, TextExtractor] = {
32
33
  MetadataTextExtractor.extractor_id: MetadataTextExtractor(),
34
+ MarkItDownExtractor.extractor_id: MarkItDownExtractor(),
33
35
  PassThroughTextExtractor.extractor_id: PassThroughTextExtractor(),
34
36
  PipelineExtractor.extractor_id: PipelineExtractor(),
35
37
  PortableDocumentFormatTextExtractor.extractor_id: PortableDocumentFormatTextExtractor(),
@@ -0,0 +1,128 @@
1
+ """
2
+ MarkItDown-based text extraction plugin.
3
+
4
+ This extractor depends on an optional library so the core installation stays small.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import sys
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ from pydantic import BaseModel, ConfigDict, Field
13
+
14
+ from ..corpus import Corpus
15
+ from ..errors import ExtractionRunFatalError
16
+ from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
17
+ from .base import TextExtractor
18
+
19
+
20
class MarkItDownExtractorConfig(BaseModel):
    """
    Settings accepted by the MarkItDown extractor.

    Unknown keys are rejected because the model forbids extra fields.

    :ivar enable_plugins: Whether MarkItDown plugins are switched on (off by default).
    :vartype enable_plugins: bool
    """

    model_config = ConfigDict(extra="forbid")

    enable_plugins: bool = Field(False)
31
+
32
class MarkItDownExtractor(TextExtractor):
    """
    Extractor plugin backed by the `markitdown` library.

    This extractor converts non-text items into Markdown-like text. It skips text items so
    the pass-through extractor remains the canonical choice for text inputs and Markdown
    front matter handling.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "markitdown"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Validate extractor configuration and ensure the dependency is installed.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed config.
        :rtype: MarkItDownExtractorConfig
        :raises ExtractionRunFatalError: If the optional dependency is not installed, or if
            the interpreter is older than Python 3.10.
        """
        # Import lazily so the core package works without the optional dependency.
        try:
            import markitdown
            from markitdown import MarkItDown  # noqa: F401
        except ImportError as import_error:
            raise ExtractionRunFatalError(
                "MarkItDown extractor requires an optional dependency. "
                'Install it with pip install "biblicus[markitdown]".'
            ) from import_error
        # A module flagged with ``__biblicus_fake__`` bypasses the interpreter version check.
        if sys.version_info < (3, 10) and not getattr(markitdown, "__biblicus_fake__", False):
            raise ExtractionRunFatalError(
                "MarkItDown requires Python 3.10 or higher. "
                "Upgrade your interpreter or use a compatible extractor."
            )
        return MarkItDownExtractorConfig.model_validate(config)

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Extract text for a non-text item using MarkItDown.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: MarkItDownExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
            They are not used by this extractor.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or None when the item is already text.
        :rtype: ExtractedText or None
        """
        parsed_config = (
            config
            if isinstance(config, MarkItDownExtractorConfig)
            else MarkItDownExtractorConfig.model_validate(config)
        )
        _ = previous_extractions
        # Every "text/..." media type, including "text/markdown", is skipped here.
        if item.media_type.startswith("text/"):
            return None

        # Imported here rather than at module level because the dependency is optional.
        from markitdown import MarkItDown

        source_path = corpus.root / item.relpath
        converter = MarkItDown(enable_plugins=parsed_config.enable_plugins)
        conversion_result = converter.convert(str(source_path))
        extracted_text = _resolve_markitdown_text(conversion_result).strip()
        return ExtractedText(text=extracted_text, producer_extractor_id=self.extractor_id)
110
+
111
+
112
def _resolve_markitdown_text(conversion_result: object) -> str:
    """
    Resolve a text payload from a MarkItDown conversion result.

    A plain string is returned unchanged. Otherwise the ``text_content`` attribute is
    used when it is a string, then the ``markdown`` attribute as a fallback. Anything
    else, including ``None``, resolves to an empty string.

    :param conversion_result: Result returned by the MarkItDown converter.
    :type conversion_result: object
    :return: Extracted text payload or an empty string.
    :rtype: str
    """
    if isinstance(conversion_result, str):
        return conversion_result
    # getattr with a default also handles a None result, so no separate None branch is needed.
    for attribute_name in ("text_content", "markdown"):
        candidate = getattr(conversion_result, attribute_name, None)
        if isinstance(candidate, str):
            return candidate
    return ""
@@ -0,0 +1,191 @@
1
+ """
2
+ High-level knowledge base workflow for turnkey usage.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass
8
+ from pathlib import Path
9
+ from tempfile import TemporaryDirectory
10
+ from typing import List, Optional, Sequence
11
+
12
+ from pydantic import BaseModel, ConfigDict, Field
13
+
14
+ from .backends import get_backend
15
+ from .context import (
16
+ ContextPack,
17
+ ContextPackPolicy,
18
+ TokenBudget,
19
+ build_context_pack,
20
+ fit_context_pack_to_token_budget,
21
+ )
22
+ from .corpus import Corpus
23
+ from .models import QueryBudget, RetrievalResult, RetrievalRun
24
+
25
+
26
def _default_query_budget() -> QueryBudget:
    """
    Build the query budget used when no override is supplied.

    :return: Budget of five items and 2000 characters with no per-source item limit.
    :rtype: QueryBudget
    """
    return QueryBudget(
        max_total_items=5,
        max_total_characters=2000,
        max_items_per_source=None,
    )


class KnowledgeBaseDefaults(BaseModel):
    """
    Fallback settings for the knowledge base workflow.

    Unknown keys are rejected because the model forbids extra fields.

    :ivar backend_id: Identifier of the retrieval backend; defaults to ``scan``.
    :vartype backend_id: str
    :ivar recipe_name: Human-readable name of the retrieval recipe.
    :vartype recipe_name: str
    :ivar query_budget: Budget applied to retrieval when the caller passes none.
    :vartype query_budget: QueryBudget
    :ivar tags: Tags applied to items when the folder is imported.
    :vartype tags: list[str]
    """

    model_config = ConfigDict(extra="forbid")

    backend_id: str = Field(default="scan", min_length=1)
    recipe_name: str = Field(default="Knowledge base", min_length=1)
    query_budget: QueryBudget = Field(default_factory=_default_query_budget)
    tags: List[str] = Field(default_factory=list)
52
+
53
+
54
@dataclass
class KnowledgeBase:
    """
    High-level knowledge base wrapper for turnkey workflows.

    :ivar corpus: Corpus instance that stores the ingested items.
    :vartype corpus: Corpus
    :ivar backend_id: Backend identifier used for retrieval.
    :vartype backend_id: str
    :ivar run: Retrieval run manifest associated with the knowledge base.
    :vartype run: RetrievalRun
    :ivar defaults: Default configuration used for this knowledge base.
    :vartype defaults: KnowledgeBaseDefaults
    """

    corpus: Corpus
    backend_id: str
    run: RetrievalRun
    defaults: KnowledgeBaseDefaults
    # Temporary directory created by ``from_folder`` when no ``corpus_root`` was given,
    # otherwise None.
    _temp_dir: Optional[TemporaryDirectory]

    @classmethod
    def from_folder(
        cls,
        folder: str | Path,
        *,
        backend_id: Optional[str] = None,
        recipe_name: Optional[str] = None,
        query_budget: Optional[QueryBudget] = None,
        tags: Optional[Sequence[str]] = None,
        corpus_root: Optional[str | Path] = None,
    ) -> "KnowledgeBase":
        """
        Build a knowledge base from a folder of files.

        When ``corpus_root`` is omitted the corpus is created inside a new temporary
        directory. That directory is removed again if building the knowledge base fails.

        :param folder: Folder containing source files.
        :type folder: str or Path
        :param backend_id: Optional backend identifier override.
        :type backend_id: str or None
        :param recipe_name: Optional recipe name override.
        :type recipe_name: str or None
        :param query_budget: Optional query budget override.
        :type query_budget: QueryBudget or None
        :param tags: Optional tags to apply during import.
        :type tags: Sequence[str] or None
        :param corpus_root: Optional corpus root override.
        :type corpus_root: str or Path or None
        :return: Knowledge base instance.
        :rtype: KnowledgeBase
        :raises FileNotFoundError: If the folder does not exist.
        :raises NotADirectoryError: If the folder is not a directory.
        """
        source_root = Path(folder).resolve()
        if not source_root.exists():
            raise FileNotFoundError(f"Knowledge base folder does not exist: {source_root}")
        if not source_root.is_dir():
            raise NotADirectoryError(f"Knowledge base folder is not a directory: {source_root}")

        defaults = KnowledgeBaseDefaults()
        # Empty strings fall back to the defaults, which require at least one character.
        resolved_backend_id = backend_id or defaults.backend_id
        resolved_recipe_name = recipe_name or defaults.recipe_name
        resolved_query_budget = query_budget if query_budget is not None else defaults.query_budget
        resolved_tags = list(tags) if tags is not None else defaults.tags

        temp_dir: Optional[TemporaryDirectory] = None
        if corpus_root is None:
            temp_dir = TemporaryDirectory(prefix="biblicus-knowledge-base-")
            corpus_root_path = Path(temp_dir.name) / "corpus"
        else:
            corpus_root_path = Path(corpus_root).resolve()

        try:
            corpus = Corpus.init(corpus_root_path)
            corpus.import_tree(source_root, tags=resolved_tags)

            backend = get_backend(resolved_backend_id)
            run = backend.build_run(corpus, recipe_name=resolved_recipe_name, config={})

            resolved_defaults = KnowledgeBaseDefaults(
                backend_id=resolved_backend_id,
                recipe_name=resolved_recipe_name,
                query_budget=resolved_query_budget,
                tags=resolved_tags,
            )
        except BaseException:
            # Do not leave a half-built temporary corpus behind when setup fails.
            if temp_dir is not None:
                temp_dir.cleanup()
            raise

        return cls(
            corpus=corpus,
            backend_id=resolved_backend_id,
            run=run,
            defaults=resolved_defaults,
            _temp_dir=temp_dir,
        )

    def query(self, query_text: str, *, budget: Optional[QueryBudget] = None) -> RetrievalResult:
        """
        Query the knowledge base for evidence.

        :param query_text: Query text to execute.
        :type query_text: str
        :param budget: Optional budget override; the stored default budget is used when omitted.
        :type budget: QueryBudget or None
        :return: Retrieval result containing evidence.
        :rtype: RetrievalResult
        """
        backend = get_backend(self.backend_id)
        resolved_budget = budget if budget is not None else self.defaults.query_budget
        return backend.query(
            self.corpus,
            run=self.run,
            query_text=query_text,
            budget=resolved_budget,
        )

    def context_pack(
        self,
        result: RetrievalResult,
        *,
        join_with: str = "\n\n",
        max_tokens: Optional[int] = None,
    ) -> ContextPack:
        """
        Build a context pack from a retrieval result.

        :param result: Retrieval result to convert into context.
        :type result: RetrievalResult
        :param join_with: Join string for evidence blocks.
        :type join_with: str
        :param max_tokens: Optional token budget for the context pack. When omitted the
            context pack is returned without being fitted to a budget.
        :type max_tokens: int or None
        :return: Context pack text and metadata.
        :rtype: ContextPack
        """
        policy = ContextPackPolicy(join_with=join_with)
        context_pack = build_context_pack(result, policy=policy)
        if max_tokens is None:
            return context_pack
        return fit_context_pack_to_token_budget(
            context_pack,
            policy=policy,
            token_budget=TokenBudget(max_tokens=max_tokens),
        )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: biblicus
3
- Version: 0.5.0
3
+ Version: 0.7.0
4
4
  Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
5
5
  License: MIT
6
6
  Requires-Python: >=3.9
@@ -25,6 +25,8 @@ Requires-Dist: unstructured>=0.12.0; extra == "unstructured"
25
25
  Requires-Dist: python-docx>=1.1.0; extra == "unstructured"
26
26
  Provides-Extra: ocr
27
27
  Requires-Dist: rapidocr-onnxruntime>=1.3.0; extra == "ocr"
28
+ Provides-Extra: markitdown
29
+ Requires-Dist: markitdown[all]>=0.1.0; python_version >= "3.10" and extra == "markitdown"
28
30
  Dynamic: license-file
29
31
 
30
32
  # Biblicus
@@ -45,6 +47,40 @@ It can be used alongside LangGraph, Tactus, Pydantic AI, any agent framework, or
45
47
 
46
48
  See [retrieval augmented generation overview] for a short introduction to the idea.
47
49
 
50
+ ## Start with a knowledge base
51
+
52
+ If you just want to hand a folder to your assistant and move on, use the high-level knowledge base interface. The folder can be nothing more than a handful of plain text files. You are not choosing a retrieval strategy yet. You are just collecting.
53
+
54
+ This example assumes a folder called `notes/` with a few `.txt` files. The knowledge base handles sensible defaults and still gives you a clear context pack for your model call.
55
+
56
+ ```python
57
+ from biblicus.knowledge_base import KnowledgeBase
58
+
59
+
60
+ kb = KnowledgeBase.from_folder("notes")
61
+ result = kb.query("Primary button style preference")
62
+ context_pack = kb.context_pack(result, max_tokens=800)
63
+
64
+ print(context_pack.text)
65
+ ```
66
+
67
+ If you want to run a real, executable version of this story, use `scripts/readme_end_to_end_demo.py` from a fresh clone.
68
+
69
+ This simplified sequence diagram shows the same idea at a high level.
70
+
71
+ ```mermaid
72
+ %%{init: {"theme": "base", "themeVariables": {"background": "#ffffff", "primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
73
+ sequenceDiagram
74
+ participant App as Your assistant code
75
+ participant KB as Knowledge base
76
+ participant LLM as Large language model
77
+
78
+ App->>KB: query
79
+ KB-->>App: evidence and context
80
+ App->>LLM: context plus prompt
81
+ LLM-->>App: response draft
82
+ ```
83
+
48
84
  ## A simple mental model
49
85
 
50
86
  Think in three stages.
@@ -72,7 +108,7 @@ In a coding assistant, retrieval is often triggered by what the user is doing ri
72
108
  This diagram shows two sequential Biblicus calls. They are shown separately to make the boundaries explicit: retrieval returns evidence, and context pack building consumes evidence.
73
109
 
74
110
  ```mermaid
75
- %%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
111
+ %%{init: {"theme": "base", "themeVariables": {"background": "#ffffff", "primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
76
112
  sequenceDiagram
77
113
  participant User
78
114
  participant App as Your assistant code
@@ -126,6 +162,7 @@ Some extractors are optional so the base install stays small.
126
162
  - Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
127
163
  - Speech to text transcription: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
128
164
  - Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
165
+ - MarkItDown document conversion (requires Python 3.10 or higher): `python3 -m pip install "biblicus[markitdown]"`
129
166
 
130
167
  ## Quick start
131
168
 
@@ -153,11 +190,11 @@ biblicus crawl --corpus corpora/example \\
153
190
  --tag crawled
154
191
  ```
155
192
 
156
- ## End-to-end example: evidence to assistant context
193
+ ## End-to-end example: lower-level control
157
194
 
158
195
  The command-line interface returns JavaScript Object Notation by default. This makes it easy to use Biblicus in scripts and to treat retrieval as a deterministic, testable step.
159
196
 
160
- Start with a few short “memories” from a chat system. Each memory is stored as a normal item in the corpus.
197
+ This version shows the lower-level pieces explicitly. You are building the corpus, controlling each memory string, choosing the backend, and shaping the context pack yourself.
161
198
 
162
199
  ```python
163
200
  from biblicus.backends import get_backend
@@ -383,6 +420,7 @@ The documents below follow the pipeline from raw items to model context:
383
420
 
384
421
  - [Corpus][corpus]
385
422
  - [Text extraction][text-extraction]
423
+ - [Knowledge base][knowledge-base]
386
424
  - [Backends][backends]
387
425
  - [Context packs][context-packs]
388
426
  - [Testing and evaluation][testing]
@@ -432,6 +470,20 @@ Two backends are included.
432
470
  - `scan` is a minimal baseline that scans raw items directly.
433
471
  - `sqlite-full-text-search` is a practical baseline that builds a full text search index in SQLite.
434
472
 
473
+ ## Extraction backends
474
+
475
+ These extractors are built in. Optional ones require extra dependencies.
476
+
477
+ - `pass-through-text` reads text items and strips Markdown front matter.
478
+ - `metadata-text` turns catalog metadata into a small text artifact.
479
+ - `pdf-text` extracts text from Portable Document Format items with `pypdf`.
480
+ - `select-text` chooses one prior extraction result in a pipeline.
481
+ - `select-longest-text` chooses the longest prior extraction result.
482
+ - `ocr-rapidocr` does optical character recognition on images (optional).
483
+ - `stt-openai` performs speech to text on audio (optional).
484
+ - `unstructured` provides broad document parsing (optional).
485
+ - `markitdown` converts many formats into Markdown-like text (optional).
486
+
435
487
  ## Integration corpus and evaluation dataset
436
488
 
437
489
  Use `scripts/download_wikipedia.py` to download a small integration corpus from Wikipedia when running tests or demos. The repository does not include that content.
@@ -485,6 +537,7 @@ License terms are in `LICENSE`.
485
537
  [roadmap]: docs/ROADMAP.md
486
538
  [feature-index]: docs/FEATURE_INDEX.md
487
539
  [corpus]: docs/CORPUS.md
540
+ [knowledge-base]: docs/KNOWLEDGE_BASE.md
488
541
  [text-extraction]: docs/EXTRACTION.md
489
542
  [user-configuration]: docs/USER_CONFIGURATION.md
490
543
  [backends]: docs/BACKENDS.md
@@ -1,4 +1,4 @@
1
- biblicus/__init__.py,sha256=9YH3nGunYPrO2wrwwya94mgHqWXnGOiIwDCB1THgGqo,432
1
+ biblicus/__init__.py,sha256=zpBSDOPXCoqBcc2QNjRWf_4dD4FKnBgUDl3j_ZG2_cA,495
2
2
  biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
3
3
  biblicus/cli.py,sha256=hBau464XNdSGdWeOCE2Q7dm0P8I4sR0W-NgVT0wPmh4,27724
4
4
  biblicus/constants.py,sha256=R6fZDoLVMCwgKvTaxEx7G0CstwHGaUTlW9MsmNLDZ44,269
@@ -14,6 +14,7 @@ biblicus/hook_logging.py,sha256=IMvde-JhVWrx9tNz3eDJ1CY_rr5Sj7DZ2YNomYCZbz0,5366
14
14
  biblicus/hook_manager.py,sha256=ZCAkE5wLvn4lnQz8jho_o0HGEC9KdQd9qitkAEUQRcw,6997
15
15
  biblicus/hooks.py,sha256=OHQOmOi7rUcQqYWVeod4oPe8nVLepD7F_SlN7O_-BsE,7863
16
16
  biblicus/ignore.py,sha256=fyjt34E6tWNNrm1FseOhgH2MgryyVBQVzxhKL5s4aio,1800
17
+ biblicus/knowledge_base.py,sha256=JmlJw8WD_fgstuq1PyWVzU9kzvVzyv7_xOvhS70xwUw,6654
17
18
  biblicus/models.py,sha256=6SWQ2Czg9O3zjuam8a4m8V3LlEgcGLbEctYDB6F1rRs,15317
18
19
  biblicus/retrieval.py,sha256=A1SI4WK5cX-WbtN6FJ0QQxqlEOtQhddLrL0LZIuoTC4,4180
19
20
  biblicus/sources.py,sha256=EFy8-rQNLsyzz-98mH-z8gEHMYbqigcNFKLaR92KfDE,7241
@@ -28,8 +29,9 @@ biblicus/backends/__init__.py,sha256=wLXIumV51l6ZIKzjoKKeU7AgIxGOryG7T7ls3a_Fv98
28
29
  biblicus/backends/base.py,sha256=Erfj9dXg0nkRKnEcNjHR9_0Ddb2B1NvbmRksVm_g1dU,1776
29
30
  biblicus/backends/scan.py,sha256=hdNnQWqi5IH6j95w30BZHxLJ0W9PTaOkqfWJuxCCEMI,12478
30
31
  biblicus/backends/sqlite_full_text_search.py,sha256=KgmwOiKvkA0pv7vD0V7bcOdDx_nZIOfuIN6Z4Ij7I68,16516
31
- biblicus/extractors/__init__.py,sha256=X3pu18QL85IBpYf56l6_5PUxFPhEN5qLTlOrxYpfGck,1776
32
+ biblicus/extractors/__init__.py,sha256=ctf6TkGViOpxr1s1TGMs40emcXImQZ71p0uOEBvLy9s,1890
32
33
  biblicus/extractors/base.py,sha256=ka-nz_1zHPr4TS9sU4JfOoY-PJh7lbHPBOEBrbQFGSc,2171
34
+ biblicus/extractors/markitdown_text.py,sha256=-7N8ebi3pYfNPnplccyy3qvsKi6uImC1xyo_dSDiD10,4546
33
35
  biblicus/extractors/metadata_text.py,sha256=7FbEPp0K1mXc7FH1_c0KhPhPexF9U6eLd3TVY1vTp1s,3537
34
36
  biblicus/extractors/openai_stt.py,sha256=fggErIu6YN6tXbleNTuROhfYi7zDgMd2vD_ecXZ7eXs,7162
35
37
  biblicus/extractors/pass_through_text.py,sha256=DNxkCwpH2bbXjPGPEQwsx8kfqXi6rIxXNY_n3TU2-WI,2777
@@ -39,9 +41,9 @@ biblicus/extractors/rapidocr_text.py,sha256=OMAuZealLSSTFVVmBalT-AFJy2pEpHyyvpuW
39
41
  biblicus/extractors/select_longest_text.py,sha256=wRveXAfYLdj7CpGuo4RoD7zE6SIfylRCbv40z2azO0k,3702
40
42
  biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
41
43
  biblicus/extractors/unstructured_text.py,sha256=l2S_wD_htu7ZHoJQNQtP-kGlEgOeKV_w2IzAC93lePE,3564
42
- biblicus-0.5.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
43
- biblicus-0.5.0.dist-info/METADATA,sha256=SHMtWua4egS09DGjX-YZviQOXojtkVvgrisgPmnlSnk,19666
44
- biblicus-0.5.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
45
- biblicus-0.5.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
46
- biblicus-0.5.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
47
- biblicus-0.5.0.dist-info/RECORD,,
44
+ biblicus-0.7.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
45
+ biblicus-0.7.0.dist-info/METADATA,sha256=tt46S2yJOUMhhAQFvLayZmEPJ5q7hNSP4CnUGBS2eT0,22315
46
+ biblicus-0.7.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
47
+ biblicus-0.7.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
48
+ biblicus-0.7.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
49
+ biblicus-0.7.0.dist-info/RECORD,,