biblicus 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/extractors/__init__.py +2 -0
- biblicus/extractors/markitdown_text.py +128 -0
- {biblicus-0.6.0.dist-info → biblicus-0.7.0.dist-info}/METADATA +20 -3
- {biblicus-0.6.0.dist-info → biblicus-0.7.0.dist-info}/RECORD +9 -8
- {biblicus-0.6.0.dist-info → biblicus-0.7.0.dist-info}/WHEEL +0 -0
- {biblicus-0.6.0.dist-info → biblicus-0.7.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.6.0.dist-info → biblicus-0.7.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.6.0.dist-info → biblicus-0.7.0.dist-info}/top_level.txt +0 -0
biblicus/__init__.py
CHANGED
biblicus/extractors/__init__.py
CHANGED
|
@@ -7,6 +7,7 @@ from __future__ import annotations
|
|
|
7
7
|
from typing import Dict
|
|
8
8
|
|
|
9
9
|
from .base import TextExtractor
|
|
10
|
+
from .markitdown_text import MarkItDownExtractor
|
|
10
11
|
from .metadata_text import MetadataTextExtractor
|
|
11
12
|
from .openai_stt import OpenAiSpeechToTextExtractor
|
|
12
13
|
from .pass_through_text import PassThroughTextExtractor
|
|
@@ -30,6 +31,7 @@ def get_extractor(extractor_id: str) -> TextExtractor:
|
|
|
30
31
|
"""
|
|
31
32
|
extractors: Dict[str, TextExtractor] = {
|
|
32
33
|
MetadataTextExtractor.extractor_id: MetadataTextExtractor(),
|
|
34
|
+
MarkItDownExtractor.extractor_id: MarkItDownExtractor(),
|
|
33
35
|
PassThroughTextExtractor.extractor_id: PassThroughTextExtractor(),
|
|
34
36
|
PipelineExtractor.extractor_id: PipelineExtractor(),
|
|
35
37
|
PortableDocumentFormatTextExtractor.extractor_id: PortableDocumentFormatTextExtractor(),
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""
|
|
2
|
+
MarkItDown-based text extraction plugin.
|
|
3
|
+
|
|
4
|
+
This extractor depends on an optional library so the core installation stays small.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import sys
|
|
10
|
+
from typing import Any, Dict, List, Optional
|
|
11
|
+
|
|
12
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
13
|
+
|
|
14
|
+
from ..corpus import Corpus
|
|
15
|
+
from ..errors import ExtractionRunFatalError
|
|
16
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
17
|
+
from .base import TextExtractor
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class MarkItDownExtractorConfig(BaseModel):
|
|
21
|
+
"""
|
|
22
|
+
Configuration for the MarkItDown extractor.
|
|
23
|
+
|
|
24
|
+
:ivar enable_plugins: Whether to enable MarkItDown plugins.
|
|
25
|
+
:vartype enable_plugins: bool
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
model_config = ConfigDict(extra="forbid")
|
|
29
|
+
|
|
30
|
+
enable_plugins: bool = Field(default=False)
|
|
31
|
+
|
|
32
|
+
class MarkItDownExtractor(TextExtractor):
|
|
33
|
+
"""
|
|
34
|
+
Extractor plugin backed by the `markitdown` library.
|
|
35
|
+
|
|
36
|
+
This extractor converts non-text items into Markdown-like text. It skips text items so
|
|
37
|
+
the pass-through extractor remains the canonical choice for text inputs and Markdown
|
|
38
|
+
front matter handling.
|
|
39
|
+
|
|
40
|
+
:ivar extractor_id: Extractor identifier.
|
|
41
|
+
:vartype extractor_id: str
|
|
42
|
+
"""
|
|
43
|
+
|
|
44
|
+
extractor_id = "markitdown"
|
|
45
|
+
|
|
46
|
+
def validate_config(self, config: Dict[str, Any]) -> BaseModel:
|
|
47
|
+
"""
|
|
48
|
+
Validate extractor configuration and ensure the dependency is installed.
|
|
49
|
+
|
|
50
|
+
:param config: Configuration mapping.
|
|
51
|
+
:type config: dict[str, Any]
|
|
52
|
+
:return: Parsed config.
|
|
53
|
+
:rtype: MarkItDownExtractorConfig
|
|
54
|
+
:raises ExtractionRunFatalError: If the optional dependency is not installed.
|
|
55
|
+
"""
|
|
56
|
+
try:
|
|
57
|
+
import markitdown
|
|
58
|
+
from markitdown import MarkItDown # noqa: F401
|
|
59
|
+
except ImportError as import_error:
|
|
60
|
+
raise ExtractionRunFatalError(
|
|
61
|
+
"MarkItDown extractor requires an optional dependency. "
|
|
62
|
+
'Install it with pip install "biblicus[markitdown]".'
|
|
63
|
+
) from import_error
|
|
64
|
+
if sys.version_info < (3, 10) and not getattr(markitdown, "__biblicus_fake__", False):
|
|
65
|
+
raise ExtractionRunFatalError(
|
|
66
|
+
"MarkItDown requires Python 3.10 or higher. "
|
|
67
|
+
"Upgrade your interpreter or use a compatible extractor."
|
|
68
|
+
)
|
|
69
|
+
return MarkItDownExtractorConfig.model_validate(config)
|
|
70
|
+
|
|
71
|
+
def extract_text(
|
|
72
|
+
self,
|
|
73
|
+
*,
|
|
74
|
+
corpus: Corpus,
|
|
75
|
+
item: CatalogItem,
|
|
76
|
+
config: BaseModel,
|
|
77
|
+
previous_extractions: List[ExtractionStepOutput],
|
|
78
|
+
) -> Optional[ExtractedText]:
|
|
79
|
+
"""
|
|
80
|
+
Extract text for a non-text item using MarkItDown.
|
|
81
|
+
|
|
82
|
+
:param corpus: Corpus containing the item bytes.
|
|
83
|
+
:type corpus: Corpus
|
|
84
|
+
:param item: Catalog item being processed.
|
|
85
|
+
:type item: CatalogItem
|
|
86
|
+
:param config: Parsed configuration model.
|
|
87
|
+
:type config: MarkItDownExtractorConfig
|
|
88
|
+
:param previous_extractions: Prior step outputs for this item within the pipeline.
|
|
89
|
+
:type previous_extractions: list[biblicus.models.ExtractionStepOutput]
|
|
90
|
+
:return: Extracted text payload, or None when the item is already text.
|
|
91
|
+
:rtype: ExtractedText or None
|
|
92
|
+
"""
|
|
93
|
+
parsed_config = (
|
|
94
|
+
config
|
|
95
|
+
if isinstance(config, MarkItDownExtractorConfig)
|
|
96
|
+
else MarkItDownExtractorConfig.model_validate(config)
|
|
97
|
+
)
|
|
98
|
+
_ = previous_extractions
|
|
99
|
+
media_type = item.media_type
|
|
100
|
+
if media_type == "text/markdown" or media_type.startswith("text/"):
|
|
101
|
+
return None
|
|
102
|
+
|
|
103
|
+
from markitdown import MarkItDown
|
|
104
|
+
|
|
105
|
+
source_path = corpus.root / item.relpath
|
|
106
|
+
converter = MarkItDown(enable_plugins=parsed_config.enable_plugins)
|
|
107
|
+
conversion_result = converter.convert(str(source_path))
|
|
108
|
+
extracted_text = _resolve_markitdown_text(conversion_result).strip()
|
|
109
|
+
return ExtractedText(text=extracted_text, producer_extractor_id=self.extractor_id)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _resolve_markitdown_text(conversion_result: object) -> str:
|
|
113
|
+
"""
|
|
114
|
+
Resolve a text payload from a MarkItDown conversion result.
|
|
115
|
+
|
|
116
|
+
:param conversion_result: Result returned by the MarkItDown converter.
|
|
117
|
+
:type conversion_result: object
|
|
118
|
+
:return: Extracted text payload or an empty string.
|
|
119
|
+
:rtype: str
|
|
120
|
+
"""
|
|
121
|
+
if isinstance(conversion_result, str):
|
|
122
|
+
return conversion_result
|
|
123
|
+
if conversion_result is None:
|
|
124
|
+
return ""
|
|
125
|
+
text_content = getattr(conversion_result, "text_content", None)
|
|
126
|
+
if isinstance(text_content, str):
|
|
127
|
+
return text_content
|
|
128
|
+
return ""
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: biblicus
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.7.0
|
|
4
4
|
Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
|
|
5
5
|
License: MIT
|
|
6
6
|
Requires-Python: >=3.9
|
|
@@ -25,6 +25,8 @@ Requires-Dist: unstructured>=0.12.0; extra == "unstructured"
|
|
|
25
25
|
Requires-Dist: python-docx>=1.1.0; extra == "unstructured"
|
|
26
26
|
Provides-Extra: ocr
|
|
27
27
|
Requires-Dist: rapidocr-onnxruntime>=1.3.0; extra == "ocr"
|
|
28
|
+
Provides-Extra: markitdown
|
|
29
|
+
Requires-Dist: markitdown[all]>=0.1.0; python_version >= "3.10" and extra == "markitdown"
|
|
28
30
|
Dynamic: license-file
|
|
29
31
|
|
|
30
32
|
# Biblicus
|
|
@@ -67,7 +69,7 @@ If you want to run a real, executable version of this story, use `scripts/readme
|
|
|
67
69
|
This simplified sequence diagram shows the same idea at a high level.
|
|
68
70
|
|
|
69
71
|
```mermaid
|
|
70
|
-
%%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
72
|
+
%%{init: {"theme": "base", "themeVariables": {"background": "#ffffff", "primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
71
73
|
sequenceDiagram
|
|
72
74
|
participant App as Your assistant code
|
|
73
75
|
participant KB as Knowledge base
|
|
@@ -106,7 +108,7 @@ In a coding assistant, retrieval is often triggered by what the user is doing ri
|
|
|
106
108
|
This diagram shows two sequential Biblicus calls. They are shown separately to make the boundaries explicit: retrieval returns evidence, and context pack building consumes evidence.
|
|
107
109
|
|
|
108
110
|
```mermaid
|
|
109
|
-
%%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
111
|
+
%%{init: {"theme": "base", "themeVariables": {"background": "#ffffff", "primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
110
112
|
sequenceDiagram
|
|
111
113
|
participant User
|
|
112
114
|
participant App as Your assistant code
|
|
@@ -160,6 +162,7 @@ Some extractors are optional so the base install stays small.
|
|
|
160
162
|
- Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
|
|
161
163
|
- Speech to text transcription: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
|
|
162
164
|
- Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
|
|
165
|
+
- MarkItDown document conversion (requires Python 3.10 or higher): `python3 -m pip install "biblicus[markitdown]"`
|
|
163
166
|
|
|
164
167
|
## Quick start
|
|
165
168
|
|
|
@@ -467,6 +470,20 @@ Two backends are included.
|
|
|
467
470
|
- `scan` is a minimal baseline that scans raw items directly.
|
|
468
471
|
- `sqlite-full-text-search` is a practical baseline that builds a full text search index in Sqlite.
|
|
469
472
|
|
|
473
|
+
## Extraction backends
|
|
474
|
+
|
|
475
|
+
These extractors are built in. Optional ones require extra dependencies.
|
|
476
|
+
|
|
477
|
+
- `pass-through-text` reads text items and strips Markdown front matter.
|
|
478
|
+
- `metadata-text` turns catalog metadata into a small text artifact.
|
|
479
|
+
- `pdf-text` extracts text from Portable Document Format items with `pypdf`.
|
|
480
|
+
- `select-text` chooses one prior extraction result in a pipeline.
|
|
481
|
+
- `select-longest-text` chooses the longest prior extraction result.
|
|
482
|
+
- `ocr-rapidocr` does optical character recognition on images (optional).
|
|
483
|
+
- `stt-openai` performs speech to text on audio (optional).
|
|
484
|
+
- `unstructured` provides broad document parsing (optional).
|
|
485
|
+
- `markitdown` converts many formats into Markdown-like text (optional).
|
|
486
|
+
|
|
470
487
|
## Integration corpus and evaluation dataset
|
|
471
488
|
|
|
472
489
|
Use `scripts/download_wikipedia.py` to download a small integration corpus from Wikipedia when running tests or demos. The repository does not include that content.
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
biblicus/__init__.py,sha256=
|
|
1
|
+
biblicus/__init__.py,sha256=zpBSDOPXCoqBcc2QNjRWf_4dD4FKnBgUDl3j_ZG2_cA,495
|
|
2
2
|
biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
|
|
3
3
|
biblicus/cli.py,sha256=hBau464XNdSGdWeOCE2Q7dm0P8I4sR0W-NgVT0wPmh4,27724
|
|
4
4
|
biblicus/constants.py,sha256=R6fZDoLVMCwgKvTaxEx7G0CstwHGaUTlW9MsmNLDZ44,269
|
|
@@ -29,8 +29,9 @@ biblicus/backends/__init__.py,sha256=wLXIumV51l6ZIKzjoKKeU7AgIxGOryG7T7ls3a_Fv98
|
|
|
29
29
|
biblicus/backends/base.py,sha256=Erfj9dXg0nkRKnEcNjHR9_0Ddb2B1NvbmRksVm_g1dU,1776
|
|
30
30
|
biblicus/backends/scan.py,sha256=hdNnQWqi5IH6j95w30BZHxLJ0W9PTaOkqfWJuxCCEMI,12478
|
|
31
31
|
biblicus/backends/sqlite_full_text_search.py,sha256=KgmwOiKvkA0pv7vD0V7bcOdDx_nZIOfuIN6Z4Ij7I68,16516
|
|
32
|
-
biblicus/extractors/__init__.py,sha256=
|
|
32
|
+
biblicus/extractors/__init__.py,sha256=ctf6TkGViOpxr1s1TGMs40emcXImQZ71p0uOEBvLy9s,1890
|
|
33
33
|
biblicus/extractors/base.py,sha256=ka-nz_1zHPr4TS9sU4JfOoY-PJh7lbHPBOEBrbQFGSc,2171
|
|
34
|
+
biblicus/extractors/markitdown_text.py,sha256=-7N8ebi3pYfNPnplccyy3qvsKi6uImC1xyo_dSDiD10,4546
|
|
34
35
|
biblicus/extractors/metadata_text.py,sha256=7FbEPp0K1mXc7FH1_c0KhPhPexF9U6eLd3TVY1vTp1s,3537
|
|
35
36
|
biblicus/extractors/openai_stt.py,sha256=fggErIu6YN6tXbleNTuROhfYi7zDgMd2vD_ecXZ7eXs,7162
|
|
36
37
|
biblicus/extractors/pass_through_text.py,sha256=DNxkCwpH2bbXjPGPEQwsx8kfqXi6rIxXNY_n3TU2-WI,2777
|
|
@@ -40,9 +41,9 @@ biblicus/extractors/rapidocr_text.py,sha256=OMAuZealLSSTFVVmBalT-AFJy2pEpHyyvpuW
|
|
|
40
41
|
biblicus/extractors/select_longest_text.py,sha256=wRveXAfYLdj7CpGuo4RoD7zE6SIfylRCbv40z2azO0k,3702
|
|
41
42
|
biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
|
|
42
43
|
biblicus/extractors/unstructured_text.py,sha256=l2S_wD_htu7ZHoJQNQtP-kGlEgOeKV_w2IzAC93lePE,3564
|
|
43
|
-
biblicus-0.
|
|
44
|
-
biblicus-0.
|
|
45
|
-
biblicus-0.
|
|
46
|
-
biblicus-0.
|
|
47
|
-
biblicus-0.
|
|
48
|
-
biblicus-0.
|
|
44
|
+
biblicus-0.7.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
|
|
45
|
+
biblicus-0.7.0.dist-info/METADATA,sha256=tt46S2yJOUMhhAQFvLayZmEPJ5q7hNSP4CnUGBS2eT0,22315
|
|
46
|
+
biblicus-0.7.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
47
|
+
biblicus-0.7.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
|
|
48
|
+
biblicus-0.7.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
|
|
49
|
+
biblicus-0.7.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|