biblicus 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
biblicus/__init__.py CHANGED
@@ -27,4 +27,4 @@ __all__ = [
27
27
  "RetrievalRun",
28
28
  ]
29
29
 
30
- __version__ = "0.6.0"
30
+ __version__ = "0.7.0"
@@ -7,6 +7,7 @@ from __future__ import annotations
7
7
  from typing import Dict
8
8
 
9
9
  from .base import TextExtractor
10
+ from .markitdown_text import MarkItDownExtractor
10
11
  from .metadata_text import MetadataTextExtractor
11
12
  from .openai_stt import OpenAiSpeechToTextExtractor
12
13
  from .pass_through_text import PassThroughTextExtractor
@@ -30,6 +31,7 @@ def get_extractor(extractor_id: str) -> TextExtractor:
30
31
  """
31
32
  extractors: Dict[str, TextExtractor] = {
32
33
  MetadataTextExtractor.extractor_id: MetadataTextExtractor(),
34
+ MarkItDownExtractor.extractor_id: MarkItDownExtractor(),
33
35
  PassThroughTextExtractor.extractor_id: PassThroughTextExtractor(),
34
36
  PipelineExtractor.extractor_id: PipelineExtractor(),
35
37
  PortableDocumentFormatTextExtractor.extractor_id: PortableDocumentFormatTextExtractor(),
@@ -0,0 +1,128 @@
1
+ """
2
+ MarkItDown-based text extraction plugin.
3
+
4
+ This extractor depends on an optional library so the core installation stays small.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import sys
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ from pydantic import BaseModel, ConfigDict, Field
13
+
14
+ from ..corpus import Corpus
15
+ from ..errors import ExtractionRunFatalError
16
+ from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
17
+ from .base import TextExtractor
18
+
19
+
20
class MarkItDownExtractorConfig(BaseModel):
    """
    Settings accepted by the MarkItDown extractor.

    The model forbids extra fields, so unknown configuration keys are rejected
    during validation instead of being silently ignored.

    :ivar enable_plugins: Whether to enable MarkItDown plugins.
    :vartype enable_plugins: bool
    """

    model_config = ConfigDict(extra="forbid")

    # Plugins stay disabled unless the caller explicitly opts in.
    enable_plugins: bool = Field(default=False)
31
+
32
class MarkItDownExtractor(TextExtractor):
    """
    Extractor plugin backed by the `markitdown` library.

    This extractor converts non-text items into Markdown-like text. It skips text items so
    the pass-through extractor remains the canonical choice for text inputs and Markdown
    front matter handling.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "markitdown"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Validate extractor configuration and ensure the dependency is usable.

        The optional `markitdown` package is imported lazily so the core installation
        does not need it. When the import fails on an interpreter older than Python 3.10,
        the error also mentions the interpreter requirement, because installing the extra
        alone cannot resolve the failure there.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed config.
        :rtype: MarkItDownExtractorConfig
        :raises ExtractionRunFatalError: If the optional dependency is not installed or the
            interpreter is older than Python 3.10.
        """
        python_is_too_old = sys.version_info < (3, 10)
        version_hint = (
            "MarkItDown requires Python 3.10 or higher. "
            "Upgrade your interpreter or use a compatible extractor."
        )
        try:
            import markitdown
            from markitdown import MarkItDown  # noqa: F401
        except ImportError as import_error:
            message = (
                "MarkItDown extractor requires an optional dependency. "
                'Install it with pip install "biblicus[markitdown]".'
            )
            if python_is_too_old:
                # The original install hint is kept as a prefix; the version hint is
                # appended so the user is not sent into a pointless reinstall loop.
                message = f"{message} {version_hint}"
            raise ExtractionRunFatalError(message) from import_error
        # NOTE(review): `__biblicus_fake__` looks like a marker set by stand-in modules
        # (presumably test doubles) to bypass the interpreter check -- confirm.
        if python_is_too_old and not getattr(markitdown, "__biblicus_fake__", False):
            raise ExtractionRunFatalError(version_hint)
        return MarkItDownExtractorConfig.model_validate(config)

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Extract text for a non-text item using MarkItDown.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: MarkItDownExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or None when the item is already text.
        :rtype: ExtractedText or None
        """
        parsed_config = (
            config
            if isinstance(config, MarkItDownExtractorConfig)
            else MarkItDownExtractorConfig.model_validate(config)
        )
        # Prior step outputs are part of the extractor interface but unused here.
        _ = previous_extractions
        # Every "text/*" media type (including "text/markdown") is skipped.
        if item.media_type.startswith("text/"):
            return None

        # Imported lazily so the module loads without the optional dependency.
        from markitdown import MarkItDown

        source_path = corpus.root / item.relpath
        converter = MarkItDown(enable_plugins=parsed_config.enable_plugins)
        conversion_result = converter.convert(str(source_path))
        extracted_text = _resolve_markitdown_text(conversion_result).strip()
        return ExtractedText(text=extracted_text, producer_extractor_id=self.extractor_id)
110
+
111
+
112
+ def _resolve_markitdown_text(conversion_result: object) -> str:
113
+ """
114
+ Resolve a text payload from a MarkItDown conversion result.
115
+
116
+ :param conversion_result: Result returned by the MarkItDown converter.
117
+ :type conversion_result: object
118
+ :return: Extracted text payload or an empty string.
119
+ :rtype: str
120
+ """
121
+ if isinstance(conversion_result, str):
122
+ return conversion_result
123
+ if conversion_result is None:
124
+ return ""
125
+ text_content = getattr(conversion_result, "text_content", None)
126
+ if isinstance(text_content, str):
127
+ return text_content
128
+ return ""
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: biblicus
3
- Version: 0.6.0
3
+ Version: 0.7.0
4
4
  Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
5
5
  License: MIT
6
6
  Requires-Python: >=3.9
@@ -25,6 +25,8 @@ Requires-Dist: unstructured>=0.12.0; extra == "unstructured"
25
25
  Requires-Dist: python-docx>=1.1.0; extra == "unstructured"
26
26
  Provides-Extra: ocr
27
27
  Requires-Dist: rapidocr-onnxruntime>=1.3.0; extra == "ocr"
28
+ Provides-Extra: markitdown
29
+ Requires-Dist: markitdown[all]>=0.1.0; python_version >= "3.10" and extra == "markitdown"
28
30
  Dynamic: license-file
29
31
 
30
32
  # Biblicus
@@ -67,7 +69,7 @@ If you want to run a real, executable version of this story, use `scripts/readme
67
69
  This simplified sequence diagram shows the same idea at a high level.
68
70
 
69
71
  ```mermaid
70
- %%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
72
+ %%{init: {"theme": "base", "themeVariables": {"background": "#ffffff", "primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
71
73
  sequenceDiagram
72
74
  participant App as Your assistant code
73
75
  participant KB as Knowledge base
@@ -106,7 +108,7 @@ In a coding assistant, retrieval is often triggered by what the user is doing ri
106
108
  This diagram shows two sequential Biblicus calls. They are shown separately to make the boundaries explicit: retrieval returns evidence, and context pack building consumes evidence.
107
109
 
108
110
  ```mermaid
109
- %%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
111
+ %%{init: {"theme": "base", "themeVariables": {"background": "#ffffff", "primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
110
112
  sequenceDiagram
111
113
  participant User
112
114
  participant App as Your assistant code
@@ -160,6 +162,7 @@ Some extractors are optional so the base install stays small.
160
162
  - Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
161
163
  - Speech to text transcription: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
162
164
  - Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
165
+ - MarkItDown document conversion (requires Python 3.10 or higher): `python3 -m pip install "biblicus[markitdown]"`
163
166
 
164
167
  ## Quick start
165
168
 
@@ -467,6 +470,20 @@ Two backends are included.
467
470
  - `scan` is a minimal baseline that scans raw items directly.
468
471
  - `sqlite-full-text-search` is a practical baseline that builds a full text search index in Sqlite.
469
472
 
473
+ ## Extraction backends
474
+
475
+ These extractors are built in. Optional ones require extra dependencies.
476
+
477
+ - `pass-through-text` reads text items and strips Markdown front matter.
478
+ - `metadata-text` turns catalog metadata into a small text artifact.
479
+ - `pdf-text` extracts text from Portable Document Format items with `pypdf`.
480
+ - `select-text` chooses one prior extraction result in a pipeline.
481
+ - `select-longest-text` chooses the longest prior extraction result.
482
+ - `ocr-rapidocr` does optical character recognition on images (optional).
483
+ - `stt-openai` performs speech to text on audio (optional).
484
+ - `unstructured` provides broad document parsing (optional).
485
+ - `markitdown` converts many formats into Markdown-like text (optional).
486
+
470
487
  ## Integration corpus and evaluation dataset
471
488
 
472
489
  Use `scripts/download_wikipedia.py` to download a small integration corpus from Wikipedia when running tests or demos. The repository does not include that content.
@@ -1,4 +1,4 @@
1
- biblicus/__init__.py,sha256=jxBNIMVKudpRsbzdiE5CmU6nIjgnNhCRq0OZLSwt_kM,495
1
+ biblicus/__init__.py,sha256=zpBSDOPXCoqBcc2QNjRWf_4dD4FKnBgUDl3j_ZG2_cA,495
2
2
  biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
3
3
  biblicus/cli.py,sha256=hBau464XNdSGdWeOCE2Q7dm0P8I4sR0W-NgVT0wPmh4,27724
4
4
  biblicus/constants.py,sha256=R6fZDoLVMCwgKvTaxEx7G0CstwHGaUTlW9MsmNLDZ44,269
@@ -29,8 +29,9 @@ biblicus/backends/__init__.py,sha256=wLXIumV51l6ZIKzjoKKeU7AgIxGOryG7T7ls3a_Fv98
29
29
  biblicus/backends/base.py,sha256=Erfj9dXg0nkRKnEcNjHR9_0Ddb2B1NvbmRksVm_g1dU,1776
30
30
  biblicus/backends/scan.py,sha256=hdNnQWqi5IH6j95w30BZHxLJ0W9PTaOkqfWJuxCCEMI,12478
31
31
  biblicus/backends/sqlite_full_text_search.py,sha256=KgmwOiKvkA0pv7vD0V7bcOdDx_nZIOfuIN6Z4Ij7I68,16516
32
- biblicus/extractors/__init__.py,sha256=X3pu18QL85IBpYf56l6_5PUxFPhEN5qLTlOrxYpfGck,1776
32
+ biblicus/extractors/__init__.py,sha256=ctf6TkGViOpxr1s1TGMs40emcXImQZ71p0uOEBvLy9s,1890
33
33
  biblicus/extractors/base.py,sha256=ka-nz_1zHPr4TS9sU4JfOoY-PJh7lbHPBOEBrbQFGSc,2171
34
+ biblicus/extractors/markitdown_text.py,sha256=-7N8ebi3pYfNPnplccyy3qvsKi6uImC1xyo_dSDiD10,4546
34
35
  biblicus/extractors/metadata_text.py,sha256=7FbEPp0K1mXc7FH1_c0KhPhPexF9U6eLd3TVY1vTp1s,3537
35
36
  biblicus/extractors/openai_stt.py,sha256=fggErIu6YN6tXbleNTuROhfYi7zDgMd2vD_ecXZ7eXs,7162
36
37
  biblicus/extractors/pass_through_text.py,sha256=DNxkCwpH2bbXjPGPEQwsx8kfqXi6rIxXNY_n3TU2-WI,2777
@@ -40,9 +41,9 @@ biblicus/extractors/rapidocr_text.py,sha256=OMAuZealLSSTFVVmBalT-AFJy2pEpHyyvpuW
40
41
  biblicus/extractors/select_longest_text.py,sha256=wRveXAfYLdj7CpGuo4RoD7zE6SIfylRCbv40z2azO0k,3702
41
42
  biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
42
43
  biblicus/extractors/unstructured_text.py,sha256=l2S_wD_htu7ZHoJQNQtP-kGlEgOeKV_w2IzAC93lePE,3564
43
- biblicus-0.6.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
44
- biblicus-0.6.0.dist-info/METADATA,sha256=NXcMvQZklQCSukUOGcZaLSw_aqUm6wFojy6k_pfZvzc,21311
45
- biblicus-0.6.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
46
- biblicus-0.6.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
47
- biblicus-0.6.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
48
- biblicus-0.6.0.dist-info/RECORD,,
44
+ biblicus-0.7.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
45
+ biblicus-0.7.0.dist-info/METADATA,sha256=tt46S2yJOUMhhAQFvLayZmEPJ5q7hNSP4CnUGBS2eT0,22315
46
+ biblicus-0.7.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
47
+ biblicus-0.7.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
48
+ biblicus-0.7.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
49
+ biblicus-0.7.0.dist-info/RECORD,,