PyPI - biblicus - Versions diffs - 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl - Mend

biblicus 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

biblicus/__init__.py CHANGED Viewed

@@ -27,4 +27,4 @@ __all__ = [
     "RetrievalRun",
 ]
-__version__ = "0.6.0"
+__version__ = "0.7.0"

biblicus/extractors/__init__.py CHANGED Viewed

@@ -7,6 +7,7 @@ from __future__ import annotations
 from typing import Dict
 from .base import TextExtractor
+from .markitdown_text import MarkItDownExtractor
 from .metadata_text import MetadataTextExtractor
 from .openai_stt import OpenAiSpeechToTextExtractor
 from .pass_through_text import PassThroughTextExtractor
@@ -30,6 +31,7 @@ def get_extractor(extractor_id: str) -> TextExtractor:
     """
     extractors: Dict[str, TextExtractor] = {
         MetadataTextExtractor.extractor_id: MetadataTextExtractor(),
+        MarkItDownExtractor.extractor_id: MarkItDownExtractor(),
         PassThroughTextExtractor.extractor_id: PassThroughTextExtractor(),
         PipelineExtractor.extractor_id: PipelineExtractor(),
         PortableDocumentFormatTextExtractor.extractor_id: PortableDocumentFormatTextExtractor(),

biblicus/extractors/markitdown_text.py ADDED Viewed

@@ -0,0 +1,128 @@
+"""
+MarkItDown-based text extraction plugin.
+This extractor depends on an optional library so the core installation stays small.
+"""
+from __future__ import annotations
+import sys
+from typing import Any, Dict, List, Optional
+from pydantic import BaseModel, ConfigDict, Field
+from ..corpus import Corpus
+from ..errors import ExtractionRunFatalError
+from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
+from .base import TextExtractor
+class MarkItDownExtractorConfig(BaseModel):
+    """
+    Configuration for the MarkItDown extractor.
+    :ivar enable_plugins: Whether to enable MarkItDown plugins.
+    :vartype enable_plugins: bool
+    """
+    model_config = ConfigDict(extra="forbid")
+    enable_plugins: bool = Field(default=False)
+class MarkItDownExtractor(TextExtractor):
+    """
+    Extractor plugin backed by the `markitdown` library.
+    This extractor converts non-text items into Markdown-like text. It skips text items so
+    the pass-through extractor remains the canonical choice for text inputs and Markdown
+    front matter handling.
+    :ivar extractor_id: Extractor identifier.
+    :vartype extractor_id: str
+    """
+    extractor_id = "markitdown"
+    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
+        """
+        Validate extractor configuration and ensure the dependency is installed.
+        :param config: Configuration mapping.
+        :type config: dict[str, Any]
+        :return: Parsed config.
+        :rtype: MarkItDownExtractorConfig
+        :raises ExtractionRunFatalError: If the optional dependency is not installed.
+        """
+        try:
+            import markitdown
+            from markitdown import MarkItDown  # noqa: F401
+        except ImportError as import_error:
+            raise ExtractionRunFatalError(
+                "MarkItDown extractor requires an optional dependency. "
+                'Install it with pip install "biblicus[markitdown]".'
+            ) from import_error
+        if sys.version_info < (3, 10) and not getattr(markitdown, "__biblicus_fake__", False):
+            raise ExtractionRunFatalError(
+                "MarkItDown requires Python 3.10 or higher. "
+                "Upgrade your interpreter or use a compatible extractor."
+            )
+        return MarkItDownExtractorConfig.model_validate(config)
+    def extract_text(
+        self,
+        *,
+        corpus: Corpus,
+        item: CatalogItem,
+        config: BaseModel,
+        previous_extractions: List[ExtractionStepOutput],
+    ) -> Optional[ExtractedText]:
+        """
+        Extract text for a non-text item using MarkItDown.
+        :param corpus: Corpus containing the item bytes.
+        :type corpus: Corpus
+        :param item: Catalog item being processed.
+        :type item: CatalogItem
+        :param config: Parsed configuration model.
+        :type config: MarkItDownExtractorConfig
+        :param previous_extractions: Prior step outputs for this item within the pipeline.
+        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
+        :return: Extracted text payload, or None when the item is already text.
+        :rtype: ExtractedText or None
+        """
+        parsed_config = (
+            config
+            if isinstance(config, MarkItDownExtractorConfig)
+            else MarkItDownExtractorConfig.model_validate(config)
+        )
+        _ = previous_extractions
+        media_type = item.media_type
+        if media_type == "text/markdown" or media_type.startswith("text/"):
+            return None
+        from markitdown import MarkItDown
+        source_path = corpus.root / item.relpath
+        converter = MarkItDown(enable_plugins=parsed_config.enable_plugins)
+        conversion_result = converter.convert(str(source_path))
+        extracted_text = _resolve_markitdown_text(conversion_result).strip()
+        return ExtractedText(text=extracted_text, producer_extractor_id=self.extractor_id)
+def _resolve_markitdown_text(conversion_result: object) -> str:
+    """
+    Resolve a text payload from a MarkItDown conversion result.
+    :param conversion_result: Result returned by the MarkItDown converter.
+    :type conversion_result: object
+    :return: Extracted text payload or an empty string.
+    :rtype: str
+    """
+    if isinstance(conversion_result, str):
+        return conversion_result
+    if conversion_result is None:
+        return ""
+    text_content = getattr(conversion_result, "text_content", None)
+    if isinstance(text_content, str):
+        return text_content
+    return ""

{biblicus-0.6.0.dist-info → biblicus-0.7.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: biblicus
-Version: 0.6.0
+Version: 0.7.0
 Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
 License: MIT
 Requires-Python: >=3.9
@@ -25,6 +25,8 @@ Requires-Dist: unstructured>=0.12.0; extra == "unstructured"
 Requires-Dist: python-docx>=1.1.0; extra == "unstructured"
 Provides-Extra: ocr
 Requires-Dist: rapidocr-onnxruntime>=1.3.0; extra == "ocr"
+Provides-Extra: markitdown
+Requires-Dist: markitdown[all]>=0.1.0; python_version >= "3.10" and extra == "markitdown"
 Dynamic: license-file
 # Biblicus
@@ -67,7 +69,7 @@ If you want to run a real, executable version of this story, use `scripts/readme
 This simplified sequence diagram shows the same idea at a high level.
 ```mermaid
-%%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
+%%{init: {"theme": "base", "themeVariables": {"background": "#ffffff", "primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
 sequenceDiagram
   participant App as Your assistant code
   participant KB as Knowledge base
@@ -106,7 +108,7 @@ In a coding assistant, retrieval is often triggered by what the user is doing ri
 This diagram shows two sequential Biblicus calls. They are shown separately to make the boundaries explicit: retrieval returns evidence, and context pack building consumes evidence.
 ```mermaid
-%%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
+%%{init: {"theme": "base", "themeVariables": {"background": "#ffffff", "primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
 sequenceDiagram
   participant User
   participant App as Your assistant code
@@ -160,6 +162,7 @@ Some extractors are optional so the base install stays small.
 - Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
 - Speech to text transcription: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
 - Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
+- MarkItDown document conversion (requires Python 3.10 or higher): `python3 -m pip install "biblicus[markitdown]"`
 ## Quick start
@@ -467,6 +470,20 @@ Two backends are included.
 - `scan` is a minimal baseline that scans raw items directly.
 - `sqlite-full-text-search` is a practical baseline that builds a full text search index in Sqlite.
+## Extraction backends
+These extractors are built in. Optional ones require extra dependencies.
+- `pass-through-text` reads text items and strips Markdown front matter.
+- `metadata-text` turns catalog metadata into a small text artifact.
+- `pdf-text` extracts text from Portable Document Format items with `pypdf`.
+- `select-text` chooses one prior extraction result in a pipeline.
+- `select-longest-text` chooses the longest prior extraction result.
+- `ocr-rapidocr` does optical character recognition on images (optional).
+- `stt-openai` performs speech to text on audio (optional).
+- `unstructured` provides broad document parsing (optional).
+- `markitdown` converts many formats into Markdown-like text (optional).
 ## Integration corpus and evaluation dataset
 Use `scripts/download_wikipedia.py` to download a small integration corpus from Wikipedia when running tests or demos. The repository does not include that content.

{biblicus-0.6.0.dist-info → biblicus-0.7.0.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,4 @@
-biblicus/__init__.py,sha256=jxBNIMVKudpRsbzdiE5CmU6nIjgnNhCRq0OZLSwt_kM,495
+biblicus/__init__.py,sha256=zpBSDOPXCoqBcc2QNjRWf_4dD4FKnBgUDl3j_ZG2_cA,495
 biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
 biblicus/cli.py,sha256=hBau464XNdSGdWeOCE2Q7dm0P8I4sR0W-NgVT0wPmh4,27724
 biblicus/constants.py,sha256=R6fZDoLVMCwgKvTaxEx7G0CstwHGaUTlW9MsmNLDZ44,269
@@ -29,8 +29,9 @@ biblicus/backends/__init__.py,sha256=wLXIumV51l6ZIKzjoKKeU7AgIxGOryG7T7ls3a_Fv98
 biblicus/backends/base.py,sha256=Erfj9dXg0nkRKnEcNjHR9_0Ddb2B1NvbmRksVm_g1dU,1776
 biblicus/backends/scan.py,sha256=hdNnQWqi5IH6j95w30BZHxLJ0W9PTaOkqfWJuxCCEMI,12478
 biblicus/backends/sqlite_full_text_search.py,sha256=KgmwOiKvkA0pv7vD0V7bcOdDx_nZIOfuIN6Z4Ij7I68,16516
-biblicus/extractors/__init__.py,sha256=X3pu18QL85IBpYf56l6_5PUxFPhEN5qLTlOrxYpfGck,1776
+biblicus/extractors/__init__.py,sha256=ctf6TkGViOpxr1s1TGMs40emcXImQZ71p0uOEBvLy9s,1890
 biblicus/extractors/base.py,sha256=ka-nz_1zHPr4TS9sU4JfOoY-PJh7lbHPBOEBrbQFGSc,2171
+biblicus/extractors/markitdown_text.py,sha256=-7N8ebi3pYfNPnplccyy3qvsKi6uImC1xyo_dSDiD10,4546
 biblicus/extractors/metadata_text.py,sha256=7FbEPp0K1mXc7FH1_c0KhPhPexF9U6eLd3TVY1vTp1s,3537
 biblicus/extractors/openai_stt.py,sha256=fggErIu6YN6tXbleNTuROhfYi7zDgMd2vD_ecXZ7eXs,7162
 biblicus/extractors/pass_through_text.py,sha256=DNxkCwpH2bbXjPGPEQwsx8kfqXi6rIxXNY_n3TU2-WI,2777
@@ -40,9 +41,9 @@ biblicus/extractors/rapidocr_text.py,sha256=OMAuZealLSSTFVVmBalT-AFJy2pEpHyyvpuW
 biblicus/extractors/select_longest_text.py,sha256=wRveXAfYLdj7CpGuo4RoD7zE6SIfylRCbv40z2azO0k,3702
 biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
 biblicus/extractors/unstructured_text.py,sha256=l2S_wD_htu7ZHoJQNQtP-kGlEgOeKV_w2IzAC93lePE,3564
-biblicus-0.6.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
-biblicus-0.6.0.dist-info/METADATA,sha256=NXcMvQZklQCSukUOGcZaLSw_aqUm6wFojy6k_pfZvzc,21311
-biblicus-0.6.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-biblicus-0.6.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
-biblicus-0.6.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
-biblicus-0.6.0.dist-info/RECORD,,
+biblicus-0.7.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
+biblicus-0.7.0.dist-info/METADATA,sha256=tt46S2yJOUMhhAQFvLayZmEPJ5q7hNSP4CnUGBS2eT0,22315
+biblicus-0.7.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+biblicus-0.7.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
+biblicus-0.7.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
+biblicus-0.7.0.dist-info/RECORD,,

{biblicus-0.6.0.dist-info → biblicus-0.7.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{biblicus-0.6.0.dist-info → biblicus-0.7.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{biblicus-0.6.0.dist-info → biblicus-0.7.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{biblicus-0.6.0.dist-info → biblicus-0.7.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

biblicus 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

biblicus 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl