PyPI - biblicus - Versions diffs - 0.6.0__tar.gz → 0.7.0__tar.gz - Mend

biblicus 0.6.0__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (157) hide show

{biblicus-0.6.0/src/biblicus.egg-info → biblicus-0.7.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: biblicus
-Version: 0.6.0
+Version: 0.7.0
 Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
 License: MIT
 Requires-Python: >=3.9
@@ -25,6 +25,8 @@ Requires-Dist: unstructured>=0.12.0; extra == "unstructured"
 Requires-Dist: python-docx>=1.1.0; extra == "unstructured"
 Provides-Extra: ocr
 Requires-Dist: rapidocr-onnxruntime>=1.3.0; extra == "ocr"
+Provides-Extra: markitdown
+Requires-Dist: markitdown[all]>=0.1.0; python_version >= "3.10" and extra == "markitdown"
 Dynamic: license-file
 # Biblicus
@@ -67,7 +69,7 @@ If you want to run a real, executable version of this story, use `scripts/readme
 This simplified sequence diagram shows the same idea at a high level.
 ```mermaid
-%%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
+%%{init: {"theme": "base", "themeVariables": {"background": "#ffffff", "primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
 sequenceDiagram
   participant App as Your assistant code
   participant KB as Knowledge base
@@ -106,7 +108,7 @@ In a coding assistant, retrieval is often triggered by what the user is doing ri
 This diagram shows two sequential Biblicus calls. They are shown separately to make the boundaries explicit: retrieval returns evidence, and context pack building consumes evidence.
 ```mermaid
-%%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
+%%{init: {"theme": "base", "themeVariables": {"background": "#ffffff", "primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
 sequenceDiagram
   participant User
   participant App as Your assistant code
@@ -160,6 +162,7 @@ Some extractors are optional so the base install stays small.
 - Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
 - Speech to text transcription: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
 - Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
+- MarkItDown document conversion (requires Python 3.10 or higher): `python3 -m pip install "biblicus[markitdown]"`
 ## Quick start
@@ -467,6 +470,20 @@ Two backends are included.
 - `scan` is a minimal baseline that scans raw items directly.
 - `sqlite-full-text-search` is a practical baseline that builds a full text search index in Sqlite.
+## Extraction backends
+These extractors are built in. Optional ones require extra dependencies.
+- `pass-through-text` reads text items and strips Markdown front matter.
+- `metadata-text` turns catalog metadata into a small text artifact.
+- `pdf-text` extracts text from Portable Document Format items with `pypdf`.
+- `select-text` chooses one prior extraction result in a pipeline.
+- `select-longest-text` chooses the longest prior extraction result.
+- `ocr-rapidocr` does optical character recognition on images (optional).
+- `stt-openai` performs speech to text on audio (optional).
+- `unstructured` provides broad document parsing (optional).
+- `markitdown` converts many formats into Markdown-like text (optional).
 ## Integration corpus and evaluation dataset
 Use `scripts/download_wikipedia.py` to download a small integration corpus from Wikipedia when running tests or demos. The repository does not include that content.

{biblicus-0.6.0 → biblicus-0.7.0}/README.md RENAMED Viewed

@@ -38,7 +38,7 @@ If you want to run a real, executable version of this story, use `scripts/readme
 This simplified sequence diagram shows the same idea at a high level.
 ```mermaid
-%%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
+%%{init: {"theme": "base", "themeVariables": {"background": "#ffffff", "primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
 sequenceDiagram
   participant App as Your assistant code
   participant KB as Knowledge base
@@ -77,7 +77,7 @@ In a coding assistant, retrieval is often triggered by what the user is doing ri
 This diagram shows two sequential Biblicus calls. They are shown separately to make the boundaries explicit: retrieval returns evidence, and context pack building consumes evidence.
 ```mermaid
-%%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
+%%{init: {"theme": "base", "themeVariables": {"background": "#ffffff", "primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
 sequenceDiagram
   participant User
   participant App as Your assistant code
@@ -131,6 +131,7 @@ Some extractors are optional so the base install stays small.
 - Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
 - Speech to text transcription: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
 - Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
+- MarkItDown document conversion (requires Python 3.10 or higher): `python3 -m pip install "biblicus[markitdown]"`
 ## Quick start
@@ -438,6 +439,20 @@ Two backends are included.
 - `scan` is a minimal baseline that scans raw items directly.
 - `sqlite-full-text-search` is a practical baseline that builds a full text search index in Sqlite.
+## Extraction backends
+These extractors are built in. Optional ones require extra dependencies.
+- `pass-through-text` reads text items and strips Markdown front matter.
+- `metadata-text` turns catalog metadata into a small text artifact.
+- `pdf-text` extracts text from Portable Document Format items with `pypdf`.
+- `select-text` chooses one prior extraction result in a pipeline.
+- `select-longest-text` chooses the longest prior extraction result.
+- `ocr-rapidocr` does optical character recognition on images (optional).
+- `stt-openai` performs speech to text on audio (optional).
+- `unstructured` provides broad document parsing (optional).
+- `markitdown` converts many formats into Markdown-like text (optional).
 ## Integration corpus and evaluation dataset
 Use `scripts/download_wikipedia.py` to download a small integration corpus from Wikipedia when running tests or demos. The repository does not include that content.

{biblicus-0.6.0 → biblicus-0.7.0}/docs/DEMOS.md RENAMED Viewed

@@ -221,6 +221,25 @@ python3 -m biblicus build --corpus corpora/pdf_samples --backend sqlite-full-tex
 python3 -m biblicus query --corpus corpora/pdf_samples --query "Dummy PDF file"
 ```
+### Wikipedia retrieval demo (Python)
+This example downloads a few Wikipedia summaries about retrieval and knowledge bases, builds an extraction run, creates a local full text index, and returns evidence plus a context pack.
+```
+rm -rf corpora/wikipedia_rag_demo
+python3 scripts/wikipedia_rag_demo.py --corpus corpora/wikipedia_rag_demo --force
+```
+### MarkItDown extraction demo (Python 3.10+)
+MarkItDown requires Python 3.10 or higher. This example uses the `py311` conda environment to run the extractor over the mixed sample corpus.
+```
+conda run -n py311 python -m pip install -e . "markitdown[all]"
+conda run -n py311 python scripts/download_mixed_samples.py --corpus corpora/markitdown_demo_py311 --force
+conda run -n py311 python -m biblicus extract build --corpus corpora/markitdown_demo_py311 --step markitdown
+```
 ### Mixed modality integration corpus
 This example assembles a tiny mixed corpus with a Markdown note, a Hypertext Markup Language page, an image, a Portable Document Format file with extractable text, and a generated Portable Document Format file with no extractable text.

{biblicus-0.6.0 → biblicus-0.7.0}/docs/EXTRACTION.md RENAMED Viewed

@@ -71,6 +71,27 @@ To install:
 python3 -m pip install "biblicus[unstructured]"
 ```
+`markitdown`
+- Converts common document formats into Markdown-like text
+- Backed by the optional `markitdown` dependency
+- Requires Python 3.10 or higher
+- Skips items that are already text so the pass-through extractor remains the canonical choice for text items
+- This means it will not process `text/html` or other text media types unless that policy changes
+To install:
+```
+python3 -m pip install "biblicus[markitdown]"
+```
+Example:
+```
+python3 -m biblicus extract build --corpus corpora/extraction-demo \
+  --step markitdown
+```
 `ocr-rapidocr`
 - Optical character recognition for image items

{biblicus-0.6.0 → biblicus-0.7.0}/docs/FEATURE_INDEX.md RENAMED Viewed

@@ -123,6 +123,7 @@ What it does:
 - Includes a Portable Document Format text extractor plugin.
 - Includes a speech to text extractor plugin for audio items.
 - Includes a selection extractor step for choosing extracted text within a pipeline.
+- Includes a MarkItDown extractor plugin for document conversion.
 Documentation:
@@ -139,6 +140,7 @@ Behavior specifications:
 - `features/ocr_extractor.feature`
 - `features/stt_extractor.feature`
 - `features/unstructured_extractor.feature`
+- `features/markitdown_extractor.feature`
 - `features/integration_unstructured_extraction.feature`
 Primary implementation:

{biblicus-0.6.0 → biblicus-0.7.0}/docs/ROADMAP.md RENAMED Viewed

@@ -124,6 +124,21 @@ Acceptance checks:
 These are valuable, but intentionally not the near-term focus while retrieval becomes practical end to end.
+### In-memory corpus for ephemeral workflows
+Goal: allow programmatic, temporary corpora that live in memory for short-lived agents or tests.
+Deliverables:
+- A memory-backed corpus implementation that supports the same ingestion and catalog APIs.
+- A serialization option for snapshots so ephemeral corpora can be persisted when needed.
+- Documentation that explains tradeoffs versus file-based corpora.
+Acceptance checks:
+- Behavior specifications cover ingestion, listing, and reindexing in memory.
+- Retrieval and extraction can operate on the in-memory corpus without special casing.
 ### Extractor datasets and evaluation harness
 Goal: compare extraction approaches in a way that is measurable, repeatable, and useful for practical engineering decisions.

{biblicus-0.6.0 → biblicus-0.7.0}/features/environment.py RENAMED Viewed

@@ -134,6 +134,32 @@ def after_scenario(context, scenario) -> None:
                 sys.modules.pop(name, None)
         context._fake_rapidocr_unavailable_installed = False
         context._fake_rapidocr_unavailable_original_modules = {}
+    if getattr(context, "_fake_markitdown_installed", False):
+        original_modules = getattr(context, "_fake_markitdown_original_modules", {})
+        for name in [
+            "markitdown",
+        ]:
+            if name in original_modules:
+                sys.modules[name] = original_modules[name]
+            else:
+                sys.modules.pop(name, None)
+        context._fake_markitdown_installed = False
+        context._fake_markitdown_original_modules = {}
+    if getattr(context, "_fake_markitdown_unavailable_installed", False):
+        original_modules = getattr(context, "_fake_markitdown_unavailable_original_modules", {})
+        for name in [
+            "markitdown",
+        ]:
+            if name in original_modules:
+                sys.modules[name] = original_modules[name]
+            else:
+                sys.modules.pop(name, None)
+        context._fake_markitdown_unavailable_installed = False
+        context._fake_markitdown_unavailable_original_modules = {}
+    original_sys_version_info = getattr(context, "_original_sys_version_info", None)
+    if original_sys_version_info is not None:
+        sys.version_info = original_sys_version_info
+        context._original_sys_version_info = None
     if hasattr(context, "_tmp"):
         context._tmp.cleanup()

biblicus-0.7.0/features/markitdown_extractor.feature ADDED Viewed

@@ -0,0 +1,99 @@
+Feature: MarkItDown extractor plugin
+  The MarkItDown extractor converts non-text items into Markdown-like text as an optional dependency.
+  Scenario: MarkItDown extractor requires an optional dependency
+    Given I initialized a corpus at "corpus"
+    And the MarkItDown dependency is unavailable
+    And a Portable Document Format file "hello.pdf" exists with text "Hello"
+    When I ingest the file "hello.pdf" into corpus "corpus"
+    And I attempt to build a "markitdown" extraction run in corpus "corpus"
+    Then the command fails with exit code 2
+    And standard error includes "biblicus[markitdown]"
+  Scenario: MarkItDown extractor rejects unsupported Python versions
+    Given I initialized a corpus at "corpus"
+    And a fake MarkItDown library is available but marked as real
+    And a Portable Document Format file "hello.pdf" exists with text "Hello"
+    When I ingest the file "hello.pdf" into corpus "corpus"
+    And I attempt to build a "markitdown" extraction run in corpus "corpus"
+    Then the command fails with exit code 2
+    And standard error includes "Python 3.10"
+  Scenario: MarkItDown extractor skips text items
+    Given I initialized a corpus at "corpus"
+    And a fake MarkItDown library is available
+    When I ingest the text "alpha" with title "Alpha" and tags "a" into corpus "corpus"
+    And I build a "markitdown" extraction run in corpus "corpus"
+    Then the extraction run does not include extracted text for the last ingested item
+  Scenario: MarkItDown extractor produces extracted text for a non-text item
+    Given I initialized a corpus at "corpus"
+    And a fake MarkItDown library is available that returns text "Extracted by MarkItDown" for filename "doc.pdf"
+    And a binary file "doc.pdf" exists
+    When I ingest the file "doc.pdf" into corpus "corpus"
+    And I build a "markitdown" extraction run in corpus "corpus"
+    Then the extraction run includes extracted text for the last ingested item
+    And the extracted text for the last ingested item equals "Extracted by MarkItDown"
+    And the extraction run item provenance uses extractor "markitdown"
+  Scenario: MarkItDown extractor records empty output when it cannot extract text
+    Given I initialized a corpus at "corpus"
+    And a fake MarkItDown library is available that returns empty output for filename "empty.pdf"
+    And a binary file "empty.pdf" exists
+    When I ingest the file "empty.pdf" into corpus "corpus"
+    And I build a "markitdown" extraction run in corpus "corpus"
+    Then the extraction run includes extracted text for the last ingested item
+    And the extracted text for the last ingested item is empty
+    And the extraction run stats include extracted_empty_items 1
+  Scenario: MarkItDown extractor records empty output when conversion returns None
+    Given I initialized a corpus at "corpus"
+    And a fake MarkItDown library is available that returns None for filename "none.pdf"
+    And a binary file "none.pdf" exists
+    When I ingest the file "none.pdf" into corpus "corpus"
+    And I build a "markitdown" extraction run in corpus "corpus"
+    Then the extraction run includes extracted text for the last ingested item
+    And the extracted text for the last ingested item is empty
+    And the extraction run stats include extracted_empty_items 1
+  Scenario: MarkItDown extractor accepts string results
+    Given I initialized a corpus at "corpus"
+    And a fake MarkItDown library is available that returns a string for filename "string.pdf"
+    And a binary file "string.pdf" exists
+    When I ingest the file "string.pdf" into corpus "corpus"
+    And I build a "markitdown" extraction run in corpus "corpus"
+    Then the extraction run includes extracted text for the last ingested item
+    And the extracted text for the last ingested item equals "Extracted by MarkItDown"
+    And the extraction run item provenance uses extractor "markitdown"
+  Scenario: MarkItDown extractor records empty output for non-text conversion output
+    Given I initialized a corpus at "corpus"
+    And a fake MarkItDown library is available that returns non-text output for filename "nonstr.pdf"
+    And a binary file "nonstr.pdf" exists
+    When I ingest the file "nonstr.pdf" into corpus "corpus"
+    And I build a "markitdown" extraction run in corpus "corpus"
+    Then the extracted text for the last ingested item is empty
+    And the extraction run stats include extracted_empty_items 1
+  Scenario: MarkItDown extractor ignores whitespace output
+    Given I initialized a corpus at "corpus"
+    And a fake MarkItDown library is available that returns whitespace output for filename "whitespace.pdf"
+    And a binary file "whitespace.pdf" exists
+    When I ingest the file "whitespace.pdf" into corpus "corpus"
+    And I build a "markitdown" extraction run in corpus "corpus"
+    Then the extracted text for the last ingested item is empty
+    And the extraction run stats include extracted_empty_items 1
+  Scenario: MarkItDown extractor records per-item errors and continues
+    Given I initialized a corpus at "corpus"
+    And a fake MarkItDown library is available that raises a RuntimeError for filename "boom.pdf"
+    And a binary file "boom.pdf" exists
+    And a fake MarkItDown library is available that returns text "ok" for filename "ok.pdf"
+    And a binary file "ok.pdf" exists
+    When I ingest the file "boom.pdf" into corpus "corpus"
+    And I ingest the file "ok.pdf" into corpus "corpus"
+    And I build a "markitdown" extraction run in corpus "corpus"
+    Then the extracted text for the last ingested item equals "ok"
+    And the extraction run includes an errored result for the first ingested item
+    And the extraction run error type for the first ingested item equals "RuntimeError"
+    And the extraction run stats include errored_items 1

biblicus-0.7.0/features/steps/markitdown_steps.py ADDED Viewed

@@ -0,0 +1,173 @@
+from __future__ import annotations
+import sys
+import types
+from dataclasses import dataclass
+from typing import Dict, Optional
+from behave import given
+@dataclass
+class _FakeMarkItDownBehavior:
+    mode: str
+    text: Optional[str] = None
+def _ensure_fake_markitdown_behaviors(context) -> Dict[str, _FakeMarkItDownBehavior]:
+    behaviors = getattr(context, "fake_markitdown_behaviors", None)
+    if behaviors is None:
+        behaviors = {}
+        context.fake_markitdown_behaviors = behaviors
+    return behaviors
+def _install_fake_markitdown_module(context) -> None:
+    already_installed = getattr(context, "_fake_markitdown_installed", False)
+    if already_installed:
+        return
+    original_modules: Dict[str, object] = {}
+    module_names = [
+        "markitdown",
+    ]
+    for name in module_names:
+        if name in sys.modules:
+            original_modules[name] = sys.modules[name]
+    behaviors = _ensure_fake_markitdown_behaviors(context)
+    class _ConversionResult:
+        def __init__(self, text_content: object) -> None:
+            self.text_content = text_content
+    class MarkItDown:
+        def __init__(self, *, enable_plugins: bool = False) -> None:
+            self.enable_plugins = enable_plugins
+        def convert(self, filename: str) -> object:
+            base_name = filename.rsplit("/", 1)[-1]
+            normalized_name = base_name.split("--", 1)[-1] if "--" in base_name else base_name
+            behavior = behaviors.get(normalized_name)
+            if behavior is None:
+                return _ConversionResult("")
+            if behavior.mode == "error":
+                raise RuntimeError("fake markitdown error")
+            if behavior.mode == "empty":
+                return _ConversionResult("")
+            if behavior.mode == "none":
+                return None
+            if behavior.mode == "string":
+                return behavior.text or ""
+            if behavior.mode == "nonstring":
+                return _ConversionResult(123)
+            if behavior.mode == "whitespace":
+                return _ConversionResult("   ")
+            if behavior.mode == "text":
+                return _ConversionResult(behavior.text or "")
+            return _ConversionResult("")
+    markitdown_module = types.ModuleType("markitdown")
+    markitdown_module.MarkItDown = MarkItDown
+    markitdown_module.__biblicus_fake__ = True
+    sys.modules["markitdown"] = markitdown_module
+    context._fake_markitdown_installed = True
+    context._fake_markitdown_original_modules = original_modules
+def _install_markitdown_unavailable_module(context) -> None:
+    already_installed = getattr(context, "_fake_markitdown_unavailable_installed", False)
+    if already_installed:
+        return
+    original_modules: Dict[str, object] = {}
+    if "markitdown" in sys.modules:
+        original_modules["markitdown"] = sys.modules["markitdown"]
+    markitdown_module = types.ModuleType("markitdown")
+    sys.modules["markitdown"] = markitdown_module
+    context._fake_markitdown_unavailable_installed = True
+    context._fake_markitdown_unavailable_original_modules = original_modules
+@given("a fake MarkItDown library is available")
+def step_fake_markitdown_available(context) -> None:
+    _install_fake_markitdown_module(context)
+@given(
+    'a fake MarkItDown library is available that returns text "{text}" for filename "{filename}"'
+)
+def step_fake_markitdown_returns_text(context, text: str, filename: str) -> None:
+    _install_fake_markitdown_module(context)
+    behaviors = _ensure_fake_markitdown_behaviors(context)
+    behaviors[filename] = _FakeMarkItDownBehavior(mode="text", text=text)
+@given(
+    'a fake MarkItDown library is available that returns empty output for filename "{filename}"'
+)
+def step_fake_markitdown_returns_empty(context, filename: str) -> None:
+    _install_fake_markitdown_module(context)
+    behaviors = _ensure_fake_markitdown_behaviors(context)
+    behaviors[filename] = _FakeMarkItDownBehavior(mode="empty", text=None)
+@given('a fake MarkItDown library is available that returns None for filename "{filename}"')
+def step_fake_markitdown_returns_none(context, filename: str) -> None:
+    _install_fake_markitdown_module(context)
+    behaviors = _ensure_fake_markitdown_behaviors(context)
+    behaviors[filename] = _FakeMarkItDownBehavior(mode="none", text=None)
+@given('a fake MarkItDown library is available that returns a string for filename "{filename}"')
+def step_fake_markitdown_returns_string(context, filename: str) -> None:
+    _install_fake_markitdown_module(context)
+    behaviors = _ensure_fake_markitdown_behaviors(context)
+    behaviors[filename] = _FakeMarkItDownBehavior(mode="string", text="Extracted by MarkItDown")
+@given(
+    'a fake MarkItDown library is available that returns non-text output for filename "{filename}"'
+)
+def step_fake_markitdown_returns_nonstring(context, filename: str) -> None:
+    _install_fake_markitdown_module(context)
+    behaviors = _ensure_fake_markitdown_behaviors(context)
+    behaviors[filename] = _FakeMarkItDownBehavior(mode="nonstring", text=None)
+@given(
+    'a fake MarkItDown library is available that raises a RuntimeError for filename "{filename}"'
+)
+def step_fake_markitdown_raises_error(context, filename: str) -> None:
+    _install_fake_markitdown_module(context)
+    behaviors = _ensure_fake_markitdown_behaviors(context)
+    behaviors[filename] = _FakeMarkItDownBehavior(mode="error", text=None)
+@given(
+    'a fake MarkItDown library is available that returns whitespace output for filename "{filename}"'
+)
+def step_fake_markitdown_returns_whitespace(context, filename: str) -> None:
+    _install_fake_markitdown_module(context)
+    behaviors = _ensure_fake_markitdown_behaviors(context)
+    behaviors[filename] = _FakeMarkItDownBehavior(mode="whitespace", text=None)
+@given("the MarkItDown dependency is unavailable")
+def step_markitdown_dependency_unavailable(context) -> None:
+    _install_markitdown_unavailable_module(context)
+@given("a fake MarkItDown library is available but marked as real")
+def step_fake_markitdown_marked_real(context) -> None:
+    _install_fake_markitdown_module(context)
+    markitdown_module = sys.modules.get("markitdown")
+    if markitdown_module is not None:
+        markitdown_module.__biblicus_fake__ = False
+    if not hasattr(context, "_original_sys_version_info"):
+        context._original_sys_version_info = sys.version_info
+    sys.version_info = (3, 9, 0, "final", 0)

{biblicus-0.6.0 → biblicus-0.7.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "biblicus"
-version = "0.6.0"
+version = "0.7.0"
 description = "Command line interface and Python library for corpus ingestion, retrieval, and evaluation."
 readme = "README.md"
 requires-python = ">=3.9"
@@ -36,6 +36,9 @@ unstructured = [
 ocr = [
   "rapidocr-onnxruntime>=1.3.0",
 ]
+markitdown = [
+  "markitdown[all]>=0.1.0; python_version>='3.10'",
+]
 [project.scripts]
 biblicus = "biblicus.cli:main"
@@ -54,6 +57,7 @@ omit = ["*/biblicus/_vendor/*"]
 [tool.coverage.report]
 show_missing = true
 skip_covered = false
+fail_under = 100
 exclude_lines = [
   "pragma: no cover",
   "if __name__ == .__main__.:",

{biblicus-0.6.0 → biblicus-0.7.0}/scripts/test.py RENAMED Viewed

@@ -64,6 +64,8 @@ def main() -> int:
     Scenarios that require the optional Unstructured dependency are tagged ``@unstructured``
     and are excluded unless you also pass ``--unstructured``.
+    The coverage report enforces the configured minimum coverage threshold.
     :return: Exit code.
     :rtype: int
     """
@@ -100,12 +102,21 @@ def main() -> int:
         behave_args.extend(["--tags", "~@ocr"])
     if args.integration and not args.unstructured:
         behave_args.extend(["--tags", "~@unstructured"])
-    rc = _run([sys.executable, "-m", "coverage", "run", "-m", "behave", *behave_args], env=env)
-    _run([sys.executable, "-m", "coverage", "report", "-m"], env=env)
-    _run([sys.executable, "-m", "coverage", "html", "-d", str(htmlcov_dir)], env=env)
+    behave_exit_code = _run(
+        [sys.executable, "-m", "coverage", "run", "-m", "behave", *behave_args],
+        env=env,
+    )
+    coverage_report_exit_code = _run(
+        [sys.executable, "-m", "coverage", "report", "-m"],
+        env=env,
+    )
+    coverage_html_exit_code = _run(
+        [sys.executable, "-m", "coverage", "html", "-d", str(htmlcov_dir)],
+        env=env,
+    )
     print(f"Coverage report in Hypertext Markup Language: {htmlcov_dir / 'index.html'}")
-    return int(rc)
+    return int(max(behave_exit_code, coverage_report_exit_code, coverage_html_exit_code))
 if __name__ == "__main__":

biblicus 0.6.0__tar.gz → 0.7.0__tar.gz

biblicus 0.6.0__tar.gz → 0.7.0__tar.gz