PyPI - biblicus - Versions diffs - 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl - Mend

biblicus 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

biblicus/__init__.py +2 -2
biblicus/_vendor/dotyaml/__init__.py +14 -0
biblicus/_vendor/dotyaml/interpolation.py +63 -0
biblicus/_vendor/dotyaml/loader.py +181 -0
biblicus/_vendor/dotyaml/transformer.py +135 -0
biblicus/backends/__init__.py +0 -2
biblicus/backends/base.py +3 -3
biblicus/backends/scan.py +21 -15
biblicus/backends/sqlite_full_text_search.py +14 -15
biblicus/cli.py +33 -49
biblicus/corpus.py +39 -58
biblicus/errors.py +15 -0
biblicus/evaluation.py +4 -8
biblicus/extraction.py +276 -77
biblicus/extractors/__init__.py +14 -3
biblicus/extractors/base.py +12 -5
biblicus/extractors/metadata_text.py +13 -5
biblicus/extractors/openai_stt.py +180 -0
biblicus/extractors/pass_through_text.py +16 -6
biblicus/extractors/pdf_text.py +100 -0
biblicus/extractors/pipeline.py +105 -0
biblicus/extractors/rapidocr_text.py +129 -0
biblicus/extractors/select_longest_text.py +105 -0
biblicus/extractors/select_text.py +100 -0
biblicus/extractors/unstructured_text.py +100 -0
biblicus/frontmatter.py +0 -3
biblicus/hook_logging.py +0 -5
biblicus/hook_manager.py +3 -5
biblicus/hooks.py +3 -7
biblicus/ignore.py +0 -3
biblicus/models.py +87 -0
biblicus/retrieval.py +0 -4
biblicus/sources.py +44 -9
biblicus/time.py +0 -1
biblicus/uris.py +3 -4
biblicus/user_config.py +138 -0
{biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/METADATA +78 -16
biblicus-0.3.0.dist-info/RECORD +44 -0
biblicus/extractors/cascade.py +0 -101
biblicus-0.2.0.dist-info/RECORD +0 -32
{biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/WHEEL +0 -0
{biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/entry_points.txt +0 -0
{biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/licenses/LICENSE +0 -0
{biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/top_level.txt +0 -0

biblicus/user_config.py ADDED Viewed

@@ -0,0 +1,138 @@
+"""
+User configuration file loading for Biblicus.
+User configuration is intended for small, local settings such as credentials for optional
+integrations. It is separate from corpus configuration.
+"""
+from __future__ import annotations
+import os
+from pathlib import Path
+from typing import Any, Dict, Optional
+from pydantic import BaseModel, ConfigDict, Field
+from ._vendor.dotyaml import ConfigLoader
+class OpenAiUserConfig(BaseModel):
+    """
+    Settings for the optional OpenAI integration.
+    Keys other than the declared ones are rejected, and the API key must be a
+    non-empty string.
+    :ivar api_key: OpenAI API key used for authenticated requests.
+    :vartype api_key: str
+    """
+    model_config = ConfigDict(extra="forbid")
+    # ``...`` marks the field as required explicitly; there is no default value.
+    api_key: str = Field(..., min_length=1)
+class BiblicusUserConfig(BaseModel):
+    """
+    Top-level model for a parsed Biblicus user configuration.
+    Keys other than the declared ones are rejected. The only declared section is
+    optional.
+    :ivar openai: OpenAI settings, or None when the section is absent.
+    :vartype openai: OpenAiUserConfig or None
+    """
+    model_config = ConfigDict(extra="forbid")
+    openai: Optional[OpenAiUserConfig] = Field(default=None)
+def default_user_config_paths(
+    *, cwd: Optional[Path] = None, home: Optional[Path] = None
+) -> list[Path]:
+    """
+    Build the ordered list of user configuration files to look for.
+    Candidates, lowest precedence first:
+    1. Home configuration: ``~/.biblicus/config.yml``
+    2. Local configuration: ``./.biblicus/config.yml``
+    The local file is listed last so that a caller applying the files in order lets
+    it override the home file. The paths are returned without checking that the
+    files exist.
+    :param cwd: Working directory to use instead of the process current directory.
+    :type cwd: Path or None
+    :param home: Home directory to use instead of the current user's home directory.
+    :type home: Path or None
+    :return: Candidate configuration file paths, home first and local second.
+    :rtype: list[Path]
+    """
+    relative_config = Path(".biblicus") / "config.yml"
+    home_directory = home if home else Path.home()
+    working_directory = cwd if cwd else Path.cwd()
+    # Only the home directory is passed through expanduser(), so a leading "~" is resolved there.
+    search_roots = (home_directory.expanduser(), working_directory)
+    return [root / relative_config for root in search_roots]
+def _deep_merge(base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Recursively combine two mappings, with ``override`` winning on conflicts.
+    When a key holds a dict on both sides the two dicts are merged recursively;
+    in every other case the value from ``override`` replaces the one from ``base``.
+    Neither input is modified, but values that are not merged are shared by
+    reference with the inputs rather than copied.
+    :param base: Mapping providing the starting values.
+    :type base: dict[str, Any]
+    :param override: Mapping whose values take precedence.
+    :type override: dict[str, Any]
+    :return: New merged mapping.
+    :rtype: dict[str, Any]
+    """
+    result: Dict[str, Any] = dict(base)
+    for name, incoming in override.items():
+        existing = result.get(name)
+        both_are_mappings = isinstance(existing, dict) and isinstance(incoming, dict)
+        result[name] = _deep_merge(existing, incoming) if both_are_mappings else incoming
+    return result
+def _load_dotyaml_data(path: Path) -> Dict[str, Any]:
+    """
+    Read one configuration file through the vendored dotyaml loader.
+    The loader is created with an empty prefix and with dotenv loading disabled.
+    Any result that is not a dict is replaced by an empty mapping.
+    :param path: Configuration file path.
+    :type path: Path
+    :return: The loaded mapping, or an empty dict when the loader returns a non-dict.
+    :rtype: dict[str, Any]
+    """
+    parsed = ConfigLoader(prefix="", load_dotenv_first=False).load_from_yaml(path)
+    if not isinstance(parsed, dict):
+        return {}
+    return parsed
+def load_user_config(*, paths: Optional[list[Path]] = None) -> BiblicusUserConfig:
+    """
+    Load user configuration from known locations.
+    Every path that is an existing file is loaded and deep-merged in order, so later
+    files override earlier files. Paths that are not files are skipped.
+    :param paths: Optional explicit search paths. When omitted (``None``), the default
+        paths are used. An explicit empty list means "read no files" and yields an
+        empty configuration.
+    :type paths: list[Path] or None
+    :return: Parsed user configuration. When no files exist, the configuration is empty.
+    :rtype: BiblicusUserConfig
+    :raises ValueError: If the merged data fails model validation. Parse failures in
+        the loader presumably surface as well; confirm against the dotyaml loader.
+    """
+    # Test against None, not truthiness: ``paths or ...`` would turn an explicit
+    # empty list into the default home/cwd search and read files the caller excluded.
+    search_paths = default_user_config_paths() if paths is None else paths
+    merged_data: Dict[str, Any] = {}
+    for path in search_paths:
+        if not path.is_file():
+            continue
+        merged_data = _deep_merge(merged_data, _load_dotyaml_data(path))
+    return BiblicusUserConfig.model_validate(merged_data)
+def resolve_openai_api_key(*, config: Optional[BiblicusUserConfig] = None) -> Optional[str]:
+    """
+    Find the OpenAI API key, preferring the environment over user configuration.
+    A non-empty ``OPENAI_API_KEY`` environment variable is returned immediately.
+    Otherwise the key comes from the ``openai`` section of the given configuration,
+    or of the configuration loaded from the default locations when none is given.
+    :param config: Optional pre-loaded user configuration.
+    :type config: BiblicusUserConfig or None
+    :return: API key string, or None when no key is available.
+    :rtype: str or None
+    """
+    from_environment = os.environ.get("OPENAI_API_KEY")
+    if from_environment:
+        return from_environment
+    # Configuration files are only read when the environment did not supply a key.
+    user_config = config or load_user_config()
+    openai_section = user_config.openai
+    return None if openai_section is None else openai_section.api_key

{biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: biblicus
-Version: 0.2.0
+Version: 0.3.0
 Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
 License: MIT
 Requires-Python: >=3.9
@@ -8,20 +8,30 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: pydantic>=2.0
 Requires-Dist: PyYAML>=6.0
+Requires-Dist: pypdf>=4.0
 Provides-Extra: dev
 Requires-Dist: behave>=1.2.6; extra == "dev"
 Requires-Dist: coverage[toml]>=7.0; extra == "dev"
 Requires-Dist: sphinx>=7.0; extra == "dev"
 Requires-Dist: myst-parser>=2.0; extra == "dev"
+Requires-Dist: sphinx_rtd_theme>=2.0; extra == "dev"
 Requires-Dist: ruff>=0.4.0; extra == "dev"
 Requires-Dist: black>=24.0; extra == "dev"
 Requires-Dist: python-semantic-release>=9.0.0; extra == "dev"
+Provides-Extra: openai
+Requires-Dist: openai>=1.0; extra == "openai"
+Provides-Extra: unstructured
+Requires-Dist: unstructured>=0.12.0; extra == "unstructured"
+Requires-Dist: python-docx>=1.1.0; extra == "unstructured"
+Provides-Extra: ocr
+Requires-Dist: rapidocr-onnxruntime>=1.3.0; extra == "ocr"
 Dynamic: license-file
 # Biblicus
 ![Continuous integration][continuous-integration-badge]
 ![Coverage][coverage-badge]
+![Documentation][documentation-badge]
 Make your documents usable by your assistant, then decide later how you will search and retrieve them.
@@ -31,28 +41,34 @@ The first practical problem is not retrieval. It is collection and care. You nee
 This library gives you a corpus, which is a normal folder on disk. It stores each ingested item as a file, with optional metadata stored next to it. You can open and inspect the raw files directly. Any derived catalog or index can be rebuilt from the raw corpus.
-It integrates with LangChain, Tactus, Pydantic AI, and the agent development kit. Use it from Python or from the command line interface.
+It can be used alongside LangChain, Tactus, Pydantic AI, or the agent development kit. Use it from Python or from the command line interface.
 See [retrieval augmented generation overview] for a short introduction to the idea.
-## The framework
+## A beginner-friendly mental model
-The framework is a small, explicit vocabulary that appears in code, specifications, and documentation. If you learn these words, the rest of the system becomes predictable.
+Think in three stages.
+- Ingest puts raw items into a corpus. This is file first and human inspectable.
+- Extract turns items into usable text. This is where you would do text extraction from Portable Document Format files, optical character recognition for images, or speech to text for audio. If an item is already text, extraction can simply read it. Extraction outputs are derived artifacts, not edits to the raw files.
+- Retrieve searches extracted text and returns evidence. Evidence is structured so you can turn it into context for your model call in whatever way your project prefers.
+If you learn a few project words, the rest of the system becomes predictable.
 - Corpus is the folder that holds raw items and their metadata.
-- Item is the raw bytes of a document or other artifact, plus its source.
+- Item is the raw bytes plus optional metadata and source information.
 - Catalog is the rebuildable index of the corpus.
-- Evidence is what retrieval returns, ready to be turned into context for a large language model.
-- Run is a recorded retrieval build for a corpus.
+- Extraction run is a recorded extraction build that produces text artifacts.
 - Backend is a pluggable retrieval implementation.
-- Recipe is a named configuration for a backend.
-- Pipeline stage is a distinct retrieval step such as retrieve, rerank, and filter.
+- Run is a recorded retrieval build for a corpus.
+- Evidence is what retrieval returns, with identifiers and source information.
 ## Diagram
 This diagram shows how a corpus becomes evidence for an assistant.
-The legend shows what the border styles and fill styles mean.
-The your code region is where you decide how to turn evidence into context and how to call a model.
+Extraction is introduced here as a separate stage so you can swap extraction approaches without changing the raw corpus.
+The legend shows what the block styles mean.
+Your code is where you decide how to turn evidence into context and how to call a model.
 ```mermaid
 %%{init: {"flowchart": {"useMaxWidth": true, "nodeSpacing": 18, "rankSpacing": 22}}}%%
@@ -61,7 +77,10 @@ flowchart LR
     direction LR
     LegendArtifact[Stored artifact or evidence]
     LegendStep[Step]
+    LegendStable[Stable region]
+    LegendPluggable[Pluggable region]
     LegendArtifact --- LegendStep
+    LegendStable --- LegendPluggable
   end
   subgraph Main[" "]
@@ -74,12 +93,19 @@ flowchart LR
       Raw --> Catalog[Catalog file]
     end
+    subgraph PluggableExtractionPipeline[Pluggable extraction pipeline]
+      direction TB
+      Catalog --> Extract[Extract pipeline]
+      Extract --> ExtractedText[Extracted text artifacts]
+      ExtractedText --> ExtractionRun[Extraction run manifest]
+    end
     subgraph PluggableRetrievalBackend[Pluggable retrieval backend]
       direction LR
       subgraph BackendIngestionIndexing[Ingestion and indexing]
         direction TB
-        Catalog --> Build[Build run]
+        ExtractionRun --> Build[Build run]
         Build --> BackendIndex[Backend index]
         BackendIndex --> Run[Run manifest]
       end
@@ -100,6 +126,7 @@ flowchart LR
     end
     style StableCore fill:#ffffff,stroke:#8e24aa,stroke-width:2px,color:#111111
+    style PluggableExtractionPipeline fill:#ffffff,stroke:#5e35b1,stroke-dasharray:6 3,stroke-width:2px,color:#111111
     style PluggableRetrievalBackend fill:#ffffff,stroke:#1e88e5,stroke-dasharray:6 3,stroke-width:2px,color:#111111
     style YourCode fill:#ffffff,stroke:#d81b60,stroke-width:2px,color:#111111
     style BackendIngestionIndexing fill:#ffffff,stroke:#cfd8dc,color:#111111
@@ -107,6 +134,8 @@ flowchart LR
     style Raw fill:#f3e5f5,stroke:#8e24aa,color:#111111
     style Catalog fill:#f3e5f5,stroke:#8e24aa,color:#111111
+    style ExtractedText fill:#f3e5f5,stroke:#8e24aa,color:#111111
+    style ExtractionRun fill:#f3e5f5,stroke:#8e24aa,color:#111111
     style BackendIndex fill:#f3e5f5,stroke:#8e24aa,color:#111111
     style Run fill:#f3e5f5,stroke:#8e24aa,color:#111111
     style Evidence fill:#f3e5f5,stroke:#8e24aa,color:#111111
@@ -115,6 +144,7 @@ flowchart LR
     style Source fill:#f3e5f5,stroke:#8e24aa,color:#111111
     style Ingest fill:#eceff1,stroke:#90a4ae,color:#111111
+    style Extract fill:#eceff1,stroke:#90a4ae,color:#111111
     style Build fill:#eceff1,stroke:#90a4ae,color:#111111
     style Query fill:#eceff1,stroke:#90a4ae,color:#111111
     style Model fill:#eceff1,stroke:#90a4ae,color:#111111
@@ -124,6 +154,8 @@ flowchart LR
   style Main fill:#ffffff,stroke:#ffffff,color:#111111
   style LegendArtifact fill:#f3e5f5,stroke:#8e24aa,color:#111111
   style LegendStep fill:#eceff1,stroke:#90a4ae,color:#111111
+  style LegendStable fill:#ffffff,stroke:#8e24aa,stroke-width:2px,color:#111111
+  style LegendPluggable fill:#ffffff,stroke:#1e88e5,stroke-dasharray:6 3,stroke-width:2px,color:#111111
 ```
 ## Practical value
@@ -136,6 +168,7 @@ flowchart LR
 - Initialize a corpus folder.
 - Ingest items from file paths, web addresses, or text input.
+- Run extraction when you want derived text artifacts from non-text sources.
 - Reindex to refresh the catalog after edits.
 - Build a retrieval run with a backend.
 - Query the run to collect evidence and evaluate it with datasets.
@@ -154,13 +187,25 @@ After the first release, you can install it from Python Package Index.
 python3 -m pip install biblicus
 ```
+### Optional extras
+Some extractors are optional so the base install stays small.
+- Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
+- Speech to text transcription: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
+- Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
 ## Quick start
 ```
+mkdir -p notes
+echo "A small file note" > notes/example.txt
 biblicus init corpora/example
 biblicus ingest --corpus corpora/example notes/example.txt
 echo "A short note" | biblicus ingest --corpus corpora/example --stdin --title "First note"
 biblicus list --corpus corpora/example
+biblicus extract --corpus corpora/example --step pass-through-text --step metadata-text
 biblicus build --corpus corpora/example --backend scan
 biblicus query --corpus corpora/example --query "note"
 ```
@@ -188,13 +233,18 @@ In an assistant system, retrieval usually produces context for a model call. Thi
 ## Learn more
+Full documentation is available on [ReadTheDocs](https://biblicus.readthedocs.io/).
 The documents below are written to be read in order.
 - [Architecture][architecture]
+- [Roadmap][roadmap]
+- [Feature index][feature-index]
 - [Corpus][corpus]
 - [Text extraction][text-extraction]
+- [User configuration][user-configuration]
 - [Backends][backends]
-- [Next steps][next-steps]
+- [Demos][demos]
 - [Testing][testing]
 ## Metadata and catalog
@@ -252,10 +302,18 @@ Publishing uses a Python Package Index token stored in the GitHub secret named P
 ## Documentation
-Reference documentation is generated from Sphinx style docstrings. Build the documentation with the command below.
+Reference documentation is generated from Sphinx style docstrings.
+Install development dependencies:
+```
+python3 -m pip install -e ".[dev]"
+```
+Build the documentation:
 ```
-sphinx-build -b html docs docs/_build
+python3 -m sphinx -b html docs docs/_build
 ```
 ## License
@@ -264,11 +322,15 @@ License terms are in `LICENSE`.
 [retrieval augmented generation overview]: https://en.wikipedia.org/wiki/Retrieval-augmented_generation
 [architecture]: docs/ARCHITECTURE.md
+[roadmap]: docs/ROADMAP.md
+[feature-index]: docs/FEATURE_INDEX.md
 [corpus]: docs/CORPUS.md
 [text-extraction]: docs/EXTRACTION.md
+[user-configuration]: docs/USER_CONFIGURATION.md
 [backends]: docs/BACKENDS.md
-[next-steps]: docs/NEXT_STEPS.md
+[demos]: docs/DEMOS.md
 [testing]: docs/TESTING.md
 [continuous-integration-badge]: https://github.com/AnthusAI/Biblicus/actions/workflows/ci.yml/badge.svg?branch=main
 [coverage-badge]: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/AnthusAI/Biblicus/main/coverage_badge.json
+[documentation-badge]: https://readthedocs.org/projects/biblicus/badge/?version=latest

biblicus-0.3.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,44 @@
+biblicus/__init__.py,sha256=1vPJokNgr7JcDO9eJ2SRR8VLkFG44ZaSACSaalogvYQ,432
+biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
+biblicus/cli.py,sha256=k09mMToSawDC7TbetwtK0RItTLO84EOJCZQKDRA-b9Y,19229
+biblicus/constants.py,sha256=R6fZDoLVMCwgKvTaxEx7G0CstwHGaUTlW9MsmNLDZ44,269
+biblicus/corpus.py,sha256=oBg5nbDoDBkkXaW180ixtvU9Yh0y9nOiZDEMKomtrVU,47688
+biblicus/errors.py,sha256=uMajd5DvgnJ_-jq5sbeom1GV8DPUc-kojBaECFi6CsY,467
+biblicus/evaluation.py,sha256=5xWpb-8f49Osh9aHzo1ab3AXOmls3Imc5rdnEC0pN-8,8143
+biblicus/extraction.py,sha256=MYaHkhj0NWBKNcaohnLvNiHLwyps9JyZGaTxX5gHR-A,19281
+biblicus/frontmatter.py,sha256=JOGjIDzbbOkebQw2RzA-3WDVMAMtJta2INjS4e7-LMg,2463
+biblicus/hook_logging.py,sha256=IMvde-JhVWrx9tNz3eDJ1CY_rr5Sj7DZ2YNomYCZbz0,5366
+biblicus/hook_manager.py,sha256=ZCAkE5wLvn4lnQz8jho_o0HGEC9KdQd9qitkAEUQRcw,6997
+biblicus/hooks.py,sha256=OHQOmOi7rUcQqYWVeod4oPe8nVLepD7F_SlN7O_-BsE,7863
+biblicus/ignore.py,sha256=fyjt34E6tWNNrm1FseOhgH2MgryyVBQVzxhKL5s4aio,1800
+biblicus/models.py,sha256=fdpPRtWmtirjEKpOPL_6ZVRY0vpA2WRqMwNrOqPaauM,14204
+biblicus/retrieval.py,sha256=A1SI4WK5cX-WbtN6FJ0QQxqlEOtQhddLrL0LZIuoTC4,4180
+biblicus/sources.py,sha256=EFy8-rQNLsyzz-98mH-z8gEHMYbqigcNFKLaR92KfDE,7241
+biblicus/time.py,sha256=NEHkJLJ3RH1PdJVAWMYbNCBnCb6UW9DVBLo7Qh1zO88,485
+biblicus/uris.py,sha256=xXD77lqsT9NxbyzI1spX9Y5a3-U6sLYMnpeSAV7g-nM,2013
+biblicus/user_config.py,sha256=DqO08yLn82DhTiFpmIyyLj_J0nMbrtE8xieTj2Cgd6A,4287
+biblicus/_vendor/dotyaml/__init__.py,sha256=e4zbejeJRwlD4I0q3YvotMypO19lXqmT8iyU1q6SvhY,376
+biblicus/_vendor/dotyaml/interpolation.py,sha256=PfUAEEOTFobv7Ox0E6nAxht6BqhHIDe4hP32fZn5TOs,1992
+biblicus/_vendor/dotyaml/loader.py,sha256=KePkjyhKZSvQZphmlmlzTYZJBQsqL5qhtGV1y7G6wzM,5624
+biblicus/_vendor/dotyaml/transformer.py,sha256=2AKPS8DMOPuYtzmM-dlwIqVbARfbBH5jYV1m5qpR49E,3725
+biblicus/backends/__init__.py,sha256=wLXIumV51l6ZIKzjoKKeU7AgIxGOryG7T7ls3a_Fv98,1212
+biblicus/backends/base.py,sha256=Erfj9dXg0nkRKnEcNjHR9_0Ddb2B1NvbmRksVm_g1dU,1776
+biblicus/backends/scan.py,sha256=hdNnQWqi5IH6j95w30BZHxLJ0W9PTaOkqfWJuxCCEMI,12478
+biblicus/backends/sqlite_full_text_search.py,sha256=KgmwOiKvkA0pv7vD0V7bcOdDx_nZIOfuIN6Z4Ij7I68,16516
+biblicus/extractors/__init__.py,sha256=X3pu18QL85IBpYf56l6_5PUxFPhEN5qLTlOrxYpfGck,1776
+biblicus/extractors/base.py,sha256=ka-nz_1zHPr4TS9sU4JfOoY-PJh7lbHPBOEBrbQFGSc,2171
+biblicus/extractors/metadata_text.py,sha256=7FbEPp0K1mXc7FH1_c0KhPhPexF9U6eLd3TVY1vTp1s,3537
+biblicus/extractors/openai_stt.py,sha256=fggErIu6YN6tXbleNTuROhfYi7zDgMd2vD_ecXZ7eXs,7162
+biblicus/extractors/pass_through_text.py,sha256=DNxkCwpH2bbXjPGPEQwsx8kfqXi6rIxXNY_n3TU2-WI,2777
+biblicus/extractors/pdf_text.py,sha256=YtUphgLVxyWJXew6ZsJ8wBRh67Y5ri4ZTRlMmq3g1Bk,3255
+biblicus/extractors/pipeline.py,sha256=LY6eM3ypw50MDB2cPEQqZrjxkhVvIc6sv4UEhHdNDrE,3208
+biblicus/extractors/rapidocr_text.py,sha256=OMAuZealLSSTFVVmBalT-AFJy2pEpHyyvpuWxlnY-GU,4531
+biblicus/extractors/select_longest_text.py,sha256=wRveXAfYLdj7CpGuo4RoD7zE6SIfylRCbv40z2azO0k,3702
+biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
+biblicus/extractors/unstructured_text.py,sha256=l2S_wD_htu7ZHoJQNQtP-kGlEgOeKV_w2IzAC93lePE,3564
+biblicus-0.3.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
+biblicus-0.3.0.dist-info/METADATA,sha256=MHE8tAh9jGiMwk5X9jPSnhRFB6uAZa3T8jo_c1zrIZM,13202
+biblicus-0.3.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+biblicus-0.3.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
+biblicus-0.3.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
+biblicus-0.3.0.dist-info/RECORD,,

biblicus/extractors/cascade.py DELETED Viewed

@@ -1,101 +0,0 @@
-"""
-Cascade extractor plugin that composes multiple extractors.
-"""
-from __future__ import annotations
-from typing import Any, Dict, List, Optional
-from pydantic import BaseModel, ConfigDict, Field, model_validator
-from ..corpus import Corpus
-from ..models import CatalogItem, ExtractedText
-from .base import TextExtractor
-class CascadeStepSpec(BaseModel):
-    """
-    Single extractor step within a cascade pipeline.
-    :ivar extractor_id: Extractor plugin identifier.
-    :vartype extractor_id: str
-    :ivar config: Extractor configuration mapping.
-    :vartype config: dict[str, Any]
-    """
-    model_config = ConfigDict(extra="forbid")
-    extractor_id: str = Field(min_length=1)
-    config: Dict[str, Any] = Field(default_factory=dict)
-class CascadeExtractorConfig(BaseModel):
-    """
-    Configuration for the cascade extractor.
-    :ivar steps: Ordered list of extractor steps to try.
-    :vartype steps: list[CascadeStepSpec]
-    """
-    model_config = ConfigDict(extra="forbid")
-    steps: List[CascadeStepSpec] = Field(min_length=1)
-    @model_validator(mode="after")
-    def _forbid_self_reference(self) -> "CascadeExtractorConfig":
-        if any(step.extractor_id == "cascade" for step in self.steps):
-            raise ValueError("Cascade extractor cannot include itself as a step")
-        return self
-class CascadeExtractor(TextExtractor):
-    """
-    Extractor that tries a sequence of extractors and uses the first usable text result.
-    A result is considered usable when its text is non-empty after stripping whitespace.
-    :ivar extractor_id: Extractor identifier.
-    :vartype extractor_id: str
-    """
-    extractor_id = "cascade"
-    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
-        """
-        Validate cascade extractor configuration.
-        :param config: Configuration mapping.
-        :type config: dict[str, Any]
-        :return: Parsed config.
-        :rtype: CascadeExtractorConfig
-        """
-        return CascadeExtractorConfig.model_validate(config)
-    def extract_text(self, *, corpus: Corpus, item: CatalogItem, config: BaseModel) -> Optional[ExtractedText]:
-        """
-        Run each configured extractor step until usable text is produced.
-        :param corpus: Corpus containing the item bytes.
-        :type corpus: Corpus
-        :param item: Catalog item being processed.
-        :type item: CatalogItem
-        :param config: Parsed configuration model.
-        :type config: CascadeExtractorConfig
-        :return: Extracted text payload or None.
-        :rtype: ExtractedText or None
-        """
-        cascade_config = config if isinstance(config, CascadeExtractorConfig) else CascadeExtractorConfig.model_validate(config)
-        for step in cascade_config.steps:
-            from . import get_extractor
-            extractor = get_extractor(step.extractor_id)
-            parsed_step_config = extractor.validate_config(step.config)
-            result = extractor.extract_text(corpus=corpus, item=item, config=parsed_step_config)
-            if result is None:
-                continue
-            if not result.text.strip():
-                continue
-            return result
-        return None

biblicus-0.2.0.dist-info/RECORD DELETED Viewed

@@ -1,32 +0,0 @@
-biblicus/__init__.py,sha256=3IXdbt-q80_BlKDwTsZw7MScRW4hBgQ-Vn6xHbgNwE8,432
-biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
-biblicus/cli.py,sha256=zDD6juQGTDmrRE2DHUku-G3wV3AtXjwYTNDFACwpdC0,19501
-biblicus/constants.py,sha256=R6fZDoLVMCwgKvTaxEx7G0CstwHGaUTlW9MsmNLDZ44,269
-biblicus/corpus.py,sha256=5naoFi0GSKBg4RFd6wOU-U30NMbG6bfs_RM90JcvDGA,47460
-biblicus/evaluation.py,sha256=H_W35vF5_L4B2JCfLu19VRu402tZ2pFkN2BbBP69lVY,8119
-biblicus/extraction.py,sha256=WX1LRsKrsyHI4Wido6gMwukzRGf5cfPWvRASgu_MRN4,10614
-biblicus/frontmatter.py,sha256=8Tqlpd3bVzZrGRB9Rdj2IwHMSJLvd2ABxMNOi3L5br4,2466
-biblicus/hook_logging.py,sha256=8Rl3BpkfTexSJ7rFi94kl6DMRDD-8eu2N7zv18wXyUM,5371
-biblicus/hook_manager.py,sha256=ucDZoVM-9fg1gQAhUxi-PECaNlHoegAxb-kYCx-OMZs,6987
-biblicus/hooks.py,sha256=OfG3VsCDWQVVZnOTQHnN9GQ0AIws9SK6-85WYTrKkzk,7847
-biblicus/ignore.py,sha256=Di37CTlg6Mg3SKJc2qxZcZdYX00IcTORB2hb0g-Jins,1803
-biblicus/models.py,sha256=6cgJX7Jmm5rBVrXWH46fQf3v__jSyDy73MnKaUQMSHQ,11099
-biblicus/retrieval.py,sha256=T7HELWCNAxZ26yj7dPH8IBUaxV_gx8Ql9iwwGz0teyI,4184
-biblicus/sources.py,sha256=C4P8oM6d50tLXr4z9Shsv4z-hDiQuylXfkT3Bx03dEM,5844
-biblicus/time.py,sha256=rvp2fJXSLVmyA76GCfNKtZoifASodemJTOWN8smPt0s,486
-biblicus/uris.py,sha256=sRDyGmoHr_H4XR4qv_lSbQJXylYD0fNEr02H5wjomnQ,1986
-biblicus/backends/__init__.py,sha256=5OXKSzsn7THhwh9T5StOvEqojx_85XXuYSGdTpMK11U,1214
-biblicus/backends/base.py,sha256=699TKygGgL72Ifkhz1V890nOK6BslwO0-OY7xeqZl-I,1764
-biblicus/backends/scan.py,sha256=DZ-CgZ0jy6_928hu4dASJ8_JH7BTfF8gwVkVhd38W1U,12421
-biblicus/backends/sqlite_full_text_search.py,sha256=FMpASLeK5diK-Uyhr4pqtpDpb_Qyk5_XRaXAKUHDzjs,16502
-biblicus/extractors/__init__.py,sha256=_6Z_JkLoDYwmay76y1fy11lCSqDDizMDPX3Vke_l8x4,1008
-biblicus/extractors/base.py,sha256=yvp709uUCnPEbK-bx6u5WKNPPH3SBWhbSaewoyUIgvA,1870
-biblicus/extractors/cascade.py,sha256=ExojAYsARtF99zVg78wY_wifVfDaJFa6wiRIaT-cpRo,3209
-biblicus/extractors/metadata_text.py,sha256=C0i8fcEC9aLmwhSdK9IlZVZ9ugOocIe0y522pSjvaCA,3203
-biblicus/extractors/pass_through_text.py,sha256=ngDyI13RpCldP-OzV4q9lBTGPxDL6MDxp7OCo1rORyQ,2421
-biblicus-0.2.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
-biblicus-0.2.0.dist-info/METADATA,sha256=nTB344GRVrKuT6oPOrWBpFA_BiG3UAIgq3wCoHEVDgw,10307
-biblicus-0.2.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-biblicus-0.2.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
-biblicus-0.2.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
-biblicus-0.2.0.dist-info/RECORD,,

{biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

biblicus 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

biblicus 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl