PyPI - biblicus - Versions diffs - 0.1.1__py3-none-any.whl - Mend

biblicus 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

biblicus/__init__.py +28 -0
biblicus/__main__.py +8 -0
biblicus/backends/__init__.py +44 -0
biblicus/backends/base.py +65 -0
biblicus/backends/scan.py +292 -0
biblicus/backends/sqlite_full_text_search.py +427 -0
biblicus/cli.py +468 -0
biblicus/constants.py +10 -0
biblicus/corpus.py +952 -0
biblicus/evaluation.py +261 -0
biblicus/frontmatter.py +92 -0
biblicus/models.py +307 -0
biblicus/retrieval.py +137 -0
biblicus/sources.py +132 -0
biblicus/time.py +18 -0
biblicus/uris.py +64 -0
biblicus-0.1.1.dist-info/METADATA +174 -0
biblicus-0.1.1.dist-info/RECORD +22 -0
biblicus-0.1.1.dist-info/WHEEL +5 -0
biblicus-0.1.1.dist-info/entry_points.txt +2 -0
biblicus-0.1.1.dist-info/licenses/LICENSE +21 -0
biblicus-0.1.1.dist-info/top_level.txt +1 -0

biblicus/retrieval.py ADDED Viewed

@@ -0,0 +1,137 @@
+"""
+Shared retrieval helpers for Biblicus backends.
+"""
+from __future__ import annotations
+import hashlib
+import json
+from typing import Any, Dict, Iterable, List, Optional
+from .corpus import Corpus
+from .models import Evidence, QueryBudget, RecipeManifest, RetrievalRun
+from .time import utc_now_iso
+def create_recipe_manifest(
+    *,
+    backend_id: str,
+    name: str,
+    config: Dict[str, Any],
+    description: Optional[str] = None,
+) -> RecipeManifest:
+    """
+    Create a deterministic recipe manifest from a backend configuration.
+    :param backend_id: Backend identifier for the recipe.
+    :type backend_id: str
+    :param name: Human-readable recipe name.
+    :type name: str
+    :param config: Backend-specific configuration values.
+    :type config: dict[str, Any]
+    :param description: Optional recipe description.
+    :type description: str or None
+    :return: Deterministic recipe manifest.
+    :rtype: RecipeManifest
+    """
+    config_json = json.dumps(config, sort_keys=True, separators=(",", ":"))
+    recipe_seed = f"{backend_id}:{config_json}"
+    recipe_id = hashlib.sha256(recipe_seed.encode("utf-8")).hexdigest()
+    return RecipeManifest(
+        recipe_id=recipe_id,
+        backend_id=backend_id,
+        name=name,
+        created_at=utc_now_iso(),
+        config=config,
+        description=description,
+    )
+def create_run_manifest(
+    corpus: Corpus,
+    *,
+    recipe: RecipeManifest,
+    stats: Dict[str, Any],
+    artifact_paths: Optional[List[str]] = None,
+) -> RetrievalRun:
+    """
+    Create a retrieval run manifest tied to the current catalog snapshot.
+    :param corpus: Corpus used to generate the run.
+    :type corpus: Corpus
+    :param recipe: Recipe manifest for the run.
+    :type recipe: RecipeManifest
+    :param stats: Backend-specific run statistics.
+    :type stats: dict[str, Any]
+    :param artifact_paths: Optional relative paths to materialized artifacts.
+    :type artifact_paths: list[str] or None
+    :return: Run manifest.
+    :rtype: RetrievalRun
+    """
+    catalog = corpus.load_catalog()
+    created_at = utc_now_iso()
+    run_id = hashlib.sha256(f"{recipe.recipe_id}:{created_at}".encode("utf-8")).hexdigest()
+    return RetrievalRun(
+        run_id=run_id,
+        recipe=recipe,
+        corpus_uri=catalog.corpus_uri,
+        catalog_generated_at=catalog.generated_at,
+        created_at=created_at,
+        artifact_paths=list(artifact_paths or []),
+        stats=stats,
+    )
+def hash_text(text: str) -> str:
+    """
+    Hash a text payload for provenance.
+    :param text: Text to hash.
+    :type text: str
+    :return: Secure Hash Algorithm 256 hex digest.
+    :rtype: str
+    """
+    return hashlib.sha256(text.encode("utf-8")).hexdigest()
+def apply_budget(evidence: Iterable[Evidence], budget: QueryBudget) -> List[Evidence]:
+    """
+    Apply a query budget to a ranked evidence list.
+    :param evidence: Ranked evidence iterable (highest score first).
+    :type evidence: Iterable[Evidence]
+    :param budget: Budget constraints to enforce.
+    :type budget: QueryBudget
+    :return: Evidence list respecting the budget.
+    :rtype: list[Evidence]
+    """
+    selected_evidence: List[Evidence] = []
+    source_counts: Dict[str, int] = {}
+    total_characters = 0
+    for candidate_evidence in evidence:
+        if len(selected_evidence) >= budget.max_total_items:
+            break
+        source_key = candidate_evidence.source_uri or candidate_evidence.item_id
+        if budget.max_items_per_source is not None:
+            if source_counts.get(source_key, 0) >= budget.max_items_per_source:
+                continue
+        text_character_count = len(candidate_evidence.text or "")
+        if budget.max_total_characters is not None:
+            if total_characters + text_character_count > budget.max_total_characters:
+                continue
+        selected_evidence.append(candidate_evidence)
+        source_counts[source_key] = source_counts.get(source_key, 0) + 1
+        total_characters += text_character_count
+    return [
+        evidence_item.model_copy(update={"rank": index})
+        for index, evidence_item in enumerate(selected_evidence, start=1)
+    ]

biblicus/sources.py ADDED Viewed

@@ -0,0 +1,132 @@
+"""
+Source loading helpers for Biblicus ingestion.
+"""
+from __future__ import annotations
+import mimetypes
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional
+from urllib.parse import unquote, urlparse
+from urllib.request import Request, urlopen
+def _looks_like_uri(value: str) -> bool:
+    """
+    Check whether a string resembles a uniform resource identifier.
+    :param value: Candidate string.
+    :type value: str
+    :return: True if the string has a valid uniform resource identifier scheme prefix.
+    :rtype: bool
+    """
+    return "://" in value and value.split("://", 1)[0].isidentifier()
+def _filename_from_url_path(path: str) -> str:
+    """
+    Derive a filename from a uniform resource locator path.
+    :param path: Uniform resource locator path component.
+    :type path: str
+    :return: Filename or a fallback name.
+    :rtype: str
+    """
+    filename = Path(unquote(path)).name
+    return filename or "download"
+def _media_type_from_filename(name: str) -> str:
+    """
+    Guess media type from a filename.
+    :param name: Filename to inspect.
+    :type name: str
+    :return: Guessed media type or application/octet-stream.
+    :rtype: str
+    """
+    media_type, _ = mimetypes.guess_type(name)
+    return media_type or "application/octet-stream"
+@dataclass(frozen=True)
+class SourcePayload:
+    """
+    Immutable container for the bytes and descriptive metadata of one loaded source.
+    The dataclass is frozen, so fields cannot be reassigned after construction.
+    :ivar data: Raw bytes from the source.
+    :vartype data: bytes
+    :ivar filename: Suggested filename for the payload.
+    :vartype filename: str
+    :ivar media_type: Internet Assigned Numbers Authority media type for the payload.
+    :vartype media_type: str
+    :ivar source_uri: Source uniform resource identifier used to load the payload.
+    :vartype source_uri: str
+    """
+    # Field order below is also the positional order of the generated constructor.
+    data: bytes
+    filename: str
+    media_type: str
+    source_uri: str
+def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> SourcePayload:
+    """
+    Load bytes from a source reference.
+    :param source: File path or uniform resource locator to load.
+    :type source: str or Path
+    :param source_uri: Optional override for the source uniform resource identifier.
+    :type source_uri: str or None
+    :return: Source payload with bytes and metadata.
+    :rtype: SourcePayload
+    :raises ValueError: If a file:// uniform resource identifier has a non-local host.
+    :raises NotImplementedError: If the uniform resource identifier scheme is unsupported.
+    """
+    if isinstance(source, Path):
+        path = source.resolve()
+        media_type = _media_type_from_filename(path.name)
+        if path.suffix.lower() in {".md", ".markdown"}:
+            media_type = "text/markdown"
+        return SourcePayload(
+            data=path.read_bytes(),
+            filename=path.name,
+            media_type=media_type,
+            source_uri=source_uri or path.as_uri(),
+        )
+    if _looks_like_uri(source):
+        parsed = urlparse(source)
+        if parsed.scheme == "file":
+            if parsed.netloc not in ("", "localhost"):
+                raise ValueError(f"Unsupported file uniform resource identifier host: {parsed.netloc!r}")
+            path = Path(unquote(parsed.path)).resolve()
+            return load_source(path, source_uri=source_uri or source)
+        if parsed.scheme in {"http", "https"}:
+            request = Request(source, headers={"User-Agent": "biblicus/0"})
+            with urlopen(request, timeout=30) as response:
+                response_bytes = response.read()
+                content_type = response.headers.get("Content-Type", "").split(";", 1)[0].strip()
+                filename = _filename_from_url_path(parsed.path)
+                media_type = content_type or _media_type_from_filename(filename)
+                if Path(filename).suffix.lower() in {".md", ".markdown"}:
+                    media_type = "text/markdown"
+                return SourcePayload(
+                    data=response_bytes,
+                    filename=filename,
+                    media_type=media_type,
+                    source_uri=source_uri or source,
+                )
+        raise NotImplementedError(
+            f"Unsupported source uniform resource identifier scheme: {parsed.scheme}://"
+        )
+    path = Path(source).resolve()
+    return load_source(path, source_uri=source_uri)

biblicus/time.py ADDED Viewed

@@ -0,0 +1,18 @@
+"""
+Time utilities for Biblicus.
+"""
+from __future__ import annotations
+from datetime import datetime, timezone
+def utc_now_iso() -> str:
+    """
+    Return the current Coordinated Universal Time as an International Organization for Standardization 8601 string.
+    :return: Current Coordinated Universal Time timestamp in International Organization for Standardization 8601 format.
+    :rtype: str
+    """
+    return datetime.now(timezone.utc).replace(microsecond=0).isoformat()

biblicus/uris.py ADDED Viewed

@@ -0,0 +1,64 @@
+"""
+Uniform resource identifier and path helpers for Biblicus corpora.
+"""
+from __future__ import annotations
+from pathlib import Path
+from typing import Union
+from urllib.parse import unquote, urlparse
+def _looks_like_uri(value: str) -> bool:
+    """
+    Check whether a string resembles a uniform resource identifier.
+    :param value: Candidate string.
+    :type value: str
+    :return: True if the string has a valid uniform resource identifier scheme prefix.
+    :rtype: bool
+    """
+    return "://" in value and value.split("://", 1)[0].isidentifier()
+def corpus_ref_to_path(ref: Union[str, Path]) -> Path:
+    """
+    Convert a corpus reference to a filesystem path.
+    :param ref: Filesystem path or file:// uniform resource identifier.
+    :type ref: str or Path
+    :return: Resolved filesystem path.
+    :rtype: Path
+    :raises NotImplementedError: If a non-file uniform resource identifier scheme is used.
+    :raises ValueError: If a file:// uniform resource identifier has a non-local host.
+    """
+    if isinstance(ref, Path):
+        return ref.resolve()
+    if _looks_like_uri(ref):
+        parsed = urlparse(ref)
+        if parsed.scheme != "file":
+            raise NotImplementedError(
+                "Only file:// corpus uniform resource identifiers are supported in version zero "
+                f"(got {parsed.scheme}://)"
+            )
+        if parsed.netloc not in ("", "localhost"):
+            raise ValueError(f"Unsupported file uniform resource identifier host: {parsed.netloc!r}")
+        return Path(unquote(parsed.path)).resolve()
+    return Path(ref).resolve()
+def normalize_corpus_uri(ref: Union[str, Path]) -> str:
+    """
+    Normalize a corpus reference into a file:// uniform resource identifier.
+    :param ref: Filesystem path or file:// uniform resource identifier.
+    :type ref: str or Path
+    :return: Canonical file:// uniform resource identifier.
+    :rtype: str
+    """
+    return corpus_ref_to_path(ref).as_uri()

biblicus-0.1.1.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,174 @@
+Metadata-Version: 2.4
+Name: biblicus
+Version: 0.1.1
+Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
+License: MIT
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: pydantic>=2.0
+Requires-Dist: PyYAML>=6.0
+Provides-Extra: dev
+Requires-Dist: behave>=1.2.6; extra == "dev"
+Requires-Dist: coverage[toml]>=7.0; extra == "dev"
+Requires-Dist: sphinx>=7.0; extra == "dev"
+Requires-Dist: myst-parser>=2.0; extra == "dev"
+Requires-Dist: ruff>=0.4.0; extra == "dev"
+Requires-Dist: black>=24.0; extra == "dev"
+Requires-Dist: python-semantic-release>=9.0.0; extra == "dev"
+Dynamic: license-file
+# Biblicus
+Make your documents usable by your assistant, then decide later how you will search and retrieve them.
+If you are building an assistant in Python, you probably have material you want it to use: notes, documents, web pages, and reference files. A common approach is retrieval augmented generation, where a system retrieves relevant material and uses it as evidence when generating a response.
+The first practical problem is not retrieval. It is collection and care. You need a stable place to put raw items, you need a small amount of metadata so you can find them again, and you need a way to evolve your retrieval approach over time without rewriting ingestion.
+This library gives you a corpus, which is a normal folder on disk. It stores each ingested item as a file, with optional metadata stored next to it. You can open and inspect the raw files directly. Any derived catalog or index can be rebuilt from the raw corpus.
+It integrates with LangChain, Tactus, Pydantic AI, and the agent development kit. Use it from Python or from the command line interface.
+See [retrieval augmented generation overview] for a short introduction to the idea.
+## The framework
+The framework is a small, explicit vocabulary that appears in code, specifications, and documentation. If you learn these words, the rest of the system becomes predictable.
+- Corpus is the folder that holds raw items and their metadata.
+- Item is the raw bytes of a document or other artifact, plus its source.
+- Catalog is the rebuildable index of the corpus.
+- Evidence is what retrieval returns, ready to be turned into context for a large language model.
+- Run is a recorded retrieval build for a corpus.
+- Backend is a pluggable retrieval implementation.
+- Recipe is a named configuration for a backend.
+- Pipeline stage is a distinct retrieval step such as retrieve, rerank, or filter.
+## Practical value
+- You can ingest raw material once, then try many retrieval approaches over time.
+- You can keep raw files readable and portable, without locking your data inside a database.
+- You can evaluate retrieval runs against shared datasets and compare backends using the same corpus.
+## Typical flow
+- Initialize a corpus folder.
+- Ingest items from file paths, web addresses, or text input.
+- Reindex to refresh the catalog after edits.
+- Build a retrieval run with a backend.
+- Query the run to collect evidence and evaluate it with datasets.
+## Install
+This repository is a working Python package. Install it into a virtual environment from the repository root.
+```
+python3 -m pip install -e .
+```
+After the first release, you can install it from the Python Package Index.
+```
+python3 -m pip install biblicus
+```
+## Quick start
+```
+biblicus init corpora/example
+biblicus ingest --corpus corpora/example notes/example.txt
+echo "A short note" | biblicus ingest --corpus corpora/example --stdin --title "First note"
+biblicus list --corpus corpora/example
+biblicus build --corpus corpora/example --backend scan
+biblicus query --corpus corpora/example --query "note"
+```
+## Python usage
+From Python, the same flow is available through the Corpus class and backend interfaces. The public surface area is small on purpose.
+- Create a corpus with `Corpus.init` or open one with `Corpus.open`.
+- Ingest notes with `Corpus.ingest_note`.
+- Ingest files or web addresses with `Corpus.ingest_source`.
+- List items with `Corpus.list_items`.
+- Build a retrieval run with `get_backend` and `backend.build_run`.
+- Query a run with `backend.query`.
+- Evaluate with `evaluate_run`.
+## How it fits into an assistant
+In an assistant system, retrieval usually produces context for a model call. This library treats evidence as the primary output so you can decide how to use it.
+- Use a corpus as the source of truth for raw items.
+- Use a backend run to build any derived artifacts needed for retrieval.
+- Use queries to obtain evidence objects.
+- Convert evidence into the format your framework expects, such as message content, tool output, or citations.
+## Learn more
+The documents below are written to be read in order.
+- [Architecture][architecture]
+- [Backends][backends]
+## Metadata and catalog
+Raw items are stored as files in the corpus raw directory. Metadata can live in a Markdown front matter block or a sidecar file with the suffix `.biblicus.yml`. The catalog lives in `.biblicus/catalog.json` and can be rebuilt at any time with `biblicus reindex`.
+## Corpus layout
+```
+corpus/
+  raw/
+    item.bin
+    item.bin.biblicus.yml
+  .biblicus/
+    config.json
+    catalog.json
+    runs/
+      run-id.json
+```
+## Retrieval backends
+Two backends are included.
+- `scan` is a minimal baseline that scans raw items directly.
+- `sqlite-full-text-search` is a practical baseline that builds a full-text search index in SQLite.
+## Integration corpus and evaluation dataset
+Use `scripts/download_wikipedia.py` to download a small integration corpus from Wikipedia when running tests or demos. The repository does not include that content.
+The dataset file `datasets/wikipedia_mini.json` provides a small evaluation set that matches the integration corpus.
+## Tests and coverage
+```
+python3 scripts/test.py
+```
+## Releases
+Releases are automated from the main branch using semantic versioning and conventional commit messages.
+The release pipeline publishes a GitHub release and uploads the package to the Python Package Index when continuous integration succeeds.
+Publishing uses a Python Package Index token stored in the GitHub secret named PYPI_TOKEN.
+## Documentation
+Reference documentation is generated from Sphinx style docstrings. Build the documentation with the command below.
+```
+sphinx-build -b html docs docs/_build
+```
+## License
+License terms are in `LICENSE`.
+[retrieval augmented generation overview]: https://en.wikipedia.org/wiki/Retrieval-augmented_generation
+[architecture]: docs/ARCHITECTURE.md
+[backends]: docs/BACKENDS.md

biblicus-0.1.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,22 @@
+biblicus/__init__.py,sha256=o_1kQ7q9DCcjH7zm5MAvPx49hArnSvbr88kHKzBFMvM,432
+biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
+biblicus/cli.py,sha256=DwnvcDmjelzUq_9VMo_U_-FoBs3Si3QONVJdWGonXs4,15116
+biblicus/constants.py,sha256=t8p0yStpJAYPxsFlM0u5zJcQr_ARKEqEnIgNckjyF5Y,196
+biblicus/corpus.py,sha256=953gzT77HvYeTs2pcBXyixYRTxh65nm1JtlHVfKvCzg,30921
+biblicus/evaluation.py,sha256=H_W35vF5_L4B2JCfLu19VRu402tZ2pFkN2BbBP69lVY,8119
+biblicus/frontmatter.py,sha256=8Tqlpd3bVzZrGRB9Rdj2IwHMSJLvd2ABxMNOi3L5br4,2466
+biblicus/models.py,sha256=ZDb7-t9pycPpgZWVs5CcrpyeA_8OZLoQk-aflKjU7M4,10512
+biblicus/retrieval.py,sha256=T7HELWCNAxZ26yj7dPH8IBUaxV_gx8Ql9iwwGz0teyI,4184
+biblicus/sources.py,sha256=XFF75kqMyYdeYy6k8NtDnOmCxAmroW7DH6mdzWMPMuY,4358
+biblicus/time.py,sha256=rvp2fJXSLVmyA76GCfNKtZoifASodemJTOWN8smPt0s,486
+biblicus/uris.py,sha256=sRDyGmoHr_H4XR4qv_lSbQJXylYD0fNEr02H5wjomnQ,1986
+biblicus/backends/__init__.py,sha256=5OXKSzsn7THhwh9T5StOvEqojx_85XXuYSGdTpMK11U,1214
+biblicus/backends/base.py,sha256=699TKygGgL72Ifkhz1V890nOK6BslwO0-OY7xeqZl-I,1764
+biblicus/backends/scan.py,sha256=qvktqHIB0459sjzEO4EnS1PCXwwM19LjOx8oaDoU7DQ,9245
+biblicus/backends/sqlite_full_text_search.py,sha256=s_3gsEcdlxSFuluWcug4XEklwEoY42_Dgd7luY-BqqI,14152
+biblicus-0.1.1.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
+biblicus-0.1.1.dist-info/METADATA,sha256=lgvWJUgESiwWTCZ6_uUzgZeM3SkvnwjIzcsb8OE53BA,6635
+biblicus-0.1.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+biblicus-0.1.1.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
+biblicus-0.1.1.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
+biblicus-0.1.1.dist-info/RECORD,,

biblicus-0.1.1.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (80.10.2)
+Root-Is-Purelib: true
+Tag: py3-none-any

biblicus-0.1.1.dist-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ [console_scripts]
2	+ biblicus = biblicus.cli:main

biblicus-0.1.1.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2025 Biblicus Contributors
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

biblicus-0.1.1.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ biblicus