PyPI - biblicus - Versions diffs - 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

biblicus 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

biblicus/__init__.py +1 -1
biblicus/backends/scan.py +81 -4
biblicus/backends/sqlite_full_text_search.py +63 -2
biblicus/cli.py +123 -0
biblicus/constants.py +2 -0
biblicus/corpus.py +431 -2
biblicus/extraction.py +330 -0
biblicus/extractors/__init__.py +33 -0
biblicus/extractors/base.py +61 -0
biblicus/extractors/cascade.py +101 -0
biblicus/extractors/metadata_text.py +98 -0
biblicus/extractors/pass_through_text.py +74 -0
biblicus/hook_logging.py +185 -0
biblicus/hook_manager.py +205 -0
biblicus/hooks.py +265 -0
biblicus/ignore.py +67 -0
biblicus/models.py +20 -0
biblicus/sources.py +45 -0
{biblicus-0.1.1.dist-info → biblicus-0.2.0.dist-info}/METADATA +101 -1
biblicus-0.2.0.dist-info/RECORD +32 -0
biblicus-0.1.1.dist-info/RECORD +0 -22
{biblicus-0.1.1.dist-info → biblicus-0.2.0.dist-info}/WHEEL +0 -0
{biblicus-0.1.1.dist-info → biblicus-0.2.0.dist-info}/entry_points.txt +0 -0
{biblicus-0.1.1.dist-info → biblicus-0.2.0.dist-info}/licenses/LICENSE +0 -0
{biblicus-0.1.1.dist-info → biblicus-0.2.0.dist-info}/top_level.txt +0 -0

biblicus/__init__.py CHANGED Viewed

@@ -25,4 +25,4 @@ __all__ = [
     "RetrievalRun",
 ]
-__version__ = "0.1.1"
+__version__ = "0.2.0"

biblicus/backends/scan.py CHANGED Viewed

@@ -9,6 +9,7 @@ from typing import Dict, Iterable, List, Optional, Tuple
 from pydantic import BaseModel, ConfigDict, Field
 from ..corpus import Corpus
+from ..extraction import ExtractionRunReference, parse_extraction_run_reference
 from ..frontmatter import parse_front_matter
 from ..models import Evidence, QueryBudget, RetrievalResult, RetrievalRun
 from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
@@ -21,11 +22,14 @@ class ScanRecipeConfig(BaseModel):
     :ivar snippet_characters: Maximum characters to include in evidence snippets.
     :vartype snippet_characters: int
+    :ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
+    :vartype extraction_run: str or None
     """
     model_config = ConfigDict(extra="forbid")
     snippet_characters: int = Field(default=400, ge=1)
+    extraction_run: Optional[str] = None
 class ScanBackend:
@@ -59,7 +63,7 @@ class ScanBackend:
             name=recipe_name,
             config=recipe_config.model_dump(),
         )
-        stats = {"items": len(catalog.items), "text_items": _count_text_items(catalog.items.values())}
+        stats = {"items": len(catalog.items), "text_items": _count_text_items(corpus, catalog.items.values(), recipe_config)}
         run = create_run_manifest(corpus, recipe=recipe, stats=stats, artifact_paths=[])
         corpus.write_run(run)
         return run
@@ -89,12 +93,14 @@ class ScanBackend:
         recipe_config = ScanRecipeConfig.model_validate(run.recipe.config)
         catalog = corpus.load_catalog()
+        extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
         query_tokens = _tokenize_query(query_text)
         scored_candidates = _score_items(
             corpus,
             catalog.items.values(),
             query_tokens,
             recipe_config.snippet_characters,
+            extraction_reference=extraction_reference,
         )
         sorted_candidates = sorted(
             scored_candidates,
@@ -124,18 +130,60 @@ class ScanBackend:
         )
-def _count_text_items(items: Iterable[object]) -> int:
+def _resolve_extraction_reference(corpus: Corpus, recipe_config: ScanRecipeConfig) -> Optional[ExtractionRunReference]:
+    """
+    Resolve an extraction run reference from a recipe config.
+    :param corpus: Corpus associated with the recipe.
+    :type corpus: Corpus
+    :param recipe_config: Parsed scan recipe configuration.
+    :type recipe_config: ScanRecipeConfig
+    :return: Parsed extraction reference or None.
+    :rtype: ExtractionRunReference or None
+    :raises FileNotFoundError: If an extraction run is referenced but not present.
+    """
+    if not recipe_config.extraction_run:
+        return None
+    extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
+    run_dir = corpus.extraction_run_dir(
+        extractor_id=extraction_reference.extractor_id,
+        run_id=extraction_reference.run_id,
+    )
+    if not run_dir.is_dir():
+        raise FileNotFoundError(f"Missing extraction run: {extraction_reference.as_string()}")
+    return extraction_reference
+def _count_text_items(corpus: Corpus, items: Iterable[object], recipe_config: ScanRecipeConfig) -> int:
     """
     Count catalog items that represent text content.
+    When an extraction run is configured, extracted artifacts are treated as text.
+    :param corpus: Corpus containing the items.
+    :type corpus: Corpus
     :param items: Catalog items to inspect.
     :type items: Iterable[object]
+    :param recipe_config: Parsed scan recipe configuration.
+    :type recipe_config: ScanRecipeConfig
     :return: Number of text items.
     :rtype: int
     """
     text_item_count = 0
+    extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
     for catalog_item in items:
+        item_id = str(getattr(catalog_item, "id", ""))
+        if extraction_reference and item_id:
+            extracted_text = corpus.read_extracted_text(
+                extractor_id=extraction_reference.extractor_id,
+                run_id=extraction_reference.run_id,
+                item_id=item_id,
+            )
+            if isinstance(extracted_text, str) and extracted_text.strip():
+                text_item_count += 1
+                continue
         media_type = getattr(catalog_item, "media_type", "")
         if media_type == "text/markdown" or str(media_type).startswith("text/"):
             text_item_count += 1
@@ -155,20 +203,40 @@ def _tokenize_query(query_text: str) -> List[str]:
     return [token for token in query_text.lower().split() if token]
-def _load_text_from_item(corpus: Corpus, relpath: str, media_type: str) -> Optional[str]:
+def _load_text_from_item(
+    corpus: Corpus,
+    *,
+    item_id: str,
+    relpath: str,
+    media_type: str,
+    extraction_reference: Optional[ExtractionRunReference],
+) -> Optional[str]:
     """
     Load a text payload from a catalog item.
     :param corpus: Corpus containing the item.
     :type corpus: Corpus
+    :param item_id: Item identifier.
+    :type item_id: str
     :param relpath: Relative path to the stored content.
     :type relpath: str
     :param media_type: Media type for the stored content.
     :type media_type: str
+    :param extraction_reference: Optional extraction run reference.
+    :type extraction_reference: ExtractionRunReference or None
     :return: Text payload or None if not decodable as text.
     :rtype: str or None
     """
+    if extraction_reference:
+        extracted_text = corpus.read_extracted_text(
+            extractor_id=extraction_reference.extractor_id,
+            run_id=extraction_reference.run_id,
+            item_id=item_id,
+        )
+        if isinstance(extracted_text, str) and extracted_text.strip():
+            return extracted_text
     content_path = corpus.root / relpath
     raw_bytes = content_path.read_bytes()
     if media_type == "text/markdown":
@@ -240,6 +308,8 @@ def _score_items(
     items: Iterable[object],
     tokens: List[str],
     snippet_characters: int,
+    *,
+    extraction_reference: Optional[ExtractionRunReference],
 ) -> List[Evidence]:
     """
     Score catalog items by token frequency and return evidence candidates.
@@ -260,7 +330,14 @@ def _score_items(
     for catalog_item in items:
         media_type = getattr(catalog_item, "media_type", "")
         relpath = getattr(catalog_item, "relpath", "")
-        item_text = _load_text_from_item(corpus, relpath, media_type)
+        item_id = str(getattr(catalog_item, "id", ""))
+        item_text = _load_text_from_item(
+            corpus,
+            item_id=item_id,
+            relpath=relpath,
+            media_type=str(media_type),
+            extraction_reference=extraction_reference,
+        )
         if item_text is None:
             continue
         lower_text = item_text.lower()

biblicus/backends/sqlite_full_text_search.py CHANGED Viewed

@@ -12,6 +12,7 @@ from pydantic import BaseModel, ConfigDict, Field
 from ..constants import CORPUS_DIR_NAME, RUNS_DIR_NAME
 from ..corpus import Corpus
+from ..extraction import ExtractionRunReference, parse_extraction_run_reference
 from ..frontmatter import parse_front_matter
 from ..models import Evidence, QueryBudget, RetrievalResult, RetrievalRun
 from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
@@ -28,6 +29,8 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
     :vartype chunk_overlap: int
     :ivar snippet_characters: Maximum characters to include in evidence snippets.
     :vartype snippet_characters: int
+    :ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
+    :vartype extraction_run: str or None
     """
     model_config = ConfigDict(extra="forbid")
@@ -35,6 +38,7 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
     chunk_size: int = Field(default=800, ge=1)
     chunk_overlap: int = Field(default=200, ge=0)
     snippet_characters: int = Field(default=400, ge=1)
+    extraction_run: Optional[str] = None
 class SqliteFullTextSearchBackend:
@@ -72,11 +76,13 @@ class SqliteFullTextSearchBackend:
         db_relpath = str(Path(CORPUS_DIR_NAME) / RUNS_DIR_NAME / f"{run.run_id}.sqlite")
         db_path = corpus.root / db_relpath
         corpus.runs_dir.mkdir(parents=True, exist_ok=True)
+        extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
         stats = _build_full_text_search_index(
             db_path=db_path,
             corpus=corpus,
             items=catalog.items.values(),
             recipe_config=recipe_config,
+            extraction_reference=extraction_reference,
         )
         run = run.model_copy(update={"artifact_paths": [db_relpath], "stats": stats})
         corpus.write_run(run)
@@ -227,6 +233,7 @@ def _build_full_text_search_index(
     corpus: Corpus,
     items: Iterable[object],
     recipe_config: SqliteFullTextSearchRecipeConfig,
+    extraction_reference: Optional[ExtractionRunReference],
 ) -> Dict[str, int]:
     """
     Build a full-text search index from corpus items.
@@ -256,7 +263,13 @@ def _build_full_text_search_index(
             item_count += 1
             media_type = getattr(catalog_item, "media_type", "")
             relpath = getattr(catalog_item, "relpath", "")
-            item_text = _load_text_from_item(corpus, relpath, media_type)
+            item_text = _load_text_from_item(
+                corpus,
+                item_id=str(getattr(catalog_item, "id", "")),
+                relpath=str(relpath),
+                media_type=str(media_type),
+                extraction_reference=extraction_reference,
+            )
             if item_text is None:
                 continue
             text_item_count += 1
@@ -302,20 +315,40 @@ def _build_full_text_search_index(
         connection.close()
-def _load_text_from_item(corpus: Corpus, relpath: str, media_type: str) -> Optional[str]:
+def _load_text_from_item(
+    corpus: Corpus,
+    *,
+    item_id: str,
+    relpath: str,
+    media_type: str,
+    extraction_reference: Optional[ExtractionRunReference],
+) -> Optional[str]:
     """
     Load text content from a catalog item.
     :param corpus: Corpus containing the content.
     :type corpus: Corpus
+    :param item_id: Item identifier.
+    :type item_id: str
     :param relpath: Relative path to the content.
     :type relpath: str
     :param media_type: Media type for the content.
     :type media_type: str
+    :param extraction_reference: Optional extraction run reference.
+    :type extraction_reference: ExtractionRunReference or None
     :return: Text payload or None if not text.
     :rtype: str or None
     """
+    if extraction_reference:
+        extracted_text = corpus.read_extracted_text(
+            extractor_id=extraction_reference.extractor_id,
+            run_id=extraction_reference.run_id,
+            item_id=item_id,
+        )
+        if isinstance(extracted_text, str) and extracted_text.strip():
+            return extracted_text
     content_path = corpus.root / relpath
     raw_bytes = content_path.read_bytes()
     if media_type == "text/markdown":
@@ -327,6 +360,34 @@ def _load_text_from_item(corpus: Corpus, relpath: str, media_type: str) -> Optio
     return None
+def _resolve_extraction_reference(
+    corpus: Corpus,
+    recipe_config: SqliteFullTextSearchRecipeConfig,
+) -> Optional[ExtractionRunReference]:
+    """
+    Resolve an extraction run reference from a recipe config.
+    :param corpus: Corpus associated with the recipe.
+    :type corpus: Corpus
+    :param recipe_config: Parsed backend recipe configuration.
+    :type recipe_config: SqliteFullTextSearchRecipeConfig
+    :return: Parsed extraction reference or None.
+    :rtype: ExtractionRunReference or None
+    :raises FileNotFoundError: If an extraction run is referenced but not present.
+    """
+    if not recipe_config.extraction_run:
+        return None
+    extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
+    run_dir = corpus.extraction_run_dir(
+        extractor_id=extraction_reference.extractor_id,
+        run_id=extraction_reference.run_id,
+    )
+    if not run_dir.is_dir():
+        raise FileNotFoundError(f"Missing extraction run: {extraction_reference.as_string()}")
+    return extraction_reference
 def _iter_chunks(text: str, *, chunk_size: int, chunk_overlap: int) -> Iterable[Tuple[int, int, str]]:
     """
     Yield overlapping chunks of text for indexing.

biblicus/cli.py CHANGED Viewed

@@ -14,6 +14,7 @@ from pydantic import ValidationError
 from .backends import get_backend
 from .corpus import Corpus
+from .extraction import build_extraction_run
 from .evaluation import evaluate_run, load_dataset
 from .models import QueryBudget
 from .uris import corpus_ref_to_path
@@ -187,6 +188,27 @@ def cmd_reindex(arguments: argparse.Namespace) -> int:
     return 0
+def cmd_import_tree(arguments: argparse.Namespace) -> int:
+    """
+    Import a folder tree into a corpus.
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    tags = _parse_tags(arguments.tags, arguments.tag)
+    stats = corpus.import_tree(Path(arguments.path), tags=tags)
+    print(json.dumps(stats, indent=2, sort_keys=False))
+    return 0
 def cmd_purge(arguments: argparse.Namespace) -> int:
     """
     Purge all items and derived artifacts from a corpus.
@@ -240,6 +262,44 @@ def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
     return config
+def _parse_step_spec(raw_step: str) -> tuple[str, Dict[str, object]]:
+    """
+    Parse a cascade step specification.
+    :param raw_step: Step spec in the form extractor_id or extractor_id:key=value,key=value.
+    :type raw_step: str
+    :return: Tuple of extractor_id and config mapping.
+    :rtype: tuple[str, dict[str, object]]
+    :raises ValueError: If the step spec is invalid.
+    """
+    raw_step = raw_step.strip()
+    if not raw_step:
+        raise ValueError("Step spec must be non-empty")
+    if ":" not in raw_step:
+        return raw_step, {}
+    extractor_id, raw_pairs = raw_step.split(":", 1)
+    extractor_id = extractor_id.strip()
+    if not extractor_id:
+        raise ValueError("Step spec must start with an extractor identifier")
+    config: Dict[str, object] = {}
+    raw_pairs = raw_pairs.strip()
+    if not raw_pairs:
+        return extractor_id, {}
+    for token in raw_pairs.split(","):
+        token = token.strip()
+        if not token:
+            continue
+        if "=" not in token:
+            raise ValueError(f"Step config values must be key=value (got {token!r})")
+        key, value = token.split("=", 1)
+        key = key.strip()
+        if not key:
+            raise ValueError("Step config keys must be non-empty")
+        config[key] = value
+    return extractor_id, config
 def _budget_from_args(arguments: argparse.Namespace) -> QueryBudget:
     """
     Build a QueryBudget from command-line interface arguments.
@@ -279,6 +339,40 @@ def cmd_build(arguments: argparse.Namespace) -> int:
     return 0
+def cmd_extract(arguments: argparse.Namespace) -> int:
+    """
+    Build a text extraction run for the corpus.
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    config = _parse_config_pairs(arguments.config)
+    if getattr(arguments, "step", None):
+        if arguments.extractor != "cascade":
+            raise ValueError("--step is only supported for the cascade extractor")
+        steps: List[Dict[str, object]] = []
+        for raw_step in arguments.step:
+            extractor_id, step_config = _parse_step_spec(raw_step)
+            steps.append({"extractor_id": extractor_id, "config": step_config})
+        config = {"steps": steps}
+    manifest = build_extraction_run(
+        corpus,
+        extractor_id=arguments.extractor,
+        recipe_name=arguments.recipe_name,
+        config=config,
+    )
+    print(manifest.model_dump_json(indent=2))
+    return 0
 def cmd_query(arguments: argparse.Namespace) -> int:
     """
     Execute a retrieval query.
@@ -390,6 +484,13 @@ def build_parser() -> argparse.ArgumentParser:
     _add_common_corpus_arg(p_reindex)
     p_reindex.set_defaults(func=cmd_reindex)
+    p_import_tree = sub.add_parser("import-tree", help="Import a folder tree into the corpus.")
+    _add_common_corpus_arg(p_import_tree)
+    p_import_tree.add_argument("path", help="Folder tree root to import.")
+    p_import_tree.add_argument("--tags", default=None, help="Comma-separated tags to apply to imported items.")
+    p_import_tree.add_argument("--tag", action="append", help="Repeatable tag to apply to imported items.")
+    p_import_tree.set_defaults(func=cmd_import_tree)
     p_purge = sub.add_parser("purge", help="Delete all items and derived files (requires confirmation).")
     _add_common_corpus_arg(p_purge)
     p_purge.add_argument(
@@ -415,6 +516,28 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_build.set_defaults(func=cmd_build)
+    p_extract = sub.add_parser("extract", help="Build a text extraction run for the corpus.")
+    _add_common_corpus_arg(p_extract)
+    p_extract.add_argument(
+        "--extractor",
+        required=True,
+        help="Extractor identifier (for example, pass-through-text, metadata-text, cascade).",
+    )
+    p_extract.add_argument("--recipe-name", default="default", help="Human-readable recipe name.")
+    p_extract.add_argument(
+        "--step",
+        action="append",
+        default=None,
+        help="Cascade step spec in the form extractor_id or extractor_id:key=value,key=value (repeatable).",
+    )
+    p_extract.add_argument(
+        "--config",
+        action="append",
+        default=None,
+        help="Extractor config as key=value (repeatable).",
+    )
+    p_extract.set_defaults(func=cmd_extract)
     p_query = sub.add_parser("query", help="Run a retrieval query.")
     _add_common_corpus_arg(p_query)
     p_query.add_argument("--run", default=None, help="Run identifier (defaults to latest run).")

biblicus/constants.py CHANGED Viewed

@@ -8,3 +8,5 @@ CORPUS_DIR_NAME = ".biblicus"
 DEFAULT_RAW_DIR = "raw"
 SIDECAR_SUFFIX = ".biblicus.yml"
 RUNS_DIR_NAME = "runs"
+EXTRACTION_RUNS_DIR_NAME = "extraction"
+HOOK_LOGS_DIR_NAME = "hook_logs"

biblicus 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

biblicus 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl