PyPI - biblicus - Versions diffs - 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl - Mend

biblicus 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (45)

biblicus/__init__.py +2 -2
biblicus/_vendor/dotyaml/__init__.py +14 -0
biblicus/_vendor/dotyaml/interpolation.py +63 -0
biblicus/_vendor/dotyaml/loader.py +181 -0
biblicus/_vendor/dotyaml/transformer.py +135 -0
biblicus/backends/__init__.py +0 -2
biblicus/backends/base.py +3 -3
biblicus/backends/scan.py +96 -13
biblicus/backends/sqlite_full_text_search.py +74 -14
biblicus/cli.py +126 -19
biblicus/constants.py +2 -0
biblicus/corpus.py +455 -45
biblicus/errors.py +15 -0
biblicus/evaluation.py +4 -8
biblicus/extraction.py +529 -0
biblicus/extractors/__init__.py +44 -0
biblicus/extractors/base.py +68 -0
biblicus/extractors/metadata_text.py +106 -0
biblicus/extractors/openai_stt.py +180 -0
biblicus/extractors/pass_through_text.py +84 -0
biblicus/extractors/pdf_text.py +100 -0
biblicus/extractors/pipeline.py +105 -0
biblicus/extractors/rapidocr_text.py +129 -0
biblicus/extractors/select_longest_text.py +105 -0
biblicus/extractors/select_text.py +100 -0
biblicus/extractors/unstructured_text.py +100 -0
biblicus/frontmatter.py +0 -3
biblicus/hook_logging.py +180 -0
biblicus/hook_manager.py +203 -0
biblicus/hooks.py +261 -0
biblicus/ignore.py +64 -0
biblicus/models.py +107 -0
biblicus/retrieval.py +0 -4
biblicus/sources.py +85 -5
biblicus/time.py +0 -1
biblicus/uris.py +3 -4
biblicus/user_config.py +138 -0
biblicus-0.3.0.dist-info/METADATA +336 -0
biblicus-0.3.0.dist-info/RECORD +44 -0
biblicus-0.1.1.dist-info/METADATA +0 -174
biblicus-0.1.1.dist-info/RECORD +0 -22
{biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/WHEEL +0 -0
{biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/entry_points.txt +0 -0
{biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/licenses/LICENSE +0 -0
{biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/top_level.txt +0 -0

biblicus/backends/sqlite_full_text_search.py CHANGED Viewed

@@ -13,7 +13,14 @@ from pydantic import BaseModel, ConfigDict, Field
 from ..constants import CORPUS_DIR_NAME, RUNS_DIR_NAME
 from ..corpus import Corpus
 from ..frontmatter import parse_front_matter
-from ..models import Evidence, QueryBudget, RetrievalResult, RetrievalRun
+from ..models import (
+    Evidence,
+    ExtractionRunReference,
+    QueryBudget,
+    RetrievalResult,
+    RetrievalRun,
+    parse_extraction_run_reference,
+)
 from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
 from ..time import utc_now_iso
@@ -28,6 +35,8 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
     :vartype chunk_overlap: int
     :ivar snippet_characters: Maximum characters to include in evidence snippets.
     :vartype snippet_characters: int
+    :ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
+    :vartype extraction_run: str or None
     """
     model_config = ConfigDict(extra="forbid")
@@ -35,6 +44,7 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
     chunk_size: int = Field(default=800, ge=1)
     chunk_overlap: int = Field(default=200, ge=0)
     snippet_characters: int = Field(default=400, ge=1)
+    extraction_run: Optional[str] = None
 class SqliteFullTextSearchBackend:
@@ -47,7 +57,9 @@ class SqliteFullTextSearchBackend:
     backend_id = "sqlite-full-text-search"
-    def build_run(self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]) -> RetrievalRun:
+    def build_run(
+        self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
+    ) -> RetrievalRun:
         """
         Build a full-text search version five index for the corpus.
@@ -60,7 +72,6 @@ class SqliteFullTextSearchBackend:
         :return: Run manifest describing the build.
         :rtype: RetrievalRun
         """
         recipe_config = SqliteFullTextSearchRecipeConfig.model_validate(config)
         catalog = corpus.load_catalog()
         recipe = create_recipe_manifest(
@@ -72,11 +83,13 @@ class SqliteFullTextSearchBackend:
         db_relpath = str(Path(CORPUS_DIR_NAME) / RUNS_DIR_NAME / f"{run.run_id}.sqlite")
         db_path = corpus.root / db_relpath
         corpus.runs_dir.mkdir(parents=True, exist_ok=True)
+        extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
         stats = _build_full_text_search_index(
             db_path=db_path,
             corpus=corpus,
             items=catalog.items.values(),
             recipe_config=recipe_config,
+            extraction_reference=extraction_reference,
         )
         run = run.model_copy(update={"artifact_paths": [db_relpath], "stats": stats})
         corpus.write_run(run)
@@ -104,7 +117,6 @@ class SqliteFullTextSearchBackend:
         :return: Retrieval results containing evidence.
         :rtype: RetrievalResult
         """
         recipe_config = SqliteFullTextSearchRecipeConfig.model_validate(run.recipe.config)
         db_path = _resolve_run_db_path(corpus, run)
         candidates = _query_full_text_search_index(
@@ -150,7 +162,6 @@ def _candidate_limit(max_total_items: int) -> int:
     :return: Candidate limit for backend search.
     :rtype: int
     """
     return max_total_items * 5
@@ -166,7 +177,6 @@ def _resolve_run_db_path(corpus: Corpus, run: RetrievalRun) -> Path:
     :rtype: Path
     :raises FileNotFoundError: If the run does not have artifact paths.
     """
     if not run.artifact_paths:
         raise FileNotFoundError("Run has no artifact paths to query")
     return corpus.root / run.artifact_paths[0]
@@ -182,7 +192,6 @@ def _ensure_full_text_search_version_five(conn: sqlite3.Connection) -> None:
     :rtype: None
     :raises RuntimeError: If full-text search version five support is unavailable.
     """
     try:
         cursor = conn.execute(
             "CREATE VIRTUAL TABLE IF NOT EXISTS chunks_full_text_search USING fts5(content)"
@@ -204,7 +213,6 @@ def _create_full_text_search_schema(conn: sqlite3.Connection) -> None:
     :return: None.
     :rtype: None
     """
     conn.execute(
         """
         CREATE VIRTUAL TABLE chunks_full_text_search USING fts5(
@@ -227,6 +235,7 @@ def _build_full_text_search_index(
     corpus: Corpus,
     items: Iterable[object],
     recipe_config: SqliteFullTextSearchRecipeConfig,
+    extraction_reference: Optional[ExtractionRunReference],
 ) -> Dict[str, int]:
     """
     Build a full-text search index from corpus items.
@@ -242,7 +251,6 @@ def _build_full_text_search_index(
     :return: Index statistics.
     :rtype: dict[str, int]
     """
     if db_path.exists():
         db_path.unlink()
     connection = sqlite3.connect(str(db_path))
@@ -256,7 +264,13 @@ def _build_full_text_search_index(
             item_count += 1
             media_type = getattr(catalog_item, "media_type", "")
             relpath = getattr(catalog_item, "relpath", "")
-            item_text = _load_text_from_item(corpus, relpath, media_type)
+            item_text = _load_text_from_item(
+                corpus,
+                item_id=str(getattr(catalog_item, "id", "")),
+                relpath=str(relpath),
+                media_type=str(media_type),
+                extraction_reference=extraction_reference,
+            )
             if item_text is None:
                 continue
             text_item_count += 1
@@ -302,19 +316,38 @@ def _build_full_text_search_index(
         connection.close()
-def _load_text_from_item(corpus: Corpus, relpath: str, media_type: str) -> Optional[str]:
+def _load_text_from_item(
+    corpus: Corpus,
+    *,
+    item_id: str,
+    relpath: str,
+    media_type: str,
+    extraction_reference: Optional[ExtractionRunReference],
+) -> Optional[str]:
     """
     Load text content from a catalog item.
     :param corpus: Corpus containing the content.
     :type corpus: Corpus
+    :param item_id: Item identifier.
+    :type item_id: str
     :param relpath: Relative path to the content.
     :type relpath: str
     :param media_type: Media type for the content.
     :type media_type: str
+    :param extraction_reference: Optional extraction run reference.
+    :type extraction_reference: ExtractionRunReference or None
     :return: Text payload or None if not text.
     :rtype: str or None
     """
+    if extraction_reference:
+        extracted_text = corpus.read_extracted_text(
+            extractor_id=extraction_reference.extractor_id,
+            run_id=extraction_reference.run_id,
+            item_id=item_id,
+        )
+        if isinstance(extracted_text, str) and extracted_text.strip():
+            return extracted_text
     content_path = corpus.root / relpath
     raw_bytes = content_path.read_bytes()
@@ -327,7 +360,36 @@ def _load_text_from_item(corpus: Corpus, relpath: str, media_type: str) -> Optio
     return None
-def _iter_chunks(text: str, *, chunk_size: int, chunk_overlap: int) -> Iterable[Tuple[int, int, str]]:
+def _resolve_extraction_reference(
+    corpus: Corpus,
+    recipe_config: SqliteFullTextSearchRecipeConfig,
+) -> Optional[ExtractionRunReference]:
+    """
+    Resolve an extraction run reference from a recipe config.
+    :param corpus: Corpus associated with the recipe.
+    :type corpus: Corpus
+    :param recipe_config: Parsed backend recipe configuration.
+    :type recipe_config: SqliteFullTextSearchRecipeConfig
+    :return: Parsed extraction reference or None.
+    :rtype: ExtractionRunReference or None
+    :raises FileNotFoundError: If an extraction run is referenced but not present.
+    """
+    if not recipe_config.extraction_run:
+        return None
+    extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
+    run_dir = corpus.extraction_run_dir(
+        extractor_id=extraction_reference.extractor_id,
+        run_id=extraction_reference.run_id,
+    )
+    if not run_dir.is_dir():
+        raise FileNotFoundError(f"Missing extraction run: {extraction_reference.as_string()}")
+    return extraction_reference
+def _iter_chunks(
+    text: str, *, chunk_size: int, chunk_overlap: int
+) -> Iterable[Tuple[int, int, str]]:
     """
     Yield overlapping chunks of text for indexing.
@@ -341,7 +403,6 @@ def _iter_chunks(text: str, *, chunk_size: int, chunk_overlap: int) -> Iterable[
     :rtype: Iterable[tuple[int, int, str]]
     :raises ValueError: If the overlap is greater than or equal to the chunk size.
     """
     if chunk_overlap >= chunk_size:
         raise ValueError("chunk_overlap must be smaller than chunk_size")
     start_offset = 0
@@ -374,7 +435,6 @@ def _query_full_text_search_index(
     :return: Evidence candidates.
     :rtype: list[Evidence]
     """
     connection = sqlite3.connect(str(db_path))
     try:
         rows = connection.execute(

biblicus/cli.py CHANGED Viewed

@@ -14,7 +14,9 @@ from pydantic import ValidationError
 from .backends import get_backend
 from .corpus import Corpus
+from .errors import ExtractionRunFatalError
 from .evaluation import evaluate_run, load_dataset
+from .extraction import build_extraction_run
 from .models import QueryBudget
 from .uris import corpus_ref_to_path
@@ -28,7 +30,6 @@ def _add_common_corpus_arg(parser: argparse.ArgumentParser) -> None:
     :return: None.
     :rtype: None
     """
     parser.add_argument(
         "--corpus",
         type=str,
@@ -50,7 +51,6 @@ def cmd_init(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
     corpus_path = corpus_ref_to_path(arguments.path)
     corpus = Corpus.init(corpus_path, force=arguments.force)
     print(f"Initialized corpus at {corpus.root}")
@@ -68,7 +68,6 @@ def _parse_tags(raw: Optional[str], raw_list: Optional[List[str]]) -> List[str]:
     :return: Deduplicated tag list.
     :rtype: list[str]
     """
     parsed_tags: List[str] = []
     if raw:
         parsed_tags.extend([tag.strip() for tag in raw.split(",") if tag.strip()])
@@ -93,7 +92,6 @@ def cmd_ingest(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
@@ -134,7 +132,6 @@ def cmd_list(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
@@ -156,7 +153,6 @@ def cmd_show(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
@@ -176,7 +172,6 @@ def cmd_reindex(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
@@ -187,6 +182,26 @@ def cmd_reindex(arguments: argparse.Namespace) -> int:
     return 0
+def cmd_import_tree(arguments: argparse.Namespace) -> int:
+    """
+    Import a folder tree into a corpus.
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    tags = _parse_tags(arguments.tags, arguments.tag)
+    stats = corpus.import_tree(Path(arguments.path), tags=tags)
+    print(json.dumps(stats, indent=2, sort_keys=False))
+    return 0
 def cmd_purge(arguments: argparse.Namespace) -> int:
     """
     Purge all items and derived artifacts from a corpus.
@@ -196,7 +211,6 @@ def cmd_purge(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
@@ -219,7 +233,6 @@ def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
     :rtype: dict[str, object]
     :raises ValueError: If any entry is not key=value.
     """
     config: Dict[str, object] = {}
     for item in pairs or []:
         if "=" not in item:
@@ -240,6 +253,43 @@ def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
     return config
+def _parse_step_spec(raw_step: str) -> tuple[str, Dict[str, object]]:
+    """
+    Parse a pipeline step specification.
+    :param raw_step: Step spec in the form extractor_id or extractor_id:key=value,key=value.
+    :type raw_step: str
+    :return: Tuple of extractor_id and config mapping.
+    :rtype: tuple[str, dict[str, object]]
+    :raises ValueError: If the step spec is invalid.
+    """
+    raw_step = raw_step.strip()
+    if not raw_step:
+        raise ValueError("Step spec must be non-empty")
+    if ":" not in raw_step:
+        return raw_step, {}
+    extractor_id, raw_pairs = raw_step.split(":", 1)
+    extractor_id = extractor_id.strip()
+    if not extractor_id:
+        raise ValueError("Step spec must start with an extractor identifier")
+    config: Dict[str, object] = {}
+    raw_pairs = raw_pairs.strip()
+    if not raw_pairs:
+        return extractor_id, {}
+    for token in raw_pairs.split(","):
+        token = token.strip()
+        if not token:
+            continue
+        if "=" not in token:
+            raise ValueError(f"Step config values must be key=value (got {token!r})")
+        key, value = token.split("=", 1)
+        key = key.strip()
+        if not key:
+            raise ValueError("Step config keys must be non-empty")
+        config[key] = value
+    return extractor_id, config
 def _budget_from_args(arguments: argparse.Namespace) -> QueryBudget:
     """
     Build a QueryBudget from command-line interface arguments.
@@ -249,7 +299,6 @@ def _budget_from_args(arguments: argparse.Namespace) -> QueryBudget:
     :return: Query budget instance.
     :rtype: QueryBudget
     """
     return QueryBudget(
         max_total_items=arguments.max_total_items,
         max_total_characters=arguments.max_total_characters,
@@ -266,7 +315,6 @@ def cmd_build(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
@@ -279,6 +327,38 @@ def cmd_build(arguments: argparse.Namespace) -> int:
     return 0
+def cmd_extract(arguments: argparse.Namespace) -> int:
+    """
+    Build a text extraction run for the corpus using a pipeline of extractors.
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    raw_steps = list(arguments.step or [])
+    if not raw_steps:
+        raise ValueError("Pipeline extraction requires at least one --step")
+    steps: List[Dict[str, object]] = []
+    for raw_step in raw_steps:
+        extractor_id, step_config = _parse_step_spec(raw_step)
+        steps.append({"extractor_id": extractor_id, "config": step_config})
+    config = {"steps": steps}
+    manifest = build_extraction_run(
+        corpus,
+        extractor_id="pipeline",
+        recipe_name=arguments.recipe_name,
+        config=config,
+    )
+    print(manifest.model_dump_json(indent=2))
+    return 0
 def cmd_query(arguments: argparse.Namespace) -> int:
     """
     Execute a retrieval query.
@@ -288,7 +368,6 @@ def cmd_query(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
@@ -319,7 +398,6 @@ def cmd_eval(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
@@ -343,7 +421,6 @@ def build_parser() -> argparse.ArgumentParser:
     :return: Argument parser instance.
     :rtype: argparse.ArgumentParser
     """
     parser = argparse.ArgumentParser(
         prog="biblicus",
         description="Biblicus command-line interface (minimum viable product)",
@@ -363,14 +440,18 @@ def build_parser() -> argparse.ArgumentParser:
     p_init = sub.add_parser("init", help="Initialize a new corpus at PATH.")
     p_init.add_argument("path", help="Corpus path or file:// uniform resource identifier.")
-    p_init.add_argument("--force", action="store_true", help="Overwrite existing config if present.")
+    p_init.add_argument(
+        "--force", action="store_true", help="Overwrite existing config if present."
+    )
     p_init.set_defaults(func=cmd_init)
     p_ingest = sub.add_parser("ingest", help="Ingest file(s) and/or text into the corpus.")
     _add_common_corpus_arg(p_ingest)
     p_ingest.add_argument("files", nargs="*", help="File paths to ingest.")
     p_ingest.add_argument("--note", default=None, help="Ingest a literal note as Markdown text.")
-    p_ingest.add_argument("--stdin", action="store_true", help="Read text to ingest from standard input.")
+    p_ingest.add_argument(
+        "--stdin", action="store_true", help="Read text to ingest from standard input."
+    )
     p_ingest.add_argument("--title", default=None, help="Optional title (for --note/--stdin).")
     p_ingest.add_argument("--tags", default=None, help="Comma-separated tags.")
     p_ingest.add_argument("--tag", action="append", help="Repeatable tag.")
@@ -386,11 +467,26 @@ def build_parser() -> argparse.ArgumentParser:
     p_show.add_argument("id", help="Item identifier (universally unique identifier).")
     p_show.set_defaults(func=cmd_show)
-    p_reindex = sub.add_parser("reindex", help="Rebuild/refresh the corpus catalog from the on-disk corpus.")
+    p_reindex = sub.add_parser(
+        "reindex", help="Rebuild/refresh the corpus catalog from the on-disk corpus."
+    )
     _add_common_corpus_arg(p_reindex)
     p_reindex.set_defaults(func=cmd_reindex)
-    p_purge = sub.add_parser("purge", help="Delete all items and derived files (requires confirmation).")
+    p_import_tree = sub.add_parser("import-tree", help="Import a folder tree into the corpus.")
+    _add_common_corpus_arg(p_import_tree)
+    p_import_tree.add_argument("path", help="Folder tree root to import.")
+    p_import_tree.add_argument(
+        "--tags", default=None, help="Comma-separated tags to apply to imported items."
+    )
+    p_import_tree.add_argument(
+        "--tag", action="append", help="Repeatable tag to apply to imported items."
+    )
+    p_import_tree.set_defaults(func=cmd_import_tree)
+    p_purge = sub.add_parser(
+        "purge", help="Delete all items and derived files (requires confirmation)."
+    )
     _add_common_corpus_arg(p_purge)
     p_purge.add_argument(
         "--confirm",
@@ -415,6 +511,17 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_build.set_defaults(func=cmd_build)
+    p_extract = sub.add_parser("extract", help="Build a text extraction run for the corpus.")
+    _add_common_corpus_arg(p_extract)
+    p_extract.add_argument("--recipe-name", default="default", help="Human-readable recipe name.")
+    p_extract.add_argument(
+        "--step",
+        action="append",
+        default=None,
+        help="Pipeline step spec in the form extractor_id or extractor_id:key=value,key=value (repeatable).",
+    )
+    p_extract.set_defaults(func=cmd_extract)
     p_query = sub.add_parser("query", help="Run a retrieval query.")
     _add_common_corpus_arg(p_query)
     p_query.add_argument("--run", default=None, help="Run identifier (defaults to latest run).")
@@ -450,7 +557,6 @@ def main(argument_list: Optional[List[str]] = None) -> int:
     :return: Exit code.
     :rtype: int
     """
     parser = build_parser()
     arguments = parser.parse_args(argument_list)
     try:
@@ -460,6 +566,7 @@ def main(argument_list: Optional[List[str]] = None) -> int:
         FileExistsError,
         KeyError,
         ValueError,
+        ExtractionRunFatalError,
         NotImplementedError,
         ValidationError,
     ) as exception:

biblicus/constants.py CHANGED Viewed

@@ -8,3 +8,5 @@ CORPUS_DIR_NAME = ".biblicus"
 DEFAULT_RAW_DIR = "raw"
 SIDECAR_SUFFIX = ".biblicus.yml"
 RUNS_DIR_NAME = "runs"
+EXTRACTION_RUNS_DIR_NAME = "extraction"
+HOOK_LOGS_DIR_NAME = "hook_logs"

biblicus 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

biblicus 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl