PyPI - biblicus - Versions diffs - 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl - Mend

biblicus 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

biblicus/__init__.py +1 -1
biblicus/analysis/__init__.py +40 -0
biblicus/analysis/base.py +49 -0
biblicus/analysis/llm.py +106 -0
biblicus/analysis/models.py +554 -0
biblicus/analysis/schema.py +18 -0
biblicus/analysis/topic_modeling.py +585 -0
biblicus/cli.py +160 -11
biblicus/constants.py +2 -0
biblicus/corpus.py +42 -0
biblicus/extraction.py +5 -0
biblicus/extractors/__init__.py +12 -0
biblicus/extractors/deepgram_stt.py +166 -0
biblicus/extractors/docling_granite_text.py +188 -0
biblicus/extractors/docling_smol_text.py +188 -0
biblicus/extractors/paddleocr_vl_text.py +305 -0
biblicus/extractors/rapidocr_text.py +8 -1
biblicus/extractors/select_override.py +121 -0
biblicus/extractors/select_smart_override.py +187 -0
biblicus/inference.py +104 -0
biblicus/models.py +6 -0
biblicus/user_config.py +76 -0
{biblicus-0.7.0.dist-info → biblicus-0.9.0.dist-info}/METADATA +120 -16
{biblicus-0.7.0.dist-info → biblicus-0.9.0.dist-info}/RECORD +28 -15
{biblicus-0.7.0.dist-info → biblicus-0.9.0.dist-info}/WHEEL +0 -0
{biblicus-0.7.0.dist-info → biblicus-0.9.0.dist-info}/entry_points.txt +0 -0
{biblicus-0.7.0.dist-info → biblicus-0.9.0.dist-info}/licenses/LICENSE +0 -0
{biblicus-0.7.0.dist-info → biblicus-0.9.0.dist-info}/top_level.txt +0 -0

biblicus/cli.py CHANGED Viewed

@@ -12,6 +12,7 @@ from typing import Dict, List, Optional
 from pydantic import ValidationError
+from .analysis import get_analysis_backend
 from .backends import get_backend
 from .context import (
     ContextPackPolicy,
@@ -284,8 +285,50 @@ def _parse_step_spec(raw_step: str) -> tuple[str, Dict[str, object]]:
     raw_pairs = raw_pairs.strip()
     if not raw_pairs:
         return extractor_id, {}
-    for token in raw_pairs.split(","):
-        token = token.strip()
+    tokens = []
+    current_token = []
+    brace_depth = 0
+    bracket_depth = 0
+    in_quotes = False
+    escape_next = False
+    for char in raw_pairs:
+        if escape_next:
+            current_token.append(char)
+            escape_next = False
+            continue
+        if char == "\\":
+            escape_next = True
+            current_token.append(char)
+            continue
+        if char == '"' and brace_depth == 0 and bracket_depth == 0:
+            in_quotes = not in_quotes
+            current_token.append(char)
+            continue
+        if not in_quotes:
+            if char == "{":
+                brace_depth += 1
+            elif char == "}":
+                brace_depth -= 1
+            elif char == "[":
+                bracket_depth += 1
+            elif char == "]":
+                bracket_depth -= 1
+            elif char == "," and brace_depth == 0 and bracket_depth == 0:
+                tokens.append("".join(current_token).strip())
+                current_token = []
+                continue
+        current_token.append(char)
+    if current_token:
+        tokens.append("".join(current_token).strip())
+    for token in tokens:
         if not token:
             continue
         if "=" not in token:
@@ -344,22 +387,53 @@ def cmd_extract_build(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
+    import yaml
     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-    raw_steps = list(arguments.step or [])
-    if not raw_steps:
-        raise ValueError("Pipeline extraction requires at least one --step")
-    steps: List[Dict[str, object]] = []
-    for raw_step in raw_steps:
-        extractor_id, step_config = _parse_step_spec(raw_step)
-        steps.append({"extractor_id": extractor_id, "config": step_config})
-    config = {"steps": steps}
+    # Load recipe from file if --recipe is provided
+    if getattr(arguments, "recipe", None):
+        recipe_path = Path(arguments.recipe)
+        if not recipe_path.exists():
+            raise FileNotFoundError(f"Recipe file not found: {recipe_path}")
+        with open(recipe_path, "r", encoding="utf-8") as f:
+            recipe_data = yaml.safe_load(f)
+        loaded_extractor_id = recipe_data.get("extractor_id", "pipeline")
+        loaded_config = recipe_data.get("config", {})
+        # If the recipe specifies a non-pipeline extractor, wrap it in a pipeline
+        if loaded_extractor_id != "pipeline":
+            extractor_id = "pipeline"
+            config = {
+                "steps": [
+                    {
+                        "extractor_id": loaded_extractor_id,
+                        "config": loaded_config,
+                    }
+                ]
+            }
+        else:
+            extractor_id = loaded_extractor_id
+            config = loaded_config
+    else:
+        # Build from --step arguments
+        raw_steps = list(arguments.step or [])
+        if not raw_steps:
+            raise ValueError("Pipeline extraction requires at least one --step")
+        steps: List[Dict[str, object]] = []
+        for raw_step in raw_steps:
+            step_extractor_id, step_config = _parse_step_spec(raw_step)
+            steps.append({"extractor_id": step_extractor_id, "config": step_config})
+        config = {"steps": steps}
+        extractor_id = "pipeline"
     manifest = build_extraction_run(
         corpus,
-        extractor_id="pipeline",
+        extractor_id=extractor_id,
         recipe_name=arguments.recipe_name,
         config=config,
     )
@@ -563,6 +637,54 @@ def cmd_crawl(arguments: argparse.Namespace) -> int:
     return 0
+def cmd_analyze_topics(arguments: argparse.Namespace) -> int:
+    """
+    Run topic modeling analysis for a corpus.
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    import yaml
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    recipe_path = Path(arguments.recipe)
+    if not recipe_path.is_file():
+        raise FileNotFoundError(f"Recipe file not found: {recipe_path}")
+    recipe_data = yaml.safe_load(recipe_path.read_text(encoding="utf-8")) or {}
+    if not isinstance(recipe_data, dict):
+        raise ValueError("Topic modeling recipe must be a mapping/object")
+    if arguments.extraction_run:
+        extraction_run = parse_extraction_run_reference(arguments.extraction_run)
+    else:
+        extraction_run = corpus.latest_extraction_run_reference()
+        if extraction_run is None:
+            raise ValueError("Topic analysis requires an extraction run to supply text inputs")
+        print(
+            "Warning: using latest extraction run; pass --extraction-run for reproducibility.",
+            file=sys.stderr,
+        )
+    backend = get_analysis_backend("topic-modeling")
+    try:
+        output = backend.run_analysis(
+            corpus,
+            recipe_name=arguments.recipe_name,
+            config=recipe_data,
+            extraction_run=extraction_run,
+        )
+    except ValidationError as exc:
+        raise ValueError(f"Invalid topic modeling recipe: {exc}") from exc
+    print(output.model_dump_json(indent=2))
+    return 0
 def build_parser() -> argparse.ArgumentParser:
     """
     Build the command-line interface argument parser.
@@ -668,6 +790,11 @@ def build_parser() -> argparse.ArgumentParser:
     p_extract_build.add_argument(
         "--recipe-name", default="default", help="Human-readable recipe name."
     )
+    p_extract_build.add_argument(
+        "--recipe",
+        default=None,
+        help="Path to YAML recipe file. If provided, --step arguments are ignored.",
+    )
     p_extract_build.add_argument(
         "--step",
         action="append",
@@ -774,6 +901,28 @@ def build_parser() -> argparse.ArgumentParser:
     p_crawl.add_argument("--tag", action="append", help="Repeatable tag to apply to stored items.")
     p_crawl.set_defaults(func=cmd_crawl)
+    p_analyze = sub.add_parser("analyze", help="Run analysis pipelines for the corpus.")
+    analyze_sub = p_analyze.add_subparsers(dest="analyze_command", required=True)
+    p_analyze_topics = analyze_sub.add_parser("topics", help="Run topic modeling analysis.")
+    _add_common_corpus_arg(p_analyze_topics)
+    p_analyze_topics.add_argument(
+        "--recipe",
+        required=True,
+        help="Path to topic modeling recipe YAML.",
+    )
+    p_analyze_topics.add_argument(
+        "--recipe-name",
+        default="default",
+        help="Human-readable recipe name.",
+    )
+    p_analyze_topics.add_argument(
+        "--extraction-run",
+        default=None,
+        help="Extraction run reference in the form extractor_id:run_id.",
+    )
+    p_analyze_topics.set_defaults(func=cmd_analyze_topics)
     return parser

biblicus/constants.py CHANGED Viewed

@@ -4,9 +4,11 @@ Shared constants for Biblicus.
 SCHEMA_VERSION = 2
 DATASET_SCHEMA_VERSION = 1
+ANALYSIS_SCHEMA_VERSION = 1
 CORPUS_DIR_NAME = ".biblicus"
 DEFAULT_RAW_DIR = "raw"
 SIDECAR_SUFFIX = ".biblicus.yml"
 RUNS_DIR_NAME = "runs"
 EXTRACTION_RUNS_DIR_NAME = "extraction"
+ANALYSIS_RUNS_DIR_NAME = "analysis"
 HOOK_LOGS_DIR_NAME = "hook_logs"

biblicus/corpus.py CHANGED Viewed

@@ -16,6 +16,7 @@ import yaml
 from pydantic import ValidationError
 from .constants import (
+    ANALYSIS_RUNS_DIR_NAME,
     CORPUS_DIR_NAME,
     DEFAULT_RAW_DIR,
     EXTRACTION_RUNS_DIR_NAME,
@@ -32,6 +33,7 @@ from .models import (
     CorpusCatalog,
     CorpusConfig,
     ExtractionRunListEntry,
+    ExtractionRunReference,
     IngestResult,
     RetrievalRun,
 )
@@ -538,6 +540,16 @@ class Corpus:
         """
         return self.runs_dir / EXTRACTION_RUNS_DIR_NAME
+    @property
+    def analysis_runs_dir(self) -> Path:
+        """
+        Location of analysis run artifacts.
+        :return: Path to the analysis runs directory.
+        :rtype: Path
+        """
+        return self.runs_dir / ANALYSIS_RUNS_DIR_NAME
     def extraction_run_dir(self, *, extractor_id: str, run_id: str) -> Path:
         """
         Resolve an extraction run directory.
@@ -551,6 +563,19 @@ class Corpus:
         """
         return self.extraction_runs_dir / extractor_id / run_id
+    def analysis_run_dir(self, *, analysis_id: str, run_id: str) -> Path:
+        """
+        Resolve an analysis run directory.
+        :param analysis_id: Analysis backend identifier.
+        :type analysis_id: str
+        :param run_id: Analysis run identifier.
+        :type run_id: str
+        :return: Analysis run directory.
+        :rtype: Path
+        """
+        return self.analysis_runs_dir / analysis_id / run_id
     def read_extracted_text(self, *, extractor_id: str, run_id: str, item_id: str) -> Optional[str]:
         """
         Read extracted text for an item from an extraction run, when present.
@@ -647,6 +672,23 @@ class Corpus:
         entries.sort(key=lambda entry: (entry.created_at, entry.extractor_id, entry.run_id), reverse=True)
         return entries
+    def latest_extraction_run_reference(
+        self, *, extractor_id: Optional[str] = None
+    ) -> Optional[ExtractionRunReference]:
+        """
+        Return the most recent extraction run reference.
+        :param extractor_id: Optional extractor identifier filter.
+        :type extractor_id: str or None
+        :return: Latest extraction run reference or None when no runs exist.
+        :rtype: biblicus.models.ExtractionRunReference or None
+        """
+        entries = self.list_extraction_runs(extractor_id=extractor_id)
+        if not entries:
+            return None
+        latest = entries[0]
+        return ExtractionRunReference(extractor_id=latest.extractor_id, run_id=latest.run_id)
     def delete_extraction_run(self, *, extractor_id: str, run_id: str) -> None:
         """
         Delete an extraction run directory and its derived artifacts.

biblicus/extraction.py CHANGED Viewed

@@ -63,6 +63,8 @@ class ExtractionStepResult(BaseModel):
     :vartype producer_extractor_id: str or None
     :ivar source_step_index: Optional step index that supplied the text for selection-style extractors.
     :vartype source_step_index: int or None
+    :ivar confidence: Optional confidence score from 0.0 to 1.0.
+    :vartype confidence: float or None
     :ivar error_type: Optional error type name for errored steps.
     :vartype error_type: str or None
     :ivar error_message: Optional error message for errored steps.
@@ -78,6 +80,7 @@ class ExtractionStepResult(BaseModel):
     text_characters: int = Field(default=0, ge=0)
     producer_extractor_id: Optional[str] = None
     source_step_index: Optional[int] = Field(default=None, ge=1)
+    confidence: Optional[float] = Field(default=None, ge=0.0, le=1.0)
     error_type: Optional[str] = None
     error_message: Optional[str] = None
@@ -447,6 +450,7 @@ def build_extraction_run(
                     text_characters=text_characters,
                     producer_extractor_id=extracted_text.producer_extractor_id,
                     source_step_index=extracted_text.source_step_index,
+                    confidence=extracted_text.confidence,
                     error_type=None,
                     error_message=None,
                 )
@@ -460,6 +464,7 @@ def build_extraction_run(
                     text_characters=text_characters,
                     producer_extractor_id=extracted_text.producer_extractor_id,
                     source_step_index=extracted_text.source_step_index,
+                    confidence=extracted_text.confidence,
                     error_type=None,
                     error_message=None,
                 )

biblicus/extractors/__init__.py CHANGED Viewed

@@ -7,14 +7,20 @@ from __future__ import annotations
 from typing import Dict
 from .base import TextExtractor
+from .deepgram_stt import DeepgramSpeechToTextExtractor
+from .docling_granite_text import DoclingGraniteExtractor
+from .docling_smol_text import DoclingSmolExtractor
 from .markitdown_text import MarkItDownExtractor
 from .metadata_text import MetadataTextExtractor
 from .openai_stt import OpenAiSpeechToTextExtractor
+from .paddleocr_vl_text import PaddleOcrVlExtractor
 from .pass_through_text import PassThroughTextExtractor
 from .pdf_text import PortableDocumentFormatTextExtractor
 from .pipeline import PipelineExtractor
 from .rapidocr_text import RapidOcrExtractor
 from .select_longest_text import SelectLongestTextExtractor
+from .select_override import SelectOverrideExtractor
+from .select_smart_override import SelectSmartOverrideExtractor
 from .select_text import SelectTextExtractor
 from .unstructured_text import UnstructuredExtractor
@@ -32,13 +38,19 @@ def get_extractor(extractor_id: str) -> TextExtractor:
     extractors: Dict[str, TextExtractor] = {
         MetadataTextExtractor.extractor_id: MetadataTextExtractor(),
         MarkItDownExtractor.extractor_id: MarkItDownExtractor(),
+        DoclingSmolExtractor.extractor_id: DoclingSmolExtractor(),
+        DoclingGraniteExtractor.extractor_id: DoclingGraniteExtractor(),
         PassThroughTextExtractor.extractor_id: PassThroughTextExtractor(),
         PipelineExtractor.extractor_id: PipelineExtractor(),
         PortableDocumentFormatTextExtractor.extractor_id: PortableDocumentFormatTextExtractor(),
         OpenAiSpeechToTextExtractor.extractor_id: OpenAiSpeechToTextExtractor(),
+        DeepgramSpeechToTextExtractor.extractor_id: DeepgramSpeechToTextExtractor(),
         RapidOcrExtractor.extractor_id: RapidOcrExtractor(),
+        PaddleOcrVlExtractor.extractor_id: PaddleOcrVlExtractor(),
         SelectTextExtractor.extractor_id: SelectTextExtractor(),
         SelectLongestTextExtractor.extractor_id: SelectLongestTextExtractor(),
+        SelectSmartOverrideExtractor.extractor_id: SelectSmartOverrideExtractor(),
+        SelectOverrideExtractor.extractor_id: SelectOverrideExtractor(),
         UnstructuredExtractor.extractor_id: UnstructuredExtractor(),
     }
     if extractor_id not in extractors:

biblicus/extractors/deepgram_stt.py ADDED Viewed

@@ -0,0 +1,166 @@
+"""
+Deepgram-backed speech to text extractor plugin.
+This extractor is implemented as an optional dependency so the core installation stays small.
+"""
+from __future__ import annotations
+from typing import Any, Dict, List, Optional
+from pydantic import BaseModel, ConfigDict, Field
+from ..corpus import Corpus
+from ..errors import ExtractionRunFatalError
+from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
+from ..user_config import resolve_deepgram_api_key
+from .base import TextExtractor
+class DeepgramSpeechToTextExtractorConfig(BaseModel):
+    """
+    Configuration for Deepgram speech to text extraction.
+    :ivar model: Deepgram transcription model identifier.
+    :vartype model: str
+    :ivar language: Optional language code hint for transcription.
+    :vartype language: str or None
+    :ivar punctuate: Whether to add punctuation to the transcript.
+    :vartype punctuate: bool
+    :ivar smart_format: Whether to apply smart formatting.
+    :vartype smart_format: bool
+    :ivar diarize: Whether to enable speaker diarization.
+    :vartype diarize: bool
+    :ivar filler_words: Whether to include filler words.
+    :vartype filler_words: bool
+    """
+    model_config = ConfigDict(extra="forbid")
+    model: str = Field(default="nova-3", min_length=1)
+    language: Optional[str] = Field(default=None, min_length=1)
+    punctuate: bool = Field(default=True)
+    smart_format: bool = Field(default=True)
+    diarize: bool = Field(default=False)
+    filler_words: bool = Field(default=False)
+class DeepgramSpeechToTextExtractor(TextExtractor):
+    """
+    Extractor plugin that transcribes audio items using the Deepgram API.
+    This extractor is intended as a practical, hosted speech to text implementation.
+    It skips non-audio items.
+    :ivar extractor_id: Extractor identifier.
+    :vartype extractor_id: str
+    """
+    extractor_id = "stt-deepgram"
+    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
+        """
+        Validate extractor configuration and ensure prerequisites are available.
+        :param config: Configuration mapping.
+        :type config: dict[str, Any]
+        :return: Parsed configuration model.
+        :rtype: DeepgramSpeechToTextExtractorConfig
+        :raises ExtractionRunFatalError: If the optional dependency or required environment is missing.
+        """
+        try:
+            from deepgram import DeepgramClient  # noqa: F401
+        except ImportError as import_error:
+            raise ExtractionRunFatalError(
+                "Deepgram speech to text extractor requires an optional dependency. "
+                'Install it with pip install "biblicus[deepgram]".'
+            ) from import_error
+        api_key = resolve_deepgram_api_key()
+        if api_key is None:
+            raise ExtractionRunFatalError(
+                "Deepgram speech to text extractor requires a Deepgram API key. "
+                "Set DEEPGRAM_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
+                "deepgram.api_key."
+            )
+        return DeepgramSpeechToTextExtractorConfig.model_validate(config)
+    def extract_text(
+        self,
+        *,
+        corpus: Corpus,
+        item: CatalogItem,
+        config: BaseModel,
+        previous_extractions: List[ExtractionStepOutput],
+    ) -> Optional[ExtractedText]:
+        """
+        Transcribe an audio item.
+        :param corpus: Corpus containing the item bytes.
+        :type corpus: Corpus
+        :param item: Catalog item being processed.
+        :type item: CatalogItem
+        :param config: Parsed configuration model.
+        :type config: DeepgramSpeechToTextExtractorConfig
+        :param previous_extractions: Prior step outputs for this item within the pipeline.
+        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
+        :return: Extracted text payload, or None when the item is not audio.
+        :rtype: ExtractedText or None
+        :raises ExtractionRunFatalError: If the optional dependency or required configuration is missing.
+        """
+        _ = previous_extractions
+        if not item.media_type.startswith("audio/"):
+            return None
+        parsed_config = (
+            config
+            if isinstance(config, DeepgramSpeechToTextExtractorConfig)
+            else DeepgramSpeechToTextExtractorConfig.model_validate(config)
+        )
+        api_key = resolve_deepgram_api_key()
+        if api_key is None:
+            raise ExtractionRunFatalError(
+                "Deepgram speech to text extractor requires a Deepgram API key. "
+                "Set DEEPGRAM_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
+                "deepgram.api_key."
+            )
+        try:
+            from deepgram import DeepgramClient
+        except ImportError as import_error:
+            raise ExtractionRunFatalError(
+                "Deepgram speech to text extractor requires an optional dependency. "
+                'Install it with pip install "biblicus[deepgram]".'
+            ) from import_error
+        client = DeepgramClient(api_key=api_key)
+        source_path = corpus.root / item.relpath
+        options: Dict[str, Any] = {
+            "model": parsed_config.model,
+            "punctuate": parsed_config.punctuate,
+            "smart_format": parsed_config.smart_format,
+            "diarize": parsed_config.diarize,
+            "filler_words": parsed_config.filler_words,
+        }
+        if parsed_config.language is not None:
+            options["language"] = parsed_config.language
+        with source_path.open("rb") as audio_handle:
+            audio_data = audio_handle.read()
+            response = client.listen.rest.v("1").transcribe_file(
+                {"buffer": audio_data},
+                options,
+            )
+        transcript_text = ""
+        if hasattr(response, "results") and response.results:
+            channels = response.results.channels
+            if channels and len(channels) > 0:
+                alternatives = channels[0].alternatives
+                if alternatives and len(alternatives) > 0:
+                    transcript_text = alternatives[0].transcript or ""
+        return ExtractedText(text=transcript_text.strip(), producer_extractor_id=self.extractor_id)

biblicus 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

biblicus 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl