biblicus 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
biblicus/cli.py CHANGED
@@ -12,6 +12,7 @@ from typing import Dict, List, Optional
12
12
 
13
13
  from pydantic import ValidationError
14
14
 
15
+ from .analysis import get_analysis_backend
15
16
  from .backends import get_backend
16
17
  from .context import (
17
18
  ContextPackPolicy,
@@ -284,8 +285,50 @@ def _parse_step_spec(raw_step: str) -> tuple[str, Dict[str, object]]:
284
285
  raw_pairs = raw_pairs.strip()
285
286
  if not raw_pairs:
286
287
  return extractor_id, {}
287
- for token in raw_pairs.split(","):
288
- token = token.strip()
288
+
289
+ tokens = []
290
+ current_token = []
291
+ brace_depth = 0
292
+ bracket_depth = 0
293
+ in_quotes = False
294
+ escape_next = False
295
+
296
+ for char in raw_pairs:
297
+ if escape_next:
298
+ current_token.append(char)
299
+ escape_next = False
300
+ continue
301
+
302
+ if char == "\\":
303
+ escape_next = True
304
+ current_token.append(char)
305
+ continue
306
+
307
+ if char == '"' and brace_depth == 0 and bracket_depth == 0:
308
+ in_quotes = not in_quotes
309
+ current_token.append(char)
310
+ continue
311
+
312
+ if not in_quotes:
313
+ if char == "{":
314
+ brace_depth += 1
315
+ elif char == "}":
316
+ brace_depth -= 1
317
+ elif char == "[":
318
+ bracket_depth += 1
319
+ elif char == "]":
320
+ bracket_depth -= 1
321
+ elif char == "," and brace_depth == 0 and bracket_depth == 0:
322
+ tokens.append("".join(current_token).strip())
323
+ current_token = []
324
+ continue
325
+
326
+ current_token.append(char)
327
+
328
+ if current_token:
329
+ tokens.append("".join(current_token).strip())
330
+
331
+ for token in tokens:
289
332
  if not token:
290
333
  continue
291
334
  if "=" not in token:
@@ -344,22 +387,53 @@ def cmd_extract_build(arguments: argparse.Namespace) -> int:
344
387
  :return: Exit code.
345
388
  :rtype: int
346
389
  """
390
+ import yaml
391
+
347
392
  corpus = (
348
393
  Corpus.open(arguments.corpus)
349
394
  if getattr(arguments, "corpus", None)
350
395
  else Corpus.find(Path.cwd())
351
396
  )
352
- raw_steps = list(arguments.step or [])
353
- if not raw_steps:
354
- raise ValueError("Pipeline extraction requires at least one --step")
355
- steps: List[Dict[str, object]] = []
356
- for raw_step in raw_steps:
357
- extractor_id, step_config = _parse_step_spec(raw_step)
358
- steps.append({"extractor_id": extractor_id, "config": step_config})
359
- config = {"steps": steps}
397
+
398
+ # Load recipe from file if --recipe is provided
399
+ if getattr(arguments, "recipe", None):
400
+ recipe_path = Path(arguments.recipe)
401
+ if not recipe_path.exists():
402
+ raise FileNotFoundError(f"Recipe file not found: {recipe_path}")
403
+ with open(recipe_path, "r", encoding="utf-8") as f:
404
+ recipe_data = yaml.safe_load(f)
405
+ loaded_extractor_id = recipe_data.get("extractor_id", "pipeline")
406
+ loaded_config = recipe_data.get("config", {})
407
+
408
+ # If the recipe specifies a non-pipeline extractor, wrap it in a pipeline
409
+ if loaded_extractor_id != "pipeline":
410
+ extractor_id = "pipeline"
411
+ config = {
412
+ "steps": [
413
+ {
414
+ "extractor_id": loaded_extractor_id,
415
+ "config": loaded_config,
416
+ }
417
+ ]
418
+ }
419
+ else:
420
+ extractor_id = loaded_extractor_id
421
+ config = loaded_config
422
+ else:
423
+ # Build from --step arguments
424
+ raw_steps = list(arguments.step or [])
425
+ if not raw_steps:
426
+ raise ValueError("Pipeline extraction requires at least one --step")
427
+ steps: List[Dict[str, object]] = []
428
+ for raw_step in raw_steps:
429
+ step_extractor_id, step_config = _parse_step_spec(raw_step)
430
+ steps.append({"extractor_id": step_extractor_id, "config": step_config})
431
+ config = {"steps": steps}
432
+ extractor_id = "pipeline"
433
+
360
434
  manifest = build_extraction_run(
361
435
  corpus,
362
- extractor_id="pipeline",
436
+ extractor_id=extractor_id,
363
437
  recipe_name=arguments.recipe_name,
364
438
  config=config,
365
439
  )
@@ -563,6 +637,54 @@ def cmd_crawl(arguments: argparse.Namespace) -> int:
563
637
  return 0
564
638
 
565
639
 
640
def cmd_analyze_topics(arguments: argparse.Namespace) -> int:
    """
    Execute the topic modeling analysis backend against a corpus.

    The recipe is read from the YAML file named by ``arguments.recipe``. When no
    extraction run reference is supplied, the latest extraction run reported by the
    corpus is used and a warning is written to standard error.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    :raises FileNotFoundError: If the recipe path is not an existing file.
    :raises ValueError: If the recipe is not a mapping, if no extraction run is
        available, or if the backend raises a validation error for the recipe.
    """
    import yaml

    corpus_location = getattr(arguments, "corpus", None)
    if corpus_location:
        corpus = Corpus.open(arguments.corpus)
    else:
        corpus = Corpus.find(Path.cwd())

    recipe_path = Path(arguments.recipe)
    if not recipe_path.is_file():
        raise FileNotFoundError(f"Recipe file not found: {recipe_path}")
    recipe_text = recipe_path.read_text(encoding="utf-8")
    # An empty YAML document parses to None; treat it as an empty mapping.
    recipe_data = yaml.safe_load(recipe_text) or {}
    if not isinstance(recipe_data, dict):
        raise ValueError("Topic modeling recipe must be a mapping/object")

    if not arguments.extraction_run:
        # No explicit reference: fall back to the newest run and tell the user.
        extraction_run = corpus.latest_extraction_run_reference()
        if extraction_run is None:
            raise ValueError("Topic analysis requires an extraction run to supply text inputs")
        print(
            "Warning: using latest extraction run; pass --extraction-run for reproducibility.",
            file=sys.stderr,
        )
    else:
        extraction_run = parse_extraction_run_reference(arguments.extraction_run)

    topic_backend = get_analysis_backend("topic-modeling")
    try:
        output = topic_backend.run_analysis(
            corpus,
            recipe_name=arguments.recipe_name,
            config=recipe_data,
            extraction_run=extraction_run,
        )
    except ValidationError as validation_error:
        raise ValueError(
            f"Invalid topic modeling recipe: {validation_error}"
        ) from validation_error

    print(output.model_dump_json(indent=2))
    return 0
686
+
687
+
566
688
  def build_parser() -> argparse.ArgumentParser:
567
689
  """
568
690
  Build the command-line interface argument parser.
@@ -668,6 +790,11 @@ def build_parser() -> argparse.ArgumentParser:
668
790
  p_extract_build.add_argument(
669
791
  "--recipe-name", default="default", help="Human-readable recipe name."
670
792
  )
793
+ p_extract_build.add_argument(
794
+ "--recipe",
795
+ default=None,
796
+ help="Path to YAML recipe file. If provided, --step arguments are ignored.",
797
+ )
671
798
  p_extract_build.add_argument(
672
799
  "--step",
673
800
  action="append",
@@ -774,6 +901,28 @@ def build_parser() -> argparse.ArgumentParser:
774
901
  p_crawl.add_argument("--tag", action="append", help="Repeatable tag to apply to stored items.")
775
902
  p_crawl.set_defaults(func=cmd_crawl)
776
903
 
904
+ p_analyze = sub.add_parser("analyze", help="Run analysis pipelines for the corpus.")
905
+ analyze_sub = p_analyze.add_subparsers(dest="analyze_command", required=True)
906
+
907
+ p_analyze_topics = analyze_sub.add_parser("topics", help="Run topic modeling analysis.")
908
+ _add_common_corpus_arg(p_analyze_topics)
909
+ p_analyze_topics.add_argument(
910
+ "--recipe",
911
+ required=True,
912
+ help="Path to topic modeling recipe YAML.",
913
+ )
914
+ p_analyze_topics.add_argument(
915
+ "--recipe-name",
916
+ default="default",
917
+ help="Human-readable recipe name.",
918
+ )
919
+ p_analyze_topics.add_argument(
920
+ "--extraction-run",
921
+ default=None,
922
+ help="Extraction run reference in the form extractor_id:run_id.",
923
+ )
924
+ p_analyze_topics.set_defaults(func=cmd_analyze_topics)
925
+
777
926
  return parser
778
927
 
779
928
 
biblicus/constants.py CHANGED
@@ -4,9 +4,11 @@ Shared constants for Biblicus.
4
4
 
5
5
  SCHEMA_VERSION = 2
6
6
  DATASET_SCHEMA_VERSION = 1
7
+ ANALYSIS_SCHEMA_VERSION = 1
7
8
  CORPUS_DIR_NAME = ".biblicus"
8
9
  DEFAULT_RAW_DIR = "raw"
9
10
  SIDECAR_SUFFIX = ".biblicus.yml"
10
11
  RUNS_DIR_NAME = "runs"
11
12
  EXTRACTION_RUNS_DIR_NAME = "extraction"
13
+ ANALYSIS_RUNS_DIR_NAME = "analysis"
12
14
  HOOK_LOGS_DIR_NAME = "hook_logs"
biblicus/corpus.py CHANGED
@@ -16,6 +16,7 @@ import yaml
16
16
  from pydantic import ValidationError
17
17
 
18
18
  from .constants import (
19
+ ANALYSIS_RUNS_DIR_NAME,
19
20
  CORPUS_DIR_NAME,
20
21
  DEFAULT_RAW_DIR,
21
22
  EXTRACTION_RUNS_DIR_NAME,
@@ -32,6 +33,7 @@ from .models import (
32
33
  CorpusCatalog,
33
34
  CorpusConfig,
34
35
  ExtractionRunListEntry,
36
+ ExtractionRunReference,
35
37
  IngestResult,
36
38
  RetrievalRun,
37
39
  )
@@ -538,6 +540,16 @@ class Corpus:
538
540
  """
539
541
  return self.runs_dir / EXTRACTION_RUNS_DIR_NAME
540
542
 
543
@property
def analysis_runs_dir(self) -> Path:
    """
    Directory beneath the runs directory that holds analysis run artifacts.

    :return: Path to the analysis runs directory.
    :rtype: Path
    """
    runs_root = self.runs_dir
    return runs_root / ANALYSIS_RUNS_DIR_NAME
552
+
541
553
  def extraction_run_dir(self, *, extractor_id: str, run_id: str) -> Path:
542
554
  """
543
555
  Resolve an extraction run directory.
@@ -551,6 +563,19 @@ class Corpus:
551
563
  """
552
564
  return self.extraction_runs_dir / extractor_id / run_id
553
565
 
566
def analysis_run_dir(self, *, analysis_id: str, run_id: str) -> Path:
    """
    Build the directory path for one analysis run.

    The path is the analysis runs directory, followed by the analysis identifier,
    followed by the run identifier.

    :param analysis_id: Analysis backend identifier.
    :type analysis_id: str
    :param run_id: Analysis run identifier.
    :type run_id: str
    :return: Analysis run directory.
    :rtype: Path
    """
    backend_dir = self.analysis_runs_dir / analysis_id
    return backend_dir / run_id
578
+
554
579
  def read_extracted_text(self, *, extractor_id: str, run_id: str, item_id: str) -> Optional[str]:
555
580
  """
556
581
  Read extracted text for an item from an extraction run, when present.
@@ -647,6 +672,23 @@ class Corpus:
647
672
  entries.sort(key=lambda entry: (entry.created_at, entry.extractor_id, entry.run_id), reverse=True)
648
673
  return entries
649
674
 
675
def latest_extraction_run_reference(
    self, *, extractor_id: Optional[str] = None
) -> Optional[ExtractionRunReference]:
    """
    Build a reference to the first entry returned by ``list_extraction_runs``.

    :param extractor_id: Optional extractor identifier filter.
    :type extractor_id: str or None
    :return: Latest extraction run reference or None when no runs exist.
    :rtype: biblicus.models.ExtractionRunReference or None
    """
    run_entries = self.list_extraction_runs(extractor_id=extractor_id)
    if run_entries:
        # The first listed entry is taken as the latest run.
        newest = run_entries[0]
        return ExtractionRunReference(extractor_id=newest.extractor_id, run_id=newest.run_id)
    return None
691
+
650
692
  def delete_extraction_run(self, *, extractor_id: str, run_id: str) -> None:
651
693
  """
652
694
  Delete an extraction run directory and its derived artifacts.
biblicus/extraction.py CHANGED
@@ -63,6 +63,8 @@ class ExtractionStepResult(BaseModel):
63
63
  :vartype producer_extractor_id: str or None
64
64
  :ivar source_step_index: Optional step index that supplied the text for selection-style extractors.
65
65
  :vartype source_step_index: int or None
66
+ :ivar confidence: Optional confidence score from 0.0 to 1.0.
67
+ :vartype confidence: float or None
66
68
  :ivar error_type: Optional error type name for errored steps.
67
69
  :vartype error_type: str or None
68
70
  :ivar error_message: Optional error message for errored steps.
@@ -78,6 +80,7 @@ class ExtractionStepResult(BaseModel):
78
80
  text_characters: int = Field(default=0, ge=0)
79
81
  producer_extractor_id: Optional[str] = None
80
82
  source_step_index: Optional[int] = Field(default=None, ge=1)
83
+ confidence: Optional[float] = Field(default=None, ge=0.0, le=1.0)
81
84
  error_type: Optional[str] = None
82
85
  error_message: Optional[str] = None
83
86
 
@@ -447,6 +450,7 @@ def build_extraction_run(
447
450
  text_characters=text_characters,
448
451
  producer_extractor_id=extracted_text.producer_extractor_id,
449
452
  source_step_index=extracted_text.source_step_index,
453
+ confidence=extracted_text.confidence,
450
454
  error_type=None,
451
455
  error_message=None,
452
456
  )
@@ -460,6 +464,7 @@ def build_extraction_run(
460
464
  text_characters=text_characters,
461
465
  producer_extractor_id=extracted_text.producer_extractor_id,
462
466
  source_step_index=extracted_text.source_step_index,
467
+ confidence=extracted_text.confidence,
463
468
  error_type=None,
464
469
  error_message=None,
465
470
  )
@@ -7,14 +7,20 @@ from __future__ import annotations
7
7
  from typing import Dict
8
8
 
9
9
  from .base import TextExtractor
10
+ from .deepgram_stt import DeepgramSpeechToTextExtractor
11
+ from .docling_granite_text import DoclingGraniteExtractor
12
+ from .docling_smol_text import DoclingSmolExtractor
10
13
  from .markitdown_text import MarkItDownExtractor
11
14
  from .metadata_text import MetadataTextExtractor
12
15
  from .openai_stt import OpenAiSpeechToTextExtractor
16
+ from .paddleocr_vl_text import PaddleOcrVlExtractor
13
17
  from .pass_through_text import PassThroughTextExtractor
14
18
  from .pdf_text import PortableDocumentFormatTextExtractor
15
19
  from .pipeline import PipelineExtractor
16
20
  from .rapidocr_text import RapidOcrExtractor
17
21
  from .select_longest_text import SelectLongestTextExtractor
22
+ from .select_override import SelectOverrideExtractor
23
+ from .select_smart_override import SelectSmartOverrideExtractor
18
24
  from .select_text import SelectTextExtractor
19
25
  from .unstructured_text import UnstructuredExtractor
20
26
 
@@ -32,13 +38,19 @@ def get_extractor(extractor_id: str) -> TextExtractor:
32
38
  extractors: Dict[str, TextExtractor] = {
33
39
  MetadataTextExtractor.extractor_id: MetadataTextExtractor(),
34
40
  MarkItDownExtractor.extractor_id: MarkItDownExtractor(),
41
+ DoclingSmolExtractor.extractor_id: DoclingSmolExtractor(),
42
+ DoclingGraniteExtractor.extractor_id: DoclingGraniteExtractor(),
35
43
  PassThroughTextExtractor.extractor_id: PassThroughTextExtractor(),
36
44
  PipelineExtractor.extractor_id: PipelineExtractor(),
37
45
  PortableDocumentFormatTextExtractor.extractor_id: PortableDocumentFormatTextExtractor(),
38
46
  OpenAiSpeechToTextExtractor.extractor_id: OpenAiSpeechToTextExtractor(),
47
+ DeepgramSpeechToTextExtractor.extractor_id: DeepgramSpeechToTextExtractor(),
39
48
  RapidOcrExtractor.extractor_id: RapidOcrExtractor(),
49
+ PaddleOcrVlExtractor.extractor_id: PaddleOcrVlExtractor(),
40
50
  SelectTextExtractor.extractor_id: SelectTextExtractor(),
41
51
  SelectLongestTextExtractor.extractor_id: SelectLongestTextExtractor(),
52
+ SelectSmartOverrideExtractor.extractor_id: SelectSmartOverrideExtractor(),
53
+ SelectOverrideExtractor.extractor_id: SelectOverrideExtractor(),
42
54
  UnstructuredExtractor.extractor_id: UnstructuredExtractor(),
43
55
  }
44
56
  if extractor_id not in extractors:
@@ -0,0 +1,166 @@
1
+ """
2
+ Deepgram-backed speech to text extractor plugin.
3
+
4
+ This extractor is implemented as an optional dependency so the core installation stays small.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Any, Dict, List, Optional
10
+
11
+ from pydantic import BaseModel, ConfigDict, Field
12
+
13
+ from ..corpus import Corpus
14
+ from ..errors import ExtractionRunFatalError
15
+ from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
16
+ from ..user_config import resolve_deepgram_api_key
17
+ from .base import TextExtractor
18
+
19
+
20
class DeepgramSpeechToTextExtractorConfig(BaseModel):
    """
    Settings accepted by the Deepgram speech to text extractor.

    Keys that are not declared below are rejected (``extra="forbid"``).

    :ivar model: Deepgram transcription model identifier.
    :vartype model: str
    :ivar language: Optional language code hint for transcription.
    :vartype language: str or None
    :ivar punctuate: Whether to add punctuation to the transcript.
    :vartype punctuate: bool
    :ivar smart_format: Whether to apply smart formatting.
    :vartype smart_format: bool
    :ivar diarize: Whether to enable speaker diarization.
    :vartype diarize: bool
    :ivar filler_words: Whether to include filler words.
    :vartype filler_words: bool
    """

    model_config = ConfigDict(extra="forbid")

    # String settings carry a minimum-length constraint, so they keep Field(...).
    model: str = Field(default="nova-3", min_length=1)
    language: Optional[str] = Field(default=None, min_length=1)

    # Boolean switches have no constraints; plain defaults are sufficient.
    punctuate: bool = True
    smart_format: bool = True
    diarize: bool = False
    filler_words: bool = False
46
+
47
+
48
class DeepgramSpeechToTextExtractor(TextExtractor):
    """
    Extractor plugin that transcribes audio items using the Deepgram API.

    This extractor is intended as a practical, hosted speech to text implementation.
    Items whose media type does not start with ``audio/`` are skipped.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "stt-deepgram"

    @staticmethod
    def _load_deepgram_client_class() -> Any:
        """
        Import the Deepgram client class lazily.

        :return: The ``DeepgramClient`` class from the optional ``deepgram`` package.
        :rtype: Any
        :raises ExtractionRunFatalError: If the optional dependency is not installed.
        """
        try:
            from deepgram import DeepgramClient
        except ImportError as import_error:
            raise ExtractionRunFatalError(
                "Deepgram speech to text extractor requires an optional dependency. "
                'Install it with pip install "biblicus[deepgram]".'
            ) from import_error
        return DeepgramClient

    @staticmethod
    def _require_api_key() -> str:
        """
        Resolve the Deepgram API key or fail the run.

        :return: The resolved API key.
        :rtype: str
        :raises ExtractionRunFatalError: If no API key can be resolved.
        """
        api_key = resolve_deepgram_api_key()
        if api_key is None:
            raise ExtractionRunFatalError(
                "Deepgram speech to text extractor requires a Deepgram API key. "
                "Set DEEPGRAM_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
                "deepgram.api_key."
            )
        return api_key

    @staticmethod
    def _first_transcript(response: Any) -> str:
        """
        Pull the transcript of the first alternative of the first channel.

        :param response: Response object returned by the Deepgram client.
        :type response: Any
        :return: Transcript text, or an empty string when any level is missing or empty.
        :rtype: str
        """
        results = getattr(response, "results", None)
        if not results:
            return ""
        channels = results.channels
        if not channels:
            return ""
        alternatives = channels[0].alternatives
        if not alternatives:
            return ""
        return alternatives[0].transcript or ""

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Validate extractor configuration and ensure prerequisites are available.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration model.
        :rtype: DeepgramSpeechToTextExtractorConfig
        :raises ExtractionRunFatalError: If the optional dependency or required environment is missing.
        """
        # The dependency is checked before the key, so a missing package is reported first.
        self._load_deepgram_client_class()
        self._require_api_key()
        return DeepgramSpeechToTextExtractorConfig.model_validate(config)

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Transcribe an audio item.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: DeepgramSpeechToTextExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or None when the item is not audio.
        :rtype: ExtractedText or None
        :raises ExtractionRunFatalError: If the optional dependency or required configuration is missing.
        """
        _ = previous_extractions
        if not item.media_type.startswith("audio/"):
            return None

        parsed_config = (
            config
            if isinstance(config, DeepgramSpeechToTextExtractorConfig)
            else DeepgramSpeechToTextExtractorConfig.model_validate(config)
        )

        # Here the key is checked before the dependency, matching the original error precedence.
        api_key = self._require_api_key()
        client_class = self._load_deepgram_client_class()
        client = client_class(api_key=api_key)

        options: Dict[str, Any] = {
            "model": parsed_config.model,
            "punctuate": parsed_config.punctuate,
            "smart_format": parsed_config.smart_format,
            "diarize": parsed_config.diarize,
            "filler_words": parsed_config.filler_words,
        }
        # The language hint is only sent when the recipe sets one.
        if parsed_config.language is not None:
            options["language"] = parsed_config.language

        # Read the whole file up front so the handle is closed before the network call.
        source_path = corpus.root / item.relpath
        audio_data = source_path.read_bytes()
        response = client.listen.rest.v("1").transcribe_file(
            {"buffer": audio_data},
            options,
        )

        transcript_text = self._first_transcript(response)
        return ExtractedText(text=transcript_text.strip(), producer_extractor_id=self.extractor_id)