biblicus 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/analysis/__init__.py +40 -0
- biblicus/analysis/base.py +49 -0
- biblicus/analysis/llm.py +106 -0
- biblicus/analysis/models.py +554 -0
- biblicus/analysis/schema.py +18 -0
- biblicus/analysis/topic_modeling.py +585 -0
- biblicus/cli.py +160 -11
- biblicus/constants.py +2 -0
- biblicus/corpus.py +42 -0
- biblicus/extraction.py +5 -0
- biblicus/extractors/__init__.py +12 -0
- biblicus/extractors/deepgram_stt.py +166 -0
- biblicus/extractors/docling_granite_text.py +188 -0
- biblicus/extractors/docling_smol_text.py +188 -0
- biblicus/extractors/paddleocr_vl_text.py +305 -0
- biblicus/extractors/rapidocr_text.py +8 -1
- biblicus/extractors/select_override.py +121 -0
- biblicus/extractors/select_smart_override.py +187 -0
- biblicus/inference.py +104 -0
- biblicus/models.py +6 -0
- biblicus/user_config.py +76 -0
- {biblicus-0.7.0.dist-info → biblicus-0.9.0.dist-info}/METADATA +120 -16
- {biblicus-0.7.0.dist-info → biblicus-0.9.0.dist-info}/RECORD +28 -15
- {biblicus-0.7.0.dist-info → biblicus-0.9.0.dist-info}/WHEEL +0 -0
- {biblicus-0.7.0.dist-info → biblicus-0.9.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.7.0.dist-info → biblicus-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.7.0.dist-info → biblicus-0.9.0.dist-info}/top_level.txt +0 -0
biblicus/cli.py
CHANGED
|
@@ -12,6 +12,7 @@ from typing import Dict, List, Optional
|
|
|
12
12
|
|
|
13
13
|
from pydantic import ValidationError
|
|
14
14
|
|
|
15
|
+
from .analysis import get_analysis_backend
|
|
15
16
|
from .backends import get_backend
|
|
16
17
|
from .context import (
|
|
17
18
|
ContextPackPolicy,
|
|
@@ -284,8 +285,50 @@ def _parse_step_spec(raw_step: str) -> tuple[str, Dict[str, object]]:
|
|
|
284
285
|
raw_pairs = raw_pairs.strip()
|
|
285
286
|
if not raw_pairs:
|
|
286
287
|
return extractor_id, {}
|
|
287
|
-
|
|
288
|
-
|
|
288
|
+
|
|
289
|
+
tokens = []
|
|
290
|
+
current_token = []
|
|
291
|
+
brace_depth = 0
|
|
292
|
+
bracket_depth = 0
|
|
293
|
+
in_quotes = False
|
|
294
|
+
escape_next = False
|
|
295
|
+
|
|
296
|
+
for char in raw_pairs:
|
|
297
|
+
if escape_next:
|
|
298
|
+
current_token.append(char)
|
|
299
|
+
escape_next = False
|
|
300
|
+
continue
|
|
301
|
+
|
|
302
|
+
if char == "\\":
|
|
303
|
+
escape_next = True
|
|
304
|
+
current_token.append(char)
|
|
305
|
+
continue
|
|
306
|
+
|
|
307
|
+
if char == '"' and brace_depth == 0 and bracket_depth == 0:
|
|
308
|
+
in_quotes = not in_quotes
|
|
309
|
+
current_token.append(char)
|
|
310
|
+
continue
|
|
311
|
+
|
|
312
|
+
if not in_quotes:
|
|
313
|
+
if char == "{":
|
|
314
|
+
brace_depth += 1
|
|
315
|
+
elif char == "}":
|
|
316
|
+
brace_depth -= 1
|
|
317
|
+
elif char == "[":
|
|
318
|
+
bracket_depth += 1
|
|
319
|
+
elif char == "]":
|
|
320
|
+
bracket_depth -= 1
|
|
321
|
+
elif char == "," and brace_depth == 0 and bracket_depth == 0:
|
|
322
|
+
tokens.append("".join(current_token).strip())
|
|
323
|
+
current_token = []
|
|
324
|
+
continue
|
|
325
|
+
|
|
326
|
+
current_token.append(char)
|
|
327
|
+
|
|
328
|
+
if current_token:
|
|
329
|
+
tokens.append("".join(current_token).strip())
|
|
330
|
+
|
|
331
|
+
for token in tokens:
|
|
289
332
|
if not token:
|
|
290
333
|
continue
|
|
291
334
|
if "=" not in token:
|
|
@@ -344,22 +387,53 @@ def cmd_extract_build(arguments: argparse.Namespace) -> int:
|
|
|
344
387
|
:return: Exit code.
|
|
345
388
|
:rtype: int
|
|
346
389
|
"""
|
|
390
|
+
import yaml
|
|
391
|
+
|
|
347
392
|
corpus = (
|
|
348
393
|
Corpus.open(arguments.corpus)
|
|
349
394
|
if getattr(arguments, "corpus", None)
|
|
350
395
|
else Corpus.find(Path.cwd())
|
|
351
396
|
)
|
|
352
|
-
|
|
353
|
-
if
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
397
|
+
|
|
398
|
+
# Load recipe from file if --recipe is provided
|
|
399
|
+
if getattr(arguments, "recipe", None):
|
|
400
|
+
recipe_path = Path(arguments.recipe)
|
|
401
|
+
if not recipe_path.exists():
|
|
402
|
+
raise FileNotFoundError(f"Recipe file not found: {recipe_path}")
|
|
403
|
+
with open(recipe_path, "r", encoding="utf-8") as f:
|
|
404
|
+
recipe_data = yaml.safe_load(f)
|
|
405
|
+
loaded_extractor_id = recipe_data.get("extractor_id", "pipeline")
|
|
406
|
+
loaded_config = recipe_data.get("config", {})
|
|
407
|
+
|
|
408
|
+
# If the recipe specifies a non-pipeline extractor, wrap it in a pipeline
|
|
409
|
+
if loaded_extractor_id != "pipeline":
|
|
410
|
+
extractor_id = "pipeline"
|
|
411
|
+
config = {
|
|
412
|
+
"steps": [
|
|
413
|
+
{
|
|
414
|
+
"extractor_id": loaded_extractor_id,
|
|
415
|
+
"config": loaded_config,
|
|
416
|
+
}
|
|
417
|
+
]
|
|
418
|
+
}
|
|
419
|
+
else:
|
|
420
|
+
extractor_id = loaded_extractor_id
|
|
421
|
+
config = loaded_config
|
|
422
|
+
else:
|
|
423
|
+
# Build from --step arguments
|
|
424
|
+
raw_steps = list(arguments.step or [])
|
|
425
|
+
if not raw_steps:
|
|
426
|
+
raise ValueError("Pipeline extraction requires at least one --step")
|
|
427
|
+
steps: List[Dict[str, object]] = []
|
|
428
|
+
for raw_step in raw_steps:
|
|
429
|
+
step_extractor_id, step_config = _parse_step_spec(raw_step)
|
|
430
|
+
steps.append({"extractor_id": step_extractor_id, "config": step_config})
|
|
431
|
+
config = {"steps": steps}
|
|
432
|
+
extractor_id = "pipeline"
|
|
433
|
+
|
|
360
434
|
manifest = build_extraction_run(
|
|
361
435
|
corpus,
|
|
362
|
-
extractor_id=
|
|
436
|
+
extractor_id=extractor_id,
|
|
363
437
|
recipe_name=arguments.recipe_name,
|
|
364
438
|
config=config,
|
|
365
439
|
)
|
|
@@ -563,6 +637,54 @@ def cmd_crawl(arguments: argparse.Namespace) -> int:
|
|
|
563
637
|
return 0
|
|
564
638
|
|
|
565
639
|
|
|
640
|
+
def cmd_analyze_topics(arguments: argparse.Namespace) -> int:
    """
    Run topic modeling analysis for a corpus.

    The recipe file is parsed as YAML and handed unchanged to the ``topic-modeling``
    analysis backend as its configuration. An empty recipe document is treated as an
    empty mapping; any other non-mapping document is rejected.

    :param arguments: Parsed command-line interface arguments.
    :type arguments: argparse.Namespace
    :return: Exit code.
    :rtype: int
    :raises FileNotFoundError: If the recipe path is not an existing file.
    :raises ValueError: If the recipe is not a mapping, if no extraction run is
        available, or if the backend raises a validation error for the recipe.
    """
    import yaml

    corpus = (
        Corpus.open(arguments.corpus)
        if getattr(arguments, "corpus", None)
        else Corpus.find(Path.cwd())
    )
    recipe_path = Path(arguments.recipe)
    if not recipe_path.is_file():
        raise FileNotFoundError(f"Recipe file not found: {recipe_path}")
    recipe_data = yaml.safe_load(recipe_path.read_text(encoding="utf-8"))
    if recipe_data is None:
        # Only an empty document defaults to an empty recipe. Using ``or {}`` here would
        # also swallow falsy non-mappings ([], false, 0, "") and skip the check below.
        recipe_data = {}
    if not isinstance(recipe_data, dict):
        raise ValueError("Topic modeling recipe must be a mapping/object")

    if arguments.extraction_run:
        extraction_run = parse_extraction_run_reference(arguments.extraction_run)
    else:
        extraction_run = corpus.latest_extraction_run_reference()
        if extraction_run is None:
            raise ValueError("Topic analysis requires an extraction run to supply text inputs")
        # Falling back to the latest run is convenient but not reproducible, so say so.
        print(
            "Warning: using latest extraction run; pass --extraction-run for reproducibility.",
            file=sys.stderr,
        )

    backend = get_analysis_backend("topic-modeling")
    try:
        output = backend.run_analysis(
            corpus,
            recipe_name=arguments.recipe_name,
            config=recipe_data,
            extraction_run=extraction_run,
        )
    except ValidationError as exc:
        # Surface schema problems as a ValueError carrying the validation details.
        raise ValueError(f"Invalid topic modeling recipe: {exc}") from exc
    print(output.model_dump_json(indent=2))
    return 0
|
|
686
|
+
|
|
687
|
+
|
|
566
688
|
def build_parser() -> argparse.ArgumentParser:
|
|
567
689
|
"""
|
|
568
690
|
Build the command-line interface argument parser.
|
|
@@ -668,6 +790,11 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
668
790
|
p_extract_build.add_argument(
|
|
669
791
|
"--recipe-name", default="default", help="Human-readable recipe name."
|
|
670
792
|
)
|
|
793
|
+
p_extract_build.add_argument(
|
|
794
|
+
"--recipe",
|
|
795
|
+
default=None,
|
|
796
|
+
help="Path to YAML recipe file. If provided, --step arguments are ignored.",
|
|
797
|
+
)
|
|
671
798
|
p_extract_build.add_argument(
|
|
672
799
|
"--step",
|
|
673
800
|
action="append",
|
|
@@ -774,6 +901,28 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
774
901
|
p_crawl.add_argument("--tag", action="append", help="Repeatable tag to apply to stored items.")
|
|
775
902
|
p_crawl.set_defaults(func=cmd_crawl)
|
|
776
903
|
|
|
904
|
+
p_analyze = sub.add_parser("analyze", help="Run analysis pipelines for the corpus.")
|
|
905
|
+
analyze_sub = p_analyze.add_subparsers(dest="analyze_command", required=True)
|
|
906
|
+
|
|
907
|
+
p_analyze_topics = analyze_sub.add_parser("topics", help="Run topic modeling analysis.")
|
|
908
|
+
_add_common_corpus_arg(p_analyze_topics)
|
|
909
|
+
p_analyze_topics.add_argument(
|
|
910
|
+
"--recipe",
|
|
911
|
+
required=True,
|
|
912
|
+
help="Path to topic modeling recipe YAML.",
|
|
913
|
+
)
|
|
914
|
+
p_analyze_topics.add_argument(
|
|
915
|
+
"--recipe-name",
|
|
916
|
+
default="default",
|
|
917
|
+
help="Human-readable recipe name.",
|
|
918
|
+
)
|
|
919
|
+
p_analyze_topics.add_argument(
|
|
920
|
+
"--extraction-run",
|
|
921
|
+
default=None,
|
|
922
|
+
help="Extraction run reference in the form extractor_id:run_id.",
|
|
923
|
+
)
|
|
924
|
+
p_analyze_topics.set_defaults(func=cmd_analyze_topics)
|
|
925
|
+
|
|
777
926
|
return parser
|
|
778
927
|
|
|
779
928
|
|
biblicus/constants.py
CHANGED
|
@@ -4,9 +4,11 @@ Shared constants for Biblicus.
|
|
|
4
4
|
|
|
5
5
|
SCHEMA_VERSION = 2
|
|
6
6
|
DATASET_SCHEMA_VERSION = 1
|
|
7
|
+
ANALYSIS_SCHEMA_VERSION = 1
|
|
7
8
|
CORPUS_DIR_NAME = ".biblicus"
|
|
8
9
|
DEFAULT_RAW_DIR = "raw"
|
|
9
10
|
SIDECAR_SUFFIX = ".biblicus.yml"
|
|
10
11
|
RUNS_DIR_NAME = "runs"
|
|
11
12
|
EXTRACTION_RUNS_DIR_NAME = "extraction"
|
|
13
|
+
ANALYSIS_RUNS_DIR_NAME = "analysis"
|
|
12
14
|
HOOK_LOGS_DIR_NAME = "hook_logs"
|
biblicus/corpus.py
CHANGED
|
@@ -16,6 +16,7 @@ import yaml
|
|
|
16
16
|
from pydantic import ValidationError
|
|
17
17
|
|
|
18
18
|
from .constants import (
|
|
19
|
+
ANALYSIS_RUNS_DIR_NAME,
|
|
19
20
|
CORPUS_DIR_NAME,
|
|
20
21
|
DEFAULT_RAW_DIR,
|
|
21
22
|
EXTRACTION_RUNS_DIR_NAME,
|
|
@@ -32,6 +33,7 @@ from .models import (
|
|
|
32
33
|
CorpusCatalog,
|
|
33
34
|
CorpusConfig,
|
|
34
35
|
ExtractionRunListEntry,
|
|
36
|
+
ExtractionRunReference,
|
|
35
37
|
IngestResult,
|
|
36
38
|
RetrievalRun,
|
|
37
39
|
)
|
|
@@ -538,6 +540,16 @@ class Corpus:
|
|
|
538
540
|
"""
|
|
539
541
|
return self.runs_dir / EXTRACTION_RUNS_DIR_NAME
|
|
540
542
|
|
|
543
|
+
@property
def analysis_runs_dir(self) -> Path:
    """
    Directory that holds analysis run artifacts.

    :return: The ``ANALYSIS_RUNS_DIR_NAME`` entry beneath the runs directory.
    :rtype: Path
    """
    runs_root = self.runs_dir
    return runs_root / ANALYSIS_RUNS_DIR_NAME
|
|
552
|
+
|
|
541
553
|
def extraction_run_dir(self, *, extractor_id: str, run_id: str) -> Path:
|
|
542
554
|
"""
|
|
543
555
|
Resolve an extraction run directory.
|
|
@@ -551,6 +563,19 @@ class Corpus:
|
|
|
551
563
|
"""
|
|
552
564
|
return self.extraction_runs_dir / extractor_id / run_id
|
|
553
565
|
|
|
566
|
+
def analysis_run_dir(self, *, analysis_id: str, run_id: str) -> Path:
    """
    Build the directory path for a single analysis run.

    The path is ``<analysis runs dir>/<analysis_id>/<run_id>``.

    :param analysis_id: Analysis backend identifier.
    :type analysis_id: str
    :param run_id: Analysis run identifier.
    :type run_id: str
    :return: Analysis run directory.
    :rtype: Path
    """
    backend_dir = self.analysis_runs_dir / analysis_id
    return backend_dir / run_id
|
|
578
|
+
|
|
554
579
|
def read_extracted_text(self, *, extractor_id: str, run_id: str, item_id: str) -> Optional[str]:
|
|
555
580
|
"""
|
|
556
581
|
Read extracted text for an item from an extraction run, when present.
|
|
@@ -647,6 +672,23 @@ class Corpus:
|
|
|
647
672
|
entries.sort(key=lambda entry: (entry.created_at, entry.extractor_id, entry.run_id), reverse=True)
|
|
648
673
|
return entries
|
|
649
674
|
|
|
675
|
+
def latest_extraction_run_reference(
    self, *, extractor_id: Optional[str] = None
) -> Optional[ExtractionRunReference]:
    """
    Return a reference to the first entry reported by ``list_extraction_runs``.

    :param extractor_id: Optional extractor identifier filter, forwarded to the listing call.
    :type extractor_id: str or None
    :return: Reference built from the first listed run, or None when the listing is empty.
    :rtype: biblicus.models.ExtractionRunReference or None
    """
    listed_runs = self.list_extraction_runs(extractor_id=extractor_id)
    if listed_runs:
        newest = listed_runs[0]
        return ExtractionRunReference(
            extractor_id=newest.extractor_id,
            run_id=newest.run_id,
        )
    return None
|
|
691
|
+
|
|
650
692
|
def delete_extraction_run(self, *, extractor_id: str, run_id: str) -> None:
|
|
651
693
|
"""
|
|
652
694
|
Delete an extraction run directory and its derived artifacts.
|
biblicus/extraction.py
CHANGED
|
@@ -63,6 +63,8 @@ class ExtractionStepResult(BaseModel):
|
|
|
63
63
|
:vartype producer_extractor_id: str or None
|
|
64
64
|
:ivar source_step_index: Optional step index that supplied the text for selection-style extractors.
|
|
65
65
|
:vartype source_step_index: int or None
|
|
66
|
+
:ivar confidence: Optional confidence score from 0.0 to 1.0.
|
|
67
|
+
:vartype confidence: float or None
|
|
66
68
|
:ivar error_type: Optional error type name for errored steps.
|
|
67
69
|
:vartype error_type: str or None
|
|
68
70
|
:ivar error_message: Optional error message for errored steps.
|
|
@@ -78,6 +80,7 @@ class ExtractionStepResult(BaseModel):
|
|
|
78
80
|
text_characters: int = Field(default=0, ge=0)
|
|
79
81
|
producer_extractor_id: Optional[str] = None
|
|
80
82
|
source_step_index: Optional[int] = Field(default=None, ge=1)
|
|
83
|
+
confidence: Optional[float] = Field(default=None, ge=0.0, le=1.0)
|
|
81
84
|
error_type: Optional[str] = None
|
|
82
85
|
error_message: Optional[str] = None
|
|
83
86
|
|
|
@@ -447,6 +450,7 @@ def build_extraction_run(
|
|
|
447
450
|
text_characters=text_characters,
|
|
448
451
|
producer_extractor_id=extracted_text.producer_extractor_id,
|
|
449
452
|
source_step_index=extracted_text.source_step_index,
|
|
453
|
+
confidence=extracted_text.confidence,
|
|
450
454
|
error_type=None,
|
|
451
455
|
error_message=None,
|
|
452
456
|
)
|
|
@@ -460,6 +464,7 @@ def build_extraction_run(
|
|
|
460
464
|
text_characters=text_characters,
|
|
461
465
|
producer_extractor_id=extracted_text.producer_extractor_id,
|
|
462
466
|
source_step_index=extracted_text.source_step_index,
|
|
467
|
+
confidence=extracted_text.confidence,
|
|
463
468
|
error_type=None,
|
|
464
469
|
error_message=None,
|
|
465
470
|
)
|
biblicus/extractors/__init__.py
CHANGED
|
@@ -7,14 +7,20 @@ from __future__ import annotations
|
|
|
7
7
|
from typing import Dict
|
|
8
8
|
|
|
9
9
|
from .base import TextExtractor
|
|
10
|
+
from .deepgram_stt import DeepgramSpeechToTextExtractor
|
|
11
|
+
from .docling_granite_text import DoclingGraniteExtractor
|
|
12
|
+
from .docling_smol_text import DoclingSmolExtractor
|
|
10
13
|
from .markitdown_text import MarkItDownExtractor
|
|
11
14
|
from .metadata_text import MetadataTextExtractor
|
|
12
15
|
from .openai_stt import OpenAiSpeechToTextExtractor
|
|
16
|
+
from .paddleocr_vl_text import PaddleOcrVlExtractor
|
|
13
17
|
from .pass_through_text import PassThroughTextExtractor
|
|
14
18
|
from .pdf_text import PortableDocumentFormatTextExtractor
|
|
15
19
|
from .pipeline import PipelineExtractor
|
|
16
20
|
from .rapidocr_text import RapidOcrExtractor
|
|
17
21
|
from .select_longest_text import SelectLongestTextExtractor
|
|
22
|
+
from .select_override import SelectOverrideExtractor
|
|
23
|
+
from .select_smart_override import SelectSmartOverrideExtractor
|
|
18
24
|
from .select_text import SelectTextExtractor
|
|
19
25
|
from .unstructured_text import UnstructuredExtractor
|
|
20
26
|
|
|
@@ -32,13 +38,19 @@ def get_extractor(extractor_id: str) -> TextExtractor:
|
|
|
32
38
|
extractors: Dict[str, TextExtractor] = {
|
|
33
39
|
MetadataTextExtractor.extractor_id: MetadataTextExtractor(),
|
|
34
40
|
MarkItDownExtractor.extractor_id: MarkItDownExtractor(),
|
|
41
|
+
DoclingSmolExtractor.extractor_id: DoclingSmolExtractor(),
|
|
42
|
+
DoclingGraniteExtractor.extractor_id: DoclingGraniteExtractor(),
|
|
35
43
|
PassThroughTextExtractor.extractor_id: PassThroughTextExtractor(),
|
|
36
44
|
PipelineExtractor.extractor_id: PipelineExtractor(),
|
|
37
45
|
PortableDocumentFormatTextExtractor.extractor_id: PortableDocumentFormatTextExtractor(),
|
|
38
46
|
OpenAiSpeechToTextExtractor.extractor_id: OpenAiSpeechToTextExtractor(),
|
|
47
|
+
DeepgramSpeechToTextExtractor.extractor_id: DeepgramSpeechToTextExtractor(),
|
|
39
48
|
RapidOcrExtractor.extractor_id: RapidOcrExtractor(),
|
|
49
|
+
PaddleOcrVlExtractor.extractor_id: PaddleOcrVlExtractor(),
|
|
40
50
|
SelectTextExtractor.extractor_id: SelectTextExtractor(),
|
|
41
51
|
SelectLongestTextExtractor.extractor_id: SelectLongestTextExtractor(),
|
|
52
|
+
SelectSmartOverrideExtractor.extractor_id: SelectSmartOverrideExtractor(),
|
|
53
|
+
SelectOverrideExtractor.extractor_id: SelectOverrideExtractor(),
|
|
42
54
|
UnstructuredExtractor.extractor_id: UnstructuredExtractor(),
|
|
43
55
|
}
|
|
44
56
|
if extractor_id not in extractors:
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Deepgram-backed speech to text extractor plugin.
|
|
3
|
+
|
|
4
|
+
This extractor is implemented as an optional dependency so the core installation stays small.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Any, Dict, List, Optional
|
|
10
|
+
|
|
11
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
12
|
+
|
|
13
|
+
from ..corpus import Corpus
|
|
14
|
+
from ..errors import ExtractionRunFatalError
|
|
15
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
16
|
+
from ..user_config import resolve_deepgram_api_key
|
|
17
|
+
from .base import TextExtractor
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class DeepgramSpeechToTextExtractorConfig(BaseModel):
    """
    Options accepted by the Deepgram speech to text extractor.

    Unknown keys are rejected because the model forbids extra fields.

    :ivar model: Deepgram transcription model identifier (non-empty).
    :vartype model: str
    :ivar language: Optional language code hint for transcription (non-empty when given).
    :vartype language: str or None
    :ivar punctuate: Whether to add punctuation to the transcript.
    :vartype punctuate: bool
    :ivar smart_format: Whether to apply smart formatting.
    :vartype smart_format: bool
    :ivar diarize: Whether to enable speaker diarization.
    :vartype diarize: bool
    :ivar filler_words: Whether to include filler words.
    :vartype filler_words: bool
    """

    model_config = ConfigDict(extra="forbid")

    # Constrained fields keep an explicit Field(); plain defaults need no wrapper.
    model: str = Field("nova-3", min_length=1)
    language: Optional[str] = Field(None, min_length=1)
    punctuate: bool = True
    smart_format: bool = True
    diarize: bool = False
    filler_words: bool = False
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class DeepgramSpeechToTextExtractor(TextExtractor):
    """
    Extractor plugin that transcribes audio items using the Deepgram API.

    This extractor is intended as a practical, hosted speech to text implementation.
    It skips non-audio items.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "stt-deepgram"

    @staticmethod
    def _load_client_class() -> Any:
        """
        Import and return the Deepgram client class.

        :return: The ``DeepgramClient`` class from the optional ``deepgram`` package.
        :raises ExtractionRunFatalError: If the optional dependency is not installed.
        """
        try:
            from deepgram import DeepgramClient
        except ImportError as import_error:
            raise ExtractionRunFatalError(
                "Deepgram speech to text extractor requires an optional dependency. "
                'Install it with pip install "biblicus[deepgram]".'
            ) from import_error
        return DeepgramClient

    @staticmethod
    def _require_api_key() -> Any:
        """
        Resolve the Deepgram API key or fail with a configuration hint.

        :return: The value returned by ``resolve_deepgram_api_key`` when it is not None.
        :raises ExtractionRunFatalError: If no API key could be resolved.
        """
        api_key = resolve_deepgram_api_key()
        if api_key is None:
            raise ExtractionRunFatalError(
                "Deepgram speech to text extractor requires a Deepgram API key. "
                "Set DEEPGRAM_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
                "deepgram.api_key."
            )
        return api_key

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Validate extractor configuration and ensure prerequisites are available.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration model.
        :rtype: DeepgramSpeechToTextExtractorConfig
        :raises ExtractionRunFatalError: If the optional dependency or required environment is missing.
        """
        # Check the dependency first, then the key, before parsing the config.
        self._load_client_class()
        self._require_api_key()
        return DeepgramSpeechToTextExtractorConfig.model_validate(config)

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Transcribe an audio item.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: DeepgramSpeechToTextExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or None when the item is not audio.
        :rtype: ExtractedText or None
        :raises ExtractionRunFatalError: If the optional dependency or required configuration is missing.
        """
        _ = previous_extractions  # Unused: transcription works from the raw item only.
        if not item.media_type.startswith("audio/"):
            return None

        parsed_config = (
            config
            if isinstance(config, DeepgramSpeechToTextExtractorConfig)
            else DeepgramSpeechToTextExtractorConfig.model_validate(config)
        )

        # Here the key is checked before the dependency, matching the original order.
        api_key = self._require_api_key()
        client_class = self._load_client_class()

        client = client_class(api_key=api_key)
        source_path = corpus.root / item.relpath

        options: Dict[str, Any] = {
            "model": parsed_config.model,
            "punctuate": parsed_config.punctuate,
            "smart_format": parsed_config.smart_format,
            "diarize": parsed_config.diarize,
            "filler_words": parsed_config.filler_words,
        }
        if parsed_config.language is not None:
            # Only send a language hint when one was configured.
            options["language"] = parsed_config.language

        # Read the bytes up front so no file handle stays open during the network call.
        audio_data = source_path.read_bytes()
        response = client.listen.rest.v("1").transcribe_file(
            {"buffer": audio_data},
            options,
        )

        # Walk results -> first channel -> first alternative; any missing or empty
        # level yields an empty transcript.
        results = getattr(response, "results", None)
        channels = results.channels if results else None
        alternatives = channels[0].alternatives if channels else None
        transcript_text = (alternatives[0].transcript or "") if alternatives else ""

        return ExtractedText(text=transcript_text.strip(), producer_extractor_id=self.extractor_id)
|