biblicus 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/analysis/__init__.py +40 -0
- biblicus/analysis/base.py +49 -0
- biblicus/analysis/llm.py +106 -0
- biblicus/analysis/models.py +512 -0
- biblicus/analysis/schema.py +18 -0
- biblicus/analysis/topic_modeling.py +561 -0
- biblicus/cli.py +160 -11
- biblicus/constants.py +2 -0
- biblicus/corpus.py +42 -0
- biblicus/extraction.py +5 -0
- biblicus/extractors/__init__.py +14 -0
- biblicus/extractors/deepgram_stt.py +166 -0
- biblicus/extractors/docling_granite_text.py +188 -0
- biblicus/extractors/docling_smol_text.py +188 -0
- biblicus/extractors/markitdown_text.py +128 -0
- biblicus/extractors/paddleocr_vl_text.py +305 -0
- biblicus/extractors/rapidocr_text.py +8 -1
- biblicus/extractors/select_override.py +121 -0
- biblicus/extractors/select_smart_override.py +187 -0
- biblicus/inference.py +104 -0
- biblicus/models.py +6 -0
- biblicus/user_config.py +76 -0
- {biblicus-0.6.0.dist-info → biblicus-0.8.0.dist-info}/METADATA +120 -5
- {biblicus-0.6.0.dist-info → biblicus-0.8.0.dist-info}/RECORD +29 -15
- {biblicus-0.6.0.dist-info → biblicus-0.8.0.dist-info}/WHEEL +0 -0
- {biblicus-0.6.0.dist-info → biblicus-0.8.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.6.0.dist-info → biblicus-0.8.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.6.0.dist-info → biblicus-0.8.0.dist-info}/top_level.txt +0 -0
biblicus/inference.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Inference backend abstraction for machine learning powered components.
|
|
3
|
+
|
|
4
|
+
This module provides reusable configuration and credential resolution patterns for components
|
|
5
|
+
that can execute locally or via API providers.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
from enum import Enum
|
|
12
|
+
from typing import Optional
|
|
13
|
+
|
|
14
|
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class InferenceBackendMode(str, Enum):
    """
    Where an inference backend executes.

    Members are strings, so each one compares equal to its serialized value.

    :cvar LOCAL: Execute locally.
    :cvar API: Execute through an application programming interface provider.
    """

    LOCAL = "local"
    API = "api"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class ApiProvider(str, Enum):
    """
    Application programming interface providers that can serve inference requests.

    Members are strings, so each one compares equal to its serialized value.

    :cvar HUGGINGFACE: The HuggingFace provider.
    :cvar OPENAI: The OpenAI provider.
    """

    HUGGINGFACE = "huggingface"
    OPENAI = "openai"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class InferenceBackendConfig(BaseModel):
    """
    Backend selection settings shared by machine learning powered components.

    Embed this model in an extractor or transformer configuration to choose between local
    execution and execution through an application programming interface provider.
    Unknown fields are rejected.

    :ivar mode: Execution mode; defaults to local.
    :vartype mode: InferenceBackendMode
    :ivar api_provider: Provider to call; required when mode is application programming interface.
    :vartype api_provider: ApiProvider or None
    :ivar api_key: Optional application programming interface key supplied directly in this configuration.
    :vartype api_key: str or None
    :ivar model_id: Optional model identifier for application programming interface requests.
    :vartype model_id: str or None
    """

    model_config = ConfigDict(extra="forbid")

    mode: InferenceBackendMode = Field(default=InferenceBackendMode.LOCAL)
    api_provider: Optional[ApiProvider] = Field(default=None)
    api_key: Optional[str] = Field(default=None)
    model_id: Optional[str] = Field(default=None)

    @model_validator(mode="after")
    def _validate_api_provider_required(self) -> "InferenceBackendConfig":
        """Reject application programming interface mode when no provider is set."""
        provider_missing = self.api_provider is None
        if provider_missing and self.mode == InferenceBackendMode.API:
            raise ValueError("api_provider is required when mode is 'api'")
        return self
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def resolve_api_key(
    provider: ApiProvider,
    *,
    config_override: Optional[str] = None,
) -> Optional[str]:
    """
    Resolve an application programming interface key with precedence rules.

    Precedence order (highest to lowest):
    1. Explicit config override parameter
    2. Environment variable for the provider (``HUGGINGFACE_API_KEY`` or ``OPENAI_API_KEY``)
    3. User configuration file

    An empty string is treated as "not provided" at every level, so a blank override
    does not mask a key that is available from the environment or the user configuration.

    :param provider: Application programming interface provider to resolve key for.
    :type provider: ApiProvider
    :param config_override: Optional explicit key from configuration.
    :type config_override: str or None
    :return: Resolved application programming interface key or None if unavailable.
    :rtype: str or None
    """
    # Truthiness check on purpose: a blank override must fall through to the other sources,
    # matching how environment values are handled below.
    if config_override:
        return config_override

    # Provider -> (environment variable name, attribute on the loaded user configuration).
    key_sources = {
        ApiProvider.HUGGINGFACE: ("HUGGINGFACE_API_KEY", "huggingface"),
        ApiProvider.OPENAI: ("OPENAI_API_KEY", "openai"),
    }
    source = key_sources.get(provider)
    if source is None:
        return None
    env_name, section_name = source

    env_key = os.environ.get(env_name)
    if env_key:
        return env_key

    # Imported only when the user configuration file is actually needed.
    from .user_config import load_user_config

    section = getattr(load_user_config(), section_name)
    if section is None:
        return None
    return section.api_key
|
biblicus/models.py
CHANGED
|
@@ -399,6 +399,8 @@ class ExtractedText(BaseModel):
|
|
|
399
399
|
:vartype producer_extractor_id: str
|
|
400
400
|
:ivar source_step_index: Optional pipeline step index where this text originated.
|
|
401
401
|
:vartype source_step_index: int or None
|
|
402
|
+
:ivar confidence: Optional confidence score from 0.0 to 1.0.
|
|
403
|
+
:vartype confidence: float or None
|
|
402
404
|
"""
|
|
403
405
|
|
|
404
406
|
model_config = ConfigDict(extra="forbid")
|
|
@@ -406,6 +408,7 @@ class ExtractedText(BaseModel):
|
|
|
406
408
|
text: str
|
|
407
409
|
producer_extractor_id: str = Field(min_length=1)
|
|
408
410
|
source_step_index: Optional[int] = Field(default=None, ge=1)
|
|
411
|
+
confidence: Optional[float] = Field(default=None, ge=0.0, le=1.0)
|
|
409
412
|
|
|
410
413
|
|
|
411
414
|
class ExtractionStepOutput(BaseModel):
|
|
@@ -426,6 +429,8 @@ class ExtractionStepOutput(BaseModel):
|
|
|
426
429
|
:vartype producer_extractor_id: str or None
|
|
427
430
|
:ivar source_step_index: Optional step index that supplied the text for selection-style extractors.
|
|
428
431
|
:vartype source_step_index: int or None
|
|
432
|
+
:ivar confidence: Optional confidence score from 0.0 to 1.0.
|
|
433
|
+
:vartype confidence: float or None
|
|
429
434
|
:ivar error_type: Optional error type name for errored steps.
|
|
430
435
|
:vartype error_type: str or None
|
|
431
436
|
:ivar error_message: Optional error message for errored steps.
|
|
@@ -441,5 +446,6 @@ class ExtractionStepOutput(BaseModel):
|
|
|
441
446
|
text_characters: int = Field(default=0, ge=0)
|
|
442
447
|
producer_extractor_id: Optional[str] = None
|
|
443
448
|
source_step_index: Optional[int] = Field(default=None, ge=1)
|
|
449
|
+
confidence: Optional[float] = Field(default=None, ge=0.0, le=1.0)
|
|
444
450
|
error_type: Optional[str] = None
|
|
445
451
|
error_message: Optional[str] = None
|
biblicus/user_config.py
CHANGED
|
@@ -29,17 +29,49 @@ class OpenAiUserConfig(BaseModel):
|
|
|
29
29
|
api_key: str = Field(min_length=1)
|
|
30
30
|
|
|
31
31
|
|
|
32
|
+
class HuggingFaceUserConfig(BaseModel):
    """
    User-level settings for HuggingFace integrations.

    Unknown fields are rejected.

    :ivar api_key: Non-empty HuggingFace API key used for authenticated requests.
    :vartype api_key: str
    """

    model_config = ConfigDict(extra="forbid")

    api_key: str = Field(min_length=1)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class DeepgramUserConfig(BaseModel):
    """
    User-level settings for Deepgram integrations.

    Unknown fields are rejected.

    :ivar api_key: Non-empty Deepgram API key used for authenticated requests.
    :vartype api_key: str
    """

    model_config = ConfigDict(extra="forbid")

    api_key: str = Field(min_length=1)
|
|
56
|
+
|
|
57
|
+
|
|
32
58
|
class BiblicusUserConfig(BaseModel):
    """
    Top-level model for the parsed Biblicus user configuration.

    Every provider section is optional and defaults to None. Unknown fields are rejected.

    :ivar openai: OpenAI settings, when configured.
    :vartype openai: OpenAiUserConfig or None
    :ivar huggingface: HuggingFace settings, when configured.
    :vartype huggingface: HuggingFaceUserConfig or None
    :ivar deepgram: Deepgram settings, when configured.
    :vartype deepgram: DeepgramUserConfig or None
    """

    model_config = ConfigDict(extra="forbid")

    openai: Optional[OpenAiUserConfig] = None
    huggingface: Optional[HuggingFaceUserConfig] = None
    deepgram: Optional[DeepgramUserConfig] = None
|
|
43
75
|
|
|
44
76
|
|
|
45
77
|
def default_user_config_paths(
|
|
@@ -136,3 +168,47 @@ def resolve_openai_api_key(*, config: Optional[BiblicusUserConfig] = None) -> Op
|
|
|
136
168
|
if loaded.openai is None:
|
|
137
169
|
return None
|
|
138
170
|
return loaded.openai.api_key
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def resolve_huggingface_api_key(
    *, config: Optional[BiblicusUserConfig] = None
) -> Optional[str]:
    """
    Look up the HuggingFace API key.

    A non-empty ``HUGGINGFACE_API_KEY`` environment variable wins; otherwise the key comes
    from the ``huggingface`` section of the user configuration.

    :param config: Optional pre-loaded user configuration; loaded on demand when omitted.
    :type config: BiblicusUserConfig or None
    :return: API key string, or None when no key is available.
    :rtype: str or None
    """
    from_environment = os.environ.get("HUGGINGFACE_API_KEY")
    if from_environment:
        return from_environment

    # The configuration is only consulted (and loaded) when the environment has no key.
    section = (config or load_user_config()).huggingface
    return section.api_key if section is not None else None
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def resolve_deepgram_api_key(
    *, config: Optional[BiblicusUserConfig] = None
) -> Optional[str]:
    """
    Look up the Deepgram API key.

    A non-empty ``DEEPGRAM_API_KEY`` environment variable wins; otherwise the key comes
    from the ``deepgram`` section of the user configuration.

    :param config: Optional pre-loaded user configuration; loaded on demand when omitted.
    :type config: BiblicusUserConfig or None
    :return: API key string, or None when no key is available.
    :rtype: str or None
    """
    from_environment = os.environ.get("DEEPGRAM_API_KEY")
    if from_environment:
        return from_environment

    # The configuration is only consulted (and loaded) when the environment has no key.
    section = (config or load_user_config()).deepgram
    return section.api_key if section is not None else None
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: biblicus
|
|
3
|
-
Version: 0.6.0
|
|
3
|
+
Version: 0.8.0
|
|
4
4
|
Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
|
|
5
5
|
License: MIT
|
|
6
6
|
Requires-Python: >=3.9
|
|
@@ -25,6 +25,21 @@ Requires-Dist: unstructured>=0.12.0; extra == "unstructured"
|
|
|
25
25
|
Requires-Dist: python-docx>=1.1.0; extra == "unstructured"
|
|
26
26
|
Provides-Extra: ocr
|
|
27
27
|
Requires-Dist: rapidocr-onnxruntime>=1.3.0; extra == "ocr"
|
|
28
|
+
Provides-Extra: paddleocr
|
|
29
|
+
Requires-Dist: paddleocr>=2.7.0; extra == "paddleocr"
|
|
30
|
+
Requires-Dist: paddlepaddle>=2.5.0; extra == "paddleocr"
|
|
31
|
+
Requires-Dist: huggingface_hub>=0.20.0; extra == "paddleocr"
|
|
32
|
+
Requires-Dist: requests>=2.28.0; extra == "paddleocr"
|
|
33
|
+
Provides-Extra: markitdown
|
|
34
|
+
Requires-Dist: markitdown[all]>=0.1.0; python_version >= "3.10" and extra == "markitdown"
|
|
35
|
+
Provides-Extra: deepgram
|
|
36
|
+
Requires-Dist: deepgram-sdk>=3.0; extra == "deepgram"
|
|
37
|
+
Provides-Extra: docling
|
|
38
|
+
Requires-Dist: docling[vlm]>=2.0.0; extra == "docling"
|
|
39
|
+
Provides-Extra: docling-mlx
|
|
40
|
+
Requires-Dist: docling[mlx-vlm]>=2.0.0; extra == "docling-mlx"
|
|
41
|
+
Provides-Extra: topic-modeling
|
|
42
|
+
Requires-Dist: bertopic>=0.15.0; extra == "topic-modeling"
|
|
28
43
|
Dynamic: license-file
|
|
29
44
|
|
|
30
45
|
# Biblicus
|
|
@@ -67,7 +82,7 @@ If you want to run a real, executable version of this story, use `scripts/readme
|
|
|
67
82
|
This simplified sequence diagram shows the same idea at a high level.
|
|
68
83
|
|
|
69
84
|
```mermaid
|
|
70
|
-
%%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
85
|
+
%%{init: {"theme": "base", "themeVariables": {"background": "#ffffff", "primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
71
86
|
sequenceDiagram
|
|
72
87
|
participant App as Your assistant code
|
|
73
88
|
participant KB as Knowledge base
|
|
@@ -106,7 +121,7 @@ In a coding assistant, retrieval is often triggered by what the user is doing ri
|
|
|
106
121
|
This diagram shows two sequential Biblicus calls. They are shown separately to make the boundaries explicit: retrieval returns evidence, and context pack building consumes evidence.
|
|
107
122
|
|
|
108
123
|
```mermaid
|
|
109
|
-
%%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
124
|
+
%%{init: {"theme": "base", "themeVariables": {"background": "#ffffff", "primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
110
125
|
sequenceDiagram
|
|
111
126
|
participant User
|
|
112
127
|
participant App as Your assistant code
|
|
@@ -158,8 +173,14 @@ python3 -m pip install biblicus
|
|
|
158
173
|
Some extractors are optional so the base install stays small.
|
|
159
174
|
|
|
160
175
|
- Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
|
|
161
|
-
-
|
|
176
|
+
- Advanced optical character recognition with PaddleOCR: `python3 -m pip install "biblicus[paddleocr]"`
|
|
177
|
+
- Document understanding with Docling VLM: `python3 -m pip install "biblicus[docling]"`
|
|
178
|
+
- Document understanding with Docling VLM and MLX acceleration: `python3 -m pip install "biblicus[docling-mlx]"`
|
|
179
|
+
- Speech to text transcription with OpenAI: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
|
|
180
|
+
- Speech to text transcription with Deepgram: `python3 -m pip install "biblicus[deepgram]"` (requires a Deepgram API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
|
|
162
181
|
- Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
|
|
182
|
+
- MarkItDown document conversion (requires Python 3.10 or higher): `python3 -m pip install "biblicus[markitdown]"`
|
|
183
|
+
- Topic modeling analysis with BERTopic: `python3 -m pip install "biblicus[topic-modeling]"`
|
|
163
184
|
|
|
164
185
|
## Quick start
|
|
165
186
|
|
|
@@ -417,6 +438,7 @@ The documents below follow the pipeline from raw items to model context:
|
|
|
417
438
|
|
|
418
439
|
- [Corpus][corpus]
|
|
419
440
|
- [Text extraction][text-extraction]
|
|
441
|
+
- [Speech to text][speech-to-text]
|
|
420
442
|
- [Knowledge base][knowledge-base]
|
|
421
443
|
- [Backends][backends]
|
|
422
444
|
- [Context packs][context-packs]
|
|
@@ -465,7 +487,97 @@ corpus/
|
|
|
465
487
|
Two backends are included.
|
|
466
488
|
|
|
467
489
|
- `scan` is a minimal baseline that scans raw items directly.
|
|
468
|
-
- `sqlite-full-text-search` is a practical baseline that builds a full text search index in
|
|
490
|
+
- `sqlite-full-text-search` is a practical baseline that builds a full text search index in SQLite.
|
|
491
|
+
|
|
492
|
+
For detailed documentation including configuration options, performance characteristics, and usage examples, see the [Backend Reference][backend-reference].
|
|
493
|
+
|
|
494
|
+
## Extraction backends
|
|
495
|
+
|
|
496
|
+
These extractors are built in. Optional ones require extra dependencies. See [text extraction documentation][text-extraction] for details.
|
|
497
|
+
|
|
498
|
+
### Text and document extraction
|
|
499
|
+
|
|
500
|
+
- [`pass-through-text`](docs/extractors/text-document/pass-through.md) reads text items and strips Markdown front matter.
|
|
501
|
+
- [`metadata-text`](docs/extractors/text-document/metadata.md) turns catalog metadata into a small text artifact.
|
|
502
|
+
- [`pdf-text`](docs/extractors/text-document/pdf.md) extracts text from Portable Document Format items with `pypdf`.
|
|
503
|
+
- [`unstructured`](docs/extractors/text-document/unstructured.md) provides broad document parsing (optional).
|
|
504
|
+
- [`markitdown`](docs/extractors/text-document/markitdown.md) converts many formats into Markdown-like text (optional).
|
|
505
|
+
|
|
506
|
+
### Optical character recognition
|
|
507
|
+
|
|
508
|
+
- [`ocr-rapidocr`](docs/extractors/ocr/rapidocr.md) does optical character recognition on images (optional).
|
|
509
|
+
- [`ocr-paddleocr-vl`](docs/extractors/ocr/paddleocr-vl.md) does advanced optical character recognition with PaddleOCR vision-language model (optional).
|
|
510
|
+
|
|
511
|
+
### Vision-language models
|
|
512
|
+
|
|
513
|
+
- [`docling-smol`](docs/extractors/vlm-document/docling-smol.md) uses the SmolDocling-256M vision-language model for fast document understanding (optional).
|
|
514
|
+
- [`docling-granite`](docs/extractors/vlm-document/docling-granite.md) uses the Granite Docling-258M vision-language model for high-accuracy extraction (optional).
|
|
515
|
+
|
|
516
|
+
### Speech to text
|
|
517
|
+
|
|
518
|
+
- [`stt-openai`](docs/extractors/speech-to-text/openai.md) performs speech to text on audio using OpenAI (optional).
|
|
519
|
+
- [`stt-deepgram`](docs/extractors/speech-to-text/deepgram.md) performs speech to text on audio using Deepgram (optional).
|
|
520
|
+
|
|
521
|
+
### Pipeline utilities
|
|
522
|
+
|
|
523
|
+
- [`select-text`](docs/extractors/pipeline-utilities/select-text.md) chooses one prior extraction result in a pipeline.
|
|
524
|
+
- [`select-longest-text`](docs/extractors/pipeline-utilities/select-longest.md) chooses the longest prior extraction result.
|
|
525
|
+
- [`select-override`](docs/extractors/pipeline-utilities/select-override.md) chooses the last extraction result for matching media types in a pipeline.
|
|
526
|
+
- [`select-smart-override`](docs/extractors/pipeline-utilities/select-smart-override.md) intelligently chooses between extraction results based on confidence and content quality.
|
|
527
|
+
|
|
528
|
+
For detailed documentation on all extractors, see the [Extractor Reference][extractor-reference].
|
|
529
|
+
|
|
530
|
+
## Topic modeling analysis
|
|
531
|
+
|
|
532
|
+
Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Topic modeling is the first
|
|
533
|
+
analysis backend. It reads an extraction run, optionally applies an LLM-driven extraction pass, applies lexical
|
|
534
|
+
processing, runs BERTopic, and optionally applies an LLM fine-tuning pass to label topics. The output is structured
|
|
535
|
+
JavaScript Object Notation.
|
|
536
|
+
|
|
537
|
+
Run a topic analysis using a recipe file:
|
|
538
|
+
|
|
539
|
+
```
|
|
540
|
+
biblicus analyze topics --corpus corpora/example --recipe recipes/topic-modeling.yml --extraction-run pipeline:<run_id>
|
|
541
|
+
```
|
|
542
|
+
|
|
543
|
+
If `--extraction-run` is omitted, Biblicus uses the most recent extraction run and emits a warning about
|
|
544
|
+
reproducibility. The analysis output is stored under:
|
|
545
|
+
|
|
546
|
+
```
|
|
547
|
+
.biblicus/runs/analysis/topic-modeling/<run_id>/output.json
|
|
548
|
+
```
|
|
549
|
+
|
|
550
|
+
Minimal recipe example:
|
|
551
|
+
|
|
552
|
+
```yaml
|
|
553
|
+
schema_version: 1
|
|
554
|
+
text_source:
|
|
555
|
+
sample_size: 200
|
|
556
|
+
llm_extraction:
|
|
557
|
+
enabled: false
|
|
558
|
+
lexical_processing:
|
|
559
|
+
enabled: true
|
|
560
|
+
lowercase: true
|
|
561
|
+
strip_punctuation: false
|
|
562
|
+
collapse_whitespace: true
|
|
563
|
+
bertopic_analysis:
|
|
564
|
+
parameters:
|
|
565
|
+
min_topic_size: 8
|
|
566
|
+
nr_topics: 10
|
|
567
|
+
llm_fine_tuning:
|
|
568
|
+
enabled: false
|
|
569
|
+
```
|
|
570
|
+
|
|
571
|
+
LLM extraction and fine-tuning require `biblicus[openai]` and a configured OpenAI API key.
|
|
572
|
+
Recipe files are validated strictly against the topic modeling schema, so type mismatches or unknown fields are errors.
|
|
573
|
+
|
|
574
|
+
For a repeatable, real-world integration run that downloads a Wikipedia corpus and executes topic modeling, use:
|
|
575
|
+
|
|
576
|
+
```
|
|
577
|
+
python3 scripts/topic_modeling_integration.py --corpus corpora/wiki_demo --force
|
|
578
|
+
```
|
|
579
|
+
|
|
580
|
+
See `docs/TOPIC_MODELING.md` for parameter examples and per-topic output behavior.
|
|
469
581
|
|
|
470
582
|
## Integration corpus and evaluation dataset
|
|
471
583
|
|
|
@@ -522,6 +634,9 @@ License terms are in `LICENSE`.
|
|
|
522
634
|
[corpus]: docs/CORPUS.md
|
|
523
635
|
[knowledge-base]: docs/KNOWLEDGE_BASE.md
|
|
524
636
|
[text-extraction]: docs/EXTRACTION.md
|
|
637
|
+
[extractor-reference]: docs/extractors/index.md
|
|
638
|
+
[backend-reference]: docs/backends/index.md
|
|
639
|
+
[speech-to-text]: docs/STT.md
|
|
525
640
|
[user-configuration]: docs/USER_CONFIGURATION.md
|
|
526
641
|
[backends]: docs/BACKENDS.md
|
|
527
642
|
[context-packs]: docs/CONTEXT_PACK.md
|
|
@@ -1,48 +1,62 @@
|
|
|
1
|
-
biblicus/__init__.py,sha256=
|
|
1
|
+
biblicus/__init__.py,sha256=XhgZfXIpkQ5_SzHj-2Vqt_N3hvx6TSOv6KMdac6HfaI,495
|
|
2
2
|
biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
|
|
3
|
-
biblicus/cli.py,sha256=
|
|
4
|
-
biblicus/constants.py,sha256
|
|
3
|
+
biblicus/cli.py,sha256=GVmZlCSZPUMBbq69yjN16f4xNw71edlFbGPHX3300oI,32643
|
|
4
|
+
biblicus/constants.py,sha256=-JaHI3Dngte2drawx93cGWxFVobbgIuaVhmjUJpf4GI,333
|
|
5
5
|
biblicus/context.py,sha256=qnT9CH7_ldoPcg-rxnUOtRhheOmpDAbF8uqhf8OdjC4,5832
|
|
6
|
-
biblicus/corpus.py,sha256=
|
|
6
|
+
biblicus/corpus.py,sha256=Pq2OvXom7giwD1tuWoM3RhFnak5YFx5bCh6JTd6JYtI,55554
|
|
7
7
|
biblicus/crawl.py,sha256=n8rXBMnziBK9vtKQQCXYOpBzqsPCswj2PzVJUb370KY,6250
|
|
8
8
|
biblicus/errors.py,sha256=uMajd5DvgnJ_-jq5sbeom1GV8DPUc-kojBaECFi6CsY,467
|
|
9
9
|
biblicus/evaluation.py,sha256=5xWpb-8f49Osh9aHzo1ab3AXOmls3Imc5rdnEC0pN-8,8143
|
|
10
10
|
biblicus/evidence_processing.py,sha256=EMv1AkV_Eufk-poBz9nRR1dZgC-QewvI-NrULBUGVGA,6074
|
|
11
|
-
biblicus/extraction.py,sha256=
|
|
11
|
+
biblicus/extraction.py,sha256=20lRxz6Te6IcA4d-rfT4qjJtgRG_c4YvrqfXNA7EYfs,19738
|
|
12
12
|
biblicus/frontmatter.py,sha256=JOGjIDzbbOkebQw2RzA-3WDVMAMtJta2INjS4e7-LMg,2463
|
|
13
13
|
biblicus/hook_logging.py,sha256=IMvde-JhVWrx9tNz3eDJ1CY_rr5Sj7DZ2YNomYCZbz0,5366
|
|
14
14
|
biblicus/hook_manager.py,sha256=ZCAkE5wLvn4lnQz8jho_o0HGEC9KdQd9qitkAEUQRcw,6997
|
|
15
15
|
biblicus/hooks.py,sha256=OHQOmOi7rUcQqYWVeod4oPe8nVLepD7F_SlN7O_-BsE,7863
|
|
16
16
|
biblicus/ignore.py,sha256=fyjt34E6tWNNrm1FseOhgH2MgryyVBQVzxhKL5s4aio,1800
|
|
17
|
+
biblicus/inference.py,sha256=_k00AIPoXD2lruiTB-JUagtY4f_WKcdzA3axwiq1tck,3512
|
|
17
18
|
biblicus/knowledge_base.py,sha256=JmlJw8WD_fgstuq1PyWVzU9kzvVzyv7_xOvhS70xwUw,6654
|
|
18
|
-
biblicus/models.py,sha256=
|
|
19
|
+
biblicus/models.py,sha256=vlvPP7AOZGtnHSq47-s9YW-fqLwjgYR6NBcSfeC8YKk,15665
|
|
19
20
|
biblicus/retrieval.py,sha256=A1SI4WK5cX-WbtN6FJ0QQxqlEOtQhddLrL0LZIuoTC4,4180
|
|
20
21
|
biblicus/sources.py,sha256=EFy8-rQNLsyzz-98mH-z8gEHMYbqigcNFKLaR92KfDE,7241
|
|
21
22
|
biblicus/time.py,sha256=3BSKOSo7R10K-0Dzrbdtl3fh5_yShTYqfdlKvvdkx7M,485
|
|
22
23
|
biblicus/uris.py,sha256=xXD77lqsT9NxbyzI1spX9Y5a3-U6sLYMnpeSAV7g-nM,2013
|
|
23
|
-
biblicus/user_config.py,sha256=
|
|
24
|
+
biblicus/user_config.py,sha256=okK57CRmT0W_yrc45tMPRl_abT7-D96IOrCBZtKtumM,6507
|
|
24
25
|
biblicus/_vendor/dotyaml/__init__.py,sha256=e4zbejeJRwlD4I0q3YvotMypO19lXqmT8iyU1q6SvhY,376
|
|
25
26
|
biblicus/_vendor/dotyaml/interpolation.py,sha256=PfUAEEOTFobv7Ox0E6nAxht6BqhHIDe4hP32fZn5TOs,1992
|
|
26
27
|
biblicus/_vendor/dotyaml/loader.py,sha256=KePkjyhKZSvQZphmlmlzTYZJBQsqL5qhtGV1y7G6wzM,5624
|
|
27
28
|
biblicus/_vendor/dotyaml/transformer.py,sha256=2AKPS8DMOPuYtzmM-dlwIqVbARfbBH5jYV1m5qpR49E,3725
|
|
29
|
+
biblicus/analysis/__init__.py,sha256=TrKsE2GmdZDr3OARo2poa9H0powo0bjiEEWVx0tZmEg,1192
|
|
30
|
+
biblicus/analysis/base.py,sha256=gB4ilvyMpiWU1m_ydy2dIHGP96ZFIFvVUL9iVDZKPJM,1265
|
|
31
|
+
biblicus/analysis/llm.py,sha256=VjkZDKauHCDfj-TP-bTbI6a9WAXEIDe8bEiwErPx9xc,3309
|
|
32
|
+
biblicus/analysis/models.py,sha256=XocDiEVF7ud53hd9eCFTuMXS68U-eBthpe7a6J9j6uU,17824
|
|
33
|
+
biblicus/analysis/schema.py,sha256=MCiAQJmijVk8iM8rOUYbzyaDwsMR-Oo86iZU5NCbDMM,435
|
|
34
|
+
biblicus/analysis/topic_modeling.py,sha256=Y_9Auh47_wRD4LXVZ_c-S7AYeO72wLu39CHHa_ZLunI,18352
|
|
28
35
|
biblicus/backends/__init__.py,sha256=wLXIumV51l6ZIKzjoKKeU7AgIxGOryG7T7ls3a_Fv98,1212
|
|
29
36
|
biblicus/backends/base.py,sha256=Erfj9dXg0nkRKnEcNjHR9_0Ddb2B1NvbmRksVm_g1dU,1776
|
|
30
37
|
biblicus/backends/scan.py,sha256=hdNnQWqi5IH6j95w30BZHxLJ0W9PTaOkqfWJuxCCEMI,12478
|
|
31
38
|
biblicus/backends/sqlite_full_text_search.py,sha256=KgmwOiKvkA0pv7vD0V7bcOdDx_nZIOfuIN6Z4Ij7I68,16516
|
|
32
|
-
biblicus/extractors/__init__.py,sha256=
|
|
39
|
+
biblicus/extractors/__init__.py,sha256=ci3oldbdQZ8meAfHccM48CqQtZsPSRg3HkPrBSZF15M,2673
|
|
33
40
|
biblicus/extractors/base.py,sha256=ka-nz_1zHPr4TS9sU4JfOoY-PJh7lbHPBOEBrbQFGSc,2171
|
|
41
|
+
biblicus/extractors/deepgram_stt.py,sha256=VI71i4lbE-EFHcvpNcCPRpT8z7A5IuaSrT1UaPyZ8UY,6323
|
|
42
|
+
biblicus/extractors/docling_granite_text.py,sha256=aFNx-HubvaMmVJHbNqk3CR_ilSwN96-phkaENT6E2B0,6879
|
|
43
|
+
biblicus/extractors/docling_smol_text.py,sha256=cSbQcT4O47MMcM6_pmQCvqgC5ferLvaxJnm3v9EQd0A,6811
|
|
44
|
+
biblicus/extractors/markitdown_text.py,sha256=-7N8ebi3pYfNPnplccyy3qvsKi6uImC1xyo_dSDiD10,4546
|
|
34
45
|
biblicus/extractors/metadata_text.py,sha256=7FbEPp0K1mXc7FH1_c0KhPhPexF9U6eLd3TVY1vTp1s,3537
|
|
35
46
|
biblicus/extractors/openai_stt.py,sha256=fggErIu6YN6tXbleNTuROhfYi7zDgMd2vD_ecXZ7eXs,7162
|
|
47
|
+
biblicus/extractors/paddleocr_vl_text.py,sha256=augbxZ-kx22yHvFR1b6CUAS2I6ktXFsJx8nLWRfvdOA,11722
|
|
36
48
|
biblicus/extractors/pass_through_text.py,sha256=DNxkCwpH2bbXjPGPEQwsx8kfqXi6rIxXNY_n3TU2-WI,2777
|
|
37
49
|
biblicus/extractors/pdf_text.py,sha256=YtUphgLVxyWJXew6ZsJ8wBRh67Y5ri4ZTRlMmq3g1Bk,3255
|
|
38
50
|
biblicus/extractors/pipeline.py,sha256=LY6eM3ypw50MDB2cPEQqZrjxkhVvIc6sv4UEhHdNDrE,3208
|
|
39
|
-
biblicus/extractors/rapidocr_text.py,sha256=
|
|
51
|
+
biblicus/extractors/rapidocr_text.py,sha256=StvizEha5BkEG7i5KJmnOUtji89p5pghF4w8iQ-WwFk,4776
|
|
40
52
|
biblicus/extractors/select_longest_text.py,sha256=wRveXAfYLdj7CpGuo4RoD7zE6SIfylRCbv40z2azO0k,3702
|
|
53
|
+
biblicus/extractors/select_override.py,sha256=gSpffFmn1ux9pGtFvHD5Uu_LO8TmmJC4L_mvjehiSec,4014
|
|
54
|
+
biblicus/extractors/select_smart_override.py,sha256=-sLMnNoeXbCB3dO9zflQq324eHuLbd6hpveSwduXP-U,6763
|
|
41
55
|
biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
|
|
42
56
|
biblicus/extractors/unstructured_text.py,sha256=l2S_wD_htu7ZHoJQNQtP-kGlEgOeKV_w2IzAC93lePE,3564
|
|
43
|
-
biblicus-0.
|
|
44
|
-
biblicus-0.
|
|
45
|
-
biblicus-0.
|
|
46
|
-
biblicus-0.
|
|
47
|
-
biblicus-0.
|
|
48
|
-
biblicus-0.
|
|
57
|
+
biblicus-0.8.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
|
|
58
|
+
biblicus-0.8.0.dist-info/METADATA,sha256=I4zW3JWMOmyh4tBpR-D2MGAl9YCp9IqtFo8wxoNA1qQ,27116
|
|
59
|
+
biblicus-0.8.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
60
|
+
biblicus-0.8.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
|
|
61
|
+
biblicus-0.8.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
|
|
62
|
+
biblicus-0.8.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|