biblicus 0.6.0-py3-none-any.whl → 0.8.0-py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registry.
- biblicus/__init__.py +1 -1
- biblicus/analysis/__init__.py +40 -0
- biblicus/analysis/base.py +49 -0
- biblicus/analysis/llm.py +106 -0
- biblicus/analysis/models.py +512 -0
- biblicus/analysis/schema.py +18 -0
- biblicus/analysis/topic_modeling.py +561 -0
- biblicus/cli.py +160 -11
- biblicus/constants.py +2 -0
- biblicus/corpus.py +42 -0
- biblicus/extraction.py +5 -0
- biblicus/extractors/__init__.py +14 -0
- biblicus/extractors/deepgram_stt.py +166 -0
- biblicus/extractors/docling_granite_text.py +188 -0
- biblicus/extractors/docling_smol_text.py +188 -0
- biblicus/extractors/markitdown_text.py +128 -0
- biblicus/extractors/paddleocr_vl_text.py +305 -0
- biblicus/extractors/rapidocr_text.py +8 -1
- biblicus/extractors/select_override.py +121 -0
- biblicus/extractors/select_smart_override.py +187 -0
- biblicus/inference.py +104 -0
- biblicus/models.py +6 -0
- biblicus/user_config.py +76 -0
- {biblicus-0.6.0.dist-info → biblicus-0.8.0.dist-info}/METADATA +120 -5
- {biblicus-0.6.0.dist-info → biblicus-0.8.0.dist-info}/RECORD +29 -15
- {biblicus-0.6.0.dist-info → biblicus-0.8.0.dist-info}/WHEEL +0 -0
- {biblicus-0.6.0.dist-info → biblicus-0.8.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.6.0.dist-info → biblicus-0.8.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.6.0.dist-info → biblicus-0.8.0.dist-info}/top_level.txt +0 -0
biblicus/__init__.py
CHANGED
biblicus/analysis/__init__.py
ADDED

```diff
@@ -0,0 +1,40 @@
+"""
+Analysis backend registry for Biblicus.
+"""
+
+from __future__ import annotations
+
+from typing import Dict, Type
+
+from .base import CorpusAnalysisBackend
+from .topic_modeling import TopicModelingBackend
+
+
+def available_analysis_backends() -> Dict[str, Type[CorpusAnalysisBackend]]:
+    """
+    Return the registered analysis backends.
+
+    :return: Mapping of analysis identifiers to backend classes.
+    :rtype: dict[str, Type[CorpusAnalysisBackend]]
+    """
+    return {
+        TopicModelingBackend.analysis_id: TopicModelingBackend,
+    }
+
+
+def get_analysis_backend(analysis_id: str) -> CorpusAnalysisBackend:
+    """
+    Instantiate an analysis backend by identifier.
+
+    :param analysis_id: Analysis backend identifier.
+    :type analysis_id: str
+    :return: Analysis backend instance.
+    :rtype: CorpusAnalysisBackend
+    :raises KeyError: If the analysis backend identifier is unknown.
+    """
+    registry = available_analysis_backends()
+    backend_class = registry.get(analysis_id)
+    if backend_class is None:
+        known = ", ".join(sorted(registry))
+        raise KeyError(f"Unknown analysis backend '{analysis_id}'. Known backends: {known}")
+    return backend_class()
```
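The registry maps each backend's `analysis_id` string to its class, so adding a backend is a one-entry change to `available_analysis_backends`. A minimal lookup sketch, assuming the package is installed; the `"topic-modeling"` identifier is an assumption, since the actual value is whatever `TopicModelingBackend.analysis_id` is set to in `topic_modeling.py`, which this diff excerpt does not show:

```python
from biblicus.analysis import available_analysis_backends, get_analysis_backend

# Inspect the registered identifiers; 0.8.0 ships a single topic-modeling backend.
print(sorted(available_analysis_backends()))

# Instantiate by identifier. An unknown id raises KeyError naming the known backends.
backend = get_analysis_backend("topic-modeling")  # assumed id, not confirmed by this diff
```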
biblicus/analysis/base.py
ADDED

```diff
@@ -0,0 +1,49 @@
+"""
+Analysis backend interface for Biblicus.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import Dict
+
+from pydantic import BaseModel
+
+from ..corpus import Corpus
+from ..models import ExtractionRunReference
+
+
+class CorpusAnalysisBackend(ABC):
+    """
+    Abstract interface for analysis backends.
+
+    :ivar analysis_id: Identifier string for the analysis backend.
+    :vartype analysis_id: str
+    """
+
+    analysis_id: str
+
+    @abstractmethod
+    def run_analysis(
+        self,
+        corpus: Corpus,
+        *,
+        recipe_name: str,
+        config: Dict[str, object],
+        extraction_run: ExtractionRunReference,
+    ) -> BaseModel:
+        """
+        Run an analysis pipeline for a corpus.
+
+        :param corpus: Corpus to analyze.
+        :type corpus: Corpus
+        :param recipe_name: Human-readable recipe name.
+        :type recipe_name: str
+        :param config: Analysis configuration values.
+        :type config: dict[str, object]
+        :param extraction_run: Extraction run reference for text inputs.
+        :type extraction_run: biblicus.models.ExtractionRunReference
+        :return: Analysis output model.
+        :rtype: pydantic.BaseModel
+        """
+        raise NotImplementedError
```
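Concrete backends subclass `CorpusAnalysisBackend`, set `analysis_id`, and return a pydantic model from `run_analysis`. A sketch of what a custom backend could look like; the `WordCountBackend` name, its `"word-count"` identifier, and the `WordCountReport` model are all illustrative, and the corpus-traversal step is stubbed because the `Corpus` API is not part of this excerpt:

```python
from typing import Dict

from pydantic import BaseModel

from biblicus.analysis.base import CorpusAnalysisBackend
from biblicus.corpus import Corpus
from biblicus.models import ExtractionRunReference


class WordCountReport(BaseModel):
    """Illustrative output model for the sketch below."""

    recipe_name: str
    total_words: int


class WordCountBackend(CorpusAnalysisBackend):
    """Hypothetical backend that would count words in extracted text."""

    analysis_id = "word-count"  # illustrative identifier

    def run_analysis(
        self,
        corpus: Corpus,
        *,
        recipe_name: str,
        config: Dict[str, object],
        extraction_run: ExtractionRunReference,
    ) -> BaseModel:
        # A real backend would read the text produced by `extraction_run`
        # from `corpus`; that traversal API is not shown in this diff.
        total_words = 0
        return WordCountReport(recipe_name=recipe_name, total_words=total_words)
```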
biblicus/analysis/llm.py
ADDED
```diff
@@ -0,0 +1,106 @@
+"""
+Lightweight LLM client configuration for analysis pipelines.
+"""
+
+from __future__ import annotations
+
+from enum import Enum
+from typing import Optional
+
+from pydantic import Field, field_validator
+
+from ..user_config import resolve_openai_api_key
+from .schema import AnalysisSchemaModel
+
+
+class LlmProvider(str, Enum):
+    """
+    Supported LLM providers.
+    """
+
+    OPENAI = "openai"
+
+
+class LlmClientConfig(AnalysisSchemaModel):
+    """
+    Configuration for an LLM client invocation.
+
+    :ivar provider: LLM provider identifier.
+    :vartype provider: LlmProvider
+    :ivar model: Model identifier for the provider.
+    :vartype model: str
+    :ivar api_key: Optional API key override.
+    :vartype api_key: str or None
+    :ivar temperature: Optional generation temperature.
+    :vartype temperature: float or None
+    :ivar max_tokens: Optional maximum output tokens.
+    :vartype max_tokens: int or None
+    :ivar max_retries: Optional maximum retry count for transient failures.
+    :vartype max_retries: int
+    """
+
+    provider: LlmProvider
+    model: str = Field(min_length=1)
+    api_key: Optional[str] = None
+    temperature: Optional[float] = Field(default=None, ge=0.0)
+    max_tokens: Optional[int] = Field(default=None, ge=1)
+    max_retries: int = Field(default=0, ge=0)
+
+    @field_validator("provider", mode="before")
+    @classmethod
+    def _parse_provider(cls, value: object) -> LlmProvider:
+        if isinstance(value, LlmProvider):
+            return value
+        if isinstance(value, str):
+            return LlmProvider(value)
+        raise ValueError("llm client provider must be a string or LlmProvider")
+
+
+def generate_completion(
+    *,
+    client: LlmClientConfig,
+    system_prompt: Optional[str],
+    user_prompt: str,
+) -> str:
+    """
+    Generate a completion using the configured LLM provider.
+
+    :param client: LLM client configuration.
+    :type client: LlmClientConfig
+    :param system_prompt: Optional system prompt content.
+    :type system_prompt: str or None
+    :param user_prompt: User prompt content.
+    :type user_prompt: str
+    :return: Generated completion text.
+    :rtype: str
+    :raises ValueError: If required dependencies or credentials are missing.
+    """
+    try:
+        from openai import OpenAI
+    except ImportError as import_error:
+        raise ValueError(
+            "OpenAI LLM provider requires an optional dependency. "
+            'Install it with pip install "biblicus[openai]".'
+        ) from import_error
+    api_key = client.api_key or resolve_openai_api_key()
+    if api_key is None:
+        raise ValueError(
+            "OpenAI LLM provider requires an OpenAI API key. "
+            "Set OPENAI_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
+            "openai.api_key."
+        )
+
+    messages = []
+    if system_prompt:
+        messages.append({"role": "system", "content": system_prompt})
+    messages.append({"role": "user", "content": user_prompt})
+
+    client_instance = OpenAI(api_key=api_key)
+    response = client_instance.chat.completions.create(
+        model=client.model,
+        messages=messages,
+        temperature=client.temperature,
+        max_tokens=client.max_tokens,
+    )
+    content = response.choices[0].message.content
+    return str(content or "")
```
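`generate_completion` is keyword-only and resolves credentials in two steps: an explicit `api_key` on the config wins, otherwise `resolve_openai_api_key()` falls back to the environment and the user config files. A minimal usage sketch, assuming `biblicus[openai]` is installed and `OPENAI_API_KEY` is set; the model id is just an example:

```python
from biblicus.analysis.llm import LlmClientConfig, generate_completion

# The "before" validator coerces the plain string into LlmProvider.OPENAI.
config = LlmClientConfig(
    provider="openai",
    model="gpt-4o-mini",  # example model id; any OpenAI chat model works
    temperature=0.0,
    max_tokens=256,
)

summary = generate_completion(
    client=config,
    system_prompt="You summarize corpus documents in one sentence.",
    user_prompt="Summarize: The quick brown fox jumps over the lazy dog.",
)
print(summary)
```

Note that within this module `max_retries` is only declared and validated on the config; `generate_completion` does not consume it.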