biblicus 0.13.0__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
biblicus/__init__.py CHANGED
@@ -27,4 +27,4 @@ __all__ = [
     "RetrievalRun",
 ]
 
-__version__ = "0.13.0"
+__version__ = "0.15.0"
@@ -8,6 +8,6 @@ loading and interpolating YAML configuration files.
 from __future__ import annotations
 
 from .interpolation import interpolate_env_vars
-from .loader import ConfigLoader, load_config
+from .loader import ConfigLoader, load_config, load_yaml_view
 
-__all__ = ["ConfigLoader", "interpolate_env_vars", "load_config"]
+__all__ = ["ConfigLoader", "interpolate_env_vars", "load_config", "load_yaml_view"]
@@ -6,7 +6,7 @@ from __future__ import annotations
 
 import os
 from pathlib import Path
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, Iterable, Optional, Union
 
 import yaml
 
@@ -82,6 +82,45 @@ def load_config(
     return config
 
 
+def _merge_mapping_values(base: Dict[str, Any], overlay: Dict[str, Any]) -> Dict[str, Any]:
+    merged: Dict[str, Any] = dict(base)
+    for key, value in overlay.items():
+        existing = merged.get(key)
+        if isinstance(existing, dict) and isinstance(value, dict):
+            merged[key] = _merge_mapping_values(existing, value)
+        else:
+            merged[key] = value
+    return merged
+
+
+def load_yaml_view(paths: Iterable[Union[str, Path]]) -> Dict[str, Any]:
+    """
+    Load and compose one or more YAML files into a single mapping.
+
+    :param paths: Iterable of YAML file paths in precedence order.
+    :type paths: Iterable[str | Path]
+    :return: Composed YAML mapping.
+    :rtype: dict[str, Any]
+    :raises ValueError: If any YAML file does not contain a mapping.
+    """
+
+    composed: Dict[str, Any] = {}
+    for raw_path in paths:
+        yaml_path = Path(raw_path)
+        with open(yaml_path, "r", encoding="utf-8") as file:
+            yaml_data = yaml.safe_load(file)
+
+        if yaml_data is None:
+            yaml_data = {}
+
+        if not isinstance(yaml_data, dict):
+            raise ValueError(f"YAML content must be a mapping for {yaml_path}")
+
+        yaml_data = interpolate_env_vars(yaml_data)
+        composed = _merge_mapping_values(composed, yaml_data)
+    return composed
+
+
 class ConfigLoader:
     """
     Configuration loader that can read YAML files or environment variables.
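For orientation, a minimal usage sketch of the new load_yaml_view helper, assuming the configuration package is importable as biblicus.config; the file names are hypothetical. Later paths take precedence over earlier ones, nested mappings are merged recursively, and environment-variable references are interpolated per file via interpolate_env_vars.

from biblicus.config import load_yaml_view  # assumed import path

# Hypothetical file names: values from local-overrides.yml win over
# defaults.yml, and nested mappings are merged key by key.
settings = load_yaml_view(["defaults.yml", "local-overrides.yml"])
print(sorted(settings))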
@@ -0,0 +1,39 @@
+"""
+Provider-backed AI utilities for Biblicus.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+__all__ = [
+    "AiProvider",
+    "EmbeddingsClientConfig",
+    "LlmClientConfig",
+    "generate_completion",
+    "generate_embeddings",
+    "generate_embeddings_batch",
+]
+
+
+def __getattr__(name: str) -> Any:
+    if name in {"AiProvider", "EmbeddingsClientConfig", "LlmClientConfig"}:
+        from .models import AiProvider, EmbeddingsClientConfig, LlmClientConfig
+
+        return {
+            "AiProvider": AiProvider,
+            "EmbeddingsClientConfig": EmbeddingsClientConfig,
+            "LlmClientConfig": LlmClientConfig,
+        }[name]
+    if name in {"generate_completion"}:
+        from .llm import generate_completion
+
+        return generate_completion
+    if name in {"generate_embeddings", "generate_embeddings_batch"}:
+        from .embeddings import generate_embeddings, generate_embeddings_batch
+
+        return {
+            "generate_embeddings": generate_embeddings,
+            "generate_embeddings_batch": generate_embeddings_batch,
+        }[name]
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
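This package `__init__` uses a module-level __getattr__ (PEP 562) so that provider dependencies are only imported when an exported name is first accessed. A small sketch of the resulting behaviour, assuming the package is importable as biblicus.ai:

import biblicus.ai

# Importing the package does not load dspy or the provider modules;
# the first attribute access triggers the lazy import in __getattr__.
LlmClientConfig = biblicus.ai.LlmClientConfig

# Unknown names raise AttributeError, as with a normal module.
try:
    getattr(biblicus.ai, "not_a_real_name")
except AttributeError as error:
    print(error)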
@@ -0,0 +1,114 @@
+"""
+Provider-backed text embeddings.
+"""
+
+from __future__ import annotations
+
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Any, List, Sequence
+
+from .models import EmbeddingsClientConfig
+
+
+def _require_dspy_embedder():
+    try:
+        import dspy
+    except ImportError as import_error:
+        raise ValueError(
+            "DSPy backend requires an optional dependency. "
+            'Install it with pip install "biblicus[dspy]".'
+        ) from import_error
+    if not hasattr(dspy, "Embedder"):
+        raise ValueError(
+            "DSPy backend requires an optional dependency with Embedder support. "
+            'Install it with pip install "biblicus[dspy]".'
+        )
+    return dspy
+
+
+def generate_embeddings(*, client: EmbeddingsClientConfig, text: str) -> List[float]:
+    """
+    Generate a single embedding vector.
+
+    :param client: Embeddings client configuration.
+    :type client: biblicus.ai.models.EmbeddingsClientConfig
+    :param text: Input text to embed.
+    :type text: str
+    :return: Embedding vector.
+    :rtype: list[float]
+    """
+    vectors = generate_embeddings_batch(client=client, texts=[text])
+    return vectors[0]
+
+
+def _chunks(texts: Sequence[str], batch_size: int) -> List[List[str]]:
+    return [list(texts[idx : idx + batch_size]) for idx in range(0, len(texts), batch_size)]
+
+
+def _normalize_embeddings(embeddings: Any) -> List[List[float]]:
+    if hasattr(embeddings, "tolist"):
+        embeddings = embeddings.tolist()
+    if isinstance(embeddings, list) and embeddings and not isinstance(embeddings[0], list):
+        return [[float(value) for value in embeddings]]
+    return [[float(value) for value in row] for row in embeddings]
+
+
+def generate_embeddings_batch(
+    *, client: EmbeddingsClientConfig, texts: Sequence[str]
+) -> List[List[float]]:
+    """
+    Generate embeddings for a batch of texts.
+
+    The implementation performs batched requests and can run requests concurrently.
+
+    :param client: Embeddings client configuration.
+    :type client: biblicus.ai.models.EmbeddingsClientConfig
+    :param texts: Text inputs to embed.
+    :type texts: Sequence[str]
+    :return: Embedding vectors in input order.
+    :rtype: list[list[float]]
+    :raises ValueError: If required dependencies or credentials are missing.
+    """
+    if not texts:
+        return []
+
+    dspy = _require_dspy_embedder()
+
+    model = client.litellm_model()
+    request_kwargs = client.build_litellm_kwargs()
+
+    items = list(texts)
+    if len(items) == 1:
+        embedder = dspy.Embedder(
+            model,
+            batch_size=1,
+            caching=False,
+            **request_kwargs,
+        )
+        embeddings = embedder(items[0])
+        return _normalize_embeddings(embeddings)
+
+    batches = _chunks(items, client.batch_size)
+    results: List[List[List[float]]] = [None for _ in range(len(batches))]  # type: ignore[list-item]
+
+    def _embed_batch(batch_texts: List[str]) -> List[List[float]]:
+        embedder = dspy.Embedder(
+            model,
+            batch_size=len(batch_texts),
+            caching=False,
+            **request_kwargs,
+        )
+        embeddings = embedder(batch_texts)
+        return _normalize_embeddings(embeddings)
+
+    with ThreadPoolExecutor(max_workers=client.parallelism) as executor:
+        futures = {executor.submit(_embed_batch, batch): idx for idx, batch in enumerate(batches)}
+        for future in as_completed(futures):
+            idx = futures[future]
+            results[idx] = future.result()
+
+    flattened: List[List[float]] = []
+    for batch_vectors in results:
+        for vector in batch_vectors:
+            flattened.append(vector)
+    return flattened
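A hedged usage sketch for the new embeddings helpers; the provider, model name, and input texts are illustrative, and the call requires the optional DSPy dependency (pip install "biblicus[dspy]") plus provider credentials.

from biblicus.ai import EmbeddingsClientConfig, generate_embeddings_batch

# Hypothetical configuration: texts are split into batches of `batch_size`
# and embedded on up to `parallelism` worker threads, preserving input order.
client = EmbeddingsClientConfig(
    provider="openai",
    model="text-embedding-3-small",
    batch_size=64,
    parallelism=4,
)
vectors = generate_embeddings_batch(client=client, texts=["first text", "second text"])
print(len(vectors), len(vectors[0]))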
biblicus/ai/llm.py ADDED
@@ -0,0 +1,138 @@
+"""
+Provider-backed chat completions.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Optional, Sequence
+
+from .models import LlmClientConfig
+
+
+@dataclass
+class ChatCompletionResult:
+    """
+    Normalized response from a chat completion call.
+
+    :param text: Generated assistant text.
+    :type text: str
+    :param tool_calls: Structured tool calls from the provider.
+    :type tool_calls: list[dict[str, Any]]
+    """
+
+    text: str
+    tool_calls: list[dict[str, Any]]
+
+
+def _require_dspy():
+    try:
+        import dspy
+    except ImportError as import_error:
+        raise ValueError(
+            "DSPy backend requires an optional dependency. "
+            'Install it with pip install "biblicus[dspy]".'
+        ) from import_error
+    if not hasattr(dspy, "LM"):
+        raise ValueError(
+            "DSPy backend requires an optional dependency with LM support. "
+            'Install it with pip install "biblicus[dspy]".'
+        )
+    return dspy
+
+
+def _normalize_tool_calls(tool_calls: Sequence[object]) -> list[dict[str, Any]]:
+    normalized: list[dict[str, Any]] = []
+    for tool_call in tool_calls:
+        if isinstance(tool_call, dict):
+            function = tool_call.get("function") or {}
+            normalized.append(
+                {
+                    "id": str(tool_call.get("id") or ""),
+                    "type": str(tool_call.get("type") or "function"),
+                    "function": {
+                        "name": str(function.get("name") or ""),
+                        "arguments": str(function.get("arguments") or ""),
+                    },
+                }
+            )
+            continue
+        function = getattr(tool_call, "function", None)
+        normalized.append(
+            {
+                "id": str(getattr(tool_call, "id", "") or ""),
+                "type": str(getattr(tool_call, "type", "function") or "function"),
+                "function": {
+                    "name": str(getattr(function, "name", "") or ""),
+                    "arguments": str(getattr(function, "arguments", "") or ""),
+                },
+            }
+        )
+    return normalized
+
+
+def chat_completion(
+    *,
+    client: LlmClientConfig,
+    messages: Sequence[dict[str, Any]],
+    tools: Optional[Sequence[dict[str, Any]]] = None,
+    tool_choice: Optional[str] = None,
+) -> ChatCompletionResult:
+    """
+    Execute a chat completion using DSPy (LiteLLM-backed).
+
+    :param client: LLM client configuration.
+    :type client: biblicus.ai.models.LlmClientConfig
+    :param messages: Chat messages payload.
+    :type messages: Sequence[dict[str, Any]]
+    :param tools: Optional tool definitions to pass through.
+    :type tools: Sequence[dict[str, Any]] or None
+    :param tool_choice: Optional tool choice directive.
+    :type tool_choice: str or None
+    :return: Normalized completion result.
+    :rtype: ChatCompletionResult
+    :raises ValueError: If required dependencies or credentials are missing.
+    """
+    dspy = _require_dspy()
+    lm = dspy.LM(client.litellm_model(), **client.build_litellm_kwargs())
+    request_kwargs: dict[str, Any] = {}
+    if tools:
+        request_kwargs["tools"] = list(tools)
+    if tool_choice:
+        request_kwargs["tool_choice"] = tool_choice
+    if client.response_format:
+        request_kwargs["response_format"] = {"type": client.response_format}
+
+    response = lm(messages=list(messages), **request_kwargs)
+    item = response[0] if isinstance(response, list) and response else response
+    if isinstance(item, dict):
+        text = str(item.get("text") or item.get("content") or "")
+        tool_calls = _normalize_tool_calls(item.get("tool_calls") or [])
+        return ChatCompletionResult(text=text, tool_calls=tool_calls)
+    return ChatCompletionResult(text=str(item or ""), tool_calls=[])
+
+
+def generate_completion(
+    *,
+    client: LlmClientConfig,
+    system_prompt: Optional[str],
+    user_prompt: str,
+) -> str:
+    """
+    Generate a completion using the configured provider.
+
+    :param client: LLM client configuration.
+    :type client: biblicus.ai.models.LlmClientConfig
+    :param system_prompt: Optional system prompt content.
+    :type system_prompt: str or None
+    :param user_prompt: User prompt content.
+    :type user_prompt: str
+    :return: Generated completion text.
+    :rtype: str
+    :raises ValueError: If required dependencies or credentials are missing.
+    """
+    messages: list[dict[str, Any]] = []
+    if system_prompt:
+        messages.append({"role": "system", "content": system_prompt})
+    messages.append({"role": "user", "content": user_prompt})
+    return chat_completion(client=client, messages=messages).text
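A minimal sketch of the completion entry points, assuming the OpenAI provider with OPENAI_API_KEY set and an illustrative model name; generate_completion builds the message list, calls chat_completion, and returns only the text field.

from biblicus.ai import LlmClientConfig, generate_completion

# Hypothetical client: provider and model are composed into a LiteLLM-style
# identifier ("openai/gpt-4o-mini") and passed to dspy.LM under the hood.
client = LlmClientConfig(provider="openai", model="gpt-4o-mini", temperature=0.0)
answer = generate_completion(
    client=client,
    system_prompt="You answer in one sentence.",
    user_prompt="What is Biblicus?",
)
print(answer)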
biblicus/ai/models.py ADDED
@@ -0,0 +1,226 @@
+"""
+Pydantic models for provider-backed AI clients.
+"""
+
+from __future__ import annotations
+
+from enum import Enum
+from typing import Any, Optional
+
+from pydantic import Field, field_validator
+
+from ..analysis.schema import AnalysisSchemaModel
+from ..user_config import resolve_openai_api_key
+
+
+class AiProvider(str, Enum):
+    """
+    Supported AI providers.
+    """
+
+    OPENAI = "openai"
+    BEDROCK = "bedrock"
+    ANTHROPIC = "anthropic"
+    GEMINI = "gemini"
+    OLLAMA = "ollama"
+    LITELLM = "litellm"
+
+
+def _normalize_provider(value: object, *, error_label: str) -> str:
+    if isinstance(value, AiProvider):
+        return value.value
+    if isinstance(value, str):
+        return value.lower()
+    raise ValueError(f"{error_label} must be a string or AiProvider")
+
+
+def _litellm_model(provider: str, model: str) -> str:
+    normalized_model = model.strip()
+    if "/" in normalized_model:
+        return normalized_model
+    return f"{provider}/{normalized_model}"
+
+
+class LlmClientConfig(AnalysisSchemaModel):
+    """
+    Configuration for a chat completion invocation.
+
+    :ivar provider: Provider identifier.
+    :vartype provider: str or AiProvider
+    :ivar model: Model identifier.
+    :vartype model: str
+    :ivar api_key: Optional API key override.
+    :vartype api_key: str or None
+    :ivar api_base: Optional API base override.
+    :vartype api_base: str or None
+    :ivar temperature: Optional generation temperature.
+    :vartype temperature: float or None
+    :ivar max_tokens: Optional maximum output tokens.
+    :vartype max_tokens: int or None
+    :ivar response_format: Optional response format identifier.
+    :vartype response_format: str or None
+    :ivar max_retries: Optional maximum retry count for transient failures.
+    :vartype max_retries: int
+    :ivar timeout_seconds: Optional request timeout in seconds.
+    :vartype timeout_seconds: float or None
+    :ivar model_type: Optional model type identifier.
+    :vartype model_type: str or None
+    :ivar extra_params: Additional provider-specific parameters to pass through.
+    :vartype extra_params: dict[str, Any]
+    """
+
+    provider: str
+    model: str = Field(min_length=1)
+    api_key: Optional[str] = None
+    api_base: Optional[str] = None
+    temperature: Optional[float] = Field(default=None, ge=0.0)
+    max_tokens: Optional[int] = Field(default=None, ge=1)
+    response_format: Optional[str] = None
+    max_retries: int = Field(default=0, ge=0)
+    timeout_seconds: Optional[float] = Field(default=None, gt=0.0)
+    model_type: Optional[str] = None
+    extra_params: dict[str, Any] = Field(default_factory=dict)
+
+    @field_validator("provider", mode="before")
+    @classmethod
+    def _parse_provider(cls, value: object) -> str:
+        return _normalize_provider(value, error_label="llm client provider")
+
+    def litellm_model(self) -> str:
+        """
+        Resolve the DSPy model identifier for this client.
+
+        :return: DSPy model string (LiteLLM format).
+        :rtype: str
+        """
+        return _litellm_model(self.provider, self.model)
+
+    def resolve_api_key(self) -> Optional[str]:
+        """
+        Resolve an API key for the configured provider.
+
+        :return: API key string or None if not required.
+        :rtype: str or None
+        :raises ValueError: If OpenAI is configured and no key is available.
+        """
+        if self.api_key:
+            return self.api_key
+        if self.provider != AiProvider.OPENAI.value:
+            return None
+        api_key = resolve_openai_api_key()
+        if api_key is None:
+            raise ValueError(
+                "OpenAI provider requires an OpenAI API key. "
+                "Set OPENAI_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
+                "openai.api_key."
+            )
+        return api_key
+
+    def build_litellm_kwargs(self) -> dict[str, Any]:
+        """
+        Build DSPy keyword arguments for chat completions.
+
+        :return: Keyword arguments for DSPy (LiteLLM-backed).
+        :rtype: dict[str, Any]
+        """
+        api_key = self.resolve_api_key()
+        base_kwargs: dict[str, Any] = {
+            "api_key": api_key,
+            "api_base": self.api_base,
+            "temperature": self.temperature,
+            "max_tokens": self.max_tokens,
+            "model_type": self.model_type,
+            "timeout": self.timeout_seconds,
+            "num_retries": self.max_retries,
+        }
+        for key, value in (self.extra_params or {}).items():
+            base_kwargs[key] = value
+        return {key: value for key, value in base_kwargs.items() if value is not None}
+
+
+class EmbeddingsClientConfig(AnalysisSchemaModel):
+    """
+    Configuration for an embeddings invocation.
+
+    :ivar provider: Provider identifier.
+    :vartype provider: str or AiProvider
+    :ivar model: Model identifier.
+    :vartype model: str
+    :ivar api_key: Optional API key override.
+    :vartype api_key: str or None
+    :ivar api_base: Optional API base override.
+    :vartype api_base: str or None
+    :ivar batch_size: Maximum number of texts per request.
+    :vartype batch_size: int
+    :ivar parallelism: Maximum number of concurrent requests.
+    :vartype parallelism: int
+    :ivar max_retries: Optional maximum retry count for transient failures.
+    :vartype max_retries: int
+    :ivar timeout_seconds: Optional request timeout in seconds.
+    :vartype timeout_seconds: float or None
+    :ivar extra_params: Additional provider-specific parameters to pass through.
+    :vartype extra_params: dict[str, Any]
+    """
+
+    provider: str
+    model: str = Field(min_length=1)
+    api_key: Optional[str] = None
+    api_base: Optional[str] = None
+    batch_size: int = Field(default=64, ge=1)
+    parallelism: int = Field(default=4, ge=1)
+    max_retries: int = Field(default=0, ge=0)
+    timeout_seconds: Optional[float] = Field(default=None, gt=0.0)
+    extra_params: dict[str, Any] = Field(default_factory=dict)
+
+    @field_validator("provider", mode="before")
+    @classmethod
+    def _parse_provider(cls, value: object) -> str:
+        return _normalize_provider(value, error_label="embeddings provider")
+
+    def litellm_model(self) -> str:
+        """
+        Resolve the DSPy model identifier for this client.
+
+        :return: DSPy model string (LiteLLM format).
+        :rtype: str
+        """
+        return _litellm_model(self.provider, self.model)
+
+    def resolve_api_key(self) -> Optional[str]:
+        """
+        Resolve an API key for the configured provider.
+
+        :return: API key string or None if not required.
+        :rtype: str or None
+        :raises ValueError: If OpenAI is configured and no key is available.
+        """
+        if self.api_key:
+            return self.api_key
+        if self.provider != AiProvider.OPENAI.value:
+            return None
+        api_key = resolve_openai_api_key()
+        if api_key is None:
+            raise ValueError(
+                "OpenAI provider requires an OpenAI API key. "
+                "Set OPENAI_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
+                "openai.api_key."
+            )
+        return api_key
+
+    def build_litellm_kwargs(self) -> dict[str, Any]:
+        """
+        Build DSPy keyword arguments for embeddings calls.
+
+        :return: Keyword arguments for DSPy (LiteLLM-backed).
+        :rtype: dict[str, Any]
+        """
+        api_key = self.resolve_api_key()
+        base_kwargs: dict[str, Any] = {
+            "api_key": api_key,
+            "api_base": self.api_base,
+            "timeout": self.timeout_seconds,
+            "num_retries": self.max_retries,
+        }
+        for key, value in (self.extra_params or {}).items():
+            base_kwargs[key] = value
+        return {key: value for key, value in base_kwargs.items() if value is not None}
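To illustrate the request-building logic in these models (no network calls involved), a small sketch with hypothetical values:

from biblicus.ai.models import AiProvider, LlmClientConfig

# Provider values are normalized to lowercase strings, and model names
# without a "/" are prefixed with the provider to form the LiteLLM identifier.
client = LlmClientConfig(
    provider=AiProvider.OLLAMA,
    model="llama3",
    temperature=0.2,
    extra_params={"top_p": 0.9},
)
print(client.litellm_model())         # "ollama/llama3"
print(client.build_litellm_kwargs())  # None-valued keys are dropped; extra_params pass through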
@@ -7,8 +7,6 @@ from __future__ import annotations
 from typing import Dict, Type
 
 from .base import CorpusAnalysisBackend
-from .profiling import ProfilingBackend
-from .topic_modeling import TopicModelingBackend
 
 
 def available_analysis_backends() -> Dict[str, Type[CorpusAnalysisBackend]]:
@@ -18,9 +16,14 @@ def available_analysis_backends() -> Dict[str, Type[CorpusAnalysisBackend]]:
     :return: Mapping of analysis identifiers to backend classes.
     :rtype: dict[str, Type[CorpusAnalysisBackend]]
     """
+    from .markov import MarkovBackend
+    from .profiling import ProfilingBackend
+    from .topic_modeling import TopicModelingBackend
+
     return {
         ProfilingBackend.analysis_id: ProfilingBackend,
         TopicModelingBackend.analysis_id: TopicModelingBackend,
+        MarkovBackend.analysis_id: MarkovBackend,
     }
 
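The backend imports now happen inside available_analysis_backends(), so importing the registry module no longer pulls in the profiling, topic-modeling, or new Markov backends. A brief sketch of enumerating the registry; the exact module path for the registry is not shown in this diff, so the import below is an assumption.

from biblicus.analysis import available_analysis_backends  # assumed import path

# Backend classes (ProfilingBackend, TopicModelingBackend, MarkovBackend) are
# imported lazily on first call and returned keyed by their analysis_id.
backends = available_analysis_backends()
for analysis_id, backend_cls in backends.items():
    print(analysis_id, backend_cls.__name__)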